Quantitative image and video quality metrics

These are some notes, with accompanying code, on the FID, SSIM, and PSNR metrics.

GitHub repository for the full Jupyter notebook:

Image and Video Metrics

1. Fréchet Inception Distance (FID) for videos

Mathematical intuition

FID measures the similarity between two sets of videos by comparing their feature distributions using the Fréchet distance between multivariate Gaussians. For videos, features are extracted using a 3D CNN to capture spatiotemporal patterns.

Formula:

\[\text{FID} = ||\mu_r - \mu_g||^2 + \text{Tr}\left(\Sigma_r + \Sigma_g - 2(\Sigma_r \Sigma_g)^{1/2}\right)\]

where $\mu_r, \Sigma_r$ and $\mu_g, \Sigma_g$ are the means and covariances of the real and generated feature distributions.
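
As a quick sanity check on the formula (an illustrative reduction, not from the original notes): in one dimension the matrix terms become scalars and the expression collapses to

\[\text{FID}_{1\text{D}} = (\mu_r - \mu_g)^2 + (\sigma_r - \sigma_g)^2\]

so, for example, comparing $\mathcal{N}(0, 1)$ with $\mathcal{N}(1, 4)$ gives $1^2 + (1 - 2)^2 = 2$.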

Implementation with a pretrained 3D CNN

We use I3D (Inflated 3D ConvNet) pretrained on Kinetics-400 for feature extraction, the standard choice for video FID.

import numpy as np
from scipy import linalg
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from pytorchvideo.models.hub import i3d_r50
from tqdm import tqdm

class VideoFeatureExtractor(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = i3d_r50(pretrained=True)
        # Swap the classification projection (2048 -> 400 Kinetics classes)
        # for the identity, so the model returns pooled 2048-d features
        # instead of logits (pytorchvideo's ResNet head exposes it as
        # blocks[-1].proj).
        self.model.blocks[-1].proj = torch.nn.Identity()
        self.model.eval()

    def forward(self, videos):
        # videos: (B, C, T, H, W), pixel values in [0, 255]
        videos = videos / 255.0  # Normalize to [0, 1]

        # Resize spatial dimensions to 224x224, frame by frame:
        # fold time into the batch axis, interpolate, then restore (B, C, T, H, W)
        B, C, T, H, W = videos.shape
        videos = videos.permute(0, 2, 1, 3, 4).reshape(B * T, C, H, W)
        videos = F.interpolate(videos, size=(224, 224), mode='bilinear', align_corners=False)
        videos = videos.view(B, T, C, 224, 224).permute(0, 2, 1, 3, 4)

        features = self.model(videos)  # (B, 2048) with the identity head
        return features

def calculate_fid(real_features, generated_features, eps=1e-6):
    # Fit a Gaussian to each set of features
    mu1, sigma1 = np.mean(real_features, axis=0), np.cov(real_features, rowvar=False)
    mu2, sigma2 = np.mean(generated_features, axis=0), np.cov(generated_features, rowvar=False)

    # Regularize so the matrix square root stays numerically stable
    sigma1 += eps * np.eye(sigma1.shape[0])
    sigma2 += eps * np.eye(sigma2.shape[0])

    covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
    covmean = covmean.real  # sqrtm can return tiny imaginary parts

    fid = np.sum((mu1 - mu2) ** 2) + np.trace(sigma1 + sigma2 - 2 * covmean)
    return fid

def get_features(dataloader, model, device="cuda"):
    model.eval()
    features = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Extracting Features"):
            batch = batch.to(device)
            feats = model(batch).cpu().numpy()
            features.append(feats)
    return np.concatenate(features, axis=0)
   
# device settings
device = "cuda" if torch.cuda.is_available() else "cpu"
model = VideoFeatureExtractor().to(device)

# Number of videos, channels, frames, height, width
N, C, T, H, W = 100, 3, 16, 224, 224

# Generate "real" videos using random integers in the range [0, 255]
real_videos = torch.randint(0, 256, (N, C, T, H, W), dtype=torch.uint8).float()

# Generate "generated" videos: add a small bias to simulate differences
# For instance, add a bias of 10 to the pixel values and clamp them to [0, 255]
gen_videos = real_videos + 10.0
gen_videos = gen_videos.clamp(0, 255)

# Create DataLoaders
real_loader = DataLoader(real_videos, batch_size=16)
gen_loader = DataLoader(gen_videos, batch_size=16)

# Extract features
real_feats = get_features(real_loader, model, device)
gen_feats = get_features(gen_loader, model, device)

# Calculate FID
fid_score = calculate_fid(real_feats, gen_feats)
print(f"FID: {fid_score:.3f}")

Key notes:

  - Lower is better; identical feature distributions score ~0.
  - FID is biased at small sample sizes, so use comparable (and reasonably large) numbers of videos for both sets.
  - Scores depend on the feature extractor and preprocessing, so only compare FIDs computed with the same backbone.

2. PSNR and SSIM for videos

Mathematical foundations

\[\text{PSNR} = 20 \cdot \log_{10}\left(\frac{\text{MAX}_I}{\sqrt{\text{MSE}}}\right)\]

with $\text{MAX}_I = 255$ for 8-bit images. For example, an MSE of 100 yields $20 \cdot \log_{10}(255 / \sqrt{100}) \approx 28.1$ dB.
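
For SSIM, each frame pair is scored with the standard Wang et al. index (the formula is added here for completeness) and the per-frame scores are averaged:

\[\text{SSIM}(x, y) = \frac{(2\mu_x\mu_y + C_1)(2\sigma_{xy} + C_2)}{(\mu_x^2 + \mu_y^2 + C_1)(\sigma_x^2 + \sigma_y^2 + C_2)}\]

where $\mu$, $\sigma^2$, and $\sigma_{xy}$ are local means, variances, and covariance over a sliding window, and $C_1 = (0.01L)^2$, $C_2 = (0.03L)^2$ stabilize the division ($L$ is the data range, 255 here).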

Vectorized implementation:

from skimage.metrics import structural_similarity

def calculate_psnr(video1, video2):
    # Inputs: (N, T, H, W, C) in [0, 255]
    # Cast to float first: subtracting uint8 arrays would wrap around
    video1 = video1.astype(np.float64)
    video2 = video2.astype(np.float64)
    mse = np.mean((video1 - video2) ** 2, axis=(1, 2, 3, 4))
    psnr = 20 * np.log10(255.0 / np.sqrt(mse))
    return np.mean(psnr)

def calculate_ssim(video1, video2):
    # Inputs: (N, T, H, W, C) in [0, 255]
    ssim_scores = []
    for vid1, vid2 in zip(video1, video2):
        vid1 = vid1.astype(np.float32)
        vid2 = vid2.astype(np.float32)
        # Per-frame SSIM, averaged over the video
        frame_scores = [
            structural_similarity(f1, f2, channel_axis=-1, data_range=255)
            for f1, f2 in zip(vid1, vid2)
        ]
        ssim_scores.append(np.mean(frame_scores))
    return np.mean(ssim_scores)

# Basic usage example
real_video = np.random.randint(0, 256, (10, 16, 224, 224, 3), dtype=np.uint8)  # (N, T, H, W, C)
gen_video = np.random.randint(0, 256, (10, 16, 224, 224, 3), dtype=np.uint8)

psnr = calculate_psnr(real_video, gen_video)
ssim = calculate_ssim(real_video, gen_video)
print(f"PSNR: {psnr:.3f} dB, SSIM: {ssim:.4f}")

3. Conclusion

Quantitative metrics like FID, PSNR, and SSIM provide complementary insights into video quality. While FID evaluates high-level feature distributions, PSNR and SSIM focus on pixel and structural fidelity. For robust evaluation:

  - Report more than one metric, since distribution-level and frame-level scores capture different failure modes.
  - Remember that PSNR and SSIM require paired ground-truth videos, while FID only needs two sets of samples.
  - Keep the feature extractor, preprocessing, and sample counts fixed when comparing FID across models.

References

  1. FID: Heusel et al., "GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium" (NeurIPS 2017)
  2. SSIM: Wang et al., "Image Quality Assessment: From Error Visibility to Structural Similarity" (IEEE TIP 2004)
  3. I3D: Carreira & Zisserman, "Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset" (CVPR 2017)