These are some notes, with accompanying code, on the FID, SSIM, and PSNR metrics for evaluating videos.
GitHub repository for the full Jupyter notebook:
FID measures how similar two sets of videos are by comparing their feature distributions: each set's features are modeled as a multivariate Gaussian, and the Fréchet distance between the two Gaussians is the score (lower is better). For videos, features are extracted with a 3D CNN so that spatiotemporal patterns are captured.
Formula:
\[\text{FID} = ||\mu_r - \mu_g||^2 + \text{Tr}\left(\Sigma_r + \Sigma_g - 2(\Sigma_r \Sigma_g)^{1/2}\right)\]
Mean distance: $||\mu_r - \mu_g||^2$ reflects how close the average features of real ($\mu_r$) and generated ($\mu_g$) videos are.
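As a quick sanity check on the formula (an added illustration, not part of the original notes), in the one-dimensional case the trace term collapses and FID reduces to
\[\text{FID} = (\mu_r - \mu_g)^2 + (\sigma_r - \sigma_g)^2,\]
so, for example, $\mu_r = 0$, $\mu_g = 1$, $\sigma_r = 1$, $\sigma_g = 2$ gives $\text{FID} = 1 + 1 = 2$.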
We use I3D (Inflated 3D ConvNet) pretrained on Kinetics-400 for feature extraction (pretty standard).
import numpy as np
from scipy import linalg
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from pytorchvideo.models.hub import i3d_r50
from tqdm import tqdm
class VideoFeatureExtractor(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = i3d_r50(pretrained=True)
        self.model.eval()

    def forward(self, videos):
        # videos: (B, C, T, H, W), pixel values in [0, 255]
        videos = videos / 255.0  # Normalize to [0, 1]
        # Resize every frame to 224x224: fold the time dimension into the batch,
        # interpolate spatially, then restore the (B, C, T, H, W) layout
        B, C, T, H, W = videos.shape
        videos = videos.permute(0, 2, 1, 3, 4).reshape(B * T, C, H, W)
        videos = F.interpolate(videos, size=(224, 224), mode='bilinear', align_corners=False)
        videos = videos.view(B, T, C, 224, 224).permute(0, 2, 1, 3, 4)
        features = self.model(videos)  # (B, 400) Kinetics-400 logits, used here as features
        return features
def calculate_fid(real_features, generated_features, eps=1e-6):
    mu1, sigma1 = np.mean(real_features, axis=0), np.cov(real_features, rowvar=False)
    mu2, sigma2 = np.mean(generated_features, axis=0), np.cov(generated_features, rowvar=False)
    # Small diagonal offset keeps the matrix square root numerically stable
    sigma1 += eps * np.eye(sigma1.shape[0])
    sigma2 += eps * np.eye(sigma2.shape[0])
    covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
    covmean = covmean.real  # Drop tiny imaginary parts caused by numerical error
    fid = np.sum((mu1 - mu2) ** 2) + np.trace(sigma1 + sigma2 - 2 * covmean)
    return fid
def get_features(dataloader, model, device="cuda"):
    model.eval()
    features = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Extracting Features"):
            batch = batch.to(device)
            feats = model(batch).cpu().numpy()
            features.append(feats)
    return np.concatenate(features, axis=0)
# device settings
device = "cuda" if torch.cuda.is_available() else "cpu"
model = VideoFeatureExtractor().to(device)
# Number of videos, channels, frames, height, width
N, C, T, H, W = 100, 3, 16, 224, 224
# Generate "real" videos using random integers in the range [0, 255]
real_videos = torch.randint(0, 256, (N, C, T, H, W), dtype=torch.uint8).float()
# Generate "generated" videos: add a small bias to simulate differences
# For instance, add a bias of 10 to the pixel values and clamp them to [0, 255]
gen_videos = real_videos + 10.0
gen_videos = gen_videos.clamp(0, 255)
# Create DataLoaders
real_loader = DataLoader(real_videos, batch_size=16)
gen_loader = DataLoader(gen_videos, batch_size=16)
# Extract features
real_feats = get_features(real_loader, model, device)
gen_feats = get_features(gen_loader, model, device)
# Calculate FID
fid_score = calculate_fid(real_feats, gen_feats)
print(f"FID: {fid_score:.3f}")
Key Notes:
- Inside the feature extractor, videos are normalized to [0, 1] and resized to 224x224 before being fed to I3D.
PSNR: Measures pixel-level fidelity between two videos via the mean squared error:
\[\text{PSNR} = 20 \log_{10}\left(\frac{\text{MAX}_I}{\sqrt{\text{MSE}}}\right)\]
with $\text{MAX}_I = 255$ for 8-bit images.
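To make the numbers concrete, here is a quick worked example (added for illustration): an MSE of 100 between two 8-bit videos gives
\[\text{PSNR} = 20 \log_{10}\left(\frac{255}{\sqrt{100}}\right) \approx 28.1\ \text{dB},\]
and smaller MSE pushes the score higher (identical videos give infinite PSNR).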
SSIM: Assesses perceptual quality through luminance $l$, contrast $c$, and structure $s$:
\[\text{SSIM}(x, y) = \frac{(2\mu_x\mu_y + C_1)(2\sigma_{xy} + C_2)}{(\mu_x^2 + \mu_y^2 + C_1)(\sigma_x^2 + \sigma_y^2 + C_2)}\]
where $C_1$ and $C_2$ are constants.
We will not go through the details of the SSIM (Structural SIMilarity) formula; you can check them out, along with the skimage source code of the method, here:
https://github.com/scikit-image/scikit-image/blob/main/skimage/metrics/_structural_similarity.py
from skimage.metrics import structural_similarity
def calculate_psnr(video1, video2):
    # Inputs: (N, T, H, W, C) arrays in [0, 255]
    # Cast to float first so uint8 subtraction does not wrap around
    video1 = video1.astype(np.float64)
    video2 = video2.astype(np.float64)
    mse = np.mean((video1 - video2) ** 2, axis=(1, 2, 3, 4))
    psnr = 20 * np.log10(255.0 / np.sqrt(mse))
    return np.mean(psnr)
def calculate_ssim(video1, video2):
    # Inputs: (N, T, H, W, C) in [0, 255]
    ssim_scores = []
    for vid1, vid2 in zip(video1, video2):
        vid1 = vid1.astype(np.float32)
        vid2 = vid2.astype(np.float32)
        ssim_batch = [structural_similarity(f1, f2, channel_axis=-1, data_range=255)
                      for f1, f2 in zip(vid1, vid2)]
        ssim_scores.append(np.mean(ssim_batch))
    return np.mean(ssim_scores)
# Basic example usage
real_video = np.random.randint(0, 256, (10, 16, 224, 224, 3), dtype=np.uint8) # (N, T, H, W, C)
gen_video = np.random.randint(0, 256, (10, 16, 224, 224, 3), dtype=np.uint8)
psnr = calculate_psnr(real_video, gen_video)
ssim = calculate_ssim(real_video, gen_video)
print(f"PSNR: {psnr:.3f} dB, SSIM: {ssim:.4f}")
Quantitative metrics like FID, PSNR, and SSIM provide complementary insights into video quality. While FID evaluates high-level feature distributions, PSNR and SSIM focus on pixel and structural fidelity. For robust evaluation, report them together rather than relying on any single metric.
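If it helps, a hypothetical wrapper along these lines (the name evaluate_all and its signature are my own, not from the notes) can collect all three scores in one call, assuming both inputs are (N, T, H, W, C) uint8 arrays and model is the VideoFeatureExtractor defined above:
def evaluate_all(real_np, gen_np, model, device="cuda", batch_size=16):
    # Pixel-level metrics work directly on the (N, T, H, W, C) arrays
    scores = {
        "psnr": calculate_psnr(real_np, gen_np),
        "ssim": calculate_ssim(real_np, gen_np),
    }
    # FID needs (N, C, T, H, W) float tensors for the I3D feature extractor
    to_tensor = lambda a: torch.from_numpy(a).permute(0, 4, 1, 2, 3).float()
    real_feats = get_features(DataLoader(to_tensor(real_np), batch_size=batch_size), model, device)
    gen_feats = get_features(DataLoader(to_tensor(gen_np), batch_size=batch_size), model, device)
    scores["fid"] = calculate_fid(real_feats, gen_feats)
    return scores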