cheenchan committed on
Commit fad2ba6 · 1 Parent(s): c4fcf6a

Deploy frame extraction matcher

app.py ADDED
@@ -0,0 +1,4 @@
+ from frame_extraction.app import main
+
+ if __name__ == "__main__":
+     main()
catalog/README.md ADDED
@@ -0,0 +1,3 @@
+ # Catalog Placeholder
+
+ Upload your generated `catalog.json` and `references/` images here before deploying the Space. Update the `FRAME_CATALOG` environment variable accordingly.
frame_extraction/.gitignore ADDED
@@ -0,0 +1,6 @@
+ .venv/
+ outputs/
+ app_outputs/
+ dummy_frames/
+ catalog/
+ __pycache__/
frame_extraction/README.md ADDED
@@ -0,0 +1,54 @@
+ # Frame Extraction & Character Matching
+
+ This package turns raw video into character reference catalogs and lets you match new frames against those references. It is designed to be deployed quickly (e.g., on Hugging Face Spaces) for interactive character discovery.
+
+ ## Features
+ - Shot-aware frame sampling to keep only useful stills.
+ - Face detection, embedding, and clustering (MTCNN + InceptionResnetV1).
+ - Automatic reference selection per character (sharpest, most frontal crop).
+ - JSON catalog output and optional reference thumbnails.
+ - Matching API/CLI for user-uploaded frames with multi-character support.
+ - Gradio app template ready for Hugging Face hosting.
+
+ ## Install
+ ```bash
+ cd projects/UMO-Qwen-Edit/data_curation_scripts/frame_extraction
+ pip install -e .
+ ```
+
+ ## CLI Usage
+ ### Build a catalog from a video
+ ```bash
+ frame-catalog catalog \
+   --video-path data/source.mp4 \
+   --output-dir outputs/catalog \
+   --frame-interval 12 \
+   --min-track-length 5
+ ```
+
+ ### Match new frames against the catalog
+ ```bash
+ frame-catalog match \
+   --catalog-path outputs/catalog/catalog.json \
+   --frames-dir uploads/ \
+   --output-path outputs/matches.json
+ ```
+
+ ## Deploy on Hugging Face Spaces
+ 1. Copy this folder to a new Space (Python SDK).
+ 2. Install dependencies with `pip install -e .`.
+ 3. Upload a pre-built `catalog/catalog.json` plus the `references/` images.
+ 4. Set environment variables in the Space:
+    - `FRAME_CATALOG=/home/user/app/catalog/catalog.json`
+    - `FRAME_OUTPUT_DIR=/home/user/app/output`
+ 5. Set the Space entrypoint to `python -m frame_extraction.app`.
+
+ ## Outputs
+ - `catalog.json`: character reference metadata with embeddings and chosen frames.
+ - `references/`: cropped reference images per character.
+ - `matches.json`: mapping from user frames to character IDs with similarity scores.
+
+ ## Roadmap
+ - Integrate more robust trackers (DeepSORT/ByteTrack).
+ - Add an active learning loop for manual character corrections.
+ - Expose REST endpoints for automated ingestion.
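For reference, a minimal sketch of consuming the generated `catalog.json` from Python, assuming the schema written by `build_catalog` in `catalog.py` (top-level `references` and `characters` lists whose records carry `character_id`, `embedding`, and `reference_path`):

```python
import json
from pathlib import Path

import numpy as np

# Load the catalog produced by `frame-catalog catalog ...` (path is illustrative).
catalog = json.loads(Path("outputs/catalog/catalog.json").read_text(encoding="utf-8"))

# One reference record per discovered character.
for ref in catalog["references"]:
    emb = np.asarray(ref["embedding"], dtype=np.float32)  # 512-d InceptionResnetV1 embedding
    print(ref["character_id"], ref["reference_path"], emb.shape)
```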
frame_extraction/pyproject.toml ADDED
@@ -0,0 +1,28 @@
+ [project]
+ name = "frame-extraction"
+ version = "0.1.0"
+ description = "Character-centric frame extraction and matching pipeline"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "typer>=0.12",
+     "rich>=13.7",
+     "numpy>=1.24",
+     "pandas>=2.1",
+     "opencv-python>=4.8",
+     "torch>=2.1",
+     "torchvision>=0.16",
+     "facenet-pytorch>=2.5.3",
+     "scikit-learn>=1.4",
+     "Pillow>=10.0",
+     "tqdm>=4.66",
+     "gradio>=4.0",
+     "faiss-cpu>=1.7.4",
+ ]
+
+ [project.scripts]
+ frame-catalog = "frame_extraction.cli:app"
+
+ [build-system]
+ requires = ["setuptools>=68", "wheel"]
+ build-backend = "setuptools.build_meta"
frame_extraction/src/frame_extraction.egg-info/PKG-INFO ADDED
@@ -0,0 +1,74 @@
+ Metadata-Version: 2.4
+ Name: frame-extraction
+ Version: 0.1.0
+ Summary: Character-centric frame extraction and matching pipeline
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ Requires-Dist: typer>=0.12
+ Requires-Dist: rich>=13.7
+ Requires-Dist: numpy>=1.24
+ Requires-Dist: pandas>=2.1
+ Requires-Dist: opencv-python>=4.8
+ Requires-Dist: torch>=2.1
+ Requires-Dist: torchvision>=0.16
+ Requires-Dist: facenet-pytorch>=2.5.3
+ Requires-Dist: scikit-learn>=1.4
+ Requires-Dist: Pillow>=10.0
+ Requires-Dist: tqdm>=4.66
+ Requires-Dist: gradio>=4.0
+ Requires-Dist: faiss-cpu>=1.7.4
+
+ # Frame Extraction & Character Matching
+
+ This package turns raw video into character reference catalogs and lets you match new frames against those references. It is designed to be deployed quickly (e.g., on Hugging Face Spaces) for interactive character discovery.
+
+ ## Features
+ - Shot-aware frame sampling to keep only useful stills.
+ - Face detection, embedding, and clustering (MTCNN + InceptionResnetV1).
+ - Automatic reference selection per character (sharpest, most frontal crop).
+ - JSON catalog output and optional reference thumbnails.
+ - Matching API/CLI for user-uploaded frames with multi-character support.
+ - Gradio app template ready for Hugging Face hosting.
+
+ ## Install
+ ```bash
+ cd projects/UMO-Qwen-Edit/data_curation_scripts/frame_extraction
+ pip install -e .
+ ```
+
+ ## CLI Usage
+ ### Build a catalog from a video
+ ```bash
+ frame-catalog catalog \
+   --video-path data/source.mp4 \
+   --output-dir outputs/catalog \
+   --frame-interval 12 \
+   --min-track-length 5
+ ```
+
+ ### Match new frames against the catalog
+ ```bash
+ frame-catalog match \
+   --catalog-path outputs/catalog/catalog.json \
+   --frames-dir uploads/ \
+   --output-path outputs/matches.json
+ ```
+
+ ## Deploy on Hugging Face Spaces
+ 1. Copy this folder to a new Space (Python SDK).
+ 2. Install dependencies with `pip install -e .`.
+ 3. Upload a pre-built `catalog/catalog.json` plus the `references/` images.
+ 4. Set environment variables in the Space:
+    - `FRAME_CATALOG=/home/user/app/catalog/catalog.json`
+    - `FRAME_OUTPUT_DIR=/home/user/app/output`
+ 5. Set the Space entrypoint to `python -m frame_extraction.app`.
+
+ ## Outputs
+ - `catalog.json`: character reference metadata with embeddings and chosen frames.
+ - `references/`: cropped reference images per character.
+ - `matches.json`: mapping from user frames to character IDs with similarity scores.
+
+ ## Roadmap
+ - Integrate more robust trackers (DeepSORT/ByteTrack).
+ - Add an active learning loop for manual character corrections.
+ - Expose REST endpoints for automated ingestion.
frame_extraction/src/frame_extraction.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,18 @@
+ README.md
+ pyproject.toml
+ src/frame_extraction/__init__.py
+ src/frame_extraction/app.py
+ src/frame_extraction/catalog.py
+ src/frame_extraction/cli.py
+ src/frame_extraction/clustering.py
+ src/frame_extraction/config.py
+ src/frame_extraction/face.py
+ src/frame_extraction/matcher.py
+ src/frame_extraction/quality.py
+ src/frame_extraction/video.py
+ src/frame_extraction.egg-info/PKG-INFO
+ src/frame_extraction.egg-info/SOURCES.txt
+ src/frame_extraction.egg-info/dependency_links.txt
+ src/frame_extraction.egg-info/entry_points.txt
+ src/frame_extraction.egg-info/requires.txt
+ src/frame_extraction.egg-info/top_level.txt
frame_extraction/src/frame_extraction.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
+
frame_extraction/src/frame_extraction.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ frame-catalog = frame_extraction.cli:app
frame_extraction/src/frame_extraction.egg-info/requires.txt ADDED
@@ -0,0 +1,13 @@
+ typer>=0.12
+ rich>=13.7
+ numpy>=1.24
+ pandas>=2.1
+ opencv-python>=4.8
+ torch>=2.1
+ torchvision>=0.16
+ facenet-pytorch>=2.5.3
+ scikit-learn>=1.4
+ Pillow>=10.0
+ tqdm>=4.66
+ gradio>=4.0
+ faiss-cpu>=1.7.4
frame_extraction/src/frame_extraction.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ frame_extraction
frame_extraction/src/frame_extraction/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """Frame extraction and character matching utilities."""
+
+ from .config import CatalogConfig, MatchConfig
+ from .catalog import build_catalog
+ from .matcher import match_frames
+
+ __all__ = ["CatalogConfig", "MatchConfig", "build_catalog", "match_frames"]
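The exports above are enough to drive the pipeline from Python instead of the CLI. A minimal sketch under that assumption (paths are placeholders):

```python
from pathlib import Path

from frame_extraction import CatalogConfig, MatchConfig, build_catalog, match_frames

# Build the reference catalog once per video.
catalog_path = build_catalog(
    CatalogConfig(video_path=Path("data/source.mp4"), output_dir=Path("outputs/catalog"))
)

# Then match any directory of frames against it.
match_frames(
    MatchConfig(
        catalog_path=catalog_path,
        frames_dir=Path("uploads"),
        output_path=Path("outputs/matches.json"),
    )
)
```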
frame_extraction/src/frame_extraction/app.py ADDED
@@ -0,0 +1,75 @@
+ from __future__ import annotations
+
+ import json
+ import os
+ from pathlib import Path
+ from typing import Any
+
+ import gradio as gr
+ from PIL import Image
+
+ from .config import MatchConfig
+ from .matcher import match_frames
+
+
+ CATALOG_PATH = Path(os.getenv("FRAME_CATALOG", "catalog/catalog.json"))
+ OUTPUT_DIR = Path(os.getenv("FRAME_OUTPUT_DIR", "app_outputs"))
+
+
+ def load_catalog() -> dict[str, Any] | None:
+     path = Path(CATALOG_PATH)
+     if path.exists():
+         return json.loads(path.read_text(encoding="utf-8"))
+     return None
+
+
+ catalog_cache = load_catalog()
+
+
+ def predict(files: list[str]) -> tuple[list[dict[str, Any]], list[tuple[str, str]]]:
+     if catalog_cache is None:
+         raise gr.Error("Catalog not found. Upload catalog.json or set FRAME_CATALOG.")
+
+     if not files:
+         raise gr.Error("Please upload at least one frame.")
+
+     frames_dir = OUTPUT_DIR / "inputs"
+     frames_dir.mkdir(parents=True, exist_ok=True)
+     for idx, file_path in enumerate(files):
+         Image.open(file_path).convert("RGB").save(frames_dir / f"upload_{idx:03d}.png")
+
+     output_path = OUTPUT_DIR / "matches.json"
+     cfg = MatchConfig(
+         catalog_path=Path(CATALOG_PATH),
+         frames_dir=frames_dir,
+         output_path=output_path,
+         top_k=1,
+         similarity_threshold=0.5,
+     )
+     match_frames(cfg)
+     data = json.loads(output_path.read_text(encoding="utf-8"))
+     gallery_items = [
+         (item["reference_crop"], f"{item['character_id']} ({item['similarity']:.2f})")
+         for item in data
+     ]
+     return data, gallery_items
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# Character Reference Matcher")
+     with gr.Row():
+         # gr.Image does not take multiple uploads; gr.File with file_count="multiple" does.
+         image_input = gr.File(file_count="multiple", file_types=["image"], label="Upload frames")
+         submit = gr.Button("Match Characters")
+     matches_json = gr.JSON(label="Matches")
+     gallery = gr.Gallery(label="Reference Thumbnails", columns=2)  # .style() was removed in Gradio 4
+
+     submit.click(predict, inputs=image_input, outputs=[matches_json, gallery])
+
+
+ def main() -> None:
+     demo.launch()
+
+
+ if __name__ == "__main__":
+     main()
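One deployment note: if the Space runs this module as a plain process (`python -m frame_extraction.app`, as the README suggests), Gradio's default launch binds to 127.0.0.1 and the Space proxy may not reach it. A hedged alternative `main`, using standard `gradio` launch parameters:

```python
def main() -> None:
    # 0.0.0.0:7860 is the address a Hugging Face Space expects a web app to listen on.
    demo.launch(server_name="0.0.0.0", server_port=7860)
```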
frame_extraction/src/frame_extraction/catalog.py ADDED
@@ -0,0 +1,138 @@
+ from __future__ import annotations
+
+ import json
+ from dataclasses import asdict, dataclass
+ from pathlib import Path
+
+ import cv2
+ import numpy as np
+ from rich.console import Console
+
+ from .clustering import Clusterer
+ from .config import CatalogConfig
+ from .face import FaceDetector, FaceEmbedder, crop_faces, cv2_to_rgb
+ from .quality import laplacian_sharpness
+ from .video import iter_frames
+
+ console = Console()
+
+
+ @dataclass(slots=True)
+ class FaceRecord:
+     character_id: str
+     movie: str
+     frame_index: int
+     frame_path: str
+     bbox: list[float]
+     detection_score: float
+     sharpness: float
+     embedding: list[float]
+     reference_path: str | None = None
+
+
+ def build_catalog(cfg: CatalogConfig) -> Path:
+     cfg.ensure()
+     detector = FaceDetector(min_face_size=cfg.mtcnn_min_face_size)
+     embedder = FaceEmbedder(batch_size=cfg.embed_batch_size)
+     clusterer = Clusterer(eps=cfg.cluster_eps, min_samples=cfg.cluster_min_samples)
+
+     frames_dir = cfg.output_dir / "frames"
+     faces_dir = cfg.output_dir / "faces"
+     references_dir = cfg.output_dir / "references"
+
+     face_records: list[FaceRecord] = []
+     embeddings: list[np.ndarray] = []
+
+     video_name = cfg.video_path.stem
+
+     for frame_idx, frame in iter_frames(cfg.video_path, cfg.frame_interval):
+         # OpenCV decodes BGR; detection, cropping, and embedding all expect RGB.
+         frame_rgb = cv2_to_rgb(frame)
+         frame_path = frames_dir / f"frame_{frame_idx:06d}.jpg"
+         cv2.imwrite(str(frame_path), frame)  # imwrite expects BGR, so write the raw frame
+
+         boxes, scores = detector.detect(frame_rgb)
+         if boxes.size == 0:
+             continue
+
+         crops = crop_faces(frame_rgb, boxes)
+         if not crops:
+             continue
+
+         crop_embeddings = embedder.embed(crops)
+         embeddings.append(crop_embeddings)
+         for idx, (crop, emb, box, score) in enumerate(zip(crops, crop_embeddings, boxes, scores, strict=False)):
+             crop_path = faces_dir / f"{frame_idx:06d}_{idx}.jpg"
+             crop.save(crop_path)
+             sharpness = laplacian_sharpness(np.asarray(crop))
+             face_records.append(
+                 FaceRecord(
+                     character_id="",  # placeholder until clustering
+                     movie=video_name,
+                     frame_index=frame_idx,
+                     frame_path=str(frame_path),
+                     bbox=box.tolist(),
+                     detection_score=float(score),
+                     sharpness=sharpness,
+                     embedding=emb.tolist(),
+                 )
+             )
+
+     if not face_records:
+         raise RuntimeError("No faces detected in video.")
+
+     all_embeddings = np.vstack(embeddings)
+     labels = clusterer.cluster(all_embeddings)
+
+     label_counter: dict[int, int] = {}
+     for label in labels:
+         label_counter[label] = label_counter.get(label, 0) + 1
+
+     for record, label in zip(face_records, labels, strict=False):
+         if label < 0 or label_counter[label] < cfg.min_track_length:
+             record.character_id = f"{video_name}_char_noise"
+         else:
+             record.character_id = f"{video_name}_char_{label:03d}"
+
+     references: dict[str, FaceRecord] = {}
+     for record in face_records:
+         if record.character_id.endswith("char_noise"):
+             continue
+         current = references.get(record.character_id)
+         score = record.sharpness if cfg.reference_metric == "sharpness" else record.detection_score
+         if current is None:
+             references[record.character_id] = record
+         else:
+             current_score = current.sharpness if cfg.reference_metric == "sharpness" else current.detection_score
+             if score > current_score:
+                 references[record.character_id] = record
+
+     for record in references.values():
+         frame = cv2.imread(record.frame_path)
+         h, w = frame.shape[:2]
+         # Clamp the box to the frame so slicing never yields an empty crop.
+         x1, y1, x2, y2 = map(int, record.bbox)
+         x1, y1 = max(0, x1), max(0, y1)
+         x2, y2 = min(w, x2), min(h, y2)
+         crop = frame[y1:y2, x1:x2]
+         output_path = references_dir / f"{record.character_id}.jpg"
+         cv2.imwrite(str(output_path), crop)
+         record.reference_path = str(output_path)
+
+     catalog = {
+         "video": video_name,
+         "config": {
+             "frame_interval": cfg.frame_interval,
+             "min_track_length": cfg.min_track_length,
+             "reference_metric": cfg.reference_metric,
+         },
+         "characters": [asdict(record) for record in face_records if not record.character_id.endswith("char_noise")],
+         "references": [asdict(record) for record in references.values()],
+     }
+
+     catalog_path = cfg.output_dir / "catalog.json"
+     with catalog_path.open("w", encoding="utf-8") as f:
+         json.dump(catalog, f, indent=2)
+
+     console.log(f"Wrote catalog to {catalog_path}")
+     return catalog_path
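As a rough sanity check on catalog size: with the default `frame_interval=12`, a 24 fps source is sampled at 2 frames per second. A small sketch of that arithmetic (frame rate and duration are illustrative):

```python
fps = 24.0            # illustrative source frame rate
duration_s = 90 * 60  # illustrative 90-minute video
frame_interval = 12   # default CatalogConfig value

sampled = int(fps * duration_s / frame_interval)
print(sampled)  # 10800 candidate frames fed to face detection
```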
frame_extraction/src/frame_extraction/cli.py ADDED
@@ -0,0 +1,56 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ import typer
+
+ from .catalog import build_catalog
+ from .config import CatalogConfig, MatchConfig
+ from .matcher import match_frames
+
+ app = typer.Typer(add_completion=False)
+
+
+ # Flags mirror the README usage: `--video-path` / `--catalog-path` are options, not positionals.
+ @app.command()
+ def catalog(
+     video_path: Path = typer.Option(..., "--video-path", "-v", help="Input video file"),
+     output_dir: Path = typer.Option(..., "--output-dir", "-o", help="Directory to store catalog outputs"),
+     frame_interval: int = typer.Option(12, help="Sample every Nth frame"),
+     min_track_length: int = typer.Option(5, help="Minimum detections per cluster"),
+     cluster_eps: float = typer.Option(0.55, help="DBSCAN epsilon for clustering"),
+     cluster_min_samples: int = typer.Option(3, help="Minimum samples for DBSCAN"),
+ ) -> None:
+     """Build a character reference catalog from a video."""
+     cfg = CatalogConfig(
+         video_path=video_path,
+         output_dir=output_dir,
+         frame_interval=frame_interval,
+         min_track_length=min_track_length,
+         cluster_eps=cluster_eps,
+         cluster_min_samples=cluster_min_samples,
+     )
+     build_catalog(cfg)
+
+
+ @app.command()
+ def match(
+     catalog_path: Path = typer.Option(..., "--catalog-path", "-c", help="Catalog JSON path"),
+     frames_dir: Path = typer.Option(..., "--frames-dir", "-f", help="Directory with input frames"),
+     output_path: Path = typer.Option(..., "--output-path", "-o", help="Where to write matches JSON"),
+     top_k: int = typer.Option(1, help="Return up to K matches per face"),
+     similarity_threshold: float = typer.Option(0.5, help="Minimum cosine similarity to accept a match"),
+ ) -> None:
+     """Match new frames against an existing catalog."""
+     cfg = MatchConfig(
+         catalog_path=catalog_path,
+         frames_dir=frames_dir,
+         output_path=output_path,
+         top_k=top_k,
+         similarity_threshold=similarity_threshold,
+     )
+     match_frames(cfg)
+
+
+ if __name__ == "__main__":
+     app()
frame_extraction/src/frame_extraction/clustering.py ADDED
@@ -0,0 +1,20 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+
+ import numpy as np
+ from sklearn.cluster import DBSCAN
+
+
+ @dataclass(slots=True)
+ class Clusterer:
+     eps: float = 0.55
+     min_samples: int = 3
+
+     def cluster(self, embeddings: np.ndarray) -> np.ndarray:
+         if len(embeddings) == 0:
+             return np.empty((0,), dtype=int)
+         # Cosine distance groups embeddings of the same face regardless of norm;
+         # DBSCAN labels outliers -1, which downstream code treats as noise.
+         model = DBSCAN(eps=self.eps, min_samples=self.min_samples, metric="cosine")
+         return model.fit_predict(embeddings)
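A quick illustration of the noise convention on toy 2-D vectors (values are made up; real embeddings are 512-d):

```python
import numpy as np

from frame_extraction.clustering import Clusterer

emb = np.array([
    [1.0, 0.0], [0.99, 0.01],  # two near-duplicate directions: character A
    [0.0, 1.0], [0.01, 0.99],  # character B
    [0.7, 0.7],                # lone outlier with no close neighbor
], dtype=np.float32)

labels = Clusterer(eps=0.1, min_samples=2).cluster(emb)
print(labels)  # e.g. [ 0  0  1  1 -1]; -1 marks noise, which the catalog drops
```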
frame_extraction/src/frame_extraction/config.py ADDED
@@ -0,0 +1,43 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Literal
+
+
+ @dataclass(slots=True)
+ class CatalogConfig:
+     video_path: Path
+     output_dir: Path
+     frame_interval: int = 12
+     mtcnn_min_face_size: int = 60
+     min_track_length: int = 5
+     embed_batch_size: int = 16
+     cluster_eps: float = 0.55
+     cluster_min_samples: int = 3
+     reference_metric: Literal["sharpness", "confidence"] = "sharpness"
+
+     def ensure(self) -> None:
+         """Coerce paths and create the output directory layout."""
+         self.video_path = Path(self.video_path)
+         self.output_dir = Path(self.output_dir)
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+         (self.output_dir / "frames").mkdir(exist_ok=True)
+         (self.output_dir / "faces").mkdir(exist_ok=True)
+         (self.output_dir / "references").mkdir(exist_ok=True)
+
+
+ @dataclass(slots=True)
+ class MatchConfig:
+     catalog_path: Path
+     frames_dir: Path
+     output_path: Path
+     top_k: int = 1
+     similarity_threshold: float = 0.5
+
+     def ensure(self) -> None:
+         """Coerce paths and create the output file's parent directory."""
+         self.catalog_path = Path(self.catalog_path)
+         self.frames_dir = Path(self.frames_dir)
+         self.output_path = Path(self.output_path)
+         self.output_path.parent.mkdir(parents=True, exist_ok=True)
frame_extraction/src/frame_extraction/face.py ADDED
@@ -0,0 +1,88 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Iterable, List
+
+ import numpy as np
+ import torch
+ from facenet_pytorch import InceptionResnetV1, MTCNN
+ from PIL import Image
+
+
+ @dataclass
+ class FaceDetector:
+     device: str = "cuda" if torch.cuda.is_available() else "cpu"
+     min_face_size: int = 60
+     model: MTCNN = field(init=False)
+
+     def __post_init__(self) -> None:
+         self.model = MTCNN(
+             keep_all=True,
+             device=self.device,
+             min_face_size=self.min_face_size,
+             post_process=False,
+         )
+
+     def detect(self, image: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+         """Detect faces in an RGB image; returns (boxes, scores), possibly empty."""
+         boxes, probs = self.model.detect(Image.fromarray(image))
+         if boxes is None or probs is None:
+             return np.empty((0, 4), dtype=np.float32), np.empty((0,), dtype=np.float32)
+         return boxes.astype(np.float32), probs.astype(np.float32)
+
+
+ @dataclass
+ class FaceEmbedder:
+     device: str = "cuda" if torch.cuda.is_available() else "cpu"
+     batch_size: int = 16
+     model: InceptionResnetV1 = field(init=False)
+
+     def __post_init__(self) -> None:
+         self.model = InceptionResnetV1(pretrained="vggface2").eval().to(self.device)
+
+     @torch.no_grad()
+     def embed(self, crops: Iterable[Image.Image]) -> np.ndarray:
+         embeddings: List[np.ndarray] = []
+         batch: List[torch.Tensor] = []
+         for crop in crops:
+             batch.append(preprocess_crop(crop))
+             if len(batch) == self.batch_size:
+                 embeddings.append(self._forward(batch))
+                 batch.clear()
+         if batch:
+             embeddings.append(self._forward(batch))
+         if not embeddings:
+             return np.empty((0, 512), dtype=np.float32)
+         return np.vstack(embeddings)
+
+     def _forward(self, batch: List[torch.Tensor]) -> np.ndarray:
+         stacked = torch.stack(batch).to(self.device)
+         return self.model(stacked).cpu().numpy().astype(np.float32)
+
+
+ def preprocess_crop(crop: Image.Image) -> torch.Tensor:
+     """Resize to 160x160 and normalize like facenet-pytorch's fixed_image_standardization."""
+     crop = crop.resize((160, 160))
+     tensor = torch.from_numpy(np.asarray(crop).astype(np.float32))
+     tensor = tensor.permute(2, 0, 1)
+     return (tensor - 127.5) / 128.0
+
+
+ def crop_faces(image: np.ndarray, boxes: np.ndarray, margin: float = 0.3) -> list[Image.Image]:
+     """Crop detected faces (with margin) from an RGB image, clamped to frame bounds."""
+     h, w = image.shape[:2]
+     crops: list[Image.Image] = []
+     for x1, y1, x2, y2 in boxes:
+         w_box = x2 - x1
+         h_box = y2 - y1
+         x1m = max(0, int(x1 - margin * w_box))
+         y1m = max(0, int(y1 - margin * h_box))
+         x2m = min(w, int(x2 + margin * w_box))
+         y2m = min(h, int(y2 + margin * h_box))
+         crops.append(Image.fromarray(np.ascontiguousarray(image[y1m:y2m, x1m:x2m])))
+     return crops
+
+
+ def cv2_to_rgb(image: np.ndarray) -> np.ndarray:
+     """Convert an OpenCV BGR image to RGB by reversing the channel axis."""
+     return np.ascontiguousarray(image[:, :, ::-1])
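Taken together, a minimal sketch of the single-image path these helpers implement (the input path is a placeholder):

```python
import cv2

from frame_extraction.face import FaceDetector, FaceEmbedder, crop_faces, cv2_to_rgb

image_rgb = cv2_to_rgb(cv2.imread("some_frame.jpg"))  # OpenCV decodes BGR

boxes, scores = FaceDetector().detect(image_rgb)  # (N, 4) boxes, (N,) confidences
crops = crop_faces(image_rgb, boxes)              # margin-padded PIL crops
embeddings = FaceEmbedder().embed(crops)          # (N, 512) float32
print(boxes.shape, embeddings.shape)
```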
frame_extraction/src/frame_extraction/matcher.py ADDED
@@ -0,0 +1,80 @@
+ from __future__ import annotations
+
+ import json
+ from pathlib import Path
+ from typing import Any
+
+ import cv2
+ import faiss
+ import numpy as np
+ from rich.console import Console
+
+ from .config import MatchConfig
+ from .face import FaceDetector, FaceEmbedder, crop_faces, cv2_to_rgb
+
+ console = Console()
+
+
+ def load_catalog(catalog_path: Path) -> tuple[np.ndarray, list[dict[str, Any]]]:
+     data = json.loads(catalog_path.read_text(encoding="utf-8"))
+     references = data.get("references", [])
+     embeddings = np.array([ref["embedding"] for ref in references], dtype=np.float32)
+     return embeddings, references
+
+
+ def build_index(embeddings: np.ndarray) -> faiss.IndexFlatIP:
+     if embeddings.size == 0:
+         raise RuntimeError("Catalog has no reference embeddings.")
+     # L2-normalize so inner product equals cosine similarity.
+     norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-8
+     normalized = embeddings / norms
+     index = faiss.IndexFlatIP(embeddings.shape[1])
+     index.add(normalized)
+     return index
+
+
+ def match_frames(cfg: MatchConfig) -> Path:
+     cfg.ensure()
+     embeddings, references = load_catalog(cfg.catalog_path)
+     index = build_index(embeddings)
+     embedder = FaceEmbedder()
+     detector = FaceDetector()
+
+     matches: list[dict[str, Any]] = []
+     image_paths = sorted(p for p in cfg.frames_dir.glob("*") if p.suffix.lower() in {".png", ".jpg", ".jpeg"})
+
+     for image_path in image_paths:
+         image_bgr = cv2.imread(str(image_path))
+         if image_bgr is None:  # unreadable or corrupt file
+             continue
+         image_rgb = cv2_to_rgb(image_bgr)
+         boxes, _scores = detector.detect(image_rgb)
+         if boxes.size == 0:
+             continue
+         crops = crop_faces(image_rgb, boxes)
+         face_embeddings = embedder.embed(crops)
+         if face_embeddings.size == 0:
+             continue
+         norms = np.linalg.norm(face_embeddings, axis=1, keepdims=True) + 1e-8
+         normalized = face_embeddings / norms
+         scores, idxs = index.search(normalized, cfg.top_k)
+         for face_idx, (box, score_row, idx_row) in enumerate(zip(boxes, scores, idxs, strict=False)):
+             for score, idx in zip(score_row, idx_row, strict=False):
+                 if idx < 0 or score < cfg.similarity_threshold:  # faiss pads missing results with -1
+                     continue
+                 reference = references[int(idx)]
+                 matches.append(
+                     {
+                         "input_frame": str(image_path),
+                         "face_index": face_idx,
+                         "bbox": box.tolist(),
+                         "similarity": float(score),
+                         "character_id": reference["character_id"],
+                         "reference_frame": reference["frame_path"],
+                         "reference_crop": reference["reference_path"],
+                     }
+                 )
+
+     cfg.output_path.write_text(json.dumps(matches, indent=2), encoding="utf-8")
+     console.log(f"Wrote matches to {cfg.output_path}")
+     return cfg.output_path
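Why `IndexFlatIP` over L2-normalized rows ranks by cosine similarity: for unit vectors the inner product equals the cosine of the angle between them. A tiny check with toy vectors:

```python
import numpy as np

a = np.array([3.0, 4.0])
b = np.array([4.0, 3.0])

cosine = a @ b / (np.linalg.norm(a) * np.linalg.norm(b))     # 24 / 25 = 0.96
unit_ip = (a / np.linalg.norm(a)) @ (b / np.linalg.norm(b))  # inner product of unit vectors

assert np.isclose(cosine, unit_ip)  # identical, so IP search ranks by cosine
```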
frame_extraction/src/frame_extraction/quality.py ADDED
@@ -0,0 +1,16 @@
+ from __future__ import annotations
+
+ import cv2
+ import numpy as np
+
+
+ def laplacian_sharpness(image: np.ndarray) -> float:
+     """Variance of the Laplacian over an RGB image; higher means sharper."""
+     gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+     return float(cv2.Laplacian(gray, cv2.CV_64F).var())
+
+
+ def brightness_score(image: np.ndarray) -> float:
+     """Mean gray level of an RGB image, scaled to [0, 1]."""
+     gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+     return float(gray.mean() / 255.0)
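A small demonstration that the metric behaves as a sharpness ranking (synthetic image, illustrative parameters):

```python
import cv2
import numpy as np

from frame_extraction.quality import laplacian_sharpness

rng = np.random.default_rng(0)
sharp = rng.integers(0, 256, size=(160, 160, 3), dtype=np.uint8)  # high-frequency noise image
blurry = cv2.GaussianBlur(sharp, (9, 9), 0)                       # low-pass filtered copy

# Blurring removes high frequencies, so the Laplacian variance drops.
assert laplacian_sharpness(sharp) > laplacian_sharpness(blurry)
```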
frame_extraction/src/frame_extraction/video.py ADDED
@@ -0,0 +1,33 @@
+ from __future__ import annotations
+
+ import math
+ from pathlib import Path
+ from typing import Iterator
+
+ import cv2
+ import numpy as np
+ from tqdm import tqdm
+
+
+ def iter_frames(video_path: str | Path, frame_interval: int = 12) -> Iterator[tuple[int, np.ndarray]]:
+     """Yield (frame_index, BGR frame) for every `frame_interval`-th frame of the video."""
+     cap = cv2.VideoCapture(str(video_path))
+     if not cap.isOpened():
+         raise RuntimeError(f"Could not open video: {video_path}")
+
+     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+     interval = max(1, frame_interval)
+
+     try:
+         with tqdm(total=math.ceil(total_frames / interval), desc="Extracting frames") as pbar:
+             frame_idx = 0
+             while True:
+                 ret, frame = cap.read()
+                 if not ret:
+                     break
+                 if frame_idx % interval == 0:
+                     yield frame_idx, frame
+                     pbar.update(1)
+                 frame_idx += 1
+     finally:
+         cap.release()  # runs even if the consumer stops iterating early
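A minimal usage sketch for the sampler (the path is a placeholder):

```python
from frame_extraction.video import iter_frames

# Yields every 12th decodable frame as an OpenCV BGR array.
for frame_idx, frame in iter_frames("data/source.mp4", frame_interval=12):
    print(frame_idx, frame.shape)  # e.g. 0 (1080, 1920, 3)
```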
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ typer>=0.12
+ rich>=13.7
+ numpy>=1.24
+ pandas>=2.1
+ opencv-python-headless>=4.8
+ torch==2.2.2
+ torchvision==0.17.2
+ facenet-pytorch>=2.6.0
+ scikit-learn>=1.7
+ Pillow>=10.0
+ tqdm>=4.66
+ gradio>=5.0
+ faiss-cpu>=1.12.0