Deploy frame extraction matcher
- app.py +4 -0
- catalog/README.md +3 -0
- frame_extraction/.gitignore +6 -0
- frame_extraction/README.md +54 -0
- frame_extraction/pyproject.toml +28 -0
- frame_extraction/src/frame_extraction.egg-info/PKG-INFO +74 -0
- frame_extraction/src/frame_extraction.egg-info/SOURCES.txt +18 -0
- frame_extraction/src/frame_extraction.egg-info/dependency_links.txt +1 -0
- frame_extraction/src/frame_extraction.egg-info/entry_points.txt +2 -0
- frame_extraction/src/frame_extraction.egg-info/requires.txt +13 -0
- frame_extraction/src/frame_extraction.egg-info/top_level.txt +1 -0
- frame_extraction/src/frame_extraction/__init__.py +7 -0
- frame_extraction/src/frame_extraction/app.py +74 -0
- frame_extraction/src/frame_extraction/catalog.py +136 -0
- frame_extraction/src/frame_extraction/cli.py +57 -0
- frame_extraction/src/frame_extraction/clustering.py +20 -0
- frame_extraction/src/frame_extraction/config.py +41 -0
- frame_extraction/src/frame_extraction/face.py +88 -0
- frame_extraction/src/frame_extraction/matcher.py +78 -0
- frame_extraction/src/frame_extraction/quality.py +14 -0
- frame_extraction/src/frame_extraction/video.py +32 -0
- requirements.txt +13 -0
app.py
ADDED
@@ -0,0 +1,4 @@
+from frame_extraction.app import main
+
+if __name__ == "__main__":
+    main()
catalog/README.md
ADDED
@@ -0,0 +1,3 @@
+# Catalog Placeholder
+
+Upload your generated `catalog.json` and `references/` images here before deploying the Space. Update the `FRAME_CATALOG` environment variable accordingly.
frame_extraction/.gitignore
ADDED
@@ -0,0 +1,6 @@
+.venv/
+outputs/
+app_outputs/
+dummy_frames/
+catalog/
+__pycache__/
frame_extraction/README.md
ADDED
@@ -0,0 +1,54 @@
+# Frame Extraction & Character Matching
+
+This package turns raw video into character reference catalogs and lets you match new frames against those references. It is designed to be deployed quickly (e.g., on Hugging Face Spaces) for interactive character discovery.
+
+## Features
+- Shot-aware frame sampling to keep only useful stills.
+- Face detection, embedding, and clustering (MTCNN + InceptionResnetV1).
+- Automatic reference selection per character (sharpest, most frontal crop).
+- JSON catalog output and optional reference thumbnails.
+- Matching API/CLI for user-uploaded frames with multi-character support.
+- Gradio app template ready for Hugging Face hosting.
+
+## Install
+```bash
+cd projects/UMO-Qwen-Edit/data_curation_scripts/frame_extraction
+pip install -e .
+```
+
+## CLI Usage
+### Build a catalog from a video
+```bash
+frame-catalog catalog \
+  data/source.mp4 \
+  --output-dir outputs/catalog \
+  --frame-interval 12 \
+  --min-track-length 5
+```
+
+### Match new frames against the catalog
+```bash
+frame-catalog match \
+  outputs/catalog/catalog.json \
+  --frames-dir uploads/ \
+  --output-path outputs/matches.json
+```
+
+## Deploy on Hugging Face Spaces
+1. Copy this folder to a new Space (Python SDK).
+2. Install dependencies with `pip install -e .`.
+3. Upload a pre-built `catalog/catalog.json` plus the `references/` images.
+4. Set environment variables in the Space:
+   - `FRAME_CATALOG=/home/user/app/catalog/catalog.json`
+   - `FRAME_OUTPUT_DIR=/home/user/app/output`
+5. Set the Space entrypoint to `python -m frame_extraction.app`.
+
+## Outputs
+- `catalog.json`: character reference metadata with embeddings and chosen frames.
+- `references/`: cropped reference images per character.
+- `matches.json`: mapping from user frames to character IDs with similarity scores.
+
+## Roadmap
+- Integrate more robust trackers (DeepSort/ByteTrack).
+- Add active learning loop for manual character corrections.
+- Expose REST endpoints for automated ingestion.
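The `matches.json` described under Outputs is a flat list of per-face hits (see `matcher.py` below for the exact fields). A minimal sketch of consuming it, with an illustrative path:

```python
import json
from collections import defaultdict
from pathlib import Path

# Group matcher hits by character; "outputs/matches.json" is the path from the CLI example above.
matches = json.loads(Path("outputs/matches.json").read_text(encoding="utf-8"))
by_character = defaultdict(list)
for hit in matches:
    by_character[hit["character_id"]].append((hit["input_frame"], hit["similarity"]))
for character_id, hits in sorted(by_character.items()):
    print(character_id, len(hits), "matching faces")
```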
frame_extraction/pyproject.toml
ADDED
@@ -0,0 +1,28 @@
+[project]
+name = "frame-extraction"
+version = "0.1.0"
+description = "Character-centric frame extraction and matching pipeline"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "typer>=0.12",
+    "rich>=13.7",
+    "numpy>=1.24",
+    "pandas>=2.1",
+    "opencv-python>=4.8",
+    "torch>=2.1",
+    "torchvision>=0.16",
+    "facenet-pytorch>=2.5.3",
+    "scikit-learn>=1.4",
+    "Pillow>=10.0",
+    "tqdm>=4.66",
+    "gradio>=4.0",
+    "faiss-cpu>=1.7.4",
+]
+
+[project.scripts]
+frame-catalog = "frame_extraction.cli:app"
+
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
frame_extraction/src/frame_extraction.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,74 @@
+Metadata-Version: 2.4
+Name: frame-extraction
+Version: 0.1.0
+Summary: Character-centric frame extraction and matching pipeline
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: typer>=0.12
+Requires-Dist: rich>=13.7
+Requires-Dist: numpy>=1.24
+Requires-Dist: pandas>=2.1
+Requires-Dist: opencv-python>=4.8
+Requires-Dist: torch>=2.1
+Requires-Dist: torchvision>=0.16
+Requires-Dist: facenet-pytorch>=2.5.3
+Requires-Dist: scikit-learn>=1.4
+Requires-Dist: Pillow>=10.0
+Requires-Dist: tqdm>=4.66
+Requires-Dist: gradio>=4.0
+Requires-Dist: faiss-cpu>=1.7.4
+
+# Frame Extraction & Character Matching
+
+This package turns raw video into character reference catalogs and lets you match new frames against those references. It is designed to be deployed quickly (e.g., on Hugging Face Spaces) for interactive character discovery.
+
+## Features
+- Shot-aware frame sampling to keep only useful stills.
+- Face detection, embedding, and clustering (MTCNN + InceptionResnetV1).
+- Automatic reference selection per character (sharpest, most frontal crop).
+- JSON catalog output and optional reference thumbnails.
+- Matching API/CLI for user-uploaded frames with multi-character support.
+- Gradio app template ready for Hugging Face hosting.
+
+## Install
+```bash
+cd projects/UMO-Qwen-Edit/data_curation_scripts/frame_extraction
+pip install -e .
+```
+
+## CLI Usage
+### Build a catalog from a video
+```bash
+frame-catalog catalog \
+  data/source.mp4 \
+  --output-dir outputs/catalog \
+  --frame-interval 12 \
+  --min-track-length 5
+```
+
+### Match new frames against the catalog
+```bash
+frame-catalog match \
+  outputs/catalog/catalog.json \
+  --frames-dir uploads/ \
+  --output-path outputs/matches.json
+```
+
+## Deploy on Hugging Face Spaces
+1. Copy this folder to a new Space (Python SDK).
+2. Install dependencies with `pip install -e .`.
+3. Upload a pre-built `catalog/catalog.json` plus the `references/` images.
+4. Set environment variables in the Space:
+   - `FRAME_CATALOG=/home/user/app/catalog/catalog.json`
+   - `FRAME_OUTPUT_DIR=/home/user/app/output`
+5. Set the Space entrypoint to `python -m frame_extraction.app`.
+
+## Outputs
+- `catalog.json`: character reference metadata with embeddings and chosen frames.
+- `references/`: cropped reference images per character.
+- `matches.json`: mapping from user frames to character IDs with similarity scores.
+
+## Roadmap
+- Integrate more robust trackers (DeepSort/ByteTrack).
+- Add active learning loop for manual character corrections.
+- Expose REST endpoints for automated ingestion.
frame_extraction/src/frame_extraction.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,18 @@
+README.md
+pyproject.toml
+src/frame_extraction/__init__.py
+src/frame_extraction/app.py
+src/frame_extraction/catalog.py
+src/frame_extraction/cli.py
+src/frame_extraction/clustering.py
+src/frame_extraction/config.py
+src/frame_extraction/face.py
+src/frame_extraction/matcher.py
+src/frame_extraction/quality.py
+src/frame_extraction/video.py
+src/frame_extraction.egg-info/PKG-INFO
+src/frame_extraction.egg-info/SOURCES.txt
+src/frame_extraction.egg-info/dependency_links.txt
+src/frame_extraction.egg-info/entry_points.txt
+src/frame_extraction.egg-info/requires.txt
+src/frame_extraction.egg-info/top_level.txt
frame_extraction/src/frame_extraction.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
+
frame_extraction/src/frame_extraction.egg-info/entry_points.txt
ADDED
@@ -0,0 +1,2 @@
+[console_scripts]
+frame-catalog = frame_extraction.cli:app
frame_extraction/src/frame_extraction.egg-info/requires.txt
ADDED
@@ -0,0 +1,13 @@
+typer>=0.12
+rich>=13.7
+numpy>=1.24
+pandas>=2.1
+opencv-python>=4.8
+torch>=2.1
+torchvision>=0.16
+facenet-pytorch>=2.5.3
+scikit-learn>=1.4
+Pillow>=10.0
+tqdm>=4.66
+gradio>=4.0
+faiss-cpu>=1.7.4
frame_extraction/src/frame_extraction.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+frame_extraction
frame_extraction/src/frame_extraction/__init__.py
ADDED
@@ -0,0 +1,7 @@
+"""Frame extraction and character matching utilities."""
+
+from .config import CatalogConfig, MatchConfig
+from .catalog import build_catalog
+from .matcher import match_frames
+
+__all__ = ["CatalogConfig", "MatchConfig", "build_catalog", "match_frames"]
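These exports are enough to run the whole pipeline without the CLI. A minimal sketch, with illustrative video and output paths:

```python
from pathlib import Path

from frame_extraction import CatalogConfig, MatchConfig, build_catalog, match_frames

# Build a catalog from a video, then match a directory of frames against it.
catalog_path = build_catalog(
    CatalogConfig(video_path=Path("data/source.mp4"), output_dir=Path("outputs/catalog"))
)
match_frames(
    MatchConfig(
        catalog_path=catalog_path,
        frames_dir=Path("uploads"),
        output_path=Path("outputs/matches.json"),
    )
)
```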
frame_extraction/src/frame_extraction/app.py
ADDED
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+import gradio as gr
+from PIL import Image
+
+from .config import MatchConfig
+from .matcher import match_frames
+
+
+CATALOG_PATH = Path(os.getenv("FRAME_CATALOG", "catalog/catalog.json"))
+OUTPUT_DIR = Path(os.getenv("FRAME_OUTPUT_DIR", "app_outputs"))
+
+
+def load_catalog() -> dict[str, Any] | None:
+    path = Path(CATALOG_PATH)
+    if path.exists():
+        return json.loads(path.read_text(encoding="utf-8"))
+    return None
+
+
+catalog_cache = load_catalog()
+
+
+def predict(files: list[str]) -> tuple[list[dict[str, Any]], list[tuple[str, str]]]:
+    if catalog_cache is None:
+        raise gr.Error("Catalog not found. Upload catalog.json or set FRAME_CATALOG.")
+
+    if not files:
+        raise gr.Error("Please upload at least one frame.")
+
+    frames_dir = OUTPUT_DIR / "inputs"
+    frames_dir.mkdir(parents=True, exist_ok=True)
+    for idx, file_path in enumerate(files):
+        Image.open(file_path).convert("RGB").save(frames_dir / f"upload_{idx:03d}.png")
+
+    output_path = OUTPUT_DIR / "matches.json"
+    cfg = MatchConfig(
+        catalog_path=Path(CATALOG_PATH),
+        frames_dir=frames_dir,
+        output_path=output_path,
+        top_k=1,
+        similarity_threshold=0.5,
+    )
+    match_frames(cfg)
+    data = json.loads(output_path.read_text(encoding="utf-8"))
+    gallery_items = [(item["reference_crop"], f"{item['character_id']} ({item['similarity']:.2f})") for item in data]
+    return data, gallery_items
+
+
+with gr.Blocks() as demo:
+    gr.Markdown("# Character Reference Matcher")
+    with gr.Row():
+        # gr.Image has no `multiple=True`; gr.File with file_count="multiple" accepts a batch of frames.
+        image_input = gr.File(file_count="multiple", file_types=["image"], label="Upload frames")
+        submit = gr.Button("Match Characters")
+    matches_json = gr.JSON(label="Matches")
+    # Gradio 4 removed `.style(...)`; the grid layout is now the `columns` argument.
+    gallery = gr.Gallery(label="Reference Thumbnails", columns=2)
+
+    submit.click(predict, inputs=image_input, outputs=[matches_json, gallery])
+
+
+def main() -> None:
+    demo.launch()
+
+
+if __name__ == "__main__":
+    main()
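Note that `CATALOG_PATH` and `catalog_cache` are resolved when the module is imported, so the environment variables must be set before the import. A sketch of a local launch, with illustrative paths:

```python
import os

# Must happen before importing frame_extraction.app, which reads these at import time.
os.environ["FRAME_CATALOG"] = "outputs/catalog/catalog.json"
os.environ["FRAME_OUTPUT_DIR"] = "app_outputs"

from frame_extraction.app import main

main()  # starts the Gradio server
```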
frame_extraction/src/frame_extraction/catalog.py
ADDED
@@ -0,0 +1,136 @@
+from __future__ import annotations
+
+import json
+from dataclasses import asdict, dataclass
+from pathlib import Path
+
+import cv2
+import numpy as np
+from rich.console import Console
+
+from .clustering import Clusterer
+from .config import CatalogConfig
+from .face import FaceDetector, FaceEmbedder, crop_faces
+from .quality import laplacian_sharpness
+from .video import iter_frames
+
+console = Console()
+
+
+@dataclass(slots=True)
+class FaceRecord:
+    character_id: str
+    movie: str
+    frame_index: int
+    frame_path: str
+    bbox: list[float]
+    detection_score: float
+    sharpness: float
+    embedding: list[float]
+    reference_path: str | None = None
+
+
+def build_catalog(cfg: CatalogConfig) -> Path:
+    cfg.ensure()
+    detector = FaceDetector(min_face_size=cfg.mtcnn_min_face_size)
+    embedder = FaceEmbedder(batch_size=cfg.embed_batch_size)
+    clusterer = Clusterer(eps=cfg.cluster_eps, min_samples=cfg.cluster_min_samples)
+
+    frames_dir = cfg.output_dir / "frames"
+    faces_dir = cfg.output_dir / "faces"
+    references_dir = cfg.output_dir / "references"
+
+    face_records: list[FaceRecord] = []
+    embeddings: list[np.ndarray] = []
+
+    video_name = cfg.video_path.stem
+
+    for frame_idx, frame in iter_frames(cfg.video_path, cfg.frame_interval):
+        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        frame_path = frames_dir / f"frame_{frame_idx:06d}.jpg"
+        cv2.imwrite(str(frame_path), frame)  # frame is already BGR, which is what imwrite expects
+
+        boxes, scores = detector.detect(frame_rgb)
+        if boxes.size == 0:
+            continue
+
+        crops = crop_faces(frame_rgb, boxes)
+        if not crops:
+            continue
+
+        crop_embeddings = embedder.embed(crops)
+        embeddings.append(crop_embeddings)
+        for idx, (crop, emb, box, score) in enumerate(zip(crops, crop_embeddings, boxes, scores, strict=False)):
+            crop_path = faces_dir / f"{frame_idx:06d}_{idx}.jpg"
+            crop.save(crop_path)
+            sharpness = laplacian_sharpness(np.asarray(crop))
+            face_records.append(
+                FaceRecord(
+                    character_id="",  # placeholder until clustering
+                    movie=video_name,
+                    frame_index=frame_idx,
+                    frame_path=str(frame_path),
+                    bbox=box.tolist(),
+                    detection_score=float(score),
+                    sharpness=sharpness,
+                    embedding=emb.tolist(),
+                )
+            )
+
+    if not face_records:
+        raise RuntimeError("No faces detected in video.")
+
+    all_embeddings = np.vstack(embeddings)
+    labels = clusterer.cluster(all_embeddings)
+
+    label_counter: dict[int, int] = {}
+    for label in labels:
+        label_counter[label] = label_counter.get(label, 0) + 1
+
+    for record, label in zip(face_records, labels, strict=False):
+        if label < 0 or label_counter[label] < cfg.min_track_length:
+            record.character_id = f"{video_name}_char_noise"
+        else:
+            record.character_id = f"{video_name}_char_{label:03d}"
+
+    references: dict[str, FaceRecord] = {}
+    for record in face_records:
+        if record.character_id.endswith("char_noise"):
+            continue
+        current = references.get(record.character_id)
+        score = record.sharpness if cfg.reference_metric == "sharpness" else record.detection_score
+        if current is None:
+            references[record.character_id] = record
+        else:
+            current_score = current.sharpness if cfg.reference_metric == "sharpness" else current.detection_score
+            if score > current_score:
+                references[record.character_id] = record
+
+    for record in references.values():
+        source = Path(record.frame_path)
+        frame = cv2.imread(str(source))
+        h, w = frame.shape[:2]
+        # MTCNN boxes can extend past the frame edges; clamp before slicing.
+        x1, y1, x2, y2 = map(int, record.bbox)
+        x1, y1 = max(0, x1), max(0, y1)
+        x2, y2 = min(w, x2), min(h, y2)
+        crop = frame[y1:y2, x1:x2]
+        output_path = references_dir / f"{record.character_id}.jpg"
+        cv2.imwrite(str(output_path), crop)
+        record.reference_path = str(output_path)
+
+    catalog = {
+        "video": video_name,
+        "config": {
+            "frame_interval": cfg.frame_interval,
+            "min_track_length": cfg.min_track_length,
+            "reference_metric": cfg.reference_metric,
+        },
+        "characters": [asdict(record) for record in face_records if not record.character_id.endswith("char_noise")],
+        "references": [asdict(record) for record in references.values()],
+    }
+
+    catalog_path = cfg.output_dir / "catalog.json"
+    with catalog_path.open("w", encoding="utf-8") as f:
+        json.dump(catalog, f, indent=2)
+
+    console.log(f"Wrote catalog to {catalog_path}")
+    return catalog_path
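The catalog written above keeps one reference record per character alongside all clustered face records. A quick sketch for inspecting it, with an illustrative path:

```python
import json
from pathlib import Path

catalog = json.loads(Path("outputs/catalog/catalog.json").read_text(encoding="utf-8"))
print("video:", catalog["video"], "| characters:", len(catalog["references"]))
for ref in catalog["references"]:
    # Each reference is the sharpest (or highest-confidence) crop for its cluster.
    print(ref["character_id"], ref["reference_path"], f"sharpness={ref['sharpness']:.1f}")
```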
frame_extraction/src/frame_extraction/cli.py
ADDED
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import typer
+
+from .catalog import build_catalog
+from .config import CatalogConfig, MatchConfig
+from .matcher import match_frames
+
+app = typer.Typer(add_completion=False)
+
+
+@app.command()
+def catalog(
+    video_path: Path = typer.Argument(..., help="Input video file"),
+    output_dir: Path = typer.Option(..., "--output-dir", "-o", help="Directory to store catalog outputs"),
+    frame_interval: int = typer.Option(12, help="Sample every Nth frame"),
+    min_track_length: int = typer.Option(5, help="Minimum detections per cluster"),
+    cluster_eps: float = typer.Option(0.55, help="DBSCAN epsilon for clustering"),
+    cluster_min_samples: int = typer.Option(3, help="Minimum samples for DBSCAN"),
+) -> None:
+    """Build a character reference catalog from a video."""
+    cfg = CatalogConfig(
+        video_path=video_path,
+        output_dir=output_dir,
+        frame_interval=frame_interval,
+        min_track_length=min_track_length,
+        cluster_eps=cluster_eps,
+        cluster_min_samples=cluster_min_samples,
+    )
+    build_catalog(cfg)
+
+
+@app.command()
+def match(
+    catalog_path: Path = typer.Argument(..., help="Catalog JSON path"),
+    frames_dir: Path = typer.Option(..., "--frames-dir", "-f", help="Directory with input frames"),
+    output_path: Path = typer.Option(..., "--output-path", "-o", help="Where to write matches JSON"),
+    top_k: int = typer.Option(1, help="Return up to K matches per face"),
+    similarity_threshold: float = typer.Option(0.5, help="Minimum cosine similarity to accept a match"),
+) -> None:
+    """Match new frames against an existing catalog."""
+    cfg = MatchConfig(
+        catalog_path=catalog_path,
+        frames_dir=frames_dir,
+        output_path=output_path,
+        top_k=top_k,
+        similarity_threshold=similarity_threshold,
+    )
+    match_frames(cfg)
+
+
+if __name__ == "__main__":
+    app()
frame_extraction/src/frame_extraction/clustering.py
ADDED
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import numpy as np
+from sklearn.cluster import DBSCAN
+
+
+@dataclass(slots=True)
+class Clusterer:
+    eps: float = 0.55
+    min_samples: int = 3
+
+    def cluster(self, embeddings: np.ndarray) -> np.ndarray:
+        if len(embeddings) == 0:
+            return np.empty((0,), dtype=int)
+        # DBSCAN with cosine distance; label -1 marks noise points.
+        model = DBSCAN(eps=self.eps, min_samples=self.min_samples, metric="cosine")
+        labels = model.fit_predict(embeddings)
+        return labels
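A toy check of the cosine-DBSCAN behavior: two tight direction clusters get labels 0 and 1, and an isolated vector comes back as -1 (noise), which `build_catalog` later maps to `*_char_noise`.

```python
import numpy as np

from frame_extraction.clustering import Clusterer

rng = np.random.default_rng(0)
a = rng.normal([1.0, 0.0, 0.0], 0.01, size=(5, 3))  # tight cluster around e_x
b = rng.normal([0.0, 1.0, 0.0], 0.01, size=(5, 3))  # tight cluster around e_y
outlier = np.array([[1.0, 1.0, 1.0]])               # ~0.42 cosine distance from either cluster
labels = Clusterer(eps=0.2, min_samples=3).cluster(np.vstack([a, b, outlier]))
print(labels)  # e.g. [0 0 0 0 0 1 1 1 1 1 -1]
```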
frame_extraction/src/frame_extraction/config.py
ADDED
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Literal
+
+
+@dataclass(slots=True)
+class CatalogConfig:
+    video_path: Path
+    output_dir: Path
+    frame_interval: int = 12
+    mtcnn_min_face_size: int = 60
+    min_track_length: int = 5
+    embed_batch_size: int = 16
+    cluster_eps: float = 0.55
+    cluster_min_samples: int = 3
+    reference_metric: Literal["sharpness", "confidence"] = "sharpness"
+
+    def ensure(self) -> None:
+        self.video_path = Path(self.video_path)
+        self.output_dir = Path(self.output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        (self.output_dir / "frames").mkdir(exist_ok=True)
+        (self.output_dir / "faces").mkdir(exist_ok=True)
+        (self.output_dir / "references").mkdir(exist_ok=True)
+
+
+@dataclass(slots=True)
+class MatchConfig:
+    catalog_path: Path
+    frames_dir: Path
+    output_path: Path
+    top_k: int = 1
+    similarity_threshold: float = 0.5
+
+    def ensure(self) -> None:
+        self.catalog_path = Path(self.catalog_path)
+        self.frames_dir = Path(self.frames_dir)
+        self.output_path = Path(self.output_path)
+        self.output_path.parent.mkdir(parents=True, exist_ok=True)
frame_extraction/src/frame_extraction/face.py
ADDED
@@ -0,0 +1,88 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Iterable, List
+
+import numpy as np
+import torch
+from facenet_pytorch import InceptionResnetV1, MTCNN
+from PIL import Image
+
+
+@dataclass(slots=True)
+class FaceDetector:
+    device: str = "cuda" if torch.cuda.is_available() else "cpu"
+    min_face_size: int = 60
+    # slots=True requires every attribute to be declared, including ones set in __post_init__.
+    model: MTCNN = field(init=False, repr=False)
+
+    def __post_init__(self) -> None:
+        self.model = MTCNN(
+            keep_all=True,
+            device=self.device,
+            min_face_size=self.min_face_size,
+            post_process=False,
+        )
+
+    def detect(self, image: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+        """Detect faces in an RGB image; returns (boxes, scores), possibly empty."""
+        pil = Image.fromarray(image)
+        boxes, probs = self.model.detect(pil)
+        if boxes is None or probs is None:
+            return np.empty((0, 4), dtype=np.float32), np.empty((0,), dtype=np.float32)
+        return boxes.astype(np.float32), probs.astype(np.float32)
+
+
+@dataclass(slots=True)
+class FaceEmbedder:
+    device: str = "cuda" if torch.cuda.is_available() else "cpu"
+    batch_size: int = 16
+    model: InceptionResnetV1 = field(init=False, repr=False)
+
+    def __post_init__(self) -> None:
+        self.model = InceptionResnetV1(pretrained="vggface2").eval().to(self.device)
+
+    @torch.no_grad()
+    def embed(self, crops: Iterable[Image.Image]) -> np.ndarray:
+        embeddings: List[np.ndarray] = []
+        batch: List[torch.Tensor] = []
+        for crop in crops:
+            tensor = preprocess_crop(crop)
+            batch.append(tensor)
+            if len(batch) == self.batch_size:
+                embeddings.append(self._forward(batch))
+                batch.clear()
+        if batch:
+            embeddings.append(self._forward(batch))
+        if not embeddings:
+            return np.empty((0, 512), dtype=np.float32)
+        return np.vstack(embeddings)
+
+    def _forward(self, batch: List[torch.Tensor]) -> np.ndarray:
+        stacked = torch.stack(batch).to(self.device)
+        out = self.model(stacked)
+        return out.cpu().numpy().astype(np.float32)
+
+
+def preprocess_crop(crop: Image.Image) -> torch.Tensor:
+    """Resize to 160x160 and normalize to [-1, 1], as InceptionResnetV1 expects."""
+    crop = crop.resize((160, 160))
+    tensor = torch.from_numpy(np.asarray(crop).astype(np.float32))
+    tensor = tensor.permute(2, 0, 1) / 255.0
+    tensor = (tensor - 0.5) / 0.5
+    return tensor
+
+
+def crop_faces(image: np.ndarray, boxes: np.ndarray, margin: float = 0.3) -> list[Image.Image]:
+    """Cut face crops (with margin) out of an RGB image; callers already pass RGB frames."""
+    h, w = image.shape[:2]
+    crops: list[Image.Image] = []
+    for x1, y1, x2, y2 in boxes:
+        w_box = x2 - x1
+        h_box = y2 - y1
+        x1m = max(0, int(x1 - margin * w_box))
+        y1m = max(0, int(y1 - margin * h_box))
+        x2m = min(w, int(x2 + margin * w_box))
+        y2m = min(h, int(y2 + margin * h_box))
+        crop = image[y1m:y2m, x1m:x2m]
+        crops.append(Image.fromarray(crop))
+    return crops
+
+
+def cv2_to_rgb(image: np.ndarray) -> np.ndarray:
+    """Convert an OpenCV BGR image to a contiguous RGB array."""
+    return np.ascontiguousarray(image[:, :, ::-1])
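A quick sanity check of `preprocess_crop`: a pure-white crop maps to a 3x160x160 tensor of ones, confirming the [0, 255] to [-1, 1] normalization the embedder expects.

```python
import numpy as np
from PIL import Image

from frame_extraction.face import preprocess_crop

white = Image.fromarray(np.full((200, 150, 3), 255, dtype=np.uint8))
tensor = preprocess_crop(white)
print(tuple(tensor.shape), tensor.min().item(), tensor.max().item())
# (3, 160, 160) 1.0 1.0 -- 255/255 = 1.0, then (1.0 - 0.5) / 0.5 = 1.0
```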
frame_extraction/src/frame_extraction/matcher.py
ADDED
@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+import cv2
+import faiss
+import numpy as np
+from facenet_pytorch import MTCNN
+from rich.console import Console
+
+from .config import MatchConfig
+from .face import FaceEmbedder, crop_faces
+
+console = Console()
+
+
+def load_catalog(catalog_path: Path) -> tuple[np.ndarray, list[dict[str, Any]]]:
+    data = json.loads(catalog_path.read_text(encoding="utf-8"))
+    references = data.get("references", [])
+    embeddings = np.array([ref["embedding"] for ref in references], dtype=np.float32)
+    return embeddings, references
+
+
+def build_index(embeddings: np.ndarray) -> faiss.IndexFlatIP:
+    if embeddings.size == 0:
+        raise RuntimeError("Catalog has no reference embeddings.")
+    # L2-normalize so that inner product equals cosine similarity.
+    norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-8
+    normalized = embeddings / norms
+    index = faiss.IndexFlatIP(embeddings.shape[1])
+    index.add(normalized)
+    return index
+
+
+def match_frames(cfg: MatchConfig) -> Path:
+    cfg.ensure()
+    embeddings, references = load_catalog(cfg.catalog_path)
+    index = build_index(embeddings)
+    embedder = FaceEmbedder()
+    detector = MTCNN(keep_all=True, device=embedder.device)
+
+    matches: list[dict[str, Any]] = []
+    image_paths = sorted(p for p in cfg.frames_dir.glob("*") if p.suffix.lower() in {".png", ".jpg", ".jpeg"})
+
+    for image_path in image_paths:
+        image_bgr = cv2.imread(str(image_path))
+        if image_bgr is None:  # unreadable or non-image file
+            continue
+        image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
+        boxes, probs = detector.detect(image_rgb)
+        if boxes is None or probs is None:
+            continue
+        crops = crop_faces(image_rgb, boxes)
+        face_embeddings = embedder.embed(crops)
+        if face_embeddings.size == 0:
+            continue
+        norms = np.linalg.norm(face_embeddings, axis=1, keepdims=True) + 1e-8
+        normalized = face_embeddings / norms
+        scores, idxs = index.search(normalized, cfg.top_k)
+        for face_idx, (box, score_row, idx_row) in enumerate(zip(boxes, scores, idxs, strict=False)):
+            for score, idx in zip(score_row, idx_row, strict=False):
+                if score < cfg.similarity_threshold:
+                    continue
+                reference = references[int(idx)]
+                matches.append(
+                    {
+                        "input_frame": str(image_path),
+                        "face_index": face_idx,
+                        "bbox": box.tolist(),
+                        "similarity": float(score),
+                        "character_id": reference["character_id"],
+                        "reference_frame": reference["frame_path"],
+                        "reference_crop": reference["reference_path"],
+                    }
+                )
+
+    cfg.output_path.write_text(json.dumps(matches, indent=2), encoding="utf-8")
+    console.log(f"Wrote matches to {cfg.output_path}")
+    return cfg.output_path
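Because `build_index` L2-normalizes before adding to `IndexFlatIP`, inner-product search here is cosine-similarity search. A small check with random embeddings:

```python
import numpy as np

from frame_extraction.matcher import build_index

refs = np.random.default_rng(1).normal(size=(4, 512)).astype(np.float32)
index = build_index(refs)  # rows are stored L2-normalized

query = refs[2:3] / np.linalg.norm(refs[2:3])  # normalized copy of reference 2
scores, idxs = index.search(query, 1)
print(int(idxs[0, 0]), float(scores[0, 0]))  # 2 and ~1.0: unit-vector inner product = cosine similarity
```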
frame_extraction/src/frame_extraction/quality.py
ADDED
@@ -0,0 +1,14 @@
+from __future__ import annotations
+
+import cv2
+import numpy as np
+
+
+def laplacian_sharpness(image: np.ndarray) -> float:
+    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+    return float(cv2.Laplacian(gray, cv2.CV_64F).var())
+
+
+def brightness_score(image: np.ndarray) -> float:
+    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+    return float(gray.mean() / 255.0)
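A small check of the sharpness heuristic: blurring an image collapses its Laplacian variance, which is why `build_catalog` can use it to pick the crispest reference crop.

```python
import cv2
import numpy as np

from frame_extraction.quality import laplacian_sharpness

rng = np.random.default_rng(0)
noisy = rng.integers(0, 256, size=(128, 128, 3), dtype=np.uint8)  # dense edges everywhere
blurred = cv2.GaussianBlur(noisy, (11, 11), 0)
print(laplacian_sharpness(noisy) > laplacian_sharpness(blurred))  # True
```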
frame_extraction/src/frame_extraction/video.py
ADDED
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+import math
+from pathlib import Path
+from typing import Iterator
+
+import cv2
+from tqdm import tqdm
+
+
+def iter_frames(video_path: str | Path, frame_interval: int = 12) -> Iterator[tuple[int, "cv2.Mat"]]:
+    cap = cv2.VideoCapture(str(video_path))
+    if not cap.isOpened():
+        raise RuntimeError(f"Could not open video: {video_path}")
+
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    interval = max(1, frame_interval)
+
+    try:
+        with tqdm(total=math.ceil(total_frames / interval), desc="Extracting frames") as pbar:
+            frame_idx = 0
+            while True:
+                ret, frame = cap.read()
+                if not ret:
+                    break
+                if frame_idx % interval == 0:
+                    yield frame_idx, frame
+                    pbar.update(1)
+                frame_idx += 1
+    finally:
+        # Release the capture even if the consumer abandons the generator early.
+        cap.release()
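`iter_frames` yields `(frame_index, BGR ndarray)` pairs lazily. A minimal sketch, with an illustrative video path:

```python
from frame_extraction.video import iter_frames

# Sample every 24th frame; frames come back as BGR HxWx3 uint8 arrays.
for frame_idx, frame in iter_frames("data/source.mp4", frame_interval=24):
    print(frame_idx, frame.shape)
```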
requirements.txt
ADDED
@@ -0,0 +1,13 @@
+typer>=0.12
+rich>=13.7
+numpy>=1.24
+pandas>=2.1
+opencv-python-headless>=4.8
+torch==2.2.2
+torchvision==0.17.2
+facenet-pytorch>=2.6.0
+scikit-learn>=1.7
+Pillow>=10.0
+tqdm>=4.66
+gradio>=5.0
+faiss-cpu>=1.12.0