| |
| """ |
| Report which processed videos need a later reprocess pass. |
| |
| This is intended for long-running library builds where we want to finish the |
| first full pass, then come back and selectively reprocess videos that were |
| indexed before a newer thumbnail or embedding recipe landed. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import os |
| import sys |
| import time |
| from collections import Counter |
|
|
|
|
# Locate this script on disk, then derive the repository root (the parent of
# the script's directory) and the "backend" directory beneath it.
SCRIPT_DIR = os.path.split(os.path.abspath(__file__))[0]
REPO_ROOT = os.path.split(SCRIPT_DIR)[0]
BACKEND_DIR = os.path.join(REPO_ROOT, "backend")

# Put the backend directory at the front of the import search path, but only
# once, so repeated imports of this module do not add duplicate entries.
if BACKEND_DIR not in sys.path:
    sys.path.insert(0, BACKEND_DIR)
|
|
| from search_images import ImageSearch, get_labels_version, get_thumbnail_sampling_version |
| from search import SubtitleSearch |
| from utils import atomic_write_json |
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for the backlog report.

    Returns:
        A namespace with ``output`` (required; path of the JSON report to
        write) and ``sample_limit`` (int, default 100; cap on the number of
        natural keys kept in each sample list).
    """
    cli = argparse.ArgumentParser(
        description="Report image-pipeline reprocess backlog.",
    )
    # Each entry is (flag, keyword arguments for ``add_argument``).
    option_specs = (
        (
            "--output",
            {
                "required": True,
                "help": "Path to write the JSON report.",
            },
        ),
        (
            "--sample-limit",
            {
                "type": int,
                "default": 100,
                "help": "Maximum number of natural keys to include in each sample list.",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
|
|
|
|
def _tally_reasons(entries, sample_limit: int) -> tuple[dict[str, int], dict[str, list[str]]]:
    """Count reasons across a backlog and keep a capped key sample per reason.

    Args:
        entries: Iterable of ``(natural_key, reasons)`` pairs, where
            ``reasons`` is itself an iterable of reason strings.
        sample_limit: Maximum number of natural keys retained per reason.

    Returns:
        ``(counts, samples)``: ``counts`` maps each reason to the number of
        times it occurred; ``samples`` maps each reason to the first
        ``sample_limit`` natural keys seen with that reason.
    """
    counts: Counter[str] = Counter()
    samples: dict[str, list[str]] = {}
    for key, reasons in entries:
        for reason in reasons:
            counts[reason] += 1
            # Every reason gets an entry in ``samples`` even when the cap
            # prevents any key from being stored.
            bucket = samples.setdefault(reason, [])
            if len(bucket) < sample_limit:
                bucket.append(key)
    return dict(counts), samples


def main() -> int:
    """Build the reprocess-backlog report, write it to disk, and print it.

    Queries the image, subtitle and video-concept backlogs, summarises each
    one by reason (with capped samples of natural keys), writes the resulting
    JSON document to ``--output`` via ``atomic_write_json``, and echoes the
    same JSON to stdout.

    Returns:
        ``0`` on completion, for use as the process exit status.
    """
    options = parse_args()
    image_index = ImageSearch()
    subtitle_index = SubtitleSearch()

    # Fetch all three backlogs up front, in the same order as before.
    image_backlog = image_index.get_videos_needing_reprocessing()
    subtitle_backlog = subtitle_index.get_videos_needing_reembedding("E")
    concept_backlog = subtitle_index.get_videos_needing_concept_refresh("E")

    cap = options.sample_limit
    image_counts, image_samples = _tally_reasons(image_backlog, cap)
    subtitle_counts, subtitle_samples = _tally_reasons(subtitle_backlog, cap)
    concept_counts, concept_samples = _tally_reasons(concept_backlog, cap)

    image_section = {
        "current_labels_version": get_labels_version(),
        "current_thumbnail_sampling_version": get_thumbnail_sampling_version(),
        "processed_videos": image_index.count_processed_videos(),
        "videos_needing_reprocessing": len(image_backlog),
        "reason_counts": image_counts,
        "sample_videos_by_reason": image_samples,
    }
    subtitle_section = {
        "current_recipe": subtitle_index.get_current_subtitle_embedding_recipe(),
        "processed_videos": subtitle_index.count_indexed_subtitles("E"),
        "videos_needing_reembedding": len(subtitle_backlog),
        "reason_counts": subtitle_counts,
        "sample_videos_by_reason": subtitle_samples,
    }
    concept_section = {
        "current_recipe": subtitle_index.get_current_video_concept_recipe(),
        "processed_videos": subtitle_index.count_video_concepts("E"),
        "videos_needing_refresh": len(concept_backlog),
        "reason_counts": concept_counts,
        "sample_videos_by_reason": concept_samples,
    }

    report = {
        # Local wall-clock time, formatted without a timezone suffix.
        "generated_at": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "image_pipeline": image_section,
        "subtitle_pipeline": subtitle_section,
        "video_concept_pipeline": concept_section,
    }

    atomic_write_json(options.output, report, indent=2)
    print(json.dumps(report, indent=2))
    return 0
|
|
|
|
# Run the report only when executed as a script; the integer returned by
# main() becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|