#!/usr/bin/env python3
"""
Report which processed videos need a later reprocess pass.

This is intended for long-running library builds where we want to finish the
first full pass, then come back and selectively reprocess videos that were
indexed before a newer thumbnail or embedding recipe landed.
"""

from __future__ import annotations

import argparse
import json
import os
import sys
import time
from collections import Counter


# Resolve the repository layout relative to this file rather than the current
# working directory, so the script behaves the same wherever it is launched from.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))  # directory holding this script
REPO_ROOT = os.path.dirname(SCRIPT_DIR)  # parent of the script directory
BACKEND_DIR = os.path.join(REPO_ROOT, "backend")
# Put the backend directory at the front of sys.path (once) before the project
# imports that follow; presumably search_images, search and utils live there.
if BACKEND_DIR not in sys.path:
    sys.path.insert(0, BACKEND_DIR)

from search_images import ImageSearch, get_labels_version, get_thumbnail_sampling_version
from search import SubtitleSearch
from utils import atomic_write_json


def parse_args() -> argparse.Namespace:
    """Parse the command-line options for the backlog report.

    Options:
        --output: Required. Path to write the JSON report.
        --sample-limit: Optional integer, default 100. Maximum number of
            natural keys to include in each sample list.

    Returns:
        The parsed namespace, exposing ``output`` and ``sample_limit``.
    """
    cli = argparse.ArgumentParser(description="Report image-pipeline reprocess backlog.")

    # Each entry is (flag, keyword arguments for ``add_argument``).
    option_specs = (
        (
            "--output",
            {"required": True, "help": "Path to write the JSON report."},
        ),
        (
            "--sample-limit",
            {
                "type": int,
                "default": 100,
                "help": "Maximum number of natural keys to include in each sample list.",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    return cli.parse_args()


def _summarize_backlog(backlog, sample_limit: int) -> tuple[Counter[str], dict[str, list[str]]]:
    """Tally backlog reasons and collect a capped sample of keys per reason.

    Args:
        backlog: Iterable of ``(natural_key, reasons)`` pairs, where ``reasons``
            is an iterable of reason strings attached to that key.
        sample_limit: Maximum number of natural keys kept per reason.

    Returns:
        A ``(reason_counts, samples_by_reason)`` pair. ``reason_counts`` maps
        each reason to the number of backlog entries that carry it;
        ``samples_by_reason`` maps each reason to the first ``sample_limit``
        natural keys seen with it, in backlog order.
    """
    reason_counts: Counter[str] = Counter()
    samples_by_reason: dict[str, list[str]] = {}
    for natural_key, reasons in backlog:
        for reason in reasons:
            reason_counts[reason] += 1
            # Register the reason before checking the cap so that every counted
            # reason has a (possibly empty) sample list in the report.
            samples = samples_by_reason.setdefault(reason, [])
            if len(samples) < sample_limit:
                samples.append(natural_key)
    return reason_counts, samples_by_reason


def main() -> int:
    """Build the reprocess-backlog report, write it to ``--output`` and print it.

    The report has one section each for the image, subtitle and video-concept
    pipelines. Every section carries the current recipe/version values, the
    processed-video count, the backlog size, per-reason counts and a capped
    sample of natural keys per reason.

    Returns:
        ``0``, used as the process exit status.
    """
    args = parse_args()
    image_search = ImageSearch()
    subtitle_search = SubtitleSearch()

    # NOTE(review): "E" is passed unchanged to every subtitle/concept query;
    # presumably a language or collection code -- confirm against SubtitleSearch.
    backlog = image_search.get_videos_needing_reprocessing()
    subtitle_backlog = subtitle_search.get_videos_needing_reembedding("E")
    concept_backlog = subtitle_search.get_videos_needing_concept_refresh("E")

    # The three pipelines share one summary shape, so one helper tallies them all.
    reason_counts, by_reason = _summarize_backlog(backlog, args.sample_limit)
    subtitle_reason_counts, subtitle_by_reason = _summarize_backlog(
        subtitle_backlog, args.sample_limit
    )
    concept_reason_counts, concept_by_reason = _summarize_backlog(
        concept_backlog, args.sample_limit
    )

    payload = {
        # time.strftime without a time tuple formats the current *local* time;
        # the stamp carries no UTC offset.
        "generated_at": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "image_pipeline": {
            "current_labels_version": get_labels_version(),
            "current_thumbnail_sampling_version": get_thumbnail_sampling_version(),
            "processed_videos": image_search.count_processed_videos(),
            "videos_needing_reprocessing": len(backlog),
            "reason_counts": dict(reason_counts),
            "sample_videos_by_reason": by_reason,
        },
        "subtitle_pipeline": {
            "current_recipe": subtitle_search.get_current_subtitle_embedding_recipe(),
            "processed_videos": subtitle_search.count_indexed_subtitles("E"),
            "videos_needing_reembedding": len(subtitle_backlog),
            "reason_counts": dict(subtitle_reason_counts),
            "sample_videos_by_reason": subtitle_by_reason,
        },
        "video_concept_pipeline": {
            "current_recipe": subtitle_search.get_current_video_concept_recipe(),
            "processed_videos": subtitle_search.count_video_concepts("E"),
            "videos_needing_refresh": len(concept_backlog),
            "reason_counts": dict(concept_reason_counts),
            "sample_videos_by_reason": concept_by_reason,
        },
    }

    # Persist the report first, then echo the same JSON to stdout.
    atomic_write_json(args.output, payload, indent=2)
    print(json.dumps(payload, indent=2))
    return 0


if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    sys.exit(main())