#!/usr/bin/env python3
"""srt2xml — SRT + cut spec to FCP7 XML for Premiere Pro.

Supports:
  - 1 to N cameras (single-cam or multi-cam)
  - Frame rates: 23.976 / 24 / 25 / 29.97 / 30 / 50 / 59.94 / 60
  - NDF / DF (DF only valid for 29.97 and 59.94)
  - Configurable resolution, pixel aspect, audio (sample rate / depth / channels)
  - Multicam dual-track expansion via enable/disable flags
  - --analyze (JSON instead of XML), --validate (spec check only), stdin/stdout via "-"

Usage:
  python3 srt2xml.py --srt subtitle.srt --spec cuts.json --output edit.xml --verify
  python3 srt2xml.py --spec cuts.json --validate
  python3 srt2xml.py --srt s.srt --spec c.json --analyze --output -

See cuts.json schema in the README or docs/cuts.schema.md.
"""

import argparse
import json
import re
import sys
from pathlib import Path
from xml.sax.saxutils import escape as xml_escape


# =============================================================================
# Frame rate registry
# =============================================================================

# Supported frame rates, keyed by the frame-rate string looked up by
# get_fps_preset() / validate_spec().
# Per-preset fields:
#   timebase - integer rate written to the XML <timebase> element
#   ntsc     - True when the real playback rate is timebase / 1.001
#   fps_int  - nominal frames per second used for timecode arithmetic
#   df_drop  - frame numbers skipped per dropped minute in drop-frame
#              timecode; present only on the rates that support DF
FPS_PRESETS = {
    "23.976": {"timebase": 24, "ntsc": True,  "fps_int": 24},
    "24":     {"timebase": 24, "ntsc": False, "fps_int": 24},
    "25":     {"timebase": 25, "ntsc": False, "fps_int": 25},
    "29.97":  {"timebase": 30, "ntsc": True,  "fps_int": 30, "df_drop": 2},
    "30":     {"timebase": 30, "ntsc": False, "fps_int": 30},
    "50":     {"timebase": 50, "ntsc": False, "fps_int": 50},
    "59.94":  {"timebase": 60, "ntsc": True,  "fps_int": 60, "df_drop": 4},
    "60":     {"timebase": 60, "ntsc": False, "fps_int": 60},
}


def get_fps_preset(fps_str):
    """Look up the preset dict for a frame-rate value.

    The value is stringified and stripped before the lookup, so both
    ``25`` and ``" 29.97 "`` are accepted. Exits the program (SystemExit)
    with a list of the supported rates when the rate is unknown.
    """
    key = str(fps_str).strip()
    preset = FPS_PRESETS.get(key)
    if preset is None:
        raise SystemExit(
            f"unsupported fps: {fps_str!r}. supported: {sorted(FPS_PRESETS)}"
        )
    return preset


def actual_fps(fps_preset):
    """Return the real-world playback rate for a preset.

    NTSC presets run at timebase / 1.001 (timebase 60 -> 59.94...); all
    other presets run at exactly their timebase.

    This is the rate to use when converting real-world seconds (SRT
    timestamps, stopwatch measurements) to frame counts; `timebase`
    alone is only correct for non-NTSC rates.
    """
    nominal = fps_preset["timebase"]
    if fps_preset["ntsc"]:
        return nominal / 1.001
    return nominal


# =============================================================================
# Timecode parsing (NDF + SMPTE drop-frame)
# =============================================================================

def parse_tc_to_frame(tc_str, fps_preset, displayformat):
    """Convert an HH:MM:SS:FF (or HH:MM:SS;FF) timecode to a frame number.

    For displayformat "DF" the skipped frame numbers are subtracted:
    `df_drop` numbers for every minute except each tenth minute.
    Any other displayformat is treated as non-drop-frame.

    Raises ValueError when the string does not have four fields, or when
    DF is requested for a preset that has no `df_drop` entry.
    """
    fields = re.split(r"[:;]", tc_str.strip())
    if len(fields) != 4:
        raise ValueError(
            f"bad TC format: {tc_str!r} (expected HH:MM:SS:FF or HH:MM:SS;FF)"
        )
    hours, minutes, seconds, frames = (int(x) for x in fields)
    rate = fps_preset["fps_int"]

    whole_minutes = hours * 60 + minutes
    frame_no = (whole_minutes * 60 + seconds) * rate + frames
    if displayformat != "DF":
        return frame_no

    if "df_drop" not in fps_preset:
        raise ValueError(
            f"DF not supported for {fps_preset['fps_int']}fps "
            "(only 29.97 and 59.94 use drop-frame)"
        )
    # Every minute skips `df_drop` frame numbers, except minutes 0, 10, 20...
    skipped = fps_preset["df_drop"] * (whole_minutes - whole_minutes // 10)
    return frame_no - skipped


# =============================================================================
# Spec field parsers
# =============================================================================

def parse_cues(s):
    """Normalise a cue selection into a list of cue numbers.

    Accepted forms:
      - int:  5                      -> [5]
      - list: [5, 7, 9]              -> sorted, de-duplicated ints
      - str:  "5", "5-10", "5,7,9", "5-7,12,20-22"
              (comma-separated items; "a-b" is an inclusive range)

    String and list inputs come back sorted with duplicates removed.
    """
    if isinstance(s, int):
        return [s]
    if isinstance(s, list):
        return sorted(set(map(int, s)))

    numbers = set()
    for chunk in str(s).strip().split(","):
        token = chunk.strip()
        if not token:
            continue
        low, dash, high = token.partition("-")
        if dash:
            numbers.update(range(int(low), int(high) + 1))
        else:
            numbers.add(int(token))
    return sorted(numbers)


def parse_delay(value, fps_int):
    """Convert a delay value to a frame count at `fps_int` frames/second.

    Accepted forms:
      - number of seconds:     54.5  (also as a numeric string, "54.5")
      - seconds plus frames:   '54s29f'
      - timecode:              '00:00:54:29' (an offset, not an anchor)

    Raises ValueError for anything else.
    """
    if isinstance(value, (int, float)):
        return int(round(value * fps_int))

    text = str(value).strip()

    sec_frames = re.match(r"^(\d+)s(\d+)f$", text)
    if sec_frames:
        secs, frames = (int(g) for g in sec_frames.groups())
        return secs * fps_int + frames

    timecode = re.match(r"^(\d+):(\d+):(\d+)[:;](\d+)$", text)
    if timecode:
        hours, minutes, secs, frames = (int(g) for g in timecode.groups())
        total_secs = (hours * 60 + minutes) * 60 + secs
        return total_secs * fps_int + frames

    # Last resort: a plain numeric string is treated as seconds.
    try:
        return int(round(float(text) * fps_int))
    except ValueError:
        raise ValueError(
            f"bad delay format: {value!r}. accepted: '54s29f' / '00:00:54:29' / 54.5"
        )


def parse_duration(value):
    """Parse a target duration into seconds.

    Accepted forms:
      - numeric: 60, 1.5  (treated as seconds)
      - '60s', '90s'
      - '1min', '1.5min', '10min'
      - '1h', '1.5h', '2h30min'
      - 'mm:ss' (e.g., '1:30' = 90s)
      - 'hh:mm:ss'
      - a plain numeric string ('90'), treated as seconds

    Case and spaces are ignored. Unit spellings: h / hr / hour(s),
    m / min(s) / minute(s), s / sec(s) / second(s).

    Returns a float, or None when the value is an empty/blank string.

    Raises ValueError for anything else. The whole string must be made of
    number+unit pairs: inputs with leftover text such as '1h30', '10ms'
    or 'abc5s' are rejected instead of being partially interpreted.
    """
    if isinstance(value, (int, float)):
        return float(value)
    s = str(value).strip().lower().replace(" ", "")
    if not s:
        return None

    # hh:mm:ss or mm:ss
    if ":" in s:
        parts = s.split(":")
        if len(parts) == 2:
            return float(parts[0]) * 60 + float(parts[1])
        if len(parts) == 3:
            return float(parts[0]) * 3600 + float(parts[1]) * 60 + float(parts[2])
        raise ValueError(f"bad duration: {value!r}")

    # 1h30min, 90min, 60s, 1.5h, etc. Longer unit spellings are listed
    # before their prefixes so each token is consumed whole.
    token = r"(\d+(?:\.\d+)?)(hours?|hr|h|minutes?|mins?|m|seconds?|secs?|s)"
    if re.fullmatch(f"(?:{token})+", s):
        seconds_per_unit = {"h": 3600, "m": 60, "s": 1}
        total = 0.0
        for num_str, unit in re.findall(token, s):
            # Every accepted spelling starts with its unit's letter.
            total += float(num_str) * seconds_per_unit[unit[0]]
        return total

    # Plain number string = seconds
    try:
        return float(s)
    except ValueError:
        raise ValueError(
            f"bad duration: {value!r}. accepted: 60 / '60s' / '1min' / '1h30min' / '1:30' / '1:30:00'"
        ) from None


# =============================================================================
# SRT parsing
# =============================================================================

def parse_srt(text):
    """Parse SRT subtitle text into ``{cue_number: (start, end, text)}``.

    `start` and `end` are seconds (float). Cue text keeps its line breaks
    as "\\n". Timestamps accept either "," or "." before the milliseconds.

    A leading UTF-8 byte-order mark is ignored and CRLF / CR line endings
    are normalised, so files saved by Windows tools parse the same as
    plain LF files (otherwise the BOM makes the first cue number
    unparseable and that cue is lost).

    Blocks that are malformed (fewer than three lines, non-numeric cue
    number, unparseable timing line) are skipped silently.
    """
    text = text.lstrip("\ufeff").replace("\r\n", "\n").replace("\r", "\n")
    cues = {}
    for block in re.split(r"\n\s*\n", text.strip()):
        lines = block.strip().split("\n")
        if len(lines) < 3:
            continue
        try:
            num = int(lines[0].strip())
        except ValueError:
            continue
        m = re.match(
            r"(\d{2}):(\d{2}):(\d{2})[,.](\d{3})\s*-->\s*"
            r"(\d{2}):(\d{2}):(\d{2})[,.](\d{3})",
            lines[1].strip(),
        )
        if not m:
            continue
        g = list(map(int, m.groups()))
        start = g[0] * 3600 + g[1] * 60 + g[2] + g[3] / 1000
        end = g[4] * 3600 + g[5] * 60 + g[6] + g[7] / 1000
        cues[num] = (start, end, "\n".join(lines[2:]).strip())
    return cues


# =============================================================================
# Cut expansion + frame computation
# =============================================================================

def compute_sequential_cuts(srt, remove_specs, default_cam):
    """Build cut specs for sequential mode.

    Every SRT cue is kept except those named by the `cues` field of the
    entries in `remove_specs`. Walking the cue numbers in ascending
    order, each run of kept cues between removed ones becomes one cut on
    `default_cam`, described as "first-last" (or just the number for a
    single-cue run).
    """
    dropped = {
        n
        for entry in (remove_specs or [])
        for n in parse_cues(entry["cues"])
    }

    # Split the ordered cue numbers into runs separated by removed cues.
    runs = []
    current = []
    for cue_no in sorted(srt.keys()):
        if cue_no not in dropped:
            current.append(cue_no)
            continue
        if current:
            runs.append(current)
        current = []
    if current:
        runs.append(current)

    result = []
    for run in runs:
        span = str(run[0]) if len(run) == 1 else f"{run[0]}-{run[-1]}"
        result.append({
            "cam": default_cam,
            "cues": span,
            "label": f"seg {span}",
        })
    return result


def expand_cuts(specs, srt, default_padding):
    """Resolve each cut spec against the SRT and apply padding.

    For every spec the cue selection is parsed and matched against `srt`.
    Cue numbers missing from the SRT are dropped with a warning on
    stderr; if none of them exist the program exits (SystemExit).

    Each returned cut is a copy of its spec plus:
      - cue_nums:                the cue numbers actually used
      - srt_in_raw, srt_out_raw: earliest start / latest end of those cues
      - srt_in, srt_out:         the raw times widened by padding

    Padding comes from the cut's `pad_in` / `pad_out`, then its
    `padding`, then `default_padding`. No padding is added on a side
    where the neighbouring cut starts/ends within 10 ms of this one.
    """
    expanded = []
    for spec in specs:
        requested = parse_cues(spec["cues"])
        present = sorted(n for n in requested if n in srt)
        absent = [n for n in requested if n not in srt]
        if not present:
            raise SystemExit(
                f"all cues {requested} missing from SRT for cut {spec} "
                f"(SRT has cues 1-{max(srt.keys()) if srt else 0})"
            )
        if absent:
            # Soft-clip: carry on with the cues that do exist.
            sys.stderr.write(
                f"WARNING: cues {absent} not in SRT (SRT max is "
                f"{max(srt.keys())}), clipping cut {spec.get('label') or spec.get('cues')!r} "
                f"to valid range {present[0]}-{present[-1]}\n"
            )
        # min start / max end also covers non-contiguous cue selections.
        expanded.append({
            **spec,
            "cue_nums": present,
            "srt_in_raw": min(srt[n][0] for n in present),
            "srt_out_raw": max(srt[n][1] for n in present),
        })

    last = len(expanded) - 1
    for i, cut in enumerate(expanded):
        fallback = cut.get("padding", default_padding)
        pad_in = cut.get("pad_in", fallback)
        pad_out = cut.get("pad_out", fallback)
        touches_prev = (
            i > 0
            and abs(cut["srt_in_raw"] - expanded[i - 1]["srt_out_raw"]) < 0.01
        )
        touches_next = (
            i < last
            and abs(cut["srt_out_raw"] - expanded[i + 1]["srt_in_raw"]) < 0.01
        )
        cut["srt_in"] = cut["srt_in_raw"] - (0 if touches_prev else pad_in)
        cut["srt_out"] = cut["srt_out_raw"] + (0 if touches_next else pad_out)
    return expanded


def compute_offsets(cams, fps_preset, displayformat):
    """Compute frame-at-SRT-0 per camera. Returns (offsets dict, anchor_cam_id).

    Anchor camera can use EITHER:
      - "srt_starts_at_source_seconds": <float>   (REAL-WORLD seconds in source
                                                   where SRT time 0 occurs)
      - "anchor": {"srt_at": <float>, "source_tc": "HH:MM:SS:FF"}   (legacy TC form)
    Other cameras use "delay_from_<anchor_cam>" (anchor id lower-cased)
    relative to the anchor. The anchor is the first camera in `cams` that
    carries one of the two anchor fields.

    The anchor's offset is resolved before any other camera, so cameras
    may be listed in any order (a delay-based camera may precede the
    anchor). The returned dict keeps the order of `cams`.

    NTSC drift fix: SRT timestamps are real-world seconds, so converting to
    frame counts uses actual_fps (timebase / 1.001 for 59.94 / 29.97 / 23.976).
    Using `timebase` directly would introduce a 0.1% linear drift — visibly
    wrong by ~1 second over a 16-minute SRT.

    Exits (SystemExit) when no camera is anchored or a non-anchored camera
    lacks its delay field.
    """
    fps_int = fps_preset["fps_int"]
    fps_real = actual_fps(fps_preset)

    def self_anchored(cam):
        return ("anchor" in cam) or ("srt_starts_at_source_seconds" in cam)

    def own_offset(cam):
        # Offset of a camera that states its own relation to SRT time 0.
        if "srt_starts_at_source_seconds" in cam:
            return int(round(
                float(cam["srt_starts_at_source_seconds"]) * fps_real
            ))
        anchor_frame = parse_tc_to_frame(
            cam["anchor"]["source_tc"], fps_preset, displayformat
        )
        srt_at = float(cam["anchor"]["srt_at"])
        return anchor_frame - int(round(srt_at * fps_real))

    anchor_cam = next(
        (cid for cid, c in cams.items() if self_anchored(c)),
        None,
    )
    if anchor_cam is None:
        raise SystemExit(
            "at least one camera must have 'anchor' or 'srt_starts_at_source_seconds'"
        )

    # Resolve the anchor first: delay-based cameras depend on it and may
    # appear earlier in `cams`.
    anchor_offset = own_offset(cams[anchor_cam])
    delay_key = f"delay_from_{anchor_cam.lower()}"

    offsets = {}
    for cid, cam in cams.items():
        if cid == anchor_cam:
            offsets[cid] = anchor_offset
        elif self_anchored(cam):
            offsets[cid] = own_offset(cam)
        else:
            if delay_key not in cam:
                raise SystemExit(
                    f"camera {cid!r} needs 'anchor', "
                    f"'srt_starts_at_source_seconds', or {delay_key!r}"
                )
            delay = parse_delay(cam[delay_key], fps_int)
            offsets[cid] = anchor_offset - delay

    return offsets, anchor_cam


def compute_frames(cuts, cam_offsets, fps_preset):
    """Compute per-camera source frame in/out and timeline in/out per cut.

    Each cut dict (needs "srt_in" / "srt_out" in seconds) is updated in
    place with "<cam>_in" / "<cam>_out" for every camera (camera id
    lower-cased) and with "timeline_in" / "timeline_out". Cuts are laid
    back to back on the timeline starting at frame 0. Returns the total
    timeline length in frames.

    SRT timestamps are real-world seconds, so they convert to source frame
    counts via actual_fps (NOT timebase). For NTSC modes this avoids a 0.1%
    drift that would otherwise add up to ~1 second by the end of a long SRT.

    The SRT in/out points are rounded to whole frames once per cut and the
    integer camera offset is added afterwards. Rounding `offset + frames`
    separately per camera can give cameras different clip lengths on
    exact half-frame values (Python rounds ties to even), which would
    disagree with the shared timeline span.

    `fps_preset` is normally a preset dict; a bare number is accepted for
    legacy callers and is used as the frame rate as-is (no NTSC
    correction).
    """
    if isinstance(fps_preset, dict):
        fps_real = actual_fps(fps_preset)
    else:
        fps_real = fps_preset

    timeline_pos = 0
    for cut in cuts:
        in_frame = int(round(cut["srt_in"] * fps_real))
        out_frame = int(round(cut["srt_out"] * fps_real))
        dur = out_frame - in_frame
        for cid, offset in cam_offsets.items():
            # Offsets are expected to be whole frames; rounding keeps the
            # results integral if a caller passes a float.
            base = int(round(offset))
            cut[f"{cid.lower()}_in"] = base + in_frame
            cut[f"{cid.lower()}_out"] = base + out_frame
        cut["timeline_in"] = timeline_pos
        cut["timeline_out"] = timeline_pos + dur
        timeline_pos += dur
    return timeline_pos


# =============================================================================
# XML emission
# =============================================================================

def emit_xml(spec, cuts, total_duration, cam_offsets):
    """Render the edit as an FCP7 XML (xmeml version 5) document string.

    Parameters:
      spec            - the cut spec; reads spec["sequence"] (fps, and
                        optional displayformat, width, height,
                        pixel_aspect, name, audio), spec["cameras"]
                        (optional duration, path, file per camera) and
                        spec["settings"]["multicam"].
      cuts            - cut dicts carrying "cam", "timeline_in",
                        "timeline_out" and "<cam>_in" / "<cam>_out".
      total_duration  - written as the sequence <duration>.
      cam_offsets     - not used by this function; kept for interface
                        compatibility.

    Layout: one video track per camera in `cameras` order, followed by
    `audio.channels` audio tracks per camera. In multicam mode (setting
    enabled and more than one camera) every cut is placed on every
    camera's tracks and only the cut's own camera is enabled; otherwise
    each cut appears only on its own camera's tracks.

    Camera ids and the sequence name are XML-escaped wherever they are
    written, including inside double-quoted attribute values, so ids or
    names containing &, <, > or " still yield well-formed XML.

    NOTE(review): <pathurl> is emitted as "file://" + the escaped path
    without percent-encoding — confirm against what the importing
    application expects.
    """
    seq = spec["sequence"]
    cams = spec["cameras"]
    settings = spec.get("settings", {})
    multicam = settings.get("multicam", False) and len(cams) > 1

    def attr_escape(value):
        # Attribute values below are wrapped in double quotes, so '"' has
        # to be escaped in addition to the default &, <, >.
        return xml_escape(value, {'"': "&quot;"})

    fps_preset = get_fps_preset(seq["fps"])
    timebase = fps_preset["timebase"]
    ntsc = "TRUE" if fps_preset["ntsc"] else "FALSE"
    df = seq.get("displayformat", "NDF")
    width = seq.get("width", 1920)
    height = seq.get("height", 1080)
    pixel_aspect = seq.get("pixel_aspect", "square")
    name = seq.get("name", "Sequence")
    audio = seq.get("audio", {})
    audio_rate = audio.get("sample_rate", 48000)
    audio_depth = audio.get("depth", 16)
    audio_channels = audio.get("channels", 2)

    rate = f"<rate><timebase>{timebase}</timebase><ntsc>{ntsc}</ntsc></rate>"
    cam_ids = list(cams.keys())

    # File definitions per camera (full first time, ref afterwards).
    file_state = {}
    for cid in cam_ids:
        c = cams[cid]
        fid = attr_escape(f"file-{cid}")
        duration = c.get("duration", 200000)
        path = c.get("path", "<<RELINK>>")
        file_name = c.get("file", f"{cid}_camera")
        full = (
            f'<file id="{fid}">\n'
            f'              <name>{xml_escape(file_name)}</name>\n'
            f'              <pathurl>file://{xml_escape(path)}</pathurl>\n'
            f'              {rate}\n'
            f'              <duration>{duration}</duration>\n'
            f'              <timecode>{rate}<string>00:00:00:00</string><frame>0</frame><displayformat>{df}</displayformat></timecode>\n'
            f'              <media>\n'
            f'                <video><duration>{duration}</duration><samplecharacteristics>{rate}<width>{width}</width><height>{height}</height><pixelaspectratio>{pixel_aspect}</pixelaspectratio></samplecharacteristics></video>\n'
            f'                <audio><samplecharacteristics><depth>{audio_depth}</depth><samplerate>{audio_rate}</samplerate></samplecharacteristics><channelcount>{audio_channels}</channelcount></audio>\n'
            f'              </media>\n'
            f'            </file>'
        )
        file_state[cid] = {
            "full": full,
            "ref": f'<file id="{fid}"/>',
            "used": False,
            "duration": duration,
        }

    def file_xml(cid):
        # The first clip generated for a camera carries the full <file>
        # definition; every later clip only references it by id. This
        # depends on clips being generated in document order (video
        # tracks are built before audio tracks below).
        s = file_state[cid]
        if not s["used"]:
            s["used"] = True
            return s["full"]
        return s["ref"]

    def v_track(cid):
        # 1-based video track index = camera position.
        return cam_ids.index(cid) + 1

    def a_tracks(cid):
        # 1-based audio track indices owned by this camera.
        i = cam_ids.index(cid)
        return [i * audio_channels + ch for ch in range(1, audio_channels + 1)]

    def group_idx(cid):
        return cam_ids.index(cid) + 1

    def link_block(cut_idx, cid, clipidx):
        # <link> entries tying a cut's video clip and audio clips together.
        v = v_track(cid)
        a_list = a_tracks(cid)
        g = group_idx(cid)
        v_id = f"clip-v{v}-c{cut_idx}"
        parts = [
            f'<link><linkclipref>{v_id}</linkclipref><mediatype>video</mediatype>'
            f'<trackindex>{v}</trackindex><clipindex>{clipidx}</clipindex></link>'
        ]
        for a in a_list:
            aid = f"clip-a{a}-c{cut_idx}"
            parts.append(
                f'<link><linkclipref>{aid}</linkclipref><mediatype>audio</mediatype>'
                f'<trackindex>{a}</trackindex><clipindex>{clipidx}</clipindex>'
                f'<groupindex>{g}</groupindex></link>'
            )
        return "\n            ".join(parts)

    def video_clip(cut, cut_idx, clipidx, cid, enabled):
        in_f = cut[f"{cid.lower()}_in"]
        out_f = cut[f"{cid.lower()}_out"]
        en = "TRUE" if enabled else "FALSE"
        v = v_track(cid)
        clip_name = xml_escape(f"{cid}_camera")
        return (
            f'          <clipitem id="clip-v{v}-c{cut_idx}">\n'
            f'            <name>{clip_name}</name><enabled>{en}</enabled>'
            f'<duration>{file_state[cid]["duration"]}</duration>\n'
            f'            {rate}\n'
            f'            <start>{cut["timeline_in"]}</start><end>{cut["timeline_out"]}</end>'
            f'<in>{in_f}</in><out>{out_f}</out>\n'
            f'            {file_xml(cid)}\n'
            f'            {link_block(cut_idx, cid, clipidx)}\n'
            f'          </clipitem>'
        )

    def audio_clip(cut, cut_idx, clipidx, cid, ch, enabled):
        in_f = cut[f"{cid.lower()}_in"]
        out_f = cut[f"{cid.lower()}_out"]
        en = "TRUE" if enabled else "FALSE"
        a_idx = a_tracks(cid)[ch - 1]
        clip_name = xml_escape(f"{cid}_camera")
        return (
            f'          <clipitem id="clip-a{a_idx}-c{cut_idx}">\n'
            f'            <name>{clip_name}</name><enabled>{en}</enabled>'
            f'<duration>{file_state[cid]["duration"]}</duration>\n'
            f'            {rate}\n'
            f'            <start>{cut["timeline_in"]}</start><end>{cut["timeline_out"]}</end>'
            f'<in>{in_f}</in><out>{out_f}</out>\n'
            f'            {file_xml(cid)}\n'
            f'            <sourcetrack><mediatype>audio</mediatype><trackindex>{ch}</trackindex></sourcetrack>\n'
            f'            {link_block(cut_idx, cid, clipidx)}\n'
            f'          </clipitem>'
        )

    # Track contents per camera: (cut index, cut, enabled) triples.
    track_clips = {}
    for cid in cam_ids:
        if multicam:
            track_clips[cid] = [(i, c, c["cam"] == cid) for i, c in enumerate(cuts)]
        else:
            track_clips[cid] = [(i, c, True) for i, c in enumerate(cuts) if c["cam"] == cid]

    # Build video tracks.
    video_tracks = []
    for cid in cam_ids:
        clips = track_clips[cid]
        clips_xml = "\n".join(
            video_clip(c, ci, idx + 1, cid, en)
            for idx, (ci, c, en) in enumerate(clips)
        )
        video_tracks.append(
            f'        <track>\n'
            f'          <enabled>TRUE</enabled><locked>FALSE</locked>\n'
            f'{clips_xml}\n'
            f'        </track>'
        )

    # Build audio tracks.
    audio_tracks = []
    for cid in cam_ids:
        clips = track_clips[cid]
        for ch in range(1, audio_channels + 1):
            clips_xml = "\n".join(
                audio_clip(c, ci, idx + 1, cid, ch, en)
                for idx, (ci, c, en) in enumerate(clips)
            )
            # Source channels alternate between output channels 1 and 2.
            output_ch = ((ch - 1) % 2) + 1
            audio_tracks.append(
                f'        <track>\n'
                f'          <enabled>TRUE</enabled><locked>FALSE</locked>\n'
                f'          <outputchannelindex>{output_ch}</outputchannelindex>\n'
                f'{clips_xml}\n'
                f'        </track>'
            )

    video_blocks = "\n".join(video_tracks)
    audio_blocks = "\n".join(audio_tracks)

    return (
        f'<?xml version="1.0" encoding="UTF-8"?>\n'
        f'<!DOCTYPE xmeml>\n'
        f'<xmeml version="5">\n'
        f'  <sequence id="seq-{attr_escape(name)}">\n'
        f'    <name>{xml_escape(name)}</name>\n'
        f'    <duration>{total_duration}</duration>\n'
        f'    {rate}\n'
        f'    <in>-1</in>\n'
        f'    <out>-1</out>\n'
        f'    <timecode>{rate}<string>00:00:00:00</string><frame>0</frame><displayformat>{df}</displayformat></timecode>\n'
        f'    <media>\n'
        f'      <video>\n'
        f'        <format>\n'
        f'          <samplecharacteristics>{rate}<width>{width}</width><height>{height}</height><pixelaspectratio>{pixel_aspect}</pixelaspectratio><fielddominance>none</fielddominance><colordepth>24</colordepth></samplecharacteristics>\n'
        f'        </format>\n'
        f'{video_blocks}\n'
        f'      </video>\n'
        f'      <audio>\n'
        f'        <numOutputChannels>2</numOutputChannels>\n'
        f'        <format><samplecharacteristics><depth>{audio_depth}</depth><samplerate>{audio_rate}</samplerate></samplecharacteristics></format>\n'
        f'{audio_blocks}\n'
        f'      </audio>\n'
        f'    </media>\n'
        f'  </sequence>\n'
        f'</xmeml>\n'
    )


# =============================================================================
# Verify / analyze
# =============================================================================

def make_analysis(cuts, srt, cam_offsets, anchor_cam, fps_preset, total_duration):
    """Build a JSON-serialisable report describing every cut.

    For each cut the report gives its camera, cue numbers, label/role,
    padded SRT in/out (rounded to ms), timeline span and duration, plus a
    "mid" section: the SRT time halfway through the cut, the cue whose
    [start, end) interval contains that time (None if it falls in a gap),
    that cue's text, and each camera's in / out / mid source frame.

    Frame/second conversions use the real playback rate (actual_fps).

    Returns a dict with the total duration (frames and seconds), the
    anchor camera id, the camera offsets and the list of per-cut entries.
    """
    fps_real = actual_fps(fps_preset)
    out = []
    for idx, cut in enumerate(cuts):
        mid_srt = (cut["srt_in"] + cut["srt_out"]) / 2
        cue_at = next(
            (n for n, (s, e, _) in srt.items() if s <= mid_srt < e), None
        )
        # Compare against None explicitly: cue number 0 is a valid cue
        # and must not be mistaken for "no cue found".
        text = srt[cue_at][2] if cue_at is not None else None
        cams_at_mid = {
            cid: {
                "in": cut[f"{cid.lower()}_in"],
                "out": cut[f"{cid.lower()}_out"],
                "mid": int(round(offset + mid_srt * fps_real)),
            }
            for cid, offset in cam_offsets.items()
        }
        out.append({
            "index": idx,
            "cam": cut["cam"],
            "cue_range": cut["cue_nums"],
            "label": cut.get("label"),
            "role": cut.get("role"),
            "srt_in": round(cut["srt_in"], 3),
            "srt_out": round(cut["srt_out"], 3),
            "timeline_in": cut["timeline_in"],
            "timeline_out": cut["timeline_out"],
            "duration_frames": cut["timeline_out"] - cut["timeline_in"],
            "duration_seconds": round((cut["timeline_out"] - cut["timeline_in"]) / fps_real, 3),
            "mid": {
                "srt_seconds": round(mid_srt, 3),
                "cue_at_mid": cue_at,
                "expected_text": text,
                "cameras": cams_at_mid,
            },
        })
    return {
        "total_duration_frames": total_duration,
        "total_duration_seconds": round(total_duration / fps_real, 3),
        "anchor_camera": anchor_cam,
        "camera_offsets": cam_offsets,
        "cuts": out,
    }


def print_verify(analysis, multicam):
    """Print a human-readable verification checklist to stderr.

    For every cut in `analysis["cuts"]` this lists the timeline span, the
    SRT midpoint with its cue number, each camera's source frame at that
    midpoint and the subtitle text expected there. With `multicam` set, a
    closing hint about toggling the camera tracks is added.
    """
    def say(message):
        print(message, file=sys.stderr)

    say("\n=== Verification anchors ===")
    for cut in analysis["cuts"]:
        mid = cut["mid"]
        frames = [
            f"{cid}: frame {info['mid']}"
            for cid, info in mid["cameras"].items()
        ]
        cam_str = " | ".join(frames)
        active = cut["cam"]
        say(f"\nCut {cut['index']} [{active}] {cut.get('label') or ''}")
        say(
            f"  Timeline: {cut['timeline_in']}-{cut['timeline_out']} "
            f"({cut['duration_seconds']}s)"
        )
        say(
            f"  SRT mid: {cut['mid']['srt_seconds']}s "
            f"(cue {cut['mid']['cue_at_mid']})"
        )
        say(f"  Source frames: {cam_str}")
        say(f"  Expected: \"{cut['mid']['expected_text']}\"")
    if multicam:
        say(
            "\n[multicam] toggle V1/V2 enable at each timeline-mid; "
            "both cameras should show the same speech content."
        )
    say("")


# =============================================================================
# IO helpers + spec validation
# =============================================================================

def read_input(path):
    """Return the text of `path`, or of stdin when `path` is "-".

    Files are decoded as UTF-8 and a leading byte-order mark, if present,
    is dropped ("utf-8-sig"); a BOM left in place makes json.loads reject
    the spec and corrupts the first SRT cue number. A BOM at the start of
    stdin text is removed as well.
    """
    if path == "-":
        text = sys.stdin.read()
        return text[1:] if text.startswith("\ufeff") else text
    return Path(path).read_text(encoding="utf-8-sig")


def write_output(path, content):
    """Write `content` to `path` as UTF-8, or to stdout when `path` is "-"."""
    if path != "-":
        Path(path).write_text(content, encoding="utf-8")
        return
    sys.stdout.write(content)


def validate_spec(spec):
    """Check the cut spec and exit with status 1 if anything is wrong.

    All problems are collected first and printed to stderr as
    "ERROR: ..." lines, so a single run reports every issue. Checks:
    frame rate and display format, at least one (anchored) camera, the
    mode, per-mode required fields (cuts / default_cam / remove) and the
    multicam constraints. Returns None when the spec is valid.
    """
    problems = []
    report = problems.append

    # --- sequence ---------------------------------------------------------
    seq = spec.get("sequence", {})
    if "fps" not in seq:
        report("sequence.fps required")
    elif str(seq["fps"]) not in FPS_PRESETS:
        report(f"sequence.fps must be one of {sorted(FPS_PRESETS)}")
    preset = FPS_PRESETS.get(str(seq.get("fps", "")), {})
    df = seq.get("displayformat", "NDF")
    if df not in ("NDF", "DF"):
        report(f"displayformat must be NDF or DF, got {df!r}")
    if df == "DF" and "df_drop" not in preset:
        report(
            f"DF only valid for 29.97 or 59.94 fps (got {seq.get('fps')!r})"
        )

    # --- cameras ----------------------------------------------------------
    cams = spec.get("cameras", {})
    if not cams:
        report("cameras required (at least 1)")
    anchored = False
    for cam in cams.values():
        if "anchor" in cam or "srt_starts_at_source_seconds" in cam:
            anchored = True
            break
    if not anchored:
        report(
            "at least one camera must have 'anchor' or 'srt_starts_at_source_seconds'"
        )
    cam_ids = set(cams.keys())

    # --- mode-specific fields ---------------------------------------------
    mode = spec.get("mode", "highlight")
    if mode not in ("highlight", "sequential"):
        report(f"mode must be 'highlight' or 'sequential', got {mode!r}")

    if mode == "highlight":
        cuts = spec.get("cuts", [])
        if not cuts:
            report("highlight mode requires non-empty 'cuts' list")
        for i, cut in enumerate(cuts):
            if "cam" not in cut:
                report(f"cuts[{i}].cam required")
            elif cut["cam"] not in cam_ids:
                report(
                    f"cuts[{i}].cam={cut['cam']!r} not in cameras {sorted(cam_ids)}"
                )
            if "cues" not in cut:
                report(f"cuts[{i}].cues required")
    elif mode == "sequential":
        default_cam = spec.get("default_cam")
        if not default_cam:
            report("sequential mode requires 'default_cam'")
        elif default_cam not in cam_ids:
            report(
                f"default_cam={default_cam!r} not in cameras {sorted(cam_ids)}"
            )
        for i, entry in enumerate(spec.get("remove", [])):
            if "cues" not in entry:
                report(f"remove[{i}].cues required")

    # --- multicam ---------------------------------------------------------
    multicam = spec.get("settings", {}).get("multicam", False)
    if multicam and len(cams) < 2:
        report("settings.multicam=true requires >= 2 cameras")
    if mode == "sequential" and multicam:
        report("sequential mode + multicam not supported "
               "(use highlight mode for multicam editing)")

    if not problems:
        return
    for message in problems:
        print(f"ERROR: {message}", file=sys.stderr)
    sys.exit(1)


# =============================================================================
# Main
# =============================================================================

def main():
    """Command-line entry point.

    Loads and validates the spec (applying --mode / --target overrides),
    then, unless --validate was given, parses the SRT, resolves camera
    offsets and cuts, and writes either the JSON analysis (--analyze) or
    the FCP7 XML. A summary goes to stderr when writing to a file, and
    --verify additionally prints the verification anchors.
    """
    parser = argparse.ArgumentParser(
        description="SRT + cut spec → FCP7 XML for Premiere Pro"
    )
    parser.add_argument("--srt", help="SRT file path or '-' for stdin")
    parser.add_argument("--spec", required=True, help="cuts JSON path or '-'")
    parser.add_argument("--output", help="XML/JSON output path or '-' (stdout)")
    parser.add_argument("--verify", action="store_true",
                        help="print verification anchors to stderr")
    parser.add_argument("--analyze", action="store_true",
                        help="output JSON analysis instead of XML")
    parser.add_argument("--validate", action="store_true",
                        help="validate spec only, no output")
    parser.add_argument("--target", help="target duration override "
                        "(e.g. '60s', '1min', '1h30min', '1:30')")
    parser.add_argument("--mode", choices=["highlight", "sequential"],
                        help="override spec.mode")
    opts = parser.parse_args()

    # Load the spec and fold command-line overrides into it before validating.
    spec = json.loads(read_input(opts.spec))
    if opts.mode:
        spec["mode"] = opts.mode
    if opts.target:
        spec.setdefault("settings", {})["target_duration"] = opts.target
    validate_spec(spec)

    if opts.validate:
        print("spec: OK", file=sys.stderr)
        return
    if not opts.srt:
        raise SystemExit("--srt required (unless --validate)")

    srt = parse_srt(read_input(opts.srt))

    seq = spec["sequence"]
    fps_preset = get_fps_preset(seq["fps"])
    df = seq.get("displayformat", "NDF")
    settings = spec.get("settings", {})
    padding = settings.get("padding", 0.5)

    cam_offsets, anchor_cam = compute_offsets(spec["cameras"], fps_preset, df)

    mode = spec.get("mode", "highlight")
    if mode != "sequential":
        cut_specs = spec["cuts"]
    else:
        cut_specs = compute_sequential_cuts(
            srt, spec.get("remove", []), spec["default_cam"]
        )
        if not cut_specs:
            raise SystemExit(
                "sequential mode produced 0 cuts (every cue was removed?)"
            )

    cuts = expand_cuts(cut_specs, srt, padding)
    total_duration = compute_frames(cuts, cam_offsets, fps_preset)

    # Compare the resulting length against the optional target duration.
    target_raw = settings.get("target_duration")
    target = None if target_raw is None else parse_duration(target_raw)
    duration_sec = total_duration / actual_fps(fps_preset)
    if target is None:
        duration_status = None
    else:
        diff = duration_sec - target
        if diff > 0:
            duration_status = (
                f"OVER target by {diff:.1f}s "
                f"(actual {duration_sec:.1f}s, target {target:.1f}s = {target_raw!r})"
            )
        else:
            duration_status = (
                f"under target ({duration_sec:.1f}s / {target:.1f}s = {target_raw!r}, "
                f"slack {-diff:.1f}s)"
            )

    if opts.analyze:
        report = make_analysis(
            cuts, srt, cam_offsets, anchor_cam, fps_preset, total_duration
        )
        write_output(opts.output or "-", json.dumps(report, ensure_ascii=False, indent=2))
        return

    if not opts.output:
        raise SystemExit("--output required (unless --analyze or --validate)")

    write_output(opts.output, emit_xml(spec, cuts, total_duration, cam_offsets))

    multicam = settings.get("multicam", False) and len(spec["cameras"]) > 1

    # The summary is only printed when the XML went to a file, not stdout.
    if opts.output != "-":
        summary = [
            f"Wrote {opts.output}",
            f"  mode={mode}, fps={seq['fps']} {df}, "
            f"cameras={list(cam_offsets.keys())}, multicam={multicam}",
            f"  offsets: {cam_offsets}",
            f"  cuts={len(cuts)}, duration={total_duration} frames "
            f"({duration_sec:.2f}s)",
        ]
        if duration_status:
            summary.append(f"  target: {duration_status}")
        for line in summary:
            print(line, file=sys.stderr)

    if opts.verify:
        report = make_analysis(
            cuts, srt, cam_offsets, anchor_cam, fps_preset, total_duration
        )
        print_verify(report, multicam)


if __name__ == "__main__":
    main()