"""Headless end-to-end backend test (no Gradio UI). Runs: sample video -> audio -> Whisper -> scene frames -> LLM step draft -> assemble -> DOCX, asserting each stage produced output. The LLM step falls back to a naive sentence-per-step draft if the HuggingFace API is unreachable, so the DOCX assembly is always validated. """ from __future__ import annotations import sys from pathlib import Path try: # Windows consoles default to cp1252 and choke on emoji/non-ASCII output. sys.stdout.reconfigure(encoding="utf-8", errors="replace") except Exception: pass ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(ROOT)) sys.path.insert(0, str(ROOT / "scripts")) import make_sample # noqa: E402 from src import config, docx_export, video # noqa: E402 from src import frames as frames_lib # noqa: E402 from src import guide as guide_lib # noqa: E402 from src import llm, transcribe # noqa: E402 from src.llm import GuideDraft, StepDraft # noqa: E402 def naive_draft(tr) -> GuideDraft: steps = [ StepDraft(heading=f"Step {i}", text=seg.text.strip(), approx_timestamp=seg.start) for i, seg in enumerate(tr.segments, start=1) if seg.text.strip() ] return GuideDraft(title="Sample Guide", intro="Generated offline.", steps=steps) def main() -> None: sample = config.WORK_DIR / "sample" / "sample.mp4" if not sample.exists(): sample = make_sample.main() sdir = config.session_dir("smoke") # The app takes the token from the UI; this headless test reads it from the # environment (validating which of HF_TOKEN / HUGGINGFACEHUB_API_TOKEN works). token = config.resolve_hf_token() print(f"HF token: {'present' if token else 'MISSING (LLM step will use naive fallback)'}") print("\n[1/5] Extract audio + transcribe…") wav = video.extract_audio(sample, sdir / "audio.wav") tr = transcribe.transcribe(wav) print(f" device={tr.device} segments={len(tr.segments)} text={tr.text[:120]!r}") assert tr.text.strip(), "Transcript is empty" print("[2/5] Build guide draft (LLM)…") try: draft = llm.build_guide_draft(tr, token=token) if not draft.steps: raise RuntimeError("LLM returned no steps") print(f" LLM ok: '{draft.title}' ({len(draft.steps)} steps)") except Exception as exc: print(f" LLM unavailable ({exc}); using naive fallback draft.") draft = naive_draft(tr) assert draft.steps, "No steps in draft" print("[3/5] Auto-extract frames (step-aligned)…") spoken = [(s.start, s.end) for s in tr.segments] if tr.segments else None step_ts = [s.approx_timestamp for s in draft.steps if s.approx_timestamp is not None] recs = frames_lib.extract_auto_frames( sample, sdir, spoken_intervals=spoken, step_timestamps=step_ts or None ) print(f" frames={len(recs)} (from {len(step_ts)} step timestamps)") assert recs, "No frames were extracted" print("[4/5] Assemble (align + caption)…") spoken_range = ( (min(s.start for s in tr.segments), max(s.end for s in tr.segments)) if tr.segments else None ) g = guide_lib.assemble_guide( draft, recs, video_path=str(sample), session_dir=sdir, do_caption=True, token=token, spoken_range=spoken_range, ) print("[5/5] Export DOCX…") out = sdir / "guide.docx" docx_export.export_docx(g, out) assert out.exists() and out.stat().st_size > 0, "DOCX not written" for s in g.steps: ts = f"{int((s.timestamp or 0))//60:02d}:{int((s.timestamp or 0))%60:02d}" print(f" - [{ts}] {s.heading!r}: img={'yes' if s.image_path else 'no'} cap={s.caption!r}") n_imgs = sum(1 for s in g.steps if s.image_path) n_caps = sum(1 for s in g.steps if s.caption) print( f"\nOK ✅ {out} ({out.stat().st_size} bytes, " f"{len(g.steps)} steps, {n_imgs} images, {n_caps} captions)" ) if __name__ == "__main__": main()