Spaces:

vivekchakraverty
/

DocuMaker

Sleeping

App Files Files Community

DocuMaker / scripts /smoke_test.py

vivekchakraverty

Auto-extract: generate the LLM step outline first, anchor frames to step times

0e7c368 14 days ago

Raw

History Blame Contribute Delete

3.99 kB

	"""Headless end-to-end backend test (no Gradio UI).

	Runs: sample video -> audio -> Whisper -> LLM step draft -> step-aligned
	frames -> assemble -> DOCX, asserting each stage produced output. If the LLM
	call fails or returns no steps, the draft falls back to a naive
	one-step-per-transcript-segment outline, so the DOCX assembly is always
	validated.
	"""
	from __future__ import annotations

	import sys
	from pathlib import Path

	# Windows consoles default to cp1252 and choke on emoji/non-ASCII output, so
	# switch stdout to UTF-8 on a best-effort basis; any failure is ignored.
	try:
	    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
	except Exception:
	    pass

	# ROOT is the directory two levels above this file. Prepend it and its
	# scripts/ subdirectory to sys.path (scripts/ ends up first) so the
	# imports that follow resolve when this file is run directly.
	ROOT = Path(__file__).resolve().parent.parent
	sys.path[:0] = [str(ROOT / "scripts"), str(ROOT)]

	import make_sample # noqa: E402
	from src import config, docx_export, video # noqa: E402
	from src import frames as frames_lib # noqa: E402
	from src import guide as guide_lib # noqa: E402
	from src import llm, transcribe # noqa: E402
	from src.llm import GuideDraft, StepDraft # noqa: E402


	def naive_draft(tr) -> GuideDraft:
	    """Build an offline fallback draft with one step per non-blank segment.

	    Args:
	        tr: Transcription result. Only ``tr.segments`` is read; each segment
	            must expose ``text`` and ``start``.

	    Returns:
	        A ``GuideDraft`` titled "Sample Guide" whose steps follow segment
	        order and are headed "Step 1", "Step 2", ... without gaps.
	    """
	    steps = []
	    for seg in tr.segments:
	        text = seg.text.strip()
	        if not text:
	            # Skip blank segments *before* numbering so they do not leave
	            # holes in the "Step N" sequence.
	            continue
	        steps.append(
	            StepDraft(
	                heading=f"Step {len(steps) + 1}",
	                text=text,
	                approx_timestamp=seg.start,
	            )
	        )
	    return GuideDraft(title="Sample Guide", intro="Generated offline.", steps=steps)


	def main() -> None:
	    """Run the headless pipeline end to end and print a per-step summary.

	    Stages, in order: locate (or generate) the sample video, extract audio
	    and transcribe it, build a guide draft via the LLM (falling back to
	    ``naive_draft`` on any error or an empty step list), extract frames
	    anchored to the draft's step timestamps, assemble the guide, and export
	    it to ``guide.docx`` in the "smoke" session directory.

	    Raises:
	        AssertionError: If the transcript is empty, the draft has no steps,
	            no frames were extracted, or the DOCX is missing or empty.
	    """
	    sample_path = config.WORK_DIR / "sample" / "sample.mp4"
	    if not sample_path.exists():
	        # No sample on disk yet: let make_sample create one and use its result.
	        sample_path = make_sample.main()

	    session = config.session_dir("smoke")

	    # The app takes the token from the UI; this headless test reads it from the
	    # environment (validating which of HF_TOKEN / HUGGINGFACEHUB_API_TOKEN works).
	    hf_token = config.resolve_hf_token()
	    token_status = "present" if hf_token else "MISSING (LLM step will use naive fallback)"
	    print(f"HF token: {token_status}")

	    print("\n[1/5] Extract audio + transcribe…")
	    wav_path = video.extract_audio(sample_path, session / "audio.wav")
	    transcript = transcribe.transcribe(wav_path)
	    print(
	        f" device={transcript.device} segments={len(transcript.segments)}"
	        f" text={transcript.text[:120]!r}"
	    )
	    assert transcript.text.strip(), "Transcript is empty"

	    print("[2/5] Build guide draft (LLM)…")
	    try:
	        draft = llm.build_guide_draft(transcript, token=hf_token)
	        if not draft.steps:
	            # Treat an empty outline like a failed call so the fallback runs.
	            raise RuntimeError("LLM returned no steps")
	        print(f" LLM ok: '{draft.title}' ({len(draft.steps)} steps)")
	    except Exception as err:
	        print(f" LLM unavailable ({err}); using naive fallback draft.")
	        draft = naive_draft(transcript)
	    assert draft.steps, "No steps in draft"

	    print("[3/5] Auto-extract frames (step-aligned)…")
	    if transcript.segments:
	        spoken_intervals = [(seg.start, seg.end) for seg in transcript.segments]
	    else:
	        spoken_intervals = None
	    step_times = [
	        step.approx_timestamp
	        for step in draft.steps
	        if step.approx_timestamp is not None
	    ]
	    frame_records = frames_lib.extract_auto_frames(
	        sample_path,
	        session,
	        spoken_intervals=spoken_intervals,
	        # An empty list is passed as None, same as "no step timestamps".
	        step_timestamps=step_times or None,
	    )
	    print(f" frames={len(frame_records)} (from {len(step_times)} step timestamps)")
	    assert frame_records, "No frames were extracted"

	    print("[4/5] Assemble (align + caption)…")
	    if transcript.segments:
	        first_start = min(seg.start for seg in transcript.segments)
	        last_end = max(seg.end for seg in transcript.segments)
	        spoken_range = (first_start, last_end)
	    else:
	        spoken_range = None
	    guide = guide_lib.assemble_guide(
	        draft,
	        frame_records,
	        video_path=str(sample_path),
	        session_dir=session,
	        do_caption=True,
	        token=hf_token,
	        spoken_range=spoken_range,
	    )

	    print("[5/5] Export DOCX…")
	    docx_path = session / "guide.docx"
	    docx_export.export_docx(guide, docx_path)
	    assert docx_path.exists() and docx_path.stat().st_size > 0, "DOCX not written"

	    for step in guide.steps:
	        # A missing timestamp is shown as 00:00.
	        minutes, seconds = divmod(int(step.timestamp or 0), 60)
	        ts = f"{minutes:02d}:{seconds:02d}"
	        print(f" - [{ts}] {step.heading!r}: img={'yes' if step.image_path else 'no'} cap={step.caption!r}")
	    n_imgs = len([step for step in guide.steps if step.image_path])
	    n_caps = len([step for step in guide.steps if step.caption])
	    summary = (
	        f"\nOK ✅ {docx_path} ({docx_path.stat().st_size} bytes, "
	        f"{len(guide.steps)} steps, {n_imgs} images, {n_caps} captions)"
	    )
	    print(summary)


	# Run the smoke test only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()