"""Transcript cleanup + step-structuring via the HuggingFace Inference API. The LLM turns a rough, timestamped transcript into a structured guide draft (title, intro, prerequisites, ordered steps). Long transcripts are processed map-reduce style so prompts stay within the model's context window. All model ids are config-driven and responses are parsed defensively, because free-tier model availability and exact output formatting both vary. """ from __future__ import annotations import json import re from dataclasses import dataclass, field from typing import Any, Callable from . import config from .transcribe import Transcript _SYSTEM = ( "You are a meticulous technical writer. You convert rough spoken transcripts " "from how-to/tutorial videos into clear, accurate, step-by-step instructions. " "You never invent actions that are not in the transcript." ) @dataclass class StepDraft: heading: str text: str approx_timestamp: float | None = None @dataclass class GuideDraft: title: str = "Step-by-Step Guide" intro: str = "" prerequisites: list[str] = field(default_factory=list) steps: list[StepDraft] = field(default_factory=list) # --- Inference client -------------------------------------------------------- def get_client(token: str | None): """Build an InferenceClient for the given user-supplied token. Creating a client is cheap (no network until a call), so we don't cache — this lets the token change at runtime (it comes from the UI field). """ from huggingface_hub import InferenceClient kwargs: dict[str, Any] = {"model": config.LLM_MODEL} if token: kwargs["token"] = token if config.LLM_PROVIDER: kwargs["provider"] = config.LLM_PROVIDER return InferenceClient(**kwargs) def _chat(client, user_prompt: str, *, max_tokens: int | None = None) -> str: try: resp = client.chat_completion( messages=[ {"role": "system", "content": _SYSTEM}, {"role": "user", "content": user_prompt}, ], max_tokens=max_tokens or config.LLM_MAX_TOKENS, temperature=config.LLM_TEMPERATURE, ) except Exception as exc: raise RuntimeError( f"HuggingFace LLM call failed for model '{config.LLM_MODEL}'. " f"Check the model is available on your plan or set DOCUMAKER_LLM_MODEL " f"to another instruct model.\nDetails: {exc}" ) from exc return resp.choices[0].message.content or "" # --- JSON / time parsing ----------------------------------------------------- def _extract_json(text: str) -> Any: """Best-effort extraction of a JSON object/array from an LLM response.""" text = text.strip() try: return json.loads(text) except Exception: pass fenced = re.search(r"```(?:json)?\s*(.*?)```", text, re.S) if fenced: try: return json.loads(fenced.group(1)) except Exception: pass for open_ch, close_ch in (("{", "}"), ("[", "]")): i, j = text.find(open_ch), text.rfind(close_ch) if i != -1 and j > i: try: return json.loads(text[i : j + 1]) except Exception: continue return None def _parse_time(value: Any) -> float | None: """Parse 'ss', 'mm:ss', or 'hh:mm:ss' (or a number) into seconds.""" if value is None: return None if isinstance(value, (int, float)): return float(value) s = str(value).strip() if not s: return None try: parts = [float(p) for p in s.split(":")] except ValueError: return None seconds = 0.0 for part in parts: seconds = seconds * 60 + part return seconds # --- Prompt builders --------------------------------------------------------- _JSON_FULL = ( '{"title": "...", "intro": "...", "prerequisites": ["..."], ' '"steps": [{"heading": "short title", "text": "what to do", "approx_time": "mm:ss"}]}' ) _JSON_STEPS = '{"steps": [{"heading": "short title", "text": "what to do", "approx_time": "mm:ss"}]}' def _full_prompt(timestamped_text: str) -> str: return ( "Convert this timestamped tutorial transcript into a clean step-by-step guide.\n" "- Fix obvious speech-to-text errors; remove filler words and repetition.\n" "- Write each step as a clear, imperative instruction.\n" "- Add a short descriptive title, a 1-2 sentence introduction, and any " "prerequisites that are implied (empty list if none).\n" "- For each step set \"approx_time\" as \"mm:ss\" from the nearest [mm:ss] marker.\n" f"Respond with ONLY JSON in this exact shape:\n{_JSON_FULL}\n\n" f"Transcript:\n{timestamped_text}" ) def _chunk_prompt(timestamped_text: str) -> str: return ( "From this timestamped transcript excerpt of a tutorial video, extract the " "concrete actions as an ordered list of steps.\n" "- Fix speech-to-text errors; remove filler and repetition.\n" "- Write each step as a clear, imperative instruction.\n" "- Set \"approx_time\" as \"mm:ss\" from the nearest [mm:ss] marker.\n" f"Respond with ONLY JSON in this exact shape:\n{_JSON_STEPS}\n\n" f"Transcript:\n{timestamped_text}" ) def _reduce_prompt(steps_json: str) -> str: return ( "You are assembling the final step-by-step guide from steps extracted across " "several transcript chunks.\n" "- Merge near-duplicates and keep a logical order.\n" "- Keep every distinct action; do not invent new steps.\n" "- Add a short descriptive title, a 1-2 sentence introduction, and prerequisites " "if implied (empty list if none). Preserve each step's \"approx_time\".\n" f"Respond with ONLY JSON in this exact shape:\n{_JSON_FULL}\n\n" f"Extracted steps:\n{steps_json}" ) # --- Chunking ---------------------------------------------------------------- def _timestamped_lines(transcript: Transcript) -> list[str]: lines = [] for seg in transcript.segments: mm, ss = divmod(int(seg.start), 60) lines.append(f"[{mm:02d}:{ss:02d}] {seg.text.strip()}") return lines def _chunk_transcript(transcript: Transcript, max_chars: int) -> list[str]: lines = _timestamped_lines(transcript) if not lines: return [transcript.text] if transcript.text else [] chunks: list[str] = [] current: list[str] = [] length = 0 for line in lines: if current and length + len(line) > max_chars: chunks.append("\n".join(current)) current, length = [], 0 current.append(line) length += len(line) + 1 if current: chunks.append("\n".join(current)) return chunks # --- Result parsing ---------------------------------------------------------- def _parse_steps(raw_steps: Any) -> list[StepDraft]: steps: list[StepDraft] = [] if not isinstance(raw_steps, list): return steps for item in raw_steps: if not isinstance(item, dict): continue heading = str(item.get("heading") or item.get("title") or "").strip() text = str(item.get("text") or item.get("instruction") or "").strip() if not (heading or text): continue ts = _parse_time(item.get("approx_time", item.get("approx_timestamp"))) steps.append(StepDraft(heading=heading or text[:60], text=text, approx_timestamp=ts)) return steps def _parse_guide(data: Any) -> GuideDraft: if not isinstance(data, dict): return GuideDraft() prereqs = data.get("prerequisites") or [] if not isinstance(prereqs, list): prereqs = [str(prereqs)] return GuideDraft( title=str(data.get("title") or "Step-by-Step Guide").strip(), intro=str(data.get("intro") or "").strip(), prerequisites=[str(p).strip() for p in prereqs if str(p).strip()], steps=_parse_steps(data.get("steps")), ) # --- Public entry point ------------------------------------------------------ def build_guide_draft( transcript: Transcript, *, token: str | None = None, progress: Callable[[float, str], None] | None = None, ) -> GuideDraft: """Turn a transcript into a structured :class:`GuideDraft` via the LLM. ``token`` is the user's HuggingFace token (supplied in the UI). """ chunks = _chunk_transcript(transcript, config.LLM_CHUNK_CHARS) if not chunks: return GuideDraft() client = get_client(token) if len(chunks) == 1: if progress: progress(0.1, "Writing the guide…") draft = _parse_guide(_extract_json(_chat(client, _full_prompt(chunks[0])))) if progress: progress(1.0, "Guide drafted.") return draft # Map: extract steps per chunk. all_steps: list[dict] = [] for i, chunk in enumerate(chunks): if progress: progress(i / (len(chunks) + 1), f"Structuring part {i + 1}/{len(chunks)}…") data = _extract_json(_chat(client, _chunk_prompt(chunk))) for step in _parse_steps((data or {}).get("steps") if isinstance(data, dict) else data): mm, ss = divmod(int(step.approx_timestamp or 0), 60) all_steps.append( {"heading": step.heading, "text": step.text, "approx_time": f"{mm:02d}:{ss:02d}"} ) # Reduce: merge into a titled guide. if progress: progress(len(chunks) / (len(chunks) + 1), "Assembling the final guide…") reduced = _extract_json(_chat(client, _reduce_prompt(json.dumps({"steps": all_steps})))) draft = _parse_guide(reduced) if not draft.steps: # reduce failed — fall back to the mapped steps draft = _parse_guide({"steps": all_steps}) if progress: progress(1.0, "Guide drafted.") return draft