Spaces:

vivekchakraverty
/

DocuMaker

Sleeping

File size: 9,942 Bytes

85b485a

"""Transcript cleanup + step-structuring via the HuggingFace Inference API.

The LLM turns a rough, timestamped transcript into a structured guide draft
(title, intro, prerequisites, ordered steps). Long transcripts are processed
map-reduce style so prompts stay within the model's context window. All model
ids are config-driven and responses are parsed defensively, because free-tier
model availability and exact output formatting both vary.
"""
from __future__ import annotations

import json
import re
from dataclasses import dataclass, field
from typing import Any, Callable

from . import config
from .transcribe import Transcript

# System message sent with every chat request made by `_chat`: it sets the
# technical-writer persona and tells the model not to invent actions that are
# absent from the transcript.
_SYSTEM = (
    "You are a meticulous technical writer. You convert rough spoken transcripts "
    "from how-to/tutorial videos into clear, accurate, step-by-step instructions. "
    "You never invent actions that are not in the transcript."
)


@dataclass
class StepDraft:
    """One instruction step of a guide draft, as parsed from the LLM output."""

    # Short title of the step; `_parse_steps` falls back to the first 60
    # characters of `text` when the model supplied no heading.
    heading: str
    # The instruction itself (may be empty when only a heading was returned).
    text: str
    # Seconds into the recording as returned by `_parse_time`; None when the
    # model gave no usable time for this step.
    approx_timestamp: float | None = None


@dataclass
class GuideDraft:
    """Structured guide produced from a transcript: title, intro, prerequisites, steps.

    Every field has a default, so ``GuideDraft()`` is a valid "empty" draft; it is
    what the parsing code returns when the model output is unusable.
    """

    # Guide title; the default doubles as the fallback when the model omits one.
    title: str = "Step-by-Step Guide"
    # Short introduction text (empty when none was returned).
    intro: str = ""
    # Prerequisite items as plain strings.
    prerequisites: list[str] = field(default_factory=list)
    # Ordered instruction steps.
    steps: list[StepDraft] = field(default_factory=list)


# --- Inference client --------------------------------------------------------
def get_client(token: str | None):
    """Return a fresh ``InferenceClient`` bound to ``config.LLM_MODEL``.

    ``token`` and ``config.LLM_PROVIDER`` are forwarded only when they are
    truthy. A new client is constructed on every call rather than cached, so a
    token changed at runtime (it comes from the UI field) is picked up
    immediately.
    """
    # Imported lazily so the dependency is only needed once a client is built.
    from huggingface_hub import InferenceClient

    options: dict[str, Any] = {"model": config.LLM_MODEL}
    optional = (("token", token), ("provider", config.LLM_PROVIDER))
    # Keep only the optional settings that are actually set.
    options.update({key: value for key, value in optional if value})
    return InferenceClient(**options)


def _chat(client, user_prompt: str, *, max_tokens: int | None = None) -> str:
    """Send one system + user chat turn and return the assistant's text.

    ``_SYSTEM`` is sent as the system message and ``user_prompt`` as the user
    message. ``max_tokens`` falls back to ``config.LLM_MAX_TOKENS`` when it is
    ``None`` or ``0``; the temperature always comes from
    ``config.LLM_TEMPERATURE``.

    Returns the content of the first choice, or ``""`` when the response has no
    choices, no message, or no content. Callers feed the result to
    ``_extract_json``, which treats an empty string as "nothing parseable".

    Raises ``RuntimeError`` (chained to the original exception) when the API
    call itself fails.
    """
    try:
        resp = client.chat_completion(
            messages=[
                {"role": "system", "content": _SYSTEM},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=max_tokens or config.LLM_MAX_TOKENS,
            temperature=config.LLM_TEMPERATURE,
        )
    except Exception as exc:
        raise RuntimeError(
            f"HuggingFace LLM call failed for model '{config.LLM_MODEL}'. "
            f"Check the model is available on your plan or set DOCUMAKER_LLM_MODEL "
            f"to another instruct model.\nDetails: {exc}"
        ) from exc

    # A successful call can still come back without choices or a message.
    # Treat that like missing content instead of letting an IndexError or
    # AttributeError escape without the context given above.
    choices = getattr(resp, "choices", None) or []
    if not choices:
        return ""
    message = getattr(choices[0], "message", None)
    return getattr(message, "content", None) or ""


# --- JSON / time parsing -----------------------------------------------------
def _extract_json(text: str) -> Any:
    """Best-effort extraction of a JSON object/array from an LLM response.

    Strategies are tried in order and the first successful parse wins:

    1. the whole (stripped) response;
    2. the contents of the first ``` fenced block (with or without ``json``);
    3. the slice from the first opening bracket to the last closing bracket,
       objects (``{}``) before arrays (``[]``);
    4. a scan that decodes a value starting at each opening bracket and ignores
       whatever follows it, which rescues responses whose trailing prose
       contains a stray ``}`` or ``]``.

    Returns the decoded value, or ``None`` when nothing parses.
    """
    # What the json decoder raises on bad input or on pathologically deep nesting.
    parse_errors = (ValueError, RecursionError)

    text = text.strip()
    try:
        return json.loads(text)
    except parse_errors:
        pass

    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, re.S)
    if fenced:
        try:
            return json.loads(fenced.group(1))
        except parse_errors:
            pass

    for open_ch, close_ch in (("{", "}"), ("[", "]")):
        i, j = text.find(open_ch), text.rfind(close_ch)
        if i != -1 and j > i:
            try:
                return json.loads(text[i : j + 1])
            except parse_errors:
                continue

    # Last resort: raw_decode stops at the end of the first complete value, so
    # text after the JSON no longer matters. Objects are tried before arrays to
    # keep the same preference as the slice strategy above.
    decoder = json.JSONDecoder()
    for open_ch in "{[":
        pos = text.find(open_ch)
        while pos != -1:
            try:
                value, _end = decoder.raw_decode(text, pos)
            except parse_errors:
                pos = text.find(open_ch, pos + 1)
            else:
                return value
    return None


def _parse_time(value: Any) -> float | None:
    """Parse 'ss', 'mm:ss', or 'hh:mm:ss' (or a number) into seconds."""
    if value is None:
        return None
    if isinstance(value, (int, float)):
        return float(value)
    s = str(value).strip()
    if not s:
        return None
    try:
        parts = [float(p) for p in s.split(":")]
    except ValueError:
        return None
    seconds = 0.0
    for part in parts:
        seconds = seconds * 60 + part
    return seconds


# --- Prompt builders ---------------------------------------------------------
# Example response shapes embedded verbatim in the prompts below.
# _JSON_FULL is the complete guide (title, intro, prerequisites, steps) and is
# used by `_full_prompt` and `_reduce_prompt`.
_JSON_FULL = (
    '{"title": "...", "intro": "...", "prerequisites": ["..."], '
    '"steps": [{"heading": "short title", "text": "what to do", "approx_time": "mm:ss"}]}'
)
# _JSON_STEPS is the steps-only shape requested per chunk by `_chunk_prompt`.
_JSON_STEPS = '{"steps": [{"heading": "short title", "text": "what to do", "approx_time": "mm:ss"}]}'


def _full_prompt(timestamped_text: str) -> str:
    """Build the single-pass prompt: whole transcript in, full guide JSON out.

    The prompt lists the rewriting rules, shows the ``_JSON_FULL`` response
    shape, and ends with the transcript text.
    """
    instructions = "\n".join(
        (
            "Convert this timestamped tutorial transcript into a clean step-by-step guide.",
            "- Fix obvious speech-to-text errors; remove filler words and repetition.",
            "- Write each step as a clear, imperative instruction.",
            "- Add a short descriptive title, a 1-2 sentence introduction, and any "
            "prerequisites that are implied (empty list if none).",
            '- For each step set "approx_time" as "mm:ss" from the nearest [mm:ss] marker.',
            "Respond with ONLY JSON in this exact shape:",
        )
    )
    return f"{instructions}\n{_JSON_FULL}\n\nTranscript:\n{timestamped_text}"


def _chunk_prompt(timestamped_text: str) -> str:
    """Build the map-phase prompt: one transcript excerpt in, steps-only JSON out.

    The prompt shows the ``_JSON_STEPS`` response shape and ends with the
    excerpt text.
    """
    task = (
        "From this timestamped transcript excerpt of a tutorial video, extract the "
        "concrete actions as an ordered list of steps."
    )
    rules = (
        "- Fix speech-to-text errors; remove filler and repetition.",
        "- Write each step as a clear, imperative instruction.",
        '- Set "approx_time" as "mm:ss" from the nearest [mm:ss] marker.',
    )
    preamble = "\n".join((task, *rules, "Respond with ONLY JSON in this exact shape:"))
    return f"{preamble}\n{_JSON_STEPS}\n\nTranscript:\n{timestamped_text}"


def _reduce_prompt(steps_json: str) -> str:
    """Build the reduce-phase prompt: per-chunk steps in, full guide JSON out.

    ``steps_json`` is the serialized list of steps gathered from all chunks; the
    prompt asks the model to merge them and shows the ``_JSON_FULL`` shape.
    """
    task = (
        "You are assembling the final step-by-step guide from steps extracted across "
        "several transcript chunks."
    )
    rules = (
        "- Merge near-duplicates and keep a logical order.",
        "- Keep every distinct action; do not invent new steps.",
        "- Add a short descriptive title, a 1-2 sentence introduction, and prerequisites "
        "if implied (empty list if none). Preserve each step's \"approx_time\".",
    )
    preamble = "\n".join((task, *rules, "Respond with ONLY JSON in this exact shape:"))
    return f"{preamble}\n{_JSON_FULL}\n\nExtracted steps:\n{steps_json}"


# --- Chunking ----------------------------------------------------------------
def _timestamped_lines(transcript: Transcript) -> list[str]:
    """Render every transcript segment as a ``[mm:ss] text`` line.

    The start time is truncated to whole seconds. Minutes are not wrapped at 60,
    so a segment starting at 3725 s is rendered as ``[62:05]``.
    """

    def render(segment) -> str:
        minutes, seconds = divmod(int(segment.start), 60)
        return f"[{minutes:02d}:{seconds:02d}] {segment.text.strip()}"

    return [render(segment) for segment in transcript.segments]


def _chunk_transcript(transcript: Transcript, max_chars: int) -> list[str]:
    """Group the timestamped transcript lines into chunks of about ``max_chars``.

    Lines are never split: a chunk is closed as soon as adding the next line
    would push its joined length past ``max_chars``, so a single over-long line
    still becomes a chunk of its own. When the transcript has no segments, the
    raw ``transcript.text`` is returned as the only chunk (or no chunks if it is
    empty).
    """
    lines = _timestamped_lines(transcript)
    if not lines:
        if transcript.text:
            return [transcript.text]
        return []

    chunks: list[str] = []
    pending: list[str] = []
    used = 0  # characters in `pending`, counting one newline per line
    for line in lines:
        if pending and used + len(line) > max_chars:
            # Close the current chunk and start the next one with this line.
            chunks.append("\n".join(pending))
            pending = [line]
            used = len(line) + 1
            continue
        pending.append(line)
        used += len(line) + 1

    if pending:
        chunks.append("\n".join(pending))
    return chunks


# --- Result parsing ----------------------------------------------------------
def _parse_steps(raw_steps: Any) -> list[StepDraft]:
    """Convert the model's raw ``steps`` value into :class:`StepDraft` objects.

    Anything that is not a list yields an empty result, and list items that are
    not dicts are skipped. ``title`` and ``instruction`` are accepted as
    alternative keys for ``heading`` and ``text``. Items with neither a heading
    nor text are dropped; a missing heading falls back to the first 60
    characters of the text. The time is read from ``approx_time``, or from
    ``approx_timestamp`` when that key is absent.
    """
    if not isinstance(raw_steps, list):
        return []

    parsed: list[StepDraft] = []
    for entry in raw_steps:
        if not isinstance(entry, dict):
            continue

        heading = str(entry.get("heading") or entry.get("title") or "").strip()
        body = str(entry.get("text") or entry.get("instruction") or "").strip()
        if not heading and not body:
            continue

        raw_time = entry.get("approx_time", entry.get("approx_timestamp"))
        timestamp = _parse_time(raw_time)
        parsed.append(
            StepDraft(
                heading=heading if heading else body[:60],
                text=body,
                approx_timestamp=timestamp,
            )
        )
    return parsed


def _parse_guide(data: Any) -> GuideDraft:
    """Convert the model's decoded JSON into a :class:`GuideDraft`.

    A non-dict input yields a default ``GuideDraft()``. A missing or empty title
    falls back to "Step-by-Step Guide". A non-list ``prerequisites`` value is
    wrapped as a single item, and blank prerequisite entries are dropped.
    """
    if not isinstance(data, dict):
        return GuideDraft()

    raw_prereqs = data.get("prerequisites") or []
    if not isinstance(raw_prereqs, list):
        raw_prereqs = [str(raw_prereqs)]
    stripped = (str(item).strip() for item in raw_prereqs)
    prerequisites = [item for item in stripped if item]

    title = str(data.get("title") or "Step-by-Step Guide").strip()
    intro = str(data.get("intro") or "").strip()
    steps = _parse_steps(data.get("steps"))
    return GuideDraft(title=title, intro=intro, prerequisites=prerequisites, steps=steps)


# --- Public entry point ------------------------------------------------------
def build_guide_draft(
    transcript: Transcript,
    *,
    token: str | None = None,
    progress: Callable[[float, str], None] | None = None,
) -> GuideDraft:
    """Turn a transcript into a structured :class:`GuideDraft` via the LLM.

    ``token`` is the user's HuggingFace token (supplied in the UI).
    ``progress``, when given, is called with a fraction in [0, 1] and a short
    status message as the work advances.

    A transcript that fits into one chunk is converted with a single request.
    Longer transcripts are processed map-reduce style: steps are extracted per
    chunk, then one final request merges them into a titled guide. If that
    merge returns no steps, the per-chunk steps are used as they are. If no
    chunk produced any steps, an empty ``GuideDraft()`` is returned without
    issuing the merge request, because asking the model to assemble a guide
    from nothing only invites invented steps.

    Raises ``RuntimeError`` when an LLM call fails (see ``_chat``).
    """

    def report(fraction: float, message: str) -> None:
        if progress:
            progress(fraction, message)

    chunks = _chunk_transcript(transcript, config.LLM_CHUNK_CHARS)
    if not chunks:
        return GuideDraft()

    client = get_client(token)

    if len(chunks) == 1:
        report(0.1, "Writing the guide…")
        draft = _parse_guide(_extract_json(_chat(client, _full_prompt(chunks[0]))))
        report(1.0, "Guide drafted.")
        return draft

    # Map: extract steps per chunk.
    total = len(chunks)
    all_steps: list[dict] = []
    for i, chunk in enumerate(chunks):
        report(i / (total + 1), f"Structuring part {i + 1}/{total}…")
        data = _extract_json(_chat(client, _chunk_prompt(chunk)))
        # Accept both {"steps": [...]} and a bare list of steps.
        raw_steps = data.get("steps") if isinstance(data, dict) else data
        for step in _parse_steps(raw_steps):
            all_steps.append(
                {
                    "heading": step.heading,
                    "text": step.text,
                    # None (JSON null) marks an unknown time; writing "00:00"
                    # would fabricate a timestamp that survives into the result.
                    "approx_time": _format_mmss(step.approx_timestamp),
                }
            )

    if not all_steps:
        report(1.0, "Guide drafted.")
        return GuideDraft()

    # Reduce: merge into a titled guide.
    report(total / (total + 1), "Assembling the final guide…")
    reduced = _extract_json(_chat(client, _reduce_prompt(json.dumps({"steps": all_steps}))))
    draft = _parse_guide(reduced)
    if not draft.steps:  # reduce failed — fall back to the mapped steps
        draft = _parse_guide({"steps": all_steps})
    report(1.0, "Guide drafted.")
    return draft


def _format_mmss(seconds: float | None) -> str | None:
    """Format ``seconds`` as ``mm:ss``; return None when the time is unknown or unusable."""
    if seconds is None:
        return None
    try:
        whole = int(seconds)
    except (ValueError, OverflowError):  # NaN or infinity
        return None
    if whole < 0:
        return None
    minutes, secs = divmod(whole, 60)
    return f"{minutes:02d}:{secs:02d}"