File size: 9,942 Bytes
85b485a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
"""Transcript cleanup + step-structuring via the HuggingFace Inference API.

The LLM turns a rough, timestamped transcript into a structured guide draft
(title, intro, prerequisites, ordered steps). Long transcripts are processed
map-reduce style so prompts stay within the model's context window. All model
ids are config-driven and responses are parsed defensively, because free-tier
model availability and exact output formatting both vary.
"""
from __future__ import annotations

import json
import re
from dataclasses import dataclass, field
from typing import Any, Callable

from . import config
from .transcribe import Transcript

# System message sent as the first chat message of every LLM call made by
# _chat(). It fixes the model's role (technical writer) and forbids inventing
# actions that are not present in the transcript.
_SYSTEM = (
    "You are a meticulous technical writer. You convert rough spoken transcripts "
    "from how-to/tutorial videos into clear, accurate, step-by-step instructions. "
    "You never invent actions that are not in the transcript."
)


@dataclass
class StepDraft:
    """One instruction step of a guide draft, as parsed from the LLM output."""

    # Short title of the step.
    heading: str
    # The instruction itself ("what to do").
    text: str
    # Approximate position of the step in the transcript, in seconds;
    # None when the model supplied no usable time.
    approx_timestamp: float | None = None


@dataclass
class GuideDraft:
    """Structured guide produced from a transcript: title, intro, prerequisites, steps.

    Every field has a default, so ``GuideDraft()`` is a valid empty draft.
    """

    # Guide title; a generic placeholder is used when none is supplied.
    title: str = "Step-by-Step Guide"
    # Short introduction text (may be empty).
    intro: str = ""
    # Prerequisite descriptions (empty list when there are none).
    prerequisites: list[str] = field(default_factory=list)
    # Ordered instruction steps.
    steps: list[StepDraft] = field(default_factory=list)


# --- Inference client --------------------------------------------------------
def get_client(token: str | None):
    """Return a fresh ``InferenceClient`` bound to the configured LLM model.

    A new client is constructed on every call instead of being cached, so a
    token that changes at runtime (it is entered in the UI) takes effect
    immediately. ``token`` and the configured provider are only forwarded
    when they are truthy; otherwise the client's own defaults apply.
    """
    from huggingface_hub import InferenceClient

    optional: dict[str, Any] = {
        "token": token,
        "provider": config.LLM_PROVIDER,
    }
    # Drop unset options so InferenceClient falls back to its defaults.
    supplied = {name: value for name, value in optional.items() if value}
    return InferenceClient(model=config.LLM_MODEL, **supplied)


def _chat(client, user_prompt: str, *, max_tokens: int | None = None) -> str:
    """Send one system+user exchange to the LLM and return the reply text.

    Args:
        client: Object exposing ``chat_completion`` (see ``get_client``).
        user_prompt: Content of the user message.
        max_tokens: Completion token limit; a falsy value selects
            ``config.LLM_MAX_TOKENS``.

    Returns:
        The content of the first choice, or ``""`` when that content is empty.

    Raises:
        RuntimeError: If the completion call raises; the original exception
            is chained as the cause.
    """
    conversation = [
        {"role": "system", "content": _SYSTEM},
        {"role": "user", "content": user_prompt},
    ]
    try:
        token_budget = max_tokens or config.LLM_MAX_TOKENS
        resp = client.chat_completion(
            messages=conversation,
            max_tokens=token_budget,
            temperature=config.LLM_TEMPERATURE,
        )
    except Exception as exc:
        message = (
            f"HuggingFace LLM call failed for model '{config.LLM_MODEL}'. "
            f"Check the model is available on your plan or set DOCUMAKER_LLM_MODEL "
            f"to another instruct model.\nDetails: {exc}"
        )
        raise RuntimeError(message) from exc

    reply = resp.choices[0].message.content
    return reply if reply else ""


# --- JSON / time parsing -----------------------------------------------------
def _extract_json(text: str) -> Any:
    """Best-effort extraction of a JSON object/array from an LLM response.

    Strategies are tried in order, and the first one that decodes wins:

    1. The whole (stripped) response is JSON.
    2. The contents of the first Markdown code fence are JSON.
    3. The span from the first ``{``/``[`` to the last ``}``/``]`` is JSON.
    4. A JSON value starts at some ``{`` or ``[`` and is followed by
       unrelated text (e.g. ``{"a": 1} Hope {this} helps``), which defeats
       strategy 3 because the last closing bracket belongs to the prose.

    Args:
        text: Raw model output.

    Returns:
        The decoded value, or ``None`` when nothing decodable is found.
    """
    # json.loads signals malformed input with JSONDecodeError (a ValueError);
    # RecursionError covers pathologically nested input. Anything else is a
    # programming error and should surface.
    decode_errors = (ValueError, RecursionError)

    text = text.strip()
    try:
        return json.loads(text)
    except decode_errors:
        pass

    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, re.S)
    if fenced:
        try:
            return json.loads(fenced.group(1))
        except decode_errors:
            pass

    for open_ch, close_ch in (("{", "}"), ("[", "]")):
        i, j = text.find(open_ch), text.rfind(close_ch)
        if i != -1 and j > i:
            try:
                return json.loads(text[i : j + 1])
            except decode_errors:
                continue

    # Last resort: decode a single JSON value starting at each opening
    # bracket in turn; raw_decode ignores whatever follows the value.
    decoder = json.JSONDecoder()
    for opener in re.finditer(r"[{\[]", text):
        try:
            value, _end = decoder.raw_decode(text, opener.start())
        except decode_errors:
            continue
        return value
    return None


def _parse_time(value: Any) -> float | None:
    """Parse 'ss', 'mm:ss', or 'hh:mm:ss' (or a number) into seconds.

    The value comes from model output, so anything that is not a usable
    timestamp yields ``None`` instead of raising or leaking a bogus number:

    * ``None``, booleans, empty strings and non-numeric text;
    * more than three colon-separated fields;
    * negative or non-finite results (``json.loads`` accepts ``NaN`` and
      ``Infinity``, and ``int()`` on those raises).

    A marker copied verbatim with its square brackets (``"[01:30]"``) is
    accepted.

    Args:
        value: Number of seconds, or a string in one of the formats above.

    Returns:
        The time in seconds, or ``None`` when it cannot be determined.
    """
    import math  # local import: keeps this block independent of file-level imports

    # bool is a subclass of int; a JSON true/false is never a timestamp.
    if value is None or isinstance(value, bool):
        return None

    if isinstance(value, (int, float)):
        try:
            seconds = float(value)
        except OverflowError:  # integer too large for a float
            return None
    else:
        s = str(value).strip().strip("[]").strip()
        if not s:
            return None
        fields = s.split(":")
        if len(fields) > 3:
            return None
        try:
            parts = [float(p) for p in fields]
        except ValueError:
            return None
        if any(part < 0 for part in parts):
            return None
        seconds = 0.0
        for part in parts:
            seconds = seconds * 60 + part

    if not math.isfinite(seconds) or seconds < 0:
        return None
    return seconds


# --- Prompt builders ---------------------------------------------------------
# Example JSON shapes embedded verbatim in the prompts to show the model the
# expected output layout. _JSON_FULL is a whole guide (used by _full_prompt and
# _reduce_prompt); _JSON_STEPS is the steps-only payload (used by _chunk_prompt).
_JSON_FULL = (
    '{"title": "...", "intro": "...", "prerequisites": ["..."], '
    '"steps": [{"heading": "short title", "text": "what to do", "approx_time": "mm:ss"}]}'
)
_JSON_STEPS = '{"steps": [{"heading": "short title", "text": "what to do", "approx_time": "mm:ss"}]}'


def _full_prompt(timestamped_text: str) -> str:
    """Build the single-pass prompt that turns a whole transcript into a guide.

    The prompt is made of three parts, concatenated in order: the rewriting
    instructions, the required JSON shape (``_JSON_FULL``), and the
    timestamped transcript itself.
    """
    instructions = (
        "Convert this timestamped tutorial transcript into a clean step-by-step guide.\n"
        "- Fix obvious speech-to-text errors; remove filler words and repetition.\n"
        "- Write each step as a clear, imperative instruction.\n"
        "- Add a short descriptive title, a 1-2 sentence introduction, and any "
        "prerequisites that are implied (empty list if none).\n"
        "- For each step set \"approx_time\" as \"mm:ss\" from the nearest [mm:ss] marker.\n"
    )
    output_shape = f"Respond with ONLY JSON in this exact shape:\n{_JSON_FULL}\n\n"
    payload = f"Transcript:\n{timestamped_text}"
    return instructions + output_shape + payload


def _chunk_prompt(timestamped_text: str) -> str:
    """Build the map-phase prompt that extracts steps from one transcript chunk.

    The prompt is made of three parts, concatenated in order: the extraction
    instructions, the required JSON shape (``_JSON_STEPS``), and the
    timestamped excerpt.
    """
    instructions = (
        "From this timestamped transcript excerpt of a tutorial video, extract the "
        "concrete actions as an ordered list of steps.\n"
        "- Fix speech-to-text errors; remove filler and repetition.\n"
        "- Write each step as a clear, imperative instruction.\n"
        "- Set \"approx_time\" as \"mm:ss\" from the nearest [mm:ss] marker.\n"
    )
    output_shape = f"Respond with ONLY JSON in this exact shape:\n{_JSON_STEPS}\n\n"
    payload = f"Transcript:\n{timestamped_text}"
    return instructions + output_shape + payload


def _reduce_prompt(steps_json: str) -> str:
    """Build the reduce-phase prompt that merges per-chunk steps into one guide.

    The prompt is made of three parts, concatenated in order: the merging
    instructions, the required JSON shape (``_JSON_FULL``), and the already
    serialized steps passed in as ``steps_json``.
    """
    instructions = (
        "You are assembling the final step-by-step guide from steps extracted across "
        "several transcript chunks.\n"
        "- Merge near-duplicates and keep a logical order.\n"
        "- Keep every distinct action; do not invent new steps.\n"
        "- Add a short descriptive title, a 1-2 sentence introduction, and prerequisites "
        "if implied (empty list if none). Preserve each step's \"approx_time\".\n"
    )
    output_shape = f"Respond with ONLY JSON in this exact shape:\n{_JSON_FULL}\n\n"
    payload = f"Extracted steps:\n{steps_json}"
    return instructions + output_shape + payload


# --- Chunking ----------------------------------------------------------------
def _timestamped_lines(transcript: Transcript) -> list[str]:
    """Render each transcript segment as a ``[mm:ss] text`` line.

    The marker comes from the segment's start time truncated to whole
    seconds; minutes are not wrapped into hours. Segment text is stripped of
    surrounding whitespace.
    """

    def render(segment) -> str:
        minutes, seconds = divmod(int(segment.start), 60)
        return f"[{minutes:02d}:{seconds:02d}] {segment.text.strip()}"

    return [render(segment) for segment in transcript.segments]


def _chunk_transcript(transcript: Transcript, max_chars: int) -> list[str]:
    """Split a transcript into newline-joined chunks of timestamped lines.

    Lines are packed greedily: a new chunk is started when adding the next
    line would push the joined chunk past ``max_chars``. A single line longer
    than ``max_chars`` still gets a chunk of its own, so no text is dropped.

    When the transcript has no segments, its plain ``text`` is returned as
    one chunk, or an empty list if that is empty too.
    """
    entries = _timestamped_lines(transcript)
    if not entries:
        return [transcript.text] if transcript.text else []

    groups: list[list[str]] = [[]]
    used = 0  # length of the open group once joined, plus one trailing newline
    for entry in entries:
        if groups[-1] and used + len(entry) > max_chars:
            groups.append([])
            used = 0
        groups[-1].append(entry)
        used += len(entry) + 1
    return ["\n".join(group) for group in groups]


# --- Result parsing ----------------------------------------------------------
def _first_nonblank(item: dict, *keys: str) -> str:
    """Return the first truthy, non-blank value among ``keys`` as stripped text."""
    for key in keys:
        value = item.get(key)
        if not value:
            continue
        cleaned = str(value).strip()
        if cleaned:
            return cleaned
    return ""


def _parse_steps(raw_steps: Any) -> list[StepDraft]:
    """Convert the model's raw ``steps`` payload into ``StepDraft`` objects.

    Parsing is tolerant of formatting drift in the model output:

    * anything that is not a list yields an empty result;
    * entries that are not dicts, or have neither a heading nor text, are
      skipped;
    * ``title`` is accepted as an alias of ``heading``, ``instruction`` as an
      alias of ``text`` and ``approx_timestamp`` as an alias of
      ``approx_time``. An alias is used whenever the primary key is missing,
      null or blank.

    A step without a heading borrows the first 60 characters of its text.

    Args:
        raw_steps: Decoded JSON value expected to be a list of step dicts.

    Returns:
        The parsed steps, in input order.
    """
    steps: list[StepDraft] = []
    if not isinstance(raw_steps, list):
        return steps
    for item in raw_steps:
        if not isinstance(item, dict):
            continue
        heading = _first_nonblank(item, "heading", "title")
        text = _first_nonblank(item, "text", "instruction")
        if not (heading or text):
            continue
        # Test for None explicitly: a plain `or` would discard a legitimate
        # time of 0, and dict.get's default is not used for a present-but-null key.
        raw_time = item.get("approx_time")
        if raw_time is None:
            raw_time = item.get("approx_timestamp")
        steps.append(
            StepDraft(
                heading=heading or text[:60],
                text=text,
                approx_timestamp=_parse_time(raw_time),
            )
        )
    return steps


def _parse_guide(data: Any) -> GuideDraft:
    """Convert the model's decoded JSON into a ``GuideDraft``.

    Args:
        data: Decoded JSON value. Anything other than a dict yields an empty
            default draft.

    Returns:
        A draft whose title falls back to "Step-by-Step Guide" when missing
        or blank. A scalar ``prerequisites`` value is wrapped in a list, and
        null or blank prerequisite entries are dropped.
    """
    if not isinstance(data, dict):
        return GuideDraft()

    prereqs = data.get("prerequisites") or []
    if not isinstance(prereqs, list):
        prereqs = [prereqs]
    # Skip nulls before stringifying so they do not become the text "None".
    cleaned = [str(p).strip() for p in prereqs if p is not None]

    # Strip first, then apply the default, so a whitespace-only title does
    # not produce an empty heading.
    title = str(data.get("title") or "").strip() or "Step-by-Step Guide"

    return GuideDraft(
        title=title,
        intro=str(data.get("intro") or "").strip(),
        prerequisites=[p for p in cleaned if p],
        steps=_parse_steps(data.get("steps")),
    )


# --- Public entry point ------------------------------------------------------
def _format_mmss(seconds: float | None) -> str | None:
    """Render seconds as "mm:ss"; return None when the time is unknown or unusable."""
    if seconds is None:
        return None
    try:
        whole = int(seconds)
    except (ValueError, OverflowError):  # NaN / infinity cannot be truncated
        return None
    if whole < 0:
        return None
    mm, ss = divmod(whole, 60)
    return f"{mm:02d}:{ss:02d}"


def build_guide_draft(
    transcript: Transcript,
    *,
    token: str | None = None,
    progress: Callable[[float, str], None] | None = None,
) -> GuideDraft:
    """Turn a transcript into a structured :class:`GuideDraft` via the LLM.

    A transcript that fits in one chunk is converted with a single call.
    Longer ones are processed map-reduce style: steps are extracted from each
    chunk, then one final call merges them into a titled guide. If that
    merge yields no steps, the per-chunk steps are used as they are.

    Args:
        transcript: The transcript to convert.
        token: The user's HuggingFace token (supplied in the UI).
        progress: Optional callback receiving ``(fraction, message)`` with
            ``fraction`` in ``[0, 1]``.

    Returns:
        The drafted guide. It is an empty default draft when the transcript
        has no content, or when no steps could be extracted from any chunk.

    Raises:
        RuntimeError: Propagated from ``_chat`` when an LLM call fails.
    """
    chunks = _chunk_transcript(transcript, config.LLM_CHUNK_CHARS)
    if not chunks:
        return GuideDraft()

    def report(fraction: float, message: str) -> None:
        if progress:
            progress(fraction, message)

    client = get_client(token)

    if len(chunks) == 1:
        report(0.1, "Writing the guide…")
        draft = _parse_guide(_extract_json(_chat(client, _full_prompt(chunks[0]))))
        report(1.0, "Guide drafted.")
        return draft

    # Map: extract steps per chunk.
    total = len(chunks)
    all_steps: list[dict] = []
    for i, chunk in enumerate(chunks):
        report(i / (total + 1), f"Structuring part {i + 1}/{total}…")
        data = _extract_json(_chat(client, _chunk_prompt(chunk)))
        # The model may answer with {"steps": [...]} or with a bare list.
        raw_steps = data.get("steps") if isinstance(data, dict) else data
        all_steps.extend(
            {
                "heading": step.heading,
                "text": step.text,
                # Unknown times stay null rather than turning into a
                # fabricated "00:00" that the reduce step would preserve.
                "approx_time": _format_mmss(step.approx_timestamp),
            }
            for step in _parse_steps(raw_steps)
        )

    if not all_steps:
        # Nothing to merge: asking the model to assemble a guide from zero
        # steps could only produce invented content.
        report(1.0, "Guide drafted.")
        return GuideDraft()

    # Reduce: merge into a titled guide.
    report(total / (total + 1), "Assembling the final guide…")
    # ensure_ascii=False keeps non-English text readable in the prompt
    # instead of expanding it into \uXXXX escapes.
    steps_json = json.dumps({"steps": all_steps}, ensure_ascii=False)
    draft = _parse_guide(_extract_json(_chat(client, _reduce_prompt(steps_json))))
    if not draft.steps:  # reduce failed — fall back to the mapped steps
        draft = _parse_guide({"steps": all_steps})
    report(1.0, "Guide drafted.")
    return draft