"""Transcript cleanup + step-structuring via the HuggingFace Inference API.
The LLM turns a rough, timestamped transcript into a structured guide draft
(title, intro, prerequisites, ordered steps). Long transcripts are processed
map-reduce style so prompts stay within the model's context window. All model
ids are config-driven and responses are parsed defensively, because free-tier
model availability and exact output formatting both vary.
"""
from __future__ import annotations
import json
import re
from dataclasses import dataclass, field
from typing import Any, Callable
from . import config
from .transcribe import Transcript
# System message sent as the first chat message of every request made by
# _chat(). It fixes the model's role (technical writer turning spoken
# transcripts into step-by-step instructions) and forbids inventing actions
# that are not present in the transcript.
_SYSTEM = (
    "You are a meticulous technical writer. You convert rough spoken transcripts "
    "from how-to/tutorial videos into clear, accurate, step-by-step instructions. "
    "You never invent actions that are not in the transcript."
)
@dataclass
class StepDraft:
    """One step of a guide draft, as built by ``_parse_steps`` from model output."""

    # Short title of the step; _parse_steps falls back to the first 60
    # characters of ``text`` when the model supplied no heading.
    heading: str
    # The instruction text for the step (may be empty if only a heading was given).
    text: str
    # Position in the video in seconds as returned by _parse_time, or None
    # when no usable time was supplied.
    approx_timestamp: float | None = None
@dataclass
class GuideDraft:
    """Structured guide draft: title, intro, prerequisites and ordered steps.

    Every field has a default, so ``GuideDraft()`` is a valid empty draft; it
    is what ``_parse_guide`` and ``build_guide_draft`` return when there is
    nothing usable to parse.
    """

    # Guide title; this generic default is also used by _parse_guide when the
    # model returned no title.
    title: str = "Step-by-Step Guide"
    # Short introduction text (empty when none was produced).
    intro: str = ""
    # Prerequisite items; default_factory gives each instance its own list.
    prerequisites: list[str] = field(default_factory=list)
    # Ordered steps of the guide; default_factory gives each instance its own list.
    steps: list[StepDraft] = field(default_factory=list)
# --- Inference client --------------------------------------------------------
def get_client(token: str | None):
    """Return a fresh ``InferenceClient`` bound to ``config.LLM_MODEL``.

    ``token`` (the user-supplied HuggingFace token) and ``config.LLM_PROVIDER``
    are forwarded only when they are truthy; otherwise the client's own
    defaults apply.

    No client is cached: construction is cheap (no network traffic until a
    call is made), and building one per request lets the token change at
    runtime, since it comes from the UI field.
    """
    # Imported lazily so the dependency is only needed when a client is built.
    from huggingface_hub import InferenceClient

    optional = {"token": token, "provider": config.LLM_PROVIDER}
    supplied = {name: value for name, value in optional.items() if value}
    return InferenceClient(model=config.LLM_MODEL, **supplied)
def _chat(client, user_prompt: str, *, max_tokens: int | None = None) -> str:
    """Run one system+user chat exchange and return the reply text.

    Args:
        client: Object exposing ``chat_completion`` (see ``get_client``).
        user_prompt: Content of the user message; ``_SYSTEM`` is always sent
            as the system message.
        max_tokens: Completion budget; a falsy value selects
            ``config.LLM_MAX_TOKENS``.

    Returns:
        The content of the first choice, or ``""`` when that content is empty
        or ``None``.

    Raises:
        RuntimeError: If building or sending the request raises; the original
            exception is chained as the cause.
    """
    try:
        conversation = [
            {"role": "system", "content": _SYSTEM},
            {"role": "user", "content": user_prompt},
        ]
        token_limit = max_tokens or config.LLM_MAX_TOKENS
        completion = client.chat_completion(
            messages=conversation,
            max_tokens=token_limit,
            temperature=config.LLM_TEMPERATURE,
        )
    except Exception as exc:
        # Re-raise with a hint about the usual cause: the configured model
        # not being available to this user.
        raise RuntimeError(
            f"HuggingFace LLM call failed for model '{config.LLM_MODEL}'. "
            f"Check the model is available on your plan or set DOCUMAKER_LLM_MODEL "
            f"to another instruct model.\nDetails: {exc}"
        ) from exc

    reply_text = completion.choices[0].message.content
    if not reply_text:
        return ""
    return reply_text
# --- JSON / time parsing -----------------------------------------------------
def _extract_json(text: str) -> Any:
"""Best-effort extraction of a JSON object/array from an LLM response."""
text = text.strip()
try:
return json.loads(text)
except Exception:
pass
fenced = re.search(r"```(?:json)?\s*(.*?)```", text, re.S)
if fenced:
try:
return json.loads(fenced.group(1))
except Exception:
pass
for open_ch, close_ch in (("{", "}"), ("[", "]")):
i, j = text.find(open_ch), text.rfind(close_ch)
if i != -1 and j > i:
try:
return json.loads(text[i : j + 1])
except Exception:
continue
return None
def _parse_time(value: Any) -> float | None:
"""Parse 'ss', 'mm:ss', or 'hh:mm:ss' (or a number) into seconds."""
if value is None:
return None
if isinstance(value, (int, float)):
return float(value)
s = str(value).strip()
if not s:
return None
try:
parts = [float(p) for p in s.split(":")]
except ValueError:
return None
seconds = 0.0
for part in parts:
seconds = seconds * 60 + part
return seconds
# --- Prompt builders ---------------------------------------------------------
# Example of the JSON shape requested for a complete guide; embedded verbatim
# in the prompts built by _full_prompt and _reduce_prompt. The keys are the
# ones _parse_guide / _parse_steps read back from the response.
_JSON_FULL = (
    '{"title": "...", "intro": "...", "prerequisites": ["..."], '
    '"steps": [{"heading": "short title", "text": "what to do", "approx_time": "mm:ss"}]}'
)
# Steps-only variant of the shape above, embedded in the per-chunk prompt
# built by _chunk_prompt.
_JSON_STEPS = '{"steps": [{"heading": "short title", "text": "what to do", "approx_time": "mm:ss"}]}'
def _full_prompt(timestamped_text: str) -> str:
    """Build the single-pass prompt that asks for a complete guide.

    The prompt lists the rewriting rules, then the required JSON shape
    (``_JSON_FULL``), then the timestamped transcript itself.
    """
    prompt_lines = [
        "Convert this timestamped tutorial transcript into a clean step-by-step guide.",
        "- Fix obvious speech-to-text errors; remove filler words and repetition.",
        "- Write each step as a clear, imperative instruction.",
        (
            "- Add a short descriptive title, a 1-2 sentence introduction, and any "
            "prerequisites that are implied (empty list if none)."
        ),
        '- For each step set "approx_time" as "mm:ss" from the nearest [mm:ss] marker.',
        f"Respond with ONLY JSON in this exact shape:\n{_JSON_FULL}",
        "",  # blank line separating the instructions from the transcript
        f"Transcript:\n{timestamped_text}",
    ]
    return "\n".join(prompt_lines)
def _chunk_prompt(timestamped_text: str) -> str:
    """Build the per-chunk ("map") prompt that asks only for ordered steps.

    The prompt lists the extraction rules, then the required JSON shape
    (``_JSON_STEPS``), then the timestamped transcript excerpt.
    """
    prompt_lines = [
        (
            "From this timestamped transcript excerpt of a tutorial video, extract the "
            "concrete actions as an ordered list of steps."
        ),
        "- Fix speech-to-text errors; remove filler and repetition.",
        "- Write each step as a clear, imperative instruction.",
        '- Set "approx_time" as "mm:ss" from the nearest [mm:ss] marker.',
        f"Respond with ONLY JSON in this exact shape:\n{_JSON_STEPS}",
        "",  # blank line separating the instructions from the transcript
        f"Transcript:\n{timestamped_text}",
    ]
    return "\n".join(prompt_lines)
def _reduce_prompt(steps_json: str) -> str:
    """Build the "reduce" prompt that merges per-chunk steps into one guide.

    The prompt lists the merging rules, then the required JSON shape
    (``_JSON_FULL``), then the JSON-encoded steps gathered from all chunks.
    """
    prompt_lines = [
        (
            "You are assembling the final step-by-step guide from steps extracted across "
            "several transcript chunks."
        ),
        "- Merge near-duplicates and keep a logical order.",
        "- Keep every distinct action; do not invent new steps.",
        (
            "- Add a short descriptive title, a 1-2 sentence introduction, and prerequisites "
            'if implied (empty list if none). Preserve each step\'s "approx_time".'
        ),
        f"Respond with ONLY JSON in this exact shape:\n{_JSON_FULL}",
        "",  # blank line separating the instructions from the step data
        f"Extracted steps:\n{steps_json}",
    ]
    return "\n".join(prompt_lines)
# --- Chunking ----------------------------------------------------------------
def _timestamped_lines(transcript: Transcript) -> list[str]:
lines = []
for seg in transcript.segments:
mm, ss = divmod(int(seg.start), 60)
lines.append(f"[{mm:02d}:{ss:02d}] {seg.text.strip()}")
return lines
def _chunk_transcript(transcript: Transcript, max_chars: int) -> list[str]:
    """Split a transcript into newline-joined chunks of at most ``max_chars``.

    Timestamped lines are packed greedily and never split, so a single line
    longer than ``max_chars`` becomes its own oversized chunk. When the
    transcript has no segments, the raw ``transcript.text`` is returned as
    one chunk (or no chunks at all if it is empty).
    """
    stamped = _timestamped_lines(transcript)
    if not stamped:
        if transcript.text:
            return [transcript.text]
        return []

    chunks: list[str] = []
    pending: list[str] = []
    used = 0  # size of "\n".join(pending) plus one trailing separator
    for entry in stamped:
        if pending and used + len(entry) > max_chars:
            # Adding this line would overflow: close the chunk and start a
            # new one with the current line.
            chunks.append("\n".join(pending))
            pending = [entry]
            used = len(entry) + 1
        else:
            pending.append(entry)
            used += len(entry) + 1
    if pending:
        chunks.append("\n".join(pending))
    return chunks
# --- Result parsing ----------------------------------------------------------
def _parse_steps(raw_steps: Any) -> list[StepDraft]:
    """Convert a decoded JSON list of step objects into ``StepDraft`` items.

    Anything that is not a list yields an empty result, and list entries that
    are not dicts are skipped. For each dict:

    * the heading is read from ``"heading"``, falling back to ``"title"``;
    * the text is read from ``"text"``, falling back to ``"instruction"``;
    * entries with neither heading nor text are dropped;
    * a missing heading is replaced by the first 60 characters of the text;
    * the time is read from ``"approx_time"`` (or ``"approx_timestamp"`` when
      that key is absent) and converted with ``_parse_time``.
    """
    if not isinstance(raw_steps, list):
        return []

    parsed: list[StepDraft] = []
    for entry in raw_steps:
        if not isinstance(entry, dict):
            continue
        heading = str(entry.get("heading") or entry.get("title") or "").strip()
        body = str(entry.get("text") or entry.get("instruction") or "").strip()
        if not heading and not body:
            continue
        raw_time = entry.get("approx_time", entry.get("approx_timestamp"))
        seconds = _parse_time(raw_time)
        shown_heading = heading if heading else body[:60]
        parsed.append(
            StepDraft(heading=shown_heading, text=body, approx_timestamp=seconds)
        )
    return parsed
def _parse_guide(data: Any) -> GuideDraft:
    """Convert a decoded JSON object into a ``GuideDraft``.

    Non-dict input yields a default ``GuideDraft``. A missing or empty title
    falls back to ``"Step-by-Step Guide"``; a non-list ``"prerequisites"``
    value is wrapped into a one-item list; blank prerequisites are dropped;
    steps are parsed by ``_parse_steps``.
    """
    if not isinstance(data, dict):
        return GuideDraft()

    raw_prereqs = data.get("prerequisites") or []
    if isinstance(raw_prereqs, list):
        candidates = raw_prereqs
    else:
        candidates = [str(raw_prereqs)]

    cleaned: list[str] = []
    for candidate in candidates:
        entry = str(candidate).strip()
        if entry:
            cleaned.append(entry)

    title = str(data.get("title") or "Step-by-Step Guide").strip()
    intro = str(data.get("intro") or "").strip()
    steps = _parse_steps(data.get("steps"))
    return GuideDraft(title=title, intro=intro, prerequisites=cleaned, steps=steps)
# --- Public entry point ------------------------------------------------------
def build_guide_draft(
    transcript: Transcript,
    *,
    token: str | None = None,
    progress: Callable[[float, str], None] | None = None,
) -> GuideDraft:
    """Turn a transcript into a structured :class:`GuideDraft` via the LLM.

    A transcript that fits in one chunk is converted with a single request.
    Longer transcripts are processed map-reduce style: steps are extracted per
    chunk, then one final request merges them into a titled guide.

    Args:
        transcript: The transcript to convert.
        token: The user's HuggingFace token (supplied in the UI).
        progress: Optional callback receiving ``(fraction, message)`` updates,
            with ``fraction`` going from 0 to 1.

    Returns:
        The parsed guide draft. An empty ``GuideDraft()`` is returned when the
        transcript has no content, or when no steps could be extracted from
        any chunk (the merge request is skipped then, so the model is never
        asked to write a guide from nothing). If the merge response contains
        no usable steps, the per-chunk steps are used instead, keeping
        whatever title/intro/prerequisites the merge response did provide.

    Raises:
        RuntimeError: Propagated from ``_chat`` when an LLM request fails.
    """

    def report(fraction: float, message: str) -> None:
        """Forward a progress update if a callback was supplied."""
        if progress:
            progress(fraction, message)

    def as_mmss(seconds: float | None) -> str | None:
        """Format seconds as ``mm:ss``; ``None`` when unknown or unusable."""
        if seconds is None:
            return None
        try:
            if seconds < 0:
                return None
            minutes, secs = divmod(int(seconds), 60)
        except (TypeError, ValueError, OverflowError):
            # int() raises on NaN / infinity; treat those as "no timestamp".
            return None
        return f"{minutes:02d}:{secs:02d}"

    chunks = _chunk_transcript(transcript, config.LLM_CHUNK_CHARS)
    if not chunks:
        return GuideDraft()

    client = get_client(token)

    if len(chunks) == 1:
        report(0.1, "Writing the guide…")
        draft = _parse_guide(_extract_json(_chat(client, _full_prompt(chunks[0]))))
        report(1.0, "Guide drafted.")
        return draft

    # Map: extract steps per chunk.
    total = len(chunks)
    all_steps: list[dict] = []
    for i, chunk in enumerate(chunks):
        report(i / (total + 1), f"Structuring part {i + 1}/{total}…")
        data = _extract_json(_chat(client, _chunk_prompt(chunk)))
        # The model may answer with {"steps": [...]} or with a bare list.
        raw_steps = data.get("steps") if isinstance(data, dict) else data
        for step in _parse_steps(raw_steps):
            all_steps.append(
                {
                    "heading": step.heading,
                    "text": step.text,
                    # Unknown times stay null so they round-trip as None
                    # instead of being turned into a bogus 00:00.
                    "approx_time": as_mmss(step.approx_timestamp),
                }
            )

    if not all_steps:
        # Nothing was extracted from any chunk; there is nothing to merge.
        report(1.0, "Guide drafted.")
        return GuideDraft()

    # Reduce: merge into a titled guide.
    report(total / (total + 1), "Assembling the final guide…")
    # ensure_ascii=False keeps non-English text readable in the prompt rather
    # than expanding every character into a \uXXXX escape.
    steps_json = json.dumps({"steps": all_steps}, ensure_ascii=False)
    draft = _parse_guide(_extract_json(_chat(client, _reduce_prompt(steps_json))))
    if not draft.steps:  # reduce failed — fall back to the mapped steps
        draft.steps = _parse_steps(all_steps)
    report(1.0, "Guide drafted.")
    return draft
|