fdaudens committed on
Commit
1c09a33
·
verified ·
1 Parent(s): ab40e34

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +376 -0
app.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ # Hugging Face Spaces Gradio app: upload video -> transcribe (Whisper large-v3-turbo) -> script (Qwen3 via HF API)
3
+
4
+ import os
5
+ import re
6
+ import json
7
+ import hashlib
8
+ import tempfile
9
+ import subprocess
10
+ from dataclasses import dataclass
11
+ from typing import Optional, Tuple, Dict
12
+
13
+ import gradio as gr
14
+ from huggingface_hub import InferenceClient
15
+
16
# -----------------------------
# Config
# -----------------------------
# Hugging Face API token, read from the environment.
HF_TOKEN = os.getenv("HF_TOKEN")  # put this in Space Secrets
# Speech-recognition model id; override with the ASR_MODEL_ID environment variable.
ASR_MODEL_ID = os.getenv("ASR_MODEL_ID", "openai/whisper-large-v3-turbo")  # verified on HF

# Note: HF has Qwen3 models like 0.6B / 1.7B / 4B etc. (not always a literal "1B").
# Closest cheap starter defaults to 0.6B, override with env var if you want 1.7B.
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Qwen/Qwen3-0.6B")

# Longest video accepted for transcription, in seconds.
MAX_VIDEO_SECONDS = 10 * 60  # 10 minutes
# Directory holding cached files; override with the CACHE_DIR environment variable.
CACHE_DIR = os.getenv("CACHE_DIR", "/tmp/hf_gradio_cache")

# Created at import time so later cache reads/writes can assume the directory exists.
os.makedirs(CACHE_DIR, exist_ok=True)
30
+
31
+
32
# -----------------------------
# Hardcoded examples in system prompt
# Put your real examples here.
# Keep them short: Qwen small models benefit from tight few-shot.
# -----------------------------
# Instruction text placed ahead of the per-request constraints when a script is
# generated. It carries the rules, two few-shot style examples, and the output
# layout (Title / Hook / Body / Closing, one per line).
# NOTE(review): the literal below was recovered from a flattened source; it is
# assumed every line of the string starts at column 0 - confirm.
SYSTEM_PROMPT = """You are a scriptwriter. You transform a video transcript into a polished script.

Rules:
- Use ONLY facts present in the transcript. Do not invent names, dates, numbers, places.
- If something is unclear in the transcript, stay neutral or mark it as [unclear].
- Match the style from the examples.
- Keep the script within the requested duration.

STYLE EXAMPLES (hardcoded):

Example 1
TRANSCRIPT:
"we launched a new feature today. it helps users summarize long articles faster."
SCRIPT:
"Big update today: a new feature that turns long reads into quick, clear summaries.
Here’s the idea: you drop in an article, and you get the key points in seconds.
If you’ve been drowning in tabs, this one’s for you."

Example 2
TRANSCRIPT:
"the storm caused delays across the region. officials said repairs will take two days."
SCRIPT:
"Here’s what’s happening: a storm has disrupted travel across the region.
Officials say repairs could take around two days, so delays may continue.
If you’re heading out, check updates before you go."

Output format:
Title:
Hook:
Body:
Closing:
"""
69
+
70
+
71
+ # -----------------------------
72
+ # Helpers
73
+ # -----------------------------
74
+ def _run(cmd: list) -> Tuple[int, str, str]:
75
+ p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
76
+ return p.returncode, p.stdout, p.stderr
77
+
78
+
79
def sha256_file(path: str) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in 1 MiB blocks so large uploads are never held in
    memory all at once.
    """
    block_size = 1024 * 1024
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
85
+
86
+
87
def get_video_duration_seconds(video_path: str) -> float:
    """Return the duration of *video_path* in seconds, as reported by ffprobe.

    ffprobe is asked for the container-level ``format=duration`` entry as JSON.

    Raises:
        RuntimeError: if ffprobe exits with a non-zero status, or if its
            output is not JSON containing a numeric ``format.duration``
            (for example when the duration is missing or reported as "N/A").
    """
    cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "v:0",
        "-show_entries", "format=duration",
        "-of", "json",
        video_path,
    ]
    code, out, err = _run(cmd)
    if code != 0:
        raise RuntimeError(f"ffprobe failed: {err.strip() or out.strip()}")
    try:
        # json.JSONDecodeError is a ValueError, as is float("N/A").
        return float(json.loads(out)["format"]["duration"])
    except (ValueError, KeyError, TypeError) as exc:
        raise RuntimeError(
            f"ffprobe did not report a usable duration for {video_path!r}."
        ) from exc
102
+
103
+
104
def extract_audio_wav_16k_mono(video_path: str, wav_path: str) -> None:
    """Extract the audio of *video_path* into *wav_path* as 16 kHz mono WAV.

    ffmpeg is run with ``-y``, so an existing *wav_path* is overwritten.

    Raises:
        RuntimeError: if ffmpeg exits with a non-zero status.
    """
    # Drop video, downmix to one channel, resample to 16 kHz, force WAV output.
    audio_opts = ["-vn", "-ac", "1", "-ar", "16000", "-f", "wav"]
    ffmpeg_cmd = ["ffmpeg", "-y", "-i", video_path, *audio_opts, wav_path]
    exit_status, stdout, stderr = _run(ffmpeg_cmd)
    if exit_status == 0:
        return
    detail = stderr.strip() or stdout.strip()
    raise RuntimeError(f"ffmpeg audio extraction failed: {detail}")
118
+
119
+
120
def seconds_from_label(label: str) -> int:
    """Translate a script-length label such as ``"45s"`` or ``"2m"`` into seconds.

    Labels that are not recognised fall back to 60 seconds.
    """
    known_lengths = {"30s": 30, "45s": 45, "60s": 60, "90s": 90, "2m": 120}
    try:
        return known_lengths[label]
    except KeyError:
        return 60
129
+
130
+
131
def estimate_words_for_seconds(seconds: int) -> int:
    """Estimate how many words fit in *seconds* of voice-over.

    Uses a rough pace of 150 words per minute (2.5 words per second) and
    never returns fewer than 40 words.
    """
    words_per_second = 2.5
    floor_words = 40
    estimate = int(seconds * words_per_second)
    return estimate if estimate > floor_words else floor_words
134
+
135
+
136
def clean_text(s: Optional[str]) -> str:
    """Collapse each run of whitespace in *s* to a single space and trim both ends.

    ``None`` is treated as empty text and yields ``""``, so callers can apply
    their own "is it empty?" check instead of hitting a TypeError from the
    regex engine.
    """
    if s is None:
        return ""
    return re.sub(r"\s+", " ", s).strip()
139
+
140
+
141
@dataclass
class HFClients:
    """Pair of Hugging Face inference clients, one per model used by the app."""

    # Client for the speech-recognition model.
    asr: InferenceClient
    # Client for the text-generation model.
    llm: InferenceClient
145
+
146
+
147
def make_clients() -> HFClients:
    """Create the ASR and LLM inference clients from the module configuration.

    Raises:
        RuntimeError: if ``HF_TOKEN`` is unset or empty.
    """
    if HF_TOKEN:
        asr_client = InferenceClient(model=ASR_MODEL_ID, token=HF_TOKEN)
        llm_client = InferenceClient(model=LLM_MODEL_ID, token=HF_TOKEN)
        return HFClients(asr=asr_client, llm=llm_client)
    raise RuntimeError("Missing HF_TOKEN. Add it in your Space Secrets.")
154
+
155
+
156
def cache_paths(file_hash: str) -> Dict[str, str]:
    """Return the cache file path for each artifact kind of *file_hash*.

    The result has the keys ``"transcript"`` and ``"script"``; each path is
    ``<CACHE_DIR>/<file_hash>.<kind>.txt``.
    """
    return {
        kind: os.path.join(CACHE_DIR, f"{file_hash}.{kind}.txt")
        for kind in ("transcript", "script")
    }
161
+
162
+
163
def transcribe_video(video_path: str, language: str) -> str:
    """Transcribe the speech in *video_path* and return whitespace-normalized text.

    Steps: check the duration limit, look up a cached transcript keyed by the
    file's SHA-256, otherwise extract 16 kHz mono audio, send it to the ASR
    client, and cache the cleaned text.

    Args:
        video_path: Path to the uploaded video file.
        language: ``"Auto"`` for auto-detection, otherwise a language code
            (e.g. ``"en"``, ``"fr"``) offered to the ASR client as a hint.

    Raises:
        RuntimeError: if the token is missing, the video exceeds
            ``MAX_VIDEO_SECONDS``, ffprobe/ffmpeg fail, or the transcription
            comes back empty.
    """
    clients = make_clients()

    dur = get_video_duration_seconds(video_path)
    if dur > MAX_VIDEO_SECONDS:
        raise RuntimeError(f"Video is {int(dur)}s. Max allowed is {MAX_VIDEO_SECONDS}s (10 minutes).")

    file_hash = sha256_file(video_path)
    paths = cache_paths(file_hash)

    # NOTE: the cache key is the file hash only, so a cached transcript is
    # returned regardless of the language requested this time.
    if os.path.exists(paths["transcript"]):
        with open(paths["transcript"], "r", encoding="utf-8") as f:
            return f.read()

    params = {}
    if language != "Auto":
        params["language"] = language  # e.g. "en", "fr"

    with tempfile.TemporaryDirectory() as td:
        wav_path = os.path.join(td, "audio.wav")
        extract_audio_wav_16k_mono(video_path, wav_path)

        try:
            result = clients.asr.automatic_speech_recognition(wav_path, **params)
        except TypeError:
            # NOTE(review): the documented client signature appears to take no
            # `language` keyword, so a language choice raises TypeError before
            # any request is sent - confirm against the installed
            # huggingface_hub version. Fall back to auto-detection rather than
            # failing the whole transcription.
            if not params:
                raise
            result = clients.asr.automatic_speech_recognition(wav_path)

    # Dict-like results carry the transcript under "text"; anything else is
    # stringified. `or ""` guards against an explicit null text field.
    raw_text = (result.get("text") or "") if isinstance(result, dict) else str(result)
    text = clean_text(raw_text)

    if not text:
        raise RuntimeError("Transcription returned empty text.")

    with open(paths["transcript"], "w", encoding="utf-8") as f:
        f.write(text)

    return text
199
+
200
+
201
def make_user_prompt(
    transcript: str,
    language: str,
    duration_label: str,
    tone: str,
    fmt: str,
) -> str:
    """Build the per-request part of the prompt: constraints followed by the source text.

    The duration label is converted to seconds and to an approximate word
    budget; ``"Auto"`` as language asks the model to match the transcript.
    The source text is wrapped in triple double quotes.
    """
    seconds = seconds_from_label(duration_label)
    target_words = estimate_words_for_seconds(seconds)

    if language == "Auto":
        language_rule = "Match transcript language"
    else:
        language_rule = language

    prompt_lines = [
        "Constraints:",
        f"- Language: {language_rule}",
        f"- Target duration: ~{seconds} seconds",
        f"- Target length: ~{target_words} words (keep it tight)",
        f"- Tone: {tone}",
        f"- Format: {fmt}",
        "",
        "Transcript:",
        f'"""{transcript}"""',
        "",  # the prompt ends with a newline
    ]
    return "\n".join(prompt_lines)
221
+
222
+
223
def notes_first_pass(clients: HFClients, transcript: str, language: str) -> str:
    """Condense *transcript* into short bullet notes with the LLM client.

    Used as a cheap compression step before script generation. The model's
    reply is whitespace-normalized before being returned.
    """
    language_rule = "Match transcript" if language == "Auto" else language
    instructions = (
        "You are an editor. Convert this transcript into concise bullet notes.\n"
        "Rules:\n"
        "- Keep only key facts mentioned.\n"
        "- No inventions.\n"
        "- 8 to 14 bullets max.\n"
        f"- Language: {language_rule}\n"
    )
    request = f'{instructions}\nTranscript:\n"""{transcript}"""\n\nBullets:'

    # Low temperature: this step should summarise, not embellish.
    generation_options = {
        "max_new_tokens": 300,
        "temperature": 0.2,
        "return_full_text": False,
    }
    raw_notes = clients.llm.text_generation(request, **generation_options)
    return clean_text(raw_notes)
244
+
245
+
246
def generate_script(
    transcript: str,
    language: str,
    duration_label: str,
    tone: str,
    fmt: str,
    force_notes_first: bool,
) -> str:
    """Generate a script from *transcript* with the LLM client.

    The transcript is whitespace-normalized first. When it is longer than
    4500 characters, or *force_notes_first* is set, it is first condensed
    into bullet notes and the notes are used as the source text instead.

    The model output keeps its line breaks (so the Title / Hook / Body /
    Closing layout requested by the system prompt survives); whitespace is
    normalized within each line and runs of blank lines are squeezed to one.

    Raises:
        RuntimeError: if the token is missing, the transcript is empty, or
            the model returns no text.
    """
    clients = make_clients()

    transcript = clean_text(transcript)
    if not transcript:
        raise RuntimeError("Transcript is empty. Transcribe first or paste a transcript.")

    # Notes-first threshold: tweak as you like
    too_long = len(transcript) > 4500
    use_notes = force_notes_first or too_long

    source_text = transcript
    if use_notes:
        notes = notes_first_pass(clients, transcript, language)
        source_text = f"NOTES:\n{notes}"

    user_prompt = make_user_prompt(source_text, language, duration_label, tone, fmt)

    # Keep generation settings conservative for small models
    full_prompt = f"{SYSTEM_PROMPT}\n\n{user_prompt}"

    out = clients.llm.text_generation(
        full_prompt,
        max_new_tokens=700,
        temperature=0.4,
        top_p=0.9,
        return_full_text=False,
    )

    # Clean each line on its own: collapsing the whole reply with clean_text
    # would also remove the newlines that separate the output sections.
    cleaned_lines = [clean_text(line) for line in (out or "").splitlines()]
    script = re.sub(r"\n{3,}", "\n\n", "\n".join(cleaned_lines)).strip()

    if not script:
        raise RuntimeError("Script generation returned empty text.")

    return script
287
+
288
+
289
+ # -----------------------------
290
+ # Gradio callbacks
291
+ # -----------------------------
292
def ui_transcribe(video_file, language, status):
    """Gradio callback for the "Transcribe" button.

    Args:
        video_file: Path of the uploaded video, or ``None`` when nothing is uploaded.
        language: Selected language option (``"Auto"`` or a language code).
        status: Current text of the status box. It is accepted because the
            click handler passes it as an input; its value is not used.

    Returns:
        ``(transcript, status_message)``. On a missing upload or any failure
        the transcript slot is ``gr.update()`` (leave the box unchanged) and
        the message explains what went wrong.
    """
    if video_file is None:
        return gr.update(), "Please upload a video first."
    try:
        transcript = transcribe_video(video_file, language)
    except Exception as e:  # UI boundary: report the failure in the status box
        return gr.update(), f"Transcription error: {e}"
    return transcript, "Done: transcript ready."
301
+
302
+
303
def ui_generate(video_file, transcript, language, duration_label, tone, fmt, force_notes_first):
    """Gradio callback for the "Generate script" button.

    When the transcript box is blank but a video is uploaded, the video is
    transcribed first. Returns ``(transcript, script, status_message)``; on
    failure the script slot is ``gr.update()`` and the message carries the error.
    """
    try:
        has_transcript = bool(transcript and transcript.strip())
        if not has_transcript and video_file is not None:
            # Nothing to work from yet: auto-transcribe the uploaded video.
            transcript = transcribe_video(video_file, language)

        generated = generate_script(
            transcript=transcript,
            language=language,
            duration_label=duration_label,
            tone=tone,
            fmt=fmt,
            force_notes_first=force_notes_first,
        )
    except Exception as err:
        return transcript, gr.update(), f"Script error: {err}"
    return transcript, generated, "Done: script generated."
320
+
321
+
322
+ # -----------------------------
323
+ # UI
324
+ # -----------------------------
325
# Page layout and event wiring.
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation was lost - confirm the layout (controls column on the left,
# text boxes on the right) matches the intended UI.
with gr.Blocks(title="Video → Transcript → Script") as demo:
    gr.Markdown("## Video → Transcript → Script\nUpload a video (max 10 min), transcribe with Whisper Turbo, then generate a script with Qwen3 via HF API.")

    with gr.Row():
        # Left column: upload, generation options, action buttons, status line.
        with gr.Column(scale=1):
            video = gr.Video(label="Upload video", format="mp4")
            language = gr.Dropdown(
                label="Language",
                choices=["Auto", "en", "fr"],
                value="Auto",
            )
            duration_label = gr.Dropdown(
                label="Script length",
                choices=["30s", "45s", "60s", "90s", "2m"],
                value="60s",
            )
            tone = gr.Dropdown(
                label="Tone",
                choices=["neutral", "punchy", "calm", "playful"],
                value="neutral",
            )
            fmt = gr.Dropdown(
                label="Format",
                choices=["voiceover", "anchor", "social short"],
                value="voiceover",
            )
            force_notes_first = gr.Checkbox(label="Notes-first (better for long transcripts)", value=False)

            with gr.Row():
                btn_transcribe = gr.Button("Transcribe")
                btn_generate = gr.Button("Generate script")

            status = gr.Textbox(label="Status", value="Ready.", interactive=False)

        # Right column: transcript and generated script text boxes.
        with gr.Column(scale=2):
            transcript = gr.Textbox(label="Transcript (editable)", lines=10)
            script = gr.Textbox(label="Script (editable)", lines=14)

    # "Transcribe" fills the transcript box and updates the status line.
    btn_transcribe.click(
        fn=ui_transcribe,
        inputs=[video, language, status],
        outputs=[transcript, status],
    )

    # "Generate script" also writes the transcript back, since ui_generate may
    # have transcribed the video itself when the box was blank.
    btn_generate.click(
        fn=ui_generate,
        inputs=[video, transcript, language, duration_label, tone, fmt, force_notes_first],
        outputs=[transcript, script, status],
    )

# Start the app only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()