fdaudens committed on
Commit
05529eb
·
verified ·
1 Parent(s): e5d8f0d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -82
app.py CHANGED
@@ -1,5 +1,9 @@
1
  # app.py
2
- # Hugging Face Spaces Gradio app: upload video -> transcribe (Whisper large-v3-turbo) -> script (Qwen3 via HF API)
 
 
 
 
3
 
4
  import os
5
  import re
@@ -7,8 +11,9 @@ import json
7
  import hashlib
8
  import tempfile
9
  import subprocess
 
10
  from dataclasses import dataclass
11
- from typing import Optional, Tuple, Dict
12
 
13
  import gradio as gr
14
  from huggingface_hub import InferenceClient
@@ -16,23 +21,18 @@ from huggingface_hub import InferenceClient
16
  # -----------------------------
17
  # Config
18
  # -----------------------------
19
- HF_TOKEN = os.getenv("HF_TOKEN") # put this in Space Secrets
20
- ASR_MODEL_ID = os.getenv("ASR_MODEL_ID", "openai/whisper-large-v3-turbo") # verified on HF :contentReference[oaicite:0]{index=0}
21
 
22
- # Note: HF has Qwen3 models like 0.6B / 1.7B / 4B etc. (not always a literal "1B"). :contentReference[oaicite:1]{index=1}
23
- # Closest cheap starter defaults to 0.6B, override with env var if you want 1.7B.
24
- LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Qwen/Qwen3-0.6B")
25
 
26
  MAX_VIDEO_SECONDS = 10 * 60 # 10 minutes
27
  CACHE_DIR = os.getenv("CACHE_DIR", "/tmp/hf_gradio_cache")
28
-
29
  os.makedirs(CACHE_DIR, exist_ok=True)
30
 
31
-
32
  # -----------------------------
33
- # Hardcoded examples in system prompt
34
- # Put your real examples here.
35
- # Keep them short: Qwen small models benefit from tight few-shot.
36
  # -----------------------------
37
  SYSTEM_PROMPT = """You are a scriptwriter. You transform a video transcript into a polished script.
38
 
@@ -41,6 +41,7 @@ Rules:
41
  - If something is unclear in the transcript, stay neutral or mark it as [unclear].
42
  - Match the style from the examples.
43
  - Keep the script within the requested duration.
 
44
 
45
  STYLE EXAMPLES (hardcoded):
46
 
@@ -48,26 +49,27 @@ Example 1
48
  TRANSCRIPT:
49
  "we launched a new feature today. it helps users summarize long articles faster."
50
  SCRIPT:
51
- "Big update today: a new feature that turns long reads into quick, clear summaries.
52
- Here’s the idea: you drop in an article, and you get the key points in seconds.
53
- If you’ve been drowning in tabs, this one’s for you."
 
54
 
55
  Example 2
56
  TRANSCRIPT:
57
  "the storm caused delays across the region. officials said repairs will take two days."
58
  SCRIPT:
59
- "Here’s what’s happening: a storm has disrupted travel across the region.
60
- Officials say repairs could take around two days, so delays may continue.
61
- If you’re heading out, check updates before you go."
 
62
 
63
- Output format:
64
  Title:
65
  Hook:
66
  Body:
67
  Closing:
68
  """
69
 
70
-
71
  # -----------------------------
72
  # Helpers
73
  # -----------------------------
@@ -85,10 +87,8 @@ def sha256_file(path: str) -> str:
85
 
86
 
87
  def get_video_duration_seconds(video_path: str) -> float:
88
- # ffprobe returns duration in seconds (float). Works on Spaces typically.
89
  cmd = [
90
  "ffprobe", "-v", "error",
91
- "-select_streams", "v:0",
92
  "-show_entries", "format=duration",
93
  "-of", "json",
94
  video_path,
@@ -97,12 +97,10 @@ def get_video_duration_seconds(video_path: str) -> float:
97
  if code != 0:
98
  raise RuntimeError(f"ffprobe failed: {err.strip() or out.strip()}")
99
  data = json.loads(out)
100
- dur = float(data["format"]["duration"])
101
- return dur
102
 
103
 
104
  def extract_audio_wav_16k_mono(video_path: str, wav_path: str) -> None:
105
- # Standardize audio for ASR
106
  cmd = [
107
  "ffmpeg", "-y",
108
  "-i", video_path,
@@ -114,34 +112,32 @@ def extract_audio_wav_16k_mono(video_path: str, wav_path: str) -> None:
114
  ]
115
  code, out, err = _run(cmd)
116
  if code != 0:
117
- raise RuntimeError(f"ffmpeg audio extraction failed: {err.strip() or out.strip()}")
 
 
 
 
 
118
 
119
 
120
  def seconds_from_label(label: str) -> int:
121
- mapping = {
122
- "30s": 30,
123
- "45s": 45,
124
- "60s": 60,
125
- "90s": 90,
126
- "2m": 120,
127
- }
128
  return mapping.get(label, 60)
129
 
130
 
131
  def estimate_words_for_seconds(seconds: int) -> int:
132
- # Rough VO pacing: ~150 wpm => 2.5 words/sec
133
  return max(40, int(seconds * 2.5))
134
 
135
 
136
- def clean_text(s: str) -> str:
137
- s = re.sub(r"\s+", " ", s).strip()
138
- return s
139
 
140
 
141
  @dataclass
142
  class HFClients:
143
  asr: InferenceClient
144
- llm: InferenceClient
145
 
146
 
147
  def make_clients() -> HFClients:
@@ -149,17 +145,30 @@ def make_clients() -> HFClients:
149
  raise RuntimeError("Missing HF_TOKEN. Add it in your Space Secrets.")
150
  return HFClients(
151
  asr=InferenceClient(model=ASR_MODEL_ID, token=HF_TOKEN),
152
- llm=InferenceClient(model=LLM_MODEL_ID, token=HF_TOKEN),
153
  )
154
 
155
 
156
  def cache_paths(file_hash: str) -> Dict[str, str]:
157
  return {
158
  "transcript": os.path.join(CACHE_DIR, f"{file_hash}.transcript.txt"),
159
- "script": os.path.join(CACHE_DIR, f"{file_hash}.script.txt"),
160
  }
161
 
162
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  def transcribe_video(video_path: str, language: str) -> str:
164
  clients = make_clients()
165
 
@@ -178,14 +187,16 @@ def transcribe_video(video_path: str, language: str) -> str:
178
  wav_path = os.path.join(td, "audio.wav")
179
  extract_audio_wav_16k_mono(video_path, wav_path)
180
 
181
- # HF Inference API ASR: automatic_speech_recognition
182
- # language handling: HF API params vary; safest is to pass None for auto.
183
- # Some endpoints accept "language" in params; if yours does, this works.
184
- params = {}
185
  if language != "Auto":
186
- params["language"] = language # e.g. "en", "fr"
 
 
 
 
 
187
 
188
- result = clients.asr.automatic_speech_recognition(wav_path, **params)
189
  text = result.get("text", "") if isinstance(result, dict) else str(result)
190
  text = clean_text(text)
191
 
@@ -199,7 +210,7 @@ def transcribe_video(video_path: str, language: str) -> str:
199
 
200
 
201
  def make_user_prompt(
202
- transcript: str,
203
  language: str,
204
  duration_label: str,
205
  tone: str,
@@ -209,37 +220,32 @@ def make_user_prompt(
209
  target_words = estimate_words_for_seconds(seconds)
210
 
211
  return f"""Constraints:
212
- - Language: {language if language != "Auto" else "Match transcript language"}
213
  - Target duration: ~{seconds} seconds
214
  - Target length: ~{target_words} words (keep it tight)
215
  - Tone: {tone}
216
  - Format: {fmt}
217
 
218
- Transcript:
219
- \"\"\"{transcript}\"\"\"
220
  """
221
 
222
 
223
  def notes_first_pass(clients: HFClients, transcript: str, language: str) -> str:
224
- # A cheap compression step for long transcripts
225
- prompt = f"""You are an editor. Convert this transcript into concise bullet notes.
 
226
  Rules:
227
  - Keep only key facts mentioned.
228
  - No inventions.
229
  - 8 to 14 bullets max.
230
- - Language: {language if language != "Auto" else "Match transcript"}
231
 
232
  Transcript:
233
  \"\"\"{transcript}\"\"\"
234
 
235
  Bullets:"""
236
-
237
- out = clients.llm.text_generation(
238
- prompt,
239
- max_new_tokens=300,
240
- temperature=0.2,
241
- return_full_text=False,
242
- )
243
  return clean_text(out)
244
 
245
 
@@ -257,47 +263,42 @@ def generate_script(
257
  if not transcript:
258
  raise RuntimeError("Transcript is empty. Transcribe first or paste a transcript.")
259
 
260
- # Notes-first threshold: tweak as you like
261
  too_long = len(transcript) > 4500
262
  use_notes = force_notes_first or too_long
263
 
264
- source_text = transcript
265
  if use_notes:
266
  notes = notes_first_pass(clients, transcript, language)
267
- source_text = f"NOTES:\n{notes}"
268
-
269
- user_prompt = make_user_prompt(source_text, language, duration_label, tone, fmt)
270
 
271
- # Keep generation settings conservative for small models
272
- full_prompt = f"{SYSTEM_PROMPT}\n\n{user_prompt}"
273
 
274
- out = clients.llm.text_generation(
275
- full_prompt,
276
- max_new_tokens=700,
 
 
277
  temperature=0.4,
278
- top_p=0.9,
279
- return_full_text=False,
280
  )
281
- script = clean_text(out)
282
-
283
  if not script:
284
  raise RuntimeError("Script generation returned empty text.")
285
-
286
  return script
287
 
288
 
289
  # -----------------------------
290
  # Gradio callbacks
291
  # -----------------------------
292
- def ui_transcribe(video_file, language, status):
293
  if video_file is None:
294
  return gr.update(), "Please upload a video first."
295
  try:
296
- status = "Checking duration + extracting audio…"
297
  transcript = transcribe_video(video_file, language)
298
  return transcript, "Done: transcript ready."
299
  except Exception as e:
300
- return gr.update(), f"Transcription error: {e}"
 
301
 
302
 
303
  def ui_generate(video_file, transcript, language, duration_label, tone, fmt, force_notes_first):
@@ -316,21 +317,25 @@ def ui_generate(video_file, transcript, language, duration_label, tone, fmt, for
316
  )
317
  return transcript, script, "Done: script generated."
318
  except Exception as e:
319
- return transcript, gr.update(), f"Script error: {e}"
 
320
 
321
 
322
  # -----------------------------
323
  # UI
324
  # -----------------------------
325
  with gr.Blocks(title="Video → Transcript → Script") as demo:
326
- gr.Markdown("## Video → Transcript → Script\nUpload a video (max 10 min), transcribe with Whisper Turbo, then generate a script with Qwen3 via HF API.")
 
 
 
327
 
328
  with gr.Row():
329
  with gr.Column(scale=1):
330
  video = gr.Video(label="Upload video", format="mp4")
331
  language = gr.Dropdown(
332
  label="Language",
333
- choices=["Auto", "en", "nl"],
334
  value="Auto",
335
  )
336
  duration_label = gr.Dropdown(
@@ -362,7 +367,7 @@ with gr.Blocks(title="Video → Transcript → Script") as demo:
362
 
363
  btn_transcribe.click(
364
  fn=ui_transcribe,
365
- inputs=[video, language, status],
366
  outputs=[transcript, status],
367
  )
368
 
 
1
  # app.py
2
+ # Hugging Face Spaces Gradio app: upload video -> transcribe (Whisper large-v3-turbo via HF API) -> script (Qwen3 via HF chat completion)
3
+ #
4
+ # Notes:
5
+ # - Put HF_TOKEN in Space Secrets.
6
+ # - Needs ffmpeg + ffprobe available in the Space runtime.
7
 
8
  import os
9
  import re
 
11
  import hashlib
12
  import tempfile
13
  import subprocess
14
+ import traceback
15
  from dataclasses import dataclass
16
+ from typing import Tuple, Dict
17
 
18
  import gradio as gr
19
  from huggingface_hub import InferenceClient
 
21
  # -----------------------------
22
  # Config
23
  # -----------------------------
24
+ HF_TOKEN = os.getenv("HF_TOKEN") # Space Secrets
 
25
 
26
+ ASR_MODEL_ID = os.getenv("ASR_MODEL_ID", "openai/whisper-large-v3-turbo")
27
+ LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Qwen/Qwen3-0.6B") # override if you want a different Qwen3
 
28
 
29
  MAX_VIDEO_SECONDS = 10 * 60 # 10 minutes
30
  CACHE_DIR = os.getenv("CACHE_DIR", "/tmp/hf_gradio_cache")
 
31
  os.makedirs(CACHE_DIR, exist_ok=True)
32
 
 
33
  # -----------------------------
34
+ # Hardcoded examples in system prompt (replace with yours)
35
+ # Keep examples short for small LLMs.
 
36
  # -----------------------------
37
  SYSTEM_PROMPT = """You are a scriptwriter. You transform a video transcript into a polished script.
38
 
 
41
  - If something is unclear in the transcript, stay neutral or mark it as [unclear].
42
  - Match the style from the examples.
43
  - Keep the script within the requested duration.
44
+ - Always write the final script in the requested output language.
45
 
46
  STYLE EXAMPLES (hardcoded):
47
 
 
49
  TRANSCRIPT:
50
  "we launched a new feature today. it helps users summarize long articles faster."
51
  SCRIPT:
52
+ Title: New feature drop
53
+ Hook: Big update today.
54
+ Body: We just launched a feature that turns long reads into quick, clear summaries. Drop in an article, get the key points in seconds.
55
+ Closing: If you’ve been drowning in tabs, this one’s for you.
56
 
57
  Example 2
58
  TRANSCRIPT:
59
  "the storm caused delays across the region. officials said repairs will take two days."
60
  SCRIPT:
61
+ Title: Storm delays
62
+ Hook: Here’s what’s happening.
63
+ Body: A storm has disrupted travel across the region. Officials say repairs could take around two days, so delays may continue.
64
+ Closing: If you’re heading out, check updates before you go.
65
 
66
+ Output format (always):
67
  Title:
68
  Hook:
69
  Body:
70
  Closing:
71
  """
72
 
 
73
  # -----------------------------
74
  # Helpers
75
  # -----------------------------
 
87
 
88
 
89
  def get_video_duration_seconds(video_path: str) -> float:
 
90
  cmd = [
91
  "ffprobe", "-v", "error",
 
92
  "-show_entries", "format=duration",
93
  "-of", "json",
94
  video_path,
 
97
  if code != 0:
98
  raise RuntimeError(f"ffprobe failed: {err.strip() or out.strip()}")
99
  data = json.loads(out)
100
+ return float(data["format"]["duration"])
 
101
 
102
 
103
  def extract_audio_wav_16k_mono(video_path: str, wav_path: str) -> None:
 
104
  cmd = [
105
  "ffmpeg", "-y",
106
  "-i", video_path,
 
112
  ]
113
  code, out, err = _run(cmd)
114
  if code != 0:
115
+ raise RuntimeError(f"ffmpeg failed: {err.strip() or out.strip()}")
116
+
117
+
118
def clean_text(s: str) -> str:
    """Normalize whitespace: collapse runs to single spaces and trim the ends.

    Treats None as an empty string so callers can pass optional ASR/LLM output.
    """
    normalized = re.sub(r"\s+", " ", s or "")
    return normalized.strip()
121
 
122
 
123
def seconds_from_label(label: str) -> int:
    """Translate a UI duration label ("30s", "45s", "60s", "90s", "2m") to seconds.

    Unknown labels fall back to 60 seconds.
    """
    durations = {
        "30s": 30,
        "45s": 45,
        "60s": 60,
        "90s": 90,
        "2m": 120,
    }
    return durations.get(label, 60)
126
 
127
 
128
def estimate_words_for_seconds(seconds: int) -> int:
    """Return a word budget for a voiceover of the given length.

    Assumes roughly 150 words per minute (2.5 words/second), with a floor of
    40 words so very short clips still get a usable script.
    """
    estimated = int(seconds * 2.5)
    return estimated if estimated > 40 else 40
131
 
132
 
133
def language_name(code: str) -> str:
    """Expand a dropdown language code into wording the LLM prompt can use.

    Unknown codes (including "Auto") tell the model to mirror the transcript.
    """
    known = {"en": "English", "fr": "French", "nl": "Dutch"}
    return known.get(code, "Match transcript language")
 
135
 
136
 
137
@dataclass
class HFClients:
    """Bundle of Hugging Face Inference API clients used by the app."""

    # Client pinned to ASR_MODEL_ID, used for automatic_speech_recognition calls.
    asr: InferenceClient
    api: InferenceClient # generic client used for chat completion
141
 
142
 
143
  def make_clients() -> HFClients:
 
145
  raise RuntimeError("Missing HF_TOKEN. Add it in your Space Secrets.")
146
  return HFClients(
147
  asr=InferenceClient(model=ASR_MODEL_ID, token=HF_TOKEN),
148
+ api=InferenceClient(token=HF_TOKEN),
149
  )
150
 
151
 
152
def cache_paths(file_hash: str) -> Dict[str, str]:
    """Build the cache-file path(s), keyed by artifact name, for a video hash."""
    transcript_file = f"{file_hash}.transcript.txt"
    return {"transcript": os.path.join(CACHE_DIR, transcript_file)}
156
 
157
 
158
def llm_chat(clients: HFClients, system: str, user: str, max_tokens: int, temperature: float) -> str:
    """Run one system+user chat completion against LLM_MODEL_ID and return the reply text.

    Args:
        clients: HF client bundle; only the generic `api` client is used.
        system: System prompt content.
        user: User prompt content.
        max_tokens: Completion token cap.
        temperature: Sampling temperature (top_p is fixed at 0.9).
    """
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    response = clients.api.chat_completion(
        model=LLM_MODEL_ID,
        messages=conversation,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.9,
    )
    return response.choices[0].message.content
170
+
171
+
172
  def transcribe_video(video_path: str, language: str) -> str:
173
  clients = make_clients()
174
 
 
187
  wav_path = os.path.join(td, "audio.wav")
188
  extract_audio_wav_16k_mono(video_path, wav_path)
189
 
190
+ # Some ASR endpoints accept "language" param, some ignore it.
191
+ # We try it when set, and fall back without it if needed.
 
 
192
  if language != "Auto":
193
+ try:
194
+ result = clients.asr.automatic_speech_recognition(wav_path, language=language)
195
+ except TypeError:
196
+ result = clients.asr.automatic_speech_recognition(wav_path)
197
+ else:
198
+ result = clients.asr.automatic_speech_recognition(wav_path)
199
 
 
200
  text = result.get("text", "") if isinstance(result, dict) else str(result)
201
  text = clean_text(text)
202
 
 
210
 
211
 
212
  def make_user_prompt(
213
+ transcript_or_notes: str,
214
  language: str,
215
  duration_label: str,
216
  tone: str,
 
220
  target_words = estimate_words_for_seconds(seconds)
221
 
222
  return f"""Constraints:
223
+ - Output language: {language_name(language) if language != "Auto" else "Match transcript language"}
224
  - Target duration: ~{seconds} seconds
225
  - Target length: ~{target_words} words (keep it tight)
226
  - Tone: {tone}
227
  - Format: {fmt}
228
 
229
+ Source:
230
+ \"\"\"{transcript_or_notes}\"\"\"
231
  """
232
 
233
 
234
def notes_first_pass(clients: HFClients, transcript: str, language: str) -> str:
    """Compress a transcript into concise bullet notes with a cheap LLM pass.

    Used before script generation when the transcript is long, so the small
    LLM works from a tight factual summary instead of the raw text.

    Args:
        clients: HF client bundle (only the chat client is used here).
        transcript: Raw transcript text.
        language: UI language code, or "Auto" to mirror the transcript.

    Returns:
        Whitespace-normalized bullet notes from the model.
    """
    # Renamed from `sys`: that local shadowed the stdlib `sys` module name.
    system_msg = "You are an editor. Return concise bullet notes only."
    user = f"""Convert this transcript into concise bullet notes.

Rules:
- Keep only key facts mentioned.
- No inventions.
- 8 to 14 bullets max.
- Output language: {language_name(language) if language != "Auto" else "Match transcript language"}

Transcript:
\"\"\"{transcript}\"\"\"

Bullets:"""
    out = llm_chat(clients, system_msg, user, max_tokens=320, temperature=0.2)
    return clean_text(out)
250
 
251
 
 
263
  if not transcript:
264
  raise RuntimeError("Transcript is empty. Transcribe first or paste a transcript.")
265
 
266
+ # Notes-first helps small models on long inputs
267
  too_long = len(transcript) > 4500
268
  use_notes = force_notes_first or too_long
269
 
270
+ source = transcript
271
  if use_notes:
272
  notes = notes_first_pass(clients, transcript, language)
273
+ source = f"NOTES:\n{notes}"
 
 
274
 
275
+ user_prompt = make_user_prompt(source, language, duration_label, tone, fmt)
 
276
 
277
+ script = llm_chat(
278
+ clients,
279
+ system=SYSTEM_PROMPT,
280
+ user=user_prompt,
281
+ max_tokens=750,
282
  temperature=0.4,
 
 
283
  )
284
+ script = script.strip()
 
285
  if not script:
286
  raise RuntimeError("Script generation returned empty text.")
 
287
  return script
288
 
289
 
290
  # -----------------------------
291
  # Gradio callbacks
292
  # -----------------------------
293
def ui_transcribe(video_file, language):
    """Gradio callback: run transcription for the uploaded video.

    Returns (transcript_update, status_message); on failure the transcript
    box is left untouched and the status shows the error plus its traceback.
    """
    if video_file is None:
        return gr.update(), "Please upload a video first."
    try:
        text = transcribe_video(video_file, language)
    except Exception as e:
        tb = traceback.format_exc()
        return gr.update(), f"Transcription error: {repr(e)}\n\n{tb}"
    return text, "Done: transcript ready."
302
 
303
 
304
  def ui_generate(video_file, transcript, language, duration_label, tone, fmt, force_notes_first):
 
317
  )
318
  return transcript, script, "Done: script generated."
319
  except Exception as e:
320
+ tb = traceback.format_exc()
321
+ return transcript, gr.update(), f"Script error: {repr(e)}\n\n{tb}"
322
 
323
 
324
  # -----------------------------
325
  # UI
326
  # -----------------------------
327
  with gr.Blocks(title="Video → Transcript → Script") as demo:
328
+ gr.Markdown(
329
+ "## Video → Transcript → Script\n"
330
+ "Upload a video (max 10 min), transcribe with Whisper Turbo, then generate a script with Qwen3 via HF API."
331
+ )
332
 
333
  with gr.Row():
334
  with gr.Column(scale=1):
335
  video = gr.Video(label="Upload video", format="mp4")
336
  language = gr.Dropdown(
337
  label="Language",
338
+ choices=["Auto", "en", "fr", "nl"],
339
  value="Auto",
340
  )
341
  duration_label = gr.Dropdown(
 
367
 
368
  btn_transcribe.click(
369
  fn=ui_transcribe,
370
+ inputs=[video, language],
371
  outputs=[transcript, status],
372
  )
373