fdaudens committed on
Commit
ec5bec4
·
verified ·
1 Parent(s): 05529eb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -84
app.py CHANGED
@@ -1,9 +1,4 @@
1
  # app.py
2
- # Hugging Face Spaces Gradio app: upload video -> transcribe (Whisper large-v3-turbo via HF API) -> script (Qwen3 via HF chat completion)
3
- #
4
- # Notes:
5
- # - Put HF_TOKEN in Space Secrets.
6
- # - Needs ffmpeg + ffprobe available in the Space runtime.
7
 
8
  import os
9
  import re
@@ -24,7 +19,14 @@ from huggingface_hub import InferenceClient
24
  HF_TOKEN = os.getenv("HF_TOKEN") # Space Secrets
25
 
26
  ASR_MODEL_ID = os.getenv("ASR_MODEL_ID", "openai/whisper-large-v3-turbo")
27
- LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Qwen/Qwen3-0.6B") # override if you want a different Qwen3
 
 
 
 
 
 
 
28
 
29
  MAX_VIDEO_SECONDS = 10 * 60 # 10 minutes
30
  CACHE_DIR = os.getenv("CACHE_DIR", "/tmp/hf_gradio_cache")
@@ -32,7 +34,6 @@ os.makedirs(CACHE_DIR, exist_ok=True)
32
 
33
  # -----------------------------
34
  # Hardcoded examples in system prompt (replace with yours)
35
- # Keep examples short for small LLMs.
36
  # -----------------------------
37
  SYSTEM_PROMPT = """You are a scriptwriter. You transform a video transcript into a polished script.
38
 
@@ -116,8 +117,7 @@ def extract_audio_wav_16k_mono(video_path: str, wav_path: str) -> None:
116
 
117
 
118
  def clean_text(s: str) -> str:
119
- s = re.sub(r"\s+", " ", (s or "")).strip()
120
- return s
121
 
122
 
123
  def seconds_from_label(label: str) -> int:
@@ -126,7 +126,6 @@ def seconds_from_label(label: str) -> int:
126
 
127
 
128
  def estimate_words_for_seconds(seconds: int) -> int:
129
- # Rough VO pacing: ~150 wpm => ~2.5 words/sec
130
  return max(40, int(seconds * 2.5))
131
 
132
 
@@ -137,7 +136,7 @@ def language_name(code: str) -> str:
137
  @dataclass
138
  class HFClients:
139
  asr: InferenceClient
140
- api: InferenceClient # generic client used for chat completion
141
 
142
 
143
  def make_clients() -> HFClients:
@@ -145,14 +144,12 @@ def make_clients() -> HFClients:
145
  raise RuntimeError("Missing HF_TOKEN. Add it in your Space Secrets.")
146
  return HFClients(
147
  asr=InferenceClient(model=ASR_MODEL_ID, token=HF_TOKEN),
148
- api=InferenceClient(token=HF_TOKEN),
149
  )
150
 
151
 
152
  def cache_paths(file_hash: str) -> Dict[str, str]:
153
- return {
154
- "transcript": os.path.join(CACHE_DIR, f"{file_hash}.transcript.txt"),
155
- }
156
 
157
 
158
  def llm_chat(clients: HFClients, system: str, user: str, max_tokens: int, temperature: float) -> str:
@@ -187,8 +184,6 @@ def transcribe_video(video_path: str, language: str) -> str:
187
  wav_path = os.path.join(td, "audio.wav")
188
  extract_audio_wav_16k_mono(video_path, wav_path)
189
 
190
- # Some ASR endpoints accept "language" param, some ignore it.
191
- # We try it when set, and fall back without it if needed.
192
  if language != "Auto":
193
  try:
194
  result = clients.asr.automatic_speech_recognition(wav_path, language=language)
@@ -209,20 +204,13 @@ def transcribe_video(video_path: str, language: str) -> str:
209
  return text
210
 
211
 
212
- def make_user_prompt(
213
- transcript_or_notes: str,
214
- language: str,
215
- duration_label: str,
216
- tone: str,
217
- fmt: str,
218
- ) -> str:
219
  seconds = seconds_from_label(duration_label)
220
  target_words = estimate_words_for_seconds(seconds)
221
-
222
  return f"""Constraints:
223
  - Output language: {language_name(language) if language != "Auto" else "Match transcript language"}
224
  - Target duration: ~{seconds} seconds
225
- - Target length: ~{target_words} words (keep it tight)
226
  - Tone: {tone}
227
  - Format: {fmt}
228
 
@@ -249,21 +237,13 @@ Bullets:"""
249
  return clean_text(out)
250
 
251
 
252
- def generate_script(
253
- transcript: str,
254
- language: str,
255
- duration_label: str,
256
- tone: str,
257
- fmt: str,
258
- force_notes_first: bool,
259
- ) -> str:
260
  clients = make_clients()
261
 
262
  transcript = clean_text(transcript)
263
  if not transcript:
264
  raise RuntimeError("Transcript is empty. Transcribe first or paste a transcript.")
265
 
266
- # Notes-first helps small models on long inputs
267
  too_long = len(transcript) > 4500
268
  use_notes = force_notes_first or too_long
269
 
@@ -273,15 +253,8 @@ def generate_script(
273
  source = f"NOTES:\n{notes}"
274
 
275
  user_prompt = make_user_prompt(source, language, duration_label, tone, fmt)
 
276
 
277
- script = llm_chat(
278
- clients,
279
- system=SYSTEM_PROMPT,
280
- user=user_prompt,
281
- max_tokens=750,
282
- temperature=0.4,
283
- )
284
- script = script.strip()
285
  if not script:
286
  raise RuntimeError("Script generation returned empty text.")
287
  return script
@@ -303,18 +276,10 @@ def ui_transcribe(video_file, language):
303
 
304
  def ui_generate(video_file, transcript, language, duration_label, tone, fmt, force_notes_first):
305
  try:
306
- # If transcript is empty but video exists, auto-transcribe first
307
  if (not transcript or not transcript.strip()) and video_file is not None:
308
  transcript = transcribe_video(video_file, language)
309
 
310
- script = generate_script(
311
- transcript=transcript,
312
- language=language,
313
- duration_label=duration_label,
314
- tone=tone,
315
- fmt=fmt,
316
- force_notes_first=force_notes_first,
317
- )
318
  return transcript, script, "Done: script generated."
319
  except Exception as e:
320
  tb = traceback.format_exc()
@@ -327,32 +292,16 @@ def ui_generate(video_file, transcript, language, duration_label, tone, fmt, for
327
  with gr.Blocks(title="Video → Transcript → Script") as demo:
328
  gr.Markdown(
329
  "## Video → Transcript → Script\n"
330
- "Upload a video (max 10 min), transcribe with Whisper Turbo, then generate a script with Qwen3 via HF API."
331
  )
332
 
333
  with gr.Row():
334
  with gr.Column(scale=1):
335
  video = gr.Video(label="Upload video", format="mp4")
336
- language = gr.Dropdown(
337
- label="Language",
338
- choices=["Auto", "en", "fr", "nl"],
339
- value="Auto",
340
- )
341
- duration_label = gr.Dropdown(
342
- label="Script length",
343
- choices=["30s", "45s", "60s", "90s", "2m"],
344
- value="60s",
345
- )
346
- tone = gr.Dropdown(
347
- label="Tone",
348
- choices=["neutral", "punchy", "calm", "playful"],
349
- value="neutral",
350
- )
351
- fmt = gr.Dropdown(
352
- label="Format",
353
- choices=["voiceover", "anchor", "social short"],
354
- value="voiceover",
355
- )
356
  force_notes_first = gr.Checkbox(label="Notes-first (better for long transcripts)", value=False)
357
 
358
  with gr.Row():
@@ -365,17 +314,8 @@ with gr.Blocks(title="Video → Transcript → Script") as demo:
365
  transcript = gr.Textbox(label="Transcript (editable)", lines=10)
366
  script = gr.Textbox(label="Script (editable)", lines=14)
367
 
368
- btn_transcribe.click(
369
- fn=ui_transcribe,
370
- inputs=[video, language],
371
- outputs=[transcript, status],
372
- )
373
-
374
- btn_generate.click(
375
- fn=ui_generate,
376
- inputs=[video, transcript, language, duration_label, tone, fmt, force_notes_first],
377
- outputs=[transcript, script, status],
378
- )
379
 
380
  if __name__ == "__main__":
381
  demo.launch()
 
1
  # app.py
 
 
 
 
 
2
 
3
  import os
4
  import re
 
19
  HF_TOKEN = os.getenv("HF_TOKEN") # Space Secrets
20
 
21
  ASR_MODEL_ID = os.getenv("ASR_MODEL_ID", "openai/whisper-large-v3-turbo")
22
+
23
+ # IMPORTANT:
24
+ # Inference Providers (router.huggingface.co) often requires model + provider suffix:
25
+ # "model_id:provider"
26
+ # Examples that are listed as supported:
27
+ # - "Qwen/Qwen3-4B-Thinking-2507:nscale"
28
+ # - "meta-llama/Llama-3.2-1B-Instruct:novita"
29
+ LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Qwen/Qwen3-4B-Thinking-2507:nscale")
30
 
31
  MAX_VIDEO_SECONDS = 10 * 60 # 10 minutes
32
  CACHE_DIR = os.getenv("CACHE_DIR", "/tmp/hf_gradio_cache")
 
34
 
35
  # -----------------------------
36
  # Hardcoded examples in system prompt (replace with yours)
 
37
  # -----------------------------
38
  SYSTEM_PROMPT = """You are a scriptwriter. You transform a video transcript into a polished script.
39
 
 
117
 
118
 
119
  def clean_text(s: str) -> str:
120
+ return re.sub(r"\s+", " ", (s or "")).strip()
 
121
 
122
 
123
  def seconds_from_label(label: str) -> int:
 
126
 
127
 
128
  def estimate_words_for_seconds(seconds: int) -> int:
 
129
  return max(40, int(seconds * 2.5))
130
 
131
 
 
136
  @dataclass
137
  class HFClients:
138
  asr: InferenceClient
139
+ api: InferenceClient
140
 
141
 
142
  def make_clients() -> HFClients:
 
144
  raise RuntimeError("Missing HF_TOKEN. Add it in your Space Secrets.")
145
  return HFClients(
146
  asr=InferenceClient(model=ASR_MODEL_ID, token=HF_TOKEN),
147
+ api=InferenceClient(token=HF_TOKEN), # router client
148
  )
149
 
150
 
151
  def cache_paths(file_hash: str) -> Dict[str, str]:
152
+ return {"transcript": os.path.join(CACHE_DIR, f"{file_hash}.transcript.txt")}
 
 
153
 
154
 
155
  def llm_chat(clients: HFClients, system: str, user: str, max_tokens: int, temperature: float) -> str:
 
184
  wav_path = os.path.join(td, "audio.wav")
185
  extract_audio_wav_16k_mono(video_path, wav_path)
186
 
 
 
187
  if language != "Auto":
188
  try:
189
  result = clients.asr.automatic_speech_recognition(wav_path, language=language)
 
204
  return text
205
 
206
 
207
+ def make_user_prompt(transcript_or_notes: str, language: str, duration_label: str, tone: str, fmt: str) -> str:
 
 
 
 
 
 
208
  seconds = seconds_from_label(duration_label)
209
  target_words = estimate_words_for_seconds(seconds)
 
210
  return f"""Constraints:
211
  - Output language: {language_name(language) if language != "Auto" else "Match transcript language"}
212
  - Target duration: ~{seconds} seconds
213
+ - Target length: ~{target_words} words
214
  - Tone: {tone}
215
  - Format: {fmt}
216
 
 
237
  return clean_text(out)
238
 
239
 
240
+ def generate_script(transcript: str, language: str, duration_label: str, tone: str, fmt: str, force_notes_first: bool) -> str:
 
 
 
 
 
 
 
241
  clients = make_clients()
242
 
243
  transcript = clean_text(transcript)
244
  if not transcript:
245
  raise RuntimeError("Transcript is empty. Transcribe first or paste a transcript.")
246
 
 
247
  too_long = len(transcript) > 4500
248
  use_notes = force_notes_first or too_long
249
 
 
253
  source = f"NOTES:\n{notes}"
254
 
255
  user_prompt = make_user_prompt(source, language, duration_label, tone, fmt)
256
+ script = llm_chat(clients, SYSTEM_PROMPT, user_prompt, max_tokens=750, temperature=0.4).strip()
257
 
 
 
 
 
 
 
 
 
258
  if not script:
259
  raise RuntimeError("Script generation returned empty text.")
260
  return script
 
276
 
277
  def ui_generate(video_file, transcript, language, duration_label, tone, fmt, force_notes_first):
278
  try:
 
279
  if (not transcript or not transcript.strip()) and video_file is not None:
280
  transcript = transcribe_video(video_file, language)
281
 
282
+ script = generate_script(transcript, language, duration_label, tone, fmt, force_notes_first)
 
 
 
 
 
 
 
283
  return transcript, script, "Done: script generated."
284
  except Exception as e:
285
  tb = traceback.format_exc()
 
292
  with gr.Blocks(title="Video → Transcript → Script") as demo:
293
  gr.Markdown(
294
  "## Video → Transcript → Script\n"
295
+ "Upload a video (max 10 min), transcribe with Whisper Turbo, then generate a script with an Inference Providers model."
296
  )
297
 
298
  with gr.Row():
299
  with gr.Column(scale=1):
300
  video = gr.Video(label="Upload video", format="mp4")
301
+ language = gr.Dropdown(label="Language", choices=["Auto", "en", "fr", "nl"], value="Auto")
302
+ duration_label = gr.Dropdown(label="Script length", choices=["30s", "45s", "60s", "90s", "2m"], value="60s")
303
+ tone = gr.Dropdown(label="Tone", choices=["neutral", "punchy", "calm", "playful"], value="neutral")
304
+ fmt = gr.Dropdown(label="Format", choices=["voiceover", "anchor", "social short"], value="voiceover")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  force_notes_first = gr.Checkbox(label="Notes-first (better for long transcripts)", value=False)
306
 
307
  with gr.Row():
 
314
  transcript = gr.Textbox(label="Transcript (editable)", lines=10)
315
  script = gr.Textbox(label="Script (editable)", lines=14)
316
 
317
+ btn_transcribe.click(fn=ui_transcribe, inputs=[video, language], outputs=[transcript, status])
318
+ btn_generate.click(fn=ui_generate, inputs=[video, transcript, language, duration_label, tone, fmt, force_notes_first], outputs=[transcript, script, status])
 
 
 
 
 
 
 
 
 
319
 
320
  if __name__ == "__main__":
321
  demo.launch()