Spaces:

fdaudens
/

script-writer

Runtime error

App Files Community

fdaudens committed on Feb 26

Commit

ec5bec4

verified ·

1 Parent(s): 05529eb

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -84

app.py CHANGED Viewed

@@ -1,9 +1,4 @@
 # app.py
-# Hugging Face Spaces Gradio app: upload video -> transcribe (Whisper large-v3-turbo via HF API) -> script (Qwen3 via HF chat completion)
-#
-# Notes:
-# - Put HF_TOKEN in Space Secrets.
-# - Needs ffmpeg + ffprobe available in the Space runtime.
 import os
 import re
@@ -24,7 +19,14 @@ from huggingface_hub import InferenceClient
 HF_TOKEN = os.getenv("HF_TOKEN")  # Space Secrets
 ASR_MODEL_ID = os.getenv("ASR_MODEL_ID", "openai/whisper-large-v3-turbo")
-LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Qwen/Qwen3-0.6B")  # override if you want a different Qwen3
 MAX_VIDEO_SECONDS = 10 * 60  # 10 minutes
 CACHE_DIR = os.getenv("CACHE_DIR", "/tmp/hf_gradio_cache")
@@ -32,7 +34,6 @@ os.makedirs(CACHE_DIR, exist_ok=True)
 # -----------------------------
 # Hardcoded examples in system prompt (replace with yours)
-# Keep examples short for small LLMs.
 # -----------------------------
 SYSTEM_PROMPT = """You are a scriptwriter. You transform a video transcript into a polished script.
@@ -116,8 +117,7 @@ def extract_audio_wav_16k_mono(video_path: str, wav_path: str) -> None:
 def clean_text(s: str) -> str:
-    s = re.sub(r"\s+", " ", (s or "")).strip()
-    return s
 def seconds_from_label(label: str) -> int:
@@ -126,7 +126,6 @@ def seconds_from_label(label: str) -> int:
 def estimate_words_for_seconds(seconds: int) -> int:
-    # Rough VO pacing: ~150 wpm => ~2.5 words/sec
     return max(40, int(seconds * 2.5))
@@ -137,7 +136,7 @@ def language_name(code: str) -> str:
 @dataclass
 class HFClients:
     asr: InferenceClient
-    api: InferenceClient  # generic client used for chat completion
 def make_clients() -> HFClients:
@@ -145,14 +144,12 @@ def make_clients() -> HFClients:
         raise RuntimeError("Missing HF_TOKEN. Add it in your Space Secrets.")
     return HFClients(
         asr=InferenceClient(model=ASR_MODEL_ID, token=HF_TOKEN),
-        api=InferenceClient(token=HF_TOKEN),
     )
 def cache_paths(file_hash: str) -> Dict[str, str]:
-    return {
-        "transcript": os.path.join(CACHE_DIR, f"{file_hash}.transcript.txt"),
-    }
 def llm_chat(clients: HFClients, system: str, user: str, max_tokens: int, temperature: float) -> str:
@@ -187,8 +184,6 @@ def transcribe_video(video_path: str, language: str) -> str:
         wav_path = os.path.join(td, "audio.wav")
         extract_audio_wav_16k_mono(video_path, wav_path)
-        # Some ASR endpoints accept "language" param, some ignore it.
-        # We try it when set, and fall back without it if needed.
         if language != "Auto":
             try:
                 result = clients.asr.automatic_speech_recognition(wav_path, language=language)
@@ -209,20 +204,13 @@ def transcribe_video(video_path: str, language: str) -> str:
     return text
-def make_user_prompt(
-    transcript_or_notes: str,
-    language: str,
-    duration_label: str,
-    tone: str,
-    fmt: str,
-) -> str:
     seconds = seconds_from_label(duration_label)
     target_words = estimate_words_for_seconds(seconds)
     return f"""Constraints:
 - Output language: {language_name(language) if language != "Auto" else "Match transcript language"}
 - Target duration: ~{seconds} seconds
-- Target length: ~{target_words} words (keep it tight)
 - Tone: {tone}
 - Format: {fmt}
@@ -249,21 +237,13 @@ Bullets:"""
     return clean_text(out)
-def generate_script(
-    transcript: str,
-    language: str,
-    duration_label: str,
-    tone: str,
-    fmt: str,
-    force_notes_first: bool,
-) -> str:
     clients = make_clients()
     transcript = clean_text(transcript)
     if not transcript:
         raise RuntimeError("Transcript is empty. Transcribe first or paste a transcript.")
-    # Notes-first helps small models on long inputs
     too_long = len(transcript) > 4500
     use_notes = force_notes_first or too_long
@@ -273,15 +253,8 @@ def generate_script(
         source = f"NOTES:\n{notes}"
     user_prompt = make_user_prompt(source, language, duration_label, tone, fmt)
-    script = llm_chat(
-        clients,
-        system=SYSTEM_PROMPT,
-        user=user_prompt,
-        max_tokens=750,
-        temperature=0.4,
-    )
-    script = script.strip()
     if not script:
         raise RuntimeError("Script generation returned empty text.")
     return script
@@ -303,18 +276,10 @@ def ui_transcribe(video_file, language):
 def ui_generate(video_file, transcript, language, duration_label, tone, fmt, force_notes_first):
     try:
-        # If transcript is empty but video exists, auto-transcribe first
         if (not transcript or not transcript.strip()) and video_file is not None:
             transcript = transcribe_video(video_file, language)
-        script = generate_script(
-            transcript=transcript,
-            language=language,
-            duration_label=duration_label,
-            tone=tone,
-            fmt=fmt,
-            force_notes_first=force_notes_first,
-        )
         return transcript, script, "Done: script generated."
     except Exception as e:
         tb = traceback.format_exc()
@@ -327,32 +292,16 @@ def ui_generate(video_file, transcript, language, duration_label, tone, fmt, for
 with gr.Blocks(title="Video → Transcript → Script") as demo:
     gr.Markdown(
         "## Video → Transcript → Script\n"
-        "Upload a video (max 10 min), transcribe with Whisper Turbo, then generate a script with Qwen3 via HF API."
     )
     with gr.Row():
         with gr.Column(scale=1):
             video = gr.Video(label="Upload video", format="mp4")
-            language = gr.Dropdown(
-                label="Language",
-                choices=["Auto", "en", "fr", "nl"],
-                value="Auto",
-            )
-            duration_label = gr.Dropdown(
-                label="Script length",
-                choices=["30s", "45s", "60s", "90s", "2m"],
-                value="60s",
-            )
-            tone = gr.Dropdown(
-                label="Tone",
-                choices=["neutral", "punchy", "calm", "playful"],
-                value="neutral",
-            )
-            fmt = gr.Dropdown(
-                label="Format",
-                choices=["voiceover", "anchor", "social short"],
-                value="voiceover",
-            )
             force_notes_first = gr.Checkbox(label="Notes-first (better for long transcripts)", value=False)
             with gr.Row():
@@ -365,17 +314,8 @@ with gr.Blocks(title="Video → Transcript → Script") as demo:
             transcript = gr.Textbox(label="Transcript (editable)", lines=10)
             script = gr.Textbox(label="Script (editable)", lines=14)
-    btn_transcribe.click(
-        fn=ui_transcribe,
-        inputs=[video, language],
-        outputs=[transcript, status],
-    )
-    btn_generate.click(
-        fn=ui_generate,
-        inputs=[video, transcript, language, duration_label, tone, fmt, force_notes_first],
-        outputs=[transcript, script, status],
-    )
 if __name__ == "__main__":
     demo.launch()

 # app.py
 import os
 import re
 HF_TOKEN = os.getenv("HF_TOKEN")  # Space Secrets
 ASR_MODEL_ID = os.getenv("ASR_MODEL_ID", "openai/whisper-large-v3-turbo")
+# IMPORTANT:
+# Inference Providers (router.huggingface.co) often requires the model ID to carry a provider suffix:
+#   "model_id:provider"
+# Examples that are listed as supported:
+# - "Qwen/Qwen3-4B-Thinking-2507:nscale"
+# - "meta-llama/Llama-3.2-1B-Instruct:novita"
+LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Qwen/Qwen3-4B-Thinking-2507:nscale")
 MAX_VIDEO_SECONDS = 10 * 60  # 10 minutes
 CACHE_DIR = os.getenv("CACHE_DIR", "/tmp/hf_gradio_cache")
 # -----------------------------
 # Hardcoded examples in system prompt (replace with yours)
 # -----------------------------
 SYSTEM_PROMPT = """You are a scriptwriter. You transform a video transcript into a polished script.
 def clean_text(s: str) -> str:
+    return re.sub(r"\s+", " ", (s or "")).strip()
 def seconds_from_label(label: str) -> int:
 def estimate_words_for_seconds(seconds: int) -> int:
     return max(40, int(seconds * 2.5))
 @dataclass
 class HFClients:
     asr: InferenceClient
+    api: InferenceClient
 def make_clients() -> HFClients:
         raise RuntimeError("Missing HF_TOKEN. Add it in your Space Secrets.")
     return HFClients(
         asr=InferenceClient(model=ASR_MODEL_ID, token=HF_TOKEN),
+        api=InferenceClient(token=HF_TOKEN),  # router client
     )
 def cache_paths(file_hash: str) -> Dict[str, str]:
+    return {"transcript": os.path.join(CACHE_DIR, f"{file_hash}.transcript.txt")}
 def llm_chat(clients: HFClients, system: str, user: str, max_tokens: int, temperature: float) -> str:
         wav_path = os.path.join(td, "audio.wav")
         extract_audio_wav_16k_mono(video_path, wav_path)
         if language != "Auto":
             try:
                 result = clients.asr.automatic_speech_recognition(wav_path, language=language)
     return text
+def make_user_prompt(transcript_or_notes: str, language: str, duration_label: str, tone: str, fmt: str) -> str:
     seconds = seconds_from_label(duration_label)
     target_words = estimate_words_for_seconds(seconds)
     return f"""Constraints:
 - Output language: {language_name(language) if language != "Auto" else "Match transcript language"}
 - Target duration: ~{seconds} seconds
+- Target length: ~{target_words} words
 - Tone: {tone}
 - Format: {fmt}
     return clean_text(out)
+def generate_script(transcript: str, language: str, duration_label: str, tone: str, fmt: str, force_notes_first: bool) -> str:
     clients = make_clients()
     transcript = clean_text(transcript)
     if not transcript:
         raise RuntimeError("Transcript is empty. Transcribe first or paste a transcript.")
     too_long = len(transcript) > 4500
     use_notes = force_notes_first or too_long
         source = f"NOTES:\n{notes}"
     user_prompt = make_user_prompt(source, language, duration_label, tone, fmt)
+    script = llm_chat(clients, SYSTEM_PROMPT, user_prompt, max_tokens=750, temperature=0.4).strip()
     if not script:
         raise RuntimeError("Script generation returned empty text.")
     return script
 def ui_generate(video_file, transcript, language, duration_label, tone, fmt, force_notes_first):
     try:
         if (not transcript or not transcript.strip()) and video_file is not None:
             transcript = transcribe_video(video_file, language)
+        script = generate_script(transcript, language, duration_label, tone, fmt, force_notes_first)
         return transcript, script, "Done: script generated."
     except Exception as e:
         tb = traceback.format_exc()
 with gr.Blocks(title="Video → Transcript → Script") as demo:
     gr.Markdown(
         "## Video → Transcript → Script\n"
+        "Upload a video (max 10 min), transcribe with Whisper Turbo, then generate a script with an Inference Providers model."
     )
     with gr.Row():
         with gr.Column(scale=1):
             video = gr.Video(label="Upload video", format="mp4")
+            language = gr.Dropdown(label="Language", choices=["Auto", "en", "fr", "nl"], value="Auto")
+            duration_label = gr.Dropdown(label="Script length", choices=["30s", "45s", "60s", "90s", "2m"], value="60s")
+            tone = gr.Dropdown(label="Tone", choices=["neutral", "punchy", "calm", "playful"], value="neutral")
+            fmt = gr.Dropdown(label="Format", choices=["voiceover", "anchor", "social short"], value="voiceover")
             force_notes_first = gr.Checkbox(label="Notes-first (better for long transcripts)", value=False)
             with gr.Row():
             transcript = gr.Textbox(label="Transcript (editable)", lines=10)
             script = gr.Textbox(label="Script (editable)", lines=14)
+    btn_transcribe.click(fn=ui_transcribe, inputs=[video, language], outputs=[transcript, status])
+    btn_generate.click(fn=ui_generate, inputs=[video, transcript, language, duration_label, tone, fmt, force_notes_first], outputs=[transcript, script, status])
 if __name__ == "__main__":
     demo.launch()