fdaudens committed on
Commit
05529eb
·
verified ·
1 Parent(s): e5d8f0d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -82
app.py CHANGED
@@ -1,5 +1,9 @@
1
  # app.py
2
- # Hugging Face Spaces Gradio app: upload video -> transcribe (Whisper large-v3-turbo) -> script (Qwen3 via HF API)
 
 
 
 
3
 
4
  import os
5
  import re
@@ -7,8 +11,9 @@ import json
7
  import hashlib
8
  import tempfile
9
  import subprocess
 
10
  from dataclasses import dataclass
11
- from typing import Optional, Tuple, Dict
12
 
13
  import gradio as gr
14
  from huggingface_hub import InferenceClient
@@ -16,23 +21,18 @@ from huggingface_hub import InferenceClient
16
  # -----------------------------
17
  # Config
18
  # -----------------------------
19
- HF_TOKEN = os.getenv("HF_TOKEN") # put this in Space Secrets
20
- ASR_MODEL_ID = os.getenv("ASR_MODEL_ID", "openai/whisper-large-v3-turbo") # verified on HF :contentReference[oaicite:0]{index=0}
21
 
22
- # Note: HF has Qwen3 models like 0.6B / 1.7B / 4B etc. (not always a literal "1B"). :contentReference[oaicite:1]{index=1}
23
- # Closest cheap starter defaults to 0.6B, override with env var if you want 1.7B.
24
- LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Qwen/Qwen3-0.6B")
25
 
26
  MAX_VIDEO_SECONDS = 10 * 60 # 10 minutes
27
  CACHE_DIR = os.getenv("CACHE_DIR", "/tmp/hf_gradio_cache")
28
-
29
  os.makedirs(CACHE_DIR, exist_ok=True)
30
 
31
-
32
  # -----------------------------
33
- # Hardcoded examples in system prompt
34
- # Put your real examples here.
35
- # Keep them short: Qwen small models benefit from tight few-shot.
36
  # -----------------------------
37
  SYSTEM_PROMPT = """You are a scriptwriter. You transform a video transcript into a polished script.
38
 
@@ -41,6 +41,7 @@ Rules:
41
  - If something is unclear in the transcript, stay neutral or mark it as [unclear].
42
  - Match the style from the examples.
43
  - Keep the script within the requested duration.
 
44
 
45
  STYLE EXAMPLES (hardcoded):
46
 
@@ -48,26 +49,27 @@ Example 1
48
  TRANSCRIPT:
49
  "we launched a new feature today. it helps users summarize long articles faster."
50
  SCRIPT:
51
- "Big update today: a new feature that turns long reads into quick, clear summaries.
52
- Here’s the idea: you drop in an article, and you get the key points in seconds.
53
- If you’ve been drowning in tabs, this one’s for you."
 
54
 
55
  Example 2
56
  TRANSCRIPT:
57
  "the storm caused delays across the region. officials said repairs will take two days."
58
  SCRIPT:
59
- "Here’s what’s happening: a storm has disrupted travel across the region.
60
- Officials say repairs could take around two days, so delays may continue.
61
- If you’re heading out, check updates before you go."
 
62
 
63
- Output format:
64
  Title:
65
  Hook:
66
  Body:
67
  Closing:
68
  """
69
 
70
-
71
  # -----------------------------
72
  # Helpers
73
  # -----------------------------
@@ -85,10 +87,8 @@ def sha256_file(path: str) -> str:
85
 
86
 
87
  def get_video_duration_seconds(video_path: str) -> float:
88
- # ffprobe returns duration in seconds (float). Works on Spaces typically.
89
  cmd = [
90
  "ffprobe", "-v", "error",
91
- "-select_streams", "v:0",
92
  "-show_entries", "format=duration",
93
  "-of", "json",
94
  video_path,
@@ -97,12 +97,10 @@ def get_video_duration_seconds(video_path: str) -> float:
97
  if code != 0:
98
  raise RuntimeError(f"ffprobe failed: {err.strip() or out.strip()}")
99
  data = json.loads(out)
100
- dur = float(data["format"]["duration"])
101
- return dur
102
 
103
 
104
  def extract_audio_wav_16k_mono(video_path: str, wav_path: str) -> None:
105
- # Standardize audio for ASR
106
  cmd = [
107
  "ffmpeg", "-y",
108
  "-i", video_path,
@@ -114,34 +112,32 @@ def extract_audio_wav_16k_mono(video_path: str, wav_path: str) -> None:
114
  ]
115
  code, out, err = _run(cmd)
116
  if code != 0:
117
- raise RuntimeError(f"ffmpeg audio extraction failed: {err.strip() or out.strip()}")
 
 
 
 
 
118
 
119
 
120
  def seconds_from_label(label: str) -> int:
121
- mapping = {
122
- "30s": 30,
123
- "45s": 45,
124
- "60s": 60,
125
- "90s": 90,
126
- "2m": 120,
127
- }
128
  return mapping.get(label, 60)
129
 
130
 
131
  def estimate_words_for_seconds(seconds: int) -> int:
132
- # Rough VO pacing: ~150 wpm => 2.5 words/sec
133
  return max(40, int(seconds * 2.5))
134
 
135
 
136
- def clean_text(s: str) -> str:
137
- s = re.sub(r"\s+", " ", s).strip()
138
- return s
139
 
140
 
141
  @dataclass
142
  class HFClients:
143
  asr: InferenceClient
144
- llm: InferenceClient
145
 
146
 
147
  def make_clients() -> HFClients:
@@ -149,17 +145,30 @@ def make_clients() -> HFClients:
149
  raise RuntimeError("Missing HF_TOKEN. Add it in your Space Secrets.")
150
  return HFClients(
151
  asr=InferenceClient(model=ASR_MODEL_ID, token=HF_TOKEN),
152
- llm=InferenceClient(model=LLM_MODEL_ID, token=HF_TOKEN),
153
  )
154
 
155
 
156
  def cache_paths(file_hash: str) -> Dict[str, str]:
157
  return {
158
  "transcript": os.path.join(CACHE_DIR, f"{file_hash}.transcript.txt"),
159
- "script": os.path.join(CACHE_DIR, f"{file_hash}.script.txt"),
160
  }
161
 
162
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  def transcribe_video(video_path: str, language: str) -> str:
164
  clients = make_clients()
165
 
@@ -178,14 +187,16 @@ def transcribe_video(video_path: str, language: str) -> str:
178
  wav_path = os.path.join(td, "audio.wav")
179
  extract_audio_wav_16k_mono(video_path, wav_path)
180
 
181
- # HF Inference API ASR: automatic_speech_recognition
182
- # language handling: HF API params vary; safest is to pass None for auto.
183
- # Some endpoints accept "language" in params; if yours does, this works.
184
- params = {}
185
  if language != "Auto":
186
- params["language"] = language # e.g. "en", "fr"
 
 
 
 
 
187
 
188
- result = clients.asr.automatic_speech_recognition(wav_path, **params)
189
  text = result.get("text", "") if isinstance(result, dict) else str(result)
190
  text = clean_text(text)
191
 
@@ -199,7 +210,7 @@ def transcribe_video(video_path: str, language: str) -> str:
199
 
200
 
201
  def make_user_prompt(
202
- transcript: str,
203
  language: str,
204
  duration_label: str,
205
  tone: str,
@@ -209,37 +220,32 @@ def make_user_prompt(
209
  target_words = estimate_words_for_seconds(seconds)
210
 
211
  return f"""Constraints:
212
- - Language: {language if language != "Auto" else "Match transcript language"}
213
  - Target duration: ~{seconds} seconds
214
  - Target length: ~{target_words} words (keep it tight)
215
  - Tone: {tone}
216
  - Format: {fmt}
217
 
218
- Transcript:
219
- \"\"\"{transcript}\"\"\"
220
  """
221
 
222
 
223
  def notes_first_pass(clients: HFClients, transcript: str, language: str) -> str:
224
- # A cheap compression step for long transcripts
225
- prompt = f"""You are an editor. Convert this transcript into concise bullet notes.
 
226
  Rules:
227
  - Keep only key facts mentioned.
228
  - No inventions.
229
  - 8 to 14 bullets max.
230
- - Language: {language if language != "Auto" else "Match transcript"}
231
 
232
  Transcript:
233
  \"\"\"{transcript}\"\"\"
234
 
235
  Bullets:"""
236
-
237
- out = clients.llm.text_generation(
238
- prompt,
239
- max_new_tokens=300,
240
- temperature=0.2,
241
- return_full_text=False,
242
- )
243
  return clean_text(out)
244
 
245
 
@@ -257,47 +263,42 @@ def generate_script(
257
  if not transcript:
258
  raise RuntimeError("Transcript is empty. Transcribe first or paste a transcript.")
259
 
260
- # Notes-first threshold: tweak as you like
261
  too_long = len(transcript) > 4500
262
  use_notes = force_notes_first or too_long
263
 
264
- source_text = transcript
265
  if use_notes:
266
  notes = notes_first_pass(clients, transcript, language)
267
- source_text = f"NOTES:\n{notes}"
268
-
269
- user_prompt = make_user_prompt(source_text, language, duration_label, tone, fmt)
270
 
271
- # Keep generation settings conservative for small models
272
- full_prompt = f"{SYSTEM_PROMPT}\n\n{user_prompt}"
273
 
274
- out = clients.llm.text_generation(
275
- full_prompt,
276
- max_new_tokens=700,
 
 
277
  temperature=0.4,
278
- top_p=0.9,
279
- return_full_text=False,
280
  )
281
- script = clean_text(out)
282
-
283
  if not script:
284
  raise RuntimeError("Script generation returned empty text.")
285
-
286
  return script
287
 
288
 
289
  # -----------------------------
290
  # Gradio callbacks
291
  # -----------------------------
292
- def ui_transcribe(video_file, language, status):
293
  if video_file is None:
294
  return gr.update(), "Please upload a video first."
295
  try:
296
- status = "Checking duration + extracting audio…"
297
  transcript = transcribe_video(video_file, language)
298
  return transcript, "Done: transcript ready."
299
  except Exception as e:
300
- return gr.update(), f"Transcription error: {e}"
 
301
 
302
 
303
  def ui_generate(video_file, transcript, language, duration_label, tone, fmt, force_notes_first):
@@ -316,21 +317,25 @@ def ui_generate(video_file, transcript, language, duration_label, tone, fmt, for
316
  )
317
  return transcript, script, "Done: script generated."
318
  except Exception as e:
319
- return transcript, gr.update(), f"Script error: {e}"
 
320
 
321
 
322
  # -----------------------------
323
  # UI
324
  # -----------------------------
325
  with gr.Blocks(title="Video → Transcript → Script") as demo:
326
- gr.Markdown("## Video → Transcript → Script\nUpload a video (max 10 min), transcribe with Whisper Turbo, then generate a script with Qwen3 via HF API.")
 
 
 
327
 
328
  with gr.Row():
329
  with gr.Column(scale=1):
330
  video = gr.Video(label="Upload video", format="mp4")
331
  language = gr.Dropdown(
332
  label="Language",
333
- choices=["Auto", "en", "nl"],
334
  value="Auto",
335
  )
336
  duration_label = gr.Dropdown(
@@ -362,7 +367,7 @@ with gr.Blocks(title="Video → Transcript → Script") as demo:
362
 
363
  btn_transcribe.click(
364
  fn=ui_transcribe,
365
- inputs=[video, language, status],
366
  outputs=[transcript, status],
367
  )
368
 
 
1
  # app.py
2
+ # Hugging Face Spaces Gradio app: upload video -> transcribe (Whisper large-v3-turbo via HF API) -> script (Qwen3 via HF chat completion)
3
+ #
4
+ # Notes:
5
+ # - Put HF_TOKEN in Space Secrets.
6
+ # - Needs ffmpeg + ffprobe available in the Space runtime.
7
 
8
  import os
9
  import re
 
11
  import hashlib
12
  import tempfile
13
  import subprocess
14
+ import traceback
15
  from dataclasses import dataclass
16
+ from typing import Tuple, Dict
17
 
18
  import gradio as gr
19
  from huggingface_hub import InferenceClient
 
21
  # -----------------------------
22
  # Config
23
  # -----------------------------
24
+ HF_TOKEN = os.getenv("HF_TOKEN") # Space Secrets
 
25
 
26
+ ASR_MODEL_ID = os.getenv("ASR_MODEL_ID", "openai/whisper-large-v3-turbo")
27
+ LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Qwen/Qwen3-0.6B") # override if you want a different Qwen3
 
28
 
29
  MAX_VIDEO_SECONDS = 10 * 60 # 10 minutes
30
  CACHE_DIR = os.getenv("CACHE_DIR", "/tmp/hf_gradio_cache")
 
31
  os.makedirs(CACHE_DIR, exist_ok=True)
32
 
 
33
  # -----------------------------
34
+ # Hardcoded examples in system prompt (replace with yours)
35
+ # Keep examples short for small LLMs.
 
36
  # -----------------------------
37
  SYSTEM_PROMPT = """You are a scriptwriter. You transform a video transcript into a polished script.
38
 
 
41
  - If something is unclear in the transcript, stay neutral or mark it as [unclear].
42
  - Match the style from the examples.
43
  - Keep the script within the requested duration.
44
+ - Always write the final script in the requested output language.
45
 
46
  STYLE EXAMPLES (hardcoded):
47
 
 
49
  TRANSCRIPT:
50
  "we launched a new feature today. it helps users summarize long articles faster."
51
  SCRIPT:
52
+ Title: New feature drop
53
+ Hook: Big update today.
54
+ Body: We just launched a feature that turns long reads into quick, clear summaries. Drop in an article, get the key points in seconds.
55
+ Closing: If you’ve been drowning in tabs, this one’s for you.
56
 
57
  Example 2
58
  TRANSCRIPT:
59
  "the storm caused delays across the region. officials said repairs will take two days."
60
  SCRIPT:
61
+ Title: Storm delays
62
+ Hook: Here’s what’s happening.
63
+ Body: A storm has disrupted travel across the region. Officials say repairs could take around two days, so delays may continue.
64
+ Closing: If you’re heading out, check updates before you go.
65
 
66
+ Output format (always):
67
  Title:
68
  Hook:
69
  Body:
70
  Closing:
71
  """
72
 
 
73
  # -----------------------------
74
  # Helpers
75
  # -----------------------------
 
87
 
88
 
89
  def get_video_duration_seconds(video_path: str) -> float:
 
90
  cmd = [
91
  "ffprobe", "-v", "error",
 
92
  "-show_entries", "format=duration",
93
  "-of", "json",
94
  video_path,
 
97
  if code != 0:
98
  raise RuntimeError(f"ffprobe failed: {err.strip() or out.strip()}")
99
  data = json.loads(out)
100
+ return float(data["format"]["duration"])
 
101
 
102
 
103
  def extract_audio_wav_16k_mono(video_path: str, wav_path: str) -> None:
 
104
  cmd = [
105
  "ffmpeg", "-y",
106
  "-i", video_path,
 
112
  ]
113
  code, out, err = _run(cmd)
114
  if code != 0:
115
+ raise RuntimeError(f"ffmpeg failed: {err.strip() or out.strip()}")
116
+
117
+
118
def clean_text(s: str) -> str:
    """Normalize whitespace: collapse runs to single spaces and trim the ends.

    Treats None as an empty string so callers can pass optional ASR/LLM output.
    """
    normalized = re.sub(r"\s+", " ", s or "")
    return normalized.strip()
121
 
122
 
123
def seconds_from_label(label: str) -> int:
    """Translate a UI duration label ("30s", "45s", "60s", "90s", "2m") to seconds.

    Unknown labels fall back to 60 seconds.
    """
    durations = {
        "30s": 30,
        "45s": 45,
        "60s": 60,
        "90s": 90,
        "2m": 120,
    }
    return durations.get(label, 60)
126
 
127
 
128
def estimate_words_for_seconds(seconds: int) -> int:
    """Return a word budget for a voiceover of the given length.

    Assumes roughly 150 words per minute (2.5 words/second), with a floor of
    40 words so very short clips still get a usable script.
    """
    estimated = int(seconds * 2.5)
    return estimated if estimated > 40 else 40
131
 
132
 
133
def language_name(code: str) -> str:
    """Expand a dropdown language code into wording the LLM prompt can use.

    Unknown codes (including "Auto") tell the model to mirror the transcript.
    """
    known = {"en": "English", "fr": "French", "nl": "Dutch"}
    return known.get(code, "Match transcript language")
 
135
 
136
 
137
@dataclass
class HFClients:
    """Bundle of Hugging Face Inference API clients used by the app."""

    # Client pinned to ASR_MODEL_ID, used for automatic_speech_recognition calls.
    asr: InferenceClient
    api: InferenceClient # generic client used for chat completion
141
 
142
 
143
  def make_clients() -> HFClients:
 
145
  raise RuntimeError("Missing HF_TOKEN. Add it in your Space Secrets.")
146
  return HFClients(
147
  asr=InferenceClient(model=ASR_MODEL_ID, token=HF_TOKEN),
148
+ api=InferenceClient(token=HF_TOKEN),
149
  )
150
 
151
 
152
def cache_paths(file_hash: str) -> Dict[str, str]:
    """Build the cache-file path(s), keyed by artifact name, for a video hash."""
    transcript_file = f"{file_hash}.transcript.txt"
    return {"transcript": os.path.join(CACHE_DIR, transcript_file)}
156
 
157
 
158
def llm_chat(clients: HFClients, system: str, user: str, max_tokens: int, temperature: float) -> str:
    """Run one system+user chat completion against LLM_MODEL_ID and return the reply text.

    Args:
        clients: HF client bundle; only the generic `api` client is used.
        system: System prompt content.
        user: User prompt content.
        max_tokens: Completion token cap.
        temperature: Sampling temperature (top_p is fixed at 0.9).
    """
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    response = clients.api.chat_completion(
        model=LLM_MODEL_ID,
        messages=conversation,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.9,
    )
    return response.choices[0].message.content
170
+
171
+
172
  def transcribe_video(video_path: str, language: str) -> str:
173
  clients = make_clients()
174
 
 
187
  wav_path = os.path.join(td, "audio.wav")
188
  extract_audio_wav_16k_mono(video_path, wav_path)
189
 
190
+ # Some ASR endpoints accept "language" param, some ignore it.
191
+ # We try it when set, and fall back without it if needed.
 
 
192
  if language != "Auto":
193
+ try:
194
+ result = clients.asr.automatic_speech_recognition(wav_path, language=language)
195
+ except TypeError:
196
+ result = clients.asr.automatic_speech_recognition(wav_path)
197
+ else:
198
+ result = clients.asr.automatic_speech_recognition(wav_path)
199
 
 
200
  text = result.get("text", "") if isinstance(result, dict) else str(result)
201
  text = clean_text(text)
202
 
 
210
 
211
 
212
  def make_user_prompt(
213
+ transcript_or_notes: str,
214
  language: str,
215
  duration_label: str,
216
  tone: str,
 
220
  target_words = estimate_words_for_seconds(seconds)
221
 
222
  return f"""Constraints:
223
+ - Output language: {language_name(language) if language != "Auto" else "Match transcript language"}
224
  - Target duration: ~{seconds} seconds
225
  - Target length: ~{target_words} words (keep it tight)
226
  - Tone: {tone}
227
  - Format: {fmt}
228
 
229
+ Source:
230
+ \"\"\"{transcript_or_notes}\"\"\"
231
  """
232
 
233
 
234
def notes_first_pass(clients: HFClients, transcript: str, language: str) -> str:
    """Compress a transcript into concise bullet notes with a cheap LLM pass.

    Used before script generation when the transcript is long, so the small
    LLM works from a tight factual summary instead of the raw text.

    Args:
        clients: HF client bundle (only the chat client is used here).
        transcript: Raw transcript text.
        language: UI language code, or "Auto" to mirror the transcript.

    Returns:
        Whitespace-normalized bullet notes from the model.
    """
    # Renamed from `sys`: that local shadowed the stdlib `sys` module name.
    system_msg = "You are an editor. Return concise bullet notes only."
    user = f"""Convert this transcript into concise bullet notes.

Rules:
- Keep only key facts mentioned.
- No inventions.
- 8 to 14 bullets max.
- Output language: {language_name(language) if language != "Auto" else "Match transcript language"}

Transcript:
\"\"\"{transcript}\"\"\"

Bullets:"""
    out = llm_chat(clients, system_msg, user, max_tokens=320, temperature=0.2)
    return clean_text(out)
250
 
251
 
 
263
  if not transcript:
264
  raise RuntimeError("Transcript is empty. Transcribe first or paste a transcript.")
265
 
266
+ # Notes-first helps small models on long inputs
267
  too_long = len(transcript) > 4500
268
  use_notes = force_notes_first or too_long
269
 
270
+ source = transcript
271
  if use_notes:
272
  notes = notes_first_pass(clients, transcript, language)
273
+ source = f"NOTES:\n{notes}"
 
 
274
 
275
+ user_prompt = make_user_prompt(source, language, duration_label, tone, fmt)
 
276
 
277
+ script = llm_chat(
278
+ clients,
279
+ system=SYSTEM_PROMPT,
280
+ user=user_prompt,
281
+ max_tokens=750,
282
  temperature=0.4,
 
 
283
  )
284
+ script = script.strip()
 
285
  if not script:
286
  raise RuntimeError("Script generation returned empty text.")
 
287
  return script
288
 
289
 
290
  # -----------------------------
291
  # Gradio callbacks
292
  # -----------------------------
293
def ui_transcribe(video_file, language):
    """Gradio callback: run transcription for the uploaded video.

    Returns (transcript_update, status_message); on failure the transcript
    box is left untouched and the status shows the error plus its traceback.
    """
    if video_file is None:
        return gr.update(), "Please upload a video first."
    try:
        text = transcribe_video(video_file, language)
    except Exception as e:
        tb = traceback.format_exc()
        return gr.update(), f"Transcription error: {repr(e)}\n\n{tb}"
    return text, "Done: transcript ready."
302
 
303
 
304
  def ui_generate(video_file, transcript, language, duration_label, tone, fmt, force_notes_first):
 
317
  )
318
  return transcript, script, "Done: script generated."
319
  except Exception as e:
320
+ tb = traceback.format_exc()
321
+ return transcript, gr.update(), f"Script error: {repr(e)}\n\n{tb}"
322
 
323
 
324
  # -----------------------------
325
  # UI
326
  # -----------------------------
327
  with gr.Blocks(title="Video → Transcript → Script") as demo:
328
+ gr.Markdown(
329
+ "## Video → Transcript → Script\n"
330
+ "Upload a video (max 10 min), transcribe with Whisper Turbo, then generate a script with Qwen3 via HF API."
331
+ )
332
 
333
  with gr.Row():
334
  with gr.Column(scale=1):
335
  video = gr.Video(label="Upload video", format="mp4")
336
  language = gr.Dropdown(
337
  label="Language",
338
+ choices=["Auto", "en", "fr", "nl"],
339
  value="Auto",
340
  )
341
  duration_label = gr.Dropdown(
 
367
 
368
  btn_transcribe.click(
369
  fn=ui_transcribe,
370
+ inputs=[video, language],
371
  outputs=[transcript, status],
372
  )
373