Spaces:

fdaudens
/

script-writer

Runtime error

App Files Files Community

fdaudens committed on Feb 26

Commit

1c09a33

verified ·

1 Parent(s): ab40e34

Create app.py

Browse files

Files changed (1) hide show

app.py +376 -0

app.py ADDED Viewed

	@@ -0,0 +1,376 @@

+# app.py
+# Hugging Face Spaces Gradio app: upload video -> transcribe (Whisper large-v3-turbo) -> script (Qwen3 via HF API)
+import os
+import re
+import json
+import hashlib
+import tempfile
+import subprocess
+from dataclasses import dataclass
+from typing import Optional, Tuple, Dict
+import gradio as gr
+from huggingface_hub import InferenceClient
+# -----------------------------
+# Config
+# -----------------------------
+HF_TOKEN = os.getenv("HF_TOKEN")  # put this in Space Secrets
+ASR_MODEL_ID = os.getenv("ASR_MODEL_ID", "openai/whisper-large-v3-turbo")  # verified on HF
+# Note: HF has Qwen3 models like 0.6B / 1.7B / 4B etc. (not always a literal "1B").
+# The default below is the 0.6B model; set the LLM_MODEL_ID env var to override it (e.g. with 1.7B).
+LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Qwen/Qwen3-0.6B")
+MAX_VIDEO_SECONDS = 10 * 60  # 10 minutes
+CACHE_DIR = os.getenv("CACHE_DIR", "/tmp/hf_gradio_cache")  # cache file paths are built under this directory
+os.makedirs(CACHE_DIR, exist_ok=True)  # runs at import time so the directory exists before first use
+# -----------------------------
+# System prompt with hardcoded few-shot style examples.
+# Replace the two examples below with your real ones.
+# Keep them short: small Qwen models benefit from tight few-shot prompts.
+# -----------------------------
+SYSTEM_PROMPT = """You are a scriptwriter. You transform a video transcript into a polished script.
+Rules:
+- Use ONLY facts present in the transcript. Do not invent names, dates, numbers, places.
+- If something is unclear in the transcript, stay neutral or mark it as [unclear].
+- Match the style from the examples.
+- Keep the script within the requested duration.
+STYLE EXAMPLES (hardcoded):
+Example 1
+TRANSCRIPT:
+"we launched a new feature today. it helps users summarize long articles faster."
+SCRIPT:
+"Big update today: a new feature that turns long reads into quick, clear summaries.
+Here’s the idea: you drop in an article, and you get the key points in seconds.
+If you’ve been drowning in tabs, this one’s for you."
+Example 2
+TRANSCRIPT:
+"the storm caused delays across the region. officials said repairs will take two days."
+SCRIPT:
+"Here’s what’s happening: a storm has disrupted travel across the region.
+Officials say repairs could take around two days, so delays may continue.
+If you’re heading out, check updates before you go."
+Output format:
+Title:
+Hook:
+Body:
+Closing:
+"""
+# -----------------------------
+# Helpers
+# -----------------------------
+def _run(cmd: list) -> Tuple[int, str, str]:
+    """Run *cmd* (an argv list, no shell) and return ``(returncode, stdout, stderr)`` as text."""
+    completed = subprocess.run(
+        cmd,
+        capture_output=True,
+        text=True,
+    )
+    return completed.returncode, completed.stdout, completed.stderr
+def sha256_file(path: str) -> str:
+    """Return the hex SHA-256 digest of the file at *path*, reading it in 1 MiB chunks."""
+    digest = hashlib.sha256()
+    chunk_size = 1024 * 1024
+    with open(path, "rb") as stream:
+        while True:
+            chunk = stream.read(chunk_size)
+            if not chunk:
+                break
+            digest.update(chunk)
+    return digest.hexdigest()
+def get_video_duration_seconds(video_path: str) -> float:
+    """Return the duration of *video_path* in seconds, as reported by ffprobe.
+    Args:
+        video_path: Path to the media file to probe.
+    Returns:
+        The ``format.duration`` value from ffprobe's JSON output, as a float.
+    Raises:
+        RuntimeError: if ffprobe exits non-zero, or its output carries no usable duration.
+    """
+    cmd = [
+        "ffprobe", "-v", "error",
+        "-select_streams", "v:0",
+        "-show_entries", "format=duration",
+        "-of", "json",
+        video_path,
+    ]
+    code, out, err = _run(cmd)
+    if code != 0:
+        raise RuntimeError(f"ffprobe failed: {err.strip() or out.strip()}")
+    # A zero exit code does not guarantee a parsable duration (the key may be absent
+    # or non-numeric). Report that with the same exception type as the failure above
+    # rather than leaking a bare KeyError / ValueError to the caller.
+    try:
+        return float(json.loads(out)["format"]["duration"])
+    except (KeyError, TypeError, ValueError) as exc:
+        raise RuntimeError("ffprobe did not report a usable duration for this file.") from exc
+def extract_audio_wav_16k_mono(video_path: str, wav_path: str) -> None:
+    """Write the audio of *video_path* to *wav_path* as 16 kHz mono WAV using ffmpeg.
+    Raises RuntimeError carrying ffmpeg's stderr (or stdout) when the command fails.
+    """
+    # -y overwrites the output, -vn drops video, -ac 1 / -ar 16000 standardize audio for ASR.
+    ffmpeg_args = ["ffmpeg", "-y", "-i", video_path]
+    ffmpeg_args += ["-vn", "-ac", "1", "-ar", "16000", "-f", "wav", wav_path]
+    exit_status, stdout_text, stderr_text = _run(ffmpeg_args)
+    if exit_status == 0:
+        return
+    detail = stderr_text.strip() or stdout_text.strip()
+    raise RuntimeError(f"ffmpeg audio extraction failed: {detail}")
+def seconds_from_label(label: str) -> int:
+    """Translate a script-length label such as "45s" or "2m" into seconds (60 if unknown)."""
+    known_durations = {"30s": 30, "45s": 45, "60s": 60, "90s": 90, "2m": 120}
+    try:
+        return known_durations[label]
+    except KeyError:
+        # Unrecognized labels fall back to one minute.
+        return 60
+def estimate_words_for_seconds(seconds: int) -> int:
+    """Estimate a word budget for *seconds* of voice-over, never below 40 words."""
+    # Rough VO pacing: ~150 wpm => 2.5 words/sec
+    word_budget = int(seconds * 2.5)
+    return word_budget if word_budget > 40 else 40
+def clean_text(s: str) -> str:
+    """Collapse every run of whitespace in *s* to one space and trim both ends."""
+    collapsed = re.sub(r"\s+", " ", s)
+    return collapsed.strip()
+@dataclass
+class HFClients:
+    """Pair of Hugging Face inference clients used by the app."""
+    asr: InferenceClient  # client used for speech recognition (transcription)
+    llm: InferenceClient  # client used for text generation (notes and script)
+def make_clients() -> HFClients:
+    """Create the ASR and LLM inference clients from the module-level model IDs.
+    Raises RuntimeError when HF_TOKEN is not set.
+    """
+    if HF_TOKEN:
+        asr_client = InferenceClient(model=ASR_MODEL_ID, token=HF_TOKEN)
+        llm_client = InferenceClient(model=LLM_MODEL_ID, token=HF_TOKEN)
+        return HFClients(asr=asr_client, llm=llm_client)
+    raise RuntimeError("Missing HF_TOKEN. Add it in your Space Secrets.")
+def cache_paths(file_hash: str) -> Dict[str, str]:
+    """Map "transcript" and "script" to their cache file paths under CACHE_DIR for *file_hash*."""
+    return {
+        kind: os.path.join(CACHE_DIR, f"{file_hash}.{kind}.txt")
+        for kind in ("transcript", "script")
+    }
+def transcribe_video(video_path: str, language: str) -> str:
+    """Transcribe the audio track of *video_path* and return the cleaned transcript.
+    Transcripts are cached under CACHE_DIR, keyed by the file's SHA-256 and, when a
+    specific language is requested, by that language as well.
+    Args:
+        video_path: Path to the uploaded video file.
+        language: "Auto" for auto-detection, otherwise a language code such as "en".
+    Returns:
+        The whitespace-normalized transcript text.
+    Raises:
+        RuntimeError: if the video is longer than MAX_VIDEO_SECONDS, HF_TOKEN is
+            missing on a cache miss, audio extraction fails, or the transcript is empty.
+    """
+    dur = get_video_duration_seconds(video_path)
+    if dur > MAX_VIDEO_SECONDS:
+        raise RuntimeError(f"Video is {int(dur)}s. Max allowed is {MAX_VIDEO_SECONDS}s (10 minutes).")
+    file_hash = sha256_file(video_path)
+    # Key the cache on the requested language too; otherwise a transcript produced
+    # for one language choice is silently reused for every other choice on the same
+    # file. "Auto" keeps the bare hash so previously cached transcripts are still found.
+    cache_key = file_hash
+    if language != "Auto":
+        safe_language = re.sub(r"[^A-Za-z0-9_-]", "_", str(language))
+        cache_key = f"{file_hash}.{safe_language}"
+    paths = cache_paths(cache_key)
+    try:
+        with open(paths["transcript"], "r", encoding="utf-8") as f:
+            return f.read()
+    except FileNotFoundError:
+        pass  # cache miss: fall through and transcribe
+    # Clients (and therefore HF_TOKEN) are only needed on a cache miss.
+    clients = make_clients()
+    with tempfile.TemporaryDirectory() as td:
+        wav_path = os.path.join(td, "audio.wav")
+        extract_audio_wav_16k_mono(video_path, wav_path)
+        # HF Inference API ASR: automatic_speech_recognition
+        params = {}
+        if language != "Auto":
+            params["language"] = language  # e.g. "en", "fr"
+        try:
+            result = clients.asr.automatic_speech_recognition(wav_path, **params)
+        except TypeError:
+            if not params:
+                raise
+            # NOTE(review): the installed client rejected the `language` keyword
+            # (its documented signature does not list it - confirm against your
+            # huggingface_hub version). Retry with auto-detection instead of failing.
+            result = clients.asr.automatic_speech_recognition(wav_path)
+        # Accept an object exposing `.text`, a plain dict, or a bare string.
+        text = getattr(result, "text", None)
+        if text is None:
+            text = result.get("text", "") if isinstance(result, dict) else str(result)
+        text = clean_text(text or "")
+    if not text:
+        raise RuntimeError("Transcription returned empty text.")
+    with open(paths["transcript"], "w", encoding="utf-8") as f:
+        f.write(text)
+    return text
+def make_user_prompt(
+    transcript: str,
+    language: str,
+    duration_label: str,
+    tone: str,
+    fmt: str,
+) -> str:
+    """Build the user part of the script prompt: constraints followed by the quoted transcript."""
+    duration_s = seconds_from_label(duration_label)
+    word_budget = estimate_words_for_seconds(duration_s)
+    # "Auto" means the model should follow whatever language the transcript is in.
+    language_rule = "Match transcript language" if language == "Auto" else language
+    return f"""Constraints:
+- Language: {language_rule}
+- Target duration: ~{duration_s} seconds
+- Target length: ~{word_budget} words (keep it tight)
+- Tone: {tone}
+- Format: {fmt}
+Transcript:
+\"\"\"{transcript}\"\"\"
+"""
+def notes_first_pass(clients: HFClients, transcript: str, language: str) -> str:
+    """Compress a long transcript into concise bullet notes using the LLM client.
+    Args:
+        clients: Inference clients; only ``clients.llm`` is used.
+        transcript: Transcript text to condense.
+        language: "Auto" to match the transcript, otherwise the language to write in.
+    Returns:
+        The model's notes with whitespace normalized per line, one bullet per line.
+    """
+    # A cheap compression step for long transcripts
+    prompt = f"""You are an editor. Convert this transcript into concise bullet notes.
+Rules:
+- Keep only key facts mentioned.
+- No inventions.
+- 8 to 14 bullets max.
+- Language: {language if language != "Auto" else "Match transcript"}
+Transcript:
+\"\"\"{transcript}\"\"\"
+Bullets:"""
+    out = clients.llm.text_generation(
+        prompt,
+        max_new_tokens=300,
+        temperature=0.2,
+        return_full_text=False,
+    )
+    # Normalize whitespace line by line instead of over the whole reply: collapsing
+    # the newlines would merge every bullet into a single run-on line.
+    bullet_lines = (clean_text(line) for line in (out or "").splitlines())
+    return "\n".join(line for line in bullet_lines if line)
+def generate_script(
+    transcript: str,
+    language: str,
+    duration_label: str,
+    tone: str,
+    fmt: str,
+    force_notes_first: bool,
+) -> str:
+    """Generate a script from *transcript* with the LLM client.
+    Long transcripts (or any transcript when *force_notes_first* is set) are first
+    condensed into bullet notes, and the script is written from those notes.
+    Args:
+        transcript: Transcript text; None or blank is rejected.
+        language: "Auto" to match the transcript, otherwise the target language.
+        duration_label: Script-length label such as "60s".
+        tone: Requested tone, passed through to the prompt.
+        fmt: Requested format, passed through to the prompt.
+        force_notes_first: Always run the notes pass, regardless of length.
+    Returns:
+        The generated script, whitespace-normalized per line with line breaks kept.
+    Raises:
+        RuntimeError: if the transcript is empty, HF_TOKEN is missing, or the
+            model returns no text.
+    """
+    # Treat None like an empty transcript so the caller gets the message below
+    # rather than a TypeError from the whitespace regex.
+    transcript = clean_text(transcript or "")
+    if not transcript:
+        raise RuntimeError("Transcript is empty. Transcribe first or paste a transcript.")
+    clients = make_clients()
+    # Notes-first threshold: tweak as you like
+    too_long = len(transcript) > 4500
+    use_notes = force_notes_first or too_long
+    source_text = transcript
+    if use_notes:
+        notes = notes_first_pass(clients, transcript, language)
+        source_text = f"NOTES:\n{notes}"
+    user_prompt = make_user_prompt(source_text, language, duration_label, tone, fmt)
+    # Keep generation settings conservative for small models
+    full_prompt = f"{SYSTEM_PROMPT}\n\n{user_prompt}"
+    out = clients.llm.text_generation(
+        full_prompt,
+        max_new_tokens=700,
+        temperature=0.4,
+        top_p=0.9,
+        return_full_text=False,
+    )
+    # The prompt asks for a Title / Hook / Body / Closing layout; clean each line on
+    # its own so that layout survives instead of being flattened onto one line.
+    cleaned_lines = (clean_text(line) for line in (out or "").splitlines())
+    script = "\n".join(line for line in cleaned_lines if line)
+    if not script:
+        raise RuntimeError("Script generation returned empty text.")
+    return script
+# -----------------------------
+# Gradio callbacks
+# -----------------------------
+def ui_transcribe(video_file, language, status):
+    """Gradio callback for the Transcribe button.
+    Args:
+        video_file: Path of the uploaded video, or None when nothing is uploaded.
+        language: Selected language option.
+        status: Current status text. Accepted because the click wiring passes it;
+            it is not read.
+    Returns:
+        ``(transcript_update, status_message)``. The transcript box is left
+        untouched when there is no video or transcription fails.
+    """
+    if video_file is None:
+        return gr.update(), "Please upload a video first."
+    # This handler returns exactly once, so rebinding `status` to a progress message
+    # before the call would never be shown; only the final message is reported.
+    try:
+        transcript = transcribe_video(video_file, language)
+    except Exception as e:  # UI boundary: surface any failure in the status box
+        return gr.update(), f"Transcription error: {e}"
+    return transcript, "Done: transcript ready."
+def ui_generate(video_file, transcript, language, duration_label, tone, fmt, force_notes_first):
+    """Gradio callback for the Generate button; returns ``(transcript, script, status)``.
+    When the transcript box is blank and a video is present, the video is transcribed
+    first. On any failure the script box is left untouched and the error is reported.
+    """
+    try:
+        has_transcript = bool(transcript and transcript.strip())
+        if not has_transcript and video_file is not None:
+            # Nothing to work from yet: transcribe the uploaded video first.
+            transcript = transcribe_video(video_file, language)
+        generated = generate_script(
+            transcript=transcript,
+            language=language,
+            duration_label=duration_label,
+            tone=tone,
+            fmt=fmt,
+            force_notes_first=force_notes_first,
+        )
+    except Exception as exc:
+        return transcript, gr.update(), f"Script error: {exc}"
+    return transcript, generated, "Done: script generated."
+# -----------------------------
+# UI
+# -----------------------------
+# Layout: controls in a narrow left column, transcript and script boxes in a wider right column.
+with gr.Blocks(title="Video → Transcript → Script") as demo:
+    gr.Markdown("## Video → Transcript → Script\nUpload a video (max 10 min), transcribe with Whisper Turbo, then generate a script with Qwen3 via HF API.")
+    with gr.Row():
+        with gr.Column(scale=1):
+            video = gr.Video(label="Upload video", format="mp4")
+            # "Auto" is handled specially by the callbacks; other values are passed on as language codes.
+            language = gr.Dropdown(
+                label="Language",
+                choices=["Auto", "en", "fr"],
+                value="Auto",
+            )
+            # These labels must stay in sync with the mapping in seconds_from_label.
+            duration_label = gr.Dropdown(
+                label="Script length",
+                choices=["30s", "45s", "60s", "90s", "2m"],
+                value="60s",
+            )
+            # Tone and format are inserted verbatim into the user prompt.
+            tone = gr.Dropdown(
+                label="Tone",
+                choices=["neutral", "punchy", "calm", "playful"],
+                value="neutral",
+            )
+            fmt = gr.Dropdown(
+                label="Format",
+                choices=["voiceover", "anchor", "social short"],
+                value="voiceover",
+            )
+            force_notes_first = gr.Checkbox(label="Notes-first (better for long transcripts)", value=False)
+            with gr.Row():
+                btn_transcribe = gr.Button("Transcribe")
+                btn_generate = gr.Button("Generate script")
+            status = gr.Textbox(label="Status", value="Ready.", interactive=False)
+        with gr.Column(scale=2):
+            transcript = gr.Textbox(label="Transcript (editable)", lines=10)
+            script = gr.Textbox(label="Script (editable)", lines=14)
+    # Transcribe button: writes the transcript box and the status line.
+    btn_transcribe.click(
+        fn=ui_transcribe,
+        inputs=[video, language, status],
+        outputs=[transcript, status],
+    )
+    # Generate button: writes transcript (it may be auto-filled), script and status.
+    btn_generate.click(
+        fn=ui_generate,
+        inputs=[video, transcript, language, duration_label, tone, fmt, force_notes_first],
+        outputs=[transcript, script, status],
+    )
+# Launch the app only when this file is run as a script.
+if __name__ == "__main__":
+    demo.launch()