# app.py
"""Gradio Space: video -> transcript -> script.

Upload a video (max 10 min), extract mono 16 kHz audio with ffmpeg, transcribe
it with a Whisper model via the Hugging Face Inference API, then turn the
transcript into a short, duration-constrained script with an Inference
Providers chat model. Transcripts are cached on disk by (file hash, language).
"""

import hashlib
import json
import os
import re
import subprocess
import tempfile
import traceback
from dataclasses import dataclass
from typing import Dict, Tuple

import gradio as gr
from huggingface_hub import InferenceClient

# -----------------------------
# Config
# -----------------------------
HF_TOKEN = os.getenv("HF_TOKEN")  # Space Secrets
ASR_MODEL_ID = os.getenv("ASR_MODEL_ID", "openai/whisper-large-v3-turbo")

# IMPORTANT:
# Inference Providers (router.huggingface.co) often requires model + provider suffix:
#   "model_id:provider"
# Examples that are listed as supported:
#   - "Qwen/Qwen3-4B-Thinking-2507:nscale"
#   - "meta-llama/Llama-3.2-1B-Instruct:novita"
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Qwen/Qwen3-4B-Thinking-2507:nscale")

MAX_VIDEO_SECONDS = 10 * 60  # 10 minutes
CACHE_DIR = os.getenv("CACHE_DIR", "/tmp/hf_gradio_cache")
os.makedirs(CACHE_DIR, exist_ok=True)

# -----------------------------
# Hardcoded examples in system prompt (replace with yours)
# -----------------------------
SYSTEM_PROMPT = """You are a scriptwriter. You transform a video transcript into a polished script.

Rules:
- Use ONLY facts present in the transcript. Do not invent names, dates, numbers, places.
- If something is unclear in the transcript, stay neutral or mark it as [unclear].
- Match the style from the examples.
- Keep the script within the requested duration.
- Always write the final script in the requested output language.

STYLE EXAMPLES (hardcoded):

Example 1
TRANSCRIPT:
"we launched a new feature today. it helps users summarize long articles faster."
SCRIPT:
Title: New feature drop
Hook: Big update today.
Body: We just launched a feature that turns long reads into quick, clear summaries. Drop in an article, get the key points in seconds.
Closing: If you’ve been drowning in tabs, this one’s for you.

Example 2
TRANSCRIPT:
"the storm caused delays across the region. officials said repairs will take two days."
SCRIPT:
Title: Storm delays
Hook: Here’s what’s happening.
Body: A storm has disrupted travel across the region. Officials say repairs could take around two days, so delays may continue.
Closing: If you’re heading out, check updates before you go.

Output format (always):
Title:
Hook:
Body:
Closing:
"""


# -----------------------------
# Helpers
# -----------------------------
def _run(cmd: list) -> Tuple[int, str, str]:
    """Run *cmd* without a shell; return (returncode, stdout, stderr)."""
    p = subprocess.run(cmd, capture_output=True, text=True)
    return p.returncode, p.stdout, p.stderr


def sha256_file(path: str) -> str:
    """Return the hex SHA-256 of the file at *path*, read in 1 MiB chunks."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            h.update(chunk)
    return h.hexdigest()


def get_video_duration_seconds(video_path: str) -> float:
    """Probe *video_path* with ffprobe and return its duration in seconds.

    Raises:
        RuntimeError: if ffprobe exits non-zero.
    """
    cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "json",
        video_path,
    ]
    code, out, err = _run(cmd)
    if code != 0:
        raise RuntimeError(f"ffprobe failed: {err.strip() or out.strip()}")
    data = json.loads(out)
    return float(data["format"]["duration"])


def extract_audio_wav_16k_mono(video_path: str, wav_path: str) -> None:
    """Extract the audio track of *video_path* to 16 kHz mono WAV at *wav_path*.

    Whisper expects 16 kHz mono input, so we down-mix/resample here.

    Raises:
        RuntimeError: if ffmpeg exits non-zero.
    """
    cmd = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-vn",           # drop the video stream
        "-ac", "1",      # mono
        "-ar", "16000",  # 16 kHz
        "-f", "wav",
        wav_path,
    ]
    code, out, err = _run(cmd)
    if code != 0:
        raise RuntimeError(f"ffmpeg failed: {err.strip() or out.strip()}")


def clean_text(s: str) -> str:
    """Collapse all whitespace runs to single spaces and strip the ends."""
    return re.sub(r"\s+", " ", (s or "")).strip()


def seconds_from_label(label: str) -> int:
    """Map a UI duration label (e.g. '45s', '2m') to seconds; default 60."""
    mapping = {"30s": 30, "45s": 45, "60s": 60, "90s": 90, "2m": 120}
    return mapping.get(label, 60)


def estimate_words_for_seconds(seconds: int) -> int:
    """Rough word budget for a spoken duration (~2.5 words/s, min 40)."""
    return max(40, int(seconds * 2.5))


def language_name(code: str) -> str:
    """Expand a UI language code to a name for the prompt."""
    return {"en": "English", "fr": "French", "nl": "Dutch"}.get(
        code, "Match transcript language"
    )


@dataclass
class HFClients:
    """Pair of Inference clients: one pinned to the ASR model, one router."""
    asr: InferenceClient
    api: InferenceClient


def make_clients() -> HFClients:
    """Build the HF clients; raises RuntimeError if HF_TOKEN is not set."""
    if not HF_TOKEN:
        raise RuntimeError("Missing HF_TOKEN. Add it in your Space Secrets.")
    return HFClients(
        asr=InferenceClient(model=ASR_MODEL_ID, token=HF_TOKEN),
        api=InferenceClient(token=HF_TOKEN),  # router client
    )


def cache_paths(cache_key: str) -> Dict[str, str]:
    """Return the on-disk cache paths for a given cache key."""
    return {"transcript": os.path.join(CACHE_DIR, f"{cache_key}.transcript.txt")}


def llm_chat(
    clients: HFClients, system: str, user: str, max_tokens: int, temperature: float
) -> str:
    """Single-turn chat completion against LLM_MODEL_ID; returns the reply text.

    "Thinking" models (e.g. Qwen3-*-Thinking) prepend their reasoning inside a
    <think>...</think> block; only the text after the closing tag is the answer,
    so everything up to and including `</think>` is stripped.
    """
    resp = clients.api.chat_completion(
        model=LLM_MODEL_ID,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.9,
    )
    # content may be None on some providers; normalize to "".
    content = resp.choices[0].message.content or ""
    if "</think>" in content:
        # Keep only the final answer after the reasoning block.
        content = content.rsplit("</think>", 1)[-1]
    return content.strip()


def transcribe_video(video_path: str, language: str) -> str:
    """Transcribe *video_path* with the ASR model, caching by (file, language).

    Args:
        video_path: path to the uploaded video file.
        language: UI language selection ("Auto" or an ISO code like "en").

    Returns:
        The whitespace-normalized transcript text.

    Raises:
        RuntimeError: if the video exceeds MAX_VIDEO_SECONDS, extraction fails,
            or the transcription comes back empty.
    """
    clients = make_clients()

    dur = get_video_duration_seconds(video_path)
    if dur > MAX_VIDEO_SECONDS:
        raise RuntimeError(
            f"Video is {int(dur)}s. Max allowed is {MAX_VIDEO_SECONDS}s (10 minutes)."
        )

    # Include the language in the cache key: the same file transcribed with a
    # different language selection must not reuse the old transcript.
    file_hash = sha256_file(video_path)
    paths = cache_paths(f"{file_hash}.{language}")
    if os.path.exists(paths["transcript"]):
        with open(paths["transcript"], "r", encoding="utf-8") as f:
            return f.read()

    with tempfile.TemporaryDirectory() as td:
        wav_path = os.path.join(td, "audio.wav")
        extract_audio_wav_16k_mono(video_path, wav_path)

        if language != "Auto":
            try:
                result = clients.asr.automatic_speech_recognition(
                    wav_path, language=language
                )
            except TypeError:
                # Older client versions don't accept `language`; fall back.
                result = clients.asr.automatic_speech_recognition(wav_path)
        else:
            result = clients.asr.automatic_speech_recognition(wav_path)

    text = result.get("text", "") if isinstance(result, dict) else str(result)
    text = clean_text(text)
    if not text:
        raise RuntimeError("Transcription returned empty text.")

    with open(paths["transcript"], "w", encoding="utf-8") as f:
        f.write(text)
    return text


def make_user_prompt(
    transcript_or_notes: str, language: str, duration_label: str, tone: str, fmt: str
) -> str:
    """Assemble the user message: constraints block plus the quoted source."""
    seconds = seconds_from_label(duration_label)
    target_words = estimate_words_for_seconds(seconds)
    out_lang = (
        language_name(language) if language != "Auto" else "Match transcript language"
    )
    return f"""Constraints:
- Output language: {out_lang}
- Target duration: ~{seconds} seconds
- Target length: ~{target_words} words
- Tone: {tone}
- Format: {fmt}

Source:
\"\"\"{transcript_or_notes}\"\"\"
"""


def notes_first_pass(clients: HFClients, transcript: str, language: str) -> str:
    """Compress a long transcript into bullet notes (first pass for long input)."""
    sys = "You are an editor. Return concise bullet notes only."
    out_lang = (
        language_name(language) if language != "Auto" else "Match transcript language"
    )
    user = f"""Convert this transcript into concise bullet notes.
Rules:
- Keep only key facts mentioned.
- No inventions.
- 8 to 14 bullets max.
- Output language: {out_lang}

Transcript:
\"\"\"{transcript}\"\"\"

Bullets:"""
    out = llm_chat(clients, sys, user, max_tokens=320, temperature=0.2)
    return clean_text(out)


def generate_script(
    transcript: str,
    language: str,
    duration_label: str,
    tone: str,
    fmt: str,
    force_notes_first: bool,
) -> str:
    """Turn a transcript into a script, optionally via a notes-first pass.

    Long transcripts (> 4500 chars) are always routed through the notes pass
    to keep the final prompt small.

    Raises:
        RuntimeError: if the transcript is empty or generation returns nothing.
    """
    clients = make_clients()
    transcript = clean_text(transcript)
    if not transcript:
        raise RuntimeError("Transcript is empty. Transcribe first or paste a transcript.")

    too_long = len(transcript) > 4500
    use_notes = force_notes_first or too_long

    source = transcript
    if use_notes:
        notes = notes_first_pass(clients, transcript, language)
        source = f"NOTES:\n{notes}"

    user_prompt = make_user_prompt(source, language, duration_label, tone, fmt)
    script = llm_chat(
        clients, SYSTEM_PROMPT, user_prompt, max_tokens=750, temperature=0.4
    ).strip()
    if not script:
        raise RuntimeError("Script generation returned empty text.")
    return script


# -----------------------------
# Gradio callbacks
# -----------------------------
def ui_transcribe(video_file, language):
    """Transcribe button: returns (transcript, status)."""
    if video_file is None:
        return gr.update(), "Please upload a video first."
    try:
        transcript = transcribe_video(video_file, language)
        return transcript, "Done: transcript ready."
    except Exception as e:
        tb = traceback.format_exc()
        return gr.update(), f"Transcription error: {repr(e)}\n\n{tb}"


def ui_generate(video_file, transcript, language, duration_label, tone, fmt, force_notes_first):
    """Generate button: transcribes first if needed; returns (transcript, script, status)."""
    try:
        if (not transcript or not transcript.strip()) and video_file is not None:
            transcript = transcribe_video(video_file, language)
        script = generate_script(
            transcript, language, duration_label, tone, fmt, force_notes_first
        )
        return transcript, script, "Done: script generated."
    except Exception as e:
        tb = traceback.format_exc()
        return transcript, gr.update(), f"Script error: {repr(e)}\n\n{tb}"


# -----------------------------
# UI
# -----------------------------
with gr.Blocks(title="Video → Transcript → Script") as demo:
    gr.Markdown(
        "## Video → Transcript → Script\n"
        "Upload a video (max 10 min), transcribe with Whisper Turbo, then generate a script with an Inference Providers model."
    )
    with gr.Row():
        with gr.Column(scale=1):
            video = gr.Video(label="Upload video", format="mp4")
            language = gr.Dropdown(label="Language", choices=["Auto", "en", "fr", "nl"], value="Auto")
            duration_label = gr.Dropdown(label="Script length", choices=["30s", "45s", "60s", "90s", "2m"], value="60s")
            tone = gr.Dropdown(label="Tone", choices=["neutral", "punchy", "calm", "playful"], value="neutral")
            fmt = gr.Dropdown(label="Format", choices=["voiceover", "anchor", "social short"], value="voiceover")
            force_notes_first = gr.Checkbox(label="Notes-first (better for long transcripts)", value=False)
            with gr.Row():
                btn_transcribe = gr.Button("Transcribe")
                btn_generate = gr.Button("Generate script")
            status = gr.Textbox(label="Status", value="Ready.", interactive=False)
        with gr.Column(scale=2):
            transcript = gr.Textbox(label="Transcript (editable)", lines=10)
            script = gr.Textbox(label="Script (editable)", lines=14)

    btn_transcribe.click(
        fn=ui_transcribe,
        inputs=[video, language],
        outputs=[transcript, status],
    )
    btn_generate.click(
        fn=ui_generate,
        inputs=[video, transcript, language, duration_label, tone, fmt, force_notes_first],
        outputs=[transcript, script, status],
    )

if __name__ == "__main__":
    demo.launch()