Spaces:
Running
Running
| # app.py | |
| import os | |
| import re | |
| import json | |
| import hashlib | |
| import tempfile | |
| import subprocess | |
| import traceback | |
| from dataclasses import dataclass | |
| from typing import Tuple, Dict | |
| import gradio as gr | |
| from huggingface_hub import InferenceClient | |
# -----------------------------
# Config
# -----------------------------
# Hugging Face API token, read from the Space's Secrets (never hardcode it).
HF_TOKEN = os.getenv("HF_TOKEN")  # Space Secrets

# ASR (speech-to-text) model used for transcription.
ASR_MODEL_ID = os.getenv("ASR_MODEL_ID", "openai/whisper-large-v3-turbo")

# IMPORTANT:
# Inference Providers (router.huggingface.co) often requires model + provider suffix:
#   "model_id:provider"
# Examples that are listed as supported:
#   - "Qwen/Qwen3-4B-Thinking-2507:nscale"
#   - "meta-llama/Llama-3.2-1B-Instruct:novita"
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Qwen/Qwen3-4B-Thinking-2507:nscale")

# Hard cap on accepted video length; longer uploads are rejected before ASR.
MAX_VIDEO_SECONDS = 10 * 60  # 10 minutes

# Directory for caching transcripts, keyed by the video file's SHA-256 hash.
CACHE_DIR = os.getenv("CACHE_DIR", "/tmp/hf_gradio_cache")
os.makedirs(CACHE_DIR, exist_ok=True)
# -----------------------------
# Hardcoded examples in system prompt (replace with yours)
# -----------------------------
# System prompt for the script-generation LLM call. The two few-shot style
# examples below are hardcoded; swap them out to change the output voice.
SYSTEM_PROMPT = """You are a scriptwriter. You transform a video transcript into a polished script.
Rules:
- Use ONLY facts present in the transcript. Do not invent names, dates, numbers, places.
- If something is unclear in the transcript, stay neutral or mark it as [unclear].
- Match the style from the examples.
- Keep the script within the requested duration.
- Always write the final script in the requested output language.
STYLE EXAMPLES (hardcoded):
Example 1
TRANSCRIPT:
"we launched a new feature today. it helps users summarize long articles faster."
SCRIPT:
Title: New feature drop
Hook: Big update today.
Body: We just launched a feature that turns long reads into quick, clear summaries. Drop in an article, get the key points in seconds.
Closing: If you’ve been drowning in tabs, this one’s for you.
Example 2
TRANSCRIPT:
"the storm caused delays across the region. officials said repairs will take two days."
SCRIPT:
Title: Storm delays
Hook: Here’s what’s happening.
Body: A storm has disrupted travel across the region. Officials say repairs could take around two days, so delays may continue.
Closing: If you’re heading out, check updates before you go.
Output format (always):
Title:
Hook:
Body:
Closing:
"""
| # ----------------------------- | |
| # Helpers | |
| # ----------------------------- | |
| def _run(cmd: list) -> Tuple[int, str, str]: | |
| p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) | |
| return p.returncode, p.stdout, p.stderr | |
def sha256_file(path: str) -> str:
    """Return the hex SHA-256 digest of the file at *path*, streamed in 1 MiB chunks."""
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        while True:
            chunk = fh.read(1 << 20)  # 1 MiB per read keeps memory flat for large videos
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()
def get_video_duration_seconds(video_path: str) -> float:
    """Probe *video_path* with ffprobe and return its duration in seconds.

    Raises RuntimeError when ffprobe exits non-zero.
    """
    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "json",
        video_path,
    ]
    code, out, err = _run(probe_cmd)
    if code != 0:
        raise RuntimeError(f"ffprobe failed: {err.strip() or out.strip()}")
    info = json.loads(out)
    return float(info["format"]["duration"])
def extract_audio_wav_16k_mono(video_path: str, wav_path: str) -> None:
    """Extract the audio track of *video_path* to *wav_path* as 16 kHz mono WAV.

    Raises RuntimeError when ffmpeg exits non-zero.
    """
    command = [
        "ffmpeg",
        "-y",            # overwrite the output file without prompting
        "-i", video_path,
        "-vn",           # drop the video stream
        "-ac", "1",      # single (mono) channel
        "-ar", "16000",  # 16 kHz sample rate
        "-f", "wav",
        wav_path,
    ]
    code, out, err = _run(command)
    if code != 0:
        raise RuntimeError(f"ffmpeg failed: {err.strip() or out.strip()}")
def clean_text(s: str) -> str:
    """Collapse every whitespace run in *s* to a single space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", s or "")
    return collapsed.strip()
def seconds_from_label(label: str) -> int:
    """Translate a duration dropdown label into seconds; unknown labels default to 60."""
    durations = {
        "30s": 30,
        "45s": 45,
        "60s": 60,
        "90s": 90,
        "2m": 120,
    }
    return durations.get(label, 60)
def estimate_words_for_seconds(seconds: int) -> int:
    """Estimate the spoken word count for a duration (~2.5 words/second, floored at 40)."""
    estimate = int(seconds * 2.5)
    return estimate if estimate > 40 else 40
def language_name(code: str) -> str:
    """Map a UI language code to its display name; anything else means 'match transcript'."""
    names = {"en": "English", "fr": "French", "nl": "Dutch"}
    return names.get(code, "Match transcript language")
@dataclass
class HFClients:
    """Container for the two Hugging Face inference clients the app uses.

    FIX: the original class declared the field annotations but was missing the
    @dataclass decorator, so ``HFClients(asr=..., api=...)`` in make_clients()
    raised ``TypeError: HFClients() takes no arguments``. The decorator
    generates the keyword __init__ that make_clients() relies on.
    """

    # Client pinned to ASR_MODEL_ID; used for automatic_speech_recognition().
    asr: InferenceClient
    # Unpinned router client; used for chat_completion() with an explicit model.
    api: InferenceClient
def make_clients() -> HFClients:
    """Build the ASR client and the router client pair.

    Raises RuntimeError when HF_TOKEN is not configured in the Space secrets.
    """
    if not HF_TOKEN:
        raise RuntimeError("Missing HF_TOKEN. Add it in your Space Secrets.")
    asr_client = InferenceClient(model=ASR_MODEL_ID, token=HF_TOKEN)
    router_client = InferenceClient(token=HF_TOKEN)  # router client
    return HFClients(asr=asr_client, api=router_client)
def cache_paths(file_hash: str) -> Dict[str, str]:
    """Return the cache file locations for a video identified by its SHA-256 hash."""
    transcript_path = os.path.join(CACHE_DIR, f"{file_hash}.transcript.txt")
    return {"transcript": transcript_path}
def llm_chat(clients: HFClients, system: str, user: str, max_tokens: int, temperature: float) -> str:
    """Send one system+user exchange to the router LLM and return the reply text."""
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    response = clients.api.chat_completion(
        model=LLM_MODEL_ID,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.9,
    )
    return response.choices[0].message.content
def transcribe_video(video_path: str, language: str) -> str:
    """Transcribe *video_path* with the ASR model, caching results by file hash.

    Videos longer than MAX_VIDEO_SECONDS are rejected. The cleaned transcript
    is written under CACHE_DIR keyed by the video's SHA-256, so re-uploading
    the same file skips the expensive ASR call.

    Raises RuntimeError for over-long videos or an empty transcription.
    """
    clients = make_clients()

    duration = get_video_duration_seconds(video_path)
    if duration > MAX_VIDEO_SECONDS:
        raise RuntimeError(f"Video is {int(duration)}s. Max allowed is {MAX_VIDEO_SECONDS}s (10 minutes).")

    file_hash = sha256_file(video_path)
    paths = cache_paths(file_hash)

    # Cache hit: return the previously saved transcript verbatim.
    if os.path.exists(paths["transcript"]):
        with open(paths["transcript"], "r", encoding="utf-8") as f:
            return f.read()

    with tempfile.TemporaryDirectory() as workdir:
        wav_path = os.path.join(workdir, "audio.wav")
        extract_audio_wav_16k_mono(video_path, wav_path)

        if language == "Auto":
            result = clients.asr.automatic_speech_recognition(wav_path)
        else:
            # Some client/provider combinations reject a language kwarg with
            # TypeError; fall back to auto-detection in that case.
            try:
                result = clients.asr.automatic_speech_recognition(wav_path, language=language)
            except TypeError:
                result = clients.asr.automatic_speech_recognition(wav_path)

        text = result.get("text", "") if isinstance(result, dict) else str(result)
        text = clean_text(text)
        if not text:
            raise RuntimeError("Transcription returned empty text.")

        with open(paths["transcript"], "w", encoding="utf-8") as f:
            f.write(text)
        return text
def make_user_prompt(transcript_or_notes: str, language: str, duration_label: str, tone: str, fmt: str) -> str:
    """Build the user message for the script LLM: constraint list plus quoted source."""
    seconds = seconds_from_label(duration_label)
    target_words = estimate_words_for_seconds(seconds)
    output_language = "Match transcript language" if language == "Auto" else language_name(language)
    return (
        "Constraints:\n"
        f"- Output language: {output_language}\n"
        f"- Target duration: ~{seconds} seconds\n"
        f"- Target length: ~{target_words} words\n"
        f"- Tone: {tone}\n"
        f"- Format: {fmt}\n"
        "Source:\n"
        f'"""{transcript_or_notes}"""\n'
    )
def notes_first_pass(clients: HFClients, transcript: str, language: str) -> str:
    """Condense a transcript into concise bullet notes via a first, cheap LLM pass."""
    system_msg = "You are an editor. Return concise bullet notes only."
    output_language = "Match transcript language" if language == "Auto" else language_name(language)
    user_msg = f"""Convert this transcript into concise bullet notes.
Rules:
- Keep only key facts mentioned.
- No inventions.
- 8 to 14 bullets max.
- Output language: {output_language}
Transcript:
\"\"\"{transcript}\"\"\"
Bullets:"""
    # Low temperature keeps the notes factual; clean_text flattens the bullets
    # into one line for embedding in the next prompt.
    bullets = llm_chat(clients, system_msg, user_msg, max_tokens=320, temperature=0.2)
    return clean_text(bullets)
def generate_script(transcript: str, language: str, duration_label: str, tone: str, fmt: str, force_notes_first: bool) -> str:
    """Turn a transcript into a formatted script via the LLM.

    Long transcripts (over 4500 characters), or any call with
    force_notes_first set, are first condensed to bullet notes before the
    script prompt is built.

    Raises RuntimeError on an empty transcript or an empty LLM reply.
    """
    clients = make_clients()

    transcript = clean_text(transcript)
    if not transcript:
        raise RuntimeError("Transcript is empty. Transcribe first or paste a transcript.")

    use_notes = force_notes_first or len(transcript) > 4500
    if use_notes:
        bullet_notes = notes_first_pass(clients, transcript, language)
        source = f"NOTES:\n{bullet_notes}"
    else:
        source = transcript

    user_prompt = make_user_prompt(source, language, duration_label, tone, fmt)
    script = llm_chat(clients, SYSTEM_PROMPT, user_prompt, max_tokens=750, temperature=0.4).strip()
    if not script:
        raise RuntimeError("Script generation returned empty text.")
    return script
| # ----------------------------- | |
| # Gradio callbacks | |
| # ----------------------------- | |
def ui_transcribe(video_file, language):
    """Gradio callback for the Transcribe button: returns (transcript, status)."""
    if video_file is None:
        return gr.update(), "Please upload a video first."
    try:
        text = transcribe_video(video_file, language)
    except Exception as e:
        # Surface the full traceback in the status box so Space users can debug.
        details = traceback.format_exc()
        return gr.update(), f"Transcription error: {repr(e)}\n\n{details}"
    return text, "Done: transcript ready."
def ui_generate(video_file, transcript, language, duration_label, tone, fmt, force_notes_first):
    """Gradio callback for the Generate button: returns (transcript, script, status)."""
    try:
        # Auto-transcribe first when the transcript box is empty but a video exists.
        if video_file is not None and not (transcript or "").strip():
            transcript = transcribe_video(video_file, language)
        script = generate_script(transcript, language, duration_label, tone, fmt, force_notes_first)
    except Exception as e:
        details = traceback.format_exc()
        return transcript, gr.update(), f"Script error: {repr(e)}\n\n{details}"
    return transcript, script, "Done: script generated."
# -----------------------------
# UI
# -----------------------------
# Layout: one row with a controls column (left) and an editable-output
# column (right); button wiring happens at the Blocks level below.
with gr.Blocks(title="Video → Transcript → Script") as demo:
    gr.Markdown(
        "## Video → Transcript → Script\n"
        "Upload a video (max 10 min), transcribe with Whisper Turbo, then generate a script with an Inference Providers model."
    )
    with gr.Row():
        # Left column: video input and generation controls.
        with gr.Column(scale=1):
            video = gr.Video(label="Upload video", format="mp4")
            language = gr.Dropdown(label="Language", choices=["Auto", "en", "fr", "nl"], value="Auto")
            duration_label = gr.Dropdown(label="Script length", choices=["30s", "45s", "60s", "90s", "2m"], value="60s")
            tone = gr.Dropdown(label="Tone", choices=["neutral", "punchy", "calm", "playful"], value="neutral")
            fmt = gr.Dropdown(label="Format", choices=["voiceover", "anchor", "social short"], value="voiceover")
            force_notes_first = gr.Checkbox(label="Notes-first (better for long transcripts)", value=False)
            with gr.Row():
                btn_transcribe = gr.Button("Transcribe")
                btn_generate = gr.Button("Generate script")
            status = gr.Textbox(label="Status", value="Ready.", interactive=False)
        # Right column: both outputs stay editable so users can tweak before regenerating.
        with gr.Column(scale=2):
            transcript = gr.Textbox(label="Transcript (editable)", lines=10)
            script = gr.Textbox(label="Script (editable)", lines=14)

    # Wire the buttons to their callbacks.
    btn_transcribe.click(fn=ui_transcribe, inputs=[video, language], outputs=[transcript, status])
    btn_generate.click(fn=ui_generate, inputs=[video, transcript, language, duration_label, tone, fmt, force_notes_first], outputs=[transcript, script, status])

if __name__ == "__main__":
    demo.launch()