# script-writer / app.py
# Source: Hugging Face Space by fdaudens, commit ec5bec4 ("Update app.py").
# app.py
import os
import re
import json
import hashlib
import tempfile
import subprocess
import traceback
from dataclasses import dataclass
from typing import Tuple, Dict
import gradio as gr
from huggingface_hub import InferenceClient
# -----------------------------
# Config
# -----------------------------
# Hugging Face API token, injected via the Space's Secrets UI; required at runtime.
HF_TOKEN = os.getenv("HF_TOKEN") # Space Secrets
# Whisper model used for the automatic-speech-recognition client.
ASR_MODEL_ID = os.getenv("ASR_MODEL_ID", "openai/whisper-large-v3-turbo")
# IMPORTANT:
# Inference Providers (router.huggingface.co) often requires model + provider suffix:
# "model_id:provider"
# Examples that are listed as supported:
# - "Qwen/Qwen3-4B-Thinking-2507:nscale"
# - "meta-llama/Llama-3.2-1B-Instruct:novita"
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Qwen/Qwen3-4B-Thinking-2507:nscale")
# Hard cap on input video length; longer uploads are rejected before ASR runs.
MAX_VIDEO_SECONDS = 10 * 60 # 10 minutes
# On-disk transcript cache directory, keyed by the uploaded file's SHA-256.
CACHE_DIR = os.getenv("CACHE_DIR", "/tmp/hf_gradio_cache")
os.makedirs(CACHE_DIR, exist_ok=True)
# -----------------------------
# Hardcoded examples in system prompt (replace with yours)
# -----------------------------
# System prompt for the script-generation call: grounding rules, two few-shot
# style examples, and the mandatory output skeleton (Title/Hook/Body/Closing).
SYSTEM_PROMPT = """You are a scriptwriter. You transform a video transcript into a polished script.
Rules:
- Use ONLY facts present in the transcript. Do not invent names, dates, numbers, places.
- If something is unclear in the transcript, stay neutral or mark it as [unclear].
- Match the style from the examples.
- Keep the script within the requested duration.
- Always write the final script in the requested output language.
STYLE EXAMPLES (hardcoded):
Example 1
TRANSCRIPT:
"we launched a new feature today. it helps users summarize long articles faster."
SCRIPT:
Title: New feature drop
Hook: Big update today.
Body: We just launched a feature that turns long reads into quick, clear summaries. Drop in an article, get the key points in seconds.
Closing: If you’ve been drowning in tabs, this one’s for you.
Example 2
TRANSCRIPT:
"the storm caused delays across the region. officials said repairs will take two days."
SCRIPT:
Title: Storm delays
Hook: Here’s what’s happening.
Body: A storm has disrupted travel across the region. Officials say repairs could take around two days, so delays may continue.
Closing: If you’re heading out, check updates before you go.
Output format (always):
Title:
Hook:
Body:
Closing:
"""
# -----------------------------
# Helpers
# -----------------------------
def _run(cmd: list) -> Tuple[int, str, str]:
p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
return p.returncode, p.stdout, p.stderr
def sha256_file(path: str) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    Reads in 1 MiB chunks so arbitrarily large videos never have to fit
    in memory at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        while chunk := fh.read(1024 * 1024):
            digest.update(chunk)
    return digest.hexdigest()
def get_video_duration_seconds(video_path: str) -> float:
    """Return the container duration of *video_path* in seconds, via ffprobe.

    Args:
        video_path: local path to the media file.

    Returns:
        Duration in seconds as a float.

    Raises:
        RuntimeError: if ffprobe exits non-zero, or if it reports no
            duration (e.g. stream-only or truncated containers) — the
            original code raised an opaque KeyError in that case.
    """
    cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "json",
        video_path,
    ]
    code, out, err = _run(cmd)
    if code != 0:
        raise RuntimeError(f"ffprobe failed: {err.strip() or out.strip()}")
    data = json.loads(out)
    # ffprobe can omit "duration" for some inputs; fail with a clear message
    # instead of KeyError/TypeError bubbling up from the dict access.
    duration = data.get("format", {}).get("duration")
    if duration is None:
        raise RuntimeError(f"ffprobe returned no duration for: {video_path}")
    return float(duration)
def extract_audio_wav_16k_mono(video_path: str, wav_path: str) -> None:
    """Extract the audio track of *video_path* into *wav_path* as 16 kHz mono WAV.

    Raises:
        RuntimeError: if ffmpeg exits non-zero.
    """
    ffmpeg_args = [
        "ffmpeg", "-y",     # overwrite the output file without prompting
        "-i", video_path,
        "-vn",              # drop the video stream entirely
        "-ac", "1",         # downmix to mono
        "-ar", "16000",     # resample to 16 kHz
        "-f", "wav",
        wav_path,
    ]
    returncode, stdout, stderr = _run(ffmpeg_args)
    if returncode != 0:
        raise RuntimeError(f"ffmpeg failed: {stderr.strip() or stdout.strip()}")
def clean_text(s: str) -> str:
    """Collapse every run of whitespace in *s* to a single space and trim.

    None-safe: a falsy input (None, "") yields "".
    """
    collapsed = re.sub(r"\s+", " ", s or "")
    return collapsed.strip()
def seconds_from_label(label: str) -> int:
    """Translate a UI duration label ("30s" … "2m") into seconds.

    Unknown labels fall back to 60 seconds.
    """
    try:
        return {"30s": 30, "45s": 45, "60s": 60, "90s": 90, "2m": 120}[label]
    except KeyError:
        return 60
def estimate_words_for_seconds(seconds: int) -> int:
    """Rough spoken-word budget for a script: ~2.5 words per second,
    floored at 40 words so very short durations still get a usable script."""
    budget = int(seconds * 2.5)
    return budget if budget > 40 else 40
def language_name(code: str) -> str:
    """Map a UI language code to the human-readable name used in prompts.

    Unrecognized codes (including "Auto") yield the neutral instruction
    "Match transcript language".
    """
    if code == "en":
        return "English"
    if code == "fr":
        return "French"
    if code == "nl":
        return "Dutch"
    return "Match transcript language"
@dataclass
class HFClients:
    """Container for the two Hugging Face inference clients the app uses."""

    asr: InferenceClient  # client pinned to ASR_MODEL_ID (Whisper) for transcription
    api: InferenceClient  # model-agnostic router client used for chat completion
def make_clients() -> HFClients:
    """Construct the ASR + chat client pair from the configured token.

    Raises:
        RuntimeError: when the HF_TOKEN Space secret is not set.
    """
    if HF_TOKEN:
        return HFClients(
            asr=InferenceClient(model=ASR_MODEL_ID, token=HF_TOKEN),
            api=InferenceClient(token=HF_TOKEN),  # router client
        )
    raise RuntimeError("Missing HF_TOKEN. Add it in your Space Secrets.")
def cache_paths(file_hash: str) -> Dict[str, str]:
    """Return the cache-file paths (keyed by artifact name) for *file_hash*."""
    transcript_path = os.path.join(CACHE_DIR, f"{file_hash}.transcript.txt")
    return {"transcript": transcript_path}
def llm_chat(clients: HFClients, system: str, user: str, max_tokens: int, temperature: float) -> str:
    """Send a single system+user exchange to the configured LLM and return
    the assistant's reply text."""
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    response = clients.api.chat_completion(
        model=LLM_MODEL_ID,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.9,  # fixed nucleus-sampling cap
    )
    return response.choices[0].message.content
def transcribe_video(video_path: str, language: str) -> str:
    """Transcribe the audio track of a video with the hosted Whisper model.

    Enforces the MAX_VIDEO_SECONDS cap, caches transcripts on disk keyed by
    the file's SHA-256 (repeat uploads skip the ASR call), and extracts a
    16 kHz mono WAV into a temp dir before sending it to the ASR endpoint.

    Args:
        video_path: local path to the uploaded video file.
        language: UI language choice ("Auto" or a code like "en").

    Returns:
        The whitespace-normalized transcript text.

    Raises:
        RuntimeError: if the video is too long, ffprobe/ffmpeg fail, the
            HF token is missing, or the ASR service returns empty text.
    """
    clients = make_clients()
    dur = get_video_duration_seconds(video_path)
    if dur > MAX_VIDEO_SECONDS:
        raise RuntimeError(f"Video is {int(dur)}s. Max allowed is {MAX_VIDEO_SECONDS}s (10 minutes).")
    file_hash = sha256_file(video_path)
    paths = cache_paths(file_hash)
    # Cache hit: reuse the transcript previously stored for this exact file.
    if os.path.exists(paths["transcript"]):
        with open(paths["transcript"], "r", encoding="utf-8") as f:
            return f.read()
    with tempfile.TemporaryDirectory() as td:
        wav_path = os.path.join(td, "audio.wav")
        extract_audio_wav_16k_mono(video_path, wav_path)
        if language != "Auto":
            # Some client/endpoint versions reject a `language` kwarg;
            # fall back to auto-detection when they do.
            try:
                result = clients.asr.automatic_speech_recognition(wav_path, language=language)
            except TypeError:
                result = clients.asr.automatic_speech_recognition(wav_path)
        else:
            result = clients.asr.automatic_speech_recognition(wav_path)
    # The client may return a dict ({"text": ...}) or an object; normalize to str.
    text = result.get("text", "") if isinstance(result, dict) else str(result)
    text = clean_text(text)
    if not text:
        raise RuntimeError("Transcription returned empty text.")
    # Persist for next time this exact file is uploaded.
    with open(paths["transcript"], "w", encoding="utf-8") as f:
        f.write(text)
    return text
def make_user_prompt(transcript_or_notes: str, language: str, duration_label: str, tone: str, fmt: str) -> str:
    """Assemble the user-turn prompt carrying the length/tone/format
    constraints plus the source text (raw transcript or bullet notes)."""
    seconds = seconds_from_label(duration_label)
    target_words = estimate_words_for_seconds(seconds)
    if language != "Auto":
        output_language = language_name(language)
    else:
        output_language = "Match transcript language"
    return f"""Constraints:
- Output language: {output_language}
- Target duration: ~{seconds} seconds
- Target length: ~{target_words} words
- Tone: {tone}
- Format: {fmt}
Source:
\"\"\"{transcript_or_notes}\"\"\"
"""
def notes_first_pass(clients: HFClients, transcript: str, language: str) -> str:
    """Condense a long transcript into bullet notes with a low-temperature
    LLM pass, so the final generation prompt stays compact."""
    if language != "Auto":
        notes_language = language_name(language)
    else:
        notes_language = "Match transcript language"
    system_msg = "You are an editor. Return concise bullet notes only."
    user_msg = f"""Convert this transcript into concise bullet notes.
Rules:
- Keep only key facts mentioned.
- No inventions.
- 8 to 14 bullets max.
- Output language: {notes_language}
Transcript:
\"\"\"{transcript}\"\"\"
Bullets:"""
    notes = llm_chat(clients, system_msg, user_msg, max_tokens=320, temperature=0.2)
    return clean_text(notes)
def generate_script(transcript: str, language: str, duration_label: str, tone: str, fmt: str, force_notes_first: bool) -> str:
    """Turn a transcript into a formatted script via the chat LLM.

    Transcripts longer than 4500 characters (or when *force_notes_first* is
    set) are first distilled into bullet notes to keep the prompt small.

    Raises:
        RuntimeError: if the transcript is empty or the model returns nothing.
    """
    clients = make_clients()
    transcript = clean_text(transcript)
    if not transcript:
        raise RuntimeError("Transcript is empty. Transcribe first or paste a transcript.")
    use_notes = force_notes_first or len(transcript) > 4500
    if use_notes:
        source = f"NOTES:\n{notes_first_pass(clients, transcript, language)}"
    else:
        source = transcript
    user_prompt = make_user_prompt(source, language, duration_label, tone, fmt)
    script = llm_chat(clients, SYSTEM_PROMPT, user_prompt, max_tokens=750, temperature=0.4).strip()
    if not script:
        raise RuntimeError("Script generation returned empty text.")
    return script
# -----------------------------
# Gradio callbacks
# -----------------------------
def ui_transcribe(video_file, language):
    """Gradio callback for the Transcribe button.

    Returns (transcript_update, status_message); on failure the transcript
    box is left unchanged and the traceback is surfaced in the status box.
    """
    if video_file is None:
        return gr.update(), "Please upload a video first."
    try:
        text = transcribe_video(video_file, language)
    except Exception as e:
        details = traceback.format_exc()
        return gr.update(), f"Transcription error: {repr(e)}\n\n{details}"
    return text, "Done: transcript ready."
def ui_generate(video_file, transcript, language, duration_label, tone, fmt, force_notes_first):
    """Gradio callback for the Generate button.

    Transcribes first if the transcript box is empty and a video is present,
    then generates the script. Returns (transcript, script_update, status);
    on failure the script box is left unchanged.
    """
    try:
        missing_transcript = not transcript or not transcript.strip()
        if missing_transcript and video_file is not None:
            transcript = transcribe_video(video_file, language)
        script = generate_script(transcript, language, duration_label, tone, fmt, force_notes_first)
    except Exception as e:
        details = traceback.format_exc()
        return transcript, gr.update(), f"Script error: {repr(e)}\n\n{details}"
    return transcript, script, "Done: script generated."
# -----------------------------
# UI
# -----------------------------
# Build the two-column UI: controls on the left, editable transcript/script
# on the right. (NOTE(review): original indentation was lost; this nesting —
# button row and wiring inside the Blocks context — matches standard Gradio
# app.py structure. Confirm against the deployed Space.)
with gr.Blocks(title="Video → Transcript → Script") as demo:
    gr.Markdown(
        "## Video → Transcript → Script\n"
        "Upload a video (max 10 min), transcribe with Whisper Turbo, then generate a script with an Inference Providers model."
    )
    with gr.Row():
        with gr.Column(scale=1):
            # Input controls
            video = gr.Video(label="Upload video", format="mp4")
            language = gr.Dropdown(label="Language", choices=["Auto", "en", "fr", "nl"], value="Auto")
            duration_label = gr.Dropdown(label="Script length", choices=["30s", "45s", "60s", "90s", "2m"], value="60s")
            tone = gr.Dropdown(label="Tone", choices=["neutral", "punchy", "calm", "playful"], value="neutral")
            fmt = gr.Dropdown(label="Format", choices=["voiceover", "anchor", "social short"], value="voiceover")
            force_notes_first = gr.Checkbox(label="Notes-first (better for long transcripts)", value=False)
            with gr.Row():
                btn_transcribe = gr.Button("Transcribe")
                btn_generate = gr.Button("Generate script")
            status = gr.Textbox(label="Status", value="Ready.", interactive=False)
        with gr.Column(scale=2):
            # Both outputs are editable so users can fix ASR mistakes before generating.
            transcript = gr.Textbox(label="Transcript (editable)", lines=10)
            script = gr.Textbox(label="Script (editable)", lines=14)
    # Wire buttons to callbacks; ui_generate also receives the video so it can
    # auto-transcribe when the transcript box is still empty.
    btn_transcribe.click(fn=ui_transcribe, inputs=[video, language], outputs=[transcript, status])
    btn_generate.click(fn=ui_generate, inputs=[video, transcript, language, duration_label, tone, fmt, force_notes_first], outputs=[transcript, script, status])

if __name__ == "__main__":
    demo.launch()