Spaces:

fdaudens
/

script-writer

Running

File size: 10,804 Bytes

# app.py

import os
import re
import json
import hashlib
import tempfile
import subprocess
import traceback
from dataclasses import dataclass
from typing import Tuple, Dict

import gradio as gr
from huggingface_hub import InferenceClient

# -----------------------------
# Config
# -----------------------------
# Hugging Face access token, read from the environment (set it in Space Secrets).
HF_TOKEN = os.environ.get("HF_TOKEN")

# Speech-recognition model; override with the ASR_MODEL_ID environment variable.
ASR_MODEL_ID = os.environ.get("ASR_MODEL_ID", "openai/whisper-large-v3-turbo")

# IMPORTANT:
# Inference Providers (router.huggingface.co) often require a provider suffix
# on the model id, in the form "model_id:provider".
# Examples that are listed as supported:
# - "Qwen/Qwen3-4B-Thinking-2507:nscale"
# - "meta-llama/Llama-3.2-1B-Instruct:novita"
LLM_MODEL_ID = os.environ.get("LLM_MODEL_ID", "Qwen/Qwen3-4B-Thinking-2507:nscale")

# Longest accepted video, in seconds (10 minutes).
MAX_VIDEO_SECONDS = 600

# Directory holding cached transcripts; created at import time if missing.
CACHE_DIR = os.environ.get("CACHE_DIR", "/tmp/hf_gradio_cache")
os.makedirs(CACHE_DIR, exist_ok=True)

# -----------------------------
# Hardcoded examples in system prompt (replace with yours)
# -----------------------------
# System message sent with the final script-generation request. It fixes the
# writing rules, embeds two transcript -> script style examples, and pins the
# Title / Hook / Body / Closing output layout. Edit the examples to change the
# house style; the text is sent as-is, so every character here matters.
SYSTEM_PROMPT = """You are a scriptwriter. You transform a video transcript into a polished script.

Rules:
- Use ONLY facts present in the transcript. Do not invent names, dates, numbers, places.
- If something is unclear in the transcript, stay neutral or mark it as [unclear].
- Match the style from the examples.
- Keep the script within the requested duration.
- Always write the final script in the requested output language.

STYLE EXAMPLES (hardcoded):

Example 1
TRANSCRIPT:
"we launched a new feature today. it helps users summarize long articles faster."
SCRIPT:
Title: New feature drop
Hook: Big update today.
Body: We just launched a feature that turns long reads into quick, clear summaries. Drop in an article, get the key points in seconds.
Closing: If you’ve been drowning in tabs, this one’s for you.

Example 2
TRANSCRIPT:
"the storm caused delays across the region. officials said repairs will take two days."
SCRIPT:
Title: Storm delays
Hook: Here’s what’s happening.
Body: A storm has disrupted travel across the region. Officials say repairs could take around two days, so delays may continue.
Closing: If you’re heading out, check updates before you go.

Output format (always):
Title:
Hook:
Body:
Closing:
"""

# -----------------------------
# Helpers
# -----------------------------
def _run(cmd: list) -> Tuple[int, str, str]:
    """Run *cmd* to completion and return ``(returncode, stdout, stderr)``.

    Both output streams are captured and decoded as text. A non-zero exit
    status is reported through the return code rather than raised.
    """
    completed = subprocess.run(cmd, capture_output=True, text=True)
    return completed.returncode, completed.stdout, completed.stderr


def sha256_file(path: str) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in 1 MiB pieces, so it never has to fit in memory at once.
    """
    piece_size = 1024 * 1024
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            piece = stream.read(piece_size)
            if not piece:
                break
            digest.update(piece)
    return digest.hexdigest()


def get_video_duration_seconds(video_path: str) -> float:
    """Return the duration of *video_path* in seconds, as reported by ffprobe.

    Args:
        video_path: path of the media file to inspect.

    Returns:
        The ``format.duration`` value from ffprobe's JSON output, as a float.

    Raises:
        RuntimeError: if ffprobe exits with a non-zero status, or if its
            output does not contain a usable numeric duration.
    """
    cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "json",
        video_path,
    ]
    code, out, err = _run(cmd)
    if code != 0:
        raise RuntimeError(f"ffprobe failed: {err.strip() or out.strip()}")
    try:
        return float(json.loads(out)["format"]["duration"])
    except (KeyError, TypeError, ValueError) as exc:
        # A zero exit status does not guarantee a parseable duration: the JSON
        # may be malformed, lack the field, or hold a non-numeric value.
        # Report that as the same error type as the failure path above.
        raise RuntimeError(
            f"ffprobe did not report a usable duration for {video_path!r}."
        ) from exc


def extract_audio_wav_16k_mono(video_path: str, wav_path: str) -> None:
    """Write the audio of *video_path* to *wav_path* as 16 kHz mono WAV.

    Raises:
        RuntimeError: carrying ffmpeg's output when ffmpeg exits non-zero.
    """
    # "-y" overwrites an existing output file; "-vn" drops the video stream.
    input_args = ["-y", "-i", video_path]
    audio_args = ["-vn", "-ac", "1", "-ar", "16000", "-f", "wav"]
    status, stdout, stderr = _run(["ffmpeg", *input_args, *audio_args, wav_path])
    if status == 0:
        return
    raise RuntimeError(f"ffmpeg failed: {stderr.strip() or stdout.strip()}")


def clean_text(s: str) -> str:
    """Collapse every whitespace run in *s* to one space and trim both ends.

    A falsy value (``None`` or ``""``) yields an empty string.
    """
    raw = s if s else ""
    collapsed = re.sub(r"\s+", " ", raw)
    return collapsed.strip()


def seconds_from_label(label: str) -> int:
    """Translate a script-length label such as ``"45s"`` or ``"2m"`` to seconds.

    Labels that are not recognised fall back to 60 seconds.
    """
    known = {"30s": 30, "45s": 45, "60s": 60, "90s": 90, "2m": 120}
    if label in known:
        return known[label]
    return 60


def estimate_words_for_seconds(seconds: int) -> int:
    """Estimate a word budget for *seconds* of speech at 2.5 words per second.

    The result is truncated to an integer and never drops below 40 words.
    """
    floor = 40
    budget = int(seconds * 2.5)
    return budget if budget > floor else floor


def language_name(code: str) -> str:
    """Return the English name for a supported language code.

    Any other value yields the instruction ``"Match transcript language"``.
    """
    names = {"en": "English", "fr": "French", "nl": "Dutch"}
    try:
        return names[code]
    except KeyError:
        return "Match transcript language"


@dataclass
class HFClients:
    """Bundle of the two Hugging Face inference clients used by this app."""

    # Client used for speech recognition (transcription) calls.
    asr: InferenceClient
    # Client used for chat-completion calls; the model is passed per request.
    api: InferenceClient


def make_clients() -> HFClients:
    """Build the ASR and chat inference clients from the configured token.

    Raises:
        RuntimeError: if HF_TOKEN is unset or empty.
    """
    token = HF_TOKEN
    if not token:
        raise RuntimeError("Missing HF_TOKEN. Add it in your Space Secrets.")

    speech_client = InferenceClient(model=ASR_MODEL_ID, token=token)
    # No model is bound here: the chat model is supplied on each request.
    router_client = InferenceClient(token=token)
    return HFClients(asr=speech_client, api=router_client)


def cache_paths(file_hash: str) -> Dict[str, str]:
    """Return the cache file locations in CACHE_DIR for *file_hash*.

    The only entry is ``"transcript"``, the path of the cached transcript.
    """
    transcript_name = f"{file_hash}.transcript.txt"
    transcript_path = os.path.join(CACHE_DIR, transcript_name)
    return {"transcript": transcript_path}


def llm_chat(clients: HFClients, system: str, user: str, max_tokens: int, temperature: float) -> str:
    """Send one system + user exchange to the chat model and return its reply.

    Args:
        clients: client bundle; ``clients.api`` performs the request.
        system: system message content.
        user: user message content.
        max_tokens: generation cap passed to the API.
        temperature: sampling temperature passed to the API.

    Returns:
        The content of the first choice, or ``""`` when the reply carries no
        content, so callers always receive a string.
    """
    resp = clients.api.chat_completion(
        model=LLM_MODEL_ID,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.9,
    )
    content = resp.choices[0].message.content
    # The message content is optional in the response schema. Normalise a
    # missing value so callers that call .strip() on the result do not crash.
    return content if content is not None else ""


def transcribe_video(video_path: str, language: str) -> str:
    """Transcribe the audio of *video_path*, caching the transcript on disk.

    Args:
        video_path: path of the video file to transcribe.
        language: ``"Auto"`` to send no language hint, otherwise a language
            code forwarded to the ASR call when the client accepts it.

    Returns:
        The whitespace-normalised transcript text.

    Raises:
        RuntimeError: if the video exceeds MAX_VIDEO_SECONDS, if a helper
            (token check, ffprobe, ffmpeg) fails, or if the transcription
            comes back empty.
    """
    clients = make_clients()

    dur = get_video_duration_seconds(video_path)
    if dur > MAX_VIDEO_SECONDS:
        raise RuntimeError(f"Video is {int(dur)}s. Max allowed is {MAX_VIDEO_SECONDS}s (10 minutes).")

    # Key the cache on file content AND requested language. Otherwise a second
    # request for the same video in another language would silently return the
    # transcript produced for the first one. The language is sanitised because
    # it becomes part of a file name.
    lang_tag = re.sub(r"[^A-Za-z0-9_-]", "_", str(language))
    file_hash = sha256_file(video_path)
    paths = cache_paths(f"{file_hash}.{lang_tag}")

    if os.path.exists(paths["transcript"]):
        with open(paths["transcript"], "r", encoding="utf-8") as f:
            return f.read()

    with tempfile.TemporaryDirectory() as td:
        wav_path = os.path.join(td, "audio.wav")
        extract_audio_wav_16k_mono(video_path, wav_path)

        if language != "Auto":
            try:
                result = clients.asr.automatic_speech_recognition(wav_path, language=language)
            except TypeError:
                # Best effort: the client rejected the `language` keyword, so
                # retry without a language hint.
                result = clients.asr.automatic_speech_recognition(wav_path)
        else:
            result = clients.asr.automatic_speech_recognition(wav_path)

        if isinstance(result, dict):
            raw_text = result.get("text", "")
        else:
            # Covers result objects that expose `.text` without being a dict;
            # str() on those would return their repr, not the transcript.
            raw_text = getattr(result, "text", None)
            if raw_text is None:
                raw_text = str(result)
        text = clean_text(raw_text)

    if not text:
        raise RuntimeError("Transcription returned empty text.")

    with open(paths["transcript"], "w", encoding="utf-8") as f:
        f.write(text)

    return text


def make_user_prompt(transcript_or_notes: str, language: str, duration_label: str, tone: str, fmt: str) -> str:
    """Assemble the user message for the script-generation request.

    The message lists the constraints (output language, target duration and
    word count, tone, format) followed by the source text in triple quotes.
    """
    seconds = seconds_from_label(duration_label)
    target_words = estimate_words_for_seconds(seconds)

    if language == "Auto":
        output_language = "Match transcript language"
    else:
        output_language = language_name(language)

    prompt_lines = [
        "Constraints:",
        f"- Output language: {output_language}",
        f"- Target duration: ~{seconds} seconds",
        f"- Target length: ~{target_words} words",
        f"- Tone: {tone}",
        f"- Format: {fmt}",
        "",
        "Source:",
        f'"""{transcript_or_notes}"""',
        "",  # keeps the trailing newline after the closing quotes
    ]
    return "\n".join(prompt_lines)


def notes_first_pass(clients: HFClients, transcript: str, language: str) -> str:
    """Ask the chat model to condense *transcript* into short bullet notes.

    The reply is passed through ``clean_text`` before being returned.
    """
    system_msg = "You are an editor. Return concise bullet notes only."

    if language == "Auto":
        output_language = "Match transcript language"
    else:
        output_language = language_name(language)

    request = "\n".join(
        [
            "Convert this transcript into concise bullet notes.",
            "",
            "Rules:",
            "- Keep only key facts mentioned.",
            "- No inventions.",
            "- 8 to 14 bullets max.",
            f"- Output language: {output_language}",
            "",
            "Transcript:",
            f'"""{transcript}"""',
            "",
            "Bullets:",
        ]
    )
    reply = llm_chat(clients, system_msg, request, max_tokens=320, temperature=0.2)
    return clean_text(reply)


def generate_script(transcript: str, language: str, duration_label: str, tone: str, fmt: str, force_notes_first: bool) -> str:
    """Generate a script from *transcript* with the chat model.

    Args:
        transcript: source transcript; whitespace is normalised before use.
        language: ``"Auto"`` or a language code for the output language.
        duration_label: script-length label (for example ``"60s"``).
        tone: tone keyword inserted into the prompt.
        fmt: format keyword inserted into the prompt.
        force_notes_first: when true, always condense the transcript to
            bullet notes before writing the script.

    Returns:
        The generated script with surrounding whitespace removed.

    Raises:
        RuntimeError: if the transcript is empty or the model returns no text.
    """
    clients = make_clients()

    transcript = clean_text(transcript)
    if not transcript:
        raise RuntimeError("Transcript is empty. Transcribe first or paste a transcript.")

    # Transcripts longer than 4500 characters always go through the notes pass.
    too_long = len(transcript) > 4500
    use_notes = force_notes_first or too_long

    source = transcript
    if use_notes:
        notes = notes_first_pass(clients, transcript, language)
        source = f"NOTES:\n{notes}"

    user_prompt = make_user_prompt(source, language, duration_label, tone, fmt)
    reply = llm_chat(clients, SYSTEM_PROMPT, user_prompt, max_tokens=750, temperature=0.4)
    # The reply can be None when the model returns no content. Guard it so the
    # explicit empty-output error below is raised instead of AttributeError.
    script = (reply or "").strip()

    if not script:
        raise RuntimeError("Script generation returned empty text.")
    return script


# -----------------------------
# Gradio callbacks
# -----------------------------
def ui_transcribe(video_file, language):
    """Gradio callback for the Transcribe button.

    Returns a ``(transcript, status)`` pair. When there is no video or the
    transcription fails, the transcript box is left untouched and the status
    carries the message (including the traceback on failure).
    """
    if video_file is None:
        return gr.update(), "Please upload a video first."

    try:
        text = transcribe_video(video_file, language)
    except Exception as exc:
        details = traceback.format_exc()
        return gr.update(), f"Transcription error: {exc!r}\n\n{details}"
    else:
        return text, "Done: transcript ready."


def ui_generate(video_file, transcript, language, duration_label, tone, fmt, force_notes_first):
    """Gradio callback for the Generate script button.

    If the transcript box is blank and a video is present, the video is
    transcribed first. Returns ``(transcript, script, status)``; on failure
    the script box is left untouched and the status carries the traceback.
    """
    try:
        transcript_missing = not (transcript and transcript.strip())
        if transcript_missing and video_file is not None:
            transcript = transcribe_video(video_file, language)

        result = generate_script(transcript, language, duration_label, tone, fmt, force_notes_first)
    except Exception as exc:
        details = traceback.format_exc()
        # Return the transcript as it stands, so a fresh transcription is kept
        # even when script generation fails afterwards.
        return transcript, gr.update(), f"Script error: {exc!r}\n\n{details}"

    return transcript, result, "Done: script generated."


# -----------------------------
# UI
# -----------------------------
# Page layout: a narrow left column with the inputs, action buttons and status
# line, and a wider right column with the editable transcript and script.
with gr.Blocks(title="Video → Transcript → Script") as demo:
    gr.Markdown(
        "## Video → Transcript → Script\n"
        "Upload a video (max 10 min), transcribe with Whisper Turbo, then generate a script with an Inference Providers model."
    )

    with gr.Row():
        # Left column: inputs and controls.
        with gr.Column(scale=1):
            video = gr.Video(label="Upload video", format="mp4")
            language = gr.Dropdown(label="Language", choices=["Auto", "en", "fr", "nl"], value="Auto")
            duration_label = gr.Dropdown(label="Script length", choices=["30s", "45s", "60s", "90s", "2m"], value="60s")
            tone = gr.Dropdown(label="Tone", choices=["neutral", "punchy", "calm", "playful"], value="neutral")
            fmt = gr.Dropdown(label="Format", choices=["voiceover", "anchor", "social short"], value="voiceover")
            force_notes_first = gr.Checkbox(label="Notes-first (better for long transcripts)", value=False)

            with gr.Row():
                btn_transcribe = gr.Button("Transcribe")
                btn_generate = gr.Button("Generate script")

            # Read-only line where the callbacks report progress and errors.
            status = gr.Textbox(label="Status", value="Ready.", interactive=False)

        # Right column: outputs, both editable by the user.
        with gr.Column(scale=2):
            transcript = gr.Textbox(label="Transcript (editable)", lines=10)
            script = gr.Textbox(label="Script (editable)", lines=14)

    # Wiring: the order of `inputs` must match each callback's parameters, and
    # the order of `outputs` must match the tuple each callback returns.
    btn_transcribe.click(fn=ui_transcribe, inputs=[video, language], outputs=[transcript, status])
    btn_generate.click(fn=ui_generate, inputs=[video, transcript, language, duration_label, tone, fmt, force_notes_first], outputs=[transcript, script, status])

# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()