Spaces:
Running
Running
| # app.py | |
| import os | |
| import re | |
| import json | |
| import hashlib | |
| import tempfile | |
| import subprocess | |
| import traceback | |
| from dataclasses import dataclass | |
| from typing import Tuple, Dict | |
| import gradio as gr | |
| from huggingface_hub import InferenceClient | |
# -----------------------------
# Config
# -----------------------------
# Hugging Face API token, read from the Space's Secrets (never hardcode it).
HF_TOKEN = os.getenv("HF_TOKEN")  # Space Secrets

# ASR (speech-to-text) model used for transcription.
ASR_MODEL_ID = os.getenv("ASR_MODEL_ID", "openai/whisper-large-v3-turbo")

# IMPORTANT:
# Inference Providers (router.huggingface.co) often requires model + provider suffix:
#   "model_id:provider"
# Examples that are listed as supported:
#   - "Qwen/Qwen3-4B-Thinking-2507:nscale"
#   - "meta-llama/Llama-3.2-1B-Instruct:novita"
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Qwen/Qwen3-4B-Thinking-2507:nscale")

# Hard cap on accepted video length; longer uploads are rejected before ASR.
MAX_VIDEO_SECONDS = 10 * 60  # 10 minutes

# Directory for caching transcripts, keyed by the video file's SHA-256 hash.
CACHE_DIR = os.getenv("CACHE_DIR", "/tmp/hf_gradio_cache")
os.makedirs(CACHE_DIR, exist_ok=True)
# -----------------------------
# Hardcoded examples in system prompt (replace with yours)
# -----------------------------
# System prompt for the script-generation LLM call. The two few-shot style
# examples below are hardcoded; swap them out to change the output voice.
SYSTEM_PROMPT = """You are a scriptwriter. You transform a video transcript into a polished script.
Rules:
- Use ONLY facts present in the transcript. Do not invent names, dates, numbers, places.
- If something is unclear in the transcript, stay neutral or mark it as [unclear].
- Match the style from the examples.
- Keep the script within the requested duration.
- Always write the final script in the requested output language.
STYLE EXAMPLES (hardcoded):
Example 1
TRANSCRIPT:
"we launched a new feature today. it helps users summarize long articles faster."
SCRIPT:
Title: New feature drop
Hook: Big update today.
Body: We just launched a feature that turns long reads into quick, clear summaries. Drop in an article, get the key points in seconds.
Closing: If you’ve been drowning in tabs, this one’s for you.
Example 2
TRANSCRIPT:
"the storm caused delays across the region. officials said repairs will take two days."
SCRIPT:
Title: Storm delays
Hook: Here’s what’s happening.
Body: A storm has disrupted travel across the region. Officials say repairs could take around two days, so delays may continue.
Closing: If you’re heading out, check updates before you go.
Output format (always):
Title:
Hook:
Body:
Closing:
"""
| # ----------------------------- | |
| # Helpers | |
| # ----------------------------- | |
| def _run(cmd: list) -> Tuple[int, str, str]: | |
| p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) | |
| return p.returncode, p.stdout, p.stderr | |
def sha256_file(path: str) -> str:
    """Return the hex SHA-256 digest of the file at *path*, streamed in 1 MiB chunks."""
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        while True:
            chunk = fh.read(1 << 20)  # 1 MiB per read keeps memory flat for large videos
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()
def get_video_duration_seconds(video_path: str) -> float:
    """Probe *video_path* with ffprobe and return its duration in seconds.

    Raises RuntimeError when ffprobe exits non-zero.
    """
    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "json",
        video_path,
    ]
    code, out, err = _run(probe_cmd)
    if code != 0:
        raise RuntimeError(f"ffprobe failed: {err.strip() or out.strip()}")
    info = json.loads(out)
    return float(info["format"]["duration"])
def extract_audio_wav_16k_mono(video_path: str, wav_path: str) -> None:
    """Extract the audio track of *video_path* to *wav_path* as 16 kHz mono WAV.

    Raises RuntimeError when ffmpeg exits non-zero.
    """
    command = [
        "ffmpeg",
        "-y",            # overwrite the output file without prompting
        "-i", video_path,
        "-vn",           # drop the video stream
        "-ac", "1",      # single (mono) channel
        "-ar", "16000",  # 16 kHz sample rate
        "-f", "wav",
        wav_path,
    ]
    code, out, err = _run(command)
    if code != 0:
        raise RuntimeError(f"ffmpeg failed: {err.strip() or out.strip()}")
def clean_text(s: str) -> str:
    """Collapse every whitespace run in *s* to a single space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", s or "")
    return collapsed.strip()
def seconds_from_label(label: str) -> int:
    """Translate a duration dropdown label into seconds; unknown labels default to 60."""
    durations = {
        "30s": 30,
        "45s": 45,
        "60s": 60,
        "90s": 90,
        "2m": 120,
    }
    return durations.get(label, 60)
def estimate_words_for_seconds(seconds: int) -> int:
    """Estimate the spoken word count for a duration (~2.5 words/second, floored at 40)."""
    estimate = int(seconds * 2.5)
    return estimate if estimate > 40 else 40
def language_name(code: str) -> str:
    """Map a UI language code to its display name; anything else means 'match transcript'."""
    names = {"en": "English", "fr": "French", "nl": "Dutch"}
    return names.get(code, "Match transcript language")
@dataclass
class HFClients:
    """Container for the two Hugging Face inference clients the app uses.

    FIX: the original class declared the field annotations but was missing the
    @dataclass decorator, so ``HFClients(asr=..., api=...)`` in make_clients()
    raised ``TypeError: HFClients() takes no arguments``. The decorator
    generates the keyword __init__ that make_clients() relies on.
    """

    # Client pinned to ASR_MODEL_ID; used for automatic_speech_recognition().
    asr: InferenceClient
    # Unpinned router client; used for chat_completion() with an explicit model.
    api: InferenceClient
def make_clients() -> HFClients:
    """Build the ASR client and the router client pair.

    Raises RuntimeError when HF_TOKEN is not configured in the Space secrets.
    """
    if not HF_TOKEN:
        raise RuntimeError("Missing HF_TOKEN. Add it in your Space Secrets.")
    asr_client = InferenceClient(model=ASR_MODEL_ID, token=HF_TOKEN)
    router_client = InferenceClient(token=HF_TOKEN)  # router client
    return HFClients(asr=asr_client, api=router_client)
def cache_paths(file_hash: str) -> Dict[str, str]:
    """Return the cache file locations for a video identified by its SHA-256 hash."""
    transcript_path = os.path.join(CACHE_DIR, f"{file_hash}.transcript.txt")
    return {"transcript": transcript_path}
def llm_chat(clients: HFClients, system: str, user: str, max_tokens: int, temperature: float) -> str:
    """Send one system+user exchange to the router LLM and return the reply text."""
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    response = clients.api.chat_completion(
        model=LLM_MODEL_ID,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.9,
    )
    return response.choices[0].message.content
def transcribe_video(video_path: str, language: str) -> str:
    """Transcribe *video_path* with the ASR model, caching results by file hash.

    Videos longer than MAX_VIDEO_SECONDS are rejected. The cleaned transcript
    is written under CACHE_DIR keyed by the video's SHA-256, so re-uploading
    the same file skips the expensive ASR call.

    Raises RuntimeError for over-long videos or an empty transcription.
    """
    clients = make_clients()

    duration = get_video_duration_seconds(video_path)
    if duration > MAX_VIDEO_SECONDS:
        raise RuntimeError(f"Video is {int(duration)}s. Max allowed is {MAX_VIDEO_SECONDS}s (10 minutes).")

    file_hash = sha256_file(video_path)
    paths = cache_paths(file_hash)

    # Cache hit: return the previously saved transcript verbatim.
    if os.path.exists(paths["transcript"]):
        with open(paths["transcript"], "r", encoding="utf-8") as f:
            return f.read()

    with tempfile.TemporaryDirectory() as workdir:
        wav_path = os.path.join(workdir, "audio.wav")
        extract_audio_wav_16k_mono(video_path, wav_path)

        if language == "Auto":
            result = clients.asr.automatic_speech_recognition(wav_path)
        else:
            # Some client/provider combinations reject a language kwarg with
            # TypeError; fall back to auto-detection in that case.
            try:
                result = clients.asr.automatic_speech_recognition(wav_path, language=language)
            except TypeError:
                result = clients.asr.automatic_speech_recognition(wav_path)

        text = result.get("text", "") if isinstance(result, dict) else str(result)
        text = clean_text(text)
        if not text:
            raise RuntimeError("Transcription returned empty text.")

        with open(paths["transcript"], "w", encoding="utf-8") as f:
            f.write(text)
        return text
def make_user_prompt(transcript_or_notes: str, language: str, duration_label: str, tone: str, fmt: str) -> str:
    """Build the user message for the script LLM: constraint list plus quoted source."""
    seconds = seconds_from_label(duration_label)
    target_words = estimate_words_for_seconds(seconds)
    output_language = "Match transcript language" if language == "Auto" else language_name(language)
    return (
        "Constraints:\n"
        f"- Output language: {output_language}\n"
        f"- Target duration: ~{seconds} seconds\n"
        f"- Target length: ~{target_words} words\n"
        f"- Tone: {tone}\n"
        f"- Format: {fmt}\n"
        "Source:\n"
        f'"""{transcript_or_notes}"""\n'
    )
def notes_first_pass(clients: HFClients, transcript: str, language: str) -> str:
    """Condense a transcript into concise bullet notes via a first, cheap LLM pass."""
    system_msg = "You are an editor. Return concise bullet notes only."
    output_language = "Match transcript language" if language == "Auto" else language_name(language)
    user_msg = f"""Convert this transcript into concise bullet notes.
Rules:
- Keep only key facts mentioned.
- No inventions.
- 8 to 14 bullets max.
- Output language: {output_language}
Transcript:
\"\"\"{transcript}\"\"\"
Bullets:"""
    # Low temperature keeps the notes factual; clean_text flattens the bullets
    # into one line for embedding in the next prompt.
    bullets = llm_chat(clients, system_msg, user_msg, max_tokens=320, temperature=0.2)
    return clean_text(bullets)
def generate_script(transcript: str, language: str, duration_label: str, tone: str, fmt: str, force_notes_first: bool) -> str:
    """Turn a transcript into a formatted script via the LLM.

    Long transcripts (over 4500 characters), or any call with
    force_notes_first set, are first condensed to bullet notes before the
    script prompt is built.

    Raises RuntimeError on an empty transcript or an empty LLM reply.
    """
    clients = make_clients()

    transcript = clean_text(transcript)
    if not transcript:
        raise RuntimeError("Transcript is empty. Transcribe first or paste a transcript.")

    use_notes = force_notes_first or len(transcript) > 4500
    if use_notes:
        bullet_notes = notes_first_pass(clients, transcript, language)
        source = f"NOTES:\n{bullet_notes}"
    else:
        source = transcript

    user_prompt = make_user_prompt(source, language, duration_label, tone, fmt)
    script = llm_chat(clients, SYSTEM_PROMPT, user_prompt, max_tokens=750, temperature=0.4).strip()
    if not script:
        raise RuntimeError("Script generation returned empty text.")
    return script
| # ----------------------------- | |
| # Gradio callbacks | |
| # ----------------------------- | |
def ui_transcribe(video_file, language):
    """Gradio callback for the Transcribe button: returns (transcript, status)."""
    if video_file is None:
        return gr.update(), "Please upload a video first."
    try:
        text = transcribe_video(video_file, language)
    except Exception as e:
        # Surface the full traceback in the status box so Space users can debug.
        details = traceback.format_exc()
        return gr.update(), f"Transcription error: {repr(e)}\n\n{details}"
    return text, "Done: transcript ready."
def ui_generate(video_file, transcript, language, duration_label, tone, fmt, force_notes_first):
    """Gradio callback for the Generate button: returns (transcript, script, status)."""
    try:
        # Auto-transcribe first when the transcript box is empty but a video exists.
        if video_file is not None and not (transcript or "").strip():
            transcript = transcribe_video(video_file, language)
        script = generate_script(transcript, language, duration_label, tone, fmt, force_notes_first)
    except Exception as e:
        details = traceback.format_exc()
        return transcript, gr.update(), f"Script error: {repr(e)}\n\n{details}"
    return transcript, script, "Done: script generated."
# -----------------------------
# UI
# -----------------------------
# Layout: one row with a controls column (left) and an editable-output
# column (right); button wiring happens at the Blocks level below.
with gr.Blocks(title="Video → Transcript → Script") as demo:
    gr.Markdown(
        "## Video → Transcript → Script\n"
        "Upload a video (max 10 min), transcribe with Whisper Turbo, then generate a script with an Inference Providers model."
    )
    with gr.Row():
        # Left column: video input and generation controls.
        with gr.Column(scale=1):
            video = gr.Video(label="Upload video", format="mp4")
            language = gr.Dropdown(label="Language", choices=["Auto", "en", "fr", "nl"], value="Auto")
            duration_label = gr.Dropdown(label="Script length", choices=["30s", "45s", "60s", "90s", "2m"], value="60s")
            tone = gr.Dropdown(label="Tone", choices=["neutral", "punchy", "calm", "playful"], value="neutral")
            fmt = gr.Dropdown(label="Format", choices=["voiceover", "anchor", "social short"], value="voiceover")
            force_notes_first = gr.Checkbox(label="Notes-first (better for long transcripts)", value=False)
            with gr.Row():
                btn_transcribe = gr.Button("Transcribe")
                btn_generate = gr.Button("Generate script")
            status = gr.Textbox(label="Status", value="Ready.", interactive=False)
        # Right column: both outputs stay editable so users can tweak before regenerating.
        with gr.Column(scale=2):
            transcript = gr.Textbox(label="Transcript (editable)", lines=10)
            script = gr.Textbox(label="Script (editable)", lines=14)

    # Wire the buttons to their callbacks.
    btn_transcribe.click(fn=ui_transcribe, inputs=[video, language], outputs=[transcript, status])
    btn_generate.click(fn=ui_generate, inputs=[video, transcript, language, duration_label, tone, fmt, force_notes_first], outputs=[transcript, script, status])

if __name__ == "__main__":
    demo.launch()