# ai-ffmpeg-render / planner.py
# MarneMorgan's picture
# Create planner.py
# afb5eee verified
import os
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Model identifier handed to ``from_pretrained``; the MODEL_NAME environment
# variable overrides the built-in default.
MODEL_NAME = os.environ.get("MODEL_NAME", "google/flan-t5-base")

# Tokenizer / model singletons. Both start out unset and are filled in by
# _load() the first time generation is requested.
_tok = _model = None
def _load():
    """Populate the module-level tokenizer and model on first use.

    Returns immediately when both ``_tok`` and ``_model`` are already set.
    Otherwise both are (re)created from ``MODEL_NAME`` via ``from_pretrained``.
    """
    global _tok, _model
    already_loaded = _tok is not None and _model is not None
    if already_loaded:
        return
    _tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    _model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
def _gen(text: str) -> str:
    """Feed *text* to the seq2seq model and return its decoded reply.

    The prompt is tokenized with truncation enabled, generation runs with
    sampling disabled and at most 350 new tokens, and only the first returned
    sequence is decoded (special tokens dropped, surrounding whitespace
    stripped).
    """
    _load()  # make sure the tokenizer/model globals are populated
    encoded = _tok(text, return_tensors="pt", truncation=True)
    generated = _model.generate(**encoded, max_new_tokens=350, do_sample=False)
    first_sequence = generated[0]
    decoded = _tok.decode(first_sequence, skip_special_tokens=True)
    return decoded.strip()
# Prompt fragment describing the JSON plan the model is asked to return,
# followed by the rules for referencing input files. It is interpolated at the
# end of the prompts built by plan() and repair(). The `a | b` alternatives are
# notation for the model's benefit; the text itself is not valid JSON.
SCHEMA = r"""
Return STRICT JSON only (no markdown, no commentary). Shape:
{
"output_ext": "mp4|mov|mkv|mp3|wav|gif",
"trim": {"start": "0", "end": "10"} | null,
"resize": "1280:720" | "1920:1080" | "1080:1920" | null,
"crop": "w:h:x:y" | null,
"fps": 30 | null,
"text_overlays": [
{"text":"...", "pos":"top-left|top-right|bottom-left|bottom-right|center|bottom-center",
"start":"0"|null, "end":"10"|null}
],
"logo_overlay": {"file":"input/..png", "pos":"top-right|top-left|bottom-right|bottom-left|center",
"scale":"180:180"|null, "opacity":0.7|null} | null,
"subtitles": {"file":"input/..srt|input/..vtt"} | null,
"audio_mix": [
{"file":"input/..mp3|input/..wav|input/..m4a", "volume":0.2, "loop":true|false}
],
"effects": {
"video_filters": ["hue=s=0", "eq=contrast=1.1", "boxblur=2:1"] ,
"audio_filters": ["afade=t=in:st=0:d=1", "loudnorm"]
},
"notes":"short"
}
Rules:
- The FIRST downloaded file (input0) is the main video/audio source.
- Only reference provided local paths exactly.
"""
def plan(local_files: list[str], probes: dict, user_prompt: str) -> dict:
    """Ask the model for an FFmpeg edit plan and return it as a dict.

    Args:
        local_files: Paths listed in the prompt as the available inputs. An
            empty list is rendered as ``- (none)``.
        probes: ffprobe metadata. It is serialized with ``json.dumps`` and only
            the first 6500 characters are placed in the prompt.
        user_prompt: The user's free-form editing request.

    Returns:
        The first JSON object found in the model output, decoded to a dict.

    Raises:
        ValueError: If the output contains no ``{`` or no ``}``.
        json.JSONDecodeError: If the text starting at the first ``{`` is not a
            valid JSON object (a subclass of ``ValueError``).
    """
    files_list = "\n".join(f"- {f}" for f in local_files) or "- (none)"
    text = f"""
You are a media editing planner for FFmpeg.
Available local files:
{files_list}
ffprobe metadata (JSON, may be partial):
{json.dumps(probes)[:6500]}
User request:
{user_prompt}
{SCHEMA}
"""
    raw = _gen(text)
    a = raw.find("{")
    b = raw.rfind("}")
    if a == -1 or b == -1:
        raise ValueError(f"Planner did not return JSON. Got: {raw[:220]}")
    # Decode only the first complete object starting at the first "{".
    # Slicing up to the LAST "}" (as json.loads(raw[a:b+1]) did) fails with
    # "Extra data" when the model appends commentary or a second object that
    # contains a closing brace.
    obj, _end = json.JSONDecoder().raw_decode(raw, a)
    return obj
def repair(local_files: list[str], probes: dict, user_prompt: str, last_cmd: str, stderr_tail: str) -> dict:
    """Ask the model for a corrected plan after an FFmpeg run failed.

    Args:
        local_files: Paths listed in the prompt as the available inputs. An
            empty list is rendered as ``- (none)``.
        probes: ffprobe metadata. It is serialized with ``json.dumps`` and only
            the first 6500 characters are placed in the prompt.
        user_prompt: The user's original editing request.
        last_cmd: The command that failed, included verbatim in the prompt.
        stderr_tail: FFmpeg stderr output; only its last 2400 characters are
            placed in the prompt.

    Returns:
        The first JSON object found in the model output, decoded to a dict.

    Raises:
        ValueError: If the output contains no ``{`` or no ``}``.
        json.JSONDecodeError: If the text starting at the first ``{`` is not a
            valid JSON object (a subclass of ``ValueError``).
    """
    files_list = "\n".join(f"- {f}" for f in local_files) or "- (none)"
    text = f"""
You are fixing a failed FFmpeg plan.
Files:
{files_list}
Metadata:
{json.dumps(probes)[:6500]}
User request:
{user_prompt}
Last command:
{last_cmd}
FFmpeg stderr tail:
{stderr_tail[-2400:]}
Return corrected JSON ONLY.
{SCHEMA}
"""
    raw = _gen(text)
    a = raw.find("{")
    b = raw.rfind("}")
    if a == -1 or b == -1:
        raise ValueError(f"Repair did not return JSON. Got: {raw[:220]}")
    # Decode only the first complete object starting at the first "{".
    # Slicing up to the LAST "}" (as json.loads(raw[a:b+1]) did) fails with
    # "Extra data" when the model appends commentary or a second object that
    # contains a closing brace.
    obj, _end = json.JSONDecoder().raw_decode(raw, a)
    return obj