"""Seq2seq-backed planner that turns a user request into an FFmpeg edit plan.

The module lazily loads a Hugging Face seq2seq model (name taken from the
``MODEL_NAME`` environment variable), prompts it with the available files,
ffprobe metadata and the user's request, and parses the JSON object found in
the model's reply.
"""
import json
import os

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Model identifier passed to ``from_pretrained``; overridable via environment.
MODEL_NAME = os.getenv("MODEL_NAME", "google/flan-t5-base")

# Prompt/diagnostic size limits, all measured in characters.
_MAX_PROBE_CHARS = 6500    # leading slice of the serialized ffprobe metadata
_MAX_STDERR_CHARS = 2400   # trailing slice of FFmpeg's stderr
_ERROR_PREVIEW_CHARS = 220  # leading slice of raw output echoed in errors

# Upper bound on the number of tokens generated per call.
_MAX_NEW_TOKENS = 350

# Populated on first use by _load(); None until then.
_tok = None
_model = None


def _load():
    """Load the tokenizer and model for ``MODEL_NAME`` if not loaded yet.

    Results are cached in the module globals ``_tok`` and ``_model`` so that
    subsequent calls are no-ops.
    """
    global _tok, _model
    if _tok is None or _model is None:
        _tok = AutoTokenizer.from_pretrained(MODEL_NAME)
        _model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


def _gen(text: str) -> str:
    """Generate a completion for ``text`` and return it decoded and stripped.

    Sampling is disabled (``do_sample=False``) and at most
    ``_MAX_NEW_TOKENS`` new tokens are produced.
    """
    _load()
    # NOTE(review): truncation=True lets the tokenizer cut over-long prompts.
    # plan()/repair() append SCHEMA last, so if truncation removes the end of
    # the prompt the schema may be lost -- confirm against the model's input
    # limit and the tokenizer's truncation side.
    inputs = _tok(text, return_tensors="pt", truncation=True)
    out = _model.generate(
        **inputs, max_new_tokens=_MAX_NEW_TOKENS, do_sample=False
    )
    return _tok.decode(out[0], skip_special_tokens=True).strip()


# Output contract appended to every prompt. Raw string so backslashes, if any
# are added later, are passed to the model verbatim.
SCHEMA = r"""
Return STRICT JSON only (no markdown, no commentary).
Shape:
{
  "output_ext": "mp4|mov|mkv|mp3|wav|gif",
  "trim": {"start": "0", "end": "10"} | null,
  "resize": "1280:720" | "1920:1080" | "1080:1920" | null,
  "crop": "w:h:x:y" | null,
  "fps": 30 | null,
  "text_overlays": [
    {"text":"...", "pos":"top-left|top-right|bottom-left|bottom-right|center|bottom-center", "start":"0"|null, "end":"10"|null}
  ],
  "logo_overlay": {"file":"input/..png", "pos":"top-right|top-left|bottom-right|bottom-left|center", "scale":"180:180"|null, "opacity":0.7|null} | null,
  "subtitles": {"file":"input/..srt|input/..vtt"} | null,
  "audio_mix": [
    {"file":"input/..mp3|input/..wav|input/..m4a", "volume":0.2, "loop":true|false}
  ],
  "effects": {
    "video_filters": ["hue=s=0", "eq=contrast=1.1", "boxblur=2:1"] ,
    "audio_filters": ["afade=t=in:st=0:d=1", "loudnorm"]
  },
  "notes":"short"
}
Rules:
- The FIRST downloaded file (input0) is the main video/audio source.
- Only reference provided local paths exactly.
"""


def _format_files(local_files: list[str]) -> str:
    """Render ``local_files`` as a dash-bulleted list, or a placeholder if empty."""
    return "\n".join(f"- {f}" for f in local_files) or "- (none)"


def _extract_json(raw: str, who: str) -> dict:
    """Parse the span from the first ``{`` to the last ``}`` in ``raw``.

    Args:
        raw: Raw model output, possibly with text around the JSON object.
        who: Label used at the start of the error message
            (e.g. ``"Planner"``).

    Raises:
        ValueError: If ``raw`` has no ``{`` or no ``}`` at or after it.
        json.JSONDecodeError: If the extracted span is not valid JSON
            (this is itself a ``ValueError`` subclass).
    """
    start = raw.find("{")
    end = raw.rfind("}")
    # ``end < start`` also covers end == -1, and rejects a closing brace that
    # only appears before the first opening brace (which would otherwise
    # yield an empty slice and an opaque decode error).
    if start == -1 or end < start:
        raise ValueError(
            f"{who} did not return JSON. Got: {raw[:_ERROR_PREVIEW_CHARS]}"
        )
    return json.loads(raw[start:end + 1])


def plan(local_files: list[str], probes: dict, user_prompt: str) -> dict:
    """Ask the model for an edit plan and return it as a parsed JSON object.

    Args:
        local_files: Local file paths offered to the model.
        probes: ffprobe metadata; serialized with ``json.dumps`` and cut to
            the first ``_MAX_PROBE_CHARS`` characters.
        user_prompt: The user's editing request, inserted verbatim.

    Returns:
        The JSON object parsed from the model's reply.

    Raises:
        ValueError: If the reply contains no JSON object or it fails to parse.
    """
    files_list = _format_files(local_files)
    probes_json = json.dumps(probes)[:_MAX_PROBE_CHARS]
    text = f"""
You are a media editing planner for FFmpeg.

Available local files:
{files_list}

ffprobe metadata (JSON, may be partial):
{probes_json}

User request:
{user_prompt}

{SCHEMA}
"""
    return _extract_json(_gen(text), "Planner")


def repair(local_files: list[str], probes: dict, user_prompt: str,
           last_cmd: str, stderr_tail: str) -> dict:
    """Ask the model for a corrected plan after an FFmpeg command failed.

    Args:
        local_files: Local file paths offered to the model.
        probes: ffprobe metadata; serialized with ``json.dumps`` and cut to
            the first ``_MAX_PROBE_CHARS`` characters.
        user_prompt: The user's original editing request.
        last_cmd: The command that failed, inserted verbatim.
        stderr_tail: FFmpeg's stderr; only the last ``_MAX_STDERR_CHARS``
            characters are included in the prompt.

    Returns:
        The JSON object parsed from the model's reply.

    Raises:
        ValueError: If the reply contains no JSON object or it fails to parse.
    """
    files_list = _format_files(local_files)
    probes_json = json.dumps(probes)[:_MAX_PROBE_CHARS]
    stderr_excerpt = stderr_tail[-_MAX_STDERR_CHARS:]
    text = f"""
You are fixing a failed FFmpeg plan.

Files:
{files_list}

Metadata:
{probes_json}

User request:
{user_prompt}

Last command:
{last_cmd}

FFmpeg stderr tail:
{stderr_excerpt}

Return corrected JSON ONLY.
{SCHEMA}
"""
    return _extract_json(_gen(text), "Repair")