from __future__ import annotations

from fastapi import FastAPI, UploadFile, File, Form, BackgroundTasks, HTTPException
from fastapi.responses import JSONResponse, FileResponse
from fastapi.middleware.cors import CORSMiddleware
from pathlib import Path
import shutil
import uvicorn
import json
import uuid
from datetime import datetime
from typing import Dict
from enum import Enum
import os

from video_processing import process_video_pipeline
from audio_tools import process_audio_for_video, extract_audio_ffmpeg, embed_voice_segments
from casting_loader import ensure_chroma, build_faces_index, build_voices_index
from narration_system import NarrationSystem
from llm_router import load_yaml, LLMRouter
from character_detection import detect_characters_from_video

from pipelines.audiodescription import generate as ad_generate

app = FastAPI(title="Veureu Engine API", version="0.2.0")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

ROOT = Path("/tmp/veureu")
ROOT.mkdir(parents=True, exist_ok=True)
TEMP_ROOT = Path("/tmp/temp")
TEMP_ROOT.mkdir(parents=True, exist_ok=True)
VIDEOS_ROOT = Path("/tmp/data/videos")
VIDEOS_ROOT.mkdir(parents=True, exist_ok=True)
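# NOTE: these directories live under /tmp, so uploaded videos, intermediate clips and
# job artifacts are ephemeral and are typically lost when the container or host restarts.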


class JobStatus(str, Enum):
    QUEUED = "queued"
    PROCESSING = "processing"
    DONE = "done"
    FAILED = "failed"


jobs: Dict[str, dict] = {}
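# In-memory job registry: job state lives only in this process, so it is not shared
# between workers and is lost whenever the server restarts.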


@app.get("/")
def root():
    return {"ok": True, "service": "veureu-engine"}


@app.post("/process_video")
async def process_video(
    video_file: UploadFile = File(...),
    config_path: str = Form("config.yaml"),
    out_root: str = Form("results"),
    db_dir: str = Form("chroma_db"),
):
    # Use only the upload's base name so the file is always written inside ROOT.
    tmp_video = ROOT / Path(video_file.filename).name
    with tmp_video.open("wb") as f:
        shutil.copyfileobj(video_file.file, f)
    result = process_video_pipeline(str(tmp_video), config_path=config_path, out_root=out_root, db_dir=db_dir)
    return JSONResponse(result)
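
# Illustrative client call for /process_video (sketch only; BASE_URL and the file name
# are placeholders):
#
#   import requests
#   with open("clip.mp4", "rb") as fh:
#       r = requests.post(f"{BASE_URL}/process_video",
#                         files={"video_file": fh},
#                         data={"config_path": "config.yaml", "out_root": "results"})
#   print(r.json())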


@app.post("/create_initial_casting")
async def create_initial_casting(
    background_tasks: BackgroundTasks,
    video: UploadFile = File(...),
    epsilon: float = Form(...),
    min_cluster_size: int = Form(...),
):
    """
    Create a job that processes the video asynchronously.
    Returns a job_id immediately.
    """
    video_name = Path(video.filename).stem
    dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
    with dst_video.open("wb") as f:
        shutil.copyfileobj(video.file, f)

    job_id = str(uuid.uuid4())

    jobs[job_id] = {
        "id": job_id,
        "status": JobStatus.QUEUED,
        "video_path": str(dst_video),
        "video_name": video_name,
        "epsilon": float(epsilon),
        "min_cluster_size": int(min_cluster_size),
        "created_at": datetime.now().isoformat(),
        "results": None,
        "error": None,
    }

    print(f"[{job_id}] Job created for video: {video_name}")

    background_tasks.add_task(process_video_job, job_id)

    return {"job_id": job_id}
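
# Illustrative end-to-end usage (sketch only; BASE_URL, the file name and the parameter
# values are placeholders):
#
#   import time, requests
#   with open("clip.mp4", "rb") as fh:
#       r = requests.post(f"{BASE_URL}/create_initial_casting",
#                         files={"video": fh},
#                         data={"epsilon": 0.5, "min_cluster_size": 3})
#   job_id = r.json()["job_id"]
#   status = requests.get(f"{BASE_URL}/jobs/{job_id}/status").json()
#   while status["status"] in ("queued", "processing"):
#       time.sleep(5)
#       status = requests.get(f"{BASE_URL}/jobs/{job_id}/status").json()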


@app.get("/jobs/{job_id}/status")
def get_job_status(job_id: str):
    """
    Return the current status of a job.
    The UI polls this endpoint every 5 seconds.
    """
    if job_id not in jobs:
        raise HTTPException(status_code=404, detail="Job not found")

    job = jobs[job_id]

    status_value = job["status"].value if isinstance(job["status"], JobStatus) else str(job["status"])
    response = {"status": status_value}

    if job.get("results") is not None:
        response["results"] = job["results"]

    if job.get("error"):
        response["error"] = job["error"]

    return response


@app.get("/files/{video_name}/{char_id}/{filename}")
def serve_character_file(video_name: str, char_id: str, filename: str):
    """
    Serve static character files (images).
    Example: /files/dif_catala_1/char1/representative.jpg
    """
    file_path = TEMP_ROOT / video_name / char_id / filename

    if not file_path.exists():
        raise HTTPException(status_code=404, detail="File not found")

    return FileResponse(file_path)


@app.get("/audio/{video_name}/{filename}")
def serve_audio_file(video_name: str, filename: str):
    file_path = TEMP_ROOT / video_name / "clips" / filename
    if not file_path.exists():
        raise HTTPException(status_code=404, detail="File not found")
    return FileResponse(file_path)


def process_video_job(job_id: str):
    """
    Process the video asynchronously.
    This function runs in the background.
    """
    try:
        job = jobs[job_id]
        print(f"[{job_id}] Starting processing...")

        job["status"] = JobStatus.PROCESSING

        video_path = job["video_path"]
        video_name = job["video_name"]
        epsilon = job["epsilon"]
        min_cluster_size = job["min_cluster_size"]

        base = TEMP_ROOT / video_name
        base.mkdir(parents=True, exist_ok=True)

        print(f"[{job_id}] Base directory: {base}")

        try:
            print(f"[{job_id}] Starting character detection...")
            result = detect_characters_from_video(
                video_path=video_path,
                output_base=str(base),
                epsilon=epsilon,
                min_cluster_size=min_cluster_size,
                video_name=video_name,
                start_offset_sec=0.5,
                extract_every_sec=0.25
            )

            print(f"[{job_id}] DEBUG - full result: {result}")

            characters = result.get("characters", [])
            analysis_path = result.get("analysis_path", "")
            face_labels = result.get("face_labels", [])
            num_face_embeddings = int(result.get("num_face_embeddings", 0))

            print(f"[{job_id}] Characters detected: {len(characters)}")
            for char in characters:
                print(f"[{job_id}] - {char['name']}: {char['num_faces']} faces")

            try:
                import glob
                for ch in characters:
                    folder = ch.get("folder")
                    face_files = []
                    if folder and os.path.isdir(folder):
                        patterns = ["face_*.jpg", "face_*.png"]
                        files = []
                        for pat in patterns:
                            files.extend(glob.glob(os.path.join(folder, pat)))

                        if not files:
                            files.extend(glob.glob(os.path.join(folder, "*.jpg")))
                            files.extend(glob.glob(os.path.join(folder, "*.png")))

                        face_files = sorted({os.path.basename(p) for p in files})

                        for rep_name in ("representative.jpg", "representative.png"):
                            rep_path = os.path.join(folder, rep_name)
                            if os.path.exists(rep_path):
                                if rep_name in face_files:
                                    face_files.remove(rep_name)
                                face_files.insert(0, rep_name)
                    ch["face_files"] = face_files

                    if face_files:
                        ch["num_faces"] = len(face_files)
            except Exception as _e:
                print(f"[{job_id}] WARN - Could not enumerate face_files: {_e}")

            try:
                cfg = load_yaml("config.yaml")
                audio_segments, srt_unmod, full_txt, diar_info, connection_logs = process_audio_for_video(video_path, base, cfg, voice_collection=None)

                try:
                    for ev in (connection_logs or []):
                        msg = ev.get("message") if isinstance(ev, dict) else None
                        if msg:
                            print(f"[{job_id}] {msg}")
                except Exception:
                    pass
            except Exception as e_audio:
                import traceback
                print(f"[{job_id}] WARN - Audio pipeline failed: {e_audio}\n{traceback.format_exc()}")
                audio_segments, srt_unmod, full_txt = [], None, ""
                diar_info = {"diarization_ok": False, "error": str(e_audio)}
                connection_logs = []

            if not audio_segments:
                try:
                    from pathlib import Path as _P
                    from pydub import AudioSegment as _AS
                    wav_out = extract_audio_ffmpeg(video_path, base / f"{_P(video_path).stem}.wav", sr=16000)
                    audio = _AS.from_wav(wav_out)
                    clips_dir = base / "clips"
                    clips_dir.mkdir(parents=True, exist_ok=True)
                    cp = clips_dir / "segment_000.wav"
                    audio.export(cp, format="wav")
                    emb_list = embed_voice_segments([str(cp)])
                    audio_segments = [{
                        "segment": 0,
                        "start": 0.0,
                        "end": float(len(audio) / 1000.0),
                        "speaker": "SPEAKER_00",
                        "text": "",
                        "voice_embedding": emb_list[0] if emb_list else [],
                        "clip_path": str(cp),
                        "lang": "ca",
                        "lang_prob": 1.0,
                    }]
                except Exception as _efb:
                    print(f"[{job_id}] WARN - Audio minimal fallback failed: {_efb}")

            from sklearn.cluster import DBSCAN
            import numpy as np
            voice_embeddings = [seg.get("voice_embedding") for seg in audio_segments if seg.get("voice_embedding")]
            if voice_embeddings:
                try:
                    Xv = np.array(voice_embeddings)
                    v_eps = float(epsilon)
                    v_min = max(1, int(min_cluster_size))
                    v_labels = DBSCAN(eps=v_eps, min_samples=v_min, metric='euclidean').fit(Xv).labels_.tolist()
                except Exception as _e:
                    print(f"[{job_id}] WARN - Voice clustering failed: {_e}")
                    v_labels = []
            else:
                v_labels = []
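            # DBSCAN marks samples it treats as noise with label -1, so the "voice_labels"
            # stored below may contain -1 entries that consumers should read as "unclustered".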

            job["results"] = {
                "characters": characters,
                "num_characters": len(characters),
                "analysis_path": analysis_path,
                "base_dir": str(base),
                "face_labels": face_labels,
                "num_face_embeddings": num_face_embeddings,
                "audio_segments": audio_segments,
                "srt_unmodified": srt_unmod,
                "full_transcription": full_txt,
                "voice_labels": v_labels,
                "num_voice_embeddings": len(voice_embeddings),
                "diarization_info": diar_info,
            }
            job["status"] = JobStatus.DONE

            print(f"[{job_id}] DEBUG - job['results'] stored: {job['results']}")

        except Exception as e_detect:
            import traceback
            print(f"[{job_id}] ✗ Detection error: {e_detect}")
            print(f"[{job_id}] Traceback: {traceback.format_exc()}")
            print(f"[{job_id}] Using fallback mode (empty folders)")

            for sub in ("sources", "faces", "voices", "backgrounds"):
                (base / sub).mkdir(parents=True, exist_ok=True)

            job["results"] = {
                "characters": [],
                "num_characters": 0,
                "temp_dirs": {
                    "sources": str(base / "sources"),
                    "faces": str(base / "faces"),
                    "voices": str(base / "voices"),
                    "backgrounds": str(base / "backgrounds"),
                },
                "warning": f"Detection failed, using fallback mode: {str(e_detect)}"
            }
            job["status"] = JobStatus.DONE

        print(f"[{job_id}] ✓ Job completed successfully")

    except Exception as e:
        import traceback
        print(f"[{job_id}] ✗ Unexpected error: {e}")
        try:
            job = jobs.get(job_id)
            if job is not None:
                job["status"] = JobStatus.FAILED
                job["error"] = str(e)
        except Exception:
            pass
        print(f"[{job_id}] Traceback: {traceback.format_exc()}")


@app.post("/generate_audiodescription")
async def generate_audiodescription(video: UploadFile = File(...)):
    try:
        job_id = str(uuid.uuid4())
        vid_name = video.filename or f"video_{job_id}.mp4"
        base = TEMP_ROOT / Path(vid_name).stem
        base.mkdir(parents=True, exist_ok=True)

        video_path = base / vid_name
        with open(video_path, "wb") as f:
            f.write(await video.read())

        result = ad_generate(str(video_path), base)

        return {
            "status": "done",
            "results": {
                "une_srt": result.get("une_srt", ""),
                "free_text": result.get("free_text", ""),
                "artifacts": result.get("artifacts", {}),
            },
        }
    except Exception as e:
        import traceback
        print(f"/generate_audiodescription error: {e}\n{traceback.format_exc()}")
        raise HTTPException(status_code=500, detail=str(e))
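
# Illustrative call (sketch only; BASE_URL and the file name are placeholders):
#
#   import requests
#   with open("clip.mp4", "rb") as fh:
#       r = requests.post(f"{BASE_URL}/generate_audiodescription", files={"video": fh})
#   print(r.json()["results"]["une_srt"])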


@app.post("/load_casting")
async def load_casting(
    faces_dir: str = Form("identities/faces"),
    voices_dir: str = Form("identities/voices"),
    db_dir: str = Form("chroma_db"),
    drop_collections: bool = Form(False),
):
    client = ensure_chroma(Path(db_dir))
    n_faces = build_faces_index(Path(faces_dir), client, collection_name="index_faces", drop=drop_collections)
    n_voices = build_voices_index(Path(voices_dir), client, collection_name="index_voices", drop=drop_collections)
    return {"ok": True, "faces": n_faces, "voices": n_voices}
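
# Illustrative call (sketch only; BASE_URL is a placeholder and the directory paths are
# resolved relative to the server's working directory):
#
#   import requests
#   requests.post(f"{BASE_URL}/load_casting",
#                 data={"faces_dir": "identities/faces",
#                       "voices_dir": "identities/voices",
#                       "db_dir": "chroma_db",
#                       "drop_collections": "true"})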


@app.post("/refine_narration")
async def refine_narration(
    dialogues_srt: str = Form(...),
    frame_descriptions_json: str = Form("[]"),
    config_path: str = Form("config.yaml"),
):
    cfg = load_yaml(config_path)
    frames = json.loads(frame_descriptions_json)
    model_name = cfg.get("narration", {}).get("model", "salamandra-instruct")
    use_remote = model_name in (cfg.get("models", {}).get("routing", {}).get("use_remote_for", []))

    if use_remote:
        router = LLMRouter(cfg)
        system_msg = (
            "Eres un sistema de audiodescripción que cumple UNE-153010. "
            "Fusiona diálogos del SRT con descripciones concisas en los huecos, evitando redundancias. "
            "Devuelve JSON con {narrative_text, srt_text}."
        )
        prompt = json.dumps({"dialogues_srt": dialogues_srt, "frames": frames, "rules": cfg.get("narration", {})}, ensure_ascii=False)
        try:
            txt = router.instruct(prompt=prompt, system=system_msg, model=model_name)
            out = {}
            try:
                out = json.loads(txt)
            except Exception:
                out = {"narrative_text": txt, "srt_text": ""}
            return {
                "narrative_text": out.get("narrative_text", ""),
                "srt_text": out.get("srt_text", ""),
                "approved": True,
                "critic_feedback": "",
            }
        except Exception:
            ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("une_guidelines_path", "UNE_153010.txt"))
            res = ns.run(dialogues_srt, frames)
            return {"narrative_text": res.narrative_text, "srt_text": res.srt_text, "approved": res.approved, "critic_feedback": res.critic_feedback}

    ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("une_guidelines_path", "UNE_153010.txt"))
    out = ns.run(dialogues_srt, frames)
    return {"narrative_text": out.narrative_text, "srt_text": out.srt_text, "approved": out.approved, "critic_feedback": out.critic_feedback}


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)