Upload api.py

api.py CHANGED
@@ -72,6 +72,8 @@ async def create_initial_casting(
     video: UploadFile = File(...),
     epsilon: float = Form(...),
     min_cluster_size: int = Form(...),
+    voice_epsilon: float = Form(0.5),
+    voice_min_cluster_size: int = Form(2),
 ):
     """
     Creates a job to process the video asynchronously.
@@ -94,6 +96,8 @@ async def create_initial_casting(
         "video_name": video_name,
         "epsilon": float(epsilon),
         "min_cluster_size": int(min_cluster_size),
+        "voice_epsilon": float(voice_epsilon),
+        "voice_min_cluster_size": int(voice_min_cluster_size),
         "created_at": datetime.now().isoformat(),
         "results": None,
         "error": None
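Note: both new form fields are optional, defaulting to 0.5 and 2, so existing clients keep working. A minimal client sketch follows; the base URL and the route path /create_initial_casting are assumptions inferred from the handler name, not shown in this diff.

import requests

API = "http://localhost:8000"  # assumed local deployment, not part of this diff

with open("episode01.mp4", "rb") as f:  # illustrative filename
    resp = requests.post(
        f"{API}/create_initial_casting",  # route path assumed from the handler name
        files={"video": ("episode01.mp4", f, "video/mp4")},
        data={
            "epsilon": 0.5,               # face clustering eps (required)
            "min_cluster_size": 2,        # face clustering min_samples (required)
            "voice_epsilon": 0.5,         # new: voice clustering eps
            "voice_min_cluster_size": 2,  # new: voice clustering min_samples
        },
    )
resp.raise_for_status()
print(resp.json())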
@@ -168,6 +172,8 @@ def process_video_job(job_id: str):
     video_name = job["video_name"]
     epsilon = job["epsilon"]
     min_cluster_size = job["min_cluster_size"]
+    v_epsilon = float(job.get("voice_epsilon", epsilon))
+    v_min_cluster = int(job.get("voice_min_cluster_size", min_cluster_size))
 
     # Create the folder structure
     base = TEMP_ROOT / video_name
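Note: the job.get(...) fallbacks keep previously queued jobs working: a job record created before this change carries no voice keys, so voice clustering silently inherits the face-clustering epsilon and min_cluster_size.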
@@ -283,8 +289,8 @@ def process_video_job(job_id: str):
         if voice_embeddings:
             try:
                 Xv = np.array(voice_embeddings)
-                v_eps = float(
-                v_min = max(1, int(
+                v_eps = float(v_epsilon)
+                v_min = max(1, int(v_min_cluster))
                 v_labels = DBSCAN(eps=v_eps, min_samples=v_min, metric='euclidean').fit(Xv).labels_.tolist()
             except Exception as _e:
                 print(f"[{job_id}] WARN - Voice clustering failed: {_e}")
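Note: DBSCAN assigns the label -1 to points that fall in no dense neighborhood, and min_samples must be at least 1, hence the max(1, ...) guard. A toy sketch of the call above, using 2-D stand-ins for voice embeddings (not project data):

import numpy as np
from sklearn.cluster import DBSCAN

# Two tight pairs plus one outlier: the outlier gets the noise label -1,
# which downstream grouping must skip.
Xv = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0], [99.0, 99.0]])
labels = DBSCAN(eps=0.5, min_samples=2, metric='euclidean').fit(Xv).labels_.tolist()
print(labels)  # [0, 0, 1, 1, -1]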
@@ -480,6 +486,113 @@ async def finalize_casting(
         "voice_identities": voice_identities,
     }
 
+@app.get("/files_scene/{video_name}/{scene_id}/{filename}")
+def serve_scene_file(video_name: str, scene_id: str, filename: str):
+    file_path = TEMP_ROOT / video_name / "scenes" / scene_id / filename
+    if not file_path.exists():
+        raise HTTPException(status_code=404, detail="File not found")
+    return FileResponse(file_path)
+
+@app.post("/detect_scenes")
+async def detect_scenes(
+    video: UploadFile = File(...),
+    epsilon: float = Form(0.5),
+    min_cluster_size: int = Form(2),
+    frame_interval_sec: float = Form(0.5),
+):
+    """
+    Detects scene clusters by clustering color histograms.
+    Returns a list of scene_clusters structured similarly to characters.
+    """
+    import cv2
+    import numpy as np
+    from sklearn.cluster import DBSCAN
+
+    # Save the video temporarily
+    video_name = Path(video.filename).stem
+    dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
+    with dst_video.open("wb") as f:
+        shutil.copyfileobj(video.file, f)
+
+    cap = cv2.VideoCapture(str(dst_video))
+    if not cap.isOpened():
+        raise HTTPException(status_code=400, detail="Cannot open video")
+
+    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
+    step = max(1, int(frame_interval_sec * fps))
+
+    frames = []
+    metas = []
+    idx = 0
+    while True:
+        ret = cap.grab()
+        if not ret:
+            break
+        if idx % step == 0:
+            ret2, frame = cap.retrieve()
+            if not ret2:
+                break
+            # Downscale for stability and faster computation
+            small = cv2.resize(frame, (160, 90))
+            hsv = cv2.cvtColor(small, cv2.COLOR_BGR2HSV)
+            # Per-channel histograms
+            h_hist = cv2.calcHist([hsv], [0], None, [32], [0, 180]).flatten()
+            s_hist = cv2.calcHist([hsv], [1], None, [32], [0, 256]).flatten()
+            v_hist = cv2.calcHist([hsv], [2], None, [32], [0, 256]).flatten()
+            hist = np.concatenate([h_hist, s_hist, v_hist])
+            hist = hist / (np.linalg.norm(hist) + 1e-8)
+            frames.append(hist)
+            metas.append({"index": idx, "time_sec": idx / float(fps)})
+        idx += 1
+    cap.release()
+
+    if not frames:
+        return {"scene_clusters": []}
+
+    X = np.array(frames)
+    labels = DBSCAN(eps=float(epsilon), min_samples=int(min_cluster_size), metric='euclidean').fit(X).labels_.tolist()
+
+    # Group by label (>= 0; DBSCAN marks noise as -1)
+    clusters = {}
+    for i, lbl in enumerate(labels):
+        if lbl is None or lbl < 0:
+            continue
+        clusters.setdefault(int(lbl), []).append(i)
+
+    # Write representative images for each cluster
+    base = TEMP_ROOT / video_name / "scenes"
+    base.mkdir(parents=True, exist_ok=True)
+    scene_list = []
+    cap = cv2.VideoCapture(str(dst_video))
+    for lbl, idxs in sorted(clusters.items(), key=lambda x: x[0]):
+        scene_id = f"scene_{int(lbl):02d}"
+        out_dir = base / scene_id
+        out_dir.mkdir(parents=True, exist_ok=True)
+        frame_files = []
+        # Save up to 12 frames per cluster
+        for k, fi in enumerate(idxs[:12]):
+            frame_num = metas[fi]["index"]
+            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
+            ret2, frame = cap.read()
+            if not ret2:
+                continue
+            fn = f"frame_{k:03d}.jpg"
+            cv2.imwrite(str(out_dir / fn), frame)
+            frame_files.append(fn)
+        # Representative frame
+        rep = frame_files[0] if frame_files else None
+        image_url = f"/files_scene/{video_name}/{scene_id}/{rep}" if rep else ""
+        scene_list.append({
+            "id": scene_id,
+            "folder": str(out_dir),
+            "num_frames": len(frame_files),
+            "image_url": image_url,
+            "frame_files": frame_files,
+        })
+    cap.release()
+
+    return {"scene_clusters": scene_list, "base_dir": str(base)}
+
 @app.post("/refine_narration")
 async def refine_narration(
     dialogues_srt: str = Form(...),
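Note: an end-to-end sketch of the two new endpoints (base URL and file names are illustrative assumptions): POST a video to /detect_scenes, then fetch each cluster's representative frame through the image_url it returns, which points at the /files_scene route added above.

import requests

API = "http://localhost:8000"  # assumed local deployment

with open("episode01.mp4", "rb") as f:  # illustrative filename
    resp = requests.post(
        f"{API}/detect_scenes",
        files={"video": ("episode01.mp4", f, "video/mp4")},
        data={"epsilon": 0.5, "min_cluster_size": 2, "frame_interval_sec": 0.5},
    )
resp.raise_for_status()

# Save the representative frame of each detected scene cluster locally.
for sc in resp.json()["scene_clusters"]:
    if sc["image_url"]:
        img = requests.get(f"{API}{sc['image_url']}")
        img.raise_for_status()
        with open(f"{sc['id']}.jpg", "wb") as out:
            out.write(img.content)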