VeuReu commited on
Commit
612098a
·
verified ·
1 Parent(s): bccb735

Update preprocessing_router.py

Browse files
Files changed (1) hide show
  1. preprocessing_router.py +946 -942
preprocessing_router.py CHANGED
@@ -1,942 +1,946 @@
1
- from __future__ import annotations
2
-
3
- from fastapi import APIRouter, UploadFile, File, Form, BackgroundTasks, HTTPException, Body
4
- from fastapi.responses import FileResponse
5
- from pathlib import Path
6
- from datetime import datetime
7
- from enum import Enum
8
- from typing import Dict, Any, List
9
- import shutil
10
- import os
11
- import uuid
12
- import numpy as np
13
- import cv2
14
- import tempfile
15
-
16
- from casting_loader import ensure_chroma, build_faces_index, build_voices_index
17
- from llm_router import load_yaml, LLMRouter
18
-
19
- # External space clients (no local GPU needed)
20
- import svision_client
21
- import asr_client
22
-
23
-
24
- ROOT = Path("/tmp/veureu")
25
- ROOT.mkdir(parents=True, exist_ok=True)
26
- TEMP_ROOT = Path("/tmp/temp")
27
- TEMP_ROOT.mkdir(parents=True, exist_ok=True)
28
- VIDEOS_ROOT = Path("/tmp/data/videos")
29
- VIDEOS_ROOT.mkdir(parents=True, exist_ok=True)
30
- IDENTITIES_ROOT = Path("/tmp/characters")
31
- IDENTITIES_ROOT.mkdir(parents=True, exist_ok=True)
32
-
33
-
34
- class JobStatus(str, Enum):
35
- QUEUED = "queued"
36
- PROCESSING = "processing"
37
- DONE = "done"
38
- FAILED = "failed"
39
-
40
-
41
- jobs: Dict[str, dict] = {}
42
-
43
-
44
- # ---------------------------------------------------------------------------
45
- # Helper function for clustering (only math, no GPU)
46
- # ---------------------------------------------------------------------------
47
-
48
- def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
49
- """Hierarchical clustering using only min_cluster_size and k-target (max_groups).
50
-
51
- - Primero intenta crear el máximo número posible de clusters con al menos
52
- ``min_cluster_size`` elementos.
53
- - Después fusiona implícitamente (bajando el número de clusters) hasta
54
- llegar a un número de clusters válidos (tamaño >= min_cluster_size)
55
- menor o igual que ``max_groups``.
56
-
57
- ``sensitivity`` se mantiene en la firma por compatibilidad, pero no se usa.
58
- """
59
- from scipy.cluster.hierarchy import linkage, fcluster
60
- from collections import Counter
61
-
62
- n_samples = len(X)
63
- if n_samples == 0:
64
- return np.array([])
65
-
66
- # Si no hay suficientes muestras para formar un solo cluster válido,
67
- # marcamos todo como ruido (-1).
68
- if n_samples < min_cluster_size:
69
- return np.full(n_samples, -1, dtype=int)
70
-
71
- # k_target = max_groups (interpretamos este parámetro como k-Target)
72
- k_target = max(0, int(max_groups))
73
-
74
- # Caso especial: k_target == 0 => no queremos clusters, todo ruido.
75
- if k_target == 0:
76
- return np.full(n_samples, -1, dtype=int)
77
-
78
- # Enlace jerárquico una sola vez
79
- Z = linkage(X, method="average", metric="cosine")
80
-
81
- # Máximo número de clusters posibles respetando min_cluster_size
82
- max_possible = n_samples // min_cluster_size
83
- if max_possible <= 0:
84
- return np.full(n_samples, -1, dtype=int)
85
-
86
- max_to_try = min(max_possible, n_samples)
87
-
88
- best_labels = np.full(n_samples, -1, dtype=int)
89
-
90
- # Recorremos de más clusters a menos, buscando la primera solución
91
- # que tenga entre 1 y k_target clusters válidos.
92
- for n_clusters in range(max_to_try, 0, -1):
93
- trial_labels = fcluster(Z, t=n_clusters, criterion="maxclust") - 1
94
- counts = Counter(trial_labels)
95
-
96
- # Clusters con tamaño suficiente
97
- valid_clusters = {lbl for lbl, cnt in counts.items() if cnt >= min_cluster_size}
98
- num_valid = len(valid_clusters)
99
-
100
- if num_valid == 0:
101
- # Demasiado fino, todos los clusters son demasiado pequeños
102
- continue
103
-
104
- if num_valid <= k_target:
105
- # Aceptamos esta solución
106
- final_labels = []
107
- for lbl in trial_labels:
108
- if lbl in valid_clusters:
109
- final_labels.append(lbl)
110
- else:
111
- final_labels.append(-1)
112
- best_labels = np.array(final_labels, dtype=int)
113
- break
114
-
115
- return best_labels
116
-
117
-
118
- router = APIRouter(tags=["Preprocessing Manager"])
119
-
120
-
121
- @router.post("/create_initial_casting")
122
- async def create_initial_casting(
123
- background_tasks: BackgroundTasks,
124
- video: UploadFile = File(...),
125
- max_groups: int = Form(default=3),
126
- min_cluster_size: int = Form(default=3),
127
- face_sensitivity: float = Form(default=0.5),
128
- voice_max_groups: int = Form(default=3),
129
- voice_min_cluster_size: int = Form(default=3),
130
- voice_sensitivity: float = Form(default=0.5),
131
- max_frames: int = Form(default=100),
132
- ):
133
- video_name = Path(video.filename).stem
134
- dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
135
- with dst_video.open("wb") as f:
136
- shutil.copyfileobj(video.file, f)
137
-
138
- job_id = str(uuid.uuid4())
139
-
140
- jobs[job_id] = {
141
- "id": job_id,
142
- "status": JobStatus.QUEUED,
143
- "video_path": str(dst_video),
144
- "video_name": video_name,
145
- "max_groups": int(max_groups),
146
- "min_cluster_size": int(min_cluster_size),
147
- "face_sensitivity": float(face_sensitivity),
148
- "voice_max_groups": int(voice_max_groups),
149
- "voice_min_cluster_size": int(voice_min_cluster_size),
150
- "voice_sensitivity": float(voice_sensitivity),
151
- "max_frames": int(max_frames),
152
- "created_at": datetime.now().isoformat(),
153
- "results": None,
154
- "error": None,
155
- }
156
-
157
- print(f"[{job_id}] Job creado para vídeo: {video_name}")
158
- background_tasks.add_task(process_video_job, job_id)
159
- return {"job_id": job_id}
160
-
161
-
162
- @router.get("/jobs/{job_id}/status")
163
- def get_job_status(job_id: str):
164
- if job_id not in jobs:
165
- raise HTTPException(status_code=404, detail="Job not found")
166
-
167
- job = jobs[job_id]
168
- status_value = job["status"].value if isinstance(job["status"], JobStatus) else str(job["status"])
169
- response = {"status": status_value}
170
-
171
- if job.get("results") is not None:
172
- response["results"] = job["results"]
173
- if job.get("error"):
174
- response["error"] = job["error"]
175
-
176
- return response
177
-
178
-
179
- @router.get("/files/{video_name}/{char_id}/{filename}")
180
- def serve_character_file(video_name: str, char_id: str, filename: str):
181
- file_path = TEMP_ROOT / video_name / "characters" / char_id / filename
182
- if not file_path.exists():
183
- raise HTTPException(status_code=404, detail="File not found")
184
- return FileResponse(file_path)
185
-
186
-
187
- @router.get("/audio/{video_name}/{filename}")
188
- def serve_audio_file(video_name: str, filename: str):
189
- file_path = TEMP_ROOT / video_name / "clips" / filename
190
- if not file_path.exists():
191
- raise HTTPException(status_code=404, detail="File not found")
192
- return FileResponse(file_path)
193
-
194
-
195
- @router.post("/load_casting")
196
- async def load_casting(
197
- faces_dir: str = Form("identities/faces"),
198
- voices_dir: str = Form("identities/voices"),
199
- db_dir: str = Form("chroma_db"),
200
- drop_collections: bool = Form(False),
201
- ):
202
- client = ensure_chroma(Path(db_dir))
203
- n_faces = build_faces_index(Path(faces_dir), client, collection_name="index_faces", drop=drop_collections)
204
- n_voices = build_voices_index(Path(voices_dir), client, collection_name="index_voices", drop=drop_collections)
205
- return {"ok": True, "faces": n_faces, "voices": n_voices}
206
-
207
-
208
- @router.post("/finalize_casting")
209
- async def finalize_casting(
210
- payload: dict = Body(...),
211
- ):
212
- import shutil as _sh
213
- from pathlib import Path as _P
214
-
215
- video_name = payload.get("video_name")
216
- base_dir = payload.get("base_dir")
217
- characters = payload.get("characters", []) or []
218
- voice_clusters = payload.get("voice_clusters", []) or []
219
-
220
- if not video_name or not base_dir:
221
- raise HTTPException(status_code=400, detail="Missing video_name or base_dir")
222
-
223
- faces_out = IDENTITIES_ROOT / video_name / "faces"
224
- voices_out = IDENTITIES_ROOT / video_name / "voices"
225
- faces_out.mkdir(parents=True, exist_ok=True)
226
- voices_out.mkdir(parents=True, exist_ok=True)
227
-
228
- for ch in characters:
229
- ch_name = (ch.get("name") or "Unknown").strip() or "Unknown"
230
- ch_folder = ch.get("folder")
231
- kept = ch.get("kept_files") or []
232
- if not ch_folder or not os.path.isdir(ch_folder):
233
- continue
234
- dst_dir = faces_out / ch_name
235
- dst_dir.mkdir(parents=True, exist_ok=True)
236
- for fname in kept:
237
- src = _P(ch_folder) / fname
238
- if src.exists() and src.is_file():
239
- try:
240
- _sh.copy2(src, dst_dir / fname)
241
- except Exception:
242
- pass
243
-
244
- clips_dir = _P(base_dir) / "clips"
245
- for vc in voice_clusters:
246
- v_name = (vc.get("name") or f"SPEAKER_{int(vc.get('label',0)):02d}").strip()
247
- dst_dir = voices_out / v_name
248
- dst_dir.mkdir(parents=True, exist_ok=True)
249
- for wav in (vc.get("clips") or []):
250
- src = clips_dir / wav
251
- if src.exists() and src.is_file():
252
- try:
253
- _sh.copy2(src, dst_dir / wav)
254
- except Exception:
255
- pass
256
-
257
- db_dir = IDENTITIES_ROOT / video_name / "chroma_db"
258
- try:
259
- client = ensure_chroma(db_dir)
260
- n_faces = build_faces_index(
261
- faces_out,
262
- client,
263
- collection_name="index_faces",
264
- deepface_model="Facenet512",
265
- drop=True,
266
- )
267
- n_voices = build_voices_index(
268
- voices_out,
269
- client,
270
- collection_name="index_voices",
271
- drop=True,
272
- )
273
- except Exception as e:
274
- print(f"[finalize_casting] WARN - No se pudieron construir índices ChromaDB: {e}")
275
- n_faces = 0
276
- n_voices = 0
277
-
278
- face_identities = sorted([p.name for p in faces_out.iterdir() if p.is_dir()]) if faces_out.exists() else []
279
- voice_identities = sorted([p.name for p in voices_out.iterdir() if p.is_dir()]) if voices_out.exists() else []
280
-
281
- casting_json = {"face_col": [], "voice_col": []}
282
-
283
- try:
284
- cfg = load_yaml("config.yaml")
285
- router_llm = LLMRouter(cfg)
286
- except Exception:
287
- router_llm = None # type: ignore
288
-
289
- try:
290
- if face_identities and router_llm is not None:
291
- factory = router_llm.client_factories.get("salamandra-vision") # type: ignore[attr-defined]
292
- if factory is not None:
293
- vclient = factory()
294
- gclient = getattr(vclient, "_client", None)
295
- else:
296
- gclient = None
297
-
298
- if gclient is not None:
299
- for identity in face_identities:
300
- id_dir = faces_out / identity
301
- if not id_dir.is_dir():
302
- continue
303
- img_path = None
304
- for ext in (".jpg", ".jpeg", ".png", ".bmp", ".webp"):
305
- candidates = list(id_dir.glob(f"*{ext}"))
306
- if candidates:
307
- img_path = candidates[0]
308
- break
309
- if not img_path:
310
- continue
311
-
312
- try:
313
- out = gclient.predict(str(img_path), api_name="/face_image_embedding")
314
- emb = None
315
- if isinstance(out, list):
316
- if out and isinstance(out[0], (list, tuple, float, int)):
317
- if out and isinstance(out[0], (list, tuple)):
318
- emb = list(out[0])
319
- else:
320
- emb = list(out)
321
- elif isinstance(out, dict) and "embedding" in out:
322
- emb = out.get("embedding")
323
-
324
- if not emb:
325
- continue
326
-
327
- casting_json["face_col"].append({
328
- "nombre": identity,
329
- "embedding": emb,
330
- })
331
- except Exception:
332
- continue
333
- except Exception:
334
- casting_json["face_col"] = []
335
-
336
- try:
337
- if voice_identities and router_llm is not None:
338
- factory = router_llm.client_factories.get("whisper-catalan") # type: ignore[attr-defined]
339
- if factory is not None:
340
- aclient = factory()
341
- gclient = getattr(aclient, "_client", None)
342
- else:
343
- gclient = None
344
-
345
- if gclient is not None:
346
- for identity in voice_identities:
347
- id_dir = voices_out / identity
348
- if not id_dir.is_dir():
349
- continue
350
- wav_files = sorted([
351
- p for p in id_dir.iterdir()
352
- if p.is_file() and p.suffix.lower() in [".wav", ".flac", ".mp3"]
353
- ])
354
- if not wav_files:
355
- continue
356
-
357
- wf = wav_files[0]
358
- try:
359
- out = gclient.predict(str(wf), api_name="/voice_embedding")
360
- emb = None
361
- if isinstance(out, list):
362
- emb = list(out)
363
- elif isinstance(out, dict) and "embedding" in out:
364
- emb = out.get("embedding")
365
-
366
- if not emb:
367
- continue
368
-
369
- casting_json["voice_col"].append({
370
- "nombre": identity,
371
- "embedding": emb,
372
- })
373
- except Exception:
374
- continue
375
- except Exception:
376
- casting_json["voice_col"] = []
377
-
378
- return {
379
- "ok": True,
380
- "video_name": video_name,
381
- "faces_dir": str(faces_out),
382
- "voices_dir": str(voices_out),
383
- "db_dir": str(db_dir),
384
- "n_faces_embeddings": n_faces,
385
- "n_voices_embeddings": n_voices,
386
- "face_identities": face_identities,
387
- "voice_identities": voice_identities,
388
- "casting_json": casting_json,
389
- }
390
-
391
-
392
- @router.get("/files_scene/{video_name}/{scene_id}/{filename}")
393
- def serve_scene_file(video_name: str, scene_id: str, filename: str):
394
- file_path = TEMP_ROOT / video_name / "scenes" / scene_id / filename
395
- if not file_path.exists():
396
- raise HTTPException(status_code=404, detail="File not found")
397
- return FileResponse(file_path)
398
-
399
-
400
- @router.post("/detect_scenes")
401
- async def detect_scenes(
402
- video: UploadFile = File(...),
403
- max_groups: int = Form(default=3),
404
- min_cluster_size: int = Form(default=3),
405
- scene_sensitivity: float = Form(default=0.5),
406
- frame_interval_sec: float = Form(default=0.5), # mantenido por compatibilidad, no se usa
407
- max_frames: int = Form(default=100),
408
- ):
409
- """Detecta escenas usando frames equiespaciados del vídeo y clustering jerárquico.
410
-
411
- - Extrae ``max_frames`` fotogramas equiespaciados del vídeo original.
412
- - Descarta frames negros o muy oscuros antes de construir el histograma.
413
- - Representa cada frame por un histograma de color 3D (8x8x8) normalizado
414
- dividiendo por la media (si el histograma es todo ceros o la media es 0,
415
- se descarta el frame).
416
- - Aplica ``hierarchical_cluster_with_min_size`` igual que para cares i veus.
417
- """
418
-
419
- video_name = Path(video.filename).stem
420
- dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
421
- with dst_video.open("wb") as f:
422
- shutil.copyfileobj(video.file, f)
423
-
424
- try:
425
- print(f"[detect_scenes] Extrayendo frames equiespaciados de {video_name}...")
426
-
427
- cap = cv2.VideoCapture(str(dst_video))
428
- if not cap.isOpened():
429
- raise RuntimeError("No se pudo abrir el vídeo para detectar escenas")
430
-
431
- total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
432
- if total_frames <= 0:
433
- cap.release()
434
- print("[detect_scenes] total_frames <= 0")
435
- return {"scene_clusters": []}
436
-
437
- n_samples = max(1, min(int(max_frames), total_frames))
438
- frame_indices = sorted(set(np.linspace(0, max(0, total_frames - 1), num=n_samples, dtype=int).tolist()))
439
- print(f"[detect_scenes] Total frames: {total_frames}, muestreando {len(frame_indices)} frames")
440
-
441
- # Create base directory for scenes
442
- base = TEMP_ROOT / video_name
443
- scenes_dir = base / "scenes"
444
- scenes_dir.mkdir(parents=True, exist_ok=True)
445
-
446
- # ------------------------------------------------------------------
447
- # STEP 1: Guardar frames y construir embeddings sencillos (histogramas)
448
- # ------------------------------------------------------------------
449
- keyframe_paths: List[Path] = []
450
- keyframe_infos: List[dict] = []
451
- features: List[np.ndarray] = []
452
-
453
- for i, frame_idx in enumerate(frame_indices):
454
- cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
455
- ret, frame = cap.read()
456
- if not ret:
457
- continue
458
-
459
- # Filtrar frames negros o muy oscuros (umbral sobre la media de intensidad)
460
- # Trabajamos en escala de grises para evaluar brillo global.
461
- gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
462
- mean_intensity = float(gray.mean())
463
- if mean_intensity < 5.0:
464
- # Frame negro o casi negro, lo descartamos
465
- continue
466
-
467
- local_keyframe = scenes_dir / f"keyframe_{frame_idx:06d}.jpg"
468
- try:
469
- cv2.imwrite(str(local_keyframe), frame)
470
- except Exception as werr:
471
- print(f"[detect_scenes] Error guardando frame {frame_idx}: {werr}")
472
- continue
473
-
474
- try:
475
- # Histograma de color 8x8x8 en RGB
476
- img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
477
- hist = cv2.calcHist(
478
- [img_rgb], [0, 1, 2], None,
479
- [8, 8, 8], [0, 256, 0, 256, 0, 256]
480
- ).astype("float32").flatten()
481
-
482
- if not np.any(hist):
483
- # Todo ceros, descartamos
484
- continue
485
-
486
- mean_val = float(hist.mean())
487
- if mean_val <= 0.0:
488
- # Media cero o negativa, descartamos
489
- continue
490
-
491
- hist /= mean_val
492
- features.append(hist)
493
- except Exception as fe_err:
494
- print(f"[detect_scenes] Error calculando embedding para frame {frame_idx}: {fe_err}")
495
- continue
496
-
497
- keyframe_paths.append(local_keyframe)
498
- # Como no tenemos frames_info de svision, usamos el índice de frame
499
- info = {"start": int(frame_idx), "end": int(frame_idx) + 1}
500
- keyframe_infos.append(info)
501
-
502
- cap.release()
503
-
504
- if not features or len(features) < min_cluster_size:
505
- print(
506
- f"[detect_scenes] No hay suficientes frames válidos para clusterizar escenas: "
507
- f"validos={len(features)}, min_cluster_size={min_cluster_size}"
508
- )
509
- return {"scene_clusters": []}
510
-
511
- Xs = np.vstack(features)
512
-
513
- # ------------------------------------------------------------------
514
- # STEP 2: Clustering jerárquico de escenas (k-Target + mida mínima)
515
- # ------------------------------------------------------------------
516
- print("[detect_scenes] Clustering jerárquico de escenas...")
517
- scene_labels = hierarchical_cluster_with_min_size(Xs, max_groups, min_cluster_size, 0.5)
518
- unique_labels = sorted({int(l) for l in scene_labels if int(l) >= 0})
519
- print(f"[detect_scenes] Etiquetas de escena válidas: {unique_labels}")
520
-
521
- # Mapear índices de keyframes a clusters
522
- cluster_map: Dict[int, List[int]] = {}
523
- for idx, lbl in enumerate(scene_labels):
524
- lbl = int(lbl)
525
- if lbl >= 0:
526
- cluster_map.setdefault(lbl, []).append(idx)
527
-
528
- # ------------------------------------------------------------------
529
- # STEP 3: Construir scene_clusters con el formato esperado por el demo
530
- # ------------------------------------------------------------------
531
- scene_clusters: List[Dict[str, Any]] = []
532
- for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
533
- if not idxs:
534
- continue
535
-
536
- scene_id = f"scene_{ci:02d}"
537
- scene_out_dir = scenes_dir / scene_id
538
- scene_out_dir.mkdir(parents=True, exist_ok=True)
539
-
540
- # Copiar todos los keyframes del cluster a la carpeta del cluster
541
- cluster_start = None
542
- cluster_end = None
543
- representative_file = None
544
-
545
- for j, k_idx in enumerate(idxs):
546
- src = keyframe_paths[k_idx]
547
- dst = scene_out_dir / src.name
548
- try:
549
- shutil.copy2(src, dst)
550
- except Exception as cp_err:
551
- print(f"[detect_scenes] Error copiando keyframe {src} a cluster {scene_id}: {cp_err}")
552
- continue
553
-
554
- if representative_file is None:
555
- representative_file = dst
556
-
557
- info = keyframe_infos[k_idx]
558
- start = info.get("start", k_idx)
559
- end = info.get("end", k_idx + 1)
560
- cluster_start = start if cluster_start is None else min(cluster_start, start)
561
- cluster_end = end if cluster_end is None else max(cluster_end, end)
562
-
563
- if representative_file is None:
564
- continue
565
-
566
- scene_clusters.append({
567
- "id": scene_id,
568
- "name": f"Escena {len(scene_clusters)+1}",
569
- "folder": str(scene_out_dir),
570
- "image_url": f"/files_scene/{video_name}/{scene_id}/{representative_file.name}",
571
- "start_time": float(cluster_start) if cluster_start is not None else 0.0,
572
- "end_time": float(cluster_end) if cluster_end is not None else 0.0,
573
- })
574
-
575
- print(f"[detect_scenes]  {len(scene_clusters)} escenes clusteritzades")
576
- return {"scene_clusters": scene_clusters}
577
-
578
- except Exception as e:
579
- print(f"[detect_scenes] Error: {e}")
580
- import traceback
581
- traceback.print_exc()
582
- return {"scene_clusters": [], "error": str(e)}
583
-
584
-
585
- def process_video_job(job_id: str):
586
- """
587
- Process video job in background using EXTERNAL spaces (svision, asr).
588
-
589
- NO local GPU needed - all vision/audio processing is delegated to:
590
- - svision: face detection + embeddings (MTCNN + FaceNet)
591
- - asr: audio diarization + voice embeddings (pyannote + ECAPA)
592
-
593
- Engine only does: frame extraction, clustering (math), file organization.
594
- """
595
- try:
596
- job = jobs[job_id]
597
- print(f"[{job_id}] Iniciando procesamiento (delegando a svision/asr)...")
598
-
599
- job["status"] = JobStatus.PROCESSING
600
-
601
- video_path = job["video_path"]
602
- video_name = job["video_name"]
603
- max_groups = int(job.get("max_groups", 5))
604
- min_cluster_size = int(job.get("min_cluster_size", 3))
605
- face_sensitivity = float(job.get("face_sensitivity", 0.5))
606
-
607
- base = TEMP_ROOT / video_name
608
- base.mkdir(parents=True, exist_ok=True)
609
- print(f"[{job_id}] Directorio base: {base}")
610
-
611
- try:
612
- # ============================================================
613
- # STEP 1: Extract frames from video (local, simple cv2)
614
- # ============================================================
615
- print(f"[{job_id}] Extrayendo frames del vídeo...")
616
-
617
- cap = cv2.VideoCapture(video_path)
618
- if not cap.isOpened():
619
- raise RuntimeError("No se pudo abrir el vídeo")
620
-
621
- fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
622
- total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
623
- max_samples = job.get("max_frames", 100)
624
-
625
- if total_frames > 0:
626
- frame_indices = sorted(set(np.linspace(0, max(0, total_frames - 1), num=min(max_samples, max(1, total_frames)), dtype=int).tolist()))
627
- else:
628
- frame_indices = []
629
-
630
- print(f"[{job_id}] Total frames: {total_frames}, FPS: {fps:.2f}, Muestreando {len(frame_indices)} frames")
631
-
632
- # Save frames temporarily for svision processing
633
- frames_dir = base / "frames_temp"
634
- frames_dir.mkdir(parents=True, exist_ok=True)
635
- faces_root = base / "faces_raw"
636
- faces_root.mkdir(parents=True, exist_ok=True)
637
-
638
- frame_paths: List[str] = []
639
- for frame_idx in frame_indices:
640
- cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
641
- ret, frame = cap.read()
642
- if not ret:
643
- continue
644
- frame_path = frames_dir / f"frame_{frame_idx:06d}.jpg"
645
- cv2.imwrite(str(frame_path), frame)
646
- frame_paths.append(str(frame_path))
647
- cap.release()
648
-
649
- print(f"[{job_id}] ✓ {len(frame_paths)} frames extraídos")
650
-
651
- # ============================================================
652
- # STEP 2: Send frames to SVISION for face detection + embeddings
653
- # ============================================================
654
- print(f"[{job_id}] Enviando frames a svision para detección de caras...")
655
-
656
- embeddings: List[List[float]] = []
657
- crops_meta: List[dict] = []
658
- saved_count = 0
659
- frames_with_faces = 0
660
-
661
- for i, frame_path in enumerate(frame_paths):
662
- frame_idx = frame_indices[i] if i < len(frame_indices) else i
663
- try:
664
- # Call svision to get faces + embeddings
665
- faces = svision_client.get_face_embeddings_from_image(frame_path)
666
-
667
- if faces:
668
- frames_with_faces += 1
669
- for face_data in faces:
670
- emb = face_data.get("embedding", [])
671
- if not emb:
672
- continue
673
-
674
- # Normalize embedding
675
- emb = np.array(emb, dtype=float)
676
- emb = emb / (np.linalg.norm(emb) + 1e-9)
677
- embeddings.append(emb.tolist())
678
-
679
- # Save face crop if provided by svision
680
- crop_path = face_data.get("face_crop_path")
681
- fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
682
- local_crop_path = faces_root / fn
683
-
684
- crop_saved = False
685
- if crop_path:
686
- # Handle remote URLs from svision (Gradio)
687
- if isinstance(crop_path, str) and crop_path.startswith("http"):
688
- try:
689
- import requests
690
- resp = requests.get(crop_path, timeout=30)
691
- if resp.status_code == 200:
692
- with open(local_crop_path, "wb") as f:
693
- f.write(resp.content)
694
- crop_saved = True
695
- except Exception as dl_err:
696
- print(f"[{job_id}] Error descargando crop: {dl_err}")
697
- # Handle local paths
698
- elif isinstance(crop_path, str) and os.path.exists(crop_path):
699
- shutil.copy2(crop_path, local_crop_path)
700
- crop_saved = True
701
-
702
- if not crop_saved:
703
- # If no crop from svision, use original frame
704
- shutil.copy2(frame_path, local_crop_path)
705
-
706
- crops_meta.append({
707
- "file": fn,
708
- "frame": frame_idx,
709
- "index": face_data.get("index", saved_count),
710
- })
711
- saved_count += 1
712
-
713
- except Exception as e:
714
- print(f"[{job_id}] Error procesando frame {frame_idx}: {e}")
715
- continue
716
-
717
- print(f"[{job_id}] Frames con caras: {frames_with_faces}/{len(frame_paths)}")
718
- print(f"[{job_id}] Caras detectadas: {len(embeddings)}")
719
-
720
- # ============================================================
721
- # STEP 3: Clustering (local, only math - no GPU)
722
- # ============================================================
723
- if embeddings:
724
- print(f"[{job_id}] Clustering jerárquico...")
725
- Xf = np.array(embeddings)
726
- labels = hierarchical_cluster_with_min_size(Xf, max_groups, min_cluster_size, face_sensitivity).tolist()
727
- n_clusters = len(set([l for l in labels if l >= 0]))
728
- print(f"[{job_id}] Clustering: {n_clusters} clusters")
729
- else:
730
- labels = []
731
-
732
- # ============================================================
733
- # STEP 4: Organize faces into character folders
734
- # ============================================================
735
- characters: List[Dict[str, Any]] = []
736
- cluster_map: Dict[int, List[int]] = {}
737
- for idx, lbl in enumerate(labels):
738
- if isinstance(lbl, int) and lbl >= 0:
739
- cluster_map.setdefault(lbl, []).append(idx)
740
-
741
- chars_dir = base / "characters"
742
- chars_dir.mkdir(parents=True, exist_ok=True)
743
-
744
- print(f"[{job_id}] cluster_map: {cluster_map}")
745
- print(f"[{job_id}] crops_meta count: {len(crops_meta)}")
746
- print(f"[{job_id}] faces_root: {faces_root}, exists: {faces_root.exists()}")
747
- if faces_root.exists():
748
- existing_files = list(faces_root.glob("*"))
749
- print(f"[{job_id}] Files in faces_root: {len(existing_files)}")
750
- for ef in existing_files[:5]:
751
- print(f"[{job_id}] - {ef.name}")
752
-
753
- for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
754
- char_id = f"char_{ci:02d}"
755
- print(f"[{job_id}] Processing cluster {char_id} with {len(idxs)} indices: {idxs[:5]}...")
756
-
757
- if not idxs:
758
- continue
759
-
760
- out_dir = chars_dir / char_id
761
- out_dir.mkdir(parents=True, exist_ok=True)
762
-
763
- # Select faces to show (half + 1)
764
- total_faces = len(idxs)
765
- max_faces_to_show = (total_faces // 2) + 1
766
- selected_idxs = idxs[:max_faces_to_show]
767
-
768
- files: List[str] = []
769
- file_urls: List[str] = []
770
-
771
- for j in selected_idxs:
772
- if j >= len(crops_meta):
773
- print(f"[{job_id}] Index {j} out of range (crops_meta len={len(crops_meta)})")
774
- continue
775
- meta = crops_meta[j]
776
- fname = meta.get("file")
777
- if not fname:
778
- print(f"[{job_id}] No filename in meta for index {j}")
779
- continue
780
-
781
- src = faces_root / fname
782
- dst = out_dir / fname
783
- try:
784
- if src.exists():
785
- shutil.copy2(src, dst)
786
- files.append(fname)
787
- file_urls.append(f"/files/{video_name}/{char_id}/{fname}")
788
- else:
789
- print(f"[{job_id}] Source file not found: {src}")
790
- except Exception as cp_err:
791
- print(f"[{job_id}] Error copying {fname}: {cp_err}")
792
-
793
- # Create representative image
794
- rep = files[0] if files else None
795
- if rep:
796
- try:
797
- shutil.copy2(out_dir / rep, out_dir / "representative.jpg")
798
- except Exception:
799
- pass
800
-
801
- cluster_number = ci + 1
802
- character_name = f"Cluster {cluster_number}"
803
-
804
- characters.append({
805
- "id": char_id,
806
- "name": character_name,
807
- "folder": str(out_dir),
808
- "num_faces": len(files),
809
- "total_faces_detected": total_faces,
810
- "image_url": f"/files/{video_name}/{char_id}/representative.jpg" if rep else "",
811
- "face_files": file_urls,
812
- })
813
- print(f"[{job_id}] ✓ Cluster {char_id}: {len(files)} caras")
814
-
815
- # Cleanup temp frames
816
- try:
817
- shutil.rmtree(frames_dir)
818
- except Exception:
819
- pass
820
-
821
- print(f"[{job_id}] ✓ Total: {len(characters)} personajes")
822
-
823
- # ============================================================
824
- # STEP 5: Audio diarization + voice embeddings using ASR space
825
- # ============================================================
826
- voice_max_groups = int(job.get("voice_max_groups", 3))
827
- voice_min_cluster_size = int(job.get("voice_min_cluster_size", 3))
828
- voice_sensitivity = float(job.get("voice_sensitivity", 0.5))
829
-
830
- audio_segments: List[Dict[str, Any]] = []
831
- voice_labels: List[int] = []
832
- voice_embeddings: List[List[float]] = []
833
- diarization_info: Dict[str, Any] = {}
834
-
835
- print(f"[{job_id}] Procesando audio con ASR space...")
836
- try:
837
- # Extract audio and diarize
838
- diar_result = asr_client.extract_audio_and_diarize(video_path)
839
- clips = diar_result.get("clips", [])
840
- segments = diar_result.get("segments", [])
841
-
842
- print(f"[{job_id}] Diarización: {len(clips)} clips, {len(segments)} segmentos")
843
-
844
- # Save clips locally
845
- clips_dir = base / "clips"
846
- clips_dir.mkdir(parents=True, exist_ok=True)
847
-
848
- for i, clip_info in enumerate(clips if isinstance(clips, list) else []):
849
- clip_path = clip_info if isinstance(clip_info, str) else clip_info.get("path") if isinstance(clip_info, dict) else None
850
- if not clip_path:
851
- continue
852
-
853
- # Download or copy clip
854
- local_clip = clips_dir / f"segment_{i:03d}.wav"
855
- try:
856
- if isinstance(clip_path, str) and clip_path.startswith("http"):
857
- import requests
858
- resp = requests.get(clip_path, timeout=30)
859
- if resp.status_code == 200:
860
- with open(local_clip, "wb") as f:
861
- f.write(resp.content)
862
- elif isinstance(clip_path, str) and os.path.exists(clip_path):
863
- shutil.copy2(clip_path, local_clip)
864
- except Exception as dl_err:
865
- print(f"[{job_id}] Error guardando clip {i}: {dl_err}")
866
- continue
867
-
868
- # Get segment info
869
- seg_info = segments[i] if i < len(segments) else {}
870
- speaker = seg_info.get("speaker", f"SPEAKER_{i:02d}")
871
-
872
- # Get voice embedding for this clip
873
- emb = asr_client.get_voice_embedding(str(local_clip))
874
- if emb:
875
- voice_embeddings.append(emb)
876
-
877
- audio_segments.append({
878
- "index": i,
879
- "clip_path": str(local_clip),
880
- "clip_url": f"/audio/{video_name}/segment_{i:03d}.wav",
881
- "speaker": speaker,
882
- "start": seg_info.get("start", 0),
883
- "end": seg_info.get("end", 0),
884
- })
885
-
886
- print(f"[{job_id}] \u2713 {len(audio_segments)} segmentos de audio procesados")
887
-
888
- # Cluster voice embeddings
889
- if voice_embeddings:
890
- print(f"[{job_id}] Clustering jer\u00e1rquico de voz...")
891
- print(f"[{job_id}] - voice_embeddings: {len(voice_embeddings)} embeddings")
892
- print(f"[{job_id}] - par\u00e1metros: voice_max_groups={voice_max_groups}, voice_min_cluster_size={voice_min_cluster_size}")
893
- Xv = np.array(voice_embeddings)
894
- print(f"[{job_id}] - shape Xv: {Xv.shape}")
895
- voice_labels = hierarchical_cluster_with_min_size(
896
- Xv, voice_max_groups, voice_min_cluster_size, voice_sensitivity
897
- ).tolist()
898
- n_voice_clusters = len(set([l for l in voice_labels if l >= 0]))
899
- print(f"[{job_id}] - voice_labels: {voice_labels}")
900
- print(f"[{job_id}] \u2713 Clustering de voz: {n_voice_clusters} clusters de {len(voice_embeddings)} muestras")
901
- else:
902
- print(f"[{job_id}] \u26a0\ufe0f No hay voice_embeddings para clustering")
903
-
904
- diarization_info = {
905
- "num_segments": len(audio_segments),
906
- "num_voice_clusters": len(set([l for l in voice_labels if l >= 0])) if voice_labels else 0,
907
- }
908
-
909
- except Exception as audio_err:
910
- print(f"[{job_id}] Error en procesamiento de audio: {audio_err}")
911
- import traceback
912
- traceback.print_exc()
913
-
914
- job["results"] = {
915
- "characters": characters,
916
- "face_labels": labels,
917
- "audio_segments": audio_segments,
918
- "voice_labels": voice_labels,
919
- "diarization_info": diarization_info,
920
- "video_name": video_name,
921
- "base_dir": str(base),
922
- }
923
- job["status"] = JobStatus.DONE
924
- print(f"[{job_id}] ✓ Procesamiento completado")
925
-
926
- except Exception as proc_error:
927
- print(f"[{job_id}] Error en procesamiento: {proc_error}")
928
- import traceback
929
- traceback.print_exc()
930
- job["results"] = {
931
- "characters": [], "face_labels": [],
932
- "audio_segments": [], "voice_labels": [], "diarization_info": {},
933
- "video_name": video_name, "base_dir": str(base)
934
- }
935
- job["status"] = JobStatus.DONE
936
-
937
- except Exception as e:
938
- print(f"[{job_id}] Error general: {e}")
939
- import traceback
940
- traceback.print_exc()
941
- job["status"] = JobStatus.FAILED
942
- job["error"] = str(e)
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from fastapi import APIRouter, UploadFile, File, Form, BackgroundTasks, HTTPException, Body
4
+ from fastapi.responses import FileResponse
5
+ from pathlib import Path
6
+ from datetime import datetime
7
+ from enum import Enum
8
+ from typing import Dict, Any, List
9
+ import shutil
10
+ import os
11
+ import uuid
12
+ import numpy as np
13
+ import cv2
14
+ import tempfile
15
+
16
+ from casting_loader import ensure_chroma, build_faces_index, build_voices_index
17
+ from llm_router import load_yaml, LLMRouter
18
+ from storage.media_routers import upload_video
19
+
20
+ # External space clients (no local GPU needed)
21
+ import svision_client
22
+ import asr_client
23
+
24
+
25
+ ROOT = Path("/tmp/veureu")
26
+ ROOT.mkdir(parents=True, exist_ok=True)
27
+ TEMP_ROOT = Path("/tmp/temp")
28
+ TEMP_ROOT.mkdir(parents=True, exist_ok=True)
29
+ VIDEOS_ROOT = Path("/tmp/data/videos")
30
+ VIDEOS_ROOT.mkdir(parents=True, exist_ok=True)
31
+ IDENTITIES_ROOT = Path("/tmp/characters")
32
+ IDENTITIES_ROOT.mkdir(parents=True, exist_ok=True)
33
+ VEUREU_TOKEN = os.getenv("VEUREU_TOKEN")
34
+
35
+
36
+ class JobStatus(str, Enum):
37
+ QUEUED = "queued"
38
+ PROCESSING = "processing"
39
+ DONE = "done"
40
+ FAILED = "failed"
41
+
42
+
43
+ jobs: Dict[str, dict] = {}
44
+
45
+
46
+ # ---------------------------------------------------------------------------
47
+ # Helper function for clustering (only math, no GPU)
48
+ # ---------------------------------------------------------------------------
49
+
50
+ def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
51
+ """Hierarchical clustering using only min_cluster_size and k-target (max_groups).
52
+
53
+ - Primero intenta crear el máximo número posible de clusters con al menos
54
+ ``min_cluster_size`` elementos.
55
+ - Después fusiona implícitamente (bajando el número de clusters) hasta
56
+ llegar a un número de clusters válidos (tamaño >= min_cluster_size)
57
+ menor o igual que ``max_groups``.
58
+
59
+ ``sensitivity`` se mantiene en la firma por compatibilidad, pero no se usa.
60
+ """
61
+ from scipy.cluster.hierarchy import linkage, fcluster
62
+ from collections import Counter
63
+
64
+ n_samples = len(X)
65
+ if n_samples == 0:
66
+ return np.array([])
67
+
68
+ # Si no hay suficientes muestras para formar un solo cluster válido,
69
+ # marcamos todo como ruido (-1).
70
+ if n_samples < min_cluster_size:
71
+ return np.full(n_samples, -1, dtype=int)
72
+
73
+ # k_target = max_groups (interpretamos este parámetro como k-Target)
74
+ k_target = max(0, int(max_groups))
75
+
76
+ # Caso especial: k_target == 0 => no queremos clusters, todo ruido.
77
+ if k_target == 0:
78
+ return np.full(n_samples, -1, dtype=int)
79
+
80
+ # Enlace jerárquico una sola vez
81
+ Z = linkage(X, method="average", metric="cosine")
82
+
83
+ # Máximo número de clusters posibles respetando min_cluster_size
84
+ max_possible = n_samples // min_cluster_size
85
+ if max_possible <= 0:
86
+ return np.full(n_samples, -1, dtype=int)
87
+
88
+ max_to_try = min(max_possible, n_samples)
89
+
90
+ best_labels = np.full(n_samples, -1, dtype=int)
91
+
92
+ # Recorremos de más clusters a menos, buscando la primera solución
93
+ # que tenga entre 1 y k_target clusters válidos.
94
+ for n_clusters in range(max_to_try, 0, -1):
95
+ trial_labels = fcluster(Z, t=n_clusters, criterion="maxclust") - 1
96
+ counts = Counter(trial_labels)
97
+
98
+ # Clusters con tamaño suficiente
99
+ valid_clusters = {lbl for lbl, cnt in counts.items() if cnt >= min_cluster_size}
100
+ num_valid = len(valid_clusters)
101
+
102
+ if num_valid == 0:
103
+ # Demasiado fino, todos los clusters son demasiado pequeños
104
+ continue
105
+
106
+ if num_valid <= k_target:
107
+ # Aceptamos esta solución
108
+ final_labels = []
109
+ for lbl in trial_labels:
110
+ if lbl in valid_clusters:
111
+ final_labels.append(lbl)
112
+ else:
113
+ final_labels.append(-1)
114
+ best_labels = np.array(final_labels, dtype=int)
115
+ break
116
+
117
+ return best_labels
118
+
119
+
120
+ router = APIRouter(tags=["Preprocessing Manager"])
121
+
122
+
123
+ @router.post("/create_initial_casting")
124
+ async def create_initial_casting(
125
+ background_tasks: BackgroundTasks,
126
+ video: UploadFile = File(...),
127
+ max_groups: int = Form(default=3),
128
+ min_cluster_size: int = Form(default=3),
129
+ face_sensitivity: float = Form(default=0.5),
130
+ voice_max_groups: int = Form(default=3),
131
+ voice_min_cluster_size: int = Form(default=3),
132
+ voice_sensitivity: float = Form(default=0.5),
133
+ max_frames: int = Form(default=100),
134
+ ):
135
+ video_name = Path(video.filename).stem
136
+ dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
137
+ with dst_video.open("wb") as f:
138
+ shutil.copyfileobj(video.file, f)
139
+
140
+ upload_video(video, VEUREU_TOKEN)
141
+
142
+ job_id = str(uuid.uuid4())
143
+
144
+ jobs[job_id] = {
145
+ "id": job_id,
146
+ "status": JobStatus.QUEUED,
147
+ "video_path": str(dst_video),
148
+ "video_name": video_name,
149
+ "max_groups": int(max_groups),
150
+ "min_cluster_size": int(min_cluster_size),
151
+ "face_sensitivity": float(face_sensitivity),
152
+ "voice_max_groups": int(voice_max_groups),
153
+ "voice_min_cluster_size": int(voice_min_cluster_size),
154
+ "voice_sensitivity": float(voice_sensitivity),
155
+ "max_frames": int(max_frames),
156
+ "created_at": datetime.now().isoformat(),
157
+ "results": None,
158
+ "error": None,
159
+ }
160
+
161
+ print(f"[{job_id}] Job creado para vídeo: {video_name}")
162
+ background_tasks.add_task(process_video_job, job_id)
163
+ return {"job_id": job_id}
164
+
165
+
166
+ @router.get("/jobs/{job_id}/status")
167
+ def get_job_status(job_id: str):
168
+ if job_id not in jobs:
169
+ raise HTTPException(status_code=404, detail="Job not found")
170
+
171
+ job = jobs[job_id]
172
+ status_value = job["status"].value if isinstance(job["status"], JobStatus) else str(job["status"])
173
+ response = {"status": status_value}
174
+
175
+ if job.get("results") is not None:
176
+ response["results"] = job["results"]
177
+ if job.get("error"):
178
+ response["error"] = job["error"]
179
+
180
+ return response
181
+
182
+
183
+ @router.get("/files/{video_name}/{char_id}/{filename}")
184
+ def serve_character_file(video_name: str, char_id: str, filename: str):
185
+ file_path = TEMP_ROOT / video_name / "characters" / char_id / filename
186
+ if not file_path.exists():
187
+ raise HTTPException(status_code=404, detail="File not found")
188
+ return FileResponse(file_path)
189
+
190
+
191
+ @router.get("/audio/{video_name}/{filename}")
192
+ def serve_audio_file(video_name: str, filename: str):
193
+ file_path = TEMP_ROOT / video_name / "clips" / filename
194
+ if not file_path.exists():
195
+ raise HTTPException(status_code=404, detail="File not found")
196
+ return FileResponse(file_path)
197
+
198
+
199
+ @router.post("/load_casting")
200
+ async def load_casting(
201
+ faces_dir: str = Form("identities/faces"),
202
+ voices_dir: str = Form("identities/voices"),
203
+ db_dir: str = Form("chroma_db"),
204
+ drop_collections: bool = Form(False),
205
+ ):
206
+ client = ensure_chroma(Path(db_dir))
207
+ n_faces = build_faces_index(Path(faces_dir), client, collection_name="index_faces", drop=drop_collections)
208
+ n_voices = build_voices_index(Path(voices_dir), client, collection_name="index_voices", drop=drop_collections)
209
+ return {"ok": True, "faces": n_faces, "voices": n_voices}
210
+
211
+
212
+ @router.post("/finalize_casting")
213
+ async def finalize_casting(
214
+ payload: dict = Body(...),
215
+ ):
216
+ import shutil as _sh
217
+ from pathlib import Path as _P
218
+
219
+ video_name = payload.get("video_name")
220
+ base_dir = payload.get("base_dir")
221
+ characters = payload.get("characters", []) or []
222
+ voice_clusters = payload.get("voice_clusters", []) or []
223
+
224
+ if not video_name or not base_dir:
225
+ raise HTTPException(status_code=400, detail="Missing video_name or base_dir")
226
+
227
+ faces_out = IDENTITIES_ROOT / video_name / "faces"
228
+ voices_out = IDENTITIES_ROOT / video_name / "voices"
229
+ faces_out.mkdir(parents=True, exist_ok=True)
230
+ voices_out.mkdir(parents=True, exist_ok=True)
231
+
232
+ for ch in characters:
233
+ ch_name = (ch.get("name") or "Unknown").strip() or "Unknown"
234
+ ch_folder = ch.get("folder")
235
+ kept = ch.get("kept_files") or []
236
+ if not ch_folder or not os.path.isdir(ch_folder):
237
+ continue
238
+ dst_dir = faces_out / ch_name
239
+ dst_dir.mkdir(parents=True, exist_ok=True)
240
+ for fname in kept:
241
+ src = _P(ch_folder) / fname
242
+ if src.exists() and src.is_file():
243
+ try:
244
+ _sh.copy2(src, dst_dir / fname)
245
+ except Exception:
246
+ pass
247
+
248
+ clips_dir = _P(base_dir) / "clips"
249
+ for vc in voice_clusters:
250
+ v_name = (vc.get("name") or f"SPEAKER_{int(vc.get('label',0)):02d}").strip()
251
+ dst_dir = voices_out / v_name
252
+ dst_dir.mkdir(parents=True, exist_ok=True)
253
+ for wav in (vc.get("clips") or []):
254
+ src = clips_dir / wav
255
+ if src.exists() and src.is_file():
256
+ try:
257
+ _sh.copy2(src, dst_dir / wav)
258
+ except Exception:
259
+ pass
260
+
261
+ db_dir = IDENTITIES_ROOT / video_name / "chroma_db"
262
+ try:
263
+ client = ensure_chroma(db_dir)
264
+ n_faces = build_faces_index(
265
+ faces_out,
266
+ client,
267
+ collection_name="index_faces",
268
+ deepface_model="Facenet512",
269
+ drop=True,
270
+ )
271
+ n_voices = build_voices_index(
272
+ voices_out,
273
+ client,
274
+ collection_name="index_voices",
275
+ drop=True,
276
+ )
277
+ except Exception as e:
278
+ print(f"[finalize_casting] WARN - No se pudieron construir índices ChromaDB: {e}")
279
+ n_faces = 0
280
+ n_voices = 0
281
+
282
+ face_identities = sorted([p.name for p in faces_out.iterdir() if p.is_dir()]) if faces_out.exists() else []
283
+ voice_identities = sorted([p.name for p in voices_out.iterdir() if p.is_dir()]) if voices_out.exists() else []
284
+
285
+ casting_json = {"face_col": [], "voice_col": []}
286
+
287
+ try:
288
+ cfg = load_yaml("config.yaml")
289
+ router_llm = LLMRouter(cfg)
290
+ except Exception:
291
+ router_llm = None # type: ignore
292
+
293
+ try:
294
+ if face_identities and router_llm is not None:
295
+ factory = router_llm.client_factories.get("salamandra-vision") # type: ignore[attr-defined]
296
+ if factory is not None:
297
+ vclient = factory()
298
+ gclient = getattr(vclient, "_client", None)
299
+ else:
300
+ gclient = None
301
+
302
+ if gclient is not None:
303
+ for identity in face_identities:
304
+ id_dir = faces_out / identity
305
+ if not id_dir.is_dir():
306
+ continue
307
+ img_path = None
308
+ for ext in (".jpg", ".jpeg", ".png", ".bmp", ".webp"):
309
+ candidates = list(id_dir.glob(f"*{ext}"))
310
+ if candidates:
311
+ img_path = candidates[0]
312
+ break
313
+ if not img_path:
314
+ continue
315
+
316
+ try:
317
+ out = gclient.predict(str(img_path), api_name="/face_image_embedding")
318
+ emb = None
319
+ if isinstance(out, list):
320
+ if out and isinstance(out[0], (list, tuple, float, int)):
321
+ if out and isinstance(out[0], (list, tuple)):
322
+ emb = list(out[0])
323
+ else:
324
+ emb = list(out)
325
+ elif isinstance(out, dict) and "embedding" in out:
326
+ emb = out.get("embedding")
327
+
328
+ if not emb:
329
+ continue
330
+
331
+ casting_json["face_col"].append({
332
+ "nombre": identity,
333
+ "embedding": emb,
334
+ })
335
+ except Exception:
336
+ continue
337
+ except Exception:
338
+ casting_json["face_col"] = []
339
+
340
+ try:
341
+ if voice_identities and router_llm is not None:
342
+ factory = router_llm.client_factories.get("whisper-catalan") # type: ignore[attr-defined]
343
+ if factory is not None:
344
+ aclient = factory()
345
+ gclient = getattr(aclient, "_client", None)
346
+ else:
347
+ gclient = None
348
+
349
+ if gclient is not None:
350
+ for identity in voice_identities:
351
+ id_dir = voices_out / identity
352
+ if not id_dir.is_dir():
353
+ continue
354
+ wav_files = sorted([
355
+ p for p in id_dir.iterdir()
356
+ if p.is_file() and p.suffix.lower() in [".wav", ".flac", ".mp3"]
357
+ ])
358
+ if not wav_files:
359
+ continue
360
+
361
+ wf = wav_files[0]
362
+ try:
363
+ out = gclient.predict(str(wf), api_name="/voice_embedding")
364
+ emb = None
365
+ if isinstance(out, list):
366
+ emb = list(out)
367
+ elif isinstance(out, dict) and "embedding" in out:
368
+ emb = out.get("embedding")
369
+
370
+ if not emb:
371
+ continue
372
+
373
+ casting_json["voice_col"].append({
374
+ "nombre": identity,
375
+ "embedding": emb,
376
+ })
377
+ except Exception:
378
+ continue
379
+ except Exception:
380
+ casting_json["voice_col"] = []
381
+
382
+ return {
383
+ "ok": True,
384
+ "video_name": video_name,
385
+ "faces_dir": str(faces_out),
386
+ "voices_dir": str(voices_out),
387
+ "db_dir": str(db_dir),
388
+ "n_faces_embeddings": n_faces,
389
+ "n_voices_embeddings": n_voices,
390
+ "face_identities": face_identities,
391
+ "voice_identities": voice_identities,
392
+ "casting_json": casting_json,
393
+ }
394
+
395
+
396
+ @router.get("/files_scene/{video_name}/{scene_id}/{filename}")
397
+ def serve_scene_file(video_name: str, scene_id: str, filename: str):
398
+ file_path = TEMP_ROOT / video_name / "scenes" / scene_id / filename
399
+ if not file_path.exists():
400
+ raise HTTPException(status_code=404, detail="File not found")
401
+ return FileResponse(file_path)
402
+
403
+
404
+ @router.post("/detect_scenes")
405
+ async def detect_scenes(
406
+ video: UploadFile = File(...),
407
+ max_groups: int = Form(default=3),
408
+ min_cluster_size: int = Form(default=3),
409
+ scene_sensitivity: float = Form(default=0.5),
410
+ frame_interval_sec: float = Form(default=0.5), # mantenido por compatibilidad, no se usa
411
+ max_frames: int = Form(default=100),
412
+ ):
413
+ """Detecta escenas usando frames equiespaciados del vídeo y clustering jerárquico.
414
+
415
+ - Extrae ``max_frames`` fotogramas equiespaciados del vídeo original.
416
+ - Descarta frames negros o muy oscuros antes de construir el histograma.
417
+ - Representa cada frame por un histograma de color 3D (8x8x8) normalizado
418
+ dividiendo por la media (si el histograma es todo ceros o la media es 0,
419
+ se descarta el frame).
420
+ - Aplica ``hierarchical_cluster_with_min_size`` igual que para cares i veus.
421
+ """
422
+
423
+ video_name = Path(video.filename).stem
424
+ dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
425
+ with dst_video.open("wb") as f:
426
+ shutil.copyfileobj(video.file, f)
427
+
428
+ try:
429
+ print(f"[detect_scenes] Extrayendo frames equiespaciados de {video_name}...")
430
+
431
+ cap = cv2.VideoCapture(str(dst_video))
432
+ if not cap.isOpened():
433
+ raise RuntimeError("No se pudo abrir el vídeo para detectar escenas")
434
+
435
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
436
+ if total_frames <= 0:
437
+ cap.release()
438
+ print("[detect_scenes] total_frames <= 0")
439
+ return {"scene_clusters": []}
440
+
441
+ n_samples = max(1, min(int(max_frames), total_frames))
442
+ frame_indices = sorted(set(np.linspace(0, max(0, total_frames - 1), num=n_samples, dtype=int).tolist()))
443
+ print(f"[detect_scenes] Total frames: {total_frames}, muestreando {len(frame_indices)} frames")
444
+
445
+ # Create base directory for scenes
446
+ base = TEMP_ROOT / video_name
447
+ scenes_dir = base / "scenes"
448
+ scenes_dir.mkdir(parents=True, exist_ok=True)
449
+
450
+ # ------------------------------------------------------------------
451
+ # STEP 1: Guardar frames y construir embeddings sencillos (histogramas)
452
+ # ------------------------------------------------------------------
453
+ keyframe_paths: List[Path] = []
454
+ keyframe_infos: List[dict] = []
455
+ features: List[np.ndarray] = []
456
+
457
+ for i, frame_idx in enumerate(frame_indices):
458
+ cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
459
+ ret, frame = cap.read()
460
+ if not ret:
461
+ continue
462
+
463
+ # Filtrar frames negros o muy oscuros (umbral sobre la media de intensidad)
464
+ # Trabajamos en escala de grises para evaluar brillo global.
465
+ gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
466
+ mean_intensity = float(gray.mean())
467
+ if mean_intensity < 5.0:
468
+ # Frame negro o casi negro, lo descartamos
469
+ continue
470
+
471
+ local_keyframe = scenes_dir / f"keyframe_{frame_idx:06d}.jpg"
472
+ try:
473
+ cv2.imwrite(str(local_keyframe), frame)
474
+ except Exception as werr:
475
+ print(f"[detect_scenes] Error guardando frame {frame_idx}: {werr}")
476
+ continue
477
+
478
+ try:
479
+ # Histograma de color 8x8x8 en RGB
480
+ img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
481
+ hist = cv2.calcHist(
482
+ [img_rgb], [0, 1, 2], None,
483
+ [8, 8, 8], [0, 256, 0, 256, 0, 256]
484
+ ).astype("float32").flatten()
485
+
486
+ if not np.any(hist):
487
+ # Todo ceros, descartamos
488
+ continue
489
+
490
+ mean_val = float(hist.mean())
491
+ if mean_val <= 0.0:
492
+ # Media cero o negativa, descartamos
493
+ continue
494
+
495
+ hist /= mean_val
496
+ features.append(hist)
497
+ except Exception as fe_err:
498
+ print(f"[detect_scenes] Error calculando embedding para frame {frame_idx}: {fe_err}")
499
+ continue
500
+
501
+ keyframe_paths.append(local_keyframe)
502
+ # Como no tenemos frames_info de svision, usamos el índice de frame
503
+ info = {"start": int(frame_idx), "end": int(frame_idx) + 1}
504
+ keyframe_infos.append(info)
505
+
506
+ cap.release()
507
+
508
+ if not features or len(features) < min_cluster_size:
509
+ print(
510
+ f"[detect_scenes] No hay suficientes frames válidos para clusterizar escenas: "
511
+ f"validos={len(features)}, min_cluster_size={min_cluster_size}"
512
+ )
513
+ return {"scene_clusters": []}
514
+
515
+ Xs = np.vstack(features)
516
+
517
+ # ------------------------------------------------------------------
518
+ # STEP 2: Clustering jerárquico de escenas (k-Target + mida mínima)
519
+ # ------------------------------------------------------------------
520
+ print("[detect_scenes] Clustering jerárquico de escenas...")
521
+ scene_labels = hierarchical_cluster_with_min_size(Xs, max_groups, min_cluster_size, 0.5)
522
+ unique_labels = sorted({int(l) for l in scene_labels if int(l) >= 0})
523
+ print(f"[detect_scenes] Etiquetas de escena válidas: {unique_labels}")
524
+
525
+ # Mapear índices de keyframes a clusters
526
+ cluster_map: Dict[int, List[int]] = {}
527
+ for idx, lbl in enumerate(scene_labels):
528
+ lbl = int(lbl)
529
+ if lbl >= 0:
530
+ cluster_map.setdefault(lbl, []).append(idx)
531
+
532
+ # ------------------------------------------------------------------
533
+ # STEP 3: Construir scene_clusters con el formato esperado por el demo
534
+ # ------------------------------------------------------------------
535
+ scene_clusters: List[Dict[str, Any]] = []
536
+ for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
537
+ if not idxs:
538
+ continue
539
+
540
+ scene_id = f"scene_{ci:02d}"
541
+ scene_out_dir = scenes_dir / scene_id
542
+ scene_out_dir.mkdir(parents=True, exist_ok=True)
543
+
544
+ # Copiar todos los keyframes del cluster a la carpeta del cluster
545
+ cluster_start = None
546
+ cluster_end = None
547
+ representative_file = None
548
+
549
+ for j, k_idx in enumerate(idxs):
550
+ src = keyframe_paths[k_idx]
551
+ dst = scene_out_dir / src.name
552
+ try:
553
+ shutil.copy2(src, dst)
554
+ except Exception as cp_err:
555
+ print(f"[detect_scenes] Error copiando keyframe {src} a cluster {scene_id}: {cp_err}")
556
+ continue
557
+
558
+ if representative_file is None:
559
+ representative_file = dst
560
+
561
+ info = keyframe_infos[k_idx]
562
+ start = info.get("start", k_idx)
563
+ end = info.get("end", k_idx + 1)
564
+ cluster_start = start if cluster_start is None else min(cluster_start, start)
565
+ cluster_end = end if cluster_end is None else max(cluster_end, end)
566
+
567
+ if representative_file is None:
568
+ continue
569
+
570
+ scene_clusters.append({
571
+ "id": scene_id,
572
+ "name": f"Escena {len(scene_clusters)+1}",
573
+ "folder": str(scene_out_dir),
574
+ "image_url": f"/files_scene/{video_name}/{scene_id}/{representative_file.name}",
575
+ "start_time": float(cluster_start) if cluster_start is not None else 0.0,
576
+ "end_time": float(cluster_end) if cluster_end is not None else 0.0,
577
+ })
578
+
579
+ print(f"[detect_scenes]  {len(scene_clusters)} escenes clusteritzades")
580
+ return {"scene_clusters": scene_clusters}
581
+
582
+ except Exception as e:
583
+ print(f"[detect_scenes] Error: {e}")
584
+ import traceback
585
+ traceback.print_exc()
586
+ return {"scene_clusters": [], "error": str(e)}
587
+
588
+
589
+ def process_video_job(job_id: str):
590
+ """
591
+ Process video job in background using EXTERNAL spaces (svision, asr).
592
+
593
+ NO local GPU needed - all vision/audio processing is delegated to:
594
+ - svision: face detection + embeddings (MTCNN + FaceNet)
595
+ - asr: audio diarization + voice embeddings (pyannote + ECAPA)
596
+
597
+ Engine only does: frame extraction, clustering (math), file organization.
598
+ """
599
+ try:
600
+ job = jobs[job_id]
601
+ print(f"[{job_id}] Iniciando procesamiento (delegando a svision/asr)...")
602
+
603
+ job["status"] = JobStatus.PROCESSING
604
+
605
+ video_path = job["video_path"]
606
+ video_name = job["video_name"]
607
+ max_groups = int(job.get("max_groups", 5))
608
+ min_cluster_size = int(job.get("min_cluster_size", 3))
609
+ face_sensitivity = float(job.get("face_sensitivity", 0.5))
610
+
611
+ base = TEMP_ROOT / video_name
612
+ base.mkdir(parents=True, exist_ok=True)
613
+ print(f"[{job_id}] Directorio base: {base}")
614
+
615
+ try:
616
+ # ============================================================
617
+ # STEP 1: Extract frames from video (local, simple cv2)
618
+ # ============================================================
619
+ print(f"[{job_id}] Extrayendo frames del vídeo...")
620
+
621
+ cap = cv2.VideoCapture(video_path)
622
+ if not cap.isOpened():
623
+ raise RuntimeError("No se pudo abrir el vídeo")
624
+
625
+ fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
626
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
627
+ max_samples = job.get("max_frames", 100)
628
+
629
+ if total_frames > 0:
630
+ frame_indices = sorted(set(np.linspace(0, max(0, total_frames - 1), num=min(max_samples, max(1, total_frames)), dtype=int).tolist()))
631
+ else:
632
+ frame_indices = []
633
+
634
+ print(f"[{job_id}] Total frames: {total_frames}, FPS: {fps:.2f}, Muestreando {len(frame_indices)} frames")
635
+
636
+ # Save frames temporarily for svision processing
637
+ frames_dir = base / "frames_temp"
638
+ frames_dir.mkdir(parents=True, exist_ok=True)
639
+ faces_root = base / "faces_raw"
640
+ faces_root.mkdir(parents=True, exist_ok=True)
641
+
642
+ frame_paths: List[str] = []
643
+ for frame_idx in frame_indices:
644
+ cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
645
+ ret, frame = cap.read()
646
+ if not ret:
647
+ continue
648
+ frame_path = frames_dir / f"frame_{frame_idx:06d}.jpg"
649
+ cv2.imwrite(str(frame_path), frame)
650
+ frame_paths.append(str(frame_path))
651
+ cap.release()
652
+
653
+ print(f"[{job_id}] ✓ {len(frame_paths)} frames extraídos")
654
+
655
+ # ============================================================
656
+ # STEP 2: Send frames to SVISION for face detection + embeddings
657
+ # ============================================================
658
+ print(f"[{job_id}] Enviando frames a svision para detección de caras...")
659
+
660
+ embeddings: List[List[float]] = []
661
+ crops_meta: List[dict] = []
662
+ saved_count = 0
663
+ frames_with_faces = 0
664
+
665
+ for i, frame_path in enumerate(frame_paths):
666
+ frame_idx = frame_indices[i] if i < len(frame_indices) else i
667
+ try:
668
+ # Call svision to get faces + embeddings
669
+ faces = svision_client.get_face_embeddings_from_image(frame_path)
670
+
671
+ if faces:
672
+ frames_with_faces += 1
673
+ for face_data in faces:
674
+ emb = face_data.get("embedding", [])
675
+ if not emb:
676
+ continue
677
+
678
+ # Normalize embedding
679
+ emb = np.array(emb, dtype=float)
680
+ emb = emb / (np.linalg.norm(emb) + 1e-9)
681
+ embeddings.append(emb.tolist())
682
+
683
+ # Save face crop if provided by svision
684
+ crop_path = face_data.get("face_crop_path")
685
+ fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
686
+ local_crop_path = faces_root / fn
687
+
688
+ crop_saved = False
689
+ if crop_path:
690
+ # Handle remote URLs from svision (Gradio)
691
+ if isinstance(crop_path, str) and crop_path.startswith("http"):
692
+ try:
693
+ import requests
694
+ resp = requests.get(crop_path, timeout=30)
695
+ if resp.status_code == 200:
696
+ with open(local_crop_path, "wb") as f:
697
+ f.write(resp.content)
698
+ crop_saved = True
699
+ except Exception as dl_err:
700
+ print(f"[{job_id}] Error descargando crop: {dl_err}")
701
+ # Handle local paths
702
+ elif isinstance(crop_path, str) and os.path.exists(crop_path):
703
+ shutil.copy2(crop_path, local_crop_path)
704
+ crop_saved = True
705
+
706
+ if not crop_saved:
707
+ # If no crop from svision, use original frame
708
+ shutil.copy2(frame_path, local_crop_path)
709
+
710
+ crops_meta.append({
711
+ "file": fn,
712
+ "frame": frame_idx,
713
+ "index": face_data.get("index", saved_count),
714
+ })
715
+ saved_count += 1
716
+
717
+ except Exception as e:
718
+ print(f"[{job_id}] Error procesando frame {frame_idx}: {e}")
719
+ continue
720
+
721
+ print(f"[{job_id}] Frames con caras: {frames_with_faces}/{len(frame_paths)}")
722
+ print(f"[{job_id}] ✓ Caras detectadas: {len(embeddings)}")
723
+
724
+ # ============================================================
725
+ # STEP 3: Clustering (local, only math - no GPU)
726
+ # ============================================================
727
+ if embeddings:
728
+ print(f"[{job_id}] Clustering jerárquico...")
729
+ Xf = np.array(embeddings)
730
+ labels = hierarchical_cluster_with_min_size(Xf, max_groups, min_cluster_size, face_sensitivity).tolist()
731
+ n_clusters = len(set([l for l in labels if l >= 0]))
732
+ print(f"[{job_id}] ✓ Clustering: {n_clusters} clusters")
733
+ else:
734
+ labels = []
735
+
736
+ # ============================================================
737
+ # STEP 4: Organize faces into character folders
738
+ # ============================================================
739
+ characters: List[Dict[str, Any]] = []
740
+ cluster_map: Dict[int, List[int]] = {}
741
+ for idx, lbl in enumerate(labels):
742
+ if isinstance(lbl, int) and lbl >= 0:
743
+ cluster_map.setdefault(lbl, []).append(idx)
744
+
745
+ chars_dir = base / "characters"
746
+ chars_dir.mkdir(parents=True, exist_ok=True)
747
+
748
+ print(f"[{job_id}] cluster_map: {cluster_map}")
749
+ print(f"[{job_id}] crops_meta count: {len(crops_meta)}")
750
+ print(f"[{job_id}] faces_root: {faces_root}, exists: {faces_root.exists()}")
751
+ if faces_root.exists():
752
+ existing_files = list(faces_root.glob("*"))
753
+ print(f"[{job_id}] Files in faces_root: {len(existing_files)}")
754
+ for ef in existing_files[:5]:
755
+ print(f"[{job_id}] - {ef.name}")
756
+
757
+ for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
758
+ char_id = f"char_{ci:02d}"
759
+ print(f"[{job_id}] Processing cluster {char_id} with {len(idxs)} indices: {idxs[:5]}...")
760
+
761
+ if not idxs:
762
+ continue
763
+
764
+ out_dir = chars_dir / char_id
765
+ out_dir.mkdir(parents=True, exist_ok=True)
766
+
767
+ # Select faces to show (half + 1)
768
+ total_faces = len(idxs)
769
+ max_faces_to_show = (total_faces // 2) + 1
770
+ selected_idxs = idxs[:max_faces_to_show]
771
+
772
+ files: List[str] = []
773
+ file_urls: List[str] = []
774
+
775
+ for j in selected_idxs:
776
+ if j >= len(crops_meta):
777
+ print(f"[{job_id}] Index {j} out of range (crops_meta len={len(crops_meta)})")
778
+ continue
779
+ meta = crops_meta[j]
780
+ fname = meta.get("file")
781
+ if not fname:
782
+ print(f"[{job_id}] No filename in meta for index {j}")
783
+ continue
784
+
785
+ src = faces_root / fname
786
+ dst = out_dir / fname
787
+ try:
788
+ if src.exists():
789
+ shutil.copy2(src, dst)
790
+ files.append(fname)
791
+ file_urls.append(f"/files/{video_name}/{char_id}/{fname}")
792
+ else:
793
+ print(f"[{job_id}] Source file not found: {src}")
794
+ except Exception as cp_err:
795
+ print(f"[{job_id}] Error copying {fname}: {cp_err}")
796
+
797
+ # Create representative image
798
+ rep = files[0] if files else None
799
+ if rep:
800
+ try:
801
+ shutil.copy2(out_dir / rep, out_dir / "representative.jpg")
802
+ except Exception:
803
+ pass
804
+
805
+ cluster_number = ci + 1
806
+ character_name = f"Cluster {cluster_number}"
807
+
808
+ characters.append({
809
+ "id": char_id,
810
+ "name": character_name,
811
+ "folder": str(out_dir),
812
+ "num_faces": len(files),
813
+ "total_faces_detected": total_faces,
814
+ "image_url": f"/files/{video_name}/{char_id}/representative.jpg" if rep else "",
815
+ "face_files": file_urls,
816
+ })
817
+ print(f"[{job_id}] ✓ Cluster {char_id}: {len(files)} caras")
818
+
819
+ # Cleanup temp frames
820
+ try:
821
+ shutil.rmtree(frames_dir)
822
+ except Exception:
823
+ pass
824
+
825
+ print(f"[{job_id}] ✓ Total: {len(characters)} personajes")
826
+
827
+ # ============================================================
828
+ # STEP 5: Audio diarization + voice embeddings using ASR space
829
+ # ============================================================
830
+ voice_max_groups = int(job.get("voice_max_groups", 3))
831
+ voice_min_cluster_size = int(job.get("voice_min_cluster_size", 3))
832
+ voice_sensitivity = float(job.get("voice_sensitivity", 0.5))
833
+
834
+ audio_segments: List[Dict[str, Any]] = []
835
+ voice_labels: List[int] = []
836
+ voice_embeddings: List[List[float]] = []
837
+ diarization_info: Dict[str, Any] = {}
838
+
839
+ print(f"[{job_id}] Procesando audio con ASR space...")
840
+ try:
841
+ # Extract audio and diarize
842
+ diar_result = asr_client.extract_audio_and_diarize(video_path)
843
+ clips = diar_result.get("clips", [])
844
+ segments = diar_result.get("segments", [])
845
+
846
+ print(f"[{job_id}] Diarización: {len(clips)} clips, {len(segments)} segmentos")
847
+
848
+ # Save clips locally
849
+ clips_dir = base / "clips"
850
+ clips_dir.mkdir(parents=True, exist_ok=True)
851
+
852
+ for i, clip_info in enumerate(clips if isinstance(clips, list) else []):
853
+ clip_path = clip_info if isinstance(clip_info, str) else clip_info.get("path") if isinstance(clip_info, dict) else None
854
+ if not clip_path:
855
+ continue
856
+
857
+ # Download or copy clip
858
+ local_clip = clips_dir / f"segment_{i:03d}.wav"
859
+ try:
860
+ if isinstance(clip_path, str) and clip_path.startswith("http"):
861
+ import requests
862
+ resp = requests.get(clip_path, timeout=30)
863
+ if resp.status_code == 200:
864
+ with open(local_clip, "wb") as f:
865
+ f.write(resp.content)
866
+ elif isinstance(clip_path, str) and os.path.exists(clip_path):
867
+ shutil.copy2(clip_path, local_clip)
868
+ except Exception as dl_err:
869
+ print(f"[{job_id}] Error guardando clip {i}: {dl_err}")
870
+ continue
871
+
872
+ # Get segment info
873
+ seg_info = segments[i] if i < len(segments) else {}
874
+ speaker = seg_info.get("speaker", f"SPEAKER_{i:02d}")
875
+
876
+ # Get voice embedding for this clip
877
+ emb = asr_client.get_voice_embedding(str(local_clip))
878
+ if emb:
879
+ voice_embeddings.append(emb)
880
+
881
+ audio_segments.append({
882
+ "index": i,
883
+ "clip_path": str(local_clip),
884
+ "clip_url": f"/audio/{video_name}/segment_{i:03d}.wav",
885
+ "speaker": speaker,
886
+ "start": seg_info.get("start", 0),
887
+ "end": seg_info.get("end", 0),
888
+ })
889
+
890
+ print(f"[{job_id}] \u2713 {len(audio_segments)} segmentos de audio procesados")
891
+
892
+ # Cluster voice embeddings
893
+ if voice_embeddings:
894
+ print(f"[{job_id}] Clustering jer\u00e1rquico de voz...")
895
+ print(f"[{job_id}] - voice_embeddings: {len(voice_embeddings)} embeddings")
896
+ print(f"[{job_id}] - par\u00e1metros: voice_max_groups={voice_max_groups}, voice_min_cluster_size={voice_min_cluster_size}")
897
+ Xv = np.array(voice_embeddings)
898
+ print(f"[{job_id}] - shape Xv: {Xv.shape}")
899
+ voice_labels = hierarchical_cluster_with_min_size(
900
+ Xv, voice_max_groups, voice_min_cluster_size, voice_sensitivity
901
+ ).tolist()
902
+ n_voice_clusters = len(set([l for l in voice_labels if l >= 0]))
903
+ print(f"[{job_id}] - voice_labels: {voice_labels}")
904
+ print(f"[{job_id}] \u2713 Clustering de voz: {n_voice_clusters} clusters de {len(voice_embeddings)} muestras")
905
+ else:
906
+ print(f"[{job_id}] \u26a0\ufe0f No hay voice_embeddings para clustering")
907
+
908
+ diarization_info = {
909
+ "num_segments": len(audio_segments),
910
+ "num_voice_clusters": len(set([l for l in voice_labels if l >= 0])) if voice_labels else 0,
911
+ }
912
+
913
+ except Exception as audio_err:
914
+ print(f"[{job_id}] Error en procesamiento de audio: {audio_err}")
915
+ import traceback
916
+ traceback.print_exc()
917
+
918
+ job["results"] = {
919
+ "characters": characters,
920
+ "face_labels": labels,
921
+ "audio_segments": audio_segments,
922
+ "voice_labels": voice_labels,
923
+ "diarization_info": diarization_info,
924
+ "video_name": video_name,
925
+ "base_dir": str(base),
926
+ }
927
+ job["status"] = JobStatus.DONE
928
+ print(f"[{job_id}] ✓ Procesamiento completado")
929
+
930
+ except Exception as proc_error:
931
+ print(f"[{job_id}] Error en procesamiento: {proc_error}")
932
+ import traceback
933
+ traceback.print_exc()
934
+ job["results"] = {
935
+ "characters": [], "face_labels": [],
936
+ "audio_segments": [], "voice_labels": [], "diarization_info": {},
937
+ "video_name": video_name, "base_dir": str(base)
938
+ }
939
+ job["status"] = JobStatus.DONE
940
+
941
+ except Exception as e:
942
+ print(f"[{job_id}] Error general: {e}")
943
+ import traceback
944
+ traceback.print_exc()
945
+ job["status"] = JobStatus.FAILED
946
+ job["error"] = str(e)