VeuReu commited on
Commit
6526378
·
verified ·
1 Parent(s): 557665b

Upload api.py

Browse files
Files changed (1) hide show
  1. api.py +299 -689
api.py CHANGED
@@ -75,22 +75,21 @@ app.include_router(embeddings_router)
75
  app.include_router(pending_videos_router)
76
 
77
  def describe_image_with_svision(image_path: str, is_face: bool = True) -> tuple[str, str]:
78
- """
79
- Llama al space svision para describir una imagen (usado en generación de AD).
80
-
81
  Args:
82
- image_path: Ruta absoluta a la imagen
83
- is_face: True si es una cara, False si es una escena
84
-
85
  Returns:
86
- tuple (descripción_completa, nombre_abreviado)
87
  """
88
  try:
89
  from pathlib import Path as _P
90
  import yaml
91
  from llm_router import LLMRouter
92
 
93
- # Cargar configuración
94
  config_path = _P(__file__).parent / "config.yaml"
95
  if not config_path.exists():
96
  print(f"[svision] Config no encontrado: {config_path}")
@@ -101,7 +100,7 @@ def describe_image_with_svision(image_path: str, is_face: bool = True) -> tuple[
101
 
102
  router = LLMRouter(cfg)
103
 
104
- # Contexto diferente para caras vs escenas
105
  if is_face:
106
  context = {
107
  "task": "describe_person",
@@ -115,7 +114,7 @@ def describe_image_with_svision(image_path: str, is_face: bool = True) -> tuple[
115
  "max_tokens": 128
116
  }
117
 
118
- # Llamar a svision
119
  descriptions = router.vision_describe([str(image_path)], context=context, model="salamandra-vision")
120
  full_description = descriptions[0] if descriptions else ""
121
 
@@ -133,68 +132,69 @@ def describe_image_with_svision(image_path: str, is_face: bool = True) -> tuple[
133
  return ("", "")
134
 
135
  def normalize_face_lighting(image):
136
- """
137
- Normaliza el brillo de una imagen de cara usando técnicas combinadas:
138
- 1. CLAHE para ecualización adaptativa
139
- 2. Normalización de rango para homogeneizar brillo general
140
-
141
- Esto reduce el impacto de diferentes condiciones de iluminación en los embeddings
142
- y en la visualización de las imágenes.
143
-
144
  Args:
145
- image: Imagen BGR (OpenCV format)
146
-
147
  Returns:
148
- Imagen normalizada en el mismo formato
149
  """
150
  import cv2
151
  import numpy as np
152
 
153
- # Paso 1: Convertir a LAB color space (más robusto para iluminación)
154
  lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
155
  l, a, b = cv2.split(lab)
156
 
157
- # Paso 2: Aplicar CLAHE (Contrast Limited Adaptive Histogram Equalization) al canal L
158
- # Usar clipLimit más alto para normalización más agresiva
159
  clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
160
  l_clahe = clahe.apply(l)
161
 
162
- # Paso 3: Normalizar el rango del canal L para asegurar distribución uniforme
163
- # Esto garantiza que todas las imágenes tengan un rango de brillo similar
164
  l_min, l_max = l_clahe.min(), l_clahe.max()
165
  if l_max > l_min:
166
- # Estirar el histograma al rango completo [0, 255]
167
  l_normalized = ((l_clahe - l_min) * 255.0 / (l_max - l_min)).astype(np.uint8)
168
  else:
169
  l_normalized = l_clahe
170
 
171
- # Paso 4: Aplicar suavizado suave para reducir ruido introducido por la normalización
172
  l_normalized = cv2.GaussianBlur(l_normalized, (3, 3), 0)
173
 
174
- # Recombinar canales
175
  lab_normalized = cv2.merge([l_normalized, a, b])
176
 
177
- # Convertir de vuelta a BGR
178
  normalized = cv2.cvtColor(lab_normalized, cv2.COLOR_LAB2BGR)
179
  return normalized
180
 
181
  def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
182
- """
183
- Clustering jerárquico con silhouette score para encontrar automáticamente el mejor número de clusters.
184
- Selecciona automáticamente el mejor número de clusters (hasta max_groups) usando silhouette score.
185
- Filtra clusters con menos de min_cluster_size muestras (marcados como -1/ruido).
186
-
 
187
  Args:
188
- X: Array de embeddings (N, D)
189
- max_groups: Número máximo de clusters a formar
190
- min_cluster_size: Tamaño mínimo de cluster válido
191
- sensitivity: Sensibilidad del clustering (0.0-1.0)
192
- - 0.0 = muy agresivo (menos clusters)
193
- - 0.5 = balanceado (recomendado)
194
- - 1.0 = muy permisivo (más clusters)
195
-
196
  Returns:
197
- Array de labels (N,) donde -1 indica ruido
198
  """
199
  import numpy as np
200
  from scipy.cluster.hierarchy import linkage, fcluster
@@ -205,36 +205,36 @@ def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int
205
  return np.array([])
206
 
207
  if len(X) < min_cluster_size:
208
- # Si hay menos muestras que el mínimo, todo es ruido
209
  return np.full(len(X), -1, dtype=int)
210
 
211
- # Linkage usando average linkage (más flexible que ward, menos sensible a outliers)
212
- # Esto ayuda a agrupar mejor la misma persona con diferentes ángulos/expresiones
213
  Z = linkage(X, method='average', metric='cosine') # Cosine similarity para embeddings
214
 
215
- # Encontrar el número óptimo de clusters usando silhouette score
216
  best_n_clusters = 2
217
  best_score = -1
218
 
219
- # Probar diferentes números de clusters (de 2 a max_groups)
220
- max_to_try = min(max_groups, len(X) - 1) # No puede haber más clusters que muestras
221
 
222
  if max_to_try >= 2:
223
  for n_clusters in range(2, max_to_try + 1):
224
  trial_labels = fcluster(Z, t=n_clusters, criterion='maxclust') - 1
225
-
226
- # Calcular cuántos clusters válidos tendríamos después del filtrado
227
  trial_counts = Counter(trial_labels)
228
  valid_clusters = sum(1 for count in trial_counts.values() if count >= min_cluster_size)
229
 
230
- # Solo evaluar si hay al menos 2 clusters válidos
231
  if valid_clusters >= 2:
232
  try:
233
  score = silhouette_score(X, trial_labels, metric='cosine')
234
- # Penalización dinámica basada en sensibilidad:
235
- # - sensitivity=0.0 → penalty=0.14 (muy agresivo, menos clusters)
236
- # - sensitivity=0.5 → penalty=0.07 (balanceado, recomendado)
237
- # - sensitivity=1.0 → penalty=0.01 (permisivo, más clusters)
238
  penalty = 0.14 - (sensitivity * 0.13)
239
  adjusted_score = score - (n_clusters * penalty)
240
 
@@ -244,22 +244,22 @@ def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int
244
  except:
245
  pass # Si falla el cálculo, ignorar esta configuración
246
 
247
- # Usar el número óptimo de clusters encontrado
248
  penalty = 0.14 - (sensitivity * 0.13)
249
  print(f"Clustering óptimo: {best_n_clusters} clusters (de máximo {max_groups}), sensitivity={sensitivity:.2f}, penalty={penalty:.3f}, silhouette={best_score:.3f}")
250
  labels = fcluster(Z, t=best_n_clusters, criterion='maxclust')
251
 
252
- # fcluster devuelve labels 1-indexed, convertir a 0-indexed
253
  labels = labels - 1
254
 
255
- # Filtrar clusters pequeños
256
  label_counts = Counter(labels)
257
  filtered_labels = []
258
  for lbl in labels:
259
  if label_counts[lbl] >= min_cluster_size:
260
  filtered_labels.append(lbl)
261
  else:
262
- filtered_labels.append(-1) # Ruido
263
 
264
  return np.array(filtered_labels, dtype=int)
265
 
@@ -292,20 +292,22 @@ async def create_initial_casting(
292
  voice_sensitivity: float = Form(default=0.5),
293
  max_frames: int = Form(default=100),
294
  ):
 
 
 
 
 
295
  """
296
- Crea un job para procesar el vídeo de forma asíncrona usando clustering jerárquico.
297
- Devuelve un job_id inmediatamente.
298
- """
299
- # Guardar vídeo en carpeta de datos
300
  video_name = Path(video.filename).stem
301
  dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
302
  with dst_video.open("wb") as f:
303
  shutil.copyfileobj(video.file, f)
304
 
305
- # Crear job_id único
306
  job_id = str(uuid.uuid4())
307
 
308
- # Inicializar el job
309
  jobs[job_id] = {
310
  "id": job_id,
311
  "status": JobStatus.QUEUED,
@@ -325,7 +327,7 @@ async def create_initial_casting(
325
 
326
  print(f"[{job_id}] Job creado para vídeo: {video_name}")
327
 
328
- # Iniciar procesamiento en background
329
  background_tasks.add_task(process_video_job, job_id)
330
 
331
  # Devolver job_id inmediatamente
@@ -566,70 +568,77 @@ def process_video_job(job_id: str):
566
  # Construir carpetas por clúster con validación DeepFace
567
  from face_classifier import validate_and_classify_face, FACE_CONFIDENCE_THRESHOLD
568
 
569
- characters_validated = []
570
  cluster_map: dict[int, list[int]] = {}
571
- for i, lbl in enumerate(labels):
 
572
  if isinstance(lbl, int) and lbl >= 0:
573
- cluster_map.setdefault(lbl, []).append(i)
574
 
575
  chars_dir = base / "characters"
576
  chars_dir.mkdir(parents=True, exist_ok=True)
577
  import shutil as _sh
578
-
579
  original_cluster_count = len(cluster_map)
580
  print(f"[{job_id}] Procesando {original_cluster_count} clusters detectados...")
581
-
582
  for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
583
  char_id = f"char_{ci:02d}"
584
-
585
- # PASO 1: Ordenar caras por área del bounding box (mejor calidad)
586
- face_detections = []
587
  for j in idxs:
588
  meta = crops_meta[j]
 
 
 
589
  box = meta.get("box", [0, 0, 0, 0])
 
590
  if len(box) >= 4:
591
  top, right, bottom, left = box
592
- w = abs(right - left)
593
- h = abs(bottom - top)
594
- area_score = w * h
595
- else:
596
- area_score = 0
597
-
598
- face_detections.append({
599
- 'index': j,
600
- 'score': area_score,
601
- 'file': meta['file'],
602
- 'box': box
603
  })
604
-
605
- # Ordenar por score descendente
606
- face_detections_sorted = sorted(
607
- face_detections,
608
- key=lambda x: x['score'],
609
- reverse=True
610
- )
611
-
612
- if not face_detections_sorted:
613
  print(f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: sense deteccions, eliminant")
614
  continue
615
-
616
- # PASO 2: Validar SOLO la mejor cara del cluster
617
- best_face = face_detections_sorted[0]
618
- best_face_path = faces_root / best_face['file']
619
-
620
  print(f"[{job_id}] [VALIDATION] Cluster {char_id}: validant millor cara (bbox_area={best_face['score']:.0f}px²)")
621
  print(f"[{job_id}] [VALIDATION] Cluster {char_id}: millor cara path={best_face_path}")
622
  print(f"[{job_id}] [VALIDATION] ▶▶▶ CRIDANT validate_and_classify_face() ◀◀◀")
623
-
624
  validation = validate_and_classify_face(str(best_face_path))
625
-
626
  print(f"[{job_id}] [VALIDATION] ▶▶▶ validate_and_classify_face() RETORNAT ◀◀◀")
627
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
628
  if not validation:
629
  print(f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: error en validació DeepFace, eliminant cluster")
630
  continue
631
-
632
- # Mostrar resultados detallados de DeepFace
633
  print(f"[{job_id}] [DEEPFACE RESULT] Cluster {char_id}:")
634
  print(f"[{job_id}] - is_valid_face: {validation['is_valid_face']}")
635
  print(f"[{job_id}] - face_confidence: {validation['face_confidence']:.3f}")
@@ -638,36 +647,34 @@ def process_video_job(job_id: str):
638
  print(f"[{job_id}] - gender_diff: {abs(validation['man_prob'] - validation['woman_prob']):.3f}")
639
  print(f"[{job_id}] - gender_assigned: {validation['gender']}")
640
  print(f"[{job_id}] - gender_confidence: {validation['gender_confidence']:.3f}")
641
-
642
- # PASO 3: Verificar si és una cara vàlida
643
- if not validation['is_valid_face'] or validation['face_confidence'] < FACE_CONFIDENCE_THRESHOLD:
644
- print(f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: NO ES UNA CARA VÁLIDA (face_confidence={validation['face_confidence']:.3f} < threshold={FACE_CONFIDENCE_THRESHOLD}), eliminant tot el clúster")
 
 
645
  continue
646
-
647
- # PASO 4: És una cara vàlida! Crear carpeta
648
  out_dir = chars_dir / char_id
649
  out_dir.mkdir(parents=True, exist_ok=True)
650
-
651
- # PASO 5: Limitar caras a mostrar (primera meitat + 1)
652
- total_faces = len(face_detections_sorted)
653
  max_faces_to_show = (total_faces // 2) + 1
654
- face_detections_limited = face_detections_sorted[:max_faces_to_show]
655
-
656
- # Copiar solo las caras limitadas
657
- files = []
658
- face_files_urls = []
659
- for k, face_det in enumerate(face_detections_limited):
660
- fname = face_det['file']
661
  src = faces_root / fname
662
  dst = out_dir / fname
663
  try:
664
  _sh.copy2(src, dst)
665
  files.append(fname)
666
- face_files_urls.append(f"/files/{video_name}/{char_id}/{fname}")
667
  except Exception:
668
  pass
669
-
670
- # Imagen representativa (la mejor)
671
  rep = files[0] if files else None
672
  if rep:
673
  rep_src = out_dir / rep
@@ -676,535 +683,180 @@ def process_video_job(job_id: str):
676
  _sh.copy2(rep_src, rep_dst)
677
  except Exception:
678
  pass
679
-
680
- # PASO 6: Generar nombre de clúster
681
- cluster_number = int(char_id.split('_')[1]) + 1
682
  character_name = f"Cluster {cluster_number}"
683
- gender = validation['gender']
684
-
685
  print(f"[{job_id}] [NAME GENERATION] Cluster {char_id}:")
686
  print(f"[{job_id}] - Gender detectado: {gender}")
687
  print(f"[{job_id}] - Nombre asignado: {character_name}")
688
  print(f"[{job_id}] - Seed usado: {char_id}")
689
-
690
- character_data = {
691
  "id": char_id,
692
  "name": character_name,
693
  "gender": gender,
694
- "gender_confidence": validation['gender_confidence'],
695
- "face_confidence": validation['face_confidence'],
696
- "man_prob": validation['man_prob'],
697
- "woman_prob": validation['woman_prob'],
698
  "folder": str(out_dir),
699
  "num_faces": len(files),
700
  "total_faces_detected": total_faces,
701
  "image_url": f"/files/{video_name}/{char_id}/representative.jpg" if rep else "",
702
- "face_files": face_files_urls,
703
- }
704
-
705
- characters_validated.append(character_data)
706
-
707
  print(f"[{job_id}] [VALIDATION] ✓ Cluster {char_id}: CARA VÁLIDA!")
708
  print(f"[{job_id}] Nombre: {character_name}")
709
- print(f"[{job_id}] Género: {gender} (man={validation['man_prob']:.3f}, woman={validation['woman_prob']:.3f})")
710
- print(f"[{job_id}] Confianza género: {validation['gender_confidence']:.3f}")
711
- print(f"[{job_id}] Confianza cara: {validation['face_confidence']:.3f}")
712
  print(f"[{job_id}] Caras mostradas: {len(files)}/{total_faces}")
713
  print(f"[{job_id}] Imagen representativa: {best_face_path.name}")
714
-
715
- # Estadístiques finals
716
  eliminated_count = original_cluster_count - len(characters_validated)
717
  print(f"[{job_id}] [VALIDATION] Total: {len(characters_validated)} clústers vàlids "
718
  f"(eliminats {eliminated_count} falsos positius)")
719
-
720
- characters = characters_validated
721
 
722
- # Escribir analysis.json compatible con 'originales'
723
- analysis = {
724
- "caras": [{"embeddings": e} for e in embeddings],
725
- "voices": [],
726
- "escenas": [],
727
- }
728
- analysis_path = str(base / "analysis.json")
729
- with open(analysis_path, "w", encoding="utf-8") as f:
730
- json.dump(analysis, f, ensure_ascii=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
731
 
732
- face_labels = labels
733
- num_face_embeddings = len(embeddings)
 
 
 
 
734
 
735
- print(f"[{job_id}] Personajes detectados: {len(characters)}")
736
- for char in characters:
737
- print(f"[{job_id}] - {char['name']}: {char['num_faces']} caras")
738
-
739
- # Enriquecer info de personajes con listado real de imágenes disponibles
740
- try:
741
- import glob, os
742
- for ch in characters:
743
- folder = ch.get("folder")
744
- face_files = []
745
- if folder and os.path.isdir(folder):
746
- # soportar patrones face_* y extensiones jpg/png
747
- patterns = ["face_*.jpg", "face_*.png"]
748
- files = []
749
- for pat in patterns:
750
- files.extend(glob.glob(os.path.join(folder, pat)))
751
- # si no hay face_*, tomar cualquier jpg/png para no dejar vacío
752
- if not files:
753
- files.extend(glob.glob(os.path.join(folder, "*.jpg")))
754
- files.extend(glob.glob(os.path.join(folder, "*.png")))
755
- # normalizar nombres de fichero relativos
756
- face_files = sorted({os.path.basename(p) for p in files})
757
- # Garantizar que representative.(jpg|png) esté el primero si existe
758
- for rep_name in ("representative.jpg", "representative.png"):
759
- rep_path = os.path.join(folder, rep_name)
760
- if os.path.exists(rep_path):
761
- if rep_name in face_files:
762
- face_files.remove(rep_name)
763
- face_files.insert(0, rep_name)
764
- ch["face_files"] = face_files
765
- # Ajustar num_faces si hay discrepancia
766
- if face_files:
767
- ch["num_faces"] = len(face_files)
768
- except Exception as _e:
769
- print(f"[{job_id}] WARN - No se pudo enumerar face_files: {_e}")
770
-
771
- # Procesamiento de audio: diarización, ASR y embeddings de voz
772
- try:
773
- cfg = load_yaml("config.yaml")
774
- audio_segments, srt_unmod, full_txt, diar_info, connection_logs = process_audio_for_video(video_path, base, cfg, voice_collection=None)
775
- # Loggear en consola del engine los eventos de conexión
776
- try:
777
- for ev in (connection_logs or []):
778
- msg = ev.get("message") if isinstance(ev, dict) else None
779
- if msg:
780
- print(f"[{job_id}] {msg}")
781
- except Exception:
782
- pass
783
- except Exception as e_audio:
784
- import traceback
785
- print(f"[{job_id}] WARN - Audio pipeline failed: {e_audio}\n{traceback.format_exc()}")
786
- audio_segments, srt_unmod, full_txt = [], None, ""
787
- diar_info = {"diarization_ok": False, "error": str(e_audio)}
788
- connection_logs = []
789
-
790
- # Fallback: si no hay segmentos de audio, crear uno mínimo del audio completo
791
- if not audio_segments:
792
- try:
793
- from pathlib import Path as _P
794
- from pydub import AudioSegment as _AS
795
- wav_out = extract_audio_ffmpeg(video_path, base / f"{_P(video_path).stem}.wav", sr=16000)
796
- audio = _AS.from_wav(wav_out)
797
- clips_dir = base / "clips"
798
- clips_dir.mkdir(parents=True, exist_ok=True)
799
- cp = clips_dir / "segment_000.wav"
800
- audio.export(cp, format="wav")
801
- emb_list = embed_voice_segments([str(cp)])
802
- audio_segments = [{
803
- "segment": 0,
804
- "start": 0.0,
805
- "end": float(len(audio) / 1000.0),
806
- "speaker": "SPEAKER_00",
807
- "text": "",
808
- "voice_embedding": emb_list[0] if emb_list else [],
809
- "clip_path": str(cp),
810
- "lang": "ca",
811
- "lang_prob": 1.0,
812
- }]
813
- except Exception as _efb:
814
- print(f"[{job_id}] WARN - Audio minimal fallback failed: {_efb}")
815
-
816
- # Clustering jerárquico de voces sobre embeddings válidos
817
- import numpy as np
818
- voice_embeddings = [seg.get("voice_embedding") for seg in audio_segments if seg.get("voice_embedding")]
819
- if voice_embeddings:
820
- try:
821
- Xv = np.array(voice_embeddings)
822
- v_labels = hierarchical_cluster_with_min_size(Xv, v_max_groups, v_min_cluster, voice_sensitivity).tolist()
823
- print(f"[{job_id}] Clustering jerárquico de voz: {len(set([l for l in v_labels if l >= 0]))} clusters")
824
- except Exception as _e:
825
- print(f"[{job_id}] WARN - Voice clustering failed: {_e}")
826
- v_labels = []
827
- else:
828
- v_labels = []
829
 
830
- # Guardar resultados primero y luego marcar como completado (evita carreras)
831
  job["results"] = {
832
- "characters": characters,
833
- "num_characters": len(characters),
834
- "analysis_path": analysis_path,
835
  "base_dir": str(base),
836
- "face_labels": face_labels,
837
- "num_face_embeddings": num_face_embeddings,
838
- "audio_segments": audio_segments,
839
- "srt_unmodified": srt_unmod,
840
- "full_transcription": full_txt,
841
- "voice_labels": v_labels,
842
- "num_voice_embeddings": len(voice_embeddings),
843
- "diarization_info": diar_info,
844
  }
845
  job["status"] = JobStatus.DONE
846
-
847
- # Log resumido sin embeddings
848
- print(f"[{job_id}] ✓ Resultados guardados:")
849
- print(f"[{job_id}] - Personatges: {len(characters)}")
850
- print(f"[{job_id}] - Segments d'àudio: {len(audio_segments)}")
851
- print(f"[{job_id}] - Face embeddings: {num_face_embeddings}")
852
- print(f"[{job_id}] - Voice embeddings: {len(voice_embeddings)}")
853
-
854
- except Exception as e_detect:
855
- # Si falla la detección, intentar modo fallback
856
- import traceback
857
- print(f"[{job_id}] ✗ Error en detección: {e_detect}")
858
- print(f"[{job_id}] Traceback: {traceback.format_exc()}")
859
- print(f"[{job_id}] Usando modo fallback (carpetas vacías)")
860
-
861
- # Crear carpetas básicas como fallback
862
- for sub in ("sources", "faces", "voices", "backgrounds"):
863
- (base / sub).mkdir(parents=True, exist_ok=True)
864
-
865
- # Guardar resultados de fallback y luego marcar como completado
866
- job["results"] = {
867
- "characters": [],
868
- "num_characters": 0,
869
- "temp_dirs": {
870
- "sources": str(base / "sources"),
871
- "faces": str(base / "faces"),
872
- "voices": str(base / "voices"),
873
- "backgrounds": str(base / "backgrounds"),
874
- },
875
- "warning": f"Detección falló, usando modo fallback: {str(e_detect)}"
876
- }
877
- job["status"] = JobStatus.DONE
878
-
879
- print(f"[{job_id}] ✓ Job completado exitosamente")
880
-
881
- except Exception as e:
882
- import traceback
883
- print(f"[{job_id}] ✗ Error inesperado: {e}")
884
- try:
885
- job = jobs.get(job_id)
886
- if job is not None:
887
- job["status"] = JobStatus.FAILED
888
- job["error"] = str(e)
889
- except Exception:
890
- pass
891
- print(f"[{job_id}] Traceback: {traceback.format_exc()}")
892
 
893
- @app.post("/generate_audiodescription")
894
- async def generate_audiodescription(video: UploadFile = File(...)):
895
- try:
896
- import uuid
897
- job_id = str(uuid.uuid4())
898
- vid_name = video.filename or f"video_{job_id}.mp4"
899
- base = TEMP_ROOT / Path(vid_name).stem
900
 
901
- base.mkdir(parents=True, exist_ok=True)
902
- # Save temp mp4
903
- video_path = base / vid_name
904
- with open(video_path, "wb") as f:
905
- f.write(await video.read())
906
-
907
- # Run MVP pipeline
908
- result = ad_generate(str(video_path), base)
909
-
910
- return {
911
- "status": "done",
912
- "results": {
913
- "une_srt": result.get("une_srt", ""),
914
- "free_text": result.get("free_text", ""),
915
- "artifacts": result.get("artifacts", {}),
916
- },
917
- }
918
  except Exception as e:
 
919
  import traceback
920
- print(f"/generate_audiodescription error: {e}\n{traceback.format_exc()}")
921
- raise HTTPException(status_code=500, detail=str(e))
922
-
923
- @app.post("/load_casting")
924
- async def load_casting(
925
- faces_dir: str = Form("identities/faces"),
926
- voices_dir: str = Form("identities/voices"),
927
- db_dir: str = Form("chroma_db"),
928
- drop_collections: bool = Form(False),
929
- ):
930
- client = ensure_chroma(Path(db_dir))
931
- n_faces = build_faces_index(Path(faces_dir), client, collection_name="index_faces", drop=drop_collections)
932
- n_voices = build_voices_index(Path(voices_dir), client, collection_name="index_voices", drop=drop_collections)
933
- return {"ok": True, "faces": n_faces, "voices": n_voices}
934
-
935
- @app.post("/finalize_casting")
936
- async def finalize_casting(
937
- payload: dict = Body(...),
938
- ):
939
- """
940
- Consolidate selected face and voice clusters into identities directories and build indices.
941
- Expected payload:
942
- {
943
- "video_name": str,
944
- "base_dir": str, # engine temp base for this video
945
- "characters": [
946
- {"id": "char1", "name": "Nom", "folder": "/tmp/temp/<video>/char1", "kept_files": ["representative.jpg", ...], "description": "..."}, ...
947
- ],
948
- "voice_clusters": [
949
- {"label": 0, "name": "SPEAKER_00", "clips": ["segment_000.wav", ...]}, ...
950
- ]
951
- }
952
- """
953
- import os
954
- import shutil
955
- from pathlib import Path as _P
956
-
957
- video_name = payload.get("video_name")
958
- base_dir = payload.get("base_dir")
959
- characters = payload.get("characters", []) or []
960
- voice_clusters = payload.get("voice_clusters", []) or []
961
-
962
- if not video_name or not base_dir:
963
- raise HTTPException(status_code=400, detail="Missing video_name or base_dir")
964
-
965
- faces_out = IDENTITIES_ROOT / video_name / "faces"
966
- voices_out = IDENTITIES_ROOT / video_name / "voices"
967
- faces_out.mkdir(parents=True, exist_ok=True)
968
- voices_out.mkdir(parents=True, exist_ok=True)
969
-
970
- # Consolidate faces per character name (merge same names)
971
- for ch in characters:
972
- ch_name = (ch.get("name") or "Unknown").strip() or "Unknown"
973
- ch_folder = ch.get("folder")
974
- kept = ch.get("kept_files") or []
975
- if not ch_folder or not os.path.isdir(ch_folder):
976
- continue
977
- dst_dir = faces_out / ch_name
978
- dst_dir.mkdir(parents=True, exist_ok=True)
979
- for fname in kept:
980
- src = _P(ch_folder) / fname
981
- if src.exists() and src.is_file():
982
- try:
983
- shutil.copy2(src, dst_dir / fname)
984
- except Exception:
985
- pass
986
-
987
- # Consolidate voices per cluster name
988
- clips_dir = _P(base_dir) / "clips"
989
- for vc in voice_clusters:
990
- v_name = (vc.get("name") or f"SPEAKER_{int(vc.get('label',0)):02d}").strip()
991
- dst_dir = voices_out / v_name
992
- dst_dir.mkdir(parents=True, exist_ok=True)
993
- for wav in (vc.get("clips") or []):
994
- src = clips_dir / wav
995
- if src.exists() and src.is_file():
996
- try:
997
- shutil.copy2(src, dst_dir / wav)
998
- except Exception:
999
- pass
1000
-
1001
- # Build indices using casting_loader helpers (best-effort)
1002
- db_dir = IDENTITIES_ROOT / video_name / "chroma_db"
1003
- try:
1004
- client = ensure_chroma(db_dir)
1005
- n_faces = build_faces_index(
1006
- faces_out,
1007
- client,
1008
- collection_name="index_faces",
1009
- deepface_model='Facenet512',
1010
- drop=True,
1011
- )
1012
- n_voices = build_voices_index(
1013
- voices_out,
1014
- client,
1015
- collection_name="index_voices",
1016
- drop=True,
1017
- )
1018
- except Exception as e:
1019
- # Si ChromaDB no está disponible o falla la indexación, no romper el flujo
1020
- print(f"[finalize_casting] WARN - No se pudieron construir índices ChromaDB: {e}")
1021
- n_faces = 0
1022
- n_voices = 0
1023
-
1024
- # Summary of identities
1025
- face_identities = sorted([p.name for p in faces_out.iterdir() if p.is_dir()]) if faces_out.exists() else []
1026
- voice_identities = sorted([p.name for p in voices_out.iterdir() if p.is_dir()]) if voices_out.exists() else []
1027
-
1028
- # Build casting_json with face and voice embeddings (best-effort) via remote Spaces
1029
- casting_json = {"face_col": [], "voice_col": []}
1030
-
1031
- # Cargar config y router para acceder a svision/asr
1032
- try:
1033
- cfg = load_yaml("config.yaml")
1034
- router = LLMRouter(cfg)
1035
- except Exception:
1036
- router = None # type: ignore
1037
-
1038
- # Face embeddings per identity using remote svision (face_image_embedding)
1039
- try:
1040
- if face_identities and router is not None:
1041
- factory = router.client_factories.get("salamandra-vision") # type: ignore[attr-defined]
1042
- if factory is not None:
1043
- vclient = factory()
1044
- gclient = getattr(vclient, "_client", None)
1045
- else:
1046
- gclient = None
1047
-
1048
- if gclient is not None:
1049
- for identity in face_identities:
1050
- id_dir = faces_out / identity
1051
- if not id_dir.is_dir():
1052
- continue
1053
- # Buscar una imagen representativa
1054
- img_path = None
1055
- for ext in (".jpg", ".jpeg", ".png", ".bmp", ".webp"):
1056
- candidates = list(id_dir.glob(f"*{ext}"))
1057
- if candidates:
1058
- img_path = candidates[0]
1059
- break
1060
- if not img_path:
1061
- continue
1062
-
1063
- try:
1064
- out = gclient.predict(str(img_path), api_name="/face_image_embedding")
1065
- # svision devuelve normalmente una lista de embeddings o un solo embedding
1066
- emb = None
1067
- if isinstance(out, list):
1068
- if out and isinstance(out[0], (list, tuple, float, int)):
1069
- # Si es lista de listas, tomamos la primera; si es lista plana, la usamos tal cual
1070
- if out and isinstance(out[0], (list, tuple)):
1071
- emb = list(out[0])
1072
- else:
1073
- emb = list(out)
1074
- elif isinstance(out, dict) and "embedding" in out:
1075
- emb = out.get("embedding")
1076
-
1077
- if not emb:
1078
- continue
1079
-
1080
- casting_json["face_col"].append({
1081
- "nombre": identity,
1082
- "embedding": emb,
1083
- })
1084
- except Exception:
1085
- # No romper por un fallo puntual de embedding
1086
- continue
1087
- except Exception:
1088
- # Si algo falla en todo el bloque de caras, dejamos face_col vacío
1089
- casting_json["face_col"] = []
1090
-
1091
- # Voice embeddings per identity using remote asr (voice_embedding)
1092
- try:
1093
- if voice_identities and router is not None:
1094
- factory = router.client_factories.get("whisper-catalan") # type: ignore[attr-defined]
1095
- if factory is not None:
1096
- aclient = factory()
1097
- gclient = getattr(aclient, "_client", None)
1098
- else:
1099
- gclient = None
1100
-
1101
- if gclient is not None:
1102
- for identity in voice_identities:
1103
- id_dir = voices_out / identity
1104
- if not id_dir.is_dir():
1105
- continue
1106
- wav_files = sorted([p for p in id_dir.iterdir() if p.is_file() and p.suffix.lower() in [".wav", ".flac", ".mp3"]])
1107
- if not wav_files:
1108
- continue
1109
-
1110
- # Obtenemos un embedding representativo usando el primer clip
1111
- wf = wav_files[0]
1112
- try:
1113
- out = gclient.predict(str(wf), api_name="/voice_embedding")
1114
- emb = None
1115
- if isinstance(out, list):
1116
- emb = list(out)
1117
- elif isinstance(out, dict) and "embedding" in out:
1118
- emb = out.get("embedding")
1119
-
1120
- if not emb:
1121
- continue
1122
-
1123
- casting_json["voice_col"].append({
1124
- "nombre": identity,
1125
- "embedding": emb,
1126
- })
1127
- except Exception:
1128
- continue
1129
- except Exception:
1130
- # Si algo falla en todo el bloque de voces, dejamos voice_col vacío
1131
- casting_json["voice_col"] = []
1132
-
1133
- return {
1134
- "ok": True,
1135
- "video_name": video_name,
1136
- "faces_dir": str(faces_out),
1137
- "voices_dir": str(voices_out),
1138
- "db_dir": str(db_dir),
1139
- "n_faces_embeddings": n_faces,
1140
- "n_voices_embeddings": n_voices,
1141
- "face_identities": face_identities,
1142
- "voice_identities": voice_identities,
1143
- "casting_json": casting_json,
1144
- }
1145
 
1146
- @app.get("/files_scene/{video_name}/{scene_id}/{filename}")
1147
- def serve_scene_file(video_name: str, scene_id: str, filename: str):
1148
- file_path = TEMP_ROOT / video_name / "scenes" / scene_id / filename
1149
- if not file_path.exists():
1150
- raise HTTPException(status_code=404, detail="File not found")
1151
- return FileResponse(file_path)
1152
 
1153
  @app.post("/detect_scenes")
1154
  async def detect_scenes(
1155
- video: UploadFile = File(...),
1156
- max_groups: int = Form(default=3),
1157
  min_cluster_size: int = Form(default=3),
1158
  scene_sensitivity: float = Form(default=0.5),
1159
- frame_interval_sec: float = Form(default=0.5),
1160
  ):
1161
  """
1162
- Detecta clústers d'escenes mitjançant clustering jeràrquic d'histogrames de color.
1163
- Retorna una llista de scene_clusters estructurada de forma similar a characters.
1164
  """
1165
  import cv2
1166
  import numpy as np
 
1167
 
1168
- # Guardar el vídeo temporalment
1169
- video_name = Path(video.filename).stem
1170
  dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
1171
- with dst_video.open("wb") as f:
1172
- shutil.copyfileobj(video.file, f)
1173
 
1174
  cap = cv2.VideoCapture(str(dst_video))
1175
  if not cap.isOpened():
1176
- raise HTTPException(status_code=400, detail="Cannot open video")
1177
 
1178
  fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
1179
- step = max(1, int(frame_interval_sec * fps))
 
 
 
 
 
 
1180
 
1181
- frames = []
1182
- metas = []
1183
- idx = 0
1184
- while True:
1185
- ret = cap.grab()
 
1186
  if not ret:
1187
- break
1188
- if idx % step == 0:
1189
- ret2, frame = cap.retrieve()
1190
- if not ret2:
1191
- break
1192
- # Reduir mida per estabilitat i càlcul ràpid
1193
- small = cv2.resize(frame, (160, 90))
1194
- hsv = cv2.cvtColor(small, cv2.COLOR_BGR2HSV)
1195
- # Histograma per canal
1196
- h_hist = cv2.calcHist([hsv],[0],None,[32],[0,180]).flatten()
1197
- s_hist = cv2.calcHist([hsv],[1],None,[32],[0,256]).flatten()
1198
- v_hist = cv2.calcHist([hsv],[2],None,[32],[0,256]).flatten()
1199
- hist = np.concatenate([h_hist, s_hist, v_hist])
1200
- hist = hist / (np.linalg.norm(hist) + 1e-8)
1201
- frames.append(hist)
1202
- metas.append({"index": idx, "time_sec": idx/float(fps)})
1203
- idx += 1
1204
  cap.release()
1205
 
1206
  if not frames:
1207
- return {"scene_clusters": []}
1208
 
1209
  X = np.array(frames)
1210
  labels = hierarchical_cluster_with_min_size(X, max_groups, min_cluster_size, scene_sensitivity).tolist()
@@ -1217,100 +869,75 @@ async def detect_scenes(
1217
  if lbl is None or lbl < 0:
1218
  continue
1219
  clusters.setdefault(int(lbl), []).append(i)
1220
-
1221
- # VALIDACIÓ MILLORADA: Fusionar clusters molt similars de forma més agressiva
1222
- # Calcular centroides (histograma promig de cada cluster)
 
 
 
 
1223
  centroids = {}
1224
  for lbl, idxs in clusters.items():
1225
  cluster_histograms = X[idxs]
1226
  centroids[lbl] = np.mean(cluster_histograms, axis=0)
1227
-
1228
  print(f"[SCENE VALIDATION] Validant similaritat entre {len(centroids)} clusters...")
1229
-
1230
- # Thresholds més agressius per fusionar escenes similars
1231
- SIMILARITY_THRESHOLD = 0.25 # Aumentado de 0.15 a 0.25 (fusiona más)
1232
- CORRELATION_THRESHOLD = 0.85 # Correlación mínima para considerar similares
1233
-
1234
- # Calcular matriu de distàncies i correlacions entre centroides
1235
  cluster_labels = sorted(centroids.keys())
1236
  similarities = {}
1237
-
1238
  for i, lbl1 in enumerate(cluster_labels):
1239
  for lbl2 in cluster_labels[i+1:]:
1240
- # Distancia euclidiana (normalizada)
1241
  dist = np.linalg.norm(centroids[lbl1] - centroids[lbl2])
1242
-
1243
- # Correlación de Pearson entre histogramas
1244
- corr = np.corrcoef(centroids[lbl1], centroids[lbl2])[0, 1]
1245
-
1246
- # Son similares si:
1247
- # - Distancia baja (< threshold) O
1248
- # - Correlación alta (> threshold)
1249
  are_similar = (dist < SIMILARITY_THRESHOLD) or (corr > CORRELATION_THRESHOLD)
1250
-
1251
- similarities[(lbl1, lbl2)] = {
1252
- 'distance': dist,
1253
- 'correlation': corr,
1254
- 'similar': are_similar
1255
- }
1256
-
1257
  if are_similar:
1258
- print(f"[SCENE VALIDATION] Clusters {lbl1} i {lbl2} són similars: "
1259
- f"dist={dist:.3f} (threshold={SIMILARITY_THRESHOLD}), "
1260
- f"corr={corr:.3f} (threshold={CORRELATION_THRESHOLD})")
1261
-
1262
- # Union-Find para fusionar clusters transitivamente
1263
- # Si A~B y B~C, entonces A~B~C (todos en el mismo grupo)
1264
  parent = {lbl: lbl for lbl in cluster_labels}
1265
-
1266
  def find(x):
1267
  if parent[x] != x:
1268
- parent[x] = find(parent[x]) # Path compression
1269
  return parent[x]
1270
-
1271
  def union(x, y):
1272
- root_x = find(x)
1273
- root_y = find(y)
1274
- if root_x != root_y:
1275
- parent[root_y] = root_x
1276
-
1277
- # Fusionar todos los clusters similares
1278
  fusion_count = 0
1279
  for (lbl1, lbl2), sim in similarities.items():
1280
  if sim['similar']:
1281
  union(lbl1, lbl2)
1282
  fusion_count += 1
1283
-
1284
- # Aplicar fusió als clusters
1285
  new_clusters = {}
1286
  for lbl, idxs in clusters.items():
1287
  root = find(lbl)
1288
- if root not in new_clusters:
1289
- new_clusters[root] = []
1290
- new_clusters[root].extend(idxs)
1291
-
1292
- # Reordenar labels para que sean consecutivos
1293
  final_clusters_dict = {}
1294
  for i, (root, idxs) in enumerate(sorted(new_clusters.items())):
1295
  final_clusters_dict[i] = idxs
1296
-
1297
  clusters = final_clusters_dict
1298
  final_clusters = len(clusters)
1299
  eliminated = initial_clusters - final_clusters
1300
-
1301
- print(f"[SCENE VALIDATION] ===== RESULTADO =====")
1302
- print(f"[SCENE VALIDATION] Clusters inicials: {initial_clusters}")
1303
- print(f"[SCENE VALIDATION] Fusions realitzades: {fusion_count}")
1304
- print(f"[SCENE VALIDATION] Clusters finals: {final_clusters}")
1305
- print(f"[SCENE VALIDATION] Clusters eliminats (fusionats): {eliminated}")
1306
- print(f"[SCENE VALIDATION] Reducció: {(eliminated/initial_clusters*100):.1f}%")
1307
- print(f"[SCENE VALIDATION] =======================")
1308
-
1309
- # Escriure imatges representatives per a cada clúster
1310
  base = TEMP_ROOT / video_name / "scenes"
1311
  base.mkdir(parents=True, exist_ok=True)
1312
  scene_list = []
1313
  cap = cv2.VideoCapture(str(dst_video))
 
1314
  for lbl, idxs in sorted(clusters.items(), key=lambda x: x[0]):
1315
  scene_id = f"scene_{int(lbl):02d}"
1316
  out_dir = base / scene_id
@@ -1329,53 +956,36 @@ async def detect_scenes(
1329
  # Representative
1330
  rep = frame_files[0] if frame_files else None
1331
  image_url = f"/files_scene/{video_name}/{scene_id}/{rep}" if rep else ""
1332
-
1333
- # Llamar a svision para describir la escena representativa
1334
  scene_description = ""
1335
- scene_name = f"Escena {lbl+1}"
1336
  if rep:
1337
  rep_full_path = out_dir / rep
1338
  if rep_full_path.exists():
1339
- print(f"Llamando a svision para describir {scene_id}...")
1340
  try:
1341
- scene_description, scene_name = describe_image_with_svision(str(rep_full_path), is_face=False)
1342
- if not scene_name:
1343
- scene_name = f"Escena {lbl+1}"
1344
-
1345
- # Si tenemos descripción, generar nombre corto con schat
1346
  if scene_description:
1347
- print(f"Llamando a schat para generar nombre corto de {scene_id}...")
1348
  try:
1349
- # Usar LLMRouter para llamar a schat
1350
  config_path = os.getenv("CONFIG_YAML", "config.yaml")
1351
  if os.path.exists(config_path):
1352
  with open(config_path, 'r', encoding='utf-8') as f:
1353
  cfg = yaml.safe_load(f) or {}
1354
  router = LLMRouter(cfg)
1355
-
1356
- prompt = f"Basant-te en aquesta descripció d'una escena, genera un nom curt de menys de 3 paraules que la resumeixi:\n\n{scene_description}\n\nNom de l'escena:"
1357
-
1358
  short_name = router.instruct(
1359
  prompt=prompt,
1360
- system="Ets un assistent que genera noms curts i descriptius per a escenes. Respon NOMÉS amb el nom, sense explicacions.",
1361
  model="salamandra-instruct"
1362
- ).strip()
1363
-
1364
- # Limpiar posibles comillas o puntuación extra
1365
- short_name = short_name.strip('"\'.,!?').strip()
1366
-
1367
- if short_name and len(short_name) > 0:
1368
  scene_name = short_name
1369
- print(f"[schat] Nom generat: {scene_name}")
1370
- else:
1371
- print(f"[schat] No s'ha generat nom, usant fallback")
1372
- except Exception as e_schat:
1373
- print(f"Error generando nombre con schat: {e_schat}")
1374
- # Mantener el nombre de svision si schat falla
1375
-
1376
  except Exception as e:
1377
  print(f"Error describiendo {scene_id}: {e}")
1378
-
1379
  scene_list.append({
1380
  "id": scene_id,
1381
  "name": scene_name,
@@ -1385,8 +995,8 @@ async def detect_scenes(
1385
  "image_url": image_url,
1386
  "frame_files": frame_files,
1387
  })
1388
- cap.release()
1389
 
 
1390
  return {"scene_clusters": scene_list, "base_dir": str(base)}
1391
 
1392
  @app.post("/refine_narration")
 
75
  app.include_router(pending_videos_router)
76
 
77
  def describe_image_with_svision(image_path: str, is_face: bool = True) -> tuple[str, str]:
78
+ """Call the svision Space to describe an image (used in AD generation).
79
+
 
80
  Args:
81
+ image_path: Absolute path to the image.
82
+ is_face: True if the image is a face, False if it is a scene.
83
+
84
  Returns:
85
+ Tuple ``(full_description, short_name)``.
86
  """
87
  try:
88
  from pathlib import Path as _P
89
  import yaml
90
  from llm_router import LLMRouter
91
 
92
+ # Load configuration
93
  config_path = _P(__file__).parent / "config.yaml"
94
  if not config_path.exists():
95
  print(f"[svision] Config no encontrado: {config_path}")
 
100
 
101
  router = LLMRouter(cfg)
102
 
103
+ # Different context depending on whether the image is a face or a scene
104
  if is_face:
105
  context = {
106
  "task": "describe_person",
 
114
  "max_tokens": 128
115
  }
116
 
117
+ # Call svision
118
  descriptions = router.vision_describe([str(image_path)], context=context, model="salamandra-vision")
119
  full_description = descriptions[0] if descriptions else ""
120
 
 
132
  return ("", "")
133
 
134
  def normalize_face_lighting(image):
135
+ """Normalize face brightness using a combination of techniques.
136
+
137
+ 1. CLAHE for adaptive histogram equalization.
138
+ 2. Range normalization to homogenize overall brightness.
139
+
140
+ This reduces the impact of different lighting conditions on embeddings
141
+ and on how faces are visualized.
142
+
143
  Args:
144
+ image: BGR image (OpenCV format).
145
+
146
  Returns:
147
+ Normalized image in the same format.
148
  """
149
  import cv2
150
  import numpy as np
151
 
152
+ # Step 1: Convert to LAB color space (more robust to illumination changes)
153
  lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
154
  l, a, b = cv2.split(lab)
155
 
156
+ # Step 2: Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) to the L channel
157
+ # Use a higher clipLimit for more aggressive normalization
158
  clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
159
  l_clahe = clahe.apply(l)
160
 
161
+ # Step 3: Normalize the range of the L channel to ensure a more uniform distribution
162
+ # This guarantees that all images have a similar brightness range
163
  l_min, l_max = l_clahe.min(), l_clahe.max()
164
  if l_max > l_min:
165
+ # Stretch histogram to the full range [0, 255]
166
  l_normalized = ((l_clahe - l_min) * 255.0 / (l_max - l_min)).astype(np.uint8)
167
  else:
168
  l_normalized = l_clahe
169
 
170
+ # Step 4: Apply a small blur to reduce noise introduced by normalization
171
  l_normalized = cv2.GaussianBlur(l_normalized, (3, 3), 0)
172
 
173
+ # Recombine channels
174
  lab_normalized = cv2.merge([l_normalized, a, b])
175
 
176
+ # Convert back to BGR
177
  normalized = cv2.cvtColor(lab_normalized, cv2.COLOR_LAB2BGR)
178
  return normalized
179
 
180
  def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
181
+ """Hierarchical clustering with silhouette score and a minimum cluster size.
182
+
183
+ It automatically selects the best number of clusters (up to ``max_groups``)
184
+ using the silhouette score, and then filters out clusters with fewer than
185
+ ``min_cluster_size`` samples (marked as -1 / noise).
186
+
187
  Args:
188
+ X: Embedding array of shape (N, D).
189
+ max_groups: Maximum number of clusters to form.
190
+ min_cluster_size: Minimum size for a cluster to be considered valid.
191
+ sensitivity: Clustering sensitivity (0.0-1.0).
192
+ - 0.0 = very aggressive (fewer clusters).
193
+ - 0.5 = balanced (recommended).
194
+ - 1.0 = permissive (more clusters).
195
+
196
  Returns:
197
+ ``np.ndarray`` of labels (N,), where -1 indicates noise.
198
  """
199
  import numpy as np
200
  from scipy.cluster.hierarchy import linkage, fcluster
 
205
  return np.array([])
206
 
207
  if len(X) < min_cluster_size:
208
+ # If there are fewer samples than the minimum, treat everything as noise
209
  return np.full(len(X), -1, dtype=int)
210
 
211
+ # Linkage using average linkage (more flexible than ward and less sensitive to outliers)
212
+ # This helps group the same person under different angles/expressions
213
  Z = linkage(X, method='average', metric='cosine') # Cosine similarity para embeddings
214
 
215
+ # Find the optimal number of clusters using the silhouette score
216
  best_n_clusters = 2
217
  best_score = -1
218
 
219
+ # Try different numbers of clusters (from 2 to max_groups)
220
+ max_to_try = min(max_groups, len(X) - 1) # Cannot have more clusters than samples
221
 
222
  if max_to_try >= 2:
223
  for n_clusters in range(2, max_to_try + 1):
224
  trial_labels = fcluster(Z, t=n_clusters, criterion='maxclust') - 1
225
+
226
+ # Compute how many valid clusters we would have after filtering
227
  trial_counts = Counter(trial_labels)
228
  valid_clusters = sum(1 for count in trial_counts.values() if count >= min_cluster_size)
229
 
230
+ # Only evaluate if there are at least 2 valid clusters
231
  if valid_clusters >= 2:
232
  try:
233
  score = silhouette_score(X, trial_labels, metric='cosine')
234
+ # Dynamic penalty based on sensitivity:
235
+ # - sensitivity = 0.0 → penalty = 0.14 (very aggressive, fewer clusters)
236
+ # - sensitivity = 0.5 → penalty = 0.07 (balanced, recommended)
237
+ # - sensitivity = 1.0 → penalty = 0.01 (permissive, more clusters)
238
  penalty = 0.14 - (sensitivity * 0.13)
239
  adjusted_score = score - (n_clusters * penalty)
240
 
 
244
  except:
245
  pass # Si falla el cálculo, ignorar esta configuración
246
 
247
+ # Use the optimal number of clusters found
248
  penalty = 0.14 - (sensitivity * 0.13)
249
  print(f"Clustering óptimo: {best_n_clusters} clusters (de máximo {max_groups}), sensitivity={sensitivity:.2f}, penalty={penalty:.3f}, silhouette={best_score:.3f}")
250
  labels = fcluster(Z, t=best_n_clusters, criterion='maxclust')
251
 
252
+ # fcluster returns 1-indexed labels; convert to 0-indexed
253
  labels = labels - 1
254
 
255
+ # Filter out small clusters
256
  label_counts = Counter(labels)
257
  filtered_labels = []
258
  for lbl in labels:
259
  if label_counts[lbl] >= min_cluster_size:
260
  filtered_labels.append(lbl)
261
  else:
262
+ filtered_labels.append(-1) # Noise
263
 
264
  return np.array(filtered_labels, dtype=int)
265
 
 
292
  voice_sensitivity: float = Form(default=0.5),
293
  max_frames: int = Form(default=100),
294
  ):
295
+ """Create a background job to process a video using hierarchical clustering.
296
+
297
+ This endpoint stores the uploaded video, creates a job entry and
298
+ starts ``process_video_job`` in the background. It immediately
299
+ returns a ``job_id`` that the UI can poll.
300
  """
301
+ # Save video into the data folder
 
 
 
302
  video_name = Path(video.filename).stem
303
  dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
304
  with dst_video.open("wb") as f:
305
  shutil.copyfileobj(video.file, f)
306
 
307
+ # Create unique job_id
308
  job_id = str(uuid.uuid4())
309
 
310
+ # Initialize job metadata
311
  jobs[job_id] = {
312
  "id": job_id,
313
  "status": JobStatus.QUEUED,
 
327
 
328
  print(f"[{job_id}] Job creado para vídeo: {video_name}")
329
 
330
+ # Start processing in the background
331
  background_tasks.add_task(process_video_job, job_id)
332
 
333
  # Devolver job_id inmediatamente
 
568
  # Construir carpetas por clúster con validación DeepFace
569
  from face_classifier import validate_and_classify_face, FACE_CONFIDENCE_THRESHOLD
570
 
571
+ characters_validated: list[dict[str, Any]] = []
572
  cluster_map: dict[int, list[int]] = {}
573
+ fallback_candidate: dict[str, Any] | None = None
574
+ for idx, lbl in enumerate(labels):
575
  if isinstance(lbl, int) and lbl >= 0:
576
+ cluster_map.setdefault(lbl, []).append(idx)
577
 
578
  chars_dir = base / "characters"
579
  chars_dir.mkdir(parents=True, exist_ok=True)
580
  import shutil as _sh
581
+
582
  original_cluster_count = len(cluster_map)
583
  print(f"[{job_id}] Procesando {original_cluster_count} clusters detectados...")
584
+
585
  for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
586
  char_id = f"char_{ci:02d}"
587
+
588
+ detections: list[dict[str, Any]] = []
 
589
  for j in idxs:
590
  meta = crops_meta[j]
591
+ file_name = meta.get("file")
592
+ if not file_name:
593
+ continue
594
  box = meta.get("box", [0, 0, 0, 0])
595
+ area = 0
596
  if len(box) >= 4:
597
  top, right, bottom, left = box
598
+ area = abs(right - left) * abs(bottom - top)
599
+ detections.append({
600
+ "index": j,
601
+ "file": file_name,
602
+ "score": area,
603
+ "box": box,
 
 
 
 
 
604
  })
605
+
606
+ if not detections:
 
 
 
 
 
 
 
607
  print(f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: sense deteccions, eliminant")
608
  continue
609
+
610
+ detections.sort(key=lambda d: d["score"], reverse=True)
611
+ best_face = detections[0]
612
+ best_face_path = faces_root / best_face["file"]
613
+
614
  print(f"[{job_id}] [VALIDATION] Cluster {char_id}: validant millor cara (bbox_area={best_face['score']:.0f}px²)")
615
  print(f"[{job_id}] [VALIDATION] Cluster {char_id}: millor cara path={best_face_path}")
616
  print(f"[{job_id}] [VALIDATION] ▶▶▶ CRIDANT validate_and_classify_face() ◀◀◀")
617
+
618
  validation = validate_and_classify_face(str(best_face_path))
619
+
620
  print(f"[{job_id}] [VALIDATION] ▶▶▶ validate_and_classify_face() RETORNAT ◀◀◀")
621
+
622
+ candidate_conf = 0.0
623
+ if validation:
624
+ try:
625
+ candidate_conf = float(validation.get("face_confidence", 0.0) or 0.0)
626
+ except Exception:
627
+ candidate_conf = 0.0
628
+
629
+ if not fallback_candidate or candidate_conf > fallback_candidate.get("face_confidence", -1.0):
630
+ fallback_candidate = {
631
+ "char_id": char_id,
632
+ "detection": best_face,
633
+ "validation": validation,
634
+ "path": best_face_path,
635
+ "face_confidence": candidate_conf,
636
+ }
637
+
638
  if not validation:
639
  print(f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: error en validació DeepFace, eliminant cluster")
640
  continue
641
+
 
642
  print(f"[{job_id}] [DEEPFACE RESULT] Cluster {char_id}:")
643
  print(f"[{job_id}] - is_valid_face: {validation['is_valid_face']}")
644
  print(f"[{job_id}] - face_confidence: {validation['face_confidence']:.3f}")
 
647
  print(f"[{job_id}] - gender_diff: {abs(validation['man_prob'] - validation['woman_prob']):.3f}")
648
  print(f"[{job_id}] - gender_assigned: {validation['gender']}")
649
  print(f"[{job_id}] - gender_confidence: {validation['gender_confidence']:.3f}")
650
+
651
+ if (not validation.get("is_valid_face")) or (validation.get("face_confidence", 0.0) < FACE_CONFIDENCE_THRESHOLD):
652
+ print(
653
+ f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: NO ES UNA CARA VÁLIDA "
654
+ f"(face_confidence={validation.get('face_confidence', 0.0):.3f} < threshold={FACE_CONFIDENCE_THRESHOLD}), eliminant tot el clúster"
655
+ )
656
  continue
657
+
 
658
  out_dir = chars_dir / char_id
659
  out_dir.mkdir(parents=True, exist_ok=True)
660
+
661
+ total_faces = len(detections)
 
662
  max_faces_to_show = (total_faces // 2) + 1
663
+ selected = detections[:max_faces_to_show]
664
+
665
+ files: list[str] = []
666
+ file_urls: list[str] = []
667
+ for det in selected:
668
+ fname = det["file"]
 
669
  src = faces_root / fname
670
  dst = out_dir / fname
671
  try:
672
  _sh.copy2(src, dst)
673
  files.append(fname)
674
+ file_urls.append(f"/files/{video_name}/{char_id}/{fname}")
675
  except Exception:
676
  pass
677
+
 
678
  rep = files[0] if files else None
679
  if rep:
680
  rep_src = out_dir / rep
 
683
  _sh.copy2(rep_src, rep_dst)
684
  except Exception:
685
  pass
686
+
687
+ cluster_number = int(char_id.split("_")[1]) + 1
 
688
  character_name = f"Cluster {cluster_number}"
689
+ gender = validation.get("gender", "Neutral")
690
+
691
  print(f"[{job_id}] [NAME GENERATION] Cluster {char_id}:")
692
  print(f"[{job_id}] - Gender detectado: {gender}")
693
  print(f"[{job_id}] - Nombre asignado: {character_name}")
694
  print(f"[{job_id}] - Seed usado: {char_id}")
695
+
696
+ characters_validated.append({
697
  "id": char_id,
698
  "name": character_name,
699
  "gender": gender,
700
+ "gender_confidence": validation.get("gender_confidence", 0.0),
701
+ "face_confidence": validation.get("face_confidence", 0.0),
702
+ "man_prob": validation.get("man_prob", 0.0),
703
+ "woman_prob": validation.get("woman_prob", 0.0),
704
  "folder": str(out_dir),
705
  "num_faces": len(files),
706
  "total_faces_detected": total_faces,
707
  "image_url": f"/files/{video_name}/{char_id}/representative.jpg" if rep else "",
708
+ "face_files": file_urls,
709
+ })
710
+
 
 
711
  print(f"[{job_id}] [VALIDATION] ✓ Cluster {char_id}: CARA VÁLIDA!")
712
  print(f"[{job_id}] Nombre: {character_name}")
713
+ print(f"[{job_id}] Género: {gender} (man={validation.get('man_prob', 0.0):.3f}, woman={validation.get('woman_prob', 0.0):.3f})")
714
+ print(f"[{job_id}] Confianza género: {validation.get('gender_confidence', 0.0):.3f}")
715
+ print(f"[{job_id}] Confianza cara: {validation.get('face_confidence', 0.0):.3f}")
716
  print(f"[{job_id}] Caras mostradas: {len(files)}/{total_faces}")
717
  print(f"[{job_id}] Imagen representativa: {best_face_path.name}")
718
+
 
719
  eliminated_count = original_cluster_count - len(characters_validated)
720
  print(f"[{job_id}] [VALIDATION] Total: {len(characters_validated)} clústers vàlids "
721
  f"(eliminats {eliminated_count} falsos positius)")
 
 
722
 
723
+ if not characters_validated and fallback_candidate:
724
+ print(f"[{job_id}] [FALLBACK] No hi ha clústers vàlids. Creant clúster de reserva amb la millor cara trobada.")
725
+ fallback = fallback_candidate
726
+ det = fallback.get("detection", {})
727
+ fname = det.get("file")
728
+ fallback_path: Path | None = fallback.get("path")
729
+ val = fallback.get("validation")
730
+ idx = det.get("index")
731
+
732
+ if fname and fallback_path is not None:
733
+ if val is None:
734
+ val = validate_and_classify_face(str(fallback_path))
735
+ if val is None:
736
+ val = {
737
+ "is_valid_face": False,
738
+ "face_confidence": fallback.get("face_confidence", 0.0),
739
+ "gender": "Neutral",
740
+ "gender_confidence": 0.0,
741
+ "man_prob": 0.0,
742
+ "woman_prob": 0.0,
743
+ }
744
+
745
+ out_dir = chars_dir / "char_00"
746
+ out_dir.mkdir(parents=True, exist_ok=True)
747
 
748
+ src = faces_root / fname
749
+ dst = out_dir / fname
750
+ try:
751
+ _sh.copy2(src, dst)
752
+ except Exception as copy_err:
753
+ print(f"[{job_id}] [FALLBACK] Error copiant la imatge de reserva: {copy_err}")
754
 
755
+ rep_dst = out_dir / "representative.jpg"
756
+ try:
757
+ _sh.copy2(dst, rep_dst)
758
+ except Exception:
759
+ pass
760
+
761
+ if embeddings:
762
+ if not labels or len(labels) != len(embeddings):
763
+ labels = [-1] * len(embeddings)
764
+ if isinstance(idx, int) and 0 <= idx < len(labels):
765
+ labels[idx] = 0
766
+
767
+ characters_validated.append({
768
+ "id": "char_00",
769
+ "name": "Cluster 1",
770
+ "gender": val.get("gender", "Neutral"),
771
+ "gender_confidence": val.get("gender_confidence", 0.0),
772
+ "face_confidence": val.get("face_confidence", 0.0),
773
+ "man_prob": val.get("man_prob", 0.0),
774
+ "woman_prob": val.get("woman_prob", 0.0),
775
+ "folder": str(out_dir),
776
+ "num_faces": 1,
777
+ "total_faces_detected": 1,
778
+ "image_url": f"/files/{video_name}/char_00/representative.jpg",
779
+ "face_files": [f"/files/{video_name}/char_00/{fname}"],
780
+ })
781
+
782
+ print(f"[{job_id}] [FALLBACK] Clúster de reserva creat amb confiança {val.get('face_confidence', 0.0):.3f}")
783
+ else:
784
+ print(f"[{job_id}] [FALLBACK] Dades insuficients per crear el clúster de reserva")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
785
 
786
+ # Guardar resultados de caras
787
  job["results"] = {
788
+ "characters": characters_validated,
789
+ "face_labels": labels,
790
+ "video_name": video_name,
791
  "base_dir": str(base),
 
 
 
 
 
 
 
 
792
  }
793
  job["status"] = JobStatus.DONE
794
+ print(f"[{job_id}] ✓ Procesamiento de caras completado: {len(characters_validated)} personajes")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
795
 
796
+ except Exception as face_error:
797
+ print(f"[{job_id}] Error en detección de caras: {face_error}")
798
+ import traceback
799
+ traceback.print_exc()
800
+ job["results"] = {"characters": [], "face_labels": [], "video_name": video_name, "base_dir": str(base)}
801
+ job["status"] = JobStatus.DONE # Still mark done so UI can proceed
 
802
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
803
  except Exception as e:
804
+ print(f"[{job_id}] Error general en procesamiento: {e}")
805
  import traceback
806
+ traceback.print_exc()
807
+ job["status"] = JobStatus.FAILED
808
+ job["error"] = str(e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
809
 
 
 
 
 
 
 
810
 
811
  @app.post("/detect_scenes")
812
  async def detect_scenes(
813
+ video_name: str = Form(...),
814
+ max_groups: int = Form(default=5),
815
  min_cluster_size: int = Form(default=3),
816
  scene_sensitivity: float = Form(default=0.5),
 
817
  ):
818
  """
819
+ Detecta y agrupa escenas en un vídeo ya procesado.
 
820
  """
821
  import cv2
822
  import numpy as np
823
+ from typing import Any
824
 
 
 
825
  dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
826
+ if not dst_video.exists():
827
+ return {"error": f"Video {video_name} not found"}
828
 
829
  cap = cv2.VideoCapture(str(dst_video))
830
  if not cap.isOpened():
831
+ return {"error": "Could not open video"}
832
 
833
  fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
834
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
835
+ max_samples = 200 # Limit samples for scene detection
836
+
837
+ if total_frames > 0:
838
+ frame_indices = sorted(set(np.linspace(0, max(0, total_frames - 1), num=min(max_samples, max(1, total_frames)), dtype=int).tolist()))
839
+ else:
840
+ frame_indices = []
841
 
842
+ frames: list[list[float]] = []
843
+ metas: list[dict[str, Any]] = []
844
+
845
+ for frame_idx in frame_indices:
846
+ cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
847
+ ret, frame = cap.read()
848
  if not ret:
849
+ continue
850
+ # Color histogram as feature
851
+ hist = cv2.calcHist([frame], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
852
+ hist = cv2.normalize(hist, hist).flatten()
853
+ frames.append(hist.tolist())
854
+ metas.append({"index": frame_idx})
855
+
 
 
 
 
 
 
 
 
 
 
856
  cap.release()
857
 
858
  if not frames:
859
+ return {"scene_clusters": [], "base_dir": ""}
860
 
861
  X = np.array(frames)
862
  labels = hierarchical_cluster_with_min_size(X, max_groups, min_cluster_size, scene_sensitivity).tolist()
 
869
  if lbl is None or lbl < 0:
870
  continue
871
  clusters.setdefault(int(lbl), []).append(i)
872
+
873
+ # Fallback: garantir mínim 1 cluster d'escena
874
+ if not clusters and frames:
875
+ clusters[0] = [0] # Usar el primer frame com a escena per defecte
876
+ print("[SCENE FALLBACK] Cap cluster vàlid, creant cluster amb primer frame")
877
+
878
+ # VALIDACIÓ MILLORADA: Fusionar clusters molt similars
879
  centroids = {}
880
  for lbl, idxs in clusters.items():
881
  cluster_histograms = X[idxs]
882
  centroids[lbl] = np.mean(cluster_histograms, axis=0)
883
+
884
  print(f"[SCENE VALIDATION] Validant similaritat entre {len(centroids)} clusters...")
885
+
886
+ SIMILARITY_THRESHOLD = 0.25
887
+ CORRELATION_THRESHOLD = 0.85
888
+
 
 
889
  cluster_labels = sorted(centroids.keys())
890
  similarities = {}
891
+
892
  for i, lbl1 in enumerate(cluster_labels):
893
  for lbl2 in cluster_labels[i+1:]:
 
894
  dist = np.linalg.norm(centroids[lbl1] - centroids[lbl2])
895
+ corr = np.corrcoef(centroids[lbl1], centroids[lbl2])[0, 1] if len(centroids[lbl1]) > 1 else 0.0
 
 
 
 
 
 
896
  are_similar = (dist < SIMILARITY_THRESHOLD) or (corr > CORRELATION_THRESHOLD)
897
+ similarities[(lbl1, lbl2)] = {'distance': dist, 'correlation': corr, 'similar': are_similar}
 
 
 
 
 
 
898
  if are_similar:
899
+ print(f"[SCENE VALIDATION] Clusters {lbl1} i {lbl2} similars: dist={dist:.3f}, corr={corr:.3f}")
900
+
901
+ # Union-Find para fusionar clusters
 
 
 
902
  parent = {lbl: lbl for lbl in cluster_labels}
903
+
904
  def find(x):
905
  if parent[x] != x:
906
+ parent[x] = find(parent[x])
907
  return parent[x]
908
+
909
  def union(x, y):
910
+ rx, ry = find(x), find(y)
911
+ if rx != ry:
912
+ parent[ry] = rx
913
+
 
 
914
  fusion_count = 0
915
  for (lbl1, lbl2), sim in similarities.items():
916
  if sim['similar']:
917
  union(lbl1, lbl2)
918
  fusion_count += 1
919
+
 
920
  new_clusters = {}
921
  for lbl, idxs in clusters.items():
922
  root = find(lbl)
923
+ new_clusters.setdefault(root, []).extend(idxs)
924
+
 
 
 
925
  final_clusters_dict = {}
926
  for i, (root, idxs) in enumerate(sorted(new_clusters.items())):
927
  final_clusters_dict[i] = idxs
928
+
929
  clusters = final_clusters_dict
930
  final_clusters = len(clusters)
931
  eliminated = initial_clusters - final_clusters
932
+
933
+ print(f"[SCENE VALIDATION] Clusters finals: {final_clusters} (fusionats: {eliminated})")
934
+
935
+ # Escriure imatges representatives
 
 
 
 
 
 
936
  base = TEMP_ROOT / video_name / "scenes"
937
  base.mkdir(parents=True, exist_ok=True)
938
  scene_list = []
939
  cap = cv2.VideoCapture(str(dst_video))
940
+
941
  for lbl, idxs in sorted(clusters.items(), key=lambda x: x[0]):
942
  scene_id = f"scene_{int(lbl):02d}"
943
  out_dir = base / scene_id
 
956
  # Representative
957
  rep = frame_files[0] if frame_files else None
958
  image_url = f"/files_scene/{video_name}/{scene_id}/{rep}" if rep else ""
959
+
960
+ # Llamar a svision para describir la escena
961
  scene_description = ""
962
+ scene_name = f"Cluster {lbl+1}"
963
  if rep:
964
  rep_full_path = out_dir / rep
965
  if rep_full_path.exists():
 
966
  try:
967
+ scene_description, _ = describe_image_with_svision(str(rep_full_path), is_face=False)
968
+ # Generar nombre corto con schat
 
 
 
969
  if scene_description:
 
970
  try:
 
971
  config_path = os.getenv("CONFIG_YAML", "config.yaml")
972
  if os.path.exists(config_path):
973
  with open(config_path, 'r', encoding='utf-8') as f:
974
  cfg = yaml.safe_load(f) or {}
975
  router = LLMRouter(cfg)
976
+ prompt = f"Genera un nom curt (2-3 paraules) per aquesta escena:\n{scene_description}"
 
 
977
  short_name = router.instruct(
978
  prompt=prompt,
979
+ system="Respon NOMÉS amb el nom, sense explicacions.",
980
  model="salamandra-instruct"
981
+ ).strip().strip('"\'.,!?')
982
+ if short_name:
 
 
 
 
983
  scene_name = short_name
984
+ except Exception:
985
+ pass
 
 
 
 
 
986
  except Exception as e:
987
  print(f"Error describiendo {scene_id}: {e}")
988
+
989
  scene_list.append({
990
  "id": scene_id,
991
  "name": scene_name,
 
995
  "image_url": image_url,
996
  "frame_files": frame_files,
997
  })
 
998
 
999
+ cap.release()
1000
  return {"scene_clusters": scene_list, "base_dir": str(base)}
1001
 
1002
  @app.post("/refine_narration")