Spaces:

VeuReu
/

engine

Running

App Files Files Community

VeuReu committed on Nov 30, 2025

Commit

6526378

verified ·

1 Parent(s): 557665b

Upload api.py

Browse files

Files changed (1) hide show

api.py +299 -689

api.py CHANGED Viewed

@@ -75,22 +75,21 @@ app.include_router(embeddings_router)
 app.include_router(pending_videos_router)
 def describe_image_with_svision(image_path: str, is_face: bool = True) -> tuple[str, str]:
-    """
-    Llama al space svision para describir una imagen (usado en generación de AD).
     Args:
-        image_path: Ruta absoluta a la imagen
-        is_face: True si es una cara, False si es una escena
     Returns:
-        tuple (descripción_completa, nombre_abreviado)
     """
     try:
         from pathlib import Path as _P
         import yaml
         from llm_router import LLMRouter
-        # Cargar configuración
         config_path = _P(__file__).parent / "config.yaml"
         if not config_path.exists():
             print(f"[svision] Config no encontrado: {config_path}")
@@ -101,7 +100,7 @@ def describe_image_with_svision(image_path: str, is_face: bool = True) -> tuple[
         router = LLMRouter(cfg)
-        # Contexto diferente para caras vs escenas
         if is_face:
             context = {
                 "task": "describe_person",
@@ -115,7 +114,7 @@ def describe_image_with_svision(image_path: str, is_face: bool = True) -> tuple[
                 "max_tokens": 128
             }
-        # Llamar a svision
         descriptions = router.vision_describe([str(image_path)], context=context, model="salamandra-vision")
         full_description = descriptions[0] if descriptions else ""
@@ -133,68 +132,69 @@ def describe_image_with_svision(image_path: str, is_face: bool = True) -> tuple[
         return ("", "")
 def normalize_face_lighting(image):
-    """
-    Normaliza el brillo de una imagen de cara usando técnicas combinadas:
-    1. CLAHE para ecualización adaptativa
-    2. Normalización de rango para homogeneizar brillo general
-    Esto reduce el impacto de diferentes condiciones de iluminación en los embeddings
-    y en la visualización de las imágenes.
     Args:
-        image: Imagen BGR (OpenCV format)
     Returns:
-        Imagen normalizada en el mismo formato
     """
     import cv2
     import numpy as np
-    # Paso 1: Convertir a LAB color space (más robusto para iluminación)
     lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
     l, a, b = cv2.split(lab)
-    # Paso 2: Aplicar CLAHE (Contrast Limited Adaptive Histogram Equalization) al canal L
-    # Usar clipLimit más alto para normalización más agresiva
     clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
     l_clahe = clahe.apply(l)
-    # Paso 3: Normalizar el rango del canal L para asegurar distribución uniforme
-    # Esto garantiza que todas las imágenes tengan un rango de brillo similar
     l_min, l_max = l_clahe.min(), l_clahe.max()
     if l_max > l_min:
-        # Estirar el histograma al rango completo [0, 255]
         l_normalized = ((l_clahe - l_min) * 255.0 / (l_max - l_min)).astype(np.uint8)
     else:
         l_normalized = l_clahe
-    # Paso 4: Aplicar suavizado suave para reducir ruido introducido por la normalización
     l_normalized = cv2.GaussianBlur(l_normalized, (3, 3), 0)
-    # Recombinar canales
     lab_normalized = cv2.merge([l_normalized, a, b])
-    # Convertir de vuelta a BGR
     normalized = cv2.cvtColor(lab_normalized, cv2.COLOR_LAB2BGR)
     return normalized
 def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
-    """
-    Clustering jerárquico con silhouette score para encontrar automáticamente el mejor número de clusters.
-    Selecciona automáticamente el mejor número de clusters (hasta max_groups) usando silhouette score.
-    Filtra clusters con menos de min_cluster_size muestras (marcados como -1/ruido).
     Args:
-        X: Array de embeddings (N, D)
-        max_groups: Número máximo de clusters a formar
-        min_cluster_size: Tamaño mínimo de cluster válido
-        sensitivity: Sensibilidad del clustering (0.0-1.0)
-                    - 0.0 = muy agresivo (menos clusters)
-                    - 0.5 = balanceado (recomendado)
-                    - 1.0 = muy permisivo (más clusters)
     Returns:
-        Array de labels (N,) donde -1 indica ruido
     """
     import numpy as np
     from scipy.cluster.hierarchy import linkage, fcluster
@@ -205,36 +205,36 @@ def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int
         return np.array([])
     if len(X) < min_cluster_size:
-        # Si hay menos muestras que el mínimo, todo es ruido
         return np.full(len(X), -1, dtype=int)
-    # Linkage usando average linkage (más flexible que ward, menos sensible a outliers)
-    # Esto ayuda a agrupar mejor la misma persona con diferentes ángulos/expresiones
     Z = linkage(X, method='average', metric='cosine')  # Cosine similarity para embeddings
-    # Encontrar el número óptimo de clusters usando silhouette score
     best_n_clusters = 2
     best_score = -1
-    # Probar diferentes números de clusters (de 2 a max_groups)
-    max_to_try = min(max_groups, len(X) - 1)  # No puede haber más clusters que muestras
     if max_to_try >= 2:
         for n_clusters in range(2, max_to_try + 1):
             trial_labels = fcluster(Z, t=n_clusters, criterion='maxclust') - 1
-            # Calcular cuántos clusters válidos tendríamos después del filtrado
             trial_counts = Counter(trial_labels)
             valid_clusters = sum(1 for count in trial_counts.values() if count >= min_cluster_size)
-            # Solo evaluar si hay al menos 2 clusters válidos
             if valid_clusters >= 2:
                 try:
                     score = silhouette_score(X, trial_labels, metric='cosine')
-                    # Penalización dinámica basada en sensibilidad:
-                    # - sensitivity=0.0 → penalty=0.14 (muy agresivo, menos clusters)
-                    # - sensitivity=0.5 → penalty=0.07 (balanceado, recomendado)
-                    # - sensitivity=1.0 → penalty=0.01 (permisivo, más clusters)
                     penalty = 0.14 - (sensitivity * 0.13)
                     adjusted_score = score - (n_clusters * penalty)
@@ -244,22 +244,22 @@ def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int
                 except:
                     pass  # Si falla el cálculo, ignorar esta configuración
-    # Usar el número óptimo de clusters encontrado
     penalty = 0.14 - (sensitivity * 0.13)
     print(f"Clustering óptimo: {best_n_clusters} clusters (de máximo {max_groups}), sensitivity={sensitivity:.2f}, penalty={penalty:.3f}, silhouette={best_score:.3f}")
     labels = fcluster(Z, t=best_n_clusters, criterion='maxclust')
-    # fcluster devuelve labels 1-indexed, convertir a 0-indexed
     labels = labels - 1
-    # Filtrar clusters pequeños
     label_counts = Counter(labels)
     filtered_labels = []
     for lbl in labels:
         if label_counts[lbl] >= min_cluster_size:
             filtered_labels.append(lbl)
         else:
-            filtered_labels.append(-1)  # Ruido
     return np.array(filtered_labels, dtype=int)
@@ -292,20 +292,22 @@ async def create_initial_casting(
     voice_sensitivity: float = Form(default=0.5),
     max_frames: int = Form(default=100),
 ):
     """
-    Crea un job para procesar el vídeo de forma asíncrona usando clustering jerárquico.
-    Devuelve un job_id inmediatamente.
-    """
-    # Guardar vídeo en carpeta de datos
     video_name = Path(video.filename).stem
     dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
     with dst_video.open("wb") as f:
         shutil.copyfileobj(video.file, f)
-    # Crear job_id único
     job_id = str(uuid.uuid4())
-    # Inicializar el job
     jobs[job_id] = {
         "id": job_id,
         "status": JobStatus.QUEUED,
@@ -325,7 +327,7 @@ async def create_initial_casting(
     print(f"[{job_id}] Job creado para vídeo: {video_name}")
-    # Iniciar procesamiento en background
     background_tasks.add_task(process_video_job, job_id)
     # Devolver job_id inmediatamente
@@ -566,70 +568,77 @@ def process_video_job(job_id: str):
             # Construir carpetas por clúster con validación DeepFace
             from face_classifier import validate_and_classify_face, FACE_CONFIDENCE_THRESHOLD
-            characters_validated = []
             cluster_map: dict[int, list[int]] = {}
-            for i, lbl in enumerate(labels):
                 if isinstance(lbl, int) and lbl >= 0:
-                    cluster_map.setdefault(lbl, []).append(i)
             chars_dir = base / "characters"
             chars_dir.mkdir(parents=True, exist_ok=True)
             import shutil as _sh
             original_cluster_count = len(cluster_map)
             print(f"[{job_id}] Procesando {original_cluster_count} clusters detectados...")
             for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
                 char_id = f"char_{ci:02d}"
-                # PASO 1: Ordenar caras por área del bounding box (mejor calidad)
-                face_detections = []
                 for j in idxs:
                     meta = crops_meta[j]
                     box = meta.get("box", [0, 0, 0, 0])
                     if len(box) >= 4:
                         top, right, bottom, left = box
-                        w = abs(right - left)
-                        h = abs(bottom - top)
-                        area_score = w * h
-                    else:
-                        area_score = 0
-                    face_detections.append({
-                        'index': j,
-                        'score': area_score,
-                        'file': meta['file'],
-                        'box': box
                     })
-                # Ordenar por score descendente
-                face_detections_sorted = sorted(
-                    face_detections,
-                    key=lambda x: x['score'],
-                    reverse=True
-                )
-                if not face_detections_sorted:
                     print(f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: sense deteccions, eliminant")
                     continue
-                # PASO 2: Validar SOLO la mejor cara del cluster
-                best_face = face_detections_sorted[0]
-                best_face_path = faces_root / best_face['file']
                 print(f"[{job_id}] [VALIDATION] Cluster {char_id}: validant millor cara (bbox_area={best_face['score']:.0f}px²)")
                 print(f"[{job_id}] [VALIDATION] Cluster {char_id}: millor cara path={best_face_path}")
                 print(f"[{job_id}] [VALIDATION] ▶▶▶ CRIDANT validate_and_classify_face() ◀◀◀")
                 validation = validate_and_classify_face(str(best_face_path))
                 print(f"[{job_id}] [VALIDATION] ▶▶▶ validate_and_classify_face() RETORNAT ◀◀◀")
                 if not validation:
                     print(f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: error en validació DeepFace, eliminant cluster")
                     continue
-                # Mostrar resultados detallados de DeepFace
                 print(f"[{job_id}] [DEEPFACE RESULT] Cluster {char_id}:")
                 print(f"[{job_id}]   - is_valid_face: {validation['is_valid_face']}")
                 print(f"[{job_id}]   - face_confidence: {validation['face_confidence']:.3f}")
@@ -638,36 +647,34 @@ def process_video_job(job_id: str):
                 print(f"[{job_id}]   - gender_diff: {abs(validation['man_prob'] - validation['woman_prob']):.3f}")
                 print(f"[{job_id}]   - gender_assigned: {validation['gender']}")
                 print(f"[{job_id}]   - gender_confidence: {validation['gender_confidence']:.3f}")
-                # PASO 3: Verificar si és una cara vàlida
-                if not validation['is_valid_face'] or validation['face_confidence'] < FACE_CONFIDENCE_THRESHOLD:
-                    print(f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: NO ES UNA CARA VÁLIDA (face_confidence={validation['face_confidence']:.3f} < threshold={FACE_CONFIDENCE_THRESHOLD}), eliminant tot el clúster")
                     continue
-                # PASO 4: És una cara vàlida! Crear carpeta
                 out_dir = chars_dir / char_id
                 out_dir.mkdir(parents=True, exist_ok=True)
-                # PASO 5: Limitar caras a mostrar (primera meitat + 1)
-                total_faces = len(face_detections_sorted)
                 max_faces_to_show = (total_faces // 2) + 1
-                face_detections_limited = face_detections_sorted[:max_faces_to_show]
-                # Copiar solo las caras limitadas
-                files = []
-                face_files_urls = []
-                for k, face_det in enumerate(face_detections_limited):
-                    fname = face_det['file']
                     src = faces_root / fname
                     dst = out_dir / fname
                     try:
                         _sh.copy2(src, dst)
                         files.append(fname)
-                        face_files_urls.append(f"/files/{video_name}/{char_id}/{fname}")
                     except Exception:
                         pass
-                # Imagen representativa (la mejor)
                 rep = files[0] if files else None
                 if rep:
                     rep_src = out_dir / rep
@@ -676,535 +683,180 @@ def process_video_job(job_id: str):
                         _sh.copy2(rep_src, rep_dst)
                     except Exception:
                         pass
-                # PASO 6: Generar nombre de clúster
-                cluster_number = int(char_id.split('_')[1]) + 1
                 character_name = f"Cluster {cluster_number}"
-                gender = validation['gender']
                 print(f"[{job_id}] [NAME GENERATION] Cluster {char_id}:")
                 print(f"[{job_id}]   - Gender detectado: {gender}")
                 print(f"[{job_id}]   - Nombre asignado: {character_name}")
                 print(f"[{job_id}]   - Seed usado: {char_id}")
-                character_data = {
                     "id": char_id,
                     "name": character_name,
                     "gender": gender,
-                    "gender_confidence": validation['gender_confidence'],
-                    "face_confidence": validation['face_confidence'],
-                    "man_prob": validation['man_prob'],
-                    "woman_prob": validation['woman_prob'],
                     "folder": str(out_dir),
                     "num_faces": len(files),
                     "total_faces_detected": total_faces,
                     "image_url": f"/files/{video_name}/{char_id}/representative.jpg" if rep else "",
-                    "face_files": face_files_urls,
-                }
-                characters_validated.append(character_data)
                 print(f"[{job_id}] [VALIDATION] ✓ Cluster {char_id}: CARA VÁLIDA!")
                 print(f"[{job_id}]   Nombre: {character_name}")
-                print(f"[{job_id}]   Género: {gender} (man={validation['man_prob']:.3f}, woman={validation['woman_prob']:.3f})")
-                print(f"[{job_id}]   Confianza género: {validation['gender_confidence']:.3f}")
-                print(f"[{job_id}]   Confianza cara: {validation['face_confidence']:.3f}")
                 print(f"[{job_id}]   Caras mostradas: {len(files)}/{total_faces}")
                 print(f"[{job_id}]   Imagen representativa: {best_face_path.name}")
-            # Estadístiques finals
             eliminated_count = original_cluster_count - len(characters_validated)
             print(f"[{job_id}] [VALIDATION] Total: {len(characters_validated)} clústers vàlids "
                   f"(eliminats {eliminated_count} falsos positius)")
-            characters = characters_validated
-            # Escribir analysis.json compatible con 'originales'
-            analysis = {
-                "caras": [{"embeddings": e} for e in embeddings],
-                "voices": [],
-                "escenas": [],
-            }
-            analysis_path = str(base / "analysis.json")
-            with open(analysis_path, "w", encoding="utf-8") as f:
-                json.dump(analysis, f, ensure_ascii=False)
-            face_labels = labels
-            num_face_embeddings = len(embeddings)
-            print(f"[{job_id}] Personajes detectados: {len(characters)}")
-            for char in characters:
-                print(f"[{job_id}]   - {char['name']}: {char['num_faces']} caras")
-            # Enriquecer info de personajes con listado real de imágenes disponibles
-            try:
-                import glob, os
-                for ch in characters:
-                    folder = ch.get("folder")
-                    face_files = []
-                    if folder and os.path.isdir(folder):
-                        # soportar patrones face_* y extensiones jpg/png
-                        patterns = ["face_*.jpg", "face_*.png"]
-                        files = []
-                        for pat in patterns:
-                            files.extend(glob.glob(os.path.join(folder, pat)))
-                        # si no hay face_*, tomar cualquier jpg/png para no dejar vacío
-                        if not files:
-                            files.extend(glob.glob(os.path.join(folder, "*.jpg")))
-                            files.extend(glob.glob(os.path.join(folder, "*.png")))
-                        # normalizar nombres de fichero relativos
-                        face_files = sorted({os.path.basename(p) for p in files})
-                        # Garantizar que representative.(jpg|png) esté el primero si existe
-                        for rep_name in ("representative.jpg", "representative.png"):
-                            rep_path = os.path.join(folder, rep_name)
-                            if os.path.exists(rep_path):
-                                if rep_name in face_files:
-                                    face_files.remove(rep_name)
-                                face_files.insert(0, rep_name)
-                    ch["face_files"] = face_files
-                    # Ajustar num_faces si hay discrepancia
-                    if face_files:
-                        ch["num_faces"] = len(face_files)
-            except Exception as _e:
-                print(f"[{job_id}] WARN - No se pudo enumerar face_files: {_e}")
-            # Procesamiento de audio: diarización, ASR y embeddings de voz
-            try:
-                cfg = load_yaml("config.yaml")
-                audio_segments, srt_unmod, full_txt, diar_info, connection_logs = process_audio_for_video(video_path, base, cfg, voice_collection=None)
-                # Loggear en consola del engine los eventos de conexión
-                try:
-                    for ev in (connection_logs or []):
-                        msg = ev.get("message") if isinstance(ev, dict) else None
-                        if msg:
-                            print(f"[{job_id}] {msg}")
-                except Exception:
-                    pass
-            except Exception as e_audio:
-                import traceback
-                print(f"[{job_id}] WARN - Audio pipeline failed: {e_audio}\n{traceback.format_exc()}")
-                audio_segments, srt_unmod, full_txt = [], None, ""
-                diar_info = {"diarization_ok": False, "error": str(e_audio)}
-                connection_logs = []
-            # Fallback: si no hay segmentos de audio, crear uno mínimo del audio completo
-            if not audio_segments:
-                try:
-                    from pathlib import Path as _P
-                    from pydub import AudioSegment as _AS
-                    wav_out = extract_audio_ffmpeg(video_path, base / f"{_P(video_path).stem}.wav", sr=16000)
-                    audio = _AS.from_wav(wav_out)
-                    clips_dir = base / "clips"
-                    clips_dir.mkdir(parents=True, exist_ok=True)
-                    cp = clips_dir / "segment_000.wav"
-                    audio.export(cp, format="wav")
-                    emb_list = embed_voice_segments([str(cp)])
-                    audio_segments = [{
-                        "segment": 0,
-                        "start": 0.0,
-                        "end": float(len(audio) / 1000.0),
-                        "speaker": "SPEAKER_00",
-                        "text": "",
-                        "voice_embedding": emb_list[0] if emb_list else [],
-                        "clip_path": str(cp),
-                        "lang": "ca",
-                        "lang_prob": 1.0,
-                    }]
-                except Exception as _efb:
-                    print(f"[{job_id}] WARN - Audio minimal fallback failed: {_efb}")
-            # Clustering jerárquico de voces sobre embeddings válidos
-            import numpy as np
-            voice_embeddings = [seg.get("voice_embedding") for seg in audio_segments if seg.get("voice_embedding")]
-            if voice_embeddings:
-                try:
-                    Xv = np.array(voice_embeddings)
-                    v_labels = hierarchical_cluster_with_min_size(Xv, v_max_groups, v_min_cluster, voice_sensitivity).tolist()
-                    print(f"[{job_id}] Clustering jerárquico de voz: {len(set([l for l in v_labels if l >= 0]))} clusters")
-                except Exception as _e:
-                    print(f"[{job_id}] WARN - Voice clustering failed: {_e}")
-                    v_labels = []
-            else:
-                v_labels = []
-            # Guardar resultados primero y luego marcar como completado (evita carreras)
             job["results"] = {
-                "characters": characters,
-                "num_characters": len(characters),
-                "analysis_path": analysis_path,
                 "base_dir": str(base),
-                "face_labels": face_labels,
-                "num_face_embeddings": num_face_embeddings,
-                "audio_segments": audio_segments,
-                "srt_unmodified": srt_unmod,
-                "full_transcription": full_txt,
-                "voice_labels": v_labels,
-                "num_voice_embeddings": len(voice_embeddings),
-                "diarization_info": diar_info,
             }
             job["status"] = JobStatus.DONE
-            # Log resumido sin embeddings
-            print(f"[{job_id}] ✓ Resultados guardados:")
-            print(f"[{job_id}]   - Personatges: {len(characters)}")
-            print(f"[{job_id}]   - Segments d'àudio: {len(audio_segments)}")
-            print(f"[{job_id}]   - Face embeddings: {num_face_embeddings}")
-            print(f"[{job_id}]   - Voice embeddings: {len(voice_embeddings)}")
-        except Exception as e_detect:
-            # Si falla la detección, intentar modo fallback
-            import traceback
-            print(f"[{job_id}] ✗ Error en detección: {e_detect}")
-            print(f"[{job_id}] Traceback: {traceback.format_exc()}")
-            print(f"[{job_id}] Usando modo fallback (carpetas vacías)")
-            # Crear carpetas básicas como fallback
-            for sub in ("sources", "faces", "voices", "backgrounds"):
-                (base / sub).mkdir(parents=True, exist_ok=True)
-            # Guardar resultados de fallback y luego marcar como completado
-            job["results"] = {
-                "characters": [],
-                "num_characters": 0,
-                "temp_dirs": {
-                    "sources": str(base / "sources"),
-                    "faces": str(base / "faces"),
-                    "voices": str(base / "voices"),
-                    "backgrounds": str(base / "backgrounds"),
-                },
-                "warning": f"Detección falló, usando modo fallback: {str(e_detect)}"
-            }
-            job["status"] = JobStatus.DONE
-        print(f"[{job_id}] ✓ Job completado exitosamente")
-    except Exception as e:
-        import traceback
-        print(f"[{job_id}] ✗ Error inesperado: {e}")
-        try:
-            job = jobs.get(job_id)
-            if job is not None:
-                job["status"] = JobStatus.FAILED
-                job["error"] = str(e)
-        except Exception:
-            pass
-        print(f"[{job_id}] Traceback: {traceback.format_exc()}")
-@app.post("/generate_audiodescription")
-async def generate_audiodescription(video: UploadFile = File(...)):
-    try:
-        import uuid
-        job_id = str(uuid.uuid4())
-        vid_name = video.filename or f"video_{job_id}.mp4"
-        base = TEMP_ROOT / Path(vid_name).stem
-        base.mkdir(parents=True, exist_ok=True)
-        # Save temp mp4
-        video_path = base / vid_name
-        with open(video_path, "wb") as f:
-            f.write(await video.read())
-        # Run MVP pipeline
-        result = ad_generate(str(video_path), base)
-        return {
-            "status": "done",
-            "results": {
-                "une_srt": result.get("une_srt", ""),
-                "free_text": result.get("free_text", ""),
-                "artifacts": result.get("artifacts", {}),
-            },
-        }
     except Exception as e:
         import traceback
-        print(f"/generate_audiodescription error: {e}\n{traceback.format_exc()}")
-        raise HTTPException(status_code=500, detail=str(e))
-@app.post("/load_casting")
-async def load_casting(
-    faces_dir: str = Form("identities/faces"),
-    voices_dir: str = Form("identities/voices"),
-    db_dir: str = Form("chroma_db"),
-    drop_collections: bool = Form(False),
-):
-    client = ensure_chroma(Path(db_dir))
-    n_faces = build_faces_index(Path(faces_dir), client, collection_name="index_faces", drop=drop_collections)
-    n_voices = build_voices_index(Path(voices_dir), client, collection_name="index_voices", drop=drop_collections)
-    return {"ok": True, "faces": n_faces, "voices": n_voices}
-@app.post("/finalize_casting")
-async def finalize_casting(
-    payload: dict = Body(...),
-):
-    """
-    Consolidate selected face and voice clusters into identities directories and build indices.
-    Expected payload:
-    {
-      "video_name": str,
-      "base_dir": str,  # engine temp base for this video
-      "characters": [
-        {"id": "char1", "name": "Nom", "folder": "/tmp/temp/<video>/char1", "kept_files": ["representative.jpg", ...], "description": "..."}, ...
-      ],
-      "voice_clusters": [
-        {"label": 0, "name": "SPEAKER_00", "clips": ["segment_000.wav", ...]}, ...
-      ]
-    }
-    """
-    import os
-    import shutil
-    from pathlib import Path as _P
-    video_name = payload.get("video_name")
-    base_dir = payload.get("base_dir")
-    characters = payload.get("characters", []) or []
-    voice_clusters = payload.get("voice_clusters", []) or []
-    if not video_name or not base_dir:
-        raise HTTPException(status_code=400, detail="Missing video_name or base_dir")
-    faces_out = IDENTITIES_ROOT / video_name / "faces"
-    voices_out = IDENTITIES_ROOT / video_name / "voices"
-    faces_out.mkdir(parents=True, exist_ok=True)
-    voices_out.mkdir(parents=True, exist_ok=True)
-    # Consolidate faces per character name (merge same names)
-    for ch in characters:
-        ch_name = (ch.get("name") or "Unknown").strip() or "Unknown"
-        ch_folder = ch.get("folder")
-        kept = ch.get("kept_files") or []
-        if not ch_folder or not os.path.isdir(ch_folder):
-            continue
-        dst_dir = faces_out / ch_name
-        dst_dir.mkdir(parents=True, exist_ok=True)
-        for fname in kept:
-            src = _P(ch_folder) / fname
-            if src.exists() and src.is_file():
-                try:
-                    shutil.copy2(src, dst_dir / fname)
-                except Exception:
-                    pass
-    # Consolidate voices per cluster name
-    clips_dir = _P(base_dir) / "clips"
-    for vc in voice_clusters:
-        v_name = (vc.get("name") or f"SPEAKER_{int(vc.get('label',0)):02d}").strip()
-        dst_dir = voices_out / v_name
-        dst_dir.mkdir(parents=True, exist_ok=True)
-        for wav in (vc.get("clips") or []):
-            src = clips_dir / wav
-            if src.exists() and src.is_file():
-                try:
-                    shutil.copy2(src, dst_dir / wav)
-                except Exception:
-                    pass
-    # Build indices using casting_loader helpers (best-effort)
-    db_dir = IDENTITIES_ROOT / video_name / "chroma_db"
-    try:
-        client = ensure_chroma(db_dir)
-        n_faces = build_faces_index(
-            faces_out,
-            client,
-            collection_name="index_faces",
-            deepface_model='Facenet512',
-            drop=True,
-        )
-        n_voices = build_voices_index(
-            voices_out,
-            client,
-            collection_name="index_voices",
-            drop=True,
-        )
-    except Exception as e:
-        # Si ChromaDB no está disponible o falla la indexación, no romper el flujo
-        print(f"[finalize_casting] WARN - No se pudieron construir índices ChromaDB: {e}")
-        n_faces = 0
-        n_voices = 0
-    # Summary of identities
-    face_identities = sorted([p.name for p in faces_out.iterdir() if p.is_dir()]) if faces_out.exists() else []
-    voice_identities = sorted([p.name for p in voices_out.iterdir() if p.is_dir()]) if voices_out.exists() else []
-    # Build casting_json with face and voice embeddings (best-effort) via remote Spaces
-    casting_json = {"face_col": [], "voice_col": []}
-    # Cargar config y router para acceder a svision/asr
-    try:
-        cfg = load_yaml("config.yaml")
-        router = LLMRouter(cfg)
-    except Exception:
-        router = None  # type: ignore
-    # Face embeddings per identity using remote svision (face_image_embedding)
-    try:
-        if face_identities and router is not None:
-            factory = router.client_factories.get("salamandra-vision")  # type: ignore[attr-defined]
-            if factory is not None:
-                vclient = factory()
-                gclient = getattr(vclient, "_client", None)
-            else:
-                gclient = None
-            if gclient is not None:
-                for identity in face_identities:
-                    id_dir = faces_out / identity
-                    if not id_dir.is_dir():
-                        continue
-                    # Buscar una imagen representativa
-                    img_path = None
-                    for ext in (".jpg", ".jpeg", ".png", ".bmp", ".webp"):
-                        candidates = list(id_dir.glob(f"*{ext}"))
-                        if candidates:
-                            img_path = candidates[0]
-                            break
-                    if not img_path:
-                        continue
-                    try:
-                        out = gclient.predict(str(img_path), api_name="/face_image_embedding")
-                        # svision devuelve normalmente una lista de embeddings o un solo embedding
-                        emb = None
-                        if isinstance(out, list):
-                            if out and isinstance(out[0], (list, tuple, float, int)):
-                                # Si es lista de listas, tomamos la primera; si es lista plana, la usamos tal cual
-                                if out and isinstance(out[0], (list, tuple)):
-                                    emb = list(out[0])
-                                else:
-                                    emb = list(out)
-                        elif isinstance(out, dict) and "embedding" in out:
-                            emb = out.get("embedding")
-                        if not emb:
-                            continue
-                        casting_json["face_col"].append({
-                            "nombre": identity,
-                            "embedding": emb,
-                        })
-                    except Exception:
-                        # No romper por un fallo puntual de embedding
-                        continue
-    except Exception:
-        # Si algo falla en todo el bloque de caras, dejamos face_col vacío
-        casting_json["face_col"] = []
-    # Voice embeddings per identity using remote asr (voice_embedding)
-    try:
-        if voice_identities and router is not None:
-            factory = router.client_factories.get("whisper-catalan")  # type: ignore[attr-defined]
-            if factory is not None:
-                aclient = factory()
-                gclient = getattr(aclient, "_client", None)
-            else:
-                gclient = None
-            if gclient is not None:
-                for identity in voice_identities:
-                    id_dir = voices_out / identity
-                    if not id_dir.is_dir():
-                        continue
-                    wav_files = sorted([p for p in id_dir.iterdir() if p.is_file() and p.suffix.lower() in [".wav", ".flac", ".mp3"]])
-                    if not wav_files:
-                        continue
-                    # Obtenemos un embedding representativo usando el primer clip
-                    wf = wav_files[0]
-                    try:
-                        out = gclient.predict(str(wf), api_name="/voice_embedding")
-                        emb = None
-                        if isinstance(out, list):
-                            emb = list(out)
-                        elif isinstance(out, dict) and "embedding" in out:
-                            emb = out.get("embedding")
-                        if not emb:
-                            continue
-                        casting_json["voice_col"].append({
-                            "nombre": identity,
-                            "embedding": emb,
-                        })
-                    except Exception:
-                        continue
-    except Exception:
-        # Si algo falla en todo el bloque de voces, dejamos voice_col vacío
-        casting_json["voice_col"] = []
-    return {
-        "ok": True,
-        "video_name": video_name,
-        "faces_dir": str(faces_out),
-        "voices_dir": str(voices_out),
-        "db_dir": str(db_dir),
-        "n_faces_embeddings": n_faces,
-        "n_voices_embeddings": n_voices,
-        "face_identities": face_identities,
-        "voice_identities": voice_identities,
-        "casting_json": casting_json,
-    }
-@app.get("/files_scene/{video_name}/{scene_id}/{filename}")
-def serve_scene_file(video_name: str, scene_id: str, filename: str):
-    file_path = TEMP_ROOT / video_name / "scenes" / scene_id / filename
-    if not file_path.exists():
-        raise HTTPException(status_code=404, detail="File not found")
-    return FileResponse(file_path)
 @app.post("/detect_scenes")
 async def detect_scenes(
-    video: UploadFile = File(...),
-    max_groups: int = Form(default=3),
     min_cluster_size: int = Form(default=3),
     scene_sensitivity: float = Form(default=0.5),
-    frame_interval_sec: float = Form(default=0.5),
 ):
     """
-    Detecta clústers d'escenes mitjançant clustering jeràrquic d'histogrames de color.
-    Retorna una llista de scene_clusters estructurada de forma similar a characters.
     """
     import cv2
     import numpy as np
-    # Guardar el vídeo temporalment
-    video_name = Path(video.filename).stem
     dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
-    with dst_video.open("wb") as f:
-        shutil.copyfileobj(video.file, f)
     cap = cv2.VideoCapture(str(dst_video))
     if not cap.isOpened():
-        raise HTTPException(status_code=400, detail="Cannot open video")
     fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
-    step = max(1, int(frame_interval_sec * fps))
-    frames = []
-    metas = []
-    idx = 0
-    while True:
-        ret = cap.grab()
         if not ret:
-            break
-        if idx % step == 0:
-            ret2, frame = cap.retrieve()
-            if not ret2:
-                break
-            # Reduir mida per estabilitat i càlcul ràpid
-            small = cv2.resize(frame, (160, 90))
-            hsv = cv2.cvtColor(small, cv2.COLOR_BGR2HSV)
-            # Histograma per canal
-            h_hist = cv2.calcHist([hsv],[0],None,[32],[0,180]).flatten()
-            s_hist = cv2.calcHist([hsv],[1],None,[32],[0,256]).flatten()
-            v_hist = cv2.calcHist([hsv],[2],None,[32],[0,256]).flatten()
-            hist = np.concatenate([h_hist, s_hist, v_hist])
-            hist = hist / (np.linalg.norm(hist) + 1e-8)
-            frames.append(hist)
-            metas.append({"index": idx, "time_sec": idx/float(fps)})
-        idx += 1
     cap.release()
     if not frames:
-        return {"scene_clusters": []}
     X = np.array(frames)
     labels = hierarchical_cluster_with_min_size(X, max_groups, min_cluster_size, scene_sensitivity).tolist()
@@ -1217,100 +869,75 @@ async def detect_scenes(
         if lbl is None or lbl < 0:
             continue
         clusters.setdefault(int(lbl), []).append(i)
-    # VALIDACIÓ MILLORADA: Fusionar clusters molt similars de forma més agressiva
-    # Calcular centroides (histograma promig de cada cluster)
     centroids = {}
     for lbl, idxs in clusters.items():
         cluster_histograms = X[idxs]
         centroids[lbl] = np.mean(cluster_histograms, axis=0)
     print(f"[SCENE VALIDATION] Validant similaritat entre {len(centroids)} clusters...")
-    # Thresholds més agressius per fusionar escenes similars
-    SIMILARITY_THRESHOLD = 0.25  # Aumentado de 0.15 a 0.25 (fusiona más)
-    CORRELATION_THRESHOLD = 0.85  # Correlación mínima para considerar similares
-    # Calcular matriu de distàncies i correlacions entre centroides
     cluster_labels = sorted(centroids.keys())
     similarities = {}
     for i, lbl1 in enumerate(cluster_labels):
         for lbl2 in cluster_labels[i+1:]:
-            # Distancia euclidiana (normalizada)
             dist = np.linalg.norm(centroids[lbl1] - centroids[lbl2])
-            # Correlación de Pearson entre histogramas
-            corr = np.corrcoef(centroids[lbl1], centroids[lbl2])[0, 1]
-            # Son similares si:
-            # - Distancia baja (< threshold) O
-            # - Correlación alta (> threshold)
             are_similar = (dist < SIMILARITY_THRESHOLD) or (corr > CORRELATION_THRESHOLD)
-            similarities[(lbl1, lbl2)] = {
-                'distance': dist,
-                'correlation': corr,
-                'similar': are_similar
-            }
             if are_similar:
-                print(f"[SCENE VALIDATION] Clusters {lbl1} i {lbl2} són similars: "
-                      f"dist={dist:.3f} (threshold={SIMILARITY_THRESHOLD}), "
-                      f"corr={corr:.3f} (threshold={CORRELATION_THRESHOLD})")
-    # Union-Find para fusionar clusters transitivamente
-    # Si A~B y B~C, entonces A~B~C (todos en el mismo grupo)
     parent = {lbl: lbl for lbl in cluster_labels}
     def find(x):
         if parent[x] != x:
-            parent[x] = find(parent[x])  # Path compression
         return parent[x]
     def union(x, y):
-        root_x = find(x)
-        root_y = find(y)
-        if root_x != root_y:
-            parent[root_y] = root_x
-    # Fusionar todos los clusters similares
     fusion_count = 0
     for (lbl1, lbl2), sim in similarities.items():
         if sim['similar']:
             union(lbl1, lbl2)
             fusion_count += 1
-    # Aplicar fusió als clusters
     new_clusters = {}
     for lbl, idxs in clusters.items():
         root = find(lbl)
-        if root not in new_clusters:
-            new_clusters[root] = []
-        new_clusters[root].extend(idxs)
-    # Reordenar labels para que sean consecutivos
     final_clusters_dict = {}
     for i, (root, idxs) in enumerate(sorted(new_clusters.items())):
         final_clusters_dict[i] = idxs
     clusters = final_clusters_dict
     final_clusters = len(clusters)
     eliminated = initial_clusters - final_clusters
-    print(f"[SCENE VALIDATION] ===== RESULTADO =====")
-    print(f"[SCENE VALIDATION] Clusters inicials: {initial_clusters}")
-    print(f"[SCENE VALIDATION] Fusions realitzades: {fusion_count}")
-    print(f"[SCENE VALIDATION] Clusters finals: {final_clusters}")
-    print(f"[SCENE VALIDATION] Clusters eliminats (fusionats): {eliminated}")
-    print(f"[SCENE VALIDATION] Reducció: {(eliminated/initial_clusters*100):.1f}%")
-    print(f"[SCENE VALIDATION] =======================")
-    # Escriure imatges representatives per a cada clúster
     base = TEMP_ROOT / video_name / "scenes"
     base.mkdir(parents=True, exist_ok=True)
     scene_list = []
     cap = cv2.VideoCapture(str(dst_video))
     for lbl, idxs in sorted(clusters.items(), key=lambda x: x[0]):
         scene_id = f"scene_{int(lbl):02d}"
         out_dir = base / scene_id
@@ -1329,53 +956,36 @@ async def detect_scenes(
         # Representative
         rep = frame_files[0] if frame_files else None
         image_url = f"/files_scene/{video_name}/{scene_id}/{rep}" if rep else ""
-        # Llamar a svision para describir la escena representativa
         scene_description = ""
-        scene_name = f"Escena {lbl+1}"
         if rep:
             rep_full_path = out_dir / rep
             if rep_full_path.exists():
-                print(f"Llamando a svision para describir {scene_id}...")
                 try:
-                    scene_description, scene_name = describe_image_with_svision(str(rep_full_path), is_face=False)
-                    if not scene_name:
-                        scene_name = f"Escena {lbl+1}"
-                    # Si tenemos descripción, generar nombre corto con schat
                     if scene_description:
-                        print(f"Llamando a schat para generar nombre corto de {scene_id}...")
                         try:
-                            # Usar LLMRouter para llamar a schat
                             config_path = os.getenv("CONFIG_YAML", "config.yaml")
                             if os.path.exists(config_path):
                                 with open(config_path, 'r', encoding='utf-8') as f:
                                     cfg = yaml.safe_load(f) or {}
                                 router = LLMRouter(cfg)
-                                prompt = f"Basant-te en aquesta descripció d'una escena, genera un nom curt de menys de 3 paraules que la resumeixi:\n\n{scene_description}\n\nNom de l'escena:"
                                 short_name = router.instruct(
                                     prompt=prompt,
-                                    system="Ets un assistent que genera noms curts i descriptius per a escenes. Respon NOMÉS amb el nom, sense explicacions.",
                                     model="salamandra-instruct"
-                                ).strip()
-                                # Limpiar posibles comillas o puntuación extra
-                                short_name = short_name.strip('"\'.,!?').strip()
-                                if short_name and len(short_name) > 0:
                                     scene_name = short_name
-                                    print(f"[schat] Nom generat: {scene_name}")
-                                else:
-                                    print(f"[schat] No s'ha generat nom, usant fallback")
-                        except Exception as e_schat:
-                            print(f"Error generando nombre con schat: {e_schat}")
-                            # Mantener el nombre de svision si schat falla
                 except Exception as e:
                     print(f"Error describiendo {scene_id}: {e}")
         scene_list.append({
             "id": scene_id,
             "name": scene_name,
@@ -1385,8 +995,8 @@ async def detect_scenes(
             "image_url": image_url,
             "frame_files": frame_files,
         })
-    cap.release()
     return {"scene_clusters": scene_list, "base_dir": str(base)}
 @app.post("/refine_narration")

 app.include_router(pending_videos_router)
 def describe_image_with_svision(image_path: str, is_face: bool = True) -> tuple[str, str]:
+    """Call the svision Space to describe an image (used in AD generation).
     Args:
+        image_path: Absolute path to the image.
+        is_face: True if the image is a face, False if it is a scene.
     Returns:
+        Tuple ``(full_description, short_name)``.
     """
     try:
         from pathlib import Path as _P
         import yaml
         from llm_router import LLMRouter
+        # Load configuration
         config_path = _P(__file__).parent / "config.yaml"
         if not config_path.exists():
             print(f"[svision] Config no encontrado: {config_path}")
         router = LLMRouter(cfg)
+        # Different context depending on whether the image is a face or a scene
         if is_face:
             context = {
                 "task": "describe_person",
                 "max_tokens": 128
             }
+        # Call svision
         descriptions = router.vision_describe([str(image_path)], context=context, model="salamandra-vision")
         full_description = descriptions[0] if descriptions else ""
         return ("", "")
 def normalize_face_lighting(image):
+    """Normalize face brightness using a combination of techniques.
+    1. CLAHE for adaptive histogram equalization.
+    2. Range normalization to homogenize overall brightness.
+    This reduces the impact of different lighting conditions on embeddings
+    and on how faces are visualized.
+    Only the lightness (L) channel of the LAB representation is modified;
+    the a and b color channels are merged back unchanged.
     Args:
+        image: BGR image (OpenCV format). Presumably a 3-channel uint8 array,
+            since it is passed straight to ``COLOR_BGR2LAB`` -- TODO confirm.
     Returns:
+        Normalized image, converted back to BGR.
     """
     import cv2
     import numpy as np
+    # Step 1: Convert to LAB color space so lightness can be adjusted separately from color
     lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
     l, a, b = cv2.split(lab)
+    # Step 2: Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) to the L channel
+    # clipLimit=3.0 was chosen for more aggressive normalization; tiles are 8x8
     clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
     l_clahe = clahe.apply(l)
+    # Step 3: Min-max normalize the L channel so its values span the whole range
+    # This gives every image with a non-flat L channel the same brightness range
     l_min, l_max = l_clahe.min(), l_clahe.max()
     if l_max > l_min:
+        # Stretch to the full range [0, 255]; astype(np.uint8) truncates the float result
         l_normalized = ((l_clahe - l_min) * 255.0 / (l_max - l_min)).astype(np.uint8)
     else:
         l_normalized = l_clahe  # flat L channel: skip the stretch (would divide by zero)
+    # Step 4: Apply a small 3x3 Gaussian blur to the L channel to reduce noise introduced by normalization
     l_normalized = cv2.GaussianBlur(l_normalized, (3, 3), 0)
+    # Recombine the processed L channel with the original a and b channels
     lab_normalized = cv2.merge([l_normalized, a, b])
+    # Convert back to BGR
     normalized = cv2.cvtColor(lab_normalized, cv2.COLOR_LAB2BGR)
     return normalized
 def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
+    """Hierarchical clustering with silhouette score and a minimum cluster size.
+    It automatically selects the best number of clusters (up to ``max_groups``)
+    using the silhouette score, and then filters out clusters with fewer than
+    ``min_cluster_size`` samples (marked as -1 / noise).
     Args:
+        X: Embedding array of shape (N, D).
+        max_groups: Maximum number of clusters to form.
+        min_cluster_size: Minimum size for a cluster to be considered valid.
+        sensitivity: Clustering sensitivity (0.0–1.0).
+            - 0.0 = very aggressive (fewer clusters).
+            - 0.5 = balanced (recommended).
+            - 1.0 = permissive (more clusters).
     Returns:
+        ``np.ndarray`` of labels (N,), where -1 indicates noise.
     """
     import numpy as np
     from scipy.cluster.hierarchy import linkage, fcluster
         return np.array([])
     if len(X) < min_cluster_size:
+        # If there are fewer samples than the minimum, treat everything as noise
         return np.full(len(X), -1, dtype=int)
+    # Linkage using average linkage (more flexible than ward and less sensitive to outliers)
+    # This helps group the same person under different angles/expressions
     Z = linkage(X, method='average', metric='cosine')  # Cosine similarity para embeddings
+    # Find the optimal number of clusters using the silhouette score
     best_n_clusters = 2
     best_score = -1
+    # Try different numbers of clusters (from 2 to max_groups)
+    max_to_try = min(max_groups, len(X) - 1)  # Cannot have more clusters than samples
     if max_to_try >= 2:
         for n_clusters in range(2, max_to_try + 1):
             trial_labels = fcluster(Z, t=n_clusters, criterion='maxclust') - 1
+            # Compute how many valid clusters we would have after filtering
             trial_counts = Counter(trial_labels)
             valid_clusters = sum(1 for count in trial_counts.values() if count >= min_cluster_size)
+            # Only evaluate if there are at least 2 valid clusters
             if valid_clusters >= 2:
                 try:
                     score = silhouette_score(X, trial_labels, metric='cosine')
+                    # Dynamic penalty based on sensitivity:
+                    # - sensitivity = 0.0 → penalty = 0.14 (very aggressive, fewer clusters)
+                    # - sensitivity = 0.5 → penalty = 0.07 (balanced, recommended)
+                    # - sensitivity = 1.0 → penalty = 0.01 (permissive, more clusters)
                     penalty = 0.14 - (sensitivity * 0.13)
                     adjusted_score = score - (n_clusters * penalty)
                 except:
                     pass  # Si falla el cálculo, ignorar esta configuración
+    # Use the optimal number of clusters found
     penalty = 0.14 - (sensitivity * 0.13)
     print(f"Clustering óptimo: {best_n_clusters} clusters (de máximo {max_groups}), sensitivity={sensitivity:.2f}, penalty={penalty:.3f}, silhouette={best_score:.3f}")
     labels = fcluster(Z, t=best_n_clusters, criterion='maxclust')
+    # fcluster returns 1-indexed labels; convert to 0-indexed
     labels = labels - 1
+    # Filter out small clusters
     label_counts = Counter(labels)
     filtered_labels = []
     for lbl in labels:
         if label_counts[lbl] >= min_cluster_size:
             filtered_labels.append(lbl)
         else:
+            filtered_labels.append(-1)  # Noise
     return np.array(filtered_labels, dtype=int)
     voice_sensitivity: float = Form(default=0.5),
     max_frames: int = Form(default=100),
 ):
+    """Create a background job to process a video using hierarchical clustering.
+    This endpoint stores the uploaded video, creates a job entry and
+    starts ``process_video_job`` in the background. It immediately
+    returns a ``job_id`` that the UI can poll.
     """
+    # Save video into the data folder
     video_name = Path(video.filename).stem
     dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
     with dst_video.open("wb") as f:
         shutil.copyfileobj(video.file, f)
+    # Create unique job_id
     job_id = str(uuid.uuid4())
+    # Initialize job metadata
     jobs[job_id] = {
         "id": job_id,
         "status": JobStatus.QUEUED,
     print(f"[{job_id}] Job creado para vídeo: {video_name}")
+    # Start processing in the background
     background_tasks.add_task(process_video_job, job_id)
     # Return the job_id immediately
             # Build per-cluster folders with DeepFace validation
             from face_classifier import validate_and_classify_face, FACE_CONFIDENCE_THRESHOLD
+            characters_validated: list[dict[str, Any]] = []
             cluster_map: dict[int, list[int]] = {}
+            fallback_candidate: dict[str, Any] | None = None
+            for idx, lbl in enumerate(labels):
                 if isinstance(lbl, int) and lbl >= 0:
+                    cluster_map.setdefault(lbl, []).append(idx)
             chars_dir = base / "characters"
             chars_dir.mkdir(parents=True, exist_ok=True)
             import shutil as _sh
             original_cluster_count = len(cluster_map)
             print(f"[{job_id}] Procesando {original_cluster_count} clusters detectados...")
             for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
                 char_id = f"char_{ci:02d}"
+                detections: list[dict[str, Any]] = []
                 for j in idxs:
                     meta = crops_meta[j]
+                    file_name = meta.get("file")
+                    if not file_name:
+                        continue
                     box = meta.get("box", [0, 0, 0, 0])
+                    area = 0
                     if len(box) >= 4:
                         top, right, bottom, left = box
+                        area = abs(right - left) * abs(bottom - top)
+                    detections.append({
+                        "index": j,
+                        "file": file_name,
+                        "score": area,
+                        "box": box,
                     })
+                if not detections:
                     print(f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: sense deteccions, eliminant")
                     continue
+                detections.sort(key=lambda d: d["score"], reverse=True)
+                best_face = detections[0]
+                best_face_path = faces_root / best_face["file"]
                 print(f"[{job_id}] [VALIDATION] Cluster {char_id}: validant millor cara (bbox_area={best_face['score']:.0f}px²)")
                 print(f"[{job_id}] [VALIDATION] Cluster {char_id}: millor cara path={best_face_path}")
                 print(f"[{job_id}] [VALIDATION] ▶▶▶ CRIDANT validate_and_classify_face() ◀◀◀")
                 validation = validate_and_classify_face(str(best_face_path))
                 print(f"[{job_id}] [VALIDATION] ▶▶▶ validate_and_classify_face() RETORNAT ◀◀◀")
+                candidate_conf = 0.0
+                if validation:
+                    try:
+                        candidate_conf = float(validation.get("face_confidence", 0.0) or 0.0)
+                    except Exception:
+                        candidate_conf = 0.0
+                if not fallback_candidate or candidate_conf > fallback_candidate.get("face_confidence", -1.0):
+                    fallback_candidate = {
+                        "char_id": char_id,
+                        "detection": best_face,
+                        "validation": validation,
+                        "path": best_face_path,
+                        "face_confidence": candidate_conf,
+                    }
                 if not validation:
                     print(f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: error en validació DeepFace, eliminant cluster")
                     continue
                 print(f"[{job_id}] [DEEPFACE RESULT] Cluster {char_id}:")
                 print(f"[{job_id}]   - is_valid_face: {validation['is_valid_face']}")
                 print(f"[{job_id}]   - face_confidence: {validation['face_confidence']:.3f}")
                 print(f"[{job_id}]   - gender_diff: {abs(validation['man_prob'] - validation['woman_prob']):.3f}")
                 print(f"[{job_id}]   - gender_assigned: {validation['gender']}")
                 print(f"[{job_id}]   - gender_confidence: {validation['gender_confidence']:.3f}")
+                if (not validation.get("is_valid_face")) or (validation.get("face_confidence", 0.0) < FACE_CONFIDENCE_THRESHOLD):
+                    print(
+                        f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: NO ES UNA CARA VÁLIDA "
+                        f"(face_confidence={validation.get('face_confidence', 0.0):.3f} < threshold={FACE_CONFIDENCE_THRESHOLD}), eliminant tot el clúster"
+                    )
                     continue
                 out_dir = chars_dir / char_id
                 out_dir.mkdir(parents=True, exist_ok=True)
+                total_faces = len(detections)
                 max_faces_to_show = (total_faces // 2) + 1
+                selected = detections[:max_faces_to_show]
+                files: list[str] = []
+                file_urls: list[str] = []
+                for det in selected:
+                    fname = det["file"]
                     src = faces_root / fname
                     dst = out_dir / fname
                     try:
                         _sh.copy2(src, dst)
                         files.append(fname)
+                        file_urls.append(f"/files/{video_name}/{char_id}/{fname}")
                     except Exception:
                         pass
                 rep = files[0] if files else None
                 if rep:
                     rep_src = out_dir / rep
                         _sh.copy2(rep_src, rep_dst)
                     except Exception:
                         pass
+                cluster_number = int(char_id.split("_")[1]) + 1
                 character_name = f"Cluster {cluster_number}"
+                gender = validation.get("gender", "Neutral")
                 print(f"[{job_id}] [NAME GENERATION] Cluster {char_id}:")
                 print(f"[{job_id}]   - Gender detectado: {gender}")
                 print(f"[{job_id}]   - Nombre asignado: {character_name}")
                 print(f"[{job_id}]   - Seed usado: {char_id}")
+                characters_validated.append({
                     "id": char_id,
                     "name": character_name,
                     "gender": gender,
+                    "gender_confidence": validation.get("gender_confidence", 0.0),
+                    "face_confidence": validation.get("face_confidence", 0.0),
+                    "man_prob": validation.get("man_prob", 0.0),
+                    "woman_prob": validation.get("woman_prob", 0.0),
                     "folder": str(out_dir),
                     "num_faces": len(files),
                     "total_faces_detected": total_faces,
                     "image_url": f"/files/{video_name}/{char_id}/representative.jpg" if rep else "",
+                    "face_files": file_urls,
+                })
                 print(f"[{job_id}] [VALIDATION] ✓ Cluster {char_id}: CARA VÁLIDA!")
                 print(f"[{job_id}]   Nombre: {character_name}")
+                print(f"[{job_id}]   Género: {gender} (man={validation.get('man_prob', 0.0):.3f}, woman={validation.get('woman_prob', 0.0):.3f})")
+                print(f"[{job_id}]   Confianza género: {validation.get('gender_confidence', 0.0):.3f}")
+                print(f"[{job_id}]   Confianza cara: {validation.get('face_confidence', 0.0):.3f}")
                 print(f"[{job_id}]   Caras mostradas: {len(files)}/{total_faces}")
                 print(f"[{job_id}]   Imagen representativa: {best_face_path.name}")
             eliminated_count = original_cluster_count - len(characters_validated)
             print(f"[{job_id}] [VALIDATION] Total: {len(characters_validated)} clústers vàlids "
                   f"(eliminats {eliminated_count} falsos positius)")
+            if not characters_validated and fallback_candidate:
+                print(f"[{job_id}] [FALLBACK] No hi ha clústers vàlids. Creant clúster de reserva amb la millor cara trobada.")
+                fallback = fallback_candidate
+                det = fallback.get("detection", {})
+                fname = det.get("file")
+                fallback_path: Path | None = fallback.get("path")
+                val = fallback.get("validation")
+                idx = det.get("index")
+                if fname and fallback_path is not None:
+                    if val is None:
+                        val = validate_and_classify_face(str(fallback_path))
+                    if val is None:
+                        val = {
+                            "is_valid_face": False,
+                            "face_confidence": fallback.get("face_confidence", 0.0),
+                            "gender": "Neutral",
+                            "gender_confidence": 0.0,
+                            "man_prob": 0.0,
+                            "woman_prob": 0.0,
+                        }
+                    out_dir = chars_dir / "char_00"
+                    out_dir.mkdir(parents=True, exist_ok=True)
+                    src = faces_root / fname
+                    dst = out_dir / fname
+                    try:
+                        _sh.copy2(src, dst)
+                    except Exception as copy_err:
+                        print(f"[{job_id}] [FALLBACK] Error copiant la imatge de reserva: {copy_err}")
+                    rep_dst = out_dir / "representative.jpg"
+                    try:
+                        _sh.copy2(dst, rep_dst)
+                    except Exception:
+                        pass
+                    if embeddings:
+                        if not labels or len(labels) != len(embeddings):
+                            labels = [-1] * len(embeddings)
+                        if isinstance(idx, int) and 0 <= idx < len(labels):
+                            labels[idx] = 0
+                    characters_validated.append({
+                        "id": "char_00",
+                        "name": "Cluster 1",
+                        "gender": val.get("gender", "Neutral"),
+                        "gender_confidence": val.get("gender_confidence", 0.0),
+                        "face_confidence": val.get("face_confidence", 0.0),
+                        "man_prob": val.get("man_prob", 0.0),
+                        "woman_prob": val.get("woman_prob", 0.0),
+                        "folder": str(out_dir),
+                        "num_faces": 1,
+                        "total_faces_detected": 1,
+                        "image_url": f"/files/{video_name}/char_00/representative.jpg",
+                        "face_files": [f"/files/{video_name}/char_00/{fname}"],
+                    })
+                    print(f"[{job_id}] [FALLBACK] Clúster de reserva creat amb confiança {val.get('face_confidence', 0.0):.3f}")
+                else:
+                    print(f"[{job_id}] [FALLBACK] Dades insuficients per crear el clúster de reserva")
+            # Store face results
             job["results"] = {
+                "characters": characters_validated,
+                "face_labels": labels,
+                "video_name": video_name,
                 "base_dir": str(base),
             }
             job["status"] = JobStatus.DONE
+            print(f"[{job_id}] ✓ Procesamiento de caras completado: {len(characters_validated)} personajes")
+        except Exception as face_error:
+            print(f"[{job_id}] Error en detección de caras: {face_error}")
+            import traceback
+            traceback.print_exc()
+            job["results"] = {"characters": [], "face_labels": [], "video_name": video_name, "base_dir": str(base)}
+            job["status"] = JobStatus.DONE  # Still mark done so UI can proceed
     except Exception as e:
+        print(f"[{job_id}] Error general en procesamiento: {e}")
         import traceback
+        traceback.print_exc()
+        job["status"] = JobStatus.FAILED
+        job["error"] = str(e)
 @app.post("/detect_scenes")
 async def detect_scenes(
+    video_name: str = Form(...),
+    max_groups: int = Form(default=5),
     min_cluster_size: int = Form(default=3),
     scene_sensitivity: float = Form(default=0.5),
 ):
     """
+    Detect and group scenes in an already-processed video.

     Samples up to 200 evenly spaced frames from ``VIDEOS_ROOT/<video_name>.mp4``,
     describes each one with a flattened 8x8x8 colour histogram, clusters the
     histograms with ``hierarchical_cluster_with_min_size`` and then merges
     clusters whose centroids are very similar.

     Args:
         video_name: Base name (without ``.mp4``) of the video under VIDEOS_ROOT.
         max_groups: Forwarded to ``hierarchical_cluster_with_min_size``.
         min_cluster_size: Forwarded to ``hierarchical_cluster_with_min_size``.
         scene_sensitivity: Forwarded to ``hierarchical_cluster_with_min_size``.

     Returns:
         ``{"scene_clusters": [...], "base_dir": ...}`` where each cluster entry
         includes ``id``, ``name``, ``image_url`` and ``frame_files``; or
         ``{"error": ...}`` when the video is missing or cannot be opened.
     """
     import cv2
     import numpy as np
+    from typing import Any
     dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
+    if not dst_video.exists():
+        return {"error": f"Video {video_name} not found"}
     cap = cv2.VideoCapture(str(dst_video))
     if not cap.isOpened():
+        return {"error": "Could not open video"}
     # Fall back to 25 fps when the capture reports no frame rate.
     fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
+    max_samples = 200  # Limit samples for scene detection
     # Evenly spaced frame indices over the whole video, capped at max_samples;
     # set() removes any duplicate indices and sorted() restores frame order.
+    if total_frames > 0:
+        frame_indices = sorted(set(np.linspace(0, max(0, total_frames - 1), num=min(max_samples, max(1, total_frames)), dtype=int).tolist()))
+    else:
+        frame_indices = []
+    frames: list[list[float]] = []
+    metas: list[dict[str, Any]] = []
+    for frame_idx in frame_indices:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
+        ret, frame = cap.read()
         # Frames that cannot be read are skipped rather than aborting the request.
         if not ret:
+            continue
+        # Color histogram as feature (8 bins per channel, flattened to 512 values)
+        hist = cv2.calcHist([frame], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
+        hist = cv2.normalize(hist, hist).flatten()
+        frames.append(hist.tolist())
+        metas.append({"index": frame_idx})
     cap.release()
     # No frame could be read: report an empty result instead of clustering.
     if not frames:
+        return {"scene_clusters": [], "base_dir": ""}
     X = np.array(frames)
     labels = hierarchical_cluster_with_min_size(X, max_groups, min_cluster_size, scene_sensitivity).tolist()
         # Samples with a missing or negative label are left out of every cluster.
         if lbl is None or lbl < 0:
             continue
         clusters.setdefault(int(lbl), []).append(i)
+    # Fallback: guarantee at least one scene cluster
+    if not clusters and frames:
+        clusters[0] = [0]  # Use the first frame as the default scene
+        print("[SCENE FALLBACK] Cap cluster vàlid, creant cluster amb primer frame")
+    # IMPROVED VALIDATION: merge very similar clusters
     # The centroid of a cluster is the mean histogram of its member samples.
     centroids = {}
     for lbl, idxs in clusters.items():
         cluster_histograms = X[idxs]
         centroids[lbl] = np.mean(cluster_histograms, axis=0)
     print(f"[SCENE VALIDATION] Validant similaritat entre {len(centroids)} clusters...")
     # Two clusters count as similar when their centroids are closer than
     # SIMILARITY_THRESHOLD (Euclidean distance) or their correlation
     # coefficient exceeds CORRELATION_THRESHOLD.
+    SIMILARITY_THRESHOLD = 0.25
+    CORRELATION_THRESHOLD = 0.85
     cluster_labels = sorted(centroids.keys())
     similarities = {}
     # Compare every unordered pair of clusters exactly once.
     for i, lbl1 in enumerate(cluster_labels):
         for lbl2 in cluster_labels[i+1:]:
             dist = np.linalg.norm(centroids[lbl1] - centroids[lbl2])
+            corr = np.corrcoef(centroids[lbl1], centroids[lbl2])[0, 1] if len(centroids[lbl1]) > 1 else 0.0
             are_similar = (dist < SIMILARITY_THRESHOLD) or (corr > CORRELATION_THRESHOLD)
+            similarities[(lbl1, lbl2)] = {'distance': dist, 'correlation': corr, 'similar': are_similar}
             if are_similar:
+                print(f"[SCENE VALIDATION] Clusters {lbl1} i {lbl2} similars: dist={dist:.3f}, corr={corr:.3f}")
+    # Union-Find to merge clusters
     parent = {lbl: lbl for lbl in cluster_labels}
     def find(x):
         if parent[x] != x:
+            parent[x] = find(parent[x])  # path compression
         return parent[x]
     def union(x, y):
+        rx, ry = find(x), find(y)
+        if rx != ry:
+            parent[ry] = rx
     # fusion_count counts similar pairs; it can exceed the number of clusters
     # actually merged because union() is a no-op for already-joined pairs.
     fusion_count = 0
     for (lbl1, lbl2), sim in similarities.items():
         if sim['similar']:
             union(lbl1, lbl2)
             fusion_count += 1
     # Regroup the sample indices under the root label of each merged set.
     new_clusters = {}
     for lbl, idxs in clusters.items():
         root = find(lbl)
+        new_clusters.setdefault(root, []).extend(idxs)
     # Renumber the merged clusters consecutively from 0, in sorted root order.
     final_clusters_dict = {}
     for i, (root, idxs) in enumerate(sorted(new_clusters.items())):
         final_clusters_dict[i] = idxs
     clusters = final_clusters_dict
     final_clusters = len(clusters)
     eliminated = initial_clusters - final_clusters
+    print(f"[SCENE VALIDATION] Clusters finals: {final_clusters} (fusionats: {eliminated})")
+    # Write representative images
     base = TEMP_ROOT / video_name / "scenes"
     base.mkdir(parents=True, exist_ok=True)
     scene_list = []
     cap = cv2.VideoCapture(str(dst_video))
     for lbl, idxs in sorted(clusters.items(), key=lambda x: x[0]):
         scene_id = f"scene_{int(lbl):02d}"
         out_dir = base / scene_id
         # Representative
         rep = frame_files[0] if frame_files else None
         image_url = f"/files_scene/{video_name}/{scene_id}/{rep}" if rep else ""
+        # Call svision to describe the scene
         scene_description = ""
         # Default 1-based name, kept unless a short name is generated below.
+        scene_name = f"Cluster {lbl+1}"
         if rep:
             rep_full_path = out_dir / rep
             if rep_full_path.exists():
                 try:
+                    scene_description, _ = describe_image_with_svision(str(rep_full_path), is_face=False)
+                    # Generate a short name with schat
                     if scene_description:
                         try:
                             # Config path comes from CONFIG_YAML, defaulting to "config.yaml".
                             config_path = os.getenv("CONFIG_YAML", "config.yaml")
                             if os.path.exists(config_path):
                                 with open(config_path, 'r', encoding='utf-8') as f:
                                     cfg = yaml.safe_load(f) or {}
                                 router = LLMRouter(cfg)
+                                prompt = f"Genera un nom curt (2-3 paraules) per aquesta escena:\n{scene_description}"
                                 short_name = router.instruct(
                                     prompt=prompt,
+                                    system="Respon NOMÉS amb el nom, sense explicacions.",
                                     model="salamandra-instruct"
+                                ).strip().strip('"\'.,!?')
+                                if short_name:
                                     scene_name = short_name
                         # Best effort: on any failure the default scene name is kept.
+                        except Exception:
+                            pass
                 except Exception as e:
                     print(f"Error describiendo {scene_id}: {e}")
         scene_list.append({
             "id": scene_id,
             "name": scene_name,
             "image_url": image_url,
             "frame_files": frame_files,
         })
+    cap.release()
     return {"scene_clusters": scene_list, "base_dir": str(base)}
 @app.post("/refine_narration")