Upload api.py
Browse files
api.py
CHANGED
|
@@ -75,22 +75,21 @@ app.include_router(embeddings_router)
|
|
| 75 |
app.include_router(pending_videos_router)
|
| 76 |
|
| 77 |
def describe_image_with_svision(image_path: str, is_face: bool = True) -> tuple[str, str]:
|
| 78 |
-
"""
|
| 79 |
-
|
| 80 |
-
|
| 81 |
Args:
|
| 82 |
-
image_path:
|
| 83 |
-
is_face: True
|
| 84 |
-
|
| 85 |
Returns:
|
| 86 |
-
|
| 87 |
"""
|
| 88 |
try:
|
| 89 |
from pathlib import Path as _P
|
| 90 |
import yaml
|
| 91 |
from llm_router import LLMRouter
|
| 92 |
|
| 93 |
-
#
|
| 94 |
config_path = _P(__file__).parent / "config.yaml"
|
| 95 |
if not config_path.exists():
|
| 96 |
print(f"[svision] Config no encontrado: {config_path}")
|
|
@@ -101,7 +100,7 @@ def describe_image_with_svision(image_path: str, is_face: bool = True) -> tuple[
|
|
| 101 |
|
| 102 |
router = LLMRouter(cfg)
|
| 103 |
|
| 104 |
-
#
|
| 105 |
if is_face:
|
| 106 |
context = {
|
| 107 |
"task": "describe_person",
|
|
@@ -115,7 +114,7 @@ def describe_image_with_svision(image_path: str, is_face: bool = True) -> tuple[
|
|
| 115 |
"max_tokens": 128
|
| 116 |
}
|
| 117 |
|
| 118 |
-
#
|
| 119 |
descriptions = router.vision_describe([str(image_path)], context=context, model="salamandra-vision")
|
| 120 |
full_description = descriptions[0] if descriptions else ""
|
| 121 |
|
|
@@ -133,68 +132,69 @@ def describe_image_with_svision(image_path: str, is_face: bool = True) -> tuple[
|
|
| 133 |
return ("", "")
|
| 134 |
|
| 135 |
def normalize_face_lighting(image):
|
| 136 |
-
"""
|
| 137 |
-
|
| 138 |
-
1. CLAHE
|
| 139 |
-
2.
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
Args:
|
| 145 |
-
image:
|
| 146 |
-
|
| 147 |
Returns:
|
| 148 |
-
|
| 149 |
"""
|
| 150 |
import cv2
|
| 151 |
import numpy as np
|
| 152 |
|
| 153 |
-
#
|
| 154 |
lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
|
| 155 |
l, a, b = cv2.split(lab)
|
| 156 |
|
| 157 |
-
#
|
| 158 |
-
#
|
| 159 |
clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
|
| 160 |
l_clahe = clahe.apply(l)
|
| 161 |
|
| 162 |
-
#
|
| 163 |
-
#
|
| 164 |
l_min, l_max = l_clahe.min(), l_clahe.max()
|
| 165 |
if l_max > l_min:
|
| 166 |
-
#
|
| 167 |
l_normalized = ((l_clahe - l_min) * 255.0 / (l_max - l_min)).astype(np.uint8)
|
| 168 |
else:
|
| 169 |
l_normalized = l_clahe
|
| 170 |
|
| 171 |
-
#
|
| 172 |
l_normalized = cv2.GaussianBlur(l_normalized, (3, 3), 0)
|
| 173 |
|
| 174 |
-
#
|
| 175 |
lab_normalized = cv2.merge([l_normalized, a, b])
|
| 176 |
|
| 177 |
-
#
|
| 178 |
normalized = cv2.cvtColor(lab_normalized, cv2.COLOR_LAB2BGR)
|
| 179 |
return normalized
|
| 180 |
|
| 181 |
def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
|
| 182 |
-
"""
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
|
|
|
| 187 |
Args:
|
| 188 |
-
X:
|
| 189 |
-
max_groups:
|
| 190 |
-
min_cluster_size:
|
| 191 |
-
sensitivity:
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
Returns:
|
| 197 |
-
|
| 198 |
"""
|
| 199 |
import numpy as np
|
| 200 |
from scipy.cluster.hierarchy import linkage, fcluster
|
|
@@ -205,36 +205,36 @@ def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int
|
|
| 205 |
return np.array([])
|
| 206 |
|
| 207 |
if len(X) < min_cluster_size:
|
| 208 |
-
#
|
| 209 |
return np.full(len(X), -1, dtype=int)
|
| 210 |
|
| 211 |
-
# Linkage
|
| 212 |
-
#
|
| 213 |
Z = linkage(X, method='average', metric='cosine') # Cosine similarity para embeddings
|
| 214 |
|
| 215 |
-
#
|
| 216 |
best_n_clusters = 2
|
| 217 |
best_score = -1
|
| 218 |
|
| 219 |
-
#
|
| 220 |
-
max_to_try = min(max_groups, len(X) - 1) #
|
| 221 |
|
| 222 |
if max_to_try >= 2:
|
| 223 |
for n_clusters in range(2, max_to_try + 1):
|
| 224 |
trial_labels = fcluster(Z, t=n_clusters, criterion='maxclust') - 1
|
| 225 |
-
|
| 226 |
-
#
|
| 227 |
trial_counts = Counter(trial_labels)
|
| 228 |
valid_clusters = sum(1 for count in trial_counts.values() if count >= min_cluster_size)
|
| 229 |
|
| 230 |
-
#
|
| 231 |
if valid_clusters >= 2:
|
| 232 |
try:
|
| 233 |
score = silhouette_score(X, trial_labels, metric='cosine')
|
| 234 |
-
#
|
| 235 |
-
# - sensitivity=0.0 → penalty=0.14 (
|
| 236 |
-
# - sensitivity=0.5 → penalty=0.07 (
|
| 237 |
-
# - sensitivity=1.0 → penalty=0.01 (
|
| 238 |
penalty = 0.14 - (sensitivity * 0.13)
|
| 239 |
adjusted_score = score - (n_clusters * penalty)
|
| 240 |
|
|
@@ -244,22 +244,22 @@ def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int
|
|
| 244 |
except:
|
| 245 |
pass # Si falla el cálculo, ignorar esta configuración
|
| 246 |
|
| 247 |
-
#
|
| 248 |
penalty = 0.14 - (sensitivity * 0.13)
|
| 249 |
print(f"Clustering óptimo: {best_n_clusters} clusters (de máximo {max_groups}), sensitivity={sensitivity:.2f}, penalty={penalty:.3f}, silhouette={best_score:.3f}")
|
| 250 |
labels = fcluster(Z, t=best_n_clusters, criterion='maxclust')
|
| 251 |
|
| 252 |
-
# fcluster
|
| 253 |
labels = labels - 1
|
| 254 |
|
| 255 |
-
#
|
| 256 |
label_counts = Counter(labels)
|
| 257 |
filtered_labels = []
|
| 258 |
for lbl in labels:
|
| 259 |
if label_counts[lbl] >= min_cluster_size:
|
| 260 |
filtered_labels.append(lbl)
|
| 261 |
else:
|
| 262 |
-
filtered_labels.append(-1) #
|
| 263 |
|
| 264 |
return np.array(filtered_labels, dtype=int)
|
| 265 |
|
|
@@ -292,20 +292,22 @@ async def create_initial_casting(
|
|
| 292 |
voice_sensitivity: float = Form(default=0.5),
|
| 293 |
max_frames: int = Form(default=100),
|
| 294 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
"""
|
| 296 |
-
|
| 297 |
-
Devuelve un job_id inmediatamente.
|
| 298 |
-
"""
|
| 299 |
-
# Guardar vídeo en carpeta de datos
|
| 300 |
video_name = Path(video.filename).stem
|
| 301 |
dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
|
| 302 |
with dst_video.open("wb") as f:
|
| 303 |
shutil.copyfileobj(video.file, f)
|
| 304 |
|
| 305 |
-
#
|
| 306 |
job_id = str(uuid.uuid4())
|
| 307 |
|
| 308 |
-
#
|
| 309 |
jobs[job_id] = {
|
| 310 |
"id": job_id,
|
| 311 |
"status": JobStatus.QUEUED,
|
|
@@ -325,7 +327,7 @@ async def create_initial_casting(
|
|
| 325 |
|
| 326 |
print(f"[{job_id}] Job creado para vídeo: {video_name}")
|
| 327 |
|
| 328 |
-
#
|
| 329 |
background_tasks.add_task(process_video_job, job_id)
|
| 330 |
|
| 331 |
# Devolver job_id inmediatamente
|
|
@@ -566,70 +568,77 @@ def process_video_job(job_id: str):
|
|
| 566 |
# Construir carpetas por clúster con validación DeepFace
|
| 567 |
from face_classifier import validate_and_classify_face, FACE_CONFIDENCE_THRESHOLD
|
| 568 |
|
| 569 |
-
characters_validated = []
|
| 570 |
cluster_map: dict[int, list[int]] = {}
|
| 571 |
-
|
|
|
|
| 572 |
if isinstance(lbl, int) and lbl >= 0:
|
| 573 |
-
cluster_map.setdefault(lbl, []).append(
|
| 574 |
|
| 575 |
chars_dir = base / "characters"
|
| 576 |
chars_dir.mkdir(parents=True, exist_ok=True)
|
| 577 |
import shutil as _sh
|
| 578 |
-
|
| 579 |
original_cluster_count = len(cluster_map)
|
| 580 |
print(f"[{job_id}] Procesando {original_cluster_count} clusters detectados...")
|
| 581 |
-
|
| 582 |
for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
|
| 583 |
char_id = f"char_{ci:02d}"
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
face_detections = []
|
| 587 |
for j in idxs:
|
| 588 |
meta = crops_meta[j]
|
|
|
|
|
|
|
|
|
|
| 589 |
box = meta.get("box", [0, 0, 0, 0])
|
|
|
|
| 590 |
if len(box) >= 4:
|
| 591 |
top, right, bottom, left = box
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
face_detections.append({
|
| 599 |
-
'index': j,
|
| 600 |
-
'score': area_score,
|
| 601 |
-
'file': meta['file'],
|
| 602 |
-
'box': box
|
| 603 |
})
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
face_detections_sorted = sorted(
|
| 607 |
-
face_detections,
|
| 608 |
-
key=lambda x: x['score'],
|
| 609 |
-
reverse=True
|
| 610 |
-
)
|
| 611 |
-
|
| 612 |
-
if not face_detections_sorted:
|
| 613 |
print(f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: sense deteccions, eliminant")
|
| 614 |
continue
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
best_face =
|
| 618 |
-
best_face_path = faces_root / best_face[
|
| 619 |
-
|
| 620 |
print(f"[{job_id}] [VALIDATION] Cluster {char_id}: validant millor cara (bbox_area={best_face['score']:.0f}px²)")
|
| 621 |
print(f"[{job_id}] [VALIDATION] Cluster {char_id}: millor cara path={best_face_path}")
|
| 622 |
print(f"[{job_id}] [VALIDATION] ▶▶▶ CRIDANT validate_and_classify_face() ◀◀◀")
|
| 623 |
-
|
| 624 |
validation = validate_and_classify_face(str(best_face_path))
|
| 625 |
-
|
| 626 |
print(f"[{job_id}] [VALIDATION] ▶▶▶ validate_and_classify_face() RETORNAT ◀◀◀")
|
| 627 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 628 |
if not validation:
|
| 629 |
print(f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: error en validació DeepFace, eliminant cluster")
|
| 630 |
continue
|
| 631 |
-
|
| 632 |
-
# Mostrar resultados detallados de DeepFace
|
| 633 |
print(f"[{job_id}] [DEEPFACE RESULT] Cluster {char_id}:")
|
| 634 |
print(f"[{job_id}] - is_valid_face: {validation['is_valid_face']}")
|
| 635 |
print(f"[{job_id}] - face_confidence: {validation['face_confidence']:.3f}")
|
|
@@ -638,36 +647,34 @@ def process_video_job(job_id: str):
|
|
| 638 |
print(f"[{job_id}] - gender_diff: {abs(validation['man_prob'] - validation['woman_prob']):.3f}")
|
| 639 |
print(f"[{job_id}] - gender_assigned: {validation['gender']}")
|
| 640 |
print(f"[{job_id}] - gender_confidence: {validation['gender_confidence']:.3f}")
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
|
|
|
|
|
|
| 645 |
continue
|
| 646 |
-
|
| 647 |
-
# PASO 4: És una cara vàlida! Crear carpeta
|
| 648 |
out_dir = chars_dir / char_id
|
| 649 |
out_dir.mkdir(parents=True, exist_ok=True)
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
total_faces = len(face_detections_sorted)
|
| 653 |
max_faces_to_show = (total_faces // 2) + 1
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
fname = face_det['file']
|
| 661 |
src = faces_root / fname
|
| 662 |
dst = out_dir / fname
|
| 663 |
try:
|
| 664 |
_sh.copy2(src, dst)
|
| 665 |
files.append(fname)
|
| 666 |
-
|
| 667 |
except Exception:
|
| 668 |
pass
|
| 669 |
-
|
| 670 |
-
# Imagen representativa (la mejor)
|
| 671 |
rep = files[0] if files else None
|
| 672 |
if rep:
|
| 673 |
rep_src = out_dir / rep
|
|
@@ -676,535 +683,180 @@ def process_video_job(job_id: str):
|
|
| 676 |
_sh.copy2(rep_src, rep_dst)
|
| 677 |
except Exception:
|
| 678 |
pass
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
cluster_number = int(char_id.split('_')[1]) + 1
|
| 682 |
character_name = f"Cluster {cluster_number}"
|
| 683 |
-
gender = validation
|
| 684 |
-
|
| 685 |
print(f"[{job_id}] [NAME GENERATION] Cluster {char_id}:")
|
| 686 |
print(f"[{job_id}] - Gender detectado: {gender}")
|
| 687 |
print(f"[{job_id}] - Nombre asignado: {character_name}")
|
| 688 |
print(f"[{job_id}] - Seed usado: {char_id}")
|
| 689 |
-
|
| 690 |
-
|
| 691 |
"id": char_id,
|
| 692 |
"name": character_name,
|
| 693 |
"gender": gender,
|
| 694 |
-
"gender_confidence": validation
|
| 695 |
-
"face_confidence": validation
|
| 696 |
-
"man_prob": validation
|
| 697 |
-
"woman_prob": validation
|
| 698 |
"folder": str(out_dir),
|
| 699 |
"num_faces": len(files),
|
| 700 |
"total_faces_detected": total_faces,
|
| 701 |
"image_url": f"/files/{video_name}/{char_id}/representative.jpg" if rep else "",
|
| 702 |
-
"face_files":
|
| 703 |
-
}
|
| 704 |
-
|
| 705 |
-
characters_validated.append(character_data)
|
| 706 |
-
|
| 707 |
print(f"[{job_id}] [VALIDATION] ✓ Cluster {char_id}: CARA VÁLIDA!")
|
| 708 |
print(f"[{job_id}] Nombre: {character_name}")
|
| 709 |
-
print(f"[{job_id}] Género: {gender} (man={validation
|
| 710 |
-
print(f"[{job_id}] Confianza género: {validation
|
| 711 |
-
print(f"[{job_id}] Confianza cara: {validation
|
| 712 |
print(f"[{job_id}] Caras mostradas: {len(files)}/{total_faces}")
|
| 713 |
print(f"[{job_id}] Imagen representativa: {best_face_path.name}")
|
| 714 |
-
|
| 715 |
-
# Estadístiques finals
|
| 716 |
eliminated_count = original_cluster_count - len(characters_validated)
|
| 717 |
print(f"[{job_id}] [VALIDATION] Total: {len(characters_validated)} clústers vàlids "
|
| 718 |
f"(eliminats {eliminated_count} falsos positius)")
|
| 719 |
-
|
| 720 |
-
characters = characters_validated
|
| 721 |
|
| 722 |
-
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
"
|
| 726 |
-
"
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 731 |
|
| 732 |
-
|
| 733 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 734 |
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
# Ajustar num_faces si hay discrepancia
|
| 766 |
-
if face_files:
|
| 767 |
-
ch["num_faces"] = len(face_files)
|
| 768 |
-
except Exception as _e:
|
| 769 |
-
print(f"[{job_id}] WARN - No se pudo enumerar face_files: {_e}")
|
| 770 |
-
|
| 771 |
-
# Procesamiento de audio: diarización, ASR y embeddings de voz
|
| 772 |
-
try:
|
| 773 |
-
cfg = load_yaml("config.yaml")
|
| 774 |
-
audio_segments, srt_unmod, full_txt, diar_info, connection_logs = process_audio_for_video(video_path, base, cfg, voice_collection=None)
|
| 775 |
-
# Loggear en consola del engine los eventos de conexión
|
| 776 |
-
try:
|
| 777 |
-
for ev in (connection_logs or []):
|
| 778 |
-
msg = ev.get("message") if isinstance(ev, dict) else None
|
| 779 |
-
if msg:
|
| 780 |
-
print(f"[{job_id}] {msg}")
|
| 781 |
-
except Exception:
|
| 782 |
-
pass
|
| 783 |
-
except Exception as e_audio:
|
| 784 |
-
import traceback
|
| 785 |
-
print(f"[{job_id}] WARN - Audio pipeline failed: {e_audio}\n{traceback.format_exc()}")
|
| 786 |
-
audio_segments, srt_unmod, full_txt = [], None, ""
|
| 787 |
-
diar_info = {"diarization_ok": False, "error": str(e_audio)}
|
| 788 |
-
connection_logs = []
|
| 789 |
-
|
| 790 |
-
# Fallback: si no hay segmentos de audio, crear uno mínimo del audio completo
|
| 791 |
-
if not audio_segments:
|
| 792 |
-
try:
|
| 793 |
-
from pathlib import Path as _P
|
| 794 |
-
from pydub import AudioSegment as _AS
|
| 795 |
-
wav_out = extract_audio_ffmpeg(video_path, base / f"{_P(video_path).stem}.wav", sr=16000)
|
| 796 |
-
audio = _AS.from_wav(wav_out)
|
| 797 |
-
clips_dir = base / "clips"
|
| 798 |
-
clips_dir.mkdir(parents=True, exist_ok=True)
|
| 799 |
-
cp = clips_dir / "segment_000.wav"
|
| 800 |
-
audio.export(cp, format="wav")
|
| 801 |
-
emb_list = embed_voice_segments([str(cp)])
|
| 802 |
-
audio_segments = [{
|
| 803 |
-
"segment": 0,
|
| 804 |
-
"start": 0.0,
|
| 805 |
-
"end": float(len(audio) / 1000.0),
|
| 806 |
-
"speaker": "SPEAKER_00",
|
| 807 |
-
"text": "",
|
| 808 |
-
"voice_embedding": emb_list[0] if emb_list else [],
|
| 809 |
-
"clip_path": str(cp),
|
| 810 |
-
"lang": "ca",
|
| 811 |
-
"lang_prob": 1.0,
|
| 812 |
-
}]
|
| 813 |
-
except Exception as _efb:
|
| 814 |
-
print(f"[{job_id}] WARN - Audio minimal fallback failed: {_efb}")
|
| 815 |
-
|
| 816 |
-
# Clustering jerárquico de voces sobre embeddings válidos
|
| 817 |
-
import numpy as np
|
| 818 |
-
voice_embeddings = [seg.get("voice_embedding") for seg in audio_segments if seg.get("voice_embedding")]
|
| 819 |
-
if voice_embeddings:
|
| 820 |
-
try:
|
| 821 |
-
Xv = np.array(voice_embeddings)
|
| 822 |
-
v_labels = hierarchical_cluster_with_min_size(Xv, v_max_groups, v_min_cluster, voice_sensitivity).tolist()
|
| 823 |
-
print(f"[{job_id}] Clustering jerárquico de voz: {len(set([l for l in v_labels if l >= 0]))} clusters")
|
| 824 |
-
except Exception as _e:
|
| 825 |
-
print(f"[{job_id}] WARN - Voice clustering failed: {_e}")
|
| 826 |
-
v_labels = []
|
| 827 |
-
else:
|
| 828 |
-
v_labels = []
|
| 829 |
|
| 830 |
-
# Guardar resultados
|
| 831 |
job["results"] = {
|
| 832 |
-
"characters":
|
| 833 |
-
"
|
| 834 |
-
"
|
| 835 |
"base_dir": str(base),
|
| 836 |
-
"face_labels": face_labels,
|
| 837 |
-
"num_face_embeddings": num_face_embeddings,
|
| 838 |
-
"audio_segments": audio_segments,
|
| 839 |
-
"srt_unmodified": srt_unmod,
|
| 840 |
-
"full_transcription": full_txt,
|
| 841 |
-
"voice_labels": v_labels,
|
| 842 |
-
"num_voice_embeddings": len(voice_embeddings),
|
| 843 |
-
"diarization_info": diar_info,
|
| 844 |
}
|
| 845 |
job["status"] = JobStatus.DONE
|
| 846 |
-
|
| 847 |
-
# Log resumido sin embeddings
|
| 848 |
-
print(f"[{job_id}] ✓ Resultados guardados:")
|
| 849 |
-
print(f"[{job_id}] - Personatges: {len(characters)}")
|
| 850 |
-
print(f"[{job_id}] - Segments d'àudio: {len(audio_segments)}")
|
| 851 |
-
print(f"[{job_id}] - Face embeddings: {num_face_embeddings}")
|
| 852 |
-
print(f"[{job_id}] - Voice embeddings: {len(voice_embeddings)}")
|
| 853 |
-
|
| 854 |
-
except Exception as e_detect:
|
| 855 |
-
# Si falla la detección, intentar modo fallback
|
| 856 |
-
import traceback
|
| 857 |
-
print(f"[{job_id}] ✗ Error en detección: {e_detect}")
|
| 858 |
-
print(f"[{job_id}] Traceback: {traceback.format_exc()}")
|
| 859 |
-
print(f"[{job_id}] Usando modo fallback (carpetas vacías)")
|
| 860 |
-
|
| 861 |
-
# Crear carpetas básicas como fallback
|
| 862 |
-
for sub in ("sources", "faces", "voices", "backgrounds"):
|
| 863 |
-
(base / sub).mkdir(parents=True, exist_ok=True)
|
| 864 |
-
|
| 865 |
-
# Guardar resultados de fallback y luego marcar como completado
|
| 866 |
-
job["results"] = {
|
| 867 |
-
"characters": [],
|
| 868 |
-
"num_characters": 0,
|
| 869 |
-
"temp_dirs": {
|
| 870 |
-
"sources": str(base / "sources"),
|
| 871 |
-
"faces": str(base / "faces"),
|
| 872 |
-
"voices": str(base / "voices"),
|
| 873 |
-
"backgrounds": str(base / "backgrounds"),
|
| 874 |
-
},
|
| 875 |
-
"warning": f"Detección falló, usando modo fallback: {str(e_detect)}"
|
| 876 |
-
}
|
| 877 |
-
job["status"] = JobStatus.DONE
|
| 878 |
-
|
| 879 |
-
print(f"[{job_id}] ✓ Job completado exitosamente")
|
| 880 |
-
|
| 881 |
-
except Exception as e:
|
| 882 |
-
import traceback
|
| 883 |
-
print(f"[{job_id}] ✗ Error inesperado: {e}")
|
| 884 |
-
try:
|
| 885 |
-
job = jobs.get(job_id)
|
| 886 |
-
if job is not None:
|
| 887 |
-
job["status"] = JobStatus.FAILED
|
| 888 |
-
job["error"] = str(e)
|
| 889 |
-
except Exception:
|
| 890 |
-
pass
|
| 891 |
-
print(f"[{job_id}] Traceback: {traceback.format_exc()}")
|
| 892 |
|
| 893 |
-
|
| 894 |
-
|
| 895 |
-
|
| 896 |
-
|
| 897 |
-
|
| 898 |
-
|
| 899 |
-
base = TEMP_ROOT / Path(vid_name).stem
|
| 900 |
|
| 901 |
-
base.mkdir(parents=True, exist_ok=True)
|
| 902 |
-
# Save temp mp4
|
| 903 |
-
video_path = base / vid_name
|
| 904 |
-
with open(video_path, "wb") as f:
|
| 905 |
-
f.write(await video.read())
|
| 906 |
-
|
| 907 |
-
# Run MVP pipeline
|
| 908 |
-
result = ad_generate(str(video_path), base)
|
| 909 |
-
|
| 910 |
-
return {
|
| 911 |
-
"status": "done",
|
| 912 |
-
"results": {
|
| 913 |
-
"une_srt": result.get("une_srt", ""),
|
| 914 |
-
"free_text": result.get("free_text", ""),
|
| 915 |
-
"artifacts": result.get("artifacts", {}),
|
| 916 |
-
},
|
| 917 |
-
}
|
| 918 |
except Exception as e:
|
|
|
|
| 919 |
import traceback
|
| 920 |
-
|
| 921 |
-
|
| 922 |
-
|
| 923 |
-
@app.post("/load_casting")
|
| 924 |
-
async def load_casting(
|
| 925 |
-
faces_dir: str = Form("identities/faces"),
|
| 926 |
-
voices_dir: str = Form("identities/voices"),
|
| 927 |
-
db_dir: str = Form("chroma_db"),
|
| 928 |
-
drop_collections: bool = Form(False),
|
| 929 |
-
):
|
| 930 |
-
client = ensure_chroma(Path(db_dir))
|
| 931 |
-
n_faces = build_faces_index(Path(faces_dir), client, collection_name="index_faces", drop=drop_collections)
|
| 932 |
-
n_voices = build_voices_index(Path(voices_dir), client, collection_name="index_voices", drop=drop_collections)
|
| 933 |
-
return {"ok": True, "faces": n_faces, "voices": n_voices}
|
| 934 |
-
|
| 935 |
-
@app.post("/finalize_casting")
|
| 936 |
-
async def finalize_casting(
|
| 937 |
-
payload: dict = Body(...),
|
| 938 |
-
):
|
| 939 |
-
"""
|
| 940 |
-
Consolidate selected face and voice clusters into identities directories and build indices.
|
| 941 |
-
Expected payload:
|
| 942 |
-
{
|
| 943 |
-
"video_name": str,
|
| 944 |
-
"base_dir": str, # engine temp base for this video
|
| 945 |
-
"characters": [
|
| 946 |
-
{"id": "char1", "name": "Nom", "folder": "/tmp/temp/<video>/char1", "kept_files": ["representative.jpg", ...], "description": "..."}, ...
|
| 947 |
-
],
|
| 948 |
-
"voice_clusters": [
|
| 949 |
-
{"label": 0, "name": "SPEAKER_00", "clips": ["segment_000.wav", ...]}, ...
|
| 950 |
-
]
|
| 951 |
-
}
|
| 952 |
-
"""
|
| 953 |
-
import os
|
| 954 |
-
import shutil
|
| 955 |
-
from pathlib import Path as _P
|
| 956 |
-
|
| 957 |
-
video_name = payload.get("video_name")
|
| 958 |
-
base_dir = payload.get("base_dir")
|
| 959 |
-
characters = payload.get("characters", []) or []
|
| 960 |
-
voice_clusters = payload.get("voice_clusters", []) or []
|
| 961 |
-
|
| 962 |
-
if not video_name or not base_dir:
|
| 963 |
-
raise HTTPException(status_code=400, detail="Missing video_name or base_dir")
|
| 964 |
-
|
| 965 |
-
faces_out = IDENTITIES_ROOT / video_name / "faces"
|
| 966 |
-
voices_out = IDENTITIES_ROOT / video_name / "voices"
|
| 967 |
-
faces_out.mkdir(parents=True, exist_ok=True)
|
| 968 |
-
voices_out.mkdir(parents=True, exist_ok=True)
|
| 969 |
-
|
| 970 |
-
# Consolidate faces per character name (merge same names)
|
| 971 |
-
for ch in characters:
|
| 972 |
-
ch_name = (ch.get("name") or "Unknown").strip() or "Unknown"
|
| 973 |
-
ch_folder = ch.get("folder")
|
| 974 |
-
kept = ch.get("kept_files") or []
|
| 975 |
-
if not ch_folder or not os.path.isdir(ch_folder):
|
| 976 |
-
continue
|
| 977 |
-
dst_dir = faces_out / ch_name
|
| 978 |
-
dst_dir.mkdir(parents=True, exist_ok=True)
|
| 979 |
-
for fname in kept:
|
| 980 |
-
src = _P(ch_folder) / fname
|
| 981 |
-
if src.exists() and src.is_file():
|
| 982 |
-
try:
|
| 983 |
-
shutil.copy2(src, dst_dir / fname)
|
| 984 |
-
except Exception:
|
| 985 |
-
pass
|
| 986 |
-
|
| 987 |
-
# Consolidate voices per cluster name
|
| 988 |
-
clips_dir = _P(base_dir) / "clips"
|
| 989 |
-
for vc in voice_clusters:
|
| 990 |
-
v_name = (vc.get("name") or f"SPEAKER_{int(vc.get('label',0)):02d}").strip()
|
| 991 |
-
dst_dir = voices_out / v_name
|
| 992 |
-
dst_dir.mkdir(parents=True, exist_ok=True)
|
| 993 |
-
for wav in (vc.get("clips") or []):
|
| 994 |
-
src = clips_dir / wav
|
| 995 |
-
if src.exists() and src.is_file():
|
| 996 |
-
try:
|
| 997 |
-
shutil.copy2(src, dst_dir / wav)
|
| 998 |
-
except Exception:
|
| 999 |
-
pass
|
| 1000 |
-
|
| 1001 |
-
# Build indices using casting_loader helpers (best-effort)
|
| 1002 |
-
db_dir = IDENTITIES_ROOT / video_name / "chroma_db"
|
| 1003 |
-
try:
|
| 1004 |
-
client = ensure_chroma(db_dir)
|
| 1005 |
-
n_faces = build_faces_index(
|
| 1006 |
-
faces_out,
|
| 1007 |
-
client,
|
| 1008 |
-
collection_name="index_faces",
|
| 1009 |
-
deepface_model='Facenet512',
|
| 1010 |
-
drop=True,
|
| 1011 |
-
)
|
| 1012 |
-
n_voices = build_voices_index(
|
| 1013 |
-
voices_out,
|
| 1014 |
-
client,
|
| 1015 |
-
collection_name="index_voices",
|
| 1016 |
-
drop=True,
|
| 1017 |
-
)
|
| 1018 |
-
except Exception as e:
|
| 1019 |
-
# Si ChromaDB no está disponible o falla la indexación, no romper el flujo
|
| 1020 |
-
print(f"[finalize_casting] WARN - No se pudieron construir índices ChromaDB: {e}")
|
| 1021 |
-
n_faces = 0
|
| 1022 |
-
n_voices = 0
|
| 1023 |
-
|
| 1024 |
-
# Summary of identities
|
| 1025 |
-
face_identities = sorted([p.name for p in faces_out.iterdir() if p.is_dir()]) if faces_out.exists() else []
|
| 1026 |
-
voice_identities = sorted([p.name for p in voices_out.iterdir() if p.is_dir()]) if voices_out.exists() else []
|
| 1027 |
-
|
| 1028 |
-
# Build casting_json with face and voice embeddings (best-effort) via remote Spaces
|
| 1029 |
-
casting_json = {"face_col": [], "voice_col": []}
|
| 1030 |
-
|
| 1031 |
-
# Cargar config y router para acceder a svision/asr
|
| 1032 |
-
try:
|
| 1033 |
-
cfg = load_yaml("config.yaml")
|
| 1034 |
-
router = LLMRouter(cfg)
|
| 1035 |
-
except Exception:
|
| 1036 |
-
router = None # type: ignore
|
| 1037 |
-
|
| 1038 |
-
# Face embeddings per identity using remote svision (face_image_embedding)
|
| 1039 |
-
try:
|
| 1040 |
-
if face_identities and router is not None:
|
| 1041 |
-
factory = router.client_factories.get("salamandra-vision") # type: ignore[attr-defined]
|
| 1042 |
-
if factory is not None:
|
| 1043 |
-
vclient = factory()
|
| 1044 |
-
gclient = getattr(vclient, "_client", None)
|
| 1045 |
-
else:
|
| 1046 |
-
gclient = None
|
| 1047 |
-
|
| 1048 |
-
if gclient is not None:
|
| 1049 |
-
for identity in face_identities:
|
| 1050 |
-
id_dir = faces_out / identity
|
| 1051 |
-
if not id_dir.is_dir():
|
| 1052 |
-
continue
|
| 1053 |
-
# Buscar una imagen representativa
|
| 1054 |
-
img_path = None
|
| 1055 |
-
for ext in (".jpg", ".jpeg", ".png", ".bmp", ".webp"):
|
| 1056 |
-
candidates = list(id_dir.glob(f"*{ext}"))
|
| 1057 |
-
if candidates:
|
| 1058 |
-
img_path = candidates[0]
|
| 1059 |
-
break
|
| 1060 |
-
if not img_path:
|
| 1061 |
-
continue
|
| 1062 |
-
|
| 1063 |
-
try:
|
| 1064 |
-
out = gclient.predict(str(img_path), api_name="/face_image_embedding")
|
| 1065 |
-
# svision devuelve normalmente una lista de embeddings o un solo embedding
|
| 1066 |
-
emb = None
|
| 1067 |
-
if isinstance(out, list):
|
| 1068 |
-
if out and isinstance(out[0], (list, tuple, float, int)):
|
| 1069 |
-
# Si es lista de listas, tomamos la primera; si es lista plana, la usamos tal cual
|
| 1070 |
-
if out and isinstance(out[0], (list, tuple)):
|
| 1071 |
-
emb = list(out[0])
|
| 1072 |
-
else:
|
| 1073 |
-
emb = list(out)
|
| 1074 |
-
elif isinstance(out, dict) and "embedding" in out:
|
| 1075 |
-
emb = out.get("embedding")
|
| 1076 |
-
|
| 1077 |
-
if not emb:
|
| 1078 |
-
continue
|
| 1079 |
-
|
| 1080 |
-
casting_json["face_col"].append({
|
| 1081 |
-
"nombre": identity,
|
| 1082 |
-
"embedding": emb,
|
| 1083 |
-
})
|
| 1084 |
-
except Exception:
|
| 1085 |
-
# No romper por un fallo puntual de embedding
|
| 1086 |
-
continue
|
| 1087 |
-
except Exception:
|
| 1088 |
-
# Si algo falla en todo el bloque de caras, dejamos face_col vacío
|
| 1089 |
-
casting_json["face_col"] = []
|
| 1090 |
-
|
| 1091 |
-
# Voice embeddings per identity using remote asr (voice_embedding)
|
| 1092 |
-
try:
|
| 1093 |
-
if voice_identities and router is not None:
|
| 1094 |
-
factory = router.client_factories.get("whisper-catalan") # type: ignore[attr-defined]
|
| 1095 |
-
if factory is not None:
|
| 1096 |
-
aclient = factory()
|
| 1097 |
-
gclient = getattr(aclient, "_client", None)
|
| 1098 |
-
else:
|
| 1099 |
-
gclient = None
|
| 1100 |
-
|
| 1101 |
-
if gclient is not None:
|
| 1102 |
-
for identity in voice_identities:
|
| 1103 |
-
id_dir = voices_out / identity
|
| 1104 |
-
if not id_dir.is_dir():
|
| 1105 |
-
continue
|
| 1106 |
-
wav_files = sorted([p for p in id_dir.iterdir() if p.is_file() and p.suffix.lower() in [".wav", ".flac", ".mp3"]])
|
| 1107 |
-
if not wav_files:
|
| 1108 |
-
continue
|
| 1109 |
-
|
| 1110 |
-
# Obtenemos un embedding representativo usando el primer clip
|
| 1111 |
-
wf = wav_files[0]
|
| 1112 |
-
try:
|
| 1113 |
-
out = gclient.predict(str(wf), api_name="/voice_embedding")
|
| 1114 |
-
emb = None
|
| 1115 |
-
if isinstance(out, list):
|
| 1116 |
-
emb = list(out)
|
| 1117 |
-
elif isinstance(out, dict) and "embedding" in out:
|
| 1118 |
-
emb = out.get("embedding")
|
| 1119 |
-
|
| 1120 |
-
if not emb:
|
| 1121 |
-
continue
|
| 1122 |
-
|
| 1123 |
-
casting_json["voice_col"].append({
|
| 1124 |
-
"nombre": identity,
|
| 1125 |
-
"embedding": emb,
|
| 1126 |
-
})
|
| 1127 |
-
except Exception:
|
| 1128 |
-
continue
|
| 1129 |
-
except Exception:
|
| 1130 |
-
# Si algo falla en todo el bloque de voces, dejamos voice_col vacío
|
| 1131 |
-
casting_json["voice_col"] = []
|
| 1132 |
-
|
| 1133 |
-
return {
|
| 1134 |
-
"ok": True,
|
| 1135 |
-
"video_name": video_name,
|
| 1136 |
-
"faces_dir": str(faces_out),
|
| 1137 |
-
"voices_dir": str(voices_out),
|
| 1138 |
-
"db_dir": str(db_dir),
|
| 1139 |
-
"n_faces_embeddings": n_faces,
|
| 1140 |
-
"n_voices_embeddings": n_voices,
|
| 1141 |
-
"face_identities": face_identities,
|
| 1142 |
-
"voice_identities": voice_identities,
|
| 1143 |
-
"casting_json": casting_json,
|
| 1144 |
-
}
|
| 1145 |
|
| 1146 |
-
@app.get("/files_scene/{video_name}/{scene_id}/{filename}")
|
| 1147 |
-
def serve_scene_file(video_name: str, scene_id: str, filename: str):
|
| 1148 |
-
file_path = TEMP_ROOT / video_name / "scenes" / scene_id / filename
|
| 1149 |
-
if not file_path.exists():
|
| 1150 |
-
raise HTTPException(status_code=404, detail="File not found")
|
| 1151 |
-
return FileResponse(file_path)
|
| 1152 |
|
| 1153 |
@app.post("/detect_scenes")
|
| 1154 |
async def detect_scenes(
|
| 1155 |
-
|
| 1156 |
-
max_groups: int = Form(default=
|
| 1157 |
min_cluster_size: int = Form(default=3),
|
| 1158 |
scene_sensitivity: float = Form(default=0.5),
|
| 1159 |
-
frame_interval_sec: float = Form(default=0.5),
|
| 1160 |
):
|
| 1161 |
"""
|
| 1162 |
-
Detecta
|
| 1163 |
-
Retorna una llista de scene_clusters estructurada de forma similar a characters.
|
| 1164 |
"""
|
| 1165 |
import cv2
|
| 1166 |
import numpy as np
|
|
|
|
| 1167 |
|
| 1168 |
-
# Guardar el vídeo temporalment
|
| 1169 |
-
video_name = Path(video.filename).stem
|
| 1170 |
dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
|
| 1171 |
-
|
| 1172 |
-
|
| 1173 |
|
| 1174 |
cap = cv2.VideoCapture(str(dst_video))
|
| 1175 |
if not cap.isOpened():
|
| 1176 |
-
|
| 1177 |
|
| 1178 |
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
|
| 1179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1180 |
|
| 1181 |
-
frames = []
|
| 1182 |
-
metas = []
|
| 1183 |
-
|
| 1184 |
-
|
| 1185 |
-
|
|
|
|
| 1186 |
if not ret:
|
| 1187 |
-
|
| 1188 |
-
|
| 1189 |
-
|
| 1190 |
-
|
| 1191 |
-
|
| 1192 |
-
|
| 1193 |
-
|
| 1194 |
-
hsv = cv2.cvtColor(small, cv2.COLOR_BGR2HSV)
|
| 1195 |
-
# Histograma per canal
|
| 1196 |
-
h_hist = cv2.calcHist([hsv],[0],None,[32],[0,180]).flatten()
|
| 1197 |
-
s_hist = cv2.calcHist([hsv],[1],None,[32],[0,256]).flatten()
|
| 1198 |
-
v_hist = cv2.calcHist([hsv],[2],None,[32],[0,256]).flatten()
|
| 1199 |
-
hist = np.concatenate([h_hist, s_hist, v_hist])
|
| 1200 |
-
hist = hist / (np.linalg.norm(hist) + 1e-8)
|
| 1201 |
-
frames.append(hist)
|
| 1202 |
-
metas.append({"index": idx, "time_sec": idx/float(fps)})
|
| 1203 |
-
idx += 1
|
| 1204 |
cap.release()
|
| 1205 |
|
| 1206 |
if not frames:
|
| 1207 |
-
return {"scene_clusters": []}
|
| 1208 |
|
| 1209 |
X = np.array(frames)
|
| 1210 |
labels = hierarchical_cluster_with_min_size(X, max_groups, min_cluster_size, scene_sensitivity).tolist()
|
|
@@ -1217,100 +869,75 @@ async def detect_scenes(
|
|
| 1217 |
if lbl is None or lbl < 0:
|
| 1218 |
continue
|
| 1219 |
clusters.setdefault(int(lbl), []).append(i)
|
| 1220 |
-
|
| 1221 |
-
#
|
| 1222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1223 |
centroids = {}
|
| 1224 |
for lbl, idxs in clusters.items():
|
| 1225 |
cluster_histograms = X[idxs]
|
| 1226 |
centroids[lbl] = np.mean(cluster_histograms, axis=0)
|
| 1227 |
-
|
| 1228 |
print(f"[SCENE VALIDATION] Validant similaritat entre {len(centroids)} clusters...")
|
| 1229 |
-
|
| 1230 |
-
|
| 1231 |
-
|
| 1232 |
-
|
| 1233 |
-
|
| 1234 |
-
# Calcular matriu de distàncies i correlacions entre centroides
|
| 1235 |
cluster_labels = sorted(centroids.keys())
|
| 1236 |
similarities = {}
|
| 1237 |
-
|
| 1238 |
for i, lbl1 in enumerate(cluster_labels):
|
| 1239 |
for lbl2 in cluster_labels[i+1:]:
|
| 1240 |
-
# Distancia euclidiana (normalizada)
|
| 1241 |
dist = np.linalg.norm(centroids[lbl1] - centroids[lbl2])
|
| 1242 |
-
|
| 1243 |
-
# Correlación de Pearson entre histogramas
|
| 1244 |
-
corr = np.corrcoef(centroids[lbl1], centroids[lbl2])[0, 1]
|
| 1245 |
-
|
| 1246 |
-
# Son similares si:
|
| 1247 |
-
# - Distancia baja (< threshold) O
|
| 1248 |
-
# - Correlación alta (> threshold)
|
| 1249 |
are_similar = (dist < SIMILARITY_THRESHOLD) or (corr > CORRELATION_THRESHOLD)
|
| 1250 |
-
|
| 1251 |
-
similarities[(lbl1, lbl2)] = {
|
| 1252 |
-
'distance': dist,
|
| 1253 |
-
'correlation': corr,
|
| 1254 |
-
'similar': are_similar
|
| 1255 |
-
}
|
| 1256 |
-
|
| 1257 |
if are_similar:
|
| 1258 |
-
print(f"[SCENE VALIDATION] Clusters {lbl1} i {lbl2}
|
| 1259 |
-
|
| 1260 |
-
|
| 1261 |
-
|
| 1262 |
-
# Union-Find para fusionar clusters transitivamente
|
| 1263 |
-
# Si A~B y B~C, entonces A~B~C (todos en el mismo grupo)
|
| 1264 |
parent = {lbl: lbl for lbl in cluster_labels}
|
| 1265 |
-
|
| 1266 |
def find(x):
|
| 1267 |
if parent[x] != x:
|
| 1268 |
-
parent[x] = find(parent[x])
|
| 1269 |
return parent[x]
|
| 1270 |
-
|
| 1271 |
def union(x, y):
|
| 1272 |
-
|
| 1273 |
-
|
| 1274 |
-
|
| 1275 |
-
|
| 1276 |
-
|
| 1277 |
-
# Fusionar todos los clusters similares
|
| 1278 |
fusion_count = 0
|
| 1279 |
for (lbl1, lbl2), sim in similarities.items():
|
| 1280 |
if sim['similar']:
|
| 1281 |
union(lbl1, lbl2)
|
| 1282 |
fusion_count += 1
|
| 1283 |
-
|
| 1284 |
-
# Aplicar fusió als clusters
|
| 1285 |
new_clusters = {}
|
| 1286 |
for lbl, idxs in clusters.items():
|
| 1287 |
root = find(lbl)
|
| 1288 |
-
|
| 1289 |
-
|
| 1290 |
-
new_clusters[root].extend(idxs)
|
| 1291 |
-
|
| 1292 |
-
# Reordenar labels para que sean consecutivos
|
| 1293 |
final_clusters_dict = {}
|
| 1294 |
for i, (root, idxs) in enumerate(sorted(new_clusters.items())):
|
| 1295 |
final_clusters_dict[i] = idxs
|
| 1296 |
-
|
| 1297 |
clusters = final_clusters_dict
|
| 1298 |
final_clusters = len(clusters)
|
| 1299 |
eliminated = initial_clusters - final_clusters
|
| 1300 |
-
|
| 1301 |
-
print(f"[SCENE VALIDATION]
|
| 1302 |
-
|
| 1303 |
-
|
| 1304 |
-
print(f"[SCENE VALIDATION] Clusters finals: {final_clusters}")
|
| 1305 |
-
print(f"[SCENE VALIDATION] Clusters eliminats (fusionats): {eliminated}")
|
| 1306 |
-
print(f"[SCENE VALIDATION] Reducció: {(eliminated/initial_clusters*100):.1f}%")
|
| 1307 |
-
print(f"[SCENE VALIDATION] =======================")
|
| 1308 |
-
|
| 1309 |
-
# Escriure imatges representatives per a cada clúster
|
| 1310 |
base = TEMP_ROOT / video_name / "scenes"
|
| 1311 |
base.mkdir(parents=True, exist_ok=True)
|
| 1312 |
scene_list = []
|
| 1313 |
cap = cv2.VideoCapture(str(dst_video))
|
|
|
|
| 1314 |
for lbl, idxs in sorted(clusters.items(), key=lambda x: x[0]):
|
| 1315 |
scene_id = f"scene_{int(lbl):02d}"
|
| 1316 |
out_dir = base / scene_id
|
|
@@ -1329,53 +956,36 @@ async def detect_scenes(
|
|
| 1329 |
# Representative
|
| 1330 |
rep = frame_files[0] if frame_files else None
|
| 1331 |
image_url = f"/files_scene/{video_name}/{scene_id}/{rep}" if rep else ""
|
| 1332 |
-
|
| 1333 |
-
# Llamar a svision para describir la escena
|
| 1334 |
scene_description = ""
|
| 1335 |
-
scene_name = f"
|
| 1336 |
if rep:
|
| 1337 |
rep_full_path = out_dir / rep
|
| 1338 |
if rep_full_path.exists():
|
| 1339 |
-
print(f"Llamando a svision para describir {scene_id}...")
|
| 1340 |
try:
|
| 1341 |
-
scene_description,
|
| 1342 |
-
|
| 1343 |
-
scene_name = f"Escena {lbl+1}"
|
| 1344 |
-
|
| 1345 |
-
# Si tenemos descripción, generar nombre corto con schat
|
| 1346 |
if scene_description:
|
| 1347 |
-
print(f"Llamando a schat para generar nombre corto de {scene_id}...")
|
| 1348 |
try:
|
| 1349 |
-
# Usar LLMRouter para llamar a schat
|
| 1350 |
config_path = os.getenv("CONFIG_YAML", "config.yaml")
|
| 1351 |
if os.path.exists(config_path):
|
| 1352 |
with open(config_path, 'r', encoding='utf-8') as f:
|
| 1353 |
cfg = yaml.safe_load(f) or {}
|
| 1354 |
router = LLMRouter(cfg)
|
| 1355 |
-
|
| 1356 |
-
prompt = f"Basant-te en aquesta descripció d'una escena, genera un nom curt de menys de 3 paraules que la resumeixi:\n\n{scene_description}\n\nNom de l'escena:"
|
| 1357 |
-
|
| 1358 |
short_name = router.instruct(
|
| 1359 |
prompt=prompt,
|
| 1360 |
-
system="
|
| 1361 |
model="salamandra-instruct"
|
| 1362 |
-
).strip()
|
| 1363 |
-
|
| 1364 |
-
# Limpiar posibles comillas o puntuación extra
|
| 1365 |
-
short_name = short_name.strip('"\'.,!?').strip()
|
| 1366 |
-
|
| 1367 |
-
if short_name and len(short_name) > 0:
|
| 1368 |
scene_name = short_name
|
| 1369 |
-
|
| 1370 |
-
|
| 1371 |
-
print(f"[schat] No s'ha generat nom, usant fallback")
|
| 1372 |
-
except Exception as e_schat:
|
| 1373 |
-
print(f"Error generando nombre con schat: {e_schat}")
|
| 1374 |
-
# Mantener el nombre de svision si schat falla
|
| 1375 |
-
|
| 1376 |
except Exception as e:
|
| 1377 |
print(f"Error describiendo {scene_id}: {e}")
|
| 1378 |
-
|
| 1379 |
scene_list.append({
|
| 1380 |
"id": scene_id,
|
| 1381 |
"name": scene_name,
|
|
@@ -1385,8 +995,8 @@ async def detect_scenes(
|
|
| 1385 |
"image_url": image_url,
|
| 1386 |
"frame_files": frame_files,
|
| 1387 |
})
|
| 1388 |
-
cap.release()
|
| 1389 |
|
|
|
|
| 1390 |
return {"scene_clusters": scene_list, "base_dir": str(base)}
|
| 1391 |
|
| 1392 |
@app.post("/refine_narration")
|
|
|
|
| 75 |
app.include_router(pending_videos_router)
|
| 76 |
|
| 77 |
def describe_image_with_svision(image_path: str, is_face: bool = True) -> tuple[str, str]:
|
| 78 |
+
"""Call the svision Space to describe an image (used in AD generation).
|
| 79 |
+
|
|
|
|
| 80 |
Args:
|
| 81 |
+
image_path: Absolute path to the image.
|
| 82 |
+
is_face: True if the image is a face, False if it is a scene.
|
| 83 |
+
|
| 84 |
Returns:
|
| 85 |
+
Tuple ``(full_description, short_name)``.
|
| 86 |
"""
|
| 87 |
try:
|
| 88 |
from pathlib import Path as _P
|
| 89 |
import yaml
|
| 90 |
from llm_router import LLMRouter
|
| 91 |
|
| 92 |
+
# Load configuration
|
| 93 |
config_path = _P(__file__).parent / "config.yaml"
|
| 94 |
if not config_path.exists():
|
| 95 |
print(f"[svision] Config no encontrado: {config_path}")
|
|
|
|
| 100 |
|
| 101 |
router = LLMRouter(cfg)
|
| 102 |
|
| 103 |
+
# Different context depending on whether the image is a face or a scene
|
| 104 |
if is_face:
|
| 105 |
context = {
|
| 106 |
"task": "describe_person",
|
|
|
|
| 114 |
"max_tokens": 128
|
| 115 |
}
|
| 116 |
|
| 117 |
+
# Call svision
|
| 118 |
descriptions = router.vision_describe([str(image_path)], context=context, model="salamandra-vision")
|
| 119 |
full_description = descriptions[0] if descriptions else ""
|
| 120 |
|
|
|
|
| 132 |
return ("", "")
|
| 133 |
|
| 134 |
def normalize_face_lighting(image):
    """Even out the brightness of a face crop before further processing.

    The image is converted to the LAB color space and only its lightness
    channel is modified, in three stages:

    1. CLAHE (contrast-limited adaptive histogram equalization).
    2. A linear stretch of the equalized values to the full 0-255 range.
    3. A 3x3 Gaussian blur to soften noise amplified by the first two stages.

    The two chroma channels are passed through untouched.

    Args:
        image: Image in OpenCV BGR channel order.
            NOTE(review): presumably 8-bit, 3-channel -- confirm with callers.

    Returns:
        The lighting-normalized image, converted back to BGR.
    """
    import cv2
    import numpy as np

    # Work in LAB so that brightness can be adjusted independently of color.
    lightness, chroma_a, chroma_b = cv2.split(
        cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
    )

    # Adaptive equalization of the lightness channel only.
    equalizer = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    equalized = equalizer.apply(lightness)

    # Stretch to the full range; a perfectly flat channel (max == min) is
    # left as-is, which also avoids a division by zero.
    lowest, highest = equalized.min(), equalized.max()
    stretched = equalized
    if highest > lowest:
        stretched = ((equalized - lowest) * 255.0 / (highest - lowest)).astype(np.uint8)

    # Light smoothing of the adjusted lightness channel.
    smoothed = cv2.GaussianBlur(stretched, (3, 3), 0)

    # Reassemble LAB with the original chroma and return to BGR.
    recombined = cv2.merge([smoothed, chroma_a, chroma_b])
    return cv2.cvtColor(recombined, cv2.COLOR_LAB2BGR)
|
| 179 |
|
| 180 |
def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
|
| 181 |
+
"""Hierarchical clustering with silhouette score and a minimum cluster size.
|
| 182 |
+
|
| 183 |
+
It automatically selects the best number of clusters (up to ``max_groups``)
|
| 184 |
+
using the silhouette score, and then filters out clusters with fewer than
|
| 185 |
+
``min_cluster_size`` samples (marked as -1 / noise).
|
| 186 |
+
|
| 187 |
Args:
|
| 188 |
+
X: Embedding array of shape (N, D).
|
| 189 |
+
max_groups: Maximum number of clusters to form.
|
| 190 |
+
min_cluster_size: Minimum size for a cluster to be considered valid.
|
| 191 |
+
sensitivity: Clustering sensitivity (0.0–1.0).
|
| 192 |
+
- 0.0 = very aggressive (fewer clusters).
|
| 193 |
+
- 0.5 = balanced (recommended).
|
| 194 |
+
- 1.0 = permissive (more clusters).
|
| 195 |
+
|
| 196 |
Returns:
|
| 197 |
+
``np.ndarray`` of labels (N,), where -1 indicates noise.
|
| 198 |
"""
|
| 199 |
import numpy as np
|
| 200 |
from scipy.cluster.hierarchy import linkage, fcluster
|
|
|
|
| 205 |
return np.array([])
|
| 206 |
|
| 207 |
if len(X) < min_cluster_size:
|
| 208 |
+
# If there are fewer samples than the minimum, treat everything as noise
|
| 209 |
return np.full(len(X), -1, dtype=int)
|
| 210 |
|
| 211 |
+
# Linkage using average linkage (more flexible than ward and less sensitive to outliers)
|
| 212 |
+
# This helps group the same person under different angles/expressions
|
| 213 |
Z = linkage(X, method='average', metric='cosine') # Cosine similarity para embeddings
|
| 214 |
|
| 215 |
+
# Find the optimal number of clusters using the silhouette score
|
| 216 |
best_n_clusters = 2
|
| 217 |
best_score = -1
|
| 218 |
|
| 219 |
+
# Try different numbers of clusters (from 2 to max_groups)
|
| 220 |
+
max_to_try = min(max_groups, len(X) - 1) # Cannot have more clusters than samples
|
| 221 |
|
| 222 |
if max_to_try >= 2:
|
| 223 |
for n_clusters in range(2, max_to_try + 1):
|
| 224 |
trial_labels = fcluster(Z, t=n_clusters, criterion='maxclust') - 1
|
| 225 |
+
|
| 226 |
+
# Compute how many valid clusters we would have after filtering
|
| 227 |
trial_counts = Counter(trial_labels)
|
| 228 |
valid_clusters = sum(1 for count in trial_counts.values() if count >= min_cluster_size)
|
| 229 |
|
| 230 |
+
# Only evaluate if there are at least 2 valid clusters
|
| 231 |
if valid_clusters >= 2:
|
| 232 |
try:
|
| 233 |
score = silhouette_score(X, trial_labels, metric='cosine')
|
| 234 |
+
# Dynamic penalty based on sensitivity:
|
| 235 |
+
# - sensitivity = 0.0 → penalty = 0.14 (very aggressive, fewer clusters)
|
| 236 |
+
# - sensitivity = 0.5 → penalty = 0.07 (balanced, recommended)
|
| 237 |
+
# - sensitivity = 1.0 → penalty = 0.01 (permissive, more clusters)
|
| 238 |
penalty = 0.14 - (sensitivity * 0.13)
|
| 239 |
adjusted_score = score - (n_clusters * penalty)
|
| 240 |
|
|
|
|
| 244 |
except:
|
| 245 |
pass # Si falla el cálculo, ignorar esta configuración
|
| 246 |
|
| 247 |
+
# Use the optimal number of clusters found
|
| 248 |
penalty = 0.14 - (sensitivity * 0.13)
|
| 249 |
print(f"Clustering óptimo: {best_n_clusters} clusters (de máximo {max_groups}), sensitivity={sensitivity:.2f}, penalty={penalty:.3f}, silhouette={best_score:.3f}")
|
| 250 |
labels = fcluster(Z, t=best_n_clusters, criterion='maxclust')
|
| 251 |
|
| 252 |
+
# fcluster returns 1-indexed labels; convert to 0-indexed
|
| 253 |
labels = labels - 1
|
| 254 |
|
| 255 |
+
# Filter out small clusters
|
| 256 |
label_counts = Counter(labels)
|
| 257 |
filtered_labels = []
|
| 258 |
for lbl in labels:
|
| 259 |
if label_counts[lbl] >= min_cluster_size:
|
| 260 |
filtered_labels.append(lbl)
|
| 261 |
else:
|
| 262 |
+
filtered_labels.append(-1) # Noise
|
| 263 |
|
| 264 |
return np.array(filtered_labels, dtype=int)
|
| 265 |
|
|
|
|
| 292 |
voice_sensitivity: float = Form(default=0.5),
|
| 293 |
max_frames: int = Form(default=100),
|
| 294 |
):
|
| 295 |
+
"""Create a background job to process a video using hierarchical clustering.
|
| 296 |
+
|
| 297 |
+
This endpoint stores the uploaded video, creates a job entry and
|
| 298 |
+
starts ``process_video_job`` in the background. It immediately
|
| 299 |
+
returns a ``job_id`` that the UI can poll.
|
| 300 |
"""
|
| 301 |
+
# Save video into the data folder
|
|
|
|
|
|
|
|
|
|
| 302 |
video_name = Path(video.filename).stem
|
| 303 |
dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
|
| 304 |
with dst_video.open("wb") as f:
|
| 305 |
shutil.copyfileobj(video.file, f)
|
| 306 |
|
| 307 |
+
# Create unique job_id
|
| 308 |
job_id = str(uuid.uuid4())
|
| 309 |
|
| 310 |
+
# Initialize job metadata
|
| 311 |
jobs[job_id] = {
|
| 312 |
"id": job_id,
|
| 313 |
"status": JobStatus.QUEUED,
|
|
|
|
| 327 |
|
| 328 |
print(f"[{job_id}] Job creado para vídeo: {video_name}")
|
| 329 |
|
| 330 |
+
# Start processing in the background
|
| 331 |
background_tasks.add_task(process_video_job, job_id)
|
| 332 |
|
| 333 |
# Devolver job_id inmediatamente
|
|
|
|
| 568 |
# Construir carpetas por clúster con validación DeepFace
|
| 569 |
from face_classifier import validate_and_classify_face, FACE_CONFIDENCE_THRESHOLD
|
| 570 |
|
| 571 |
+
characters_validated: list[dict[str, Any]] = []
|
| 572 |
cluster_map: dict[int, list[int]] = {}
|
| 573 |
+
fallback_candidate: dict[str, Any] | None = None
|
| 574 |
+
for idx, lbl in enumerate(labels):
|
| 575 |
if isinstance(lbl, int) and lbl >= 0:
|
| 576 |
+
cluster_map.setdefault(lbl, []).append(idx)
|
| 577 |
|
| 578 |
chars_dir = base / "characters"
|
| 579 |
chars_dir.mkdir(parents=True, exist_ok=True)
|
| 580 |
import shutil as _sh
|
| 581 |
+
|
| 582 |
original_cluster_count = len(cluster_map)
|
| 583 |
print(f"[{job_id}] Procesando {original_cluster_count} clusters detectados...")
|
| 584 |
+
|
| 585 |
for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
|
| 586 |
char_id = f"char_{ci:02d}"
|
| 587 |
+
|
| 588 |
+
detections: list[dict[str, Any]] = []
|
|
|
|
| 589 |
for j in idxs:
|
| 590 |
meta = crops_meta[j]
|
| 591 |
+
file_name = meta.get("file")
|
| 592 |
+
if not file_name:
|
| 593 |
+
continue
|
| 594 |
box = meta.get("box", [0, 0, 0, 0])
|
| 595 |
+
area = 0
|
| 596 |
if len(box) >= 4:
|
| 597 |
top, right, bottom, left = box
|
| 598 |
+
area = abs(right - left) * abs(bottom - top)
|
| 599 |
+
detections.append({
|
| 600 |
+
"index": j,
|
| 601 |
+
"file": file_name,
|
| 602 |
+
"score": area,
|
| 603 |
+
"box": box,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 604 |
})
|
| 605 |
+
|
| 606 |
+
if not detections:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 607 |
print(f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: sense deteccions, eliminant")
|
| 608 |
continue
|
| 609 |
+
|
| 610 |
+
detections.sort(key=lambda d: d["score"], reverse=True)
|
| 611 |
+
best_face = detections[0]
|
| 612 |
+
best_face_path = faces_root / best_face["file"]
|
| 613 |
+
|
| 614 |
print(f"[{job_id}] [VALIDATION] Cluster {char_id}: validant millor cara (bbox_area={best_face['score']:.0f}px²)")
|
| 615 |
print(f"[{job_id}] [VALIDATION] Cluster {char_id}: millor cara path={best_face_path}")
|
| 616 |
print(f"[{job_id}] [VALIDATION] ▶▶▶ CRIDANT validate_and_classify_face() ◀◀◀")
|
| 617 |
+
|
| 618 |
validation = validate_and_classify_face(str(best_face_path))
|
| 619 |
+
|
| 620 |
print(f"[{job_id}] [VALIDATION] ▶▶▶ validate_and_classify_face() RETORNAT ◀◀◀")
|
| 621 |
+
|
| 622 |
+
candidate_conf = 0.0
|
| 623 |
+
if validation:
|
| 624 |
+
try:
|
| 625 |
+
candidate_conf = float(validation.get("face_confidence", 0.0) or 0.0)
|
| 626 |
+
except Exception:
|
| 627 |
+
candidate_conf = 0.0
|
| 628 |
+
|
| 629 |
+
if not fallback_candidate or candidate_conf > fallback_candidate.get("face_confidence", -1.0):
|
| 630 |
+
fallback_candidate = {
|
| 631 |
+
"char_id": char_id,
|
| 632 |
+
"detection": best_face,
|
| 633 |
+
"validation": validation,
|
| 634 |
+
"path": best_face_path,
|
| 635 |
+
"face_confidence": candidate_conf,
|
| 636 |
+
}
|
| 637 |
+
|
| 638 |
if not validation:
|
| 639 |
print(f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: error en validació DeepFace, eliminant cluster")
|
| 640 |
continue
|
| 641 |
+
|
|
|
|
| 642 |
print(f"[{job_id}] [DEEPFACE RESULT] Cluster {char_id}:")
|
| 643 |
print(f"[{job_id}] - is_valid_face: {validation['is_valid_face']}")
|
| 644 |
print(f"[{job_id}] - face_confidence: {validation['face_confidence']:.3f}")
|
|
|
|
| 647 |
print(f"[{job_id}] - gender_diff: {abs(validation['man_prob'] - validation['woman_prob']):.3f}")
|
| 648 |
print(f"[{job_id}] - gender_assigned: {validation['gender']}")
|
| 649 |
print(f"[{job_id}] - gender_confidence: {validation['gender_confidence']:.3f}")
|
| 650 |
+
|
| 651 |
+
if (not validation.get("is_valid_face")) or (validation.get("face_confidence", 0.0) < FACE_CONFIDENCE_THRESHOLD):
|
| 652 |
+
print(
|
| 653 |
+
f"[{job_id}] [VALIDATION] ✗ Cluster {char_id}: NO ES UNA CARA VÁLIDA "
|
| 654 |
+
f"(face_confidence={validation.get('face_confidence', 0.0):.3f} < threshold={FACE_CONFIDENCE_THRESHOLD}), eliminant tot el clúster"
|
| 655 |
+
)
|
| 656 |
continue
|
| 657 |
+
|
|
|
|
| 658 |
out_dir = chars_dir / char_id
|
| 659 |
out_dir.mkdir(parents=True, exist_ok=True)
|
| 660 |
+
|
| 661 |
+
total_faces = len(detections)
|
|
|
|
| 662 |
max_faces_to_show = (total_faces // 2) + 1
|
| 663 |
+
selected = detections[:max_faces_to_show]
|
| 664 |
+
|
| 665 |
+
files: list[str] = []
|
| 666 |
+
file_urls: list[str] = []
|
| 667 |
+
for det in selected:
|
| 668 |
+
fname = det["file"]
|
|
|
|
| 669 |
src = faces_root / fname
|
| 670 |
dst = out_dir / fname
|
| 671 |
try:
|
| 672 |
_sh.copy2(src, dst)
|
| 673 |
files.append(fname)
|
| 674 |
+
file_urls.append(f"/files/{video_name}/{char_id}/{fname}")
|
| 675 |
except Exception:
|
| 676 |
pass
|
| 677 |
+
|
|
|
|
| 678 |
rep = files[0] if files else None
|
| 679 |
if rep:
|
| 680 |
rep_src = out_dir / rep
|
|
|
|
| 683 |
_sh.copy2(rep_src, rep_dst)
|
| 684 |
except Exception:
|
| 685 |
pass
|
| 686 |
+
|
| 687 |
+
cluster_number = int(char_id.split("_")[1]) + 1
|
|
|
|
| 688 |
character_name = f"Cluster {cluster_number}"
|
| 689 |
+
gender = validation.get("gender", "Neutral")
|
| 690 |
+
|
| 691 |
print(f"[{job_id}] [NAME GENERATION] Cluster {char_id}:")
|
| 692 |
print(f"[{job_id}] - Gender detectado: {gender}")
|
| 693 |
print(f"[{job_id}] - Nombre asignado: {character_name}")
|
| 694 |
print(f"[{job_id}] - Seed usado: {char_id}")
|
| 695 |
+
|
| 696 |
+
characters_validated.append({
|
| 697 |
"id": char_id,
|
| 698 |
"name": character_name,
|
| 699 |
"gender": gender,
|
| 700 |
+
"gender_confidence": validation.get("gender_confidence", 0.0),
|
| 701 |
+
"face_confidence": validation.get("face_confidence", 0.0),
|
| 702 |
+
"man_prob": validation.get("man_prob", 0.0),
|
| 703 |
+
"woman_prob": validation.get("woman_prob", 0.0),
|
| 704 |
"folder": str(out_dir),
|
| 705 |
"num_faces": len(files),
|
| 706 |
"total_faces_detected": total_faces,
|
| 707 |
"image_url": f"/files/{video_name}/{char_id}/representative.jpg" if rep else "",
|
| 708 |
+
"face_files": file_urls,
|
| 709 |
+
})
|
| 710 |
+
|
|
|
|
|
|
|
| 711 |
print(f"[{job_id}] [VALIDATION] ✓ Cluster {char_id}: CARA VÁLIDA!")
|
| 712 |
print(f"[{job_id}] Nombre: {character_name}")
|
| 713 |
+
print(f"[{job_id}] Género: {gender} (man={validation.get('man_prob', 0.0):.3f}, woman={validation.get('woman_prob', 0.0):.3f})")
|
| 714 |
+
print(f"[{job_id}] Confianza género: {validation.get('gender_confidence', 0.0):.3f}")
|
| 715 |
+
print(f"[{job_id}] Confianza cara: {validation.get('face_confidence', 0.0):.3f}")
|
| 716 |
print(f"[{job_id}] Caras mostradas: {len(files)}/{total_faces}")
|
| 717 |
print(f"[{job_id}] Imagen representativa: {best_face_path.name}")
|
| 718 |
+
|
|
|
|
| 719 |
eliminated_count = original_cluster_count - len(characters_validated)
|
| 720 |
print(f"[{job_id}] [VALIDATION] Total: {len(characters_validated)} clústers vàlids "
|
| 721 |
f"(eliminats {eliminated_count} falsos positius)")
|
|
|
|
|
|
|
| 722 |
|
| 723 |
+
if not characters_validated and fallback_candidate:
|
| 724 |
+
print(f"[{job_id}] [FALLBACK] No hi ha clústers vàlids. Creant clúster de reserva amb la millor cara trobada.")
|
| 725 |
+
fallback = fallback_candidate
|
| 726 |
+
det = fallback.get("detection", {})
|
| 727 |
+
fname = det.get("file")
|
| 728 |
+
fallback_path: Path | None = fallback.get("path")
|
| 729 |
+
val = fallback.get("validation")
|
| 730 |
+
idx = det.get("index")
|
| 731 |
+
|
| 732 |
+
if fname and fallback_path is not None:
|
| 733 |
+
if val is None:
|
| 734 |
+
val = validate_and_classify_face(str(fallback_path))
|
| 735 |
+
if val is None:
|
| 736 |
+
val = {
|
| 737 |
+
"is_valid_face": False,
|
| 738 |
+
"face_confidence": fallback.get("face_confidence", 0.0),
|
| 739 |
+
"gender": "Neutral",
|
| 740 |
+
"gender_confidence": 0.0,
|
| 741 |
+
"man_prob": 0.0,
|
| 742 |
+
"woman_prob": 0.0,
|
| 743 |
+
}
|
| 744 |
+
|
| 745 |
+
out_dir = chars_dir / "char_00"
|
| 746 |
+
out_dir.mkdir(parents=True, exist_ok=True)
|
| 747 |
|
| 748 |
+
src = faces_root / fname
|
| 749 |
+
dst = out_dir / fname
|
| 750 |
+
try:
|
| 751 |
+
_sh.copy2(src, dst)
|
| 752 |
+
except Exception as copy_err:
|
| 753 |
+
print(f"[{job_id}] [FALLBACK] Error copiant la imatge de reserva: {copy_err}")
|
| 754 |
|
| 755 |
+
rep_dst = out_dir / "representative.jpg"
|
| 756 |
+
try:
|
| 757 |
+
_sh.copy2(dst, rep_dst)
|
| 758 |
+
except Exception:
|
| 759 |
+
pass
|
| 760 |
+
|
| 761 |
+
if embeddings:
|
| 762 |
+
if not labels or len(labels) != len(embeddings):
|
| 763 |
+
labels = [-1] * len(embeddings)
|
| 764 |
+
if isinstance(idx, int) and 0 <= idx < len(labels):
|
| 765 |
+
labels[idx] = 0
|
| 766 |
+
|
| 767 |
+
characters_validated.append({
|
| 768 |
+
"id": "char_00",
|
| 769 |
+
"name": "Cluster 1",
|
| 770 |
+
"gender": val.get("gender", "Neutral"),
|
| 771 |
+
"gender_confidence": val.get("gender_confidence", 0.0),
|
| 772 |
+
"face_confidence": val.get("face_confidence", 0.0),
|
| 773 |
+
"man_prob": val.get("man_prob", 0.0),
|
| 774 |
+
"woman_prob": val.get("woman_prob", 0.0),
|
| 775 |
+
"folder": str(out_dir),
|
| 776 |
+
"num_faces": 1,
|
| 777 |
+
"total_faces_detected": 1,
|
| 778 |
+
"image_url": f"/files/{video_name}/char_00/representative.jpg",
|
| 779 |
+
"face_files": [f"/files/{video_name}/char_00/{fname}"],
|
| 780 |
+
})
|
| 781 |
+
|
| 782 |
+
print(f"[{job_id}] [FALLBACK] Clúster de reserva creat amb confiança {val.get('face_confidence', 0.0):.3f}")
|
| 783 |
+
else:
|
| 784 |
+
print(f"[{job_id}] [FALLBACK] Dades insuficients per crear el clúster de reserva")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 785 |
|
| 786 |
+
# Guardar resultados de caras
|
| 787 |
job["results"] = {
|
| 788 |
+
"characters": characters_validated,
|
| 789 |
+
"face_labels": labels,
|
| 790 |
+
"video_name": video_name,
|
| 791 |
"base_dir": str(base),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 792 |
}
|
| 793 |
job["status"] = JobStatus.DONE
|
| 794 |
+
print(f"[{job_id}] ✓ Procesamiento de caras completado: {len(characters_validated)} personajes")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 795 |
|
| 796 |
+
except Exception as face_error:
|
| 797 |
+
print(f"[{job_id}] Error en detección de caras: {face_error}")
|
| 798 |
+
import traceback
|
| 799 |
+
traceback.print_exc()
|
| 800 |
+
job["results"] = {"characters": [], "face_labels": [], "video_name": video_name, "base_dir": str(base)}
|
| 801 |
+
job["status"] = JobStatus.DONE # Still mark done so UI can proceed
|
|
|
|
| 802 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 803 |
except Exception as e:
|
| 804 |
+
print(f"[{job_id}] Error general en procesamiento: {e}")
|
| 805 |
import traceback
|
| 806 |
+
traceback.print_exc()
|
| 807 |
+
job["status"] = JobStatus.FAILED
|
| 808 |
+
job["error"] = str(e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 809 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 810 |
|
| 811 |
@app.post("/detect_scenes")
|
| 812 |
async def detect_scenes(
|
| 813 |
+
video_name: str = Form(...),
|
| 814 |
+
max_groups: int = Form(default=5),
|
| 815 |
min_cluster_size: int = Form(default=3),
|
| 816 |
scene_sensitivity: float = Form(default=0.5),
|
|
|
|
| 817 |
):
|
| 818 |
"""
|
| 819 |
+
Detecta y agrupa escenas en un vídeo ya procesado.
|
|
|
|
| 820 |
"""
|
| 821 |
import cv2
|
| 822 |
import numpy as np
|
| 823 |
+
from typing import Any
|
| 824 |
|
|
|
|
|
|
|
| 825 |
dst_video = VIDEOS_ROOT / f"{video_name}.mp4"
|
| 826 |
+
if not dst_video.exists():
|
| 827 |
+
return {"error": f"Video {video_name} not found"}
|
| 828 |
|
| 829 |
cap = cv2.VideoCapture(str(dst_video))
|
| 830 |
if not cap.isOpened():
|
| 831 |
+
return {"error": "Could not open video"}
|
| 832 |
|
| 833 |
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
|
| 834 |
+
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
|
| 835 |
+
max_samples = 200 # Limit samples for scene detection
|
| 836 |
+
|
| 837 |
+
if total_frames > 0:
|
| 838 |
+
frame_indices = sorted(set(np.linspace(0, max(0, total_frames - 1), num=min(max_samples, max(1, total_frames)), dtype=int).tolist()))
|
| 839 |
+
else:
|
| 840 |
+
frame_indices = []
|
| 841 |
|
| 842 |
+
frames: list[list[float]] = []
|
| 843 |
+
metas: list[dict[str, Any]] = []
|
| 844 |
+
|
| 845 |
+
for frame_idx in frame_indices:
|
| 846 |
+
cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
|
| 847 |
+
ret, frame = cap.read()
|
| 848 |
if not ret:
|
| 849 |
+
continue
|
| 850 |
+
# Color histogram as feature
|
| 851 |
+
hist = cv2.calcHist([frame], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
|
| 852 |
+
hist = cv2.normalize(hist, hist).flatten()
|
| 853 |
+
frames.append(hist.tolist())
|
| 854 |
+
metas.append({"index": frame_idx})
|
| 855 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 856 |
cap.release()
|
| 857 |
|
| 858 |
if not frames:
|
| 859 |
+
return {"scene_clusters": [], "base_dir": ""}
|
| 860 |
|
| 861 |
X = np.array(frames)
|
| 862 |
labels = hierarchical_cluster_with_min_size(X, max_groups, min_cluster_size, scene_sensitivity).tolist()
|
|
|
|
| 869 |
if lbl is None or lbl < 0:
|
| 870 |
continue
|
| 871 |
clusters.setdefault(int(lbl), []).append(i)
|
| 872 |
+
|
| 873 |
+
# Fallback: garantir mínim 1 cluster d'escena
|
| 874 |
+
if not clusters and frames:
|
| 875 |
+
clusters[0] = [0] # Usar el primer frame com a escena per defecte
|
| 876 |
+
print("[SCENE FALLBACK] Cap cluster vàlid, creant cluster amb primer frame")
|
| 877 |
+
|
| 878 |
+
# VALIDACIÓ MILLORADA: Fusionar clusters molt similars
|
| 879 |
centroids = {}
|
| 880 |
for lbl, idxs in clusters.items():
|
| 881 |
cluster_histograms = X[idxs]
|
| 882 |
centroids[lbl] = np.mean(cluster_histograms, axis=0)
|
| 883 |
+
|
| 884 |
print(f"[SCENE VALIDATION] Validant similaritat entre {len(centroids)} clusters...")
|
| 885 |
+
|
| 886 |
+
SIMILARITY_THRESHOLD = 0.25
|
| 887 |
+
CORRELATION_THRESHOLD = 0.85
|
| 888 |
+
|
|
|
|
|
|
|
| 889 |
cluster_labels = sorted(centroids.keys())
|
| 890 |
similarities = {}
|
| 891 |
+
|
| 892 |
for i, lbl1 in enumerate(cluster_labels):
|
| 893 |
for lbl2 in cluster_labels[i+1:]:
|
|
|
|
| 894 |
dist = np.linalg.norm(centroids[lbl1] - centroids[lbl2])
|
| 895 |
+
corr = np.corrcoef(centroids[lbl1], centroids[lbl2])[0, 1] if len(centroids[lbl1]) > 1 else 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 896 |
are_similar = (dist < SIMILARITY_THRESHOLD) or (corr > CORRELATION_THRESHOLD)
|
| 897 |
+
similarities[(lbl1, lbl2)] = {'distance': dist, 'correlation': corr, 'similar': are_similar}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 898 |
if are_similar:
|
| 899 |
+
print(f"[SCENE VALIDATION] Clusters {lbl1} i {lbl2} similars: dist={dist:.3f}, corr={corr:.3f}")
|
| 900 |
+
|
| 901 |
+
# Union-Find para fusionar clusters
|
|
|
|
|
|
|
|
|
|
| 902 |
parent = {lbl: lbl for lbl in cluster_labels}
|
| 903 |
+
|
| 904 |
def find(x):
|
| 905 |
if parent[x] != x:
|
| 906 |
+
parent[x] = find(parent[x])
|
| 907 |
return parent[x]
|
| 908 |
+
|
| 909 |
def union(x, y):
|
| 910 |
+
rx, ry = find(x), find(y)
|
| 911 |
+
if rx != ry:
|
| 912 |
+
parent[ry] = rx
|
| 913 |
+
|
|
|
|
|
|
|
| 914 |
fusion_count = 0
|
| 915 |
for (lbl1, lbl2), sim in similarities.items():
|
| 916 |
if sim['similar']:
|
| 917 |
union(lbl1, lbl2)
|
| 918 |
fusion_count += 1
|
| 919 |
+
|
|
|
|
| 920 |
new_clusters = {}
|
| 921 |
for lbl, idxs in clusters.items():
|
| 922 |
root = find(lbl)
|
| 923 |
+
new_clusters.setdefault(root, []).extend(idxs)
|
| 924 |
+
|
|
|
|
|
|
|
|
|
|
| 925 |
final_clusters_dict = {}
|
| 926 |
for i, (root, idxs) in enumerate(sorted(new_clusters.items())):
|
| 927 |
final_clusters_dict[i] = idxs
|
| 928 |
+
|
| 929 |
clusters = final_clusters_dict
|
| 930 |
final_clusters = len(clusters)
|
| 931 |
eliminated = initial_clusters - final_clusters
|
| 932 |
+
|
| 933 |
+
print(f"[SCENE VALIDATION] Clusters finals: {final_clusters} (fusionats: {eliminated})")
|
| 934 |
+
|
| 935 |
+
# Escriure imatges representatives
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 936 |
base = TEMP_ROOT / video_name / "scenes"
|
| 937 |
base.mkdir(parents=True, exist_ok=True)
|
| 938 |
scene_list = []
|
| 939 |
cap = cv2.VideoCapture(str(dst_video))
|
| 940 |
+
|
| 941 |
for lbl, idxs in sorted(clusters.items(), key=lambda x: x[0]):
|
| 942 |
scene_id = f"scene_{int(lbl):02d}"
|
| 943 |
out_dir = base / scene_id
|
|
|
|
| 956 |
# Representative
|
| 957 |
rep = frame_files[0] if frame_files else None
|
| 958 |
image_url = f"/files_scene/{video_name}/{scene_id}/{rep}" if rep else ""
|
| 959 |
+
|
| 960 |
+
# Llamar a svision para describir la escena
|
| 961 |
scene_description = ""
|
| 962 |
+
scene_name = f"Cluster {lbl+1}"
|
| 963 |
if rep:
|
| 964 |
rep_full_path = out_dir / rep
|
| 965 |
if rep_full_path.exists():
|
|
|
|
| 966 |
try:
|
| 967 |
+
scene_description, _ = describe_image_with_svision(str(rep_full_path), is_face=False)
|
| 968 |
+
# Generar nombre corto con schat
|
|
|
|
|
|
|
|
|
|
| 969 |
if scene_description:
|
|
|
|
| 970 |
try:
|
|
|
|
| 971 |
config_path = os.getenv("CONFIG_YAML", "config.yaml")
|
| 972 |
if os.path.exists(config_path):
|
| 973 |
with open(config_path, 'r', encoding='utf-8') as f:
|
| 974 |
cfg = yaml.safe_load(f) or {}
|
| 975 |
router = LLMRouter(cfg)
|
| 976 |
+
prompt = f"Genera un nom curt (2-3 paraules) per aquesta escena:\n{scene_description}"
|
|
|
|
|
|
|
| 977 |
short_name = router.instruct(
|
| 978 |
prompt=prompt,
|
| 979 |
+
system="Respon NOMÉS amb el nom, sense explicacions.",
|
| 980 |
model="salamandra-instruct"
|
| 981 |
+
).strip().strip('"\'.,!?')
|
| 982 |
+
if short_name:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 983 |
scene_name = short_name
|
| 984 |
+
except Exception:
|
| 985 |
+
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 986 |
except Exception as e:
|
| 987 |
print(f"Error describiendo {scene_id}: {e}")
|
| 988 |
+
|
| 989 |
scene_list.append({
|
| 990 |
"id": scene_id,
|
| 991 |
"name": scene_name,
|
|
|
|
| 995 |
"image_url": image_url,
|
| 996 |
"frame_files": frame_files,
|
| 997 |
})
|
|
|
|
| 998 |
|
| 999 |
+
cap.release()
|
| 1000 |
return {"scene_clusters": scene_list, "base_dir": str(base)}
|
| 1001 |
|
| 1002 |
@app.post("/refine_narration")
|