VeuReu committed on
Commit
a72be74
·
verified ·
1 Parent(s): d5bf8e1

Update preprocessing_router.py

Browse files
Files changed (1) hide show
  1. preprocessing_router.py +68 -13
preprocessing_router.py CHANGED
@@ -21,6 +21,9 @@ from storage.media_routers import upload_video
21
  import svision_client
22
  import asr_client
23
 
 
 
 
24
 
25
  ROOT = Path("/tmp/veureu")
26
  ROOT.mkdir(parents=True, exist_ok=True)
@@ -890,22 +893,74 @@ def process_video_job(job_id: str):
890
  print(f"[{job_id}] \u2713 {len(audio_segments)} segmentos de audio procesados")
891
 
892
  # Cluster voice embeddings
893
- print(voice_embeddings)
894
  if voice_embeddings:
895
- print(f"[{job_id}] Clustering jer\u00e1rquico de voz...")
896
- print(f"[{job_id}] - voice_embeddings: {len(voice_embeddings)} embeddings")
897
- print(f"[{job_id}] - par\u00e1metros: voice_max_groups={voice_max_groups}, voice_min_cluster_size={voice_min_cluster_size}")
 
 
 
 
898
  Xv = np.array(voice_embeddings)
899
  Xv = Xv / np.linalg.norm(Xv, axis=1, keepdims=True)
900
- print(f"[{job_id}] - shape Xv: {Xv.shape}")
901
- voice_labels = hierarchical_cluster_with_min_size(
902
- Xv, voice_max_groups, voice_min_cluster_size, voice_sensitivity
903
- ).tolist()
904
- n_voice_clusters = len(set([l for l in voice_labels if l >= 0]))
905
- print(f"[{job_id}] - voice_labels: {voice_labels}")
906
- print(f"[{job_id}] \u2713 Clustering de voz: {n_voice_clusters} clusters de {len(voice_embeddings)} muestras")
907
- else:
908
- print(f"[{job_id}] \u26a0\ufe0f No hay voice_embeddings para clustering")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
909
 
910
  diarization_info = {
911
  "num_segments": len(audio_segments),
 
21
  import svision_client
22
  import asr_client
23
 
24
+ from sklearn.cluster import KMeans
25
+ from sklearn.neighbors import KNeighborsClassifier
26
+
27
 
28
  ROOT = Path("/tmp/veureu")
29
  ROOT.mkdir(parents=True, exist_ok=True)
 
893
  print(f"[{job_id}] \u2713 {len(audio_segments)} segmentos de audio procesados")
894
 
895
  # Cluster voice embeddings
 
896
  if voice_embeddings:
897
+ print(f"[{job_id}] Clustering KMeans+KNN de voz (forzado)...")
898
+ print(f"[{job_id}] - voice_embeddings: {len(voice_embeddings)}")
899
+ print(f"[{job_id}] - parámetros: grupos={voice_max_groups}, max_por_cluster={voice_min_cluster_size}")
900
+
901
+ # ------------------------------
902
+ # NORMALIZAR EMBEDDINGS
903
+ # ------------------------------
904
  Xv = np.array(voice_embeddings)
905
  Xv = Xv / np.linalg.norm(Xv, axis=1, keepdims=True)
906
+
907
+ N = len(Xv)
908
+ K = max(1, voice_max_groups) # número mínimo de clusters
909
+ MAX_PER_CLUSTER = max(1, voice_min_cluster_size)
910
+
911
+ # ------------------------------
912
+ # STEP 1: KMEANS FORZADO
913
+ # ------------------------------
914
+ from sklearn.cluster import KMeans
915
+
916
+ km = KMeans(n_clusters=K, n_init=10, random_state=42)
917
+ labels = km.fit_predict(Xv)
918
+
919
+ print(f"[{job_id}] - Inicial: {labels.tolist()}")
920
+
921
+ # ------------------------------
922
+ # STEP 2: REBALANCEO CON KNN SI HAY CLUSTERS SOBRECARGADOS
923
+ # ------------------------------
924
+ from sklearn.neighbors import KNeighborsClassifier
925
+
926
+ for iteration in range(10): # máximo 10 ajustes
927
+ sizes = {c: np.sum(labels == c) for c in range(K)}
928
+ bad_clusters = [c for c, s in sizes.items() if s > MAX_PER_CLUSTER]
929
+
930
+ print(f"[{job_id}] - Iter {iteration}: tamaños={sizes}")
931
+
932
+ if not bad_clusters:
933
+ break # Todo OK, ningún cluster supera el límite
934
+
935
+ # Entrenar KNN usando SOLO clusters válidos
936
+ good_indices = []
937
+ for c in range(K):
938
+ idx = np.where(labels == c)[0]
939
+ if len(idx) <= MAX_PER_CLUSTER:
940
+ good_indices.extend(idx)
941
+
942
+ if len(good_indices) == 0:
943
+ print(f"[{job_id}] - No hay clusters válidos para KNN, abortando rebalanceo.")
944
+ break
945
+
946
+ knn = KNeighborsClassifier(n_neighbors=min(3, len(good_indices)))
947
+ knn.fit(Xv[good_indices], labels[good_indices])
948
+
949
+ # Reasignar elementos excedentes
950
+ for c in bad_clusters:
951
+ idx = np.where(labels == c)[0]
952
+ excess = idx[MAX_PER_CLUSTER:] # los que sobran
953
+
954
+ for i in excess:
955
+ new_lab = knn.predict([Xv[i]])[0]
956
+ labels[i] = new_lab
957
+
958
+ voice_labels = labels.tolist()
959
+ n_voice_clusters = len(set(voice_labels))
960
+
961
+ print(f"[{job_id}] - Final voice_labels: {voice_labels}")
962
+ print(f"[{job_id}] ✓ Clustering voz final: {n_voice_clusters} clusters")
963
+
964
 
965
  diarization_info = {
966
  "num_segments": len(audio_segments),