Update preprocessing_router.py
Browse files
- preprocessing_router.py: +68 -13
preprocessing_router.py
CHANGED
|
@@ -21,6 +21,9 @@ from storage.media_routers import upload_video
|
|
| 21 |
import svision_client
|
| 22 |
import asr_client
|
| 23 |
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
ROOT = Path("/tmp/veureu")
|
| 26 |
ROOT.mkdir(parents=True, exist_ok=True)
|
|
@@ -890,22 +893,74 @@ def process_video_job(job_id: str):
|
|
| 890 |
print(f"[{job_id}] \u2713 {len(audio_segments)} segmentos de audio procesados")
|
| 891 |
|
| 892 |
# Cluster voice embeddings
|
| 893 |
-
print(voice_embeddings)
|
| 894 |
if voice_embeddings:
|
| 895 |
-
print(f"[{job_id}] Clustering
|
| 896 |
-
print(f"[{job_id}] - voice_embeddings: {len(voice_embeddings)}
|
| 897 |
-
print(f"[{job_id}] -
|
|
|
|
|
|
|
|
|
|
|
|
|
| 898 |
Xv = np.array(voice_embeddings)
|
| 899 |
Xv = Xv / np.linalg.norm(Xv, axis=1, keepdims=True)
|
| 900 |
-
|
| 901 |
-
|
| 902 |
-
|
| 903 |
-
|
| 904 |
-
|
| 905 |
-
|
| 906 |
-
|
| 907 |
-
|
| 908 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 909 |
|
| 910 |
diarization_info = {
|
| 911 |
"num_segments": len(audio_segments),
|
|
|
|
| 21 |
import svision_client
|
| 22 |
import asr_client
|
| 23 |
|
| 24 |
+
from sklearn.cluster import KMeans
|
| 25 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 26 |
+
|
| 27 |
|
| 28 |
ROOT = Path("/tmp/veureu")
|
| 29 |
ROOT.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 893 |
print(f"[{job_id}] \u2713 {len(audio_segments)} segmentos de audio procesados")
|
| 894 |
|
| 895 |
# Cluster voice embeddings
|
|
|
|
| 896 |
if voice_embeddings:
|
| 897 |
+
print(f"[{job_id}] Clustering KMeans+KNN de voz (forzado)...")
|
| 898 |
+
print(f"[{job_id}] - voice_embeddings: {len(voice_embeddings)}")
|
| 899 |
+
print(f"[{job_id}] - parámetros: grupos={voice_max_groups}, max_por_cluster={voice_min_cluster_size}")
|
| 900 |
+
|
| 901 |
+
# ------------------------------
|
| 902 |
+
# NORMALIZAR EMBEDDINGS
|
| 903 |
+
# ------------------------------
|
| 904 |
Xv = np.array(voice_embeddings)
|
| 905 |
Xv = Xv / np.linalg.norm(Xv, axis=1, keepdims=True)
|
| 906 |
+
|
| 907 |
+
N = len(Xv)
|
| 908 |
+
K = max(1, voice_max_groups) # número de clusters forzado para KMeans (al menos 1)
|
| 909 |
+
MAX_PER_CLUSTER = max(1, voice_min_cluster_size) # NOTA: pese a su nombre, voice_min_cluster_size se usa aquí como tamaño MÁXIMO por cluster
|
| 910 |
+
|
| 911 |
+
# ------------------------------
|
| 912 |
+
# STEP 1: KMEANS FORZADO
|
| 913 |
+
# ------------------------------
|
| 914 |
+
from sklearn.cluster import KMeans
|
| 915 |
+
|
| 916 |
+
km = KMeans(n_clusters=K, n_init=10, random_state=42)
|
| 917 |
+
labels = km.fit_predict(Xv)
|
| 918 |
+
|
| 919 |
+
print(f"[{job_id}] - Inicial: {labels.tolist()}")
|
| 920 |
+
|
| 921 |
+
# ------------------------------
|
| 922 |
+
# STEP 2: REBALANCEO CON KNN SI HAY CLUSTERS SOBRECARGADOS
|
| 923 |
+
# ------------------------------
|
| 924 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 925 |
+
|
| 926 |
+
for iteration in range(10): # máximo 10 ajustes
|
| 927 |
+
sizes = {c: np.sum(labels == c) for c in range(K)}
|
| 928 |
+
bad_clusters = [c for c, s in sizes.items() if s > MAX_PER_CLUSTER]
|
| 929 |
+
|
| 930 |
+
print(f"[{job_id}] - Iter {iteration}: tamaños={sizes}")
|
| 931 |
+
|
| 932 |
+
if not bad_clusters:
|
| 933 |
+
break # Todo OK, ningún cluster supera el límite
|
| 934 |
+
|
| 935 |
+
# Entrenar KNN usando SOLO clusters válidos
|
| 936 |
+
good_indices = []
|
| 937 |
+
for c in range(K):
|
| 938 |
+
idx = np.where(labels == c)[0]
|
| 939 |
+
if len(idx) <= MAX_PER_CLUSTER:
|
| 940 |
+
good_indices.extend(idx)
|
| 941 |
+
|
| 942 |
+
if len(good_indices) == 0:
|
| 943 |
+
print(f"[{job_id}] - No hay clusters válidos para KNN, abortando rebalanceo.")
|
| 944 |
+
break
|
| 945 |
+
|
| 946 |
+
knn = KNeighborsClassifier(n_neighbors=min(3, len(good_indices)))
|
| 947 |
+
knn.fit(Xv[good_indices], labels[good_indices])
|
| 948 |
+
|
| 949 |
+
# Reasignar elementos excedentes
|
| 950 |
+
for c in bad_clusters:
|
| 951 |
+
idx = np.where(labels == c)[0]
|
| 952 |
+
excess = idx[MAX_PER_CLUSTER:] # los que sobran
|
| 953 |
+
|
| 954 |
+
for i in excess:
|
| 955 |
+
new_lab = knn.predict([Xv[i]])[0]
|
| 956 |
+
labels[i] = new_lab
|
| 957 |
+
|
| 958 |
+
voice_labels = labels.tolist()
|
| 959 |
+
n_voice_clusters = len(set(voice_labels))
|
| 960 |
+
|
| 961 |
+
print(f"[{job_id}] - Final voice_labels: {voice_labels}")
|
| 962 |
+
print(f"[{job_id}] ✓ Clustering voz final: {n_voice_clusters} clusters")
|
| 963 |
+
|
| 964 |
|
| 965 |
diarization_info = {
|
| 966 |
"num_segments": len(audio_segments),
|