VeuReu committed (verified)
Commit ce5bf11 · 1 Parent(s): 6865b9f

Upload 3 files

Files changed (3):
  1. api.py +108 -1
  2. main_process/main_router.py +314 -302
  3. pipelines/audiodescription.py +147 -147
api.py CHANGED
@@ -16,11 +16,12 @@ import yaml
import io

from video_processing import process_video_pipeline
- from audio_tools import process_audio_for_video, extract_audio_ffmpeg, embed_voice_segments
+ from audio_tools import process_audio_for_video, extract_audio_ffmpeg, embed_voice_segments, VoiceEmbedder
from casting_loader import ensure_chroma, build_faces_index, build_voices_index
from narration_system import NarrationSystem
from llm_router import load_yaml, LLMRouter
from character_detection import detect_characters_from_video
+ from vision_tools import FaceOfImageEmbedding

from pipelines.audiodescription import generate as ad_generate

@@ -1015,6 +1016,111 @@ async def finalize_casting(
    face_identities = sorted([p.name for p in faces_out.iterdir() if p.is_dir()]) if faces_out.exists() else []
    voice_identities = sorted([p.name for p in voices_out.iterdir() if p.is_dir()]) if voices_out.exists() else []

+     # Build casting_json with face and voice embeddings (best-effort) via remote Spaces
+     casting_json = {"face_col": [], "voice_col": []}
+ 
+     # Load the config and router to access the svision/asr Spaces
+     try:
+         cfg = load_yaml("config.yaml")
+         router = LLMRouter(cfg)
+     except Exception:
+         router = None  # type: ignore
+ 
+     # Face embeddings per identity using remote svision (face_image_embedding)
+     try:
+         if face_identities and router is not None:
+             factory = router.client_factories.get("salamandra-vision")  # type: ignore[attr-defined]
+             if factory is not None:
+                 vclient = factory()
+                 gclient = getattr(vclient, "_client", None)
+             else:
+                 gclient = None
+ 
+             if gclient is not None:
+                 for identity in face_identities:
+                     id_dir = faces_out / identity
+                     if not id_dir.is_dir():
+                         continue
+                     # Look for a representative image
+                     img_path = None
+                     for ext in (".jpg", ".jpeg", ".png", ".bmp", ".webp"):
+                         candidates = list(id_dir.glob(f"*{ext}"))
+                         if candidates:
+                             img_path = candidates[0]
+                             break
+                     if not img_path:
+                         continue
+ 
+                     try:
+                         out = gclient.predict(str(img_path), api_name="/face_image_embedding")
+                         # svision normally returns a list of embeddings or a single embedding
+                         emb = None
+                         if isinstance(out, list):
+                             if out and isinstance(out[0], (list, tuple, float, int)):
+                                 # If it is a list of lists, take the first one; if it is a flat list, use it as-is
+                                 if isinstance(out[0], (list, tuple)):
+                                     emb = list(out[0])
+                                 else:
+                                     emb = list(out)
+                         elif isinstance(out, dict) and "embedding" in out:
+                             emb = out.get("embedding")
+ 
+                         if not emb:
+                             continue
+ 
+                         casting_json["face_col"].append({
+                             "nombre": identity,
+                             "embedding": emb,
+                         })
+                     except Exception:
+                         # Do not abort over a single failed embedding
+                         continue
+     except Exception:
+         # If anything in the whole face block fails, leave face_col empty
+         casting_json["face_col"] = []
+ 
+     # Voice embeddings per identity using remote asr (voice_embedding)
+     try:
+         if voice_identities and router is not None:
+             factory = router.client_factories.get("whisper-catalan")  # type: ignore[attr-defined]
+             if factory is not None:
+                 aclient = factory()
+                 gclient = getattr(aclient, "_client", None)
+             else:
+                 gclient = None
+ 
+             if gclient is not None:
+                 for identity in voice_identities:
+                     id_dir = voices_out / identity
+                     if not id_dir.is_dir():
+                         continue
+                     wav_files = sorted([p for p in id_dir.iterdir() if p.is_file() and p.suffix.lower() in [".wav", ".flac", ".mp3"]])
+                     if not wav_files:
+                         continue
+ 
+                     # Take a representative embedding from the first clip
+                     wf = wav_files[0]
+                     try:
+                         out = gclient.predict(str(wf), api_name="/voice_embedding")
+                         emb = None
+                         if isinstance(out, list):
+                             emb = list(out)
+                         elif isinstance(out, dict) and "embedding" in out:
+                             emb = out.get("embedding")
+ 
+                         if not emb:
+                             continue
+ 
+                         casting_json["voice_col"].append({
+                             "nombre": identity,
+                             "embedding": emb,
+                         })
+                     except Exception:
+                         continue
+     except Exception:
+         # If anything in the whole voice block fails, leave voice_col empty
+         casting_json["voice_col"] = []
+ 
    return {
        "ok": True,
        "video_name": video_name,
@@ -1025,6 +1131,7 @@ async def finalize_casting(
        "n_voices_embeddings": n_voices,
        "face_identities": face_identities,
        "voice_identities": voice_identities,
+         "casting_json": casting_json,
    }

@app.get("/files_scene/{video_name}/{scene_id}/{filename}")
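
For reference, the casting_json field that finalize_casting now returns is a plain dict with one entry per recognized identity. A minimal sketch of its shape (embedding values and lengths are illustrative; the actual dimensions depend on the remote svision/asr models):

    casting_json = {
        "face_col": [
            {"nombre": "Ana", "embedding": [0.12, -0.03, 0.44]},   # one entry per face identity
        ],
        "voice_col": [
            {"nombre": "Ana", "embedding": [0.07, 0.21, -0.15]},   # one entry per voice identity
        ],
    }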
main_process/main_router.py CHANGED
@@ -1,303 +1,315 @@
import os
import io

from pathlib import Path
from typing import Counter, List, Dict
import ast
import json
import torch
from svision_client import extract_scenes, add_ocr_and_faces, keyframes_every_second_extraction, extract_descripcion_escena
from asr_client import extract_audio_from_video, diarize_audio, transcribe_long_audio, transcribe_short_audio, identificar_veu

from fastapi import APIRouter, UploadFile, File, Query, HTTPException
from fastapi.responses import JSONResponse, StreamingResponse

from storage.common import validate_token
from storage.files.file_manager import FileManager
+ from storage.embeddings_routers import get_embeddings_json

EMBEDDINGS_ROOT = Path("/data/embeddings")
MEDIA_ROOT = Path("/data/media")
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
router = APIRouter(prefix="/transcription", tags=["Transcription Process"])
HF_TOKEN = os.getenv("HF_TOKEN")

def get_casting(video_sha1: str):
-     # TODO: look it up in the directory
-     face_col = [{"nombre": "Ana", "embedding": [0.1]*512}]
-     voice_col = [{"nombre": "Ana", "embedding": [0.2]*192}]
+     """Retrieve the real casting embeddings for a video from its SHA1.
+ 
+     Reads the embeddings JSON that the demo previously uploaded to /data/embeddings
+     through the /embeddings/upload_embeddings endpoint and returns its
+     face_col and voice_col columns.
+     """
+ 
+     # get_embeddings_json returns the full JSON exactly as it was uploaded (casting_json)
+     faces_json = get_embeddings_json(video_sha1, "faces")
+     voices_json = get_embeddings_json(video_sha1, "voices")
+ 
+     # Keep only the columns the pipeline cares about
+     face_col = faces_json.get("face_col", []) if isinstance(faces_json, dict) else []
+     voice_col = voices_json.get("voice_col", []) if isinstance(voices_json, dict) else []

    return face_col, voice_col

transcripcion_inicial = "/home/acasado/bsc/hugging_face_bsc/engine/results/transcription_initial.srt"
informacion_json = "/home/acasado/bsc/hugging_face_bsc/engine/results/informacion.json"

def map_identities_per_second(frames_per_second, intervals):
    for seg in intervals:
        seg_start = seg["start"]
        seg_end = seg["end"]

        identities = []
        for f in frames_per_second:
            if seg_start <= f["start"] <= seg_end:
                for face in f.get("faces", []):
                    identities.append(face)

        seg["counts"] = dict(Counter(identities))

    return intervals

def _fmt_srt_time(seconds: float) -> str:
    """Format seconds in the SRT format HH:MM:SS,mmm"""
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int((seconds - int(seconds)) * 1000)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"

from pathlib import Path
from typing import List, Dict
from fastapi import HTTPException


def generate_srt_from_segments(segments: List[Dict], sha1: str) -> str:
    """
    Generate an SRT subtitle file from diarization/transcription segments.

    This function:
    - Creates the required folder structure for storing SRTs.
    - Removes any previous SRT files for the same SHA1.
    - Builds the SRT content with timestamps, speaker identity and transcription.
    - Saves the SRT file to disk.
    - Returns the SRT content as a string (to be sent by the endpoint).

    Parameters
    ----------
    segments : List[Dict]
        List of dictionaries containing:
        - "start": float (start time in seconds)
        - "end": float (end time in seconds)
        - "speaker": dict with "identity"
        - "transcription": str
    sha1 : str
        Identifier used to locate the target media folder.

    Returns
    -------
    str
        Full SRT file content as a string.
    """

    # Path: /data/media/<sha1>
    video_root = MEDIA_ROOT / sha1
    video_root.mkdir(parents=True, exist_ok=True)

    # Path: /data/media/<sha1>/srt
    srt_dir = video_root / "srt"
    srt_dir.mkdir(parents=True, exist_ok=True)

    # Delete old SRT files
    try:
        for old_srt in srt_dir.glob("*.srt"):
            old_srt.unlink()
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Failed to delete old SRT files: {exc}")

    # Save file as initial.srt
    final_path = srt_dir / "initial.srt"

    # Build SRT content
    srt_lines = []

    for i, seg in enumerate(segments, start=1):
        start = seg.get("start", 0.0)
        end = seg.get("end", 0.0)
        transcription = seg.get("transcription", "").strip()

        speaker_info = seg.get("speaker", {})
        speaker = speaker_info.get("identity", "Unknown")

        text = f"[{speaker}]: {transcription}" if speaker else transcription

        entry = (
            f"{i}\n"
            f"{_fmt_srt_time(start)} --> {_fmt_srt_time(end)}\n"
            f"{text}\n"
        )
        srt_lines.append(entry)

    # Join with blank lines
    srt_content = "\n".join(srt_lines)

    # Write to disk
    try:
        with final_path.open("w", encoding="utf-8-sig") as f:
            f.write(srt_content)
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Failed to write SRT file: {exc}")

    return srt_content

def pipeline_preprocessing_vision(video_path: str, face_col):
    """
    Pipeline that takes a video and performs all the vision-side preprocessing.
    """

    print(f"Processing video for vision: {video_path}")

    print("Extracting scenes...")
    threshold: float = 30.0
    offset_frames: int = 3
    crop_ratio: float = 0.1
    result_extract_scenes = extract_scenes(video_path, threshold, offset_frames, crop_ratio)
    print(result_extract_scenes)
    # Get the image paths and the scene information
    escenas = result_extract_scenes[0] if len(result_extract_scenes) > 0 else []
    escenas_paths = [f["image"] for f in escenas]
    print(escenas_paths)
    info_escenas = result_extract_scenes[1] if len(result_extract_scenes) > 1 else []
    print(info_escenas)

    print("Extracting one image per second...")
    result_extract_per_second = keyframes_every_second_extraction(video_path)
    # Get the image paths and the per-second information
    images_per_second = result_extract_per_second[0] if len(result_extract_per_second) > 0 else []
    images_per_second_paths = [f["image"] for f in images_per_second]
    info_images_per_second = result_extract_per_second[1] if len(result_extract_per_second) > 1 else []

    print("Enriching the scene information with who appears in each scene plus OCR detection...")
    info_escenas_completa = []
    for imagen_escena, info_escena in zip(escenas_paths, info_escenas):
        result_add_ocr_and_faces = add_ocr_and_faces(imagen_escena, info_escena, face_col)
        info_escenas_completa.append(result_add_ocr_and_faces)

    print("Enriching the per-second images with who appears in each one plus OCR detection...")
    info_images_per_second_completa = []
    for imagen_segundo, info_segundo in zip(images_per_second_paths, info_images_per_second):
        result_add_ocr_and_faces = add_ocr_and_faces(imagen_segundo, info_segundo, face_col)
        info_images_per_second_completa.append(result_add_ocr_and_faces)
    print(info_escenas_completa)

    print("Now the OCR results will be processed (some scenes will be replaced by one of the per-second images if it has better OCR)...")
    # To be done last

    print("Combining scene and per-second image information...")
    info_escenas_completa = map_identities_per_second(info_images_per_second_completa, info_escenas_completa)
    print(info_escenas_completa)

    print("Now the scene descriptions are added to the scene dictionaries.")
    for escena_path, info_escena in zip(escenas_paths, info_escenas_completa):
        descripcion_escena = extract_descripcion_escena(escena_path)
        lista = ast.literal_eval(descripcion_escena)
        frase = lista[0]
        info_escena["descripcion"] = frase
        del descripcion_escena
        torch.cuda.empty_cache()

    return info_escenas_completa, info_images_per_second_completa

def pipeline_preprocessing_audio(video_path: str, voice_col):
    """
    Pipeline that takes a video and performs all the audio-side preprocessing.
    """
    print(f"Processing video for audio: {video_path}")

    print("Extracting audio from the video...")
    audio_video = extract_audio_from_video(video_path)
    print(audio_video)

    print("Diarizing the audio...")
    diarization_audio = diarize_audio(audio_video)
    print(diarization_audio)
    clips_path = diarization_audio[0]
    print(clips_path)
    diarization_info = diarization_audio[1]
    print(diarization_info)

    print("Transcribing the full video...")
    full_transcription = transcribe_long_audio(audio_video)
    print(full_transcription)

    print("Transcribing the diarized clips...")
    for clip_path, clip_info in zip(clips_path, diarization_info):
        clip_transcription = transcribe_short_audio(clip_path)
        clip_info["transcription"] = clip_transcription

    print("Computing the embeddings for each extracted clip and then identifying the voices...")
    for clip_path, clip_info in zip(clips_path, diarization_info):
        clip_speaker = identificar_veu(clip_path, voice_col)
        clip_info["speaker"] = clip_speaker

    return full_transcription, diarization_info

@router.post("/generate_srt", tags=["Transcription Process"])
async def pipeline_video_analysis(
    sha1: str,
    token: str = Query(..., description="Token required for authorization")
):
    """
    Endpoint that processes a full video identified by its SHA1 folder, performs
    complete audio-visual preprocessing, and returns an SRT subtitle file.

    This pipeline integrates:
    - Vision preprocessing (scene detection, keyframes, OCR, face recognition)
    - Audio preprocessing (diarization, speech recognition, speaker identity matching)
    - Identity mapping between vision and audio streams
    - Final generation of an SRT file describing who speaks and when

    Parameters
    ----------
    sha1 : str
        Identifier corresponding to the folder containing the video and related assets.
    token : str
        Security token required for authorization.

    Returns
    -------
    str
        The generated SRT file (as text) containing time-aligned subtitles with
        speaker identities and transcriptions.
    """

    validate_token(token)

    # Resolve directories
    file_manager = FileManager(MEDIA_ROOT)
    sha1_folder = MEDIA_ROOT / sha1
    clip_folder = sha1_folder / "clip"

    if not sha1_folder.exists() or not sha1_folder.is_dir():
        raise HTTPException(status_code=404, detail="SHA1 folder not found")

    if not clip_folder.exists() or not clip_folder.is_dir():
        raise HTTPException(status_code=404, detail="Clip folder not found")

    # Locate video file
    mp4_files = list(clip_folder.glob("*.mp4"))
    if not mp4_files:
        raise HTTPException(status_code=404, detail="No MP4 files found")

    video_path = mp4_files[0]

    # Convert absolute path to a relative path for FileManager
    video_path = MEDIA_ROOT / video_path.relative_to(MEDIA_ROOT)

    print(f"Processing full video: {video_path}")

    # Get face and voice embeddings for casting
    face_col, voice_col = get_casting(sha1)

    # Vision processing pipeline
    info_escenas, info_images_per_second = pipeline_preprocessing_vision(video_path, face_col)
    torch.cuda.empty_cache()

    # Audio processing pipeline
    full_transcription, info_clips = pipeline_preprocessing_audio(video_path, voice_col)

    # Merge identities from vision pipeline with audio segments
    info_clips = map_identities_per_second(info_images_per_second, info_clips)

    # Generate the final SRT subtitle file
    srt = generate_srt_from_segments(info_clips, sha1)

    # The endpoint returns the SRT file as plain text
    return srt
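
To make the expected segment structure and the resulting SRT entries concrete, here is a minimal sketch for generate_srt_from_segments (names and timestamps are illustrative; note the function also writes the result to /data/media/<sha1>/srt/initial.srt):

    segments = [
        {"start": 1.25, "end": 3.5, "speaker": {"identity": "Ana"}, "transcription": "Bon dia a tothom."},
        {"start": 4.0, "end": 6.5, "speaker": {"identity": "SPEAKER_01"}, "transcription": "Hola, Ana."},
    ]
    srt = generate_srt_from_segments(segments, sha1="<video-sha1>")
    # srt contains:
    # 1
    # 00:00:01,250 --> 00:00:03,500
    # [Ana]: Bon dia a tothom.
    #
    # 2
    # 00:00:04,000 --> 00:00:06,500
    # [SPEAKER_01]: Hola, Ana.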
pipelines/audiodescription.py CHANGED
@@ -1,147 +1,147 @@
from __future__ import annotations

import os
import shlex
import subprocess
from pathlib import Path
from typing import Dict, Any, List, Tuple, Optional

# Minimal, robust MVP audio-only pipeline
# - Extract audio with ffmpeg
# - Diarize with pyannote (if an HF token is available); otherwise fall back to a single segment over the full duration
# - ASR with Whisper (optionally the AINA model, if available). To keep the footprint reasonable and robust,
#   we default to a lightweight faster-whisper if present; otherwise, return empty text.
# - Generate a basic SRT from the segments and ASR texts.


def extract_audio_ffmpeg(video_path: str, audio_out: Path, sr: int = 16000, mono: bool = True) -> str:
    audio_out.parent.mkdir(parents=True, exist_ok=True)
    cmd = f'ffmpeg -y -i "{video_path}" -vn {"-ac 1" if mono else ""} -ar {sr} -f wav "{audio_out}"'
    subprocess.run(shlex.split(cmd), check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return str(audio_out)


def _get_video_duration_seconds(video_path: str) -> float:
    try:
        # Use ffprobe to get the duration
        cmd = f'ffprobe -v error -select_streams v:0 -show_entries stream=duration -of default=nw=1 "{video_path}"'
        out = subprocess.check_output(shlex.split(cmd), stderr=subprocess.DEVNULL).decode("utf-8", errors="ignore")
        for line in out.splitlines():
            if line.startswith("duration="):
                try:
                    return float(line.split("=", 1)[1])
                except Exception:
                    pass
    except Exception:
        pass
    return 0.0


def diarize_audio(wav_path: str, base_dir: Path, hf_token_env: str | None = None) -> Tuple[List[Dict[str, Any]], List[str]]:
    """Returns segments [{'start','end','speaker'}] and dummy clip_paths (not used in the MVP)."""
    segments: List[Dict[str, Any]] = []
    clip_paths: List[str] = []
    # Prefer PYANNOTE_TOKEN if provided; fall back to the explicit env name, then HF_TOKEN
    token = os.getenv("PYANNOTE_TOKEN") or (os.getenv(hf_token_env) if hf_token_env else os.getenv("HF_TOKEN"))
    try:
        if token:
            from pyannote.audio import Pipeline  # type: ignore
            pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=token)
            diarization = pipeline(wav_path)
            # Collect segments.
            # We don't export individual clips in the MVP; just timestamps.
            for i, (turn, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
                segments.append({
                    "start": float(getattr(turn, "start", 0.0) or 0.0),
                    "end": float(getattr(turn, "end", 0.0) or 0.0),
                    "speaker": str(speaker) if speaker is not None else f"SPEAKER_{i:02d}",
                })
        else:
            # Fallback: a single segment over the full duration.
            # The caller would have to provide the video path to compute the exact duration; since we
            # only have the wav here, skip the precise duration and fall back to 0..0 (the UI tolerates this).
            segments.append({"start": 0.0, "end": 0.0, "speaker": "SPEAKER_00"})
    except Exception:
        # Robust fallback
        segments.append({"start": 0.0, "end": 0.0, "speaker": "SPEAKER_00"})
    # Sort by start
    segments = sorted(segments, key=lambda s: s.get("start", 0.0))
    return segments, clip_paths


def _fmt_srt_time(seconds: float) -> str:
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int(round((seconds - int(seconds)) * 1000))
    return f"{h:02}:{m:02}:{s:02},{ms:03}"


def _generate_srt(segments: List[Dict[str, Any]], texts: List[str]) -> str:
    n = min(len(segments), len(texts))
    lines: List[str] = []
    for i in range(n):
        seg = segments[i]
        text = (texts[i] or "").strip()
        start = float(seg.get("start", 0.0))
        end = float(seg.get("end", max(start + 2.0, start)))
        speaker = seg.get("speaker")
        if speaker:
            text = f"[{speaker}]: {text}" if text else f"[{speaker}]"
        lines.append(str(i + 1))
        lines.append(f"{_fmt_srt_time(start)} --> {_fmt_srt_time(end)}")
        lines.append(text)
        lines.append("")
    return "\n".join(lines).strip() + "\n"


def asr_transcribe_wav_simple(wav_path: str) -> str:
    """Very robust ASR stub: try faster-whisper small if present; otherwise return empty text.
    Intended for the MVP on Spaces without a heavy GPU."""
    try:
        from faster_whisper import WhisperModel  # type: ignore
        model = WhisperModel("Systran/faster-whisper-small", device="cpu")
        # Short transcript without timestamps
        segments, info = model.transcribe(wav_path, vad_filter=True, without_timestamps=True, language=None)
        text = " ".join(seg.text.strip() for seg in segments if getattr(seg, "text", None))
        return text.strip()
    except Exception:
        # As a last resort, empty text
        return ""


def generate(video_path: str, out_dir: Path) -> Dict[str, Any]:
    """End-to-end MVP that returns {'une_srt','free_text','artifacts':{...}}."""
    out_dir.mkdir(parents=True, exist_ok=True)
    wav_path = extract_audio_ffmpeg(video_path, out_dir / f"{Path(video_path).stem}.wav")

    # Diarization (robust)
    segments, _ = diarize_audio(wav_path, out_dir, hf_token_env="HF_TOKEN")

    # ASR (for the MVP: a single transcript of the full audio, used as 'free_text')
    free_text = asr_transcribe_wav_simple(wav_path)

    # Build per-segment 'texts' with a simple split of free_text when there are multiple segments
    if not segments:
        segments = [{"start": 0.0, "end": 0.0, "speaker": "SPEAKER_00"}]
    texts: List[str] = []
    if len(segments) <= 1:
        texts = [free_text]
    else:
        # Naive split into N parts by words
        words = free_text.split()
        chunk = max(1, len(words) // len(segments))
        for i in range(len(segments)):
            start_idx = i * chunk
            end_idx = (i + 1) * chunk if i < len(segments) - 1 else len(words)
            texts.append(" ".join(words[start_idx:end_idx]))

    une_srt = _generate_srt(segments, texts)

    return {
        "une_srt": une_srt,
        "free_text": free_text,
        "artifacts": {
            "wav_path": str(wav_path),
        },
    }
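
A minimal usage sketch of this MVP pipeline (paths are illustrative; diarization and ASR degrade gracefully to their fallbacks when PYANNOTE_TOKEN/HF_TOKEN or faster-whisper are unavailable):

    from pathlib import Path
    from pipelines.audiodescription import generate

    result = generate("clips/episode01.mp4", Path("/tmp/ad_out"))
    print(result["une_srt"])                # speaker-labelled SRT built from the diarized segments
    print(result["free_text"])              # full-audio transcript (empty string if no ASR backend is present)
    print(result["artifacts"]["wav_path"])  # extracted 16 kHz mono WAV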