VeuReu committed
Commit afe1310 · verified · 1 Parent(s): ce5bf11

Update main_process/main_router.py

Files changed (1)
  1. main_process/main_router.py +311 -314
main_process/main_router.py CHANGED
@@ -1,315 +1,312 @@
- import os
- import io
-
- from pathlib import Path
- from collections import Counter
- from typing import List, Dict
- import ast
- import json
- import torch
- from svision_client import extract_scenes, add_ocr_and_faces, keyframes_every_second_extraction, extract_descripcion_escena
- from asr_client import extract_audio_from_video, diarize_audio, transcribe_long_audio, transcribe_short_audio, identificar_veu
-
- from fastapi import APIRouter, UploadFile, File, Query, HTTPException
- from fastapi.responses import JSONResponse, StreamingResponse
-
- from storage.common import validate_token
- from storage.files.file_manager import FileManager
- from storage.embeddings_routers import get_embeddings_json
-
- EMBEDDINGS_ROOT = Path("/data/embeddings")
- MEDIA_ROOT = Path("/data/media")
- os.environ["CUDA_VISIBLE_DEVICES"] = "1"
- router = APIRouter(prefix="/transcription", tags=["Transcription Process"])
- HF_TOKEN = os.getenv("HF_TOKEN")
-
- def get_casting(video_sha1: str):
-     """Retrieve the real casting embeddings for a video from its SHA1.
-
-     Reads the embeddings JSON that the demo previously uploaded to /data/embeddings
-     via the /embeddings/upload_embeddings endpoint and returns its
-     face_col and voice_col columns.
-     """
-
-     # get_embeddings_json returns the full JSON exactly as it was uploaded (casting_json)
-     faces_json = get_embeddings_json(video_sha1, "faces")
-     voices_json = get_embeddings_json(video_sha1, "voices")
-
-     # Keep only the columns the pipeline needs
-     face_col = faces_json.get("face_col", []) if isinstance(faces_json, dict) else []
-     voice_col = voices_json.get("voice_col", []) if isinstance(voices_json, dict) else []
-
-     return face_col, voice_col
-
- transcripcion_inicial = "/home/acasado/bsc/hugging_face_bsc/engine/results/transcription_initial.srt"
- informacion_json = "/home/acasado/bsc/hugging_face_bsc/engine/results/informacion.json"
-
- def map_identities_per_second(frames_per_second, intervals):
-     """For each interval, count the face identities seen in the per-second frames it covers."""
-     for seg in intervals:
-         seg_start = seg["start"]
-         seg_end = seg["end"]
-
-         identities = []
-         for f in frames_per_second:
-             if seg_start <= f["start"] <= seg_end:
-                 for face in f.get("faces", []):
-                     identities.append(face)
-
-         seg["counts"] = dict(Counter(identities))
-
-     return intervals
-
- def _fmt_srt_time(seconds: float) -> str:
-     """Format seconds in the SRT style HH:MM:SS,mmm (e.g. 3661.5 -> "01:01:01,500")."""
-     h = int(seconds // 3600)
-     m = int((seconds % 3600) // 60)
-     s = int(seconds % 60)
-     ms = int((seconds - int(seconds)) * 1000)
-     return f"{h:02}:{m:02}:{s:02},{ms:03}"
-
-
- def generate_srt_from_segments(segments: List[Dict], sha1: str) -> str:
-     """
-     Generate an SRT subtitle file from diarization/transcription segments.
-
-     This function:
-     - Creates the required folder structure for storing SRTs.
-     - Removes any previous SRT files for the same SHA1.
-     - Builds the SRT content with timestamps, speaker identity and transcription.
-     - Saves the SRT file to disk.
-     - Returns the SRT content as a string (to be sent by the endpoint).
-
-     Parameters
-     ----------
-     segments : List[Dict]
-         List of dictionaries containing:
-         - "start": float (start time in seconds)
-         - "end": float (end time in seconds)
-         - "speaker": dict with "identity"
-         - "transcription": str
-     sha1 : str
-         Identifier used to locate the target media folder.
-
-     Returns
-     -------
-     str
-         Full SRT file content as a string.
-     """
-
-     # Path: /data/media/<sha1>
-     video_root = MEDIA_ROOT / sha1
-     video_root.mkdir(parents=True, exist_ok=True)
-
-     # Path: /data/media/<sha1>/srt
-     srt_dir = video_root / "srt"
-     srt_dir.mkdir(parents=True, exist_ok=True)
-
-     # Delete old SRT files
-     try:
-         for old_srt in srt_dir.glob("*.srt"):
-             old_srt.unlink()
-     except Exception as exc:
-         raise HTTPException(status_code=500, detail=f"Failed to delete old SRT files: {exc}")
-
-     # Save file as initial.srt
-     final_path = srt_dir / "initial.srt"
-
-     # Build SRT content
-     srt_lines = []
-
-     for i, seg in enumerate(segments, start=1):
-         start = seg.get("start", 0.0)
-         end = seg.get("end", 0.0)
-         transcription = seg.get("transcription", "").strip()
-
-         speaker_info = seg.get("speaker", {})
-         speaker = speaker_info.get("identity", "Unknown")
-
-         text = f"[{speaker}]: {transcription}" if speaker else transcription
-
-         entry = (
-             f"{i}\n"
-             f"{_fmt_srt_time(start)} --> {_fmt_srt_time(end)}\n"
-             f"{text}\n"
-         )
-         srt_lines.append(entry)
-
-     # Join with blank lines
-     srt_content = "\n".join(srt_lines)
-
-     # Write to disk
-     try:
-         with final_path.open("w", encoding="utf-8-sig") as f:
-             f.write(srt_content)
-     except Exception as exc:
-         raise HTTPException(status_code=500, detail=f"Failed to write SRT file: {exc}")
-
-     return srt_content
-
- def pipeline_preprocessing_vision(video_path: str, face_col):
-     """
-     Pipeline that takes a video and performs all the vision-side preprocessing.
-     """
-
-     print(f"Processing video for vision: {video_path}")
-
-     print("Extracting scenes...")
-     threshold: float = 30.0
-     offset_frames: int = 3
-     crop_ratio: float = 0.1
-     result_extract_scenes = extract_scenes(video_path, threshold, offset_frames, crop_ratio)
-     print(result_extract_scenes)
-     # Get the image paths and the scene information
-     escenas = result_extract_scenes[0] if len(result_extract_scenes) > 0 else []
-     escenas_paths = [f["image"] for f in escenas]
-     print(escenas_paths)
-     info_escenas = result_extract_scenes[1] if len(result_extract_scenes) > 1 else []
-     print(info_escenas)
-
-     print("Extracting one frame per second...")
-     result_extract_per_second = keyframes_every_second_extraction(video_path)
-     # Get the image paths and the per-second frame information
-     images_per_second = result_extract_per_second[0] if len(result_extract_per_second) > 0 else []
-     images_per_second_paths = [f["image"] for f in images_per_second]
-     info_images_per_second = result_extract_per_second[1] if len(result_extract_per_second) > 1 else []
-
-     print("Augmenting the scene information with who appears in each scene and with OCR...")
-     info_escenas_completa = []
-     for imagen_escena, info_escena in zip(escenas_paths, info_escenas):
-         result_add_ocr_and_faces = add_ocr_and_faces(imagen_escena, info_escena, face_col)
-         info_escenas_completa.append(result_add_ocr_and_faces)
-
-     print("Augmenting the per-second frame information with who appears in each frame and with OCR...")
-     info_images_per_second_completa = []
-     for imagen_segundo, info_segundo in zip(images_per_second_paths, info_images_per_second):
-         result_add_ocr_and_faces = add_ocr_and_faces(imagen_segundo, info_segundo, face_col)
-         info_images_per_second_completa.append(result_add_ocr_and_faces)
-     print(info_escenas_completa)
-
-     print("Post-processing the OCR results (some scenes will be replaced by per-second frames with better OCR)...")
-     # This will be done last
-
-     print("Combining scene and per-second frame information...")
-     info_escenas_completa = map_identities_per_second(info_images_per_second_completa, info_escenas_completa)
-     print(info_escenas_completa)
-
-     print("Now adding each scene's description to its dictionary.")
-     for escena_path, info_escena in zip(escenas_paths, info_escenas_completa):
-         descripcion_escena = extract_descripcion_escena(escena_path)
-         lista = ast.literal_eval(descripcion_escena)
-         frase = lista[0]
-         info_escena["descripcion"] = frase
-         del descripcion_escena
-         torch.cuda.empty_cache()
-
-     return info_escenas_completa, info_images_per_second_completa
-
- def pipeline_preprocessing_audio(video_path: str, voice_col):
-     """
-     Pipeline that takes a video and performs all the audio-side preprocessing.
-     """
-     print(f"Processing video for audio: {video_path}")
-
-     print("Extracting the audio from the video...")
-     audio_video = extract_audio_from_video(video_path)
-     print(audio_video)
-
-     print("Diarizing the audio...")
-     diarization_audio = diarize_audio(audio_video)
-     print(diarization_audio)
-     clips_path = diarization_audio[0]
-     print(clips_path)
-     diarization_info = diarization_audio[1]
-     print(diarization_info)
-
-     print("Transcribing the full video...")
-     full_transcription = transcribe_long_audio(audio_video)
-     print(full_transcription)
-
-     print("Transcribing the diarized clips...")
-     for clip_path, clip_info in zip(clips_path, diarization_info):
-         clip_transcription = transcribe_short_audio(clip_path)
-         clip_info["transcription"] = clip_transcription
-
-     print("Computing embeddings for each extracted clip and then identifying the voices...")
-     for clip_path, clip_info in zip(clips_path, diarization_info):
-         clip_speaker = identificar_veu(clip_path, voice_col)
-         clip_info["speaker"] = clip_speaker
-
-     return full_transcription, diarization_info
-
- @router.post("/generate_srt", tags=["Transcription Process"])
- async def pipeline_video_analysis(
-     sha1: str,
-     token: str = Query(..., description="Token required for authorization")
- ):
-     """
-     Endpoint that processes a full video identified by its SHA1 folder, performs
-     complete audio-visual preprocessing, and returns an SRT subtitle file.
-
-     This pipeline integrates:
-     - Vision preprocessing (scene detection, keyframes, OCR, face recognition)
-     - Audio preprocessing (diarization, speech recognition, speaker identity matching)
-     - Identity mapping between vision and audio streams
-     - Final generation of an SRT file describing who speaks and when
-
-     Parameters
-     ----------
-     sha1 : str
-         Identifier corresponding to the folder containing the video and related assets.
-     token : str
-         Security token required for authorization.
-
-     Returns
-     -------
-     str
-         The generated SRT file (as text) containing time-aligned subtitles with
-         speaker identities and transcriptions.
-     """
-
-     validate_token(token)
-
-     # Resolve directories
-     file_manager = FileManager(MEDIA_ROOT)
-     sha1_folder = MEDIA_ROOT / sha1
-     clip_folder = sha1_folder / "clip"
-
-     if not sha1_folder.exists() or not sha1_folder.is_dir():
-         raise HTTPException(status_code=404, detail="SHA1 folder not found")
-
-     if not clip_folder.exists() or not clip_folder.is_dir():
-         raise HTTPException(status_code=404, detail="Clip folder not found")
-
-     # Locate video file
-     mp4_files = list(clip_folder.glob("*.mp4"))
-     if not mp4_files:
-         raise HTTPException(status_code=404, detail="No MP4 files found")
-
-     video_path = mp4_files[0]
-
-     # Re-anchor the path under MEDIA_ROOT (relative_to also ensures the file lives inside MEDIA_ROOT)
-     video_path = MEDIA_ROOT / video_path.relative_to(MEDIA_ROOT)
-
-     print(f"Processing full video: {video_path}")
-
-     # Get face and voice embeddings for casting
-     face_col, voice_col = get_casting(sha1)
-
-     # Vision processing pipeline
-     info_escenas, info_images_per_second = pipeline_preprocessing_vision(video_path, face_col)
-     torch.cuda.empty_cache()
-
-     # Audio processing pipeline
-     full_transcription, info_clips = pipeline_preprocessing_audio(video_path, voice_col)
-
-     # Merge identities from vision pipeline with audio segments
-     info_clips = map_identities_per_second(info_images_per_second, info_clips)
-
-     # Generate the final SRT subtitle file
-     srt = generate_srt_from_segments(info_clips, sha1)
-
-     # Return the SRT content as a string
-
      return srt
 
+ import os
+ import io
+
+ from pathlib import Path
+ from collections import Counter
+ from typing import List, Dict
+ import ast
+ import json
+ import torch
+ from svision_client import extract_scenes, add_ocr_and_faces, keyframes_every_second_extraction, extract_descripcion_escena
+ from asr_client import extract_audio_from_video, diarize_audio, transcribe_long_audio, transcribe_short_audio, identificar_veu
+
+ from fastapi import APIRouter, UploadFile, File, Query, HTTPException
+ from fastapi.responses import JSONResponse, StreamingResponse
+
+ from storage.common import validate_token
+ from storage.files.file_manager import FileManager
+ from storage.embeddings_routers import get_embeddings_json
+
+ EMBEDDINGS_ROOT = Path("/data/embeddings")
+ MEDIA_ROOT = Path("/data/media")
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+ router = APIRouter(prefix="/transcription", tags=["Transcription Process"])
+ HF_TOKEN = os.getenv("HF_TOKEN")
+
+ def get_casting(video_sha1: str):
+     """Retrieve the real casting embeddings for a video from its SHA1.
+
+     Reads the embeddings JSON that the demo previously uploaded to /data/embeddings
+     via the /embeddings/upload_embeddings endpoint and returns its
+     face_col and voice_col columns.
+     """
+
+     # get_embeddings_json returns the full JSON exactly as it was uploaded (casting_json)
+     faces_json = get_embeddings_json(video_sha1, "faces")
+     voices_json = get_embeddings_json(video_sha1, "voices")
+
+     # Keep only the columns the pipeline needs
+     face_col = faces_json.get("face_col", []) if isinstance(faces_json, dict) else []
+     voice_col = voices_json.get("voice_col", []) if isinstance(voices_json, dict) else []
+
+     return face_col, voice_col
+
+ def map_identities_per_second(frames_per_second, intervals):
+     """For each interval, count the face identities seen in the per-second frames it covers."""
+     for seg in intervals:
+         seg_start = seg["start"]
+         seg_end = seg["end"]
+
+         identities = []
+         for f in frames_per_second:
+             if seg_start <= f["start"] <= seg_end:
+                 for face in f.get("faces", []):
+                     identities.append(face)
+
+         seg["counts"] = dict(Counter(identities))
+
+     return intervals
+
+ def _fmt_srt_time(seconds: float) -> str:
+     """Format seconds in the SRT style HH:MM:SS,mmm (e.g. 3661.5 -> "01:01:01,500")."""
+     h = int(seconds // 3600)
+     m = int((seconds % 3600) // 60)
+     s = int(seconds % 60)
+     ms = int((seconds - int(seconds)) * 1000)
+     return f"{h:02}:{m:02}:{s:02},{ms:03}"
+
+
+ def generate_srt_from_segments(segments: List[Dict], sha1: str) -> str:
+     """
+     Generate an SRT subtitle file from diarization/transcription segments.
+
+     This function:
+     - Creates the required folder structure for storing SRTs.
+     - Removes any previous SRT files for the same SHA1.
+     - Builds the SRT content with timestamps, speaker identity and transcription.
+     - Saves the SRT file to disk.
+     - Returns the SRT content as a string (to be sent by the endpoint).
+
+     Parameters
+     ----------
+     segments : List[Dict]
+         List of dictionaries containing:
+         - "start": float (start time in seconds)
+         - "end": float (end time in seconds)
+         - "speaker": dict with "identity"
+         - "transcription": str
+     sha1 : str
+         Identifier used to locate the target media folder.
+
+     Returns
+     -------
+     str
+         Full SRT file content as a string.
+     """
+
+     # Path: /data/media/<sha1>
+     video_root = MEDIA_ROOT / sha1
+     video_root.mkdir(parents=True, exist_ok=True)
+
+     # Path: /data/media/<sha1>/srt
+     srt_dir = video_root / "srt"
+     srt_dir.mkdir(parents=True, exist_ok=True)
+
+     # Delete old SRT files
+     try:
+         for old_srt in srt_dir.glob("*.srt"):
+             old_srt.unlink()
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=f"Failed to delete old SRT files: {exc}")
+
+     # Save file as initial.srt
+     final_path = srt_dir / "initial.srt"
+
+     # Build SRT content
+     srt_lines = []
+
+     for i, seg in enumerate(segments, start=1):
+         start = seg.get("start", 0.0)
+         end = seg.get("end", 0.0)
+         transcription = seg.get("transcription", "").strip()
+
+         speaker_info = seg.get("speaker", {})
+         speaker = speaker_info.get("identity", "Unknown")
+
+         text = f"[{speaker}]: {transcription}" if speaker else transcription
+
+         entry = (
+             f"{i}\n"
+             f"{_fmt_srt_time(start)} --> {_fmt_srt_time(end)}\n"
+             f"{text}\n"
+         )
+         srt_lines.append(entry)
+
+     # Join with blank lines
+     srt_content = "\n".join(srt_lines)
+
+     # Write to disk
+     try:
+         with final_path.open("w", encoding="utf-8-sig") as f:
+             f.write(srt_content)
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=f"Failed to write SRT file: {exc}")
+
+     return srt_content
+
+ def pipeline_preprocessing_vision(video_path: str, face_col):
+     """
+     Pipeline that takes a video and performs all the vision-side preprocessing.
+     """
+
+     print(f"Processing video for vision: {video_path}")
+
+     print("Extracting scenes...")
+     threshold: float = 30.0
+     offset_frames: int = 3
+     crop_ratio: float = 0.1
+     result_extract_scenes = extract_scenes(video_path, threshold, offset_frames, crop_ratio)
+     print(result_extract_scenes)
+     # Get the image paths and the scene information
+     escenas = result_extract_scenes[0] if len(result_extract_scenes) > 0 else []
+     escenas_paths = [f["image"] for f in escenas]
+     print(escenas_paths)
+     info_escenas = result_extract_scenes[1] if len(result_extract_scenes) > 1 else []
+     print(info_escenas)
+
+     print("Extracting one frame per second...")
+     result_extract_per_second = keyframes_every_second_extraction(video_path)
+     # Get the image paths and the per-second frame information
+     images_per_second = result_extract_per_second[0] if len(result_extract_per_second) > 0 else []
+     images_per_second_paths = [f["image"] for f in images_per_second]
+     info_images_per_second = result_extract_per_second[1] if len(result_extract_per_second) > 1 else []
+
+     print("Augmenting the scene information with who appears in each scene and with OCR...")
+     info_escenas_completa = []
+     for imagen_escena, info_escena in zip(escenas_paths, info_escenas):
+         result_add_ocr_and_faces = add_ocr_and_faces(imagen_escena, info_escena, face_col)
+         info_escenas_completa.append(result_add_ocr_and_faces)
+
+     print("Augmenting the per-second frame information with who appears in each frame and with OCR...")
+     info_images_per_second_completa = []
+     for imagen_segundo, info_segundo in zip(images_per_second_paths, info_images_per_second):
+         result_add_ocr_and_faces = add_ocr_and_faces(imagen_segundo, info_segundo, face_col)
+         info_images_per_second_completa.append(result_add_ocr_and_faces)
+     print(info_escenas_completa)
+
+     print("Post-processing the OCR results (some scenes will be replaced by per-second frames with better OCR)...")
+     # This will be done last
+
+     print("Combining scene and per-second frame information...")
+     info_escenas_completa = map_identities_per_second(info_images_per_second_completa, info_escenas_completa)
+     print(info_escenas_completa)
+
+     print("Now adding each scene's description to its dictionary.")
+     for escena_path, info_escena in zip(escenas_paths, info_escenas_completa):
+         descripcion_escena = extract_descripcion_escena(escena_path)
+         lista = ast.literal_eval(descripcion_escena)
+         frase = lista[0]
+         info_escena["descripcion"] = frase
+         del descripcion_escena
+         torch.cuda.empty_cache()
+
+     return info_escenas_completa, info_images_per_second_completa
+
+ def pipeline_preprocessing_audio(video_path: str, voice_col):
+     """
+     Pipeline that takes a video and performs all the audio-side preprocessing.
+     """
+     print(f"Processing video for audio: {video_path}")
+
+     print("Extracting the audio from the video...")
+     audio_video = extract_audio_from_video(video_path)
+     print(audio_video)
+
+     print("Diarizing the audio...")
+     diarization_audio = diarize_audio(audio_video)
+     print(diarization_audio)
+     clips_path = diarization_audio[0]
+     print(clips_path)
+     diarization_info = diarization_audio[1]
+     print(diarization_info)
+
+     print("Transcribing the full video...")
+     full_transcription = transcribe_long_audio(audio_video)
+     print(full_transcription)
+
+     print("Transcribing the diarized clips...")
+     for clip_path, clip_info in zip(clips_path, diarization_info):
+         clip_transcription = transcribe_short_audio(clip_path)
+         clip_info["transcription"] = clip_transcription
+
+     print("Computing embeddings for each extracted clip and then identifying the voices...")
+     for clip_path, clip_info in zip(clips_path, diarization_info):
+         clip_speaker = identificar_veu(clip_path, voice_col)
+         clip_info["speaker"] = clip_speaker
+
+     return full_transcription, diarization_info
+
+ @router.post("/generate_srt_salamandra", tags=["Transcription Process"])
+ async def pipeline_video_analysis(
+     sha1: str,
+     token: str = Query(..., description="Token required for authorization")
+ ):
+     """
+     Endpoint that processes a full video identified by its SHA1 folder, performs
+     complete audio-visual preprocessing, and returns an SRT subtitle file.
+
+     This pipeline integrates:
+     - Vision preprocessing (scene detection, keyframes, OCR, face recognition)
+     - Audio preprocessing (diarization, speech recognition, speaker identity matching)
+     - Identity mapping between vision and audio streams
+     - Final generation of an SRT file describing who speaks and when
+
+     Parameters
+     ----------
+     sha1 : str
+         Identifier corresponding to the folder containing the video and related assets.
+     token : str
+         Security token required for authorization.
+
+     Returns
+     -------
+     str
+         The generated SRT file (as text) containing time-aligned subtitles with
+         speaker identities and transcriptions.
+     """
+
+     validate_token(token)
+
+     # Resolve directories
+     file_manager = FileManager(MEDIA_ROOT)
+     sha1_folder = MEDIA_ROOT / sha1
+     clip_folder = sha1_folder / "clip"
+
+     if not sha1_folder.exists() or not sha1_folder.is_dir():
+         raise HTTPException(status_code=404, detail="SHA1 folder not found")
+
+     if not clip_folder.exists() or not clip_folder.is_dir():
+         raise HTTPException(status_code=404, detail="Clip folder not found")
+
+     # Locate video file
+     mp4_files = list(clip_folder.glob("*.mp4"))
+     if not mp4_files:
+         raise HTTPException(status_code=404, detail="No MP4 files found")
+
+     video_path = mp4_files[0]
+
+     # Re-anchor the path under MEDIA_ROOT (relative_to also ensures the file lives inside MEDIA_ROOT)
+     video_path = MEDIA_ROOT / video_path.relative_to(MEDIA_ROOT)
+
+     print(f"Processing full video: {video_path}")
+
+     # Get face and voice embeddings for casting
+     face_col, voice_col = get_casting(sha1)
+
+     # Vision processing pipeline
+     info_escenas, info_images_per_second = pipeline_preprocessing_vision(video_path, face_col)
+     torch.cuda.empty_cache()
+
+     # Audio processing pipeline
+     full_transcription, info_clips = pipeline_preprocessing_audio(video_path, voice_col)
+
+     # Merge identities from vision pipeline with audio segments
+     info_clips = map_identities_per_second(info_images_per_second, info_clips)
+
+     # Generate the final SRT subtitle file
+     srt = generate_srt_from_segments(info_clips, sha1)
+
+     # Return the SRT content as a string
      return srt
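
For context, a minimal sketch of the casting JSON that get_casting reads from /data/embeddings. Only the face_col / voice_col keys are guaranteed by the code above; the per-entry fields (identity, embedding) are illustrative assumptions:

# Illustrative shape only: the router relies on the "face_col" / "voice_col" keys;
# the entry fields below are assumed, not part of the repository.
faces_json = {
    "face_col": [
        {"identity": "Anna", "embedding": [0.12, -0.03, 0.88]},  # hypothetical entry
    ]
}
face_col = faces_json.get("face_col", []) if isinstance(faces_json, dict) else []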
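
To make the SRT layout concrete, this is what generate_srt_from_segments would build for two made-up segments (timestamps are formatted by _fmt_srt_time, e.g. 3661.5 -> "01:01:01,500"):

# Made-up segments matching the shapes documented in generate_srt_from_segments.
segments = [
    {"start": 0.0, "end": 2.5, "speaker": {"identity": "Anna"}, "transcription": "Hola."},
    {"start": 2.5, "end": 5.0, "speaker": {"identity": "Joan"}, "transcription": "Bon dia."},
]
# Resulting initial.srt content:
# 1
# 00:00:00,000 --> 00:00:02,500
# [Anna]: Hola.
#
# 2
# 00:00:02,500 --> 00:00:05,000
# [Joan]: Bon dia.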
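
And a minimal client sketch for the renamed endpoint; the base URL, sha1 and token values are placeholders for a local deployment, not part of this repository:

import requests

# Placeholders: adjust host/port, <video-sha1> and <access-token> to your deployment.
resp = requests.post(
    "http://localhost:8000/transcription/generate_srt_salamandra",
    params={"sha1": "<video-sha1>", "token": "<access-token>"},
)
resp.raise_for_status()
srt_text = resp.json()  # FastAPI JSON-encodes the returned string
print(srt_text)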