Spaces:

VeuReu
/

engine

Running

App Files Community

VeuReu committed on Nov 26, 2025

Commit

afe1310

verified ·

1 Parent(s): ce5bf11

Update main_process/main_router.py

Browse files

Files changed (1) hide show

main_process/main_router.py +311 -314

main_process/main_router.py CHANGED Viewed

@@ -1,315 +1,312 @@
-import os
-import io
-from pathlib import Path
-from typing import Counter,List, Dict
-import ast
-import json
-import torch
-from svision_client import extract_scenes, add_ocr_and_faces, keyframes_every_second_extraction, extract_descripcion_escena
-from asr_client import extract_audio_from_video, diarize_audio, transcribe_long_audio, transcribe_short_audio, identificar_veu
-from fastapi import APIRouter, UploadFile, File, Query, HTTPException
-from fastapi.responses import JSONResponse, StreamingResponse
-from storage.common import validate_token
-from storage.files.file_manager import FileManager
-from storage.embeddings_routers import get_embeddings_json
-EMBEDDINGS_ROOT = Path("/data/embeddings")
-MEDIA_ROOT = Path("/data/media")
-os.environ["CUDA_VISIBLE_DEVICES"] = "1"
-router = APIRouter(prefix="/transcription", tags=["Transcription Process"])
-HF_TOKEN = os.getenv("HF_TOKEN")
-def get_casting(video_sha1: str):
-    """Recupera els embeddings reals de càsting per a un vídeo a partir del seu SHA1.
-    Llegeix el JSON d'embeddings que demo ha pujat prèviament a /data/embeddings
-    mitjançant l'endpoint /embeddings/upload_embeddings i en retorna les
-    columnes face_col i voice_col.
-    """
-    # get_embeddings_json retorna el JSON complet tal com es va pujar (casting_json)
-    faces_json = get_embeddings_json(video_sha1, "faces")
-    voices_json = get_embeddings_json(video_sha1, "voices")
-    # Ens quedem només amb les columnes que interessen al pipeline
-    face_col = faces_json.get("face_col", []) if isinstance(faces_json, dict) else []
-    voice_col = voices_json.get("voice_col", []) if isinstance(voices_json, dict) else []
-    return face_col, voice_col
-transcripcion_inicial = "/home/acasado/bsc/hugging_face_bsc/engine/results/transcription_initial.srt"
-informacion_json = "/home/acasado/bsc/hugging_face_bsc/engine/results/informacion.json"
-def map_identities_per_second(frames_per_second, intervals):
-    for seg in intervals:
-        seg_start = seg["start"]
-        seg_end = seg["end"]
-        identities = []
-        for f in frames_per_second:
-            if seg_start <= f["start"] <= seg_end:
-                for face in f.get("faces", []):
-                    identities.append(face)
-        seg["counts"] = dict(Counter(identities))
-    return intervals
-def _fmt_srt_time(seconds: float) -> str:
-    """Formatea segundos en el formato SRT HH:MM:SS,mmm"""
-    h = int(seconds // 3600)
-    m = int((seconds % 3600) // 60)
-    s = int(seconds % 60)
-    ms = int((seconds - int(seconds)) * 1000)
-    return f"{h:02}:{m:02}:{s:02},{ms:03}"
-from pathlib import Path
-from typing import List, Dict
-from fastapi import HTTPException
-def generate_srt_from_segments(segments: List[Dict], sha1: str) -> str:
-    """
-    Generate an SRT subtitle file from diarization/transcription segments.
-    This function:
-    - Creates the required folder structure for storing SRTs.
-    - Removes any previous SRT files for the same SHA1.
-    - Builds the SRT content with timestamps, speaker identity and transcription.
-    - Saves the SRT file to disk.
-    - Returns the SRT content as a string (to be sent by the endpoint).
-    Parameters
-    ----------
-    segments : List[Dict]
-        List of dictionaries containing:
-            - "start": float (start time in seconds)
-            - "end": float (end time in seconds)
-            - "speaker": dict with "identity"
-            - "transcription": str
-    sha1 : str
-        Identifier used to locate the target media folder.
-    Returns
-    -------
-    str
-        Full SRT file content as a string.
-    """
-    # Path: /data/media/<sha1>
-    video_root = MEDIA_ROOT / sha1
-    video_root.mkdir(parents=True, exist_ok=True)
-    # Path: /data/media/<sha1>/srt
-    srt_dir = video_root / "srt"
-    srt_dir.mkdir(parents=True, exist_ok=True)
-    # Delete old SRT files
-    try:
-        for old_srt in srt_dir.glob("*.srt"):
-            old_srt.unlink()
-    except Exception as exc:
-        raise HTTPException(status_code=500, detail=f"Failed to delete old SRT files: {exc}")
-    # Save file as initial.srt
-    final_path = srt_dir / "initial.srt"
-    # Build SRT content
-    srt_lines = []
-    for i, seg in enumerate(segments, start=1):
-        start = seg.get("start", 0.0)
-        end = seg.get("end", 0.0)
-        transcription = seg.get("transcription", "").strip()
-        speaker_info = seg.get("speaker", {})
-        speaker = speaker_info.get("identity", "Unknown")
-        text = f"[{speaker}]: {transcription}" if speaker else transcription
-        entry = (
-            f"{i}\n"
-            f"{_fmt_srt_time(start)} --> {_fmt_srt_time(end)}\n"
-            f"{text}\n"
-        )
-        srt_lines.append(entry)
-    # Join with blank lines
-    srt_content = "\n".join(srt_lines)
-    # Write to disk
-    try:
-        with final_path.open("w", encoding="utf-8-sig") as f:
-            f.write(srt_content)
-    except Exception as exc:
-        raise HTTPException(status_code=500, detail=f"Failed to write SRT file: {exc}")
-    return srt_content
-def pipeline_preprocessing_vision(video_path: str, face_col):
-    """
-    Pipeline que toma un video y realiza todo el preprocesamiento del video de la parte de vision.
-    """
-    print(f"Procesando video para visión: {video_path}")
-    print("Extrayendo escenas...")
-    threshold: float = 30.0
-    offset_frames: int = 3
-    crop_ratio: float = 0.1
-    result_extract_scenes = extract_scenes(video_path, threshold, offset_frames, crop_ratio)
-    print(result_extract_scenes)
-    # Obtener las rutas de las imágenes y la información de las escenas
-    escenas = result_extract_scenes[0] if len(result_extract_scenes) > 0 else []
-    escenas_paths = [f["image"] for f in escenas]
-    print(escenas_paths)
-    info_escenas = result_extract_scenes[1] if len(result_extract_scenes) > 1 else []
-    print(info_escenas)
-    print("Extrayendo imagenes por segundo...")
-    result_extract_per_second = keyframes_every_second_extraction(video_path)
-    # Obtener las rutas de las imágenes y la información de las escenas
-    images_per_second = result_extract_per_second[0] if len(result_extract_per_second) > 0 else []
-    images_per_second_paths = [f["image"] for f in images_per_second]
-    info_images_per_second = result_extract_per_second[1] if len(result_extract_per_second) > 1 else []
-    print("Aumentamos la información de las escenas viendo quién aparece en cada escena y detectando OCR...")
-    info_escenas_completa = []
-    for imagen_escena, info_escena in zip(escenas_paths, info_escenas):
-        result_add_ocr_and_faces = add_ocr_and_faces(imagen_escena, info_escena, face_col)
-        info_escenas_completa.append(result_add_ocr_and_faces)
-    print("Aumentamos la información de las imagenes por segundo viendo quién aparece en cada escena y detectando OCR...")
-    info_images_per_second_completa = []
-    for imagen_segundo, info_segundo in zip(images_per_second_paths, info_images_per_second):
-        result_add_ocr_and_faces =add_ocr_and_faces(imagen_segundo, info_segundo, face_col)
-        info_images_per_second_completa.append(result_add_ocr_and_faces)
-    print(info_escenas_completa)
-    print("Ahora se va a tratar los OCR (se sustituirán ciertas escenas por alguna de las imágenes por segundo si tienen mejor OCR)...")
-    # Se hará lo último
-    print("Combinando información de escenas e imágenes por segundo...")
-    info_escenas_completa = map_identities_per_second(info_images_per_second_completa, info_escenas_completa)
-    print(info_escenas_completa)
-    print("Ahora se incluyen en los diccionarios de las escenas la descripciones de estas.")
-    for escena_path, info_escena in zip(escenas_paths, info_escenas_completa):
-        descripcion_escena = extract_descripcion_escena(escena_path)
-        lista = ast.literal_eval(descripcion_escena)
-        frase = lista[0]
-        info_escena["descripcion"] = frase
-        del descripcion_escena
-        torch.cuda.empty_cache()
-    return info_escenas_completa, info_images_per_second_completa
-def pipeline_preprocessing_audio(video_path: str, voice_col):
-    """
-    Pipeline que toma un video y realiza todo el preprocesamiento del video de la parte de audio.
-    """
-    print(f"Procesando video para audio: {video_path}")
-    print("Extrayendo audio del video...")
-    audio_video = extract_audio_from_video(video_path)
-    print(audio_video)
-    print("Diartizando el audio...")
-    diarization_audio = diarize_audio(audio_video)
-    print(diarization_audio)
-    clips_path = diarization_audio[0]
-    print(clips_path)
-    diarization_info = diarization_audio[1]
-    print(diarization_info)
-    print("Transcribiendo el video completo...")
-    full_transcription = transcribe_long_audio(audio_video)
-    print(full_transcription)
-    print("Transcribiendo los clips diartizados...")
-    for clip_path, clip_info in zip(clips_path, diarization_info):
-        clip_transcription = transcribe_short_audio(clip_path)
-        clip_info["transcription"] = clip_transcription
-    print("Calculando los embeddings para cada uno de los clips obtenidos y posteriormente identificar las voces...")
-    for clip_path, clip_info in zip(clips_path, diarization_info):
-        clip_speaker = identificar_veu(clip_path, voice_col)
-        clip_info["speaker"] = clip_speaker
-    return full_transcription, diarization_info
-@router.post("/generate_srt", tags=["Transcription Process"])
-async def pipeline_video_analysis(
-    sha1: str,
-    token: str = Query(..., description="Token required for authorization")
-):
-    """
-    Endpoint that processes a full video identified by its SHA1 folder, performs
-    complete audio-visual preprocessing, and returns an SRT subtitle file.
-    This pipeline integrates:
-    - Vision preprocessing (scene detection, keyframes, OCR, face recognition)
-    - Audio preprocessing (diarization, speech recognition, speaker identity matching)
-    - Identity mapping between vision and audio streams
-    - Final generation of an SRT file describing who speaks and when
-    Parameters
-    ----------
-    sha1 : str
-        Identifier corresponding to the folder containing the video and related assets.
-    token : str
-        Security token required for authorization.
-    Returns
-    -------
-    str
-        The generated SRT file (as text) containing time-aligned subtitles with
-        speaker identities and transcriptions.
-    """
-    validate_token(token)
-    # Resolve directories
-    file_manager = FileManager(MEDIA_ROOT)
-    sha1_folder = MEDIA_ROOT / sha1
-    clip_folder = sha1_folder / "clip"
-    if not sha1_folder.exists() or not sha1_folder.is_dir():
-        raise HTTPException(status_code=404, detail="SHA1 folder not found")
-    if not clip_folder.exists() or not clip_folder.is_dir():
-        raise HTTPException(status_code=404, detail="Clip folder not found")
-    # Locate video file
-    mp4_files = list(clip_folder.glob("*.mp4"))
-    if not mp4_files:
-        raise HTTPException(status_code=404, detail="No MP4 files found")
-    video_path = mp4_files[0]
-    # Convert absolute path to a relative path for FileManager
-    video_path = MEDIA_ROOT / video_path.relative_to(MEDIA_ROOT)
-    print(f"Processing full video: {video_path}")
-    # Get face and voice embeddings for casting
-    face_col, voice_col = get_casting(sha1)
-    # Vision processing pipeline
-    info_escenas, info_images_per_second = pipeline_preprocessing_vision(video_path, face_col)
-    torch.cuda.empty_cache()
-    # Audio processing pipeline
-    full_transcription, info_clips = pipeline_preprocessing_audio(video_path, voice_col)
-    # Merge identities from vision pipeline with audio segments
-    info_clips = map_identities_per_second(info_images_per_second, info_clips)
-    # Generate the final SRT subtitle file
-    srt = generate_srt_from_segments(info_clips, sha1)
-    # The endpoint returns the SRT file as plain text
     return srt

+import os
+import io
+from pathlib import Path
+from typing import Counter,List, Dict
+import ast
+import json
+import torch
+from svision_client import extract_scenes, add_ocr_and_faces, keyframes_every_second_extraction, extract_descripcion_escena
+from asr_client import extract_audio_from_video, diarize_audio, transcribe_long_audio, transcribe_short_audio, identificar_veu
+from fastapi import APIRouter, UploadFile, File, Query, HTTPException
+from fastapi.responses import JSONResponse, StreamingResponse
+from storage.common import validate_token
+from storage.files.file_manager import FileManager
+from storage.embeddings_routers import get_embeddings_json
# Pin the process to the CUDA device with index 1.
# NOTE(review): this assignment runs after `import torch` and the client
# imports; presumably it relies on CUDA being initialised lazily — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Root folders on the data volume for embeddings and media assets.
EMBEDDINGS_ROOT = Path("/data/embeddings")
MEDIA_ROOT = Path("/data/media")

# Presumably the Hugging Face access token; None when the variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")

# Routes declared on this router are registered under the /transcription prefix.
router = APIRouter(prefix="/transcription", tags=["Transcription Process"])
def get_casting(video_sha1: str):
    """Return the casting embedding columns stored for a video.

    Fetches the "faces" and "voices" embedding documents for ``video_sha1``
    through ``get_embeddings_json`` and keeps only the ``face_col`` and
    ``voice_col`` entries, which are what the pipeline consumes.

    Args:
        video_sha1: SHA1 identifier of the video.

    Returns:
        A ``(face_col, voice_col)`` tuple. Each element is an empty list when
        the corresponding document is not a dict or does not hold the key.
    """
    def _column(kind: str, key: str):
        # The lookup returns the whole uploaded document; pick one column.
        document = get_embeddings_json(video_sha1, kind)
        if not isinstance(document, dict):
            return []
        return document.get(key, [])

    # Tuple elements are evaluated left to right: faces first, then voices.
    return _column("faces", "face_col"), _column("voices", "voice_col")
def map_identities_per_second(frames_per_second, intervals):
    """Attach to every interval a tally of the faces seen while it lasts.

    A frame belongs to an interval when the frame's ``"start"`` lies within
    the interval's ``["start", "end"]`` range (both bounds inclusive). All
    entries of the ``"faces"`` value of the matching frames are counted and
    the result is stored under the interval's ``"counts"`` key.

    Args:
        frames_per_second: Iterable of dicts with a ``"start"`` key and an
            optional ``"faces"`` iterable. A missing, ``None`` or empty
            ``"faces"`` value contributes nothing.
            NOTE(review): face entries must be hashable (presumably identity
            names) — confirm against ``add_ocr_and_faces``.
        intervals: List of dicts with ``"start"`` and ``"end"`` keys. The
            dicts are modified in place.

    Returns:
        The same ``intervals`` object, each element now carrying ``"counts"``.
    """
    # Use the concrete class: the module-level name comes from `typing`, where
    # Counter is only a deprecated generic alias.
    from collections import Counter

    for seg in intervals:
        seg_start, seg_end = seg["start"], seg["end"]
        identities = [
            face
            for frame in frames_per_second
            if seg_start <= frame["start"] <= seg_end
            # `or ()` also covers an explicit "faces": None entry.
            for face in (frame.get("faces") or ())
        ]
        seg["counts"] = dict(Counter(identities))
    return intervals
+def _fmt_srt_time(seconds: float) -> str:
+    """Formatea segundos en el formato SRT HH:MM:SS,mmm"""
+    h = int(seconds // 3600)
+    m = int((seconds % 3600) // 60)
+    s = int(seconds % 60)
+    ms = int((seconds - int(seconds)) * 1000)
+    return f"{h:02}:{m:02}:{s:02},{ms:03}"
+from pathlib import Path
+from typing import List, Dict
+from fastapi import HTTPException
def generate_srt_from_segments(segments: List[Dict], sha1: str) -> str:
    """Build an SRT document from speech segments, store it and return it.

    The file is written to ``MEDIA_ROOT/<sha1>/srt/initial.srt``. Missing
    directories are created, and every ``*.srt`` file already present in that
    folder is deleted before the new one is written.

    Parameters
    ----------
    segments : List[Dict]
        One dict per subtitle cue. Keys read here:
            - "start": start time in seconds (missing or None -> 0.0)
            - "end": end time in seconds (missing or None -> 0.0)
            - "speaker": dict with an "identity" entry, or a plain name
              string; anything else is reported as "Unknown"
            - "transcription": spoken text (missing or None -> "")
    sha1 : str
        Name of the media folder below ``MEDIA_ROOT``.

    Returns
    -------
    str
        The SRT content as a string (the BOM is only added on disk).

    Raises
    ------
    HTTPException
        Status 500 when the old SRT files cannot be removed or the new file
        cannot be written.
    """
    # /data/media/<sha1>/srt — `parents=True` also creates the <sha1> folder.
    srt_dir = MEDIA_ROOT / sha1 / "srt"
    srt_dir.mkdir(parents=True, exist_ok=True)

    # Drop SRT files left over from a previous run for the same video.
    try:
        for old_srt in srt_dir.glob("*.srt"):
            old_srt.unlink()
    except OSError as exc:
        raise HTTPException(
            status_code=500, detail=f"Failed to delete old SRT files: {exc}"
        ) from exc

    final_path = srt_dir / "initial.srt"

    entries = []
    for index, seg in enumerate(segments, start=1):
        # `or` fallbacks cover keys that are present but hold None.
        start = seg.get("start") or 0.0
        end = seg.get("end") or 0.0
        transcription = (seg.get("transcription") or "").strip()

        speaker_info = seg.get("speaker")
        if isinstance(speaker_info, dict):
            speaker = speaker_info.get("identity", "Unknown")
        elif isinstance(speaker_info, str):
            speaker = speaker_info
        else:
            speaker = "Unknown"

        # An empty identity yields a cue without the "[speaker]:" prefix.
        text = f"[{speaker}]: {transcription}" if speaker else transcription
        entries.append(
            f"{index}\n"
            f"{_fmt_srt_time(start)} --> {_fmt_srt_time(end)}\n"
            f"{text}\n"
        )

    # Each entry ends with a newline, so joining with "\n" leaves one blank
    # line between consecutive cues.
    srt_content = "\n".join(entries)

    try:
        # utf-8-sig prepends a BOM to the file on disk.
        final_path.write_text(srt_content, encoding="utf-8-sig")
    except (OSError, UnicodeError) as exc:
        raise HTTPException(
            status_code=500, detail=f"Failed to write SRT file: {exc}"
        ) from exc

    return srt_content
def pipeline_preprocessing_vision(video_path: str, face_col):
    """Run the vision half of the preprocessing for one video.

    Steps, in order:
      1. Extract scenes with ``extract_scenes`` (threshold 30.0, 3 offset
         frames, crop ratio 0.1).
      2. Extract keyframes with ``keyframes_every_second_extraction``.
      3. Pass every scene image and every keyframe through
         ``add_ocr_and_faces`` together with ``face_col``.
      4. Add a ``"counts"`` tally to every scene from the keyframes that fall
         inside it (``map_identities_per_second``).
      5. Store the description from ``extract_descripcion_escena`` under each
         scene's ``"descripcion"`` key.

    Args:
        video_path: Location of the video handed to the vision client.
        face_col: Face embedding column forwarded to ``add_ocr_and_faces``.

    Returns:
        ``(info_escenas_completa, info_images_per_second_completa)``: the
        enriched scene records and the enriched per-keyframe records.
    """
    print(f"Procesando video para visión: {video_path}")

    print("Extrayendo escenas...")
    threshold: float = 30.0
    offset_frames: int = 3
    crop_ratio: float = 0.1
    result_extract_scenes = extract_scenes(video_path, threshold, offset_frames, crop_ratio)
    print(result_extract_scenes)

    # Element 0 holds the scene images, element 1 the per-scene metadata.
    escenas = result_extract_scenes[0] if len(result_extract_scenes) > 0 else []
    escenas_paths = [f["image"] for f in escenas]
    print(escenas_paths)
    info_escenas = result_extract_scenes[1] if len(result_extract_scenes) > 1 else []
    print(info_escenas)

    print("Extrayendo imagenes por segundo...")
    result_extract_per_second = keyframes_every_second_extraction(video_path)

    # Same layout as above: images first, metadata second.
    images_per_second = result_extract_per_second[0] if len(result_extract_per_second) > 0 else []
    images_per_second_paths = [f["image"] for f in images_per_second]
    info_images_per_second = result_extract_per_second[1] if len(result_extract_per_second) > 1 else []

    print("Aumentamos la información de las escenas viendo quién aparece en cada escena y detectando OCR...")
    info_escenas_completa = [
        add_ocr_and_faces(imagen_escena, info_escena, face_col)
        for imagen_escena, info_escena in zip(escenas_paths, info_escenas)
    ]

    print("Aumentamos la información de las imagenes por segundo viendo quién aparece en cada escena y detectando OCR...")
    info_images_per_second_completa = [
        add_ocr_and_faces(imagen_segundo, info_segundo, face_col)
        for imagen_segundo, info_segundo in zip(images_per_second_paths, info_images_per_second)
    ]
    print(info_escenas_completa)

    print("Ahora se va a tratar los OCR (se sustituirán ciertas escenas por alguna de las imágenes por segundo si tienen mejor OCR)...")
    # Not implemented yet: the OCR-based scene replacement is planned as the last step.

    print("Combinando información de escenas e imágenes por segundo...")
    info_escenas_completa = map_identities_per_second(info_images_per_second_completa, info_escenas_completa)
    print(info_escenas_completa)

    print("Ahora se incluyen en los diccionarios de las escenas la descripciones de estas.")
    for escena_path, info_escena in zip(escenas_paths, info_escenas_completa):
        info_escena["descripcion"] = _parse_scene_description(
            extract_descripcion_escena(escena_path)
        )
        # Release cached GPU memory between scenes.
        torch.cuda.empty_cache()

    return info_escenas_completa, info_images_per_second_completa


def _parse_scene_description(raw):
    """Return the first sentence of a scene-description payload.

    The expected payload is the textual form of a Python list whose first
    element is the sentence. Text that is not a Python literal is returned
    unchanged, and an empty list yields an empty string, so one odd answer
    does not abort the whole pipeline.
    """
    parsed = raw
    if isinstance(raw, str):
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            # Plain text rather than a literal: keep it as the description.
            return raw
    if isinstance(parsed, (list, tuple)):
        return parsed[0] if parsed else ""
    return parsed
def pipeline_preprocessing_audio(video_path: str, voice_col):
    """Run the audio half of the preprocessing for one video.

    Steps, in order: extract the audio track, diarize it, transcribe the whole
    track, transcribe every diarized clip, and finally identify the speaker of
    every clip against ``voice_col``.

    Args:
        video_path: Location of the video handed to the ASR client.
        voice_col: Voice embedding column forwarded to ``identificar_veu``.

    Returns:
        ``(full_transcription, diarization_info)``. Every diarization entry
        gains a ``"transcription"`` and a ``"speaker"`` key.
    """
    print(f"Procesando video para audio: {video_path}")

    print("Extrayendo audio del video...")
    audio_track = extract_audio_from_video(video_path)
    print(audio_track)

    print("Diartizando el audio...")
    diarization = diarize_audio(audio_track)
    print(diarization)
    # Element 0 holds the clip files, element 1 the matching segment records.
    clip_paths = diarization[0]
    print(clip_paths)
    segments_info = diarization[1]
    print(segments_info)

    print("Transcribiendo el video completo...")
    full_text = transcribe_long_audio(audio_track)
    print(full_text)

    print("Transcribiendo los clips diartizados...")
    for clip, info in zip(clip_paths, segments_info):
        info["transcription"] = transcribe_short_audio(clip)

    # Kept as a second pass so that all transcriptions finish before any
    # speaker identification starts.
    print("Calculando los embeddings para cada uno de los clips obtenidos y posteriormente identificar las voces...")
    for clip, info in zip(clip_paths, segments_info):
        info["speaker"] = identificar_veu(clip, voice_col)

    return full_text, segments_info
@router.post("/generate_srt_salamandra", tags=["Transcription Process"])
async def pipeline_video_analysis(
    sha1: str,
    token: str = Query(..., description="Token required for authorization")
):
    """
    Process the video stored under a SHA1 folder and return its SRT subtitles.

    The pipeline chains:
    - Vision preprocessing (``pipeline_preprocessing_vision``)
    - Audio preprocessing (``pipeline_preprocessing_audio``)
    - A per-segment tally of the faces seen during each audio segment
    - Generation and storage of the SRT file (``generate_srt_from_segments``)

    Parameters
    ----------
    sha1 : str
        Name of the folder below ``MEDIA_ROOT`` that holds the video; it must
        be a single path component.
    token : str
        Security token checked with ``validate_token``.

    Returns
    -------
    str
        The generated SRT content. No ``response_class`` is set on the route,
        so FastAPI sends it as a JSON-encoded string, not as ``text/plain``.

    Raises
    ------
    HTTPException
        400 when ``sha1`` is not a plain folder name; 404 when the SHA1
        folder, its ``clip`` sub-folder or an MP4 file is missing.

    NOTE(review): the pipeline calls below are synchronous and are not
    awaited, so the event loop is blocked while a video is processed.
    """
    validate_token(token)

    # `sha1` comes from the request and is joined into a filesystem path:
    # refuse anything that could step outside MEDIA_ROOT.
    if sha1 in ("", ".", "..") or "\x00" in sha1 or Path(sha1).name != sha1:
        raise HTTPException(status_code=400, detail="Invalid SHA1 identifier")

    sha1_folder = MEDIA_ROOT / sha1
    clip_folder = sha1_folder / "clip"

    if not sha1_folder.is_dir():
        raise HTTPException(status_code=404, detail="SHA1 folder not found")
    if not clip_folder.is_dir():
        raise HTTPException(status_code=404, detail="Clip folder not found")

    # Sort so the same file is chosen on every run when several MP4s exist.
    mp4_files = sorted(clip_folder.glob("*.mp4"))
    if not mp4_files:
        raise HTTPException(status_code=404, detail="No MP4 files found")
    video_path = mp4_files[0]
    print(f"Processing full video: {video_path}")

    # Face and voice embeddings of the cast.
    face_col, voice_col = get_casting(sha1)

    # Vision first; only the per-second records are needed afterwards.
    _info_escenas, info_images_per_second = pipeline_preprocessing_vision(video_path, face_col)
    torch.cuda.empty_cache()

    # Audio next; the full-track transcription is currently not used.
    _full_transcription, info_clips = pipeline_preprocessing_audio(video_path, voice_col)

    # Tally, for every audio segment, the faces seen while it lasts.
    info_clips = map_identities_per_second(info_images_per_second, info_clips)

    # Build the SRT, store it under <sha1>/srt/initial.srt and return it.
    return generate_srt_from_segments(info_clips, sha1)