VeuReu committed (verified)
Commit ce5bf11 · 1 Parent(s): 6865b9f

Upload 3 files

Files changed (3):
  1. api.py +108 -1
  2. main_process/main_router.py +314 -302
  3. pipelines/audiodescription.py +147 -147
api.py CHANGED
@@ -16,11 +16,12 @@ import yaml
import io

from video_processing import process_video_pipeline
- from audio_tools import process_audio_for_video, extract_audio_ffmpeg, embed_voice_segments
+ from audio_tools import process_audio_for_video, extract_audio_ffmpeg, embed_voice_segments, VoiceEmbedder
from casting_loader import ensure_chroma, build_faces_index, build_voices_index
from narration_system import NarrationSystem
from llm_router import load_yaml, LLMRouter
from character_detection import detect_characters_from_video
+ from vision_tools import FaceOfImageEmbedding

from pipelines.audiodescription import generate as ad_generate

@@ -1015,6 +1016,111 @@ async def finalize_casting(
    face_identities = sorted([p.name for p in faces_out.iterdir() if p.is_dir()]) if faces_out.exists() else []
    voice_identities = sorted([p.name for p in voices_out.iterdir() if p.is_dir()]) if voices_out.exists() else []

+     # Build casting_json with face and voice embeddings (best-effort) via remote Spaces
+     casting_json = {"face_col": [], "voice_col": []}
+ 
+     # Load the config and router to access the svision/asr Spaces
+     try:
+         cfg = load_yaml("config.yaml")
+         router = LLMRouter(cfg)
+     except Exception:
+         router = None  # type: ignore
+ 
+     # Face embeddings per identity using remote svision (face_image_embedding)
+     try:
+         if face_identities and router is not None:
+             factory = router.client_factories.get("salamandra-vision")  # type: ignore[attr-defined]
+             if factory is not None:
+                 vclient = factory()
+                 gclient = getattr(vclient, "_client", None)
+             else:
+                 gclient = None
+ 
+             if gclient is not None:
+                 for identity in face_identities:
+                     id_dir = faces_out / identity
+                     if not id_dir.is_dir():
+                         continue
+                     # Look for a representative image
+                     img_path = None
+                     for ext in (".jpg", ".jpeg", ".png", ".bmp", ".webp"):
+                         candidates = list(id_dir.glob(f"*{ext}"))
+                         if candidates:
+                             img_path = candidates[0]
+                             break
+                     if not img_path:
+                         continue
+ 
+                     try:
+                         out = gclient.predict(str(img_path), api_name="/face_image_embedding")
+                         # svision normally returns a list of embeddings or a single embedding
+                         emb = None
+                         if isinstance(out, list):
+                             if out and isinstance(out[0], (list, tuple, float, int)):
+                                 # If it is a list of lists, take the first one; if it is a flat list, use it as-is
+                                 if isinstance(out[0], (list, tuple)):
+                                     emb = list(out[0])
+                                 else:
+                                     emb = list(out)
+                         elif isinstance(out, dict) and "embedding" in out:
+                             emb = out.get("embedding")
+ 
+                         if not emb:
+                             continue
+ 
+                         casting_json["face_col"].append({
+                             "nombre": identity,
+                             "embedding": emb,
+                         })
+                     except Exception:
+                         # Do not abort over a single failed embedding
+                         continue
+     except Exception:
+         # If anything in the whole face block fails, leave face_col empty
+         casting_json["face_col"] = []
+ 
+     # Voice embeddings per identity using remote asr (voice_embedding)
+     try:
+         if voice_identities and router is not None:
+             factory = router.client_factories.get("whisper-catalan")  # type: ignore[attr-defined]
+             if factory is not None:
+                 aclient = factory()
+                 gclient = getattr(aclient, "_client", None)
+             else:
+                 gclient = None
+ 
+             if gclient is not None:
+                 for identity in voice_identities:
+                     id_dir = voices_out / identity
+                     if not id_dir.is_dir():
+                         continue
+                     wav_files = sorted([p for p in id_dir.iterdir() if p.is_file() and p.suffix.lower() in [".wav", ".flac", ".mp3"]])
+                     if not wav_files:
+                         continue
+ 
+                     # Take a representative embedding from the first clip
+                     wf = wav_files[0]
+                     try:
+                         out = gclient.predict(str(wf), api_name="/voice_embedding")
+                         emb = None
+                         if isinstance(out, list):
+                             emb = list(out)
+                         elif isinstance(out, dict) and "embedding" in out:
+                             emb = out.get("embedding")
+ 
+                         if not emb:
+                             continue
+ 
+                         casting_json["voice_col"].append({
+                             "nombre": identity,
+                             "embedding": emb,
+                         })
+                     except Exception:
+                         continue
+     except Exception:
+         # If anything in the whole voice block fails, leave voice_col empty
+         casting_json["voice_col"] = []
+ 
    return {
        "ok": True,
        "video_name": video_name,
@@ -1025,6 +1131,7 @@ async def finalize_casting(
        "n_voices_embeddings": n_voices,
        "face_identities": face_identities,
        "voice_identities": voice_identities,
+         "casting_json": casting_json,
    }

@app.get("/files_scene/{video_name}/{scene_id}/{filename}")
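
For reference, the casting_json field that finalize_casting now returns is a plain dict with one entry per recognized identity. A minimal sketch of its shape (embedding values and lengths are illustrative; the actual dimensions depend on the remote svision/asr models):

    casting_json = {
        "face_col": [
            {"nombre": "Ana", "embedding": [0.12, -0.03, 0.44]},   # one entry per face identity
        ],
        "voice_col": [
            {"nombre": "Ana", "embedding": [0.07, 0.21, -0.15]},   # one entry per voice identity
        ],
    }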
main_process/main_router.py CHANGED
@@ -1,303 +1,315 @@
import os
import io

from pathlib import Path
from typing import Counter, List, Dict
import ast
import json
import torch
from svision_client import extract_scenes, add_ocr_and_faces, keyframes_every_second_extraction, extract_descripcion_escena
from asr_client import extract_audio_from_video, diarize_audio, transcribe_long_audio, transcribe_short_audio, identificar_veu

from fastapi import APIRouter, UploadFile, File, Query, HTTPException
from fastapi.responses import JSONResponse, StreamingResponse

from storage.common import validate_token
from storage.files.file_manager import FileManager
+ from storage.embeddings_routers import get_embeddings_json

EMBEDDINGS_ROOT = Path("/data/embeddings")
MEDIA_ROOT = Path("/data/media")
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
router = APIRouter(prefix="/transcription", tags=["Transcription Process"])
HF_TOKEN = os.getenv("HF_TOKEN")

def get_casting(video_sha1: str):
-     # TODO: look it up in the directory
-     face_col = [{"nombre": "Ana", "embedding": [0.1]*512}]
-     voice_col = [{"nombre": "Ana", "embedding": [0.2]*192}]
+     """Retrieve the real casting embeddings for a video from its SHA1.
+ 
+     Reads the embeddings JSON that the demo previously uploaded to /data/embeddings
+     through the /embeddings/upload_embeddings endpoint and returns its
+     face_col and voice_col columns.
+     """
+ 
+     # get_embeddings_json returns the full JSON exactly as it was uploaded (casting_json)
+     faces_json = get_embeddings_json(video_sha1, "faces")
+     voices_json = get_embeddings_json(video_sha1, "voices")
+ 
+     # Keep only the columns the pipeline cares about
+     face_col = faces_json.get("face_col", []) if isinstance(faces_json, dict) else []
+     voice_col = voices_json.get("voice_col", []) if isinstance(voices_json, dict) else []

    return face_col, voice_col

transcripcion_inicial = "/home/acasado/bsc/hugging_face_bsc/engine/results/transcription_initial.srt"
informacion_json = "/home/acasado/bsc/hugging_face_bsc/engine/results/informacion.json"

def map_identities_per_second(frames_per_second, intervals):
    for seg in intervals:
        seg_start = seg["start"]
        seg_end = seg["end"]

        identities = []
        for f in frames_per_second:
            if seg_start <= f["start"] <= seg_end:
                for face in f.get("faces", []):
                    identities.append(face)

        seg["counts"] = dict(Counter(identities))

    return intervals

def _fmt_srt_time(seconds: float) -> str:
    """Format seconds in the SRT format HH:MM:SS,mmm"""
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int((seconds - int(seconds)) * 1000)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"

from pathlib import Path
from typing import List, Dict
from fastapi import HTTPException


def generate_srt_from_segments(segments: List[Dict], sha1: str) -> str:
    """
    Generate an SRT subtitle file from diarization/transcription segments.

    This function:
    - Creates the required folder structure for storing SRTs.
    - Removes any previous SRT files for the same SHA1.
    - Builds the SRT content with timestamps, speaker identity and transcription.
    - Saves the SRT file to disk.
    - Returns the SRT content as a string (to be sent by the endpoint).

    Parameters
    ----------
    segments : List[Dict]
        List of dictionaries containing:
        - "start": float (start time in seconds)
        - "end": float (end time in seconds)
        - "speaker": dict with "identity"
        - "transcription": str
    sha1 : str
        Identifier used to locate the target media folder.

    Returns
    -------
    str
        Full SRT file content as a string.
    """

    # Path: /data/media/<sha1>
    video_root = MEDIA_ROOT / sha1
    video_root.mkdir(parents=True, exist_ok=True)

    # Path: /data/media/<sha1>/srt
    srt_dir = video_root / "srt"
    srt_dir.mkdir(parents=True, exist_ok=True)

    # Delete old SRT files
    try:
        for old_srt in srt_dir.glob("*.srt"):
            old_srt.unlink()
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Failed to delete old SRT files: {exc}")

    # Save file as initial.srt
    final_path = srt_dir / "initial.srt"

    # Build SRT content
    srt_lines = []

    for i, seg in enumerate(segments, start=1):
        start = seg.get("start", 0.0)
        end = seg.get("end", 0.0)
        transcription = seg.get("transcription", "").strip()

        speaker_info = seg.get("speaker", {})
        speaker = speaker_info.get("identity", "Unknown")

        text = f"[{speaker}]: {transcription}" if speaker else transcription

        entry = (
            f"{i}\n"
            f"{_fmt_srt_time(start)} --> {_fmt_srt_time(end)}\n"
            f"{text}\n"
        )
        srt_lines.append(entry)

    # Join with blank lines
    srt_content = "\n".join(srt_lines)

    # Write to disk
    try:
        with final_path.open("w", encoding="utf-8-sig") as f:
            f.write(srt_content)
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Failed to write SRT file: {exc}")

    return srt_content

def pipeline_preprocessing_vision(video_path: str, face_col):
    """
    Pipeline that takes a video and performs all the vision-side preprocessing.
    """

    print(f"Processing video for vision: {video_path}")

    print("Extracting scenes...")
    threshold: float = 30.0
    offset_frames: int = 3
    crop_ratio: float = 0.1
    result_extract_scenes = extract_scenes(video_path, threshold, offset_frames, crop_ratio)
    print(result_extract_scenes)
    # Get the image paths and the scene information
    escenas = result_extract_scenes[0] if len(result_extract_scenes) > 0 else []
    escenas_paths = [f["image"] for f in escenas]
    print(escenas_paths)
    info_escenas = result_extract_scenes[1] if len(result_extract_scenes) > 1 else []
    print(info_escenas)

    print("Extracting one image per second...")
    result_extract_per_second = keyframes_every_second_extraction(video_path)
    # Get the image paths and the per-second information
    images_per_second = result_extract_per_second[0] if len(result_extract_per_second) > 0 else []
    images_per_second_paths = [f["image"] for f in images_per_second]
    info_images_per_second = result_extract_per_second[1] if len(result_extract_per_second) > 1 else []

    print("Enriching the scene information with who appears in each scene plus OCR detection...")
    info_escenas_completa = []
    for imagen_escena, info_escena in zip(escenas_paths, info_escenas):
        result_add_ocr_and_faces = add_ocr_and_faces(imagen_escena, info_escena, face_col)
        info_escenas_completa.append(result_add_ocr_and_faces)

    print("Enriching the per-second images with who appears in each one plus OCR detection...")
    info_images_per_second_completa = []
    for imagen_segundo, info_segundo in zip(images_per_second_paths, info_images_per_second):
        result_add_ocr_and_faces = add_ocr_and_faces(imagen_segundo, info_segundo, face_col)
        info_images_per_second_completa.append(result_add_ocr_and_faces)
    print(info_escenas_completa)

    print("Now the OCR results will be processed (some scenes will be replaced by one of the per-second images if it has better OCR)...")
    # To be done last

    print("Combining scene and per-second image information...")
    info_escenas_completa = map_identities_per_second(info_images_per_second_completa, info_escenas_completa)
    print(info_escenas_completa)

    print("Now the scene descriptions are added to the scene dictionaries.")
    for escena_path, info_escena in zip(escenas_paths, info_escenas_completa):
        descripcion_escena = extract_descripcion_escena(escena_path)
        lista = ast.literal_eval(descripcion_escena)
        frase = lista[0]
        info_escena["descripcion"] = frase
        del descripcion_escena
        torch.cuda.empty_cache()

    return info_escenas_completa, info_images_per_second_completa

def pipeline_preprocessing_audio(video_path: str, voice_col):
    """
    Pipeline that takes a video and performs all the audio-side preprocessing.
    """
    print(f"Processing video for audio: {video_path}")

    print("Extracting audio from the video...")
    audio_video = extract_audio_from_video(video_path)
    print(audio_video)

    print("Diarizing the audio...")
    diarization_audio = diarize_audio(audio_video)
    print(diarization_audio)
    clips_path = diarization_audio[0]
    print(clips_path)
    diarization_info = diarization_audio[1]
    print(diarization_info)

    print("Transcribing the full video...")
    full_transcription = transcribe_long_audio(audio_video)
    print(full_transcription)

    print("Transcribing the diarized clips...")
    for clip_path, clip_info in zip(clips_path, diarization_info):
        clip_transcription = transcribe_short_audio(clip_path)
        clip_info["transcription"] = clip_transcription

    print("Computing the embeddings for each extracted clip and then identifying the voices...")
    for clip_path, clip_info in zip(clips_path, diarization_info):
        clip_speaker = identificar_veu(clip_path, voice_col)
        clip_info["speaker"] = clip_speaker

    return full_transcription, diarization_info

@router.post("/generate_srt", tags=["Transcription Process"])
async def pipeline_video_analysis(
    sha1: str,
    token: str = Query(..., description="Token required for authorization")
):
    """
    Endpoint that processes a full video identified by its SHA1 folder, performs
    complete audio-visual preprocessing, and returns an SRT subtitle file.

    This pipeline integrates:
    - Vision preprocessing (scene detection, keyframes, OCR, face recognition)
    - Audio preprocessing (diarization, speech recognition, speaker identity matching)
    - Identity mapping between vision and audio streams
    - Final generation of an SRT file describing who speaks and when

    Parameters
    ----------
    sha1 : str
        Identifier corresponding to the folder containing the video and related assets.
    token : str
        Security token required for authorization.

    Returns
    -------
    str
        The generated SRT file (as text) containing time-aligned subtitles with
        speaker identities and transcriptions.
    """

    validate_token(token)

    # Resolve directories
    file_manager = FileManager(MEDIA_ROOT)
    sha1_folder = MEDIA_ROOT / sha1
    clip_folder = sha1_folder / "clip"

    if not sha1_folder.exists() or not sha1_folder.is_dir():
        raise HTTPException(status_code=404, detail="SHA1 folder not found")

    if not clip_folder.exists() or not clip_folder.is_dir():
        raise HTTPException(status_code=404, detail="Clip folder not found")

    # Locate video file
    mp4_files = list(clip_folder.glob("*.mp4"))
    if not mp4_files:
        raise HTTPException(status_code=404, detail="No MP4 files found")

    video_path = mp4_files[0]

    # Convert absolute path to a relative path for FileManager
    video_path = MEDIA_ROOT / video_path.relative_to(MEDIA_ROOT)

    print(f"Processing full video: {video_path}")

    # Get face and voice embeddings for casting
    face_col, voice_col = get_casting(sha1)

    # Vision processing pipeline
    info_escenas, info_images_per_second = pipeline_preprocessing_vision(video_path, face_col)
    torch.cuda.empty_cache()

    # Audio processing pipeline
    full_transcription, info_clips = pipeline_preprocessing_audio(video_path, voice_col)

    # Merge identities from vision pipeline with audio segments
    info_clips = map_identities_per_second(info_images_per_second, info_clips)

    # Generate the final SRT subtitle file
    srt = generate_srt_from_segments(info_clips, sha1)

    # The endpoint returns the SRT file as plain text
    return srt
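
To make the expected segment structure and the resulting SRT entries concrete, here is a minimal sketch for generate_srt_from_segments (names and timestamps are illustrative; note the function also writes the result to /data/media/<sha1>/srt/initial.srt):

    segments = [
        {"start": 1.25, "end": 3.5, "speaker": {"identity": "Ana"}, "transcription": "Bon dia a tothom."},
        {"start": 4.0, "end": 6.5, "speaker": {"identity": "SPEAKER_01"}, "transcription": "Hola, Ana."},
    ]
    srt = generate_srt_from_segments(segments, sha1="<video-sha1>")
    # srt contains:
    # 1
    # 00:00:01,250 --> 00:00:03,500
    # [Ana]: Bon dia a tothom.
    #
    # 2
    # 00:00:04,000 --> 00:00:06,500
    # [SPEAKER_01]: Hola, Ana.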
pipelines/audiodescription.py CHANGED
@@ -1,147 +1,147 @@
from __future__ import annotations

import os
import shlex
import subprocess
from pathlib import Path
from typing import Dict, Any, List, Tuple, Optional

# Minimal, robust MVP audio-only pipeline
# - Extract audio with ffmpeg
# - Diarize with pyannote (if an HF token is available); otherwise fall back to a single segment over the full duration
# - ASR with Whisper (optionally the AINA model, if available). To keep the footprint reasonable and robust,
#   we default to a lightweight faster-whisper if present; otherwise, return empty text.
# - Generate a basic SRT from the segments and ASR texts.


def extract_audio_ffmpeg(video_path: str, audio_out: Path, sr: int = 16000, mono: bool = True) -> str:
    audio_out.parent.mkdir(parents=True, exist_ok=True)
    cmd = f'ffmpeg -y -i "{video_path}" -vn {"-ac 1" if mono else ""} -ar {sr} -f wav "{audio_out}"'
    subprocess.run(shlex.split(cmd), check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return str(audio_out)


def _get_video_duration_seconds(video_path: str) -> float:
    try:
        # Use ffprobe to get the duration
        cmd = f'ffprobe -v error -select_streams v:0 -show_entries stream=duration -of default=nw=1 "{video_path}"'
        out = subprocess.check_output(shlex.split(cmd), stderr=subprocess.DEVNULL).decode("utf-8", errors="ignore")
        for line in out.splitlines():
            if line.startswith("duration="):
                try:
                    return float(line.split("=", 1)[1])
                except Exception:
                    pass
    except Exception:
        pass
    return 0.0


def diarize_audio(wav_path: str, base_dir: Path, hf_token_env: str | None = None) -> Tuple[List[Dict[str, Any]], List[str]]:
    """Returns segments [{'start','end','speaker'}] and dummy clip_paths (not used in the MVP)."""
    segments: List[Dict[str, Any]] = []
    clip_paths: List[str] = []
    # Prefer PYANNOTE_TOKEN if provided; fall back to the explicit env name, then HF_TOKEN
    token = os.getenv("PYANNOTE_TOKEN") or (os.getenv(hf_token_env) if hf_token_env else os.getenv("HF_TOKEN"))
    try:
        if token:
            from pyannote.audio import Pipeline  # type: ignore
            pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=token)
            diarization = pipeline(wav_path)
            # Collect segments.
            # We don't export individual clips in the MVP; just timestamps.
            for i, (turn, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
                segments.append({
                    "start": float(getattr(turn, "start", 0.0) or 0.0),
                    "end": float(getattr(turn, "end", 0.0) or 0.0),
                    "speaker": str(speaker) if speaker is not None else f"SPEAKER_{i:02d}",
                })
        else:
            # Fallback: a single segment over the full duration.
            # The caller would have to provide the video path to compute the exact duration; since we
            # only have the wav here, skip the precise duration and fall back to 0..0 (the UI tolerates this).
            segments.append({"start": 0.0, "end": 0.0, "speaker": "SPEAKER_00"})
    except Exception:
        # Robust fallback
        segments.append({"start": 0.0, "end": 0.0, "speaker": "SPEAKER_00"})
    # Sort by start
    segments = sorted(segments, key=lambda s: s.get("start", 0.0))
    return segments, clip_paths


def _fmt_srt_time(seconds: float) -> str:
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int(round((seconds - int(seconds)) * 1000))
    return f"{h:02}:{m:02}:{s:02},{ms:03}"


def _generate_srt(segments: List[Dict[str, Any]], texts: List[str]) -> str:
    n = min(len(segments), len(texts))
    lines: List[str] = []
    for i in range(n):
        seg = segments[i]
        text = (texts[i] or "").strip()
        start = float(seg.get("start", 0.0))
        end = float(seg.get("end", max(start + 2.0, start)))
        speaker = seg.get("speaker")
        if speaker:
            text = f"[{speaker}]: {text}" if text else f"[{speaker}]"
        lines.append(str(i + 1))
        lines.append(f"{_fmt_srt_time(start)} --> {_fmt_srt_time(end)}")
        lines.append(text)
        lines.append("")
    return "\n".join(lines).strip() + "\n"


def asr_transcribe_wav_simple(wav_path: str) -> str:
    """Very robust ASR stub: try faster-whisper small if present; otherwise return empty text.
    Intended for the MVP on Spaces without a heavy GPU."""
    try:
        from faster_whisper import WhisperModel  # type: ignore
        model = WhisperModel("Systran/faster-whisper-small", device="cpu")
        # Short transcript without timestamps
        segments, info = model.transcribe(wav_path, vad_filter=True, without_timestamps=True, language=None)
        text = " ".join(seg.text.strip() for seg in segments if getattr(seg, "text", None))
        return text.strip()
    except Exception:
        # As a last resort, empty text
        return ""


def generate(video_path: str, out_dir: Path) -> Dict[str, Any]:
    """End-to-end MVP that returns {'une_srt','free_text','artifacts':{...}}."""
    out_dir.mkdir(parents=True, exist_ok=True)
    wav_path = extract_audio_ffmpeg(video_path, out_dir / f"{Path(video_path).stem}.wav")

    # Diarization (robust)
    segments, _ = diarize_audio(wav_path, out_dir, hf_token_env="HF_TOKEN")

    # ASR (for the MVP: a single transcript of the full audio, used as 'free_text')
    free_text = asr_transcribe_wav_simple(wav_path)

    # Build per-segment 'texts' with a simple split of free_text when there are multiple segments
    if not segments:
        segments = [{"start": 0.0, "end": 0.0, "speaker": "SPEAKER_00"}]
    texts: List[str] = []
    if len(segments) <= 1:
        texts = [free_text]
    else:
        # Naive split into N parts by words
        words = free_text.split()
        chunk = max(1, len(words) // len(segments))
        for i in range(len(segments)):
            start_idx = i * chunk
            end_idx = (i + 1) * chunk if i < len(segments) - 1 else len(words)
            texts.append(" ".join(words[start_idx:end_idx]))

    une_srt = _generate_srt(segments, texts)

    return {
        "une_srt": une_srt,
        "free_text": free_text,
        "artifacts": {
            "wav_path": str(wav_path),
        },
    }
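
A minimal usage sketch of this MVP pipeline (paths are illustrative; diarization and ASR degrade gracefully to their fallbacks when PYANNOTE_TOKEN/HF_TOKEN or faster-whisper are unavailable):

    from pathlib import Path
    from pipelines.audiodescription import generate

    result = generate("clips/episode01.mp4", Path("/tmp/ad_out"))
    print(result["une_srt"])                # speaker-labelled SRT built from the diarized segments
    print(result["free_text"])              # full-audio transcript (empty string if no ASR backend is present)
    print(result["artifacts"]["wav_path"])  # extracted 16 kHz mono WAV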