# app.py — veureu/svision (Salamandra Vision 7B · ZeroGPU) — compatible con ENGINE import os import json from typing import Dict, List, Optional, Tuple, Union, Any import gradio as gr import spaces import torch from facenet_pytorch import MTCNN, InceptionResnetV1 import numpy as np from PIL import Image from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration import cv2 from scenedetect import VideoManager, SceneManager from scenedetect.detectors import ContentDetector MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-vision") DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32 DEVICE = "cuda" if torch.cuda.is_available() else "cpu" _model = None _processor = None _mtcnn = None _facenet = None def _load_face_models() -> Tuple[MTCNN, InceptionResnetV1]: global _mtcnn, _facenet if _mtcnn is None or _facenet is None: device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu" _mtcnn = MTCNN(image_size=160, margin=0, post_process=True, device=device) _facenet = InceptionResnetV1(pretrained="vggface2").eval().to(device) return _mtcnn, _facenet def _lazy_load() -> Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]: global _model, _processor if _model is None or _processor is None: _processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) _model = LlavaOnevisionForConditionalGeneration.from_pretrained( MODEL_ID, dtype=DTYPE, low_cpu_mem_usage=True, trust_remote_code=True, use_safetensors=True, device_map=None, ) _model.to(DEVICE) return _model, _processor def _compose_prompt(user_text: str, context: Optional[Dict] = None) -> List[Dict]: """Construye el chat template con imagen + texto + contexto opcional.""" ctx_txt = "" if context: try: # breve, sin ruido ctx_txt = "\n\nContexto adicional:\n" + json.dumps(context, ensure_ascii=False)[:2000] except Exception: pass user_txt = (user_text or "Describe la imagen con detalle.") + ctx_txt convo = [ { "role": "user", "content": [ {"type": "image"}, {"type": "text", "text": user_txt}, ], } ] return convo @spaces.GPU # en HF Spaces usará GPU cuando haya disponibilidad (ZeroGPU) def _infer_one(image: Image.Image, text: str, max_new_tokens: int = 256, temperature: float = 0.7, context: Optional[Dict] = None) -> str: # Reducir el tamaño de la imagen para ahorrar memoria en la GPU image.thumbnail((1024, 1024)) model, processor = _lazy_load() prompt = processor.apply_chat_template(_compose_prompt(text, context), add_generation_prompt=True) inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE, dtype=DTYPE) with torch.inference_mode(): out = model.generate(**inputs, max_new_tokens=int(max_new_tokens), temperature=float(temperature)) return processor.decode(out[0], skip_special_tokens=True).strip() # ----------------------------- API helpers ----------------------------------- def describe_raw(image: Image.Image, text: str = "Describe la imagen con detalle.", max_new_tokens: int = 256, temperature: float = 0.7) -> Dict[str, str]: result = _infer_one(image, text, max_new_tokens, temperature, context=None) return {"text": result} def describe_batch(images: List[Image.Image], context_json: str, max_new_tokens: int = 256, temperature: float = 0.7) -> List[str]: """Endpoint batch para ENGINE: lista de imágenes + contexto (JSON) → lista de textos.""" try: context = json.loads(context_json) if context_json else None except Exception: context = None outputs: List[str] = [] for img in images: outputs.append(_infer_one(img, text="Describe la imagen con detalle.", max_new_tokens=max_new_tokens, temperature=temperature, context=context)) return outputs @spaces.GPU def face_image_embedding(image: Image.Image) -> List[float] | None: try: mtcnn, facenet = _load_face_models() # detectar y extraer cara face = mtcnn(image) if face is None: return None # FaceNet espera tensor shape (1,3,160,160) device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu" face = face.unsqueeze(0).to(device) # obtener embedding with torch.no_grad(): emb = facenet(face).cpu().numpy()[0] # normalizar igual que tu código original emb = emb / np.linalg.norm(emb) return emb.astype(float).tolist() except Exception as e: print(f"Fallo embedding cara: {e}") return None @spaces.GPU def scenes_extraction(video_file: str, threshold: float, offset_frames: int, crop_ratio: float) -> Tuple[List[Image.Image], List[Dict]] | None: # video_file es un str ya que aunque realmente el usuario subió un archivo desde la UI, Gradio lo guarda temporalmente como ruta # Detectamos las escenas video_manager = VideoManager([video_file]) scene_manager = SceneManager() scene_manager.add_detector(ContentDetector(threshold=threshold)) video_manager.start() scene_manager.detect_scenes(video_manager) scene_list = scene_manager.get_scene_list() cap = cv2.VideoCapture(video_path) images: List[Image.Image] = [] informacion_escenas: List[Dict] = [] for i, (start_time, end_time) in enumerate(scene_list): frame_number = int(start_time.get_frames()) + offset_frames cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number) ret, frame = cap.read() if ret: h, w = frame.shape[:2] # Ahora realizamos el recorte ch, cw = int(h * crop_ratio), int(w * crop_ratio) frame = frame[ch:h-ch, cw:w-cw] # Guardamos la escena obtenida frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) img = Image.fromarray(frame_rgb) images.append(img) # Guardamos la información de la escena informacion_escenas.append({ "index": i+1, "start": start_time.get_seconds(), "end": end_time.get_seconds() }) cap.release() return images, informacion_escenas # ----------------------------- UI & Endpoints -------------------------------- with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo: gr.Markdown("## Salamandra-Vision 7B · ZeroGPU\nImagen + texto → descripción.") with gr.Row(): with gr.Column(): in_img = gr.Image(label="Imagen", type="pil") in_txt = gr.Textbox(label="Texto/prompt", value="Describe la imagen con detalle (ES/CA).") max_new = gr.Slider(16, 1024, value=256, step=16, label="max_new_tokens") temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature") btn = gr.Button("Generar", variant="primary") with gr.Column(): out = gr.Textbox(label="Descripción", lines=18) # UI btn.click(_infer_one, [in_img, in_txt, max_new, temp], out, api_name="describe", concurrency_limit=1) # API simple (multipart) compatible con tu versión anterior # demo.load( # None, # [gr.Image(label="image", type="pil"), # gr.Textbox(value="Describe la imagen con detalle."), # gr.Slider(16, 1024, value=256), # gr.Slider(0.0, 1.5, value=0.7)], # describe_raw, # api_name="describe_raw" # ) # API BATCH para ENGINE (Gradio Client): images + context_json → list[str] # Firma que espera el VisionClient del engine (api_name="/predict") batch_in_images = gr.Gallery(label="Imágenes (batch)", show_label=False, columns=4, height="auto") batch_context = gr.Textbox(label="context_json", value="{}", lines=4) batch_max = gr.Slider(16, 1024, value=256, step=16, label="max_new_tokens") batch_temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature") batch_btn = gr.Button("Describir lote") batch_out = gr.JSON(label="Descripciones (lista)") # Nota: Gradio Gallery entrega rutas/obj; nos apoyamos en el cliente para cargar archivos batch_btn.click(describe_batch, [batch_in_images, batch_context, batch_max, batch_temp], batch_out, api_name="predict", concurrency_limit=1) with gr.Row(): face_img = gr.Image(label="Imagen para embedding facial", type="pil") face_btn = gr.Button("Obtener embedding facial") face_out = gr.JSON(label="Embedding facial (vector)") face_btn.click(face_image_embedding, [face_img], face_out, api_name="face_image_embedding", concurrency_limit=1) with gr.Row(): video_file = gr.Video(label="Sube un vídeo") threshold = gr.Slider(0.0, 100.0, value=30.0, step=1.0, label="Threshold") offset_frames = gr.Slider(0, 30, value=5, step=1, label="Offset frames") crop_ratio = gr.Slider(0.0, 1.0, value=1.0, step=0.05, label="Crop ratio") scenes_btn = gr.Button("Extraer escenas") scenes_gallery_out = gr.Gallery(label="Keyframes de escenas", show_label=False, columns=4, height="auto") scenes_info_out = gr.JSON(label="Información de escenas") scenes_btn.click(scenes_extraction, inputs=[video_file, threshold, offset_frames, crop_ratio], outputs=[scenes_gallery_out, scenes_info_out], api_name="scenes_extraction", concurrency_limit=1) demo.queue(max_size=16).launch()