|
|
""" |
|
|
# ============================================================================== |
|
|
# Vision-Language and Face Recognition Utilities |
|
|
# ============================================================================== |
|
|
This module provides helper functions, lazy-loading mechanisms, and |
|
|
API endpoint wrappers for multimodal inference, face recognition, and |
|
|
video scene extraction. |
|
|
|
|
|
It includes functionality for: |
|
|
- Lazy initialization of heavyweight models (vision-language and face models) |
|
|
- Image and video preprocessing |
|
|
- Multimodal inference with configurable parameters (token limits, temperature) |
|
|
- Facial embedding generation |
|
|
- Scene extraction from video files |
|
|
- Gradio UI components and endpoint definitions for user interaction |
|
|
|
|
|
All functions and utilities are designed to be: |
|
|
- Reusable, caching heavy models to avoid repeated loading
|
|
- Compatible with GPU/CPU execution |
|
|
- Stateless and safe to call concurrently from multiple requests |
|
|
- Modular, separating model logic from endpoint and UI handling |
|
|
|
|
|
This module serves as the core interface layer between client-facing |
|
|
APIs/UI and the underlying machine learning models. |
|
|
# ============================================================================== |
|
|
""" |
|
|
|
|
|
|
|
|
import json |
|
|
import os |
|
|
import re
import tempfile
|
|
from typing import Any, Dict, List, Optional, Tuple, Union |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
import cv2 |
|
|
|
|
import gradio as gr |
|
|
import numpy as np |
|
|
import spaces |
|
|
import torch |
|
|
from facenet_pytorch import InceptionResnetV1, MTCNN |
|
|
from PIL import Image |
|
|
from scenedetect import SceneManager, VideoManager |
|
|
from scenedetect.detectors import ContentDetector |
|
|
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration |
|
|
from wordfreq import zipf_frequency |
|
|
import easyocr |
|
|
|
|
|
|
|
|
''' |
|
|
# ============================================================================== |
|
|
# Lazy-loading utilities for vision-language and face recognition models |
|
|
# ============================================================================== |
|
|
|
|
|
This section provides on-demand initialization of heavyweight components, including:
|
|
- MTCNN: Face detector used to locate and align faces. |
|
|
- FaceNet (InceptionResnetV1): Generates 512-dimensional facial embeddings. |
|
|
- LLaVA OneVision: Vision-language model for multimodal inference. |
|
|
|
|
|
By loading models lazily and caching them in global variables, the system avoids |
|
|
unnecessary reinitialization and reduces startup time, improving performance in |
|
|
production environments such as FastAPI services, Docker deployments, and |
|
|
Hugging Face Spaces. |
|
|
# ============================================================================== |
|
|
''' |
|
|
MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-vision") |
|
|
DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32 |
|
|
DEVICE = "cuda" if torch.cuda.is_available() else "cpu" |
|
|
|
|
|
_model = None |
|
|
_processor = None |
|
|
_mtcnn = None |
|
|
_facenet = None |
|
|
|
|
|
|
|
|
def _load_face_models() -> Tuple[MTCNN, InceptionResnetV1]: |
|
|
""" |
|
|
Lazily loads and initializes the facial detection and facial embedding models. |
|
|
|
|
|
This function loads: |
|
|
- **MTCNN**: Used for face detection and cropping. |
|
|
- **InceptionResnetV1 (FaceNet)**: Used to generate 512-dimensional face embeddings. |
|
|
|
|
|
Both models are loaded only once and stored in global variables to avoid |
|
|
unnecessary re-initialization. They are automatically placed on GPU if available, |
|
|
otherwise CPU is used. |
|
|
|
|
|
Returns: |
|
|
Tuple[MTCNN, InceptionResnetV1]: A tuple containing the initialized |
|
|
face detection model and the face embedding model. |
|
|
""" |
|
|
global _mtcnn, _facenet |
|
|
if _mtcnn is None or _facenet is None: |
|
|
device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu" |
|
|
_mtcnn = MTCNN(image_size=160, margin=0, post_process=True, device=device) |
|
|
_facenet = InceptionResnetV1(pretrained="vggface2").eval().to(device) |
|
|
return _mtcnn, _facenet |
|
|
|
|
|
|
|
|
def _lazy_load() -> Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]: |
|
|
""" |
|
|
Lazily loads the vision-language model and its processor. |
|
|
|
|
|
This function performs a first-time load of: |
|
|
- **AutoProcessor**: Handles preprocessing of text and images for the model. |
|
|
- **LlavaOnevisionForConditionalGeneration**: The main multimodal model used |
|
|
for inference and text generation. |
|
|
|
|
|
The model is moved to GPU if available and configured with: |
|
|
- The appropriate floating-point precision (`float16` or `float32`) |
|
|
- Low memory usage mode |
|
|
- SafeTensors loading enabled |
|
|
|
|
|
Both components are cached in global variables to ensure subsequent calls |
|
|
reuse the loaded instances without reinitialization. |
|
|
|
|
|
Returns: |
|
|
Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]: |
|
|
The loaded model and processor ready for inference. |
|
|
""" |
|
|
global _model, _processor |
|
|
if _model is None or _processor is None: |
|
|
_processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) |
|
|
_model = LlavaOnevisionForConditionalGeneration.from_pretrained( |
|
|
MODEL_ID, |
|
|
            torch_dtype=DTYPE,
|
|
low_cpu_mem_usage=True, |
|
|
trust_remote_code=True, |
|
|
use_safetensors=True, |
|
|
device_map=None, |
|
|
) |
|
|
_model.to(DEVICE) |
|
|
return _model, _processor |
|
|
|
|
|
''' |
|
|
# ============================================================================== |
|
|
# Auxiliary Model Loading Utilities for API Endpoints |
|
|
# ============================================================================== |
|
|
This section contains helper functions used internally by the API endpoints to
|
|
efficiently load and manage heavy machine learning components. These utilities |
|
|
handle on-demand initialization ("lazy loading") of both the vision-language |
|
|
model (LLaVA OneVision) and the facial detection/embedding models (MTCNN and |
|
|
FaceNet). |
|
|
|
|
|
The goal of this helper block is to: |
|
|
- Avoid repeated loading of large models across requests. |
|
|
- Reduce GPU/CPU memory pressure by reusing cached instances. |
|
|
- Provide clean separation between endpoint logic and model-handling logic. |
|
|
- Improve performance and stability in production environments |
|
|
(FastAPI, Docker, Hugging Face Spaces). |
|
|
|
|
|
All functions here are intended for internal use and should be called by |
|
|
endpoint handlers when a model is required for a given request. |
|
|
# ============================================================================== |
|
|
''' |
|
|
|
|
|
@spaces.GPU |
|
|
def _infer_one( |
|
|
image: Image.Image, |
|
|
text: str, |
|
|
max_new_tokens: int = 256, |
|
|
temperature: float = 0.7, |
|
|
context: Optional[Dict] = None, |
|
|
) -> str: |
|
|
""" |
|
|
Run a single multimodal inference step using the LLaVA OneVision model. |
|
|
|
|
|
This function: |
|
|
    - Downsizes the input image in place (to at most 1024x1024, preserving aspect ratio) to reduce GPU memory use.
|
|
- Loads the model and processor through lazy initialization. |
|
|
- Builds the final prompt by applying the chat template and injecting optional context. |
|
|
- Performs autoregressive generation with configurable token and temperature settings. |
|
|
- Returns the decoded textual output. |
|
|
|
|
|
Args: |
|
|
image (Image.Image): Input PIL image used for multimodal conditioning. |
|
|
text (str): User-provided instruction or query. |
|
|
max_new_tokens (int): Maximum number of tokens to generate. |
|
|
temperature (float): Sampling temperature controlling output randomness. |
|
|
context (Optional[Dict]): Additional context injected into the prompt. |
|
|
|
|
|
Returns: |
|
|
str: The generated textual response. |
|
|
""" |
|
|
image.thumbnail((1024, 1024)) |
|
|
|
|
|
model, processor = _lazy_load() |
|
|
prompt = processor.apply_chat_template(_compose_prompt(text, context), add_generation_prompt=True) |
|
|
|
|
|
inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE, dtype=DTYPE) |
|
|
|
|
|
with torch.inference_mode(): |
|
|
out = model.generate( |
|
|
**inputs, |
|
|
max_new_tokens=int(max_new_tokens), |
|
|
            do_sample=float(temperature) > 0,  # generate() ignores temperature unless sampling is enabled
            temperature=float(temperature),
|
|
) |
|
|
|
|
|
return processor.decode(out[0], skip_special_tokens=True).strip() |
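# A minimal usage sketch (illustrative, not part of the app): calling `_infer_one`
# directly with a local image; "photo.jpg" is a hypothetical path.
#
#     from PIL import Image
#     img = Image.open("photo.jpg").convert("RGB")
#     print(_infer_one(img, "Descriu la imatge.", max_new_tokens=128, temperature=0.7))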
|
|
|
|
|
@spaces.GPU |
|
|
def _get_face_embedding_casting(image: Image.Image) -> list[dict]:
|
|
""" |
|
|
    Detect every face in the image and return one dict per face:

    [
        {
            "embedding": <list[float]>,
            "face_crop": <PIL.Image>
        },
        ...
    ]

    An empty list is returned if no faces are detected or an error occurs.
|
|
""" |
|
|
try: |
|
|
mtcnn, facenet = _load_face_models() |
|
|
boxes, probs = mtcnn.detect(image) |
|
|
|
|
|
if boxes is None: |
|
|
return [] |
|
|
|
|
|
resultados = [] |
|
|
device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu" |
|
|
|
|
|
for box in boxes: |
|
|
x1, y1, x2, y2 = map(int, box) |
|
|
face_crop = image.crop((x1, y1, x2, y2)) |
|
|
|
|
|
face_tensor = mtcnn(face_crop) |
|
|
if face_tensor is None: |
|
|
continue |
|
|
|
|
|
face_tensor = face_tensor.unsqueeze(0).to(device) |
|
|
|
|
|
with torch.no_grad(): |
|
|
emb = facenet(face_tensor).cpu().numpy()[0] |
|
|
|
|
|
emb = emb / np.linalg.norm(emb) |
|
|
|
|
|
resultados.append({ |
|
|
"embedding": emb.astype(float).tolist(), |
|
|
"face_crop": face_crop |
|
|
}) |
|
|
|
|
|
        # Drop only the local references; the cached global instances in
        # _mtcnn/_facenet are kept for reuse across calls.
        del mtcnn
        del facenet
|
|
|
|
|
if torch.cuda.is_available(): |
|
|
torch.cuda.empty_cache() |
|
|
torch.cuda.ipc_collect() |
|
|
|
|
|
return resultados |
|
|
|
|
|
except Exception as e: |
|
|
print(f"Face embedding failed: {e}") |
|
|
return [] |
|
|
|
|
|
@spaces.GPU |
|
|
def _get_face_embedding( |
|
|
image: Image.Image |
|
|
) -> list[list[float]]:
|
|
""" |
|
|
    Generate normalized FaceNet embeddings for every face detected in an image.
|
|
|
|
|
Args: |
|
|
        image (Image.Image): A PIL Image containing one or more faces.
|
|
|
|
|
Returns: |
|
|
        list[list[float]]: One normalized embedding vector per detected face.
        An empty list is returned if no face is detected or an error occurs.
|
|
""" |
|
|
try: |
|
|
mtcnn, facenet = _load_face_models() |
|
|
|
|
|
boxes, probs = mtcnn.detect(image) |
|
|
|
|
|
if boxes is None: |
|
|
return [] |
|
|
|
|
|
embeddings = [] |
|
|
device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu" |
|
|
|
|
|
for box in boxes: |
|
|
x1, y1, x2, y2 = map(int, box) |
|
|
face = image.crop((x1, y1, x2, y2)) |
|
|
|
|
|
face_tensor = mtcnn(face) |
|
|
if face_tensor is None: |
|
|
continue |
|
|
face_tensor = face_tensor.unsqueeze(0).to(device) |
|
|
|
|
|
with torch.no_grad(): |
|
|
emb = facenet(face_tensor).cpu().numpy()[0] |
|
|
|
|
|
emb = emb / np.linalg.norm(emb) |
|
|
embeddings.append(emb.astype(float).tolist()) |
|
|
|
|
|
|
|
|
del mtcnn |
|
|
del facenet |
|
|
|
|
|
if torch.cuda.is_available(): |
|
|
torch.cuda.empty_cache() |
|
|
torch.cuda.ipc_collect() |
|
|
|
|
|
return embeddings |
|
|
|
|
|
except Exception as e: |
|
|
print(f"Face embedding failed: {e}") |
|
|
return [] |
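# Illustrative sketch (assumes two hypothetical files "a.jpg" and "b.jpg"): the
# embeddings above are L2-normalized, so faces can be compared by Euclidean
# distance, mirroring the 0.8 threshold used in `_get_ocr_characters_to_image`.
#
#     emb_a = _get_face_embedding(Image.open("a.jpg").convert("RGB"))
#     emb_b = _get_face_embedding(Image.open("b.jpg").convert("RGB"))
#     if emb_a and emb_b:
#         dist = np.linalg.norm(np.array(emb_a[0]) - np.array(emb_b[0]))
#         print("same person" if dist < 0.8 else "different person", round(dist, 3))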
|
|
|
|
|
@spaces.GPU |
|
|
def _get_scenes_extraction( |
|
|
video_file: str, |
|
|
threshold: float, |
|
|
offset_frames: int, |
|
|
crop_ratio: float |
|
|
) -> Tuple[List[Image.Image], List[Dict]]:
|
|
""" |
|
|
Extracts scenes from a video and returns cropped images along with information about each scene. |
|
|
|
|
|
Args: |
|
|
video_file (str): Path to the video file. |
|
|
threshold (float): Threshold for scene detection. |
|
|
offset_frames (int): Frame offset from the start of each scene. |
|
|
        crop_ratio (float): Fraction of the frame to crop from each border (0.0 to just under 0.5).
|
|
|
|
|
Returns: |
|
|
        Tuple[List[Image.Image], List[Dict]]: List of scene images and list of scene information,
        or ([], []) if an error occurs.
|
|
""" |
|
|
try: |
|
|
|
|
|
video_manager = VideoManager([video_file]) |
|
|
scene_manager = SceneManager() |
|
|
scene_manager.add_detector(ContentDetector(threshold=threshold)) |
|
|
video_manager.start() |
|
|
scene_manager.detect_scenes(video_manager) |
|
|
scene_list = scene_manager.get_scene_list() |
|
|
|
|
|
if len(scene_list) == 0: |
|
|
            # VideoManager.get_duration() returns (duration, start, end); use the
            # end timecode so the fallback scene spans the whole video.
            scene_list = [(video_manager.get_base_timecode(), video_manager.get_duration()[2])]
|
|
|
|
|
cap = cv2.VideoCapture(video_file) |
|
|
images: List[Image.Image] = [] |
|
|
scene_info: List[Dict] = [] |
|
|
|
|
|
for i, (start_time, end_time) in enumerate(scene_list): |
|
|
frame_number = int(start_time.get_frames()) + offset_frames |
|
|
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number) |
|
|
ret, frame = cap.read() |
|
|
|
|
|
if not ret: |
|
|
continue |
|
|
|
|
|
h, w = frame.shape[:2] |
|
|
|
|
|
|
|
|
ch, cw = int(h * crop_ratio), int(w * crop_ratio) |
|
|
cropped_frame = frame[ch:h-ch, cw:w-cw] |
|
|
|
|
|
|
|
|
img_rgb = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2RGB) |
|
|
images.append(Image.fromarray(img_rgb)) |
|
|
|
|
|
|
|
|
scene_info.append({ |
|
|
"index": i + 1, |
|
|
"start": start_time.get_seconds(), |
|
|
"end": end_time.get_seconds() |
|
|
}) |
|
|
|
|
|
if len(scene_info) == 0: |
|
|
cap.set(cv2.CAP_PROP_POS_FRAMES, offset_frames) |
|
|
ret, frame = cap.read() |
|
|
if ret: |
|
|
h, w = frame.shape[:2] |
|
|
|
|
|
ch, cw = int(h * crop_ratio), int(w * crop_ratio) |
|
|
cropped_frame = frame[ch:h-ch, cw:w-cw] |
|
|
|
|
|
img_rgb = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2RGB) |
|
|
images.append(Image.fromarray(img_rgb)) |
|
|
|
|
|
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) |
|
|
fps = cap.get(cv2.CAP_PROP_FPS) |
|
|
duration_seconds = total_frames / fps if fps > 0 else 0.0 |
|
|
|
|
|
scene_info.append({ |
|
|
"index": 1, |
|
|
"start": 0.0, |
|
|
"end": duration_seconds |
|
|
}) |
|
|
|
|
|
cap.release() |
|
|
return images, scene_info |
|
|
|
|
|
except Exception as e: |
|
|
print("Error in scenes_extraction:", e) |
|
|
return [], [] |
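# Usage sketch (illustrative; "clip.mp4" is a hypothetical path and the parameter
# values are only examples):
#
#     imgs, info = _get_scenes_extraction("clip.mp4", threshold=30.0,
#                                         offset_frames=10, crop_ratio=0.05)
#     for meta in info:
#         print(meta["index"], f"{meta['start']:.2f}s -> {meta['end']:.2f}s")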
|
|
|
|
|
@spaces.GPU |
|
|
def _get_image_list_description( |
|
|
    images: List[Tuple[Image.Image, Any]]
|
|
) -> List[str]: |
|
|
""" |
|
|
Generate brief visual descriptions for a list of PIL Images using Salamandra Vision. |
|
|
|
|
|
Args: |
|
|
        images: Gallery items; each entry is a (PIL.Image, caption) tuple.
|
|
|
|
|
Returns: |
|
|
List[str]: List of descriptions, one per image. |
|
|
""" |
|
|
    # Gallery entries arrive as (image, caption) tuples; keep only the image.
    list_images = [x[0] for x in images]
|
|
|
|
|
|
|
|
    path_model = MODEL_ID
|
|
processor = AutoProcessor.from_pretrained(path_model) |
|
|
model = LlavaOnevisionForConditionalGeneration.from_pretrained( |
|
|
path_model, |
|
|
torch_dtype=torch.float16, |
|
|
low_cpu_mem_usage=False |
|
|
).to("cuda") |
|
|
|
|
|
|
|
|
sys_prompt = ( |
|
|
"Ets un expert en narrativa visual. " |
|
|
"Descriu la imatge de manera molt breu i senzilla en català, " |
|
|
"explicant només l'acció principal que es veu. " |
|
|
"Respon amb una única frase curta (màxim 10–20 paraules), " |
|
|
"sense afegir detalls innecessaris ni descriure el fons." |
|
|
) |
|
|
|
|
|
all_results = [] |
|
|
|
|
|
for img in list_images: |
|
|
batch = [img] |
|
|
|
|
|
|
|
|
conversation = [ |
|
|
{"role": "system", "content": sys_prompt}, |
|
|
{"role": "user", "content": [ |
|
|
{"type": "image", "image": batch[0]}, |
|
|
{"type": "text", "text": ( |
|
|
"Descriu la imatge de manera molt breu i senzilla en català." |
|
|
)} |
|
|
]} |
|
|
] |
|
|
prompt_batch = processor.apply_chat_template(conversation, add_generation_prompt=True) |
|
|
|
|
|
|
|
|
inputs = processor(images=batch, text=prompt_batch, return_tensors="pt") |
|
|
for k, v in inputs.items(): |
|
|
if v.dtype.is_floating_point: |
|
|
inputs[k] = v.to("cuda", torch.float16) |
|
|
else: |
|
|
inputs[k] = v.to("cuda") |
|
|
|
|
|
|
|
|
output = model.generate(**inputs, max_new_tokens=1024) |
|
|
text = processor.decode(output[0], skip_special_tokens=True) |
|
|
lines = text.split("\n") |
|
|
|
|
|
|
|
|
        # The decoded text contains the whole chat transcript; keep only what
        # follows the "assistant" turn marker.
        desc = ""
|
|
for i, line in enumerate(lines): |
|
|
if line.lower().startswith(" assistant"): |
|
|
desc = "\n".join(lines[i+1:]).strip() |
|
|
break |
|
|
|
|
|
all_results.append(desc) |
|
|
|
|
|
del model |
|
|
torch.cuda.empty_cache() |
|
|
|
|
|
return all_results |
|
|
|
|
|
@spaces.GPU |
|
|
def _get_ocr_characters_to_image( |
|
|
image: Image.Image, |
|
|
    informacion_image: str,
|
|
    face_col: str
|
|
) -> Dict[str, Any]: |
|
|
""" |
|
|
Process an input image by detecting faces, generating face embeddings, |
|
|
performing K-nearest neighbors (KNN) matching against a known face database, |
|
|
and extracting OCR (Optical Character Recognition) text using EasyOCR. |
|
|
|
|
|
The function performs the following steps: |
|
|
1. Detects faces in the image and generates embeddings for each face. |
|
|
2. For each detected face, retrieves the top 3 closest embeddings from the |
|
|
reference database and determines the identity if the distance is below |
|
|
a defined threshold. |
|
|
3. Executes OCR using EasyOCR to extract textual content from the image. |
|
|
It filters the OCR output by removing uncommon or noisy words, and |
|
|
validates results using zipf word frequency to ensure linguistic relevance. |
|
|
4. Returns a dictionary containing metadata, detected identities, and OCR text. |
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
image : PIL.Image.Image |
|
|
The image to process. |
|
|
    informacion_image : str
        JSON-encoded metadata about the image (index, start time, end time).
|
|
    face_col : str
        A JSON-encoded list of dictionaries containing stored face embeddings
        and names.
|
|
|
|
|
Returns |
|
|
------- |
|
|
Dict[str, Any] |
|
|
A dictionary containing: |
|
|
- id: image identifier |
|
|
- start: start timestamp |
|
|
- end: end timestamp |
|
|
- faces: list of detected identities |
|
|
- ocr: extracted OCR text |
|
|
""" |
|
|
|
|
|
|
|
|
raw_faces = _get_face_embedding(image) |
|
|
informacion_image_dict = json.loads(informacion_image) |
|
|
face_col = json.loads(face_col) |
|
|
faces_detected = [] |
|
|
    if raw_faces is not None:
|
|
for f in raw_faces: |
|
|
embedding_image = f |
|
|
identity = "Desconegut" |
|
|
knn = [] |
|
|
|
|
|
|
|
|
if face_col and embedding_image is not None: |
|
|
try: |
|
|
num_embeddings = len(face_col) |
|
|
|
|
|
if num_embeddings < 1: |
|
|
knn = [] |
|
|
identity = "Desconegut" |
|
|
|
|
|
else: |
|
|
n_results = min(3, num_embeddings) |
|
|
|
|
|
embedding_image = np.array(embedding_image) |
|
|
|
|
|
distances_embedding = [] |
|
|
|
|
|
|
|
|
for image_base_datos in face_col: |
|
|
image_base_datos_embedding = np.array(image_base_datos["embedding"]) |
|
|
distance = np.linalg.norm(embedding_image - image_base_datos_embedding) |
|
|
distances_embedding.append({ |
|
|
"identity": image_base_datos["nombre"], |
|
|
"distance": float(distance) |
|
|
}) |
|
|
|
|
|
|
|
|
distances_embedding = sorted(distances_embedding, key=lambda x: x["distance"]) |
|
|
knn = distances_embedding[:n_results] |
|
|
|
|
|
|
|
|
if knn and knn[0]["distance"] < 0.8: |
|
|
identity = knn[0]["identity"] |
|
|
else: |
|
|
identity = "Desconegut" |
|
|
|
|
|
except Exception as e: |
|
|
print(f"Face KNN failed: {e}") |
|
|
knn = [] |
|
|
identity = "Desconegut" |
|
|
|
|
|
faces_detected.append(identity) |
|
|
|
|
|
|
|
|
ocr_text_easyocr = "" |
|
|
use_easyocr = True |
|
|
if use_easyocr: |
|
|
try: |
|
|
rgb = np.array(image) |
|
|
bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR) |
|
|
|
|
|
|
|
|
reader = easyocr.Reader(['en', 'es'], gpu=True) |
|
|
results = reader.readtext(bgr) |
|
|
|
|
|
|
|
|
ocr_text_easyocr = " ".join([text for _, text, _ in results]).strip() |
|
|
|
|
|
|
|
|
palabras_ocr_text = ocr_text_easyocr.split() |
|
|
palabras_ocr_text = [p for p in palabras_ocr_text if re.fullmatch(r'[A-Za-zÀ-ÿ]+', p)] |
|
|
|
|
|
|
|
|
            # for/else: keep the OCR text only if at least one token looks like a
            # real Catalan word (nonzero Zipf frequency); otherwise treat it as noise.
            for palabra in palabras_ocr_text:
|
|
if zipf_frequency(palabra, "ca") != 0.0: |
|
|
break |
|
|
else: |
|
|
ocr_text_easyocr = "" |
|
|
|
|
|
except Exception as e: |
|
|
print(f"OCR error: {e}") |
|
|
return {"id": informacion_image_dict["index"], |
|
|
"start": informacion_image_dict["start"], |
|
|
"end": informacion_image_dict["end"], |
|
|
"faces": faces_detected, |
|
|
"ocr": ""} |
|
|
|
|
|
|
|
|
informacion_image_completo = { |
|
|
"id": informacion_image_dict["index"], |
|
|
"start": informacion_image_dict["start"], |
|
|
"end": informacion_image_dict["end"], |
|
|
"faces": faces_detected, |
|
|
"ocr": ocr_text_easyocr, |
|
|
} |
|
|
|
|
|
return informacion_image_completo |
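# Illustrative sketch of the expected inputs (values are made up): both
# `informacion_image` and `face_col` arrive as JSON strings, exactly as the
# Gradio textboxes send them.
#
#     meta = json.dumps({"index": 1, "start": 0.0, "end": 1.2})
#     known = json.dumps([{"nombre": "Anna", "embedding": [0.0] * 512}])
#     result = _get_ocr_characters_to_image(img, meta, known)
#     # -> {"id": 1, "start": 0.0, "end": 1.2, "faces": [...], "ocr": "..."}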
|
|
|
|
|
|
|
|
@spaces.GPU |
|
|
def _extract_keyframes_every_second( |
|
|
video: str, |
|
|
crop_ratio: float = 0.1 |
|
|
) -> Tuple[List[np.ndarray], List[dict]]: |
|
|
""" |
|
|
Extracts one keyframe per second from a video file. |
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
video : str |
|
|
Path to the input video file. |
|
|
crop_ratio : float, optional |
|
|
Percentage of the frame to crop from each border before resizing |
|
|
back to the original dimensions. Default is 0.1 (10%). |
|
|
|
|
|
Returns |
|
|
------- |
|
|
images : List[np.ndarray] |
|
|
        List of extracted frames as RGB NumPy arrays.
|
|
frames_info : List[dict] |
|
|
List of metadata dictionaries for each extracted frame. Each dictionary contains: |
|
|
- "index": sequential index starting from 1 |
|
|
- "start": starting second of the interval represented by the frame |
|
|
- "end": ending second of the interval represented by the frame |
|
|
|
|
|
Notes |
|
|
----- |
|
|
A temporary directory is automatically created to store intermediate |
|
|
images. These images are not returned but can be useful for debugging. |
|
|
    The directory is not removed automatically.
|
|
""" |
|
|
|
|
|
|
|
|
tmp_dir = Path(tempfile.mkdtemp()) |
|
|
|
|
|
|
|
|
cap = cv2.VideoCapture(str(video)) |
|
|
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0 |
|
|
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) |
|
|
duration = total_frames / fps |
|
|
|
|
|
images = [] |
|
|
frames_info = [] |
|
|
|
|
|
|
|
|
for sec in range(int(duration)): |
|
|
frame_number = int(sec * fps) |
|
|
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number) |
|
|
ret, frame = cap.read() |
|
|
if not ret: |
|
|
break |
|
|
|
|
|
|
|
|
h, w = frame.shape[:2] |
|
|
ch, cw = int(h * crop_ratio), int(w * crop_ratio) |
|
|
cropped = frame[ch:h-ch, cw:w-cw] |
|
|
|
|
|
|
|
|
cropped = cv2.resize(cropped, (w, h)) |
|
|
cropped_rgb = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB) |
|
|
|
|
|
timestamp = frame_number / fps |
|
|
|
|
|
|
|
|
tmp_path = tmp_dir / f"frame_{sec:03d}.jpg" |
|
|
cv2.imwrite(str(tmp_path), cv2.cvtColor(cropped_rgb, cv2.COLOR_RGB2BGR)) |
|
|
|
|
|
|
|
|
        # Append the RGB version so downstream consumers (PIL/Gradio) see correct colors.
        images.append(cropped_rgb)
|
|
frames_info.append({ |
|
|
"index": sec + 1, |
|
|
"start": sec, |
|
|
"end": sec + 1 |
|
|
}) |
|
|
|
|
|
|
|
|
cap.release() |
|
|
|
|
|
return images, frames_info |
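# Usage sketch (illustrative; "clip.mp4" is a hypothetical path): one RGB frame
# per second of video.
#
#     frames, info = _extract_keyframes_every_second("clip.mp4", crop_ratio=0.1)
#     print(len(frames), "frames extracted;", info[:2])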
|
|
|
|
|
|
|
|
""" |
|
|
# ============================================================================== |
|
|
# API Helpers |
|
|
# ============================================================================== |
|
|
Collection of public-facing API endpoints used by the application. |
|
|
|
|
|
This section exposes functions that process incoming requests, |
|
|
perform validation, interact with the model inference helpers, |
|
|
and return structured responses. Each endpoint is designed to be |
|
|
stateless and safe to call from external clients.
|
|
|
|
|
Endpoints in this module typically: |
|
|
- Receive raw data (images, text, base64-encoded content, etc.) |
|
|
- Preprocess inputs before forwarding them to internal inference utilities |
|
|
- Handle optional parameters such as temperature or token limits |
|
|
- Return JSON-serializable dictionaries as responses |
|
|
|
|
|
The functions below constitute the interface layer between users |
|
|
and the underlying model logic implemented in the helper utilities. |
|
|
# ============================================================================== |
|
|
""" |
|
|
|
|
|
def describe_raw(image: Image.Image, text: str = "Describe la imagen con detalle.", |
|
|
max_new_tokens: int = 256, temperature: float = 0.7) -> Dict[str, str]: |
|
|
""" |
|
|
Endpoint to generate a detailed description of an input image. |
|
|
|
|
|
This function receives an image and an optional text prompt, then forwards |
|
|
the request to the internal inference helper `_infer_one`. It returns a JSON- |
|
|
serializable dictionary containing the generated text description. |
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
image : PIL.Image.Image |
|
|
The input image to be analyzed and described. |
|
|
text : str, optional |
|
|
Instruction or prompt for the model guiding how the image should be described. |
|
|
Defaults to a general "describe in detail" prompt (in Spanish). |
|
|
max_new_tokens : int, optional |
|
|
Maximum number of tokens the model is allowed to generate. Default is 256. |
|
|
temperature : float, optional |
|
|
Sampling temperature controlling randomness of the output. Default is 0.7. |
|
|
|
|
|
Returns |
|
|
------- |
|
|
Dict[str, str] |
|
|
A dictionary with a single key `"text"` containing the generated description. |
|
|
""" |
|
|
result = _infer_one(image, text, max_new_tokens, temperature, context=None) |
|
|
return {"text": result} |
|
|
|
|
|
|
|
|
def describe_batch( |
|
|
images: List[Image.Image], |
|
|
context_json: str, |
|
|
max_new_tokens: int = 256, |
|
|
temperature: float = 0.7 |
|
|
) -> List[str]: |
|
|
""" |
|
|
Batch endpoint for the image description engine. |
|
|
|
|
|
This endpoint receives a list of images along with an optional JSON-formatted |
|
|
context, and returns a list of textual descriptions generated by the model. |
|
|
Each image is processed individually using the internal `_infer_one` function, |
|
|
optionally incorporating the context into the prompt. |
|
|
|
|
|
Args: |
|
|
images (List[Image.Image]): |
|
|
A list of PIL Image objects to describe. |
|
|
context_json (str): |
|
|
A JSON-formatted string providing additional context for the prompt. |
|
|
If empty or invalid, no context will be used. |
|
|
max_new_tokens (int, optional): |
|
|
Maximum number of tokens to generate per image. Defaults to 256. |
|
|
temperature (float, optional): |
|
|
Sampling temperature controlling text randomness. Defaults to 0.7. |
|
|
|
|
|
Returns: |
|
|
List[str]: A list of text descriptions, one for each input image, in order. |
|
|
""" |
|
|
try: |
|
|
context = json.loads(context_json) if context_json else None |
|
|
except Exception: |
|
|
context = None |
|
|
outputs: List[str] = [] |
|
|
    for img in images:
        # Gallery items may arrive as (image, caption) tuples; unwrap if needed.
        if isinstance(img, (tuple, list)):
            img = img[0]
        outputs.append(_infer_one(img, text="Describe la imagen con detalle.",
                                  max_new_tokens=max_new_tokens,
                                  temperature=temperature, context=context))
|
|
return outputs |
|
|
|
|
|
def face_image_embedding_casting(image: Image.Image) -> Tuple[List[Image.Image], List[Dict[str, Any]]]:
    """
    Endpoint wrapper around `_get_face_embedding_casting`.

    Returns a gallery-ready list of face crops and a JSON-serializable list of
    indexed embedding vectors, or ([], []) if no faces are detected.
    """
|
|
results = _get_face_embedding_casting(image) |
|
|
|
|
|
if not results: |
|
|
return [], [] |
|
|
|
|
|
|
|
|
face_crops = [r["face_crop"] for r in results] |
|
|
|
|
|
|
|
|
face_embeddings = [ |
|
|
{ |
|
|
"index": i, |
|
|
"embedding": r["embedding"] |
|
|
} |
|
|
for i, r in enumerate(results) |
|
|
] |
|
|
|
|
|
return face_crops, face_embeddings |
|
|
|
|
|
def face_image_embedding(image: Image.Image) -> List[List[float]]:
|
|
""" |
|
|
Endpoint to generate a face embedding for a given image. |
|
|
|
|
|
This function wraps the core `_get_face_embedding` logic for use in endpoints. |
|
|
    The MTCNN and FaceNet models are lazy-loaded on first use.
|
|
|
|
|
    Args:
        image (Image.Image): Input image containing one or more faces.

    Returns:
        List[List[float]]: One normalized embedding vector per detected face,
        or an empty list if no face is detected.
|
|
""" |
|
|
return _get_face_embedding(image) |
|
|
|
|
|
|
|
|
def scenes_extraction( |
|
|
video_file: str, |
|
|
threshold: float, |
|
|
offset_frames: int, |
|
|
crop_ratio: float |
|
|
) -> Tuple[List[Image.Image], List[Dict]]:
|
|
""" |
|
|
Endpoint wrapper for extracting scenes from a video. |
|
|
|
|
|
This function acts as a wrapper around the internal `_get_scenes_extraction` function. |
|
|
It handles a video file provided as a string path (as Gradio temporarily saves uploaded files) |
|
|
and returns the extracted scene images along with scene metadata. |
|
|
|
|
|
Args: |
|
|
video_file (str): Path to the uploaded video file. |
|
|
threshold (float): Threshold for scene detection. |
|
|
offset_frames (int): Frame offset from the start of each detected scene. |
|
|
        crop_ratio (float): Fraction of each extracted frame to crop from every border.
|
|
|
|
|
Returns: |
|
|
        Tuple[List[Image.Image], List[Dict]]: A tuple containing:
|
|
- A list of PIL Images representing each extracted scene. |
|
|
- A list of dictionaries with scene information (index, start time, end time). |
|
|
            Returns ([], []) if an error occurs during extraction.
|
|
""" |
|
|
return _get_scenes_extraction(video_file, threshold, offset_frames, crop_ratio) |
|
|
|
|
|
|
|
|
def describe_list_images( |
|
|
images: List[Image.Image] |
|
|
) -> List[str]: |
|
|
""" |
|
|
Endpoint wrapper for generating brief descriptions of a list of images. |
|
|
|
|
|
This function acts as a wrapper around the internal `_get_image_list_description` function. |
|
|
It takes a list of PIL Images and returns a list of short textual descriptions for each image. |
|
|
|
|
|
Args: |
|
|
images (List[Image.Image]): A list of PIL Image objects to describe. |
|
|
|
|
|
Returns: |
|
|
List[str]: A list of strings, where each string is a brief description of the corresponding image. |
|
|
""" |
|
|
return _get_image_list_description(images) |
|
|
|
|
|
|
|
|
def add_ocr_characters_to_image( |
|
|
image: Image.Image, |
|
|
    informacion_image: str,
|
|
    face_col: str
|
|
) -> Dict[str, Any]: |
|
|
""" |
|
|
Endpoint wrapper for processing an image to extract face identities and OCR text. |
|
|
|
|
|
This function serves as a wrapper for the internal `_get_ocr_characters_to_image` |
|
|
function. It receives an image, metadata describing that image, and a collection |
|
|
of stored face embeddings. The wrapped internal function performs the following: |
|
|
|
|
|
1. Detects faces and generates embeddings for each detected face. |
|
|
2. Matches these embeddings against a reference database using K-nearest neighbors. |
|
|
3. Runs OCR (Optical Character Recognition) on the image to extract textual content. |
|
|
4. Applies filtering to discard invalid or noisy OCR results. |
|
|
5. Returns a structured dictionary containing image metadata, identified faces, |
|
|
and OCR-extracted text. |
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
image : PIL.Image.Image |
|
|
The image object to be analyzed. |
|
|
    informacion_image : str
        JSON-encoded metadata describing the image (index, start timestamp, end timestamp).
|
|
    face_col : str
        A JSON-encoded list of dictionaries representing stored face embeddings
        and related identity information, used for similarity matching.
|
|
|
|
|
Returns |
|
|
------- |
|
|
Dict[str, Any] |
|
|
A dictionary containing: |
|
|
- id: the image identifier |
|
|
- start: start timestamp |
|
|
- end: end timestamp |
|
|
- faces: detected face identities |
|
|
- ocr: the extracted OCR text |
|
|
""" |
|
|
    return _get_ocr_characters_to_image(image, informacion_image, face_col)
|
|
|
|
|
|
|
|
def extract_keyframes_endpoint( |
|
|
video_path: str, |
|
|
crop_ratio: float = 0.1 |
|
|
) -> Tuple[List[np.ndarray], List[dict]]:
|
|
""" |
|
|
Endpoint wrapper for extracting one keyframe per second from a video. |
|
|
|
|
|
This function serves as a wrapper around the internal |
|
|
`_extract_keyframes_every_second` function. It receives a path to a |
|
|
video file and an optional cropping ratio, and delegates the extraction |
|
|
of frames to the internal function. The wrapped internal function |
|
|
performs the following: |
|
|
|
|
|
1. Loads the video and determines its duration and FPS. |
|
|
2. Extracts exactly one frame per second of video playback. |
|
|
3. Crops each frame by a proportional margin and resizes it back to the |
|
|
original resolution. |
|
|
4. Optionally stores intermediate images in a temporary directory for |
|
|
debugging purposes. |
|
|
5. Returns the frames as NumPy arrays along with structured metadata |
|
|
describing the extracted intervals. |
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
video_path : str |
|
|
Path to the input video file. |
|
|
crop_ratio : float, optional |
|
|
Percentage of the frame to crop from each border before resizing |
|
|
(default is 0.1, equivalent to 10%). |
|
|
|
|
|
Returns |
|
|
------- |
|
|
    Tuple[List[np.ndarray], List[dict]]
|
|
        A tuple containing:
|
|
- frames: list of extracted frames represented as NumPy arrays |
|
|
- metadata: list of dictionaries with: |
|
|
* index: sequential frame identifier |
|
|
* start: starting timestamp of the 1-second interval |
|
|
* end: ending timestamp of the interval |
|
|
""" |
|
|
images, frames_info = _extract_keyframes_every_second(video_path, crop_ratio) |
|
|
|
|
|
return images, frames_info |
|
|
|
|
|
""" |
|
|
# ============================================================================== |
|
|
# UI & Endpoints |
|
|
# ============================================================================== |
|
|
Collection of Gradio interface elements and API endpoints used by the application. |
|
|
|
|
|
This section defines the user-facing interface for Salamandra Vision 7B, |
|
|
allowing users to interact with the model through images, text prompts, |
|
|
video uploads, and batch operations. |
|
|
|
|
|
The components and endpoints in this module typically: |
|
|
- Accept images, text, or video files from the user |
|
|
- Apply optional parameters such as temperature, token limits, or crop ratios |
|
|
- Preprocess inputs and invoke internal inference or utility functions |
|
|
- Return structured outputs, including text descriptions, JSON metadata, |
|
|
or image galleries |
|
|
|
|
|
All endpoints are designed to be stateless, safe for concurrent calls, |
|
|
and compatible with both interactive UI usage and programmatic API access. |
|
|
# ============================================================================== |
|
|
""" |
|
|
def _compose_prompt(user_text: str, context: Optional[Dict] = None) -> List[Dict]: |
|
|
""" |
|
|
Build the chat template with an image, text, and optional context. |
|
|
|
|
|
Args: |
|
|
user_text (str): Text provided by the user. |
|
|
context (Optional[Dict]): Optional additional context. |
|
|
|
|
|
Returns: |
|
|
List[Dict]: A conversation template for the model, including the image and text. |
|
|
""" |
|
|
ctx_txt = "" |
|
|
if context: |
|
|
try: |
|
|
|
|
|
ctx_txt = "\n\nAdditional context:\n" + json.dumps(context, ensure_ascii=False)[:2000] |
|
|
except Exception: |
|
|
pass |
|
|
|
|
|
user_txt = (user_text or "Describe the image in detail.") + ctx_txt |
|
|
convo = [ |
|
|
{ |
|
|
"role": "user", |
|
|
"content": [ |
|
|
{"type": "image"}, |
|
|
{"type": "text", "text": user_txt}, |
|
|
], |
|
|
} |
|
|
] |
|
|
return convo |
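# Illustrative example: `_compose_prompt("Què passa a la imatge?")` returns
#
#     [{"role": "user",
#       "content": [{"type": "image"},
#                   {"type": "text", "text": "Què passa a la imatge?"}]}]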
|
|
|
|
|
custom_css = """ |
|
|
h2 { |
|
|
background: #e3e4e6 !important; |
|
|
padding: 14px 22px !important; |
|
|
border-radius: 14px !important; |
|
|
box-shadow: 0 4px 12px rgba(0,0,0,0.08) !important; |
|
|
    display: block !important;   /* take the full width */
|
|
    width: 100% !important;      /* make sure it is 100% */
|
|
margin: 20px auto !important; |
|
|
text-align:center; |
|
|
} |
|
|
""" |
|
|
with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU", css=custom_css, theme=gr.themes.Soft()) as demo:
|
|
|
|
|
gr.Markdown('<h1 style="text-align:center">SALAMANDRA VISION 7B · ZEROGPU</h1>') |
|
|
gr.Markdown("---") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gr.Markdown('<h2 style="text-align:center">Inferència per imatge única</h2>') |
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
in_img = gr.Image(label="Imatge", type="pil") |
|
|
in_txt = gr.Textbox(label="Text/prompt", value="Descriu la imatge amb detall (ES/CA).") |
|
|
max_new = gr.Slider(16, 1024, value=256, step=16, label="màx_tokens nous") |
|
|
temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperatura") |
|
|
btn = gr.Button("Genera", variant="primary") |
|
|
with gr.Column(): |
|
|
out = gr.Textbox(label="Descripció", lines=18) |
|
|
|
|
|
btn.click(_infer_one, [in_img, in_txt, max_new, temp], out, api_name="describe", concurrency_limit=1) |
|
|
gr.Markdown("---") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
    gr.Markdown('<h2 style="text-align:center">Lot d’imatges</h2>')
|
|
    batch_in_images = gr.Gallery(label="Lot d’imatges", show_label=False, columns=4, height="auto")
|
|
batch_context = gr.Textbox(label="context_json", value="{}", lines=4) |
|
|
batch_max = gr.Slider(16, 1024, value=256, step=16, label="màx_tokens nous") |
|
|
batch_temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperatura") |
|
|
batch_btn = gr.Button("Descriu el lot", variant="primary") |
|
|
batch_out = gr.JSON(label="Descripcions (llista)") |
|
|
|
|
|
batch_btn.click( |
|
|
describe_batch, |
|
|
[batch_in_images, batch_context, batch_max, batch_temp], |
|
|
batch_out, |
|
|
api_name="predict", |
|
|
concurrency_limit=1 |
|
|
) |
|
|
gr.Markdown("---") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gr.Markdown('<h2 style="text-align:center">Embeddings facials casting</h2>') |
|
|
|
|
|
with gr.Row(): |
|
|
face_img = gr.Image(label="Imatge per embedding facial", type="pil") |
|
|
|
|
|
with gr.Row(): |
|
|
face_btn = gr.Button("Obté embedding facial", variant="primary") |
|
|
|
|
|
with gr.Row(): |
|
|
face_crops = gr.Gallery(label="Cares detectades", columns=3, height="auto") |
|
|
|
|
|
with gr.Row(): |
|
|
face_embeddings = gr.JSON(label="Vectors d'embedding") |
|
|
|
|
|
face_btn.click( |
|
|
face_image_embedding_casting, |
|
|
[face_img], |
|
|
[face_crops, face_embeddings], |
|
|
api_name="face_image_embedding_casting", |
|
|
concurrency_limit=1 |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gr.Markdown('<h2 style="text-align:center">Embeddings facials</h2>') |
|
|
with gr.Row(): |
|
|
face_img = gr.Image(label="Imatge per embedding facial", type="pil") |
|
|
with gr.Row(): |
|
|
face_btn = gr.Button("Obté embedding facial", variant="primary") |
|
|
with gr.Row(): |
|
|
face_out = gr.JSON(label="Embedding facial (vector)") |
|
|
face_btn.click(face_image_embedding, [face_img], face_out, api_name="face_image_embedding", concurrency_limit=1) |
|
|
gr.Markdown("---") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gr.Markdown('<h2 style="text-align:center">Extracció d’escenes de vídeo</h2>') |
|
|
with gr.Row(): |
|
|
video_file = gr.Video(label="Puja un vídeo") |
|
|
with gr.Row(): |
|
|
threshold = gr.Slider(0.0, 100.0, value=30.0, step=1.0, label="Llindar") |
|
|
offset_frames = gr.Slider(0, 240.0, value=240.0, step=1.0, label="Desplaçament de frames") |
|
|
        # The crop ratio is removed from each border; values >= 0.5 would leave an
        # empty frame, so keep the slider below that.
        crop_ratio = gr.Slider(0.0, 0.45, value=0.1, step=0.05, label="Raó de retall")
|
|
with gr.Row(): |
|
|
scenes_btn = gr.Button("Extreu escenes", variant="primary") |
|
|
with gr.Row(): |
|
|
scenes_gallery_out = gr.Gallery(label="Fotogrames clau de l’escena", show_label=False, columns=4, height="auto") |
|
|
scenes_info_out = gr.JSON(label="Informació de l’escena") |
|
|
|
|
|
scenes_btn.click( |
|
|
scenes_extraction, |
|
|
inputs=[video_file, threshold, offset_frames, crop_ratio], |
|
|
outputs=[scenes_gallery_out, scenes_info_out], |
|
|
api_name="scenes_extraction", |
|
|
concurrency_limit=1 |
|
|
) |
|
|
gr.Markdown("---") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
    gr.Markdown('<h2 style="text-align:center">Extracció de frames de vídeo</h2>')
|
|
with gr.Row(): |
|
|
video_file = gr.Video(label="Puja un vídeo") |
|
|
with gr.Row(): |
|
|
scenes_btn = gr.Button("Extreu frames", variant="primary") |
|
|
with gr.Row(): |
|
|
scenes_gallery_out = gr.Gallery(label="Fotogrames clau de l’escena", show_label=False, columns=4, height="auto") |
|
|
scenes_info_out = gr.JSON(label="Informació de l’escena") |
|
|
|
|
|
scenes_btn.click( |
|
|
extract_keyframes_endpoint, |
|
|
inputs=[video_file], |
|
|
outputs=[scenes_gallery_out, scenes_info_out], |
|
|
api_name="keyframes_every_second_extraction", |
|
|
concurrency_limit=1 |
|
|
) |
|
|
gr.Markdown("---") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gr.Markdown('<h2 style="text-align:center">Descripció per lots amb Salamandra Vision</h2>') |
|
|
with gr.Row(): |
|
|
        img_input = gr.Gallery(label="Lot d’imatges", show_label=False)
|
|
with gr.Row(): |
|
|
describe_btn = gr.Button("Genera descripcions", variant="primary") |
|
|
with gr.Row(): |
|
|
desc_output = gr.Textbox(label="Descripcions de les imatges") |
|
|
|
|
|
describe_btn.click( |
|
|
describe_list_images, |
|
|
inputs=[img_input], |
|
|
outputs=desc_output, |
|
|
api_name="describe_images", |
|
|
concurrency_limit=1 |
|
|
) |
|
|
gr.Markdown("---") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
    gr.Markdown('<h2 style="text-align:center">Afegiu OCR i informació de personatges a la imatge</h2>')
|
|
with gr.Row(): |
|
|
img_input = gr.Image(label="Imatge per ampliar la descripció", type="pil") |
|
|
info_input = gr.Textbox( |
|
|
label="Diccionari informacion_image (format JSON)", |
|
|
placeholder='{"index": 0, "start": 0.0, "end": 1.2}', |
|
|
lines=3 |
|
|
) |
|
|
with gr.Row(): |
|
|
faces_input = gr.Textbox( |
|
|
label="Llistat de diccionaris face_col (format JSON)", |
|
|
placeholder='[{"nombre": "Anna", "embedding": [0.12, 0.88, ...]}, ...]', |
|
|
lines=5 |
|
|
) |
|
|
with gr.Row(): |
|
|
process_btn = gr.Button("Processar imatge (OCR + Persones)", variant="primary") |
|
|
with gr.Row(): |
|
|
output_json = gr.JSON(label="Resultat complet") |
|
|
|
|
|
process_btn.click( |
|
|
add_ocr_characters_to_image, |
|
|
inputs=[img_input, info_input, faces_input], |
|
|
outputs=output_json, |
|
|
api_name="add_ocr_and_faces", |
|
|
concurrency_limit=1 |
|
|
) |
|
|
|
|
|
demo.queue(max_size=16).launch(show_error=True, share=True)
|
|
|
|
|
|