Spaces:

VeuReu
/

svision

Running on Zero

App Files Files Community

svision / app.py

VeuReu

Upload 8 files

82a79d6 verified about 2 months ago

raw

history blame

3.06 kB

	# app.py
	import os
	from typing import Dict
	import gradio as gr
	import spaces
	import torch
	from PIL import Image
	from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

	# Hugging Face model repository to load; can be overridden through the
	# MODEL_ID environment variable.
	MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-vision")
	# Half precision: passed as torch_dtype when loading the model and again
	# when moving the processor outputs to the device.
	DTYPE = torch.float16
	# Device the model and its inputs are moved to inside infer_core().
	DEVICE = "cuda"

	# Module-level cache slots; both stay None until _lazy_load() fills them.
	_model = None
	_processor = None

	def _lazy_load():
	    """Return the cached ``(model, processor)`` pair, loading both on first use.

	    If either module-level cache slot is still ``None``, the processor and
	    the model are both (re)loaded from ``MODEL_ID`` and stored in
	    ``_processor`` / ``_model`` before being returned.
	    """
	    global _model, _processor
	    # Fast path: everything is already cached.
	    if _model is not None and _processor is not None:
	        return _model, _processor
	    _processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
	    model_options = {
	        "torch_dtype": DTYPE,
	        "low_cpu_mem_usage": True,
	        "trust_remote_code": True,
	        "use_safetensors": True,
	        # No automatic placement: the caller moves the model to the device.
	        "device_map": None,
	    }
	    _model = LlavaOnevisionForConditionalGeneration.from_pretrained(MODEL_ID, **model_options)
	    return _model, _processor

	def _compose_prompt(user_text: str):
	convo = [{"role": "user", "content": [{"type": "image"},
	{"type": "text", "text": user_text or "Describe la imagen con detalle."}]}]
	return convo

	@spaces.GPU
	def infer_core(image: Image.Image, text: str, max_new_tokens: int = 256, temperature: float = 0.7) -> str:
	    """Generate a text answer for ``image`` conditioned on ``text``.

	    Args:
	        image: PIL image to describe.
	        text: User prompt; a falsy value falls back to the default prompt
	            chosen by ``_compose_prompt``.
	        max_new_tokens: Upper bound on the number of generated tokens.
	        temperature: Sampling temperature. Values <= 0 select greedy
	            (deterministic) decoding instead of sampling.

	    Returns:
	        The newly generated text only (the prompt is not echoed back),
	        with surrounding whitespace stripped.

	    Raises:
	        gr.Error: If no image was supplied.
	    """
	    # The chat template always contains an image placeholder, so a request
	    # without an image cannot be satisfied; report it clearly up front.
	    if image is None:
	        raise gr.Error("Falta la imagen: sube una imagen antes de generar.")
	    model, processor = _lazy_load()
	    prompt = processor.apply_chat_template(_compose_prompt(text), add_generation_prompt=True)
	    model = model.to(DEVICE)
	    inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE, DTYPE)
	    # `temperature` only has an effect when sampling is enabled, and a
	    # temperature of 0 is rejected by sampling; map it to greedy decoding.
	    temperature = float(temperature)
	    gen_kwargs = {"max_new_tokens": int(max_new_tokens)}
	    if temperature > 0:
	        gen_kwargs["do_sample"] = True
	        gen_kwargs["temperature"] = temperature
	    else:
	        gen_kwargs["do_sample"] = False
	    with torch.inference_mode():
	        out = model.generate(**inputs, **gen_kwargs)
	    # generate() returns prompt + continuation; keep only the continuation
	    # so the caller does not receive the prompt text as part of the answer.
	    prompt_len = inputs["input_ids"].shape[-1]
	    return processor.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

	# ---------- UI ----------
	# NOTE(review): the nesting below is reconstructed from a flattened source;
	# confirm it matches the intended layout.
	with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
	    gr.Markdown("## Salamandra-Vision 7B · ZeroGPU\nImagen + texto → descripción.")
	    with gr.Row():
	        # Input side: image, prompt and generation controls.
	        with gr.Column():
	            in_img = gr.Image(label="Imagen", type="pil")
	            in_txt = gr.Textbox(label="Texto/prompt", value="Describe la imagen con detalle (ES/CA).")
	            max_new = gr.Slider(minimum=16, maximum=1024, value=256, step=16, label="max_new_tokens")
	            temp = gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.05, label="temperature")
	            btn = gr.Button("Generar", variant="primary")
	        # Output side: generated description.
	        with gr.Column():
	            out = gr.Textbox(label="Descripción", lines=18)
	    # Wire the button to the inference function and expose it as "describe".
	    btn.click(
	        fn=infer_core,
	        inputs=[in_img, in_txt, max_new, temp],
	        outputs=out,
	        api_name="describe",
	    )

	# ---------- Pure API (no UI) ----------
	# Expose a clean REST endpoint (multipart/form-data or base64 JSON) that does
	# not depend on UI components.
	# /api/describe_raw -> receives {image,file} and simple fields.
	@gr.api()
	@spaces.GPU
	def describe_raw(image: gr.File, text: str = "Describe la imagen con detalle.",
	                 max_new_tokens: int = 256, temperature: float = 0.7) -> Dict[str, str]:
	    """Describe an uploaded image file and return the result as a dict.

	    Args:
	        image: Value handed to ``PIL.Image.open``.
	            NOTE(review): presumably a file path or file-like object
	            supplied by Gradio; confirm against the Gradio version in use.
	        text: Prompt forwarded to ``infer_core``.
	        max_new_tokens: Forwarded to ``infer_core``.
	        temperature: Forwarded to ``infer_core``.

	    Returns:
	        ``{"text": <generated text>}``.
	    """
	    # Keep the image open for the duration of inference and close the
	    # underlying file deterministically afterwards.
	    with Image.open(image) as img:
	        result = infer_core(img, text, max_new_tokens, temperature)
	    return {"text": result}

	# `concurrency_count` was removed from Blocks.queue() in Gradio 4+, and this
	# file needs a newer Gradio anyway (it uses gr.api). Its replacement,
	# `default_concurrency_limit`, keeps one request per event running at a time;
	# up to 16 further requests wait in the queue.
	demo.queue(default_concurrency_limit=1, max_size=16).launch()