# Hugging Face Space file: Home_project / app.py
# Last commit: "Update app.py" by Chaste20 (3b08fc5, verified)
import torch
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText
from peft import PeftModel
# -------------------------
# CONFIG
# -------------------------
# Base vision-language model the adapter is applied on top of.
BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
# Fine-tuned checkpoint loaded as a PEFT adapter (see load_model()).
FINETUNED_MODEL_ID = "Chaste20/smolvlm2-asl-ql-2"
# Prefer GPU with bfloat16 when CUDA is available; otherwise CPU/float32.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
# Question used when the user leaves the textbox empty.
DEFAULT_QUESTION = "What sign language letter is this image?"
# The 26 uppercase letters "A".."Z" the demo maps answers onto.
ALLOWED_LETTERS = [chr(ord("A") + i) for i in range(26)]
# Lazily initialized singletons, populated on first call to load_model().
processor = None
model = None
def load_model():
    """Return the (processor, model) pair, building them once and caching in globals.

    Downloads the base SmolVLM2 checkpoint, applies the fine-tuned PEFT
    adapter, and prepares the model for inference (eval mode, KV cache on).
    Subsequent calls return the cached pair without reloading.
    """
    global processor, model

    # Fast path: both singletons already built.
    if processor is not None and model is not None:
        return processor, model

    processor = AutoProcessor.from_pretrained(BASE_MODEL_ID)

    on_gpu = torch.cuda.is_available()
    base_model = AutoModelForImageTextToText.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=DTYPE,
        # Let HF shard/place the model automatically on GPU; plain CPU otherwise.
        device_map="auto" if on_gpu else None,
    )

    adapted = PeftModel.from_pretrained(
        base_model,
        FINETUNED_MODEL_ID,
        torch_dtype=DTYPE,
    )
    adapted.to(DEVICE)
    adapted.eval()
    # Enable the KV cache for faster autoregressive generation.
    adapted.config.use_cache = True

    model = adapted
    return processor, model
def extract_letter(raw_text: str) -> str:
    """Return the first ASCII letter in *raw_text*, uppercased, or "?" if none.

    Generalization over the original: matching is now case-insensitive, so a
    lowercase model answer such as "a" maps to "A" instead of falling through
    to "?". Behavior for uppercase input is unchanged.

    The check is restricted to ASCII alphabetic characters so that non-ASCII
    letters (e.g. "ß", whose .upper() is the two-character "SS") are skipped,
    matching the original A-Z whitelist semantics.
    """
    for ch in raw_text:
        if ch.isascii() and ch.isalpha():
            # ASCII alpha always uppercases to a single A-Z character.
            return ch.upper()
    return "?"
@torch.inference_mode()
def guardio_predict(image, question: str):
    """Answer *question* about *image* and map the model's reply to an ASL letter.

    Parameters
    ----------
    image : PIL.Image.Image | numpy.ndarray | None
        Uploaded handshape image (Gradio normally passes a PIL image; raw
        arrays are converted).
    question : str
        User question; blank input falls back to DEFAULT_QUESTION.

    Returns
    -------
    str
        Markdown-formatted prediction, or a warning/fallback message.
    """
    if image is None:
        return "⚠️ Please upload an image of an ASL handshape."
    if not question or not question.strip():
        question = DEFAULT_QUESTION

    # Normalize whatever Gradio handed us into an RGB PIL image.
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    if image.mode != "RGB":
        image = image.convert("RGB")

    processor, model = load_model()

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image"},
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )
    inputs = processor(
        text=[text],
        images=[[image]],
        padding=True,
        return_tensors="pt",
    )
    # BUG FIX: only floating-point tensors (e.g. pixel_values) may be cast to
    # DTYPE. The original cast every tensor, which silently corrupted the
    # integer input_ids / attention_mask by converting them to bfloat16.
    inputs = {
        k: v.to(DEVICE, dtype=DTYPE) if v.is_floating_point() else v.to(DEVICE)
        for k, v in inputs.items()
    }

    output_ids = model.generate(
        **inputs,
        max_new_tokens=8,
        do_sample=False,
        num_beams=4,
        # temperature removed: it is ignored (and triggers a warning) when
        # do_sample=False.
        pad_token_id=processor.tokenizer.eos_token_id,
    )

    # BUG FIX: decode only the newly generated tokens. generate() returns the
    # prompt followed by the answer, so decoding the full sequence made
    # extract_letter() pick up a letter from the question text itself
    # (e.g. "W" from "What sign language letter ..."), not the prediction.
    prompt_len = inputs["input_ids"].shape[1]
    raw_text = processor.batch_decode(
        output_ids[:, prompt_len:],
        skip_special_tokens=True,
    )[0].strip()

    letter = extract_letter(raw_text)
    if letter == "?":
        return (
            "❓ I couldn’t confidently map this to a single A–Z letter.\n\n"
            f"Raw model output: `{raw_text}`"
        )
    return f"🔤 **Predicted letter: {letter}**\n\n`Raw output: {raw_text}`"
# --- Gradio UI wiring ----------------------------------------------------
with gr.Blocks(title="Guardio – ASL Letter Demo") as demo:
    # Page header / usage instructions.
    gr.Markdown(
        """
# 🧤 Guardio – ASL Letter Demo
Upload an image of a **single ASL alphabet handshape**
and ask: *"What sign language letter is this image?"*
"""
    )
    with gr.Row():
        with gr.Column():
            # Left column: image upload, question textbox, submit button.
            img = gr.Image(
                label="ASL handshape image",
                type="pil",
                height=320,
            )
            q = gr.Textbox(
                label="Question",
                value=DEFAULT_QUESTION,
                lines=2,
            )
            btn = gr.Button("Ask Guardio", variant="primary")
        with gr.Column():
            # Right column: Markdown area showing the model's answer.
            out = gr.Markdown(
                label="Model answer",
                value="Upload an image and click **Ask Guardio**.",
            )
    # Route button clicks to the prediction function.
    btn.click(fn=guardio_predict, inputs=[img, q], outputs=[out])

if __name__ == "__main__":
    demo.launch()