# Spaces: Running
# Running
| import gradio as gr | |
| import torch | |
| from PIL import Image | |
| from transformers import AutoProcessor, PaliGemmaForConditionalGeneration | |
| import numpy as np | |
| import cv2 | |
| import threading | |
| import traceback | |
| import time | |
# ======================================================
# 1. SETUP (GOOGLE PALIGEMMA)
# ======================================================
print("⏳ Avvio MUSIC4D - PaliGemma (Smart Mode)")

# The 224px checkpoint is lighter on CPU than the 448px variant.
MODEL_ID = "google/paligemma-3b-mix-224"

# Upper-case keywords scanned for (in this priority order) in the model's
# decoded answer; earlier entries win when several appear.
VALID_EMOTIONS = [
    "JOY", "HAPPY", "SMILING", "LAUGHING",
    "ANGER", "ANGRY", "FROWNING", "FURIOUS",
    "FEAR", "SCARED", "TERRIFIED",
    "DISGUST", "DISGUSTED",
    "SURPRISE", "SHOCKED", "AMAZED",
    "SADNESS", "SAD", "CRYING", "DEPRESSED",
    "BOREDOM", "BORED", "TIRED",
    "NEUTRAL", "SERIOUS", "CALM",
]

# Shared mutable state: written by the loader / analysis thread,
# read by the frame-processing callback.
model = None
processor = None
current_emotion = "CARICAMENTO..."
analysis_thread = None
# ======================================================
# 2. MODEL LOADING
# ======================================================
def load_paligemma():
    """Load the PaliGemma model and processor into module globals (CPU only).

    On failure the error is printed and `model`/`processor` stay None; the
    rest of the app keeps running in a degraded (no-inference) mode.
    """
    global model, processor
    try:
        print("📦 Caricamento PaliGemma (3B Params)...")
        # PaliGemma requires its dedicated conditional-generation class.
        loaded = PaliGemmaForConditionalGeneration.from_pretrained(
            MODEL_ID,
            device_map="cpu",
            torch_dtype=torch.float32,  # float32 is the safe choice on older CPUs
            revision="bfloat16",        # download the lightweight weights, convert on load
        )
        model = loaded.eval()
        processor = AutoProcessor.from_pretrained(MODEL_ID)
        print("✅ PaliGemma Pronto! (Sarà lento ma intelligente)")
    except Exception as e:
        print(f"❌ Errore caricamento: {e}")


load_paligemma()
# ======================================================
# 3. AI LOGIC (PaliGemma)
# ======================================================
def analyze_emotion_task(pil_image):
    """Run one PaliGemma inference on *pil_image* and publish the detected
    emotion into the shared `current_emotion` global.

    Designed to run inside a worker thread; all failures are caught, logged,
    and reported as "ERRORE" rather than propagated.
    """
    global current_emotion
    start = time.time()
    if model is None:
        return
    try:
        # Question posed to the VLM (runtime string: kept verbatim).
        prompt = """<image>Which is the general emotion in this picture? Answer the question
using a single word for each emotion you can find. Follow this
example: ’emotions: [em1, em2, em3, ...]’. Choose from: [’joy’,
’anger’, ’fear’, ’disgust’, ’surprise’, ’sadness’, ’boredom’, ’neutral’,
’contentment’]. \n"""

        # Processor expects a 3-channel image.
        if pil_image.mode != "RGB":
            pil_image = pil_image.convert("RGB")
        inputs = processor(text=prompt, images=pil_image, return_tensors="pt")
        inputs = {name: tensor.to("cpu") for name, tensor in inputs.items()}

        # Greedy, short generation: a few words is all we need.
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=20,
                do_sample=False,
            )

        decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # The decoded text still contains the prompt, and the prompt itself
        # mentions 'joy' — matching against the whole string made every frame
        # read as JOY. Cut at the end of the question and keep only the answer.
        description = decoded.upper()
        marker = "CONTENTMENT’]."
        if marker in description:
            description = description.split(marker)[-1].strip()
        else:
            # Safety fallback: the answer is short, the prompt is long.
            description = description[-50:].strip()
        description = description.replace("\n", "").strip()
        print(f"🧠 PaliGemma dice: {description}")

        # Map the first keyword found (priority = VALID_EMOTIONS order) onto
        # its canonical label; keywords absent from the table (ANGER, DISGUST,
        # SADNESS, BOREDOM, ...) pass through unchanged.
        canonical = {
            "HAPPY": "JOY", "SMILING": "JOY", "LAUGHING": "JOY", "JOY": "JOY",
            "ANGRY": "ANGER", "FROWNING": "ANGER", "FURIOUS": "ANGER",
            "SCARED": "FEAR", "TERRIFIED": "FEAR", "FEAR": "FEAR",
            "SAD": "SADNESS", "CRYING": "SADNESS", "DEPRESSED": "SADNESS",
            "SHOCKED": "SURPRISE", "AMAZED": "SURPRISE", "SURPRISE": "SURPRISE",
            "BORED": "BOREDOM", "TIRED": "BOREDOM",
            "SERIOUS": "NEUTRAL", "CALM": "NEUTRAL", "NEUTRAL": "NEUTRAL",
            "CONTENTMENT": "JOY",
        }
        hit = next((kw for kw in VALID_EMOTIONS if kw in description), None)
        detected = canonical.get(hit, hit) if hit is not None else "UNKNOWN"

        # Smart fallback: surface the raw (trimmed) answer instead of UNKNOWN.
        if detected == "UNKNOWN" and len(description) > 2:
            detected = description
        current_emotion = detected
    except Exception:
        print("⚠️ ERRORE CRITICO NEL THREAD:")
        traceback.print_exc()
        current_emotion = "ERRORE"
    elapsed = time.time() - start
    print(f"⏱️ Tempo inferenza: {elapsed:.2f}s")
# ======================================================
# 4. FRAME PROCESSING
# ======================================================
def process_frame(image_array):
    """Annotate the incoming webcam frame with the latest detected emotion.

    Also launches a new background analysis thread whenever the previous one
    has finished, so at most one inference runs at a time.
    """
    global analysis_thread, current_emotion
    if image_array is None:
        return image_array

    annotated = image_array.copy()
    frame_h, frame_w, _ = annotated.shape

    # Thread gate: only start a new analysis when no worker is running.
    worker_idle = analysis_thread is None or not analysis_thread.is_alive()
    if worker_idle and model is not None:
        try:
            # Downscale to the model's native 224x224 input size.
            small = cv2.resize(image_array, (224, 224))
            analysis_thread = threading.Thread(
                target=analyze_emotion_task,
                args=(Image.fromarray(small),),
            )
            analysis_thread.start()
        except Exception as e:
            print(f"Errore prep immagine: {e}")

    # Overlay: current label, with "(*)" appended while inference is in flight.
    is_working = analysis_thread is not None and analysis_thread.is_alive()
    status_symbol = " (*)" if is_working else ""
    text_display = f"{current_emotion}{status_symbol}"
    color_bg = (0, 0, 0)
    color_txt = (0, 165, 255) if is_working else (0, 255, 0)
    if "ERRORE" in current_emotion:
        color_txt = (255, 0, 0)

    font = cv2.FONT_HERSHEY_SIMPLEX
    scale = 0.8
    thick = 2
    (tw, th), base = cv2.getTextSize(text_display, font, scale, thick)
    x, y = 20, frame_h - 20
    cv2.rectangle(annotated, (x - 10, y - th - 10), (x + tw + 10, y + base + 5), color_bg, -1)
    cv2.putText(annotated, text_display, (x, y), font, scale, color_txt, thick, cv2.LINE_AA)
    return annotated
# ======================================================
# 5. UI
# ======================================================
with gr.Blocks(title="MUSIC4D - PaliGemma Final") as demo:
    gr.Markdown("## 🎵 MUSIC4D – PaliGemma CPU")
    with gr.Row():
        webcam_feed = gr.Image(sources=["webcam"], type="numpy", label="Webcam")
        live_view = gr.Image(type="numpy", label="Output Live")
    # Poll the webcam slowly: CPU inference takes several seconds per frame.
    webcam_feed.stream(
        process_frame,
        inputs=webcam_feed,
        outputs=live_view,
        stream_every=11.5,
        time_limit=350,
    )

if __name__ == "__main__":
    demo.launch(ssr_mode=False)