ericjedha commited on
Commit
bd04204
·
verified ·
1 Parent(s): e6bbe3f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +224 -87
app.py CHANGED
@@ -1,100 +1,237 @@
1
- import gradio as gr
2
  import torch
3
- from threading import Thread
 
 
 
 
 
 
 
 
 
 
 
 
4
  from transformers import (
5
- AutoProcessor,
6
- AutoModelForVision2Seq,
7
- TextIteratorStreamer,
 
8
  )
 
 
 
 
 
 
9
 
10
- # ======================
11
- # INIT MODÈLE
12
- # ======================
13
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
- MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Instruct" # version compatible HF
15
 
16
- processor = AutoProcessor.from_pretrained(MODEL_ID)
17
- model = AutoModelForVision2Seq.from_pretrained(
18
- MODEL_ID,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
 
20
  ).to(DEVICE).eval()
21
 
22
- # ======================
23
- # STREAMING INFERENCE
24
- # ======================
25
- def analyze_stream(text, image, max_tokens):
26
- if not text.strip() and image is None:
27
- return "❌ Veuillez fournir un texte ou une image."
28
-
29
- # Construire le contenu
30
- content = []
31
- if image:
32
- content.append({"type": "image", "path": image})
33
- if text.strip():
34
- content.append({"type": "text", "text": text})
35
-
36
- messages = [{"role": "user", "content": content}]
37
-
38
- # Préparer les inputs
39
- inputs = processor.apply_chat_template(
40
- messages,
41
- add_generation_prompt=True,
42
- tokenize=True,
43
- return_tensors="pt",
44
- ).to(DEVICE)
45
-
46
- # Créer le streamer
47
- streamer = TextIteratorStreamer(
48
- processor,
49
- skip_prompt=True,
50
- skip_special_tokens=True,
51
- )
52
-
53
- # Lancer la génération dans un thread
54
- Thread(
55
- target=model.generate,
56
- kwargs=dict(
57
- **inputs,
58
- streamer=streamer,
59
- max_new_tokens=max_tokens,
60
- do_sample=False,
61
- temperature=0.0,
62
- ),
63
- ).start()
64
-
65
- # Yield token par token
66
- output = ""
67
- for token in streamer:
68
- output += token
69
- yield output
70
-
71
- # ======================
72
- # INTERFACE GRADIO
73
- # ======================
74
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
75
- gr.Markdown("## ⚡ SmolVLM2 – Analyse en temps réel (Streaming)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  with gr.Row():
78
  with gr.Column():
79
- txt = gr.Textbox(
80
- label="Question / Description",
81
- placeholder="Posez une question ou décrivez l'image",
82
- lines=3,
83
- )
84
- img = gr.Image(type="filepath", label="Image")
85
- max_tokens = gr.Slider(50, 400, value=200, step=50, label="Max Tokens")
86
- btn = gr.Button("🚀 Analyser", variant="primary")
87
-
88
  with gr.Column():
89
- out = gr.Textbox(
90
- label="Réponse en temps réel",
91
- lines=14,
92
- )
93
-
94
- btn.click(
95
- fn=analyze_stream,
96
- inputs=[txt, img, max_tokens],
97
- outputs=out,
98
- )
99
-
100
- demo.launch()
 
 
1
  import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import gradio as gr
5
+ import librosa
6
+ import numpy as np
7
+ import cv2
8
+ import timm
9
+ import os
10
+ import warnings
11
+ import logging
12
+ import time
13
+ from pathlib import Path
14
+ from huggingface_hub import hf_hub_download
15
  from transformers import (
16
+ AutoProcessor,
17
+ AutoModelForImageTextToText,
18
+ ASTFeatureExtractor,
19
+ ASTForAudioClassification
20
  )
21
+ from PIL import Image
22
+ from moviepy import VideoFileClip
23
+
24
+ # --- Configuration & Silence ---
25
+ logging.getLogger("asyncio").setLevel(logging.CRITICAL)
26
+ warnings.filterwarnings("ignore")
27
 
28
CATEGORIES = ['affection', 'angry', 'back_off', 'defensive', 'feed_me', 'happy', 'hunt', 'in_heat', 'mother_call', 'pain', 'wants_attention']
TARGET_SR = 16000
MAX_SEC = 5.0
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ==========================================
# FUSION LOGIC V6.4 (behavioral priority)
# ==========================================
def apply_visual_logic_v6(description, audio_probs):
    """Re-weight the audio class probabilities with visual cues found in *description*.

    Returns a tuple ``(scores, applied_rules)`` where ``scores`` is a copy of
    ``audio_probs`` boosted per rule and re-normalized to sum to 1 (when the
    sum is positive), and ``applied_rules`` lists a label for each rule fired.
    """
    desc = description.lower()
    scores = audio_probs.copy()
    applied_rules = []

    # Detected facial states (simple substring flags).
    teeth_seen = "teeth" in desc
    ears_back_seen = "ears back" in desc
    ears_forward_seen = "ears forward" in desc

    def _boost(category_names, factor):
        # Multiply the score of each named category in place.
        for name in category_names:
            scores[CATEGORIES.index(name)] *= factor

    # 1. Golden rule: aggression cues take priority. If teeth or flattened
    #    ears were seen at any moment, tension categories are boosted hard.
    if ears_back_seen or teeth_seen:
        _boost(("angry", "back_off", "defensive"), 4.0)
        applied_rules.append("⚠️ PRIORITÉ AGRESSION (Dents/Oreilles arrière détectées)")

    # 2. Calm/alert cue counts only when it does not contradict aggression.
    if ears_forward_seen and not (ears_back_seen or teeth_seen):
        _boost(("happy", "hunt", "wants_attention"), 2.0)
        applied_rules.append("✅ Calme/Alerte (Ears forward)")

    # 3. Wide eyes: internal-tension indicator.
    if "eyes wide" in desc:
        _boost(("angry", "back_off", "pain"), 2.0)
        applied_rules.append("✅ Yeux écarquillés")

    total = np.sum(scores)
    if total > 0:
        scores /= total
    return scores, applied_rules
70
+
71
+ # ==========================================
72
+ # PARSING NARRATIF (Détection de mots-clés)
73
+ # ==========================================
74
def parse_narrative_to_indices(text):
    """Map a free-text VLM description to canonical facial-cue tokens.

    Scans *text* for keyword fragments and returns the detected cue names
    joined by single spaces. The result is sorted so the output is
    deterministic — the previous ``" ".join(list(found))`` joined a set,
    whose iteration order varies across interpreter runs (string hash
    randomization). Downstream fusion only does substring checks, so the
    ordering change is behavior-safe.
    """
    found = set()
    # Pad with spaces so edge words can match the space-anchored patterns.
    text = f" {text.lower()} "

    # EARS: space-prefixed fragments reduce accidental mid-word matches.
    if any(x in text for x in (" back ", "backward", " flat", " down", " low")):
        found.add("ears back")
    if any(x in text for x in (" forward", "upright", "pointed", " up")):
        found.add("ears forward")

    # MOUTH. NOTE(review): " open" also matches "opening" — the original
    # comment claimed otherwise; kept as-is to preserve behavior.
    if any(x in text for x in (" open", "hiss", "snarl", "meow", "yawn")):
        found.add("mouth open")
    if any(x in text for x in ("teeth", "fangs", "sharp")):
        found.add("teeth")

    # EYES & FOREHEAD ("wrinkl" covers wrinkle/wrinkled/wrinkling).
    if any(x in text for x in ("wide", "dilated", "staring")):
        found.add("eyes wide")
    if any(x in text for x in ("squint", "closed eyes", "blink")):
        found.add("eyes squinted")
    if any(x in text for x in ("wrinkl", "furrow", "tense forehead")):
        found.add("forehead wrinkled")

    return " ".join(sorted(found))
99
+
100
+ # ==========================================
101
+ # CHARGEMENT DES MODÈLES AUDIO
102
+ # ==========================================
103
def load_audio_models():
    """Download and initialize the three-pillar audio classifier ensemble.

    Returns a dict with keys 'A', 'B' (ViT-small spectrogram classifiers),
    'C' (fine-tuned AST model) and 'ast_ext' (the AST feature extractor).
    """
    models = {}

    # Pillars A and B: ViT-small models over mel-spectrogram images.
    vit_checkpoints = [
        ('A', 'ericjedha/pilier_a', 'best_pillar_a_e29_f1_0_9005.pth'),
        ('B', 'ericjedha/pilier_b', 'best_pillar_b_f1_09103.pth'),
    ]
    for key, repo_id, filename in vit_checkpoints:
        ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename)
        net = timm.create_model("vit_small_patch16_224", num_classes=len(CATEGORIES), in_chans=3)
        net.load_state_dict(torch.load(ckpt_path, map_location=DEVICE)['model_state_dict'])
        models[key] = net.to(DEVICE).eval()

    # Pillar C: Audio Spectrogram Transformer with a retrained class head.
    path_c = hf_hub_download(repo_id="ericjedha/pilier_c", filename="best_pillar_c_ast_v95_2_f1_0_9109.pth")
    model_c = ASTForAudioClassification.from_pretrained(
        "MIT/ast-finetuned-audioset-10-10-0.4593",
        num_labels=len(CATEGORIES),
        ignore_mismatched_sizes=True,
    )
    state = torch.load(path_c, map_location=DEVICE)['model_state_dict']
    # Checkpoint keys carry an 'ast.' prefix the bare model does not expect.
    model_c.load_state_dict({k.replace('ast.', ''): v for k, v in state.items()}, strict=False)
    models['C'] = model_c.to(DEVICE).eval()
    models['ast_ext'] = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
    return models
119
+
120
+ # ==========================================
121
+ # INITIALISATION
122
+ # ==========================================
123
# Module-level initialization: the VLM is loaded once at import time.
print("📥 Initialisation CatSense v8.4 (Full Face Analysis)...")
vlm_id = "HuggingFaceTB/SmolVLM2-256M-Instruct"
vlm_proc = AutoProcessor.from_pretrained(vlm_id)
vlm_model = AutoModelForImageTextToText.from_pretrained(
    vlm_id,
    # bfloat16 only when a GPU is available; CPU stays in float32.
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    _attn_implementation="sdpa"  # scaled-dot-product attention backend
).to(DEVICE).eval()

# Audio ensemble (pillars A/B/C) — checkpoints fetched from the Hub at startup.
audio_ensemble = load_audio_models()
133
+
134
+ # ==========================================
135
+ # INFERENCE AUDIO
136
+ # ==========================================
137
def get_audio_probs(path):
    """Run the 3-model audio ensemble on the WAV file at *path*.

    Returns a 1-D numpy array of class probabilities over CATEGORIES,
    a fixed weighted blend of pillars A, B and C.
    """
    # Load at most MAX_SEC seconds of audio, resampled to 16 kHz mono.
    w, _ = librosa.load(path, sr=TARGET_SR, duration=MAX_SEC)
    # Zero-pad to at least 48000 samples (3 s at 16 kHz).
    # NOTE(review): pads to 3 s while MAX_SEC allows 5 s — confirm this matches
    # the training setup of pillars A/B.
    if len(w) < 48000: w = np.pad(w, (0, 48000-len(w)))
    # Mel spectrogram (192 bands), shifted/scaled from dB into roughly [0, 1].
    mel = librosa.feature.melspectrogram(y=w, sr=TARGET_SR, n_mels=192)
    mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
    # Pad with 10 zero rows, scale to uint8 and resize to the ViT input size.
    img = cv2.resize((np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))]) * 255).astype(np.uint8), (224, 224))
    # (1, 3, 224, 224) float tensor in [0, 1]: grayscale replicated to 3 channels.
    img_t = torch.tensor(img).unsqueeze(0).repeat(1, 3, 1, 1).float().to(DEVICE) / 255.0
    with torch.no_grad():
        pa = F.softmax(audio_ensemble['A'](img_t), dim=1)
        pb = F.softmax(audio_ensemble['B'](img_t), dim=1)
        # Pillar C consumes the raw waveform via the AST feature extractor.
        ic = audio_ensemble['ast_ext'](w, sampling_rate=TARGET_SR, return_tensors="pt").to(DEVICE)
        pc = F.softmax(audio_ensemble['C'](**ic).logits, dim=1)
    # Ensemble weights — presumably tuned on a validation set; verify source.
    return (pa * 0.3468 + pb * 0.2762 + pc * 0.3770).cpu().numpy()[0]
150
+
151
+ # ==========================================
152
+ # ANALYSE PRINCIPALE
153
+ # ==========================================
154
def analyze_cat(video_path):
    """Full pipeline: extract frames and audio from the video, run the audio
    ensemble and the VLM, fuse both signals and build the textual report.

    Returns ``(frames, report)``: a list of PIL frames for the gallery and a
    formatted report string. On any failure the frame list is empty and the
    report carries the error.
    """
    if video_path is None: return [], "❌ Vidéo absente."
    start_time = time.time()
    tmp_audio = f"temp_{os.getpid()}.wav"

    try:
        # 1. Extract frames & audio track.
        with VideoFileClip(video_path) as clip:
            # Guard: a silent video has clip.audio == None and previously
            # crashed with an opaque AttributeError.
            if clip.audio is None:
                return [], "❌ La vidéo ne contient pas de piste audio."
            clip.audio.write_audiofile(tmp_audio, fps=TARGET_SR, logger=None)
            duration = min(clip.duration, MAX_SEC)
            # Three sample points: near the start, middle and end.
            ts = [0.1 * duration, 0.5 * duration, 0.9 * duration]
            frames_pil = []
            for t in ts:
                img = Image.fromarray(clip.get_frame(t)).convert("RGB")
                w, h = img.size
                # Focused crop: trim outer margins to center on the subject.
                img = img.crop((int(w*0.12), int(h*0.05), int(w*0.88), int(h*0.85)))
                img.thumbnail((384, 384))
                frames_pil.append(img)

        # 2. Audio ensemble probabilities.
        raw_audio_probs = get_audio_probs(tmp_audio)

        # 3. Vision: expert prompt over the 3 frames.
        prompt = """Analyze the cat in these 3 frames. List the state of:
1. Mouth and Teeth (Visible?)
2. Ears (Position?)
3. Eyes (Wide or squinted?)
4. Forehead (Wrinkled or smooth?)
5. Tail (Is it up or down?)
Be very direct for each image."""

        messages = [{"role": "user", "content": [{"type": "image"}]*3 + [{"type": "text", "text": prompt}]}]
        text_prompt = vlm_proc.apply_chat_template(messages, add_generation_prompt=True)
        inputs = vlm_proc(text=text_prompt, images=frames_pil, return_tensors="pt").to(DEVICE)

        with torch.no_grad():
            # Greedy decoding; `temperature` dropped — it is ignored (and
            # warned about) when do_sample=False.
            outputs = vlm_model.generate(**inputs, max_new_tokens=120, do_sample=False)

        # Keep only the assistant's turn from the decoded transcript.
        vlm_desc = vlm_proc.decode(outputs[0], skip_special_tokens=True).split("assistant")[-1].strip()

        # 4. Fuse visual cues with audio probabilities.
        visual_mapped = parse_narrative_to_indices(vlm_desc)
        final_probs, indices = apply_visual_logic_v6(visual_mapped, raw_audio_probs)
        final_idx = np.argmax(final_probs)

        elapsed = time.time() - start_time

        # 5. Report.
        res = f"🏆 VERDICT : {CATEGORIES[final_idx].upper()}\n"
        res += f"🎯 CONFIANCE : {final_probs[final_idx]:.1%}\n"
        res += f"⏱️ VITESSE : {elapsed:.2f}s\n"
        res += f"------------------------------------------\n"
        res += f"👁️ ANALYSE VISUELLE :\n{vlm_desc}\n"
        res += f"------------------------------------------\n"
        res += f"🔎 SIGNES RETENUS : {', '.join(indices) if indices else 'Aucun'}\n"
        res += f"🔊 AUDIO DOMINANT : {CATEGORIES[np.argmax(raw_audio_probs)].upper()}\n"
        res += f"📊 TOP 3 :\n"
        for i in np.argsort(final_probs)[::-1][:3]:
            res += f" - {CATEGORIES[i]}: {final_probs[i]:.1%}\n"

        return frames_pil, res

    except Exception:
        import traceback
        return [], f"❌ Erreur critique :\n{traceback.format_exc()}"
    finally:
        # Cleanup was previously duplicated in the try and except paths;
        # `finally` guarantees the temp WAV is removed on every exit.
        if os.path.exists(tmp_audio):
            os.remove(tmp_audio)
222
+
223
+ # ==========================================
224
+ # INTERFACE
225
+ # ==========================================
226
# Gradio UI: one video input on the left, gallery + report on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🐱 CatSense v8.4 - Expert Behavior Analysis")
    with gr.Row():
        # Left column: inputs.
        with gr.Column():
            video_input = gr.Video(label="Vidéo (5s)")
            launch_button = gr.Button("🚀 LANCER L'EXPERTISE", variant="primary")
        # Right column: outputs.
        with gr.Column():
            frames_gallery = gr.Gallery(label="Indices Visuels", columns=3)
            report_box = gr.Textbox(label="Rapport de Fusion", lines=20)
    launch_button.click(fn=analyze_cat, inputs=video_input, outputs=[frames_gallery, report_box])

demo.launch()