ericjedha committed on
Commit
0aa2638
·
verified ·
1 Parent(s): 6a45583

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -224
app.py CHANGED
@@ -1,237 +1,98 @@
1
- import torch
2
- import torch.nn as nn
3
- import torch.nn.functional as F
4
  import gradio as gr
5
- import librosa
6
- import numpy as np
7
- import cv2
8
- import timm
9
- import os
10
- import warnings
11
- import logging
12
- import time
13
- from pathlib import Path
14
- from huggingface_hub import hf_hub_download
15
  from transformers import (
16
- AutoProcessor,
17
- AutoModelForImageTextToText,
18
- ASTFeatureExtractor,
19
- ASTForAudioClassification
20
  )
21
- from PIL import Image
22
- from moviepy import VideoFileClip
23
 
24
# --- Configuration & Silence ---
# Mute asyncio chatter and library warnings so the Gradio console stays readable.
logging.getLogger("asyncio").setLevel(logging.CRITICAL)
warnings.filterwarnings("ignore")

# Behaviour classes predicted by the audio ensemble.
# Order matters: the index of each name is the class id used everywhere below.
CATEGORIES = ['affection', 'angry', 'back_off', 'defensive', 'feed_me', 'happy', 'hunt', 'in_heat', 'mother_call', 'pain', 'wants_attention']
TARGET_SR = 16000  # sample rate (Hz) expected by all audio models
MAX_SEC = 5.0      # analyse at most the first 5 seconds of the clip
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
32
 
33
# ==========================================
# FUSION LOGIC V6.4 (behavioural priority)
# ==========================================
def apply_visual_logic_v6(description, audio_probs):
    """Re-weight audio class probabilities with visual cues found in *description*.

    Returns the re-normalised score vector and the list of rule labels applied.
    """
    weighted = audio_probs.copy()
    cues = description.lower()
    rules_applied = []

    # Detected facial states.
    teeth_seen = "teeth" in cues
    ears_back_seen = "ears back" in cues
    ears_forward_seen = "ears forward" in cues

    def _boost(category_names, factor):
        # Multiply the score of each named category in place.
        for name in category_names:
            weighted[CATEGORIES.index(name)] *= factor

    # 1. GOLDEN RULE — aggression has priority: if teeth or flattened ears were
    #    seen at any moment, tension categories win and "ears forward" is ignored.
    if ears_back_seen or teeth_seen:
        _boost(["angry", "back_off", "defensive"], 4.0)
        rules_applied.append("⚠️ PRIORITÉ AGRESSION (Dents/Oreilles arrière détectées)")

    # 2. Forward ears only count when no aggression cue contradicts them.
    if ears_forward_seen and not (ears_back_seen or teeth_seen):
        _boost(["happy", "hunt", "wants_attention"], 2.0)
        rules_applied.append("✅ Calme/Alerte (Ears forward)")

    # 3. Wide eyes indicate internal tension.
    if "eyes wide" in cues:
        _boost(["angry", "back_off", "pain"], 2.0)
        rules_applied.append("✅ Yeux écarquillés")

    # Re-normalise so the scores sum to 1 (skip if everything is zero).
    total = np.sum(weighted)
    if total > 0:
        weighted /= total
    return weighted, rules_applied
70
-
71
# ==========================================
# NARRATIVE PARSING (keyword detection)
# ==========================================
def parse_narrative_to_indices(text):
    """Scan a free-text VLM description for known visual cues.

    Returns the recognised state labels (e.g. "ears back", "teeth")
    joined by spaces, in set order.
    """
    # Pad with spaces so leading-space cues like " up" match at word starts.
    haystack = f" {text.lower()} "

    # Each label is implied by any of its substring cues.
    cue_table = [
        # EARS: precise position terms
        ("ears back", [" back ", "backward", " flat", " down", " low"]),
        ("ears forward", [" forward", "upright", "pointed", " up"]),
        # MOUTH: avoid confusing 'open' with other forms
        ("mouth open", [" open", "hiss", "snarl", "meow", "yawn"]),
        ("teeth", ["teeth", "fangs", "sharp"]),
        # EYES & FOREHEAD
        ("eyes wide", ["wide", "dilated", "staring"]),
        ("eyes squinted", ["squint", "closed eyes", "blink"]),
        ("forehead wrinkled", ["wrinkl", "furrow", "tense forehead"]),
    ]

    found = {label for label, cues in cue_table
             if any(cue in haystack for cue in cues)}
    return " ".join(found)
99
-
100
# ==========================================
# AUDIO MODEL LOADING
# ==========================================
def load_audio_models():
    """Download and load the three audio classifiers of the ensemble.

    Returns a dict with keys 'A', 'B' (ViT-Small over mel-spectrogram images),
    'C' (AST fine-tuned for these categories) and 'ast_ext' (the AST feature
    extractor). All models are moved to DEVICE and set to eval mode.
    """
    models = {}
    # Pillars A and B: ViT-Small classifiers fed with 3-channel spectrogram "images".
    for p, repo, f in [('A', 'ericjedha/pilier_a', 'best_pillar_a_e29_f1_0_9005.pth'),
                       ('B', 'ericjedha/pilier_b', 'best_pillar_b_f1_09103.pth')]:
        path = hf_hub_download(repo_id=repo, filename=f)
        m = timm.create_model("vit_small_patch16_224", num_classes=len(CATEGORIES), in_chans=3)
        m.load_state_dict(torch.load(path, map_location=DEVICE)['model_state_dict'])
        models[p] = m.to(DEVICE).eval()

    # Pillar C: Audio Spectrogram Transformer with a replaced classification head.
    path_c = hf_hub_download(repo_id="ericjedha/pilier_c", filename="best_pillar_c_ast_v95_2_f1_0_9109.pth")
    model_c = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593", num_labels=len(CATEGORIES), ignore_mismatched_sizes=True)
    sd = torch.load(path_c, map_location=DEVICE)['model_state_dict']
    # Checkpoint keys carry an 'ast.' prefix; strip it so they line up with the
    # HF module names. strict=False tolerates the mismatched head weights.
    model_c.load_state_dict({k.replace('ast.', ''): v for k, v in sd.items()}, strict=False)
    models['C'] = model_c.to(DEVICE).eval()
    models['ast_ext'] = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
    return models
119
-
120
# ==========================================
# INITIALISATION
# ==========================================
print("📥 Initialisation CatSense v8.4 (Full Face Analysis)...")
vlm_id = "HuggingFaceTB/SmolVLM2-256M-Instruct"
vlm_proc = AutoProcessor.from_pretrained(vlm_id)
vlm_model = AutoModelForImageTextToText.from_pretrained(
    vlm_id,
    # bfloat16 halves memory on GPU; CPU falls back to float32.
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    _attn_implementation="sdpa"  # PyTorch scaled-dot-product attention backend
).to(DEVICE).eval()

# Download and load the three-model audio ensemble once at startup.
audio_ensemble = load_audio_models()
133
-
134
# ==========================================
# AUDIO INFERENCE
# ==========================================
def get_audio_probs(path):
    """Run the 3-model audio ensemble on a WAV file.

    Returns the weighted-average class probability vector
    (shape: (len(CATEGORIES),), numpy array on CPU).
    """
    w, _ = librosa.load(path, sr=TARGET_SR, duration=MAX_SEC)
    # Pad to 3 s (48000 samples at 16 kHz).
    # NOTE(review): MAX_SEC is 5 s, so clips between 3 and 5 s keep a variable
    # length here — presumably this matches how the models were trained; confirm.
    if len(w) < 48000: w = np.pad(w, (0, 48000-len(w)))
    mel = librosa.feature.melspectrogram(y=w, sr=TARGET_SR, n_mels=192)
    # power_to_db with ref=max yields values <= 0 dB; shift/scale toward [0, 1].
    mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
    # Stack 10 zero rows under the 192 mel bands, scale to uint8, resize to ViT input size.
    img = cv2.resize((np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))]) * 255).astype(np.uint8), (224, 224))
    # (224, 224) uint8 -> (1, 3, 224, 224) float tensor in [0, 1] (grayscale replicated to 3 channels).
    img_t = torch.tensor(img).unsqueeze(0).repeat(1, 3, 1, 1).float().to(DEVICE) / 255.0
    with torch.no_grad():
        pa = F.softmax(audio_ensemble['A'](img_t), dim=1)
        pb = F.softmax(audio_ensemble['B'](img_t), dim=1)
        # Pillar C consumes the raw waveform through the AST feature extractor.
        ic = audio_ensemble['ast_ext'](w, sampling_rate=TARGET_SR, return_tensors="pt").to(DEVICE)
        pc = F.softmax(audio_ensemble['C'](**ic).logits, dim=1)
    # Ensemble weights — presumably tuned on a validation split; confirm.
    return (pa * 0.3468 + pb * 0.2762 + pc * 0.3770).cpu().numpy()[0]
150
-
151
# ==========================================
# MAIN ANALYSIS
# ==========================================
def analyze_cat(video_path):
    """Full pipeline: frames + audio extraction, audio ensemble, VLM description,
    rule-based fusion, and report formatting.

    Returns (frames_pil, report_text); on any failure returns ([], error_text)
    with the full traceback embedded for debugging in the UI.
    """
    if video_path is None: return [], "❌ Vidéo absente."
    start_time = time.time()
    # Per-process temp name avoids collisions between concurrent workers.
    tmp_audio = f"temp_{os.getpid()}.wav"

    try:
        # 1. Frame & audio extraction
        with VideoFileClip(video_path) as clip:
            # NOTE(review): clip.audio is None for silent videos; that raises
            # AttributeError here and is surfaced by the except block below.
            clip.audio.write_audiofile(tmp_audio, fps=TARGET_SR, logger=None)
            duration = min(clip.duration, MAX_SEC)
            # Sample 3 frames at 10%, 50% and 90% of the analysed window.
            ts = [0.1 * duration, 0.5 * duration, 0.9 * duration]
            frames_pil = []
            for t in ts:
                img = Image.fromarray(clip.get_frame(t)).convert("RGB")
                w, h = img.size
                # Focused crop: drop the outer borders where the subject rarely is.
                img = img.crop((int(w*0.12), int(h*0.05), int(w*0.88), int(h*0.85)))
                img.thumbnail((384, 384))
                frames_pil.append(img)

        # 2. Audio ensemble probabilities
        raw_audio_probs = get_audio_probs(tmp_audio)

        # 3. Vision: complete expert prompt for the VLM
        prompt = """Analyze the cat in these 3 frames. List the state of:
1. Mouth and Teeth (Visible?)
2. Ears (Position?)
3. Eyes (Wide or squinted?)
4. Forehead (Wrinkled or smooth?)
5. Tail (Is it up or down?)
Be very direct for each image."""

        # One user turn carrying the 3 image slots followed by the text prompt.
        messages = [{"role": "user", "content": [{"type": "image"}]*3 + [{"type": "text", "text": prompt}]}]
        text_prompt = vlm_proc.apply_chat_template(messages, add_generation_prompt=True)
        inputs = vlm_proc(text=text_prompt, images=frames_pil, return_tensors="pt").to(DEVICE)

        with torch.no_grad():
            outputs = vlm_model.generate(**inputs, max_new_tokens=120, do_sample=False, temperature=0.0)

        # Keep only the assistant part of the decoded conversation.
        vlm_desc = vlm_proc.decode(outputs[0], skip_special_tokens=True).split("assistant")[-1].strip()

        # 4. Fusion of visual cues with audio probabilities
        visual_mapped = parse_narrative_to_indices(vlm_desc)
        final_probs, indices = apply_visual_logic_v6(visual_mapped, raw_audio_probs)
        final_idx = np.argmax(final_probs)

        # Clean up the temp audio file before building the report.
        if os.path.exists(tmp_audio): os.remove(tmp_audio)
        elapsed = time.time() - start_time

        # 5. Report
        res = f"🏆 VERDICT : {CATEGORIES[final_idx].upper()}\n"
        res += f"🎯 CONFIANCE : {final_probs[final_idx]:.1%}\n"
        res += f"⏱️ VITESSE : {elapsed:.2f}s\n"
        res += f"------------------------------------------\n"
        res += f"👁️ ANALYSE VISUELLE :\n{vlm_desc}\n"
        res += f"------------------------------------------\n"
        res += f"🔎 SIGNES RETENUS : {', '.join(indices) if indices else 'Aucun'}\n"
        res += f"🔊 AUDIO DOMINANT : {CATEGORIES[np.argmax(raw_audio_probs)].upper()}\n"
        res += f"📊 TOP 3 :\n"
        for i in np.argsort(final_probs)[::-1][:3]:
            res += f" - {CATEGORIES[i]}: {final_probs[i]:.1%}\n"

        return frames_pil, res

    except Exception as e:
        # Best-effort cleanup, then surface the full traceback in the UI.
        if os.path.exists(tmp_audio): os.remove(tmp_audio)
        import traceback
        return [], f"❌ Erreur critique :\n{traceback.format_exc()}"
222
-
223
# ==========================================
# INTERFACE
# ==========================================
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🐱 CatSense v8.4 - Expert Behavior Analysis")

    with gr.Row():
        # Left column: video input and launch button.
        with gr.Column():
            v_in = gr.Video(label="Vidéo (5s)")
            btn = gr.Button("🚀 LANCER L'EXPERTISE", variant="primary")
        # Right column: extracted frames and the fusion report.
        with gr.Column():
            gal = gr.Gallery(label="Indices Visuels", columns=3)
            out = gr.Textbox(label="Rapport de Fusion", lines=20)
    btn.click(fn=analyze_cat, inputs=v_in, outputs=[gal, out])

demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import torch
3
+ from threading import Thread
 
 
 
 
 
 
 
 
4
  from transformers import (
5
+ SmolVLMProcessor,
6
+ AutoModelForImageTextToText,
7
+ TextIteratorStreamer,
 
8
  )
 
 
9
 
10
# ======================
# MODEL INITIALISATION
# ======================
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

# Processor handles both image preprocessing and chat templating.
processor = SmolVLMProcessor.from_pretrained(MODEL_ID)

# bfloat16 keeps GPU memory low; CPU falls back to float32.
_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, torch_dtype=_dtype)
model = model.to(DEVICE).eval()
21
 
22
+
23
# ======================
# STREAMING INFERENCE
# ======================
def analyze_stream(text, image, max_tokens):
    """Stream a SmolVLM2 answer for an optional image plus text question.

    Yields the growing partial answer so Gradio can render it live.

    Args:
        text: user question/description (may be empty when an image is given).
        image: filepath of the uploaded image, or None.
        max_tokens: generation budget passed as max_new_tokens.
    """
    # This function is a generator: a bare `return "msg"` would become the
    # StopIteration value and never be shown by Gradio — errors must be yielded.
    if image is None and not text.strip():
        yield "❌ Veuillez fournir un texte ou une image."
        return

    # Build the single multimodal user turn.
    content = []
    if image:
        content.append({"type": "image", "path": image})
    if text.strip():
        content.append({"type": "text", "text": text})

    messages = [{"role": "user", "content": content}]

    # return_dict=True is required to get a mapping of tensors that can be
    # splatted into generate(); without it apply_chat_template returns only
    # token ids and `**inputs` below would fail.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(DEVICE)

    streamer = TextIteratorStreamer(
        # NOTE(review): the streamer calls .decode(); assumes the processor
        # forwards it to its tokenizer — confirm, else pass processor.tokenizer.
        processor,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    # Run generation in a background thread; the streamer feeds tokens back.
    Thread(
        target=model.generate,
        kwargs=dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=max_tokens,
            do_sample=False,
            temperature=0.0,
        ),
        daemon=True,  # don't block interpreter exit if the client disconnects
    ).start()

    # Accumulate and re-yield the growing answer for live display.
    output = ""
    for token in streamer:
        output += token
        yield output
66
+
67
+
68
# ======================
# GRADIO UI
# ======================
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## SmolVLM2 Analyse Temps Réel")

    with gr.Row():
        # Left column: user inputs and the launch button.
        with gr.Column():
            question_box = gr.Textbox(label="Question / Description", lines=3)
            image_input = gr.Image(type="filepath", label="Image")
            token_budget = gr.Slider(50, 400, value=200, step=50, label="Max Tokens")
            run_button = gr.Button("🚀 Analyser", variant="primary")

        # Right column: the streamed answer.
        with gr.Column():
            answer_box = gr.Textbox(label="Réponse en Temps Réel", lines=14)

    # analyze_stream is a generator, so the textbox updates as tokens arrive.
    run_button.click(
        fn=analyze_stream,
        inputs=[question_box, image_input, token_budget],
        outputs=answer_box,
    )

demo.launch()