Update app.py
app.py CHANGED
@@ -13,6 +13,8 @@ from huggingface_hub import hf_hub_download
 from transformers import (
     AutoProcessor,
     AutoModelForImageTextToText,
+    ASTFeatureExtractor,
+    ASTForAudioClassification,
     AutoModelForCausalLM,
     AutoTokenizer
 )
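For context on the two added imports: they are the transformers Audio Spectrogram Transformer (AST) classes that the new audio pillar below relies on. A minimal, self-contained sketch of how they pair up, not taken from this commit; the checkpoint name is the one loaded later in the diff, and the 5 s silent waveform is a placeholder input:

import numpy as np
import torch
from transformers import ASTFeatureExtractor, ASTForAudioClassification

ckpt = "MIT/ast-finetuned-audioset-10-10-0.4593"
extractor = ASTFeatureExtractor.from_pretrained(ckpt)
model = ASTForAudioClassification.from_pretrained(ckpt).eval()

wave = np.zeros(16000 * 5, dtype=np.float32)  # placeholder: 5 s of silence at 16 kHz
inputs = extractor(wave, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # shape (1, num_labels)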
@@ -20,127 +22,398 @@ from moviepy import VideoFileClip
(old side of the hunk; the rewritten new side follows below)
 
 # --- Configuration ---
 CATEGORIES = ['affection', 'angry', 'back_off', 'defensive', 'feed_me', 'happy', 'hunt', 'in_heat', 'mother_call', 'pain', 'wants_attention']
-IDX = {c: i for i, c in enumerate(CATEGORIES)}
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 # ==========================================
-# 1. MODEL LOADING
 # ==========================================
 def load_models():
-    print("📥
-
-    #
     vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
-    vlm_model = AutoModelForImageTextToText.from_pretrained(
-
     llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
     llm_tok = AutoTokenizer.from_pretrained(llm_id)
-    llm_model = AutoModelForCausalLM.from_pretrained(
 
-    #
     audio_models = {}
-
-
-
-
-
-
-
-
-
-
-
-
-    audio_models['
-
-    # 2. Student V3 (3ch - 192 Mels)
-    path_student = hf_hub_download(repo_id="ericjedha/best_student", filename="best_student.pth")
-    m_student = timm.create_model("efficientformerv2_s0", num_classes=len(CATEGORIES)).to(DEVICE)
-    checkpoint = torch.load(path_student, map_location=DEVICE)
-    # Support a full checkpoint dictionary as well as a bare state_dict
-    state_dict = checkpoint['model_state_dict'] if 'model_state_dict' in checkpoint else checkpoint
-    m_student.load_state_dict(state_dict)
-    audio_models['student'] = m_student.eval()
-
     return vlm_model, llm_tok, llm_model, audio_models
 
-# Loading
 vlm_model, llm_tok, llm_model, audio_models = load_models()
 
 # ==========================================
-# 2.
 # ==========================================
 @spaces.GPU(duration=120)
-def
-    if not video_path:
-
     start_total = time.time()
 
     try:
-        #
         t_0 = time.time()
         clip = VideoFileClip(video_path)
         audio_probs = np.zeros(len(CATEGORIES))
-
         if clip.audio:
             clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
             w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
             with torch.no_grad():
-
-
 
-
-
-
-            audio_probs /= audio_probs.sum()
 
         clip.close()
         t_audio = time.time() - t_0
 
-        #
-        # .
-        #
         top_idx = np.argmax(audio_probs)
         audio_ctx = f"{CATEGORIES[top_idx].upper()} ({audio_probs[top_idx]*100:.1f}%)"
-
-
-
-
-
 
-        report = f"⚖️ DUO VERDICT : {CATEGORIES[top_idx].upper()}\n📊 Audio score : {audio_ctx}"
         return report, fig
 
     except Exception as e:
         return f"❌ Error : {str(e)}", None
     finally:
-        if os.path.exists(tmp_audio):
 
-# --- Gradio
-with gr.Blocks() as demo:
-    gr.Markdown("# 🐱 CatSense
     with gr.Row():
-        video_input = gr.Video()
         with gr.Column():
-
-
-
 
 demo.launch()
@@ -20,127 +22,398 @@ (new side of the hunk)
 
 # --- Configuration ---
 CATEGORIES = ['affection', 'angry', 'back_off', 'defensive', 'feed_me', 'happy', 'hunt', 'in_heat', 'mother_call', 'pain', 'wants_attention']
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 # ==========================================
+# 1. MODEL LOADING
 # ==========================================
 def load_models():
+    print("📥 Initializing CatSense v12.13 (Vision Pure Mode)...")
+
+    # VLM model
     vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
+    vlm_model = AutoModelForImageTextToText.from_pretrained(
+        vlm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
+    ).to(DEVICE).eval()
+
+    # Judge LLM
     llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
     llm_tok = AutoTokenizer.from_pretrained(llm_id)
+    llm_model = AutoModelForCausalLM.from_pretrained(
+        llm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
+    ).to(DEVICE).eval()
 
+    # Audio models
     audio_models = {}
+    for p, repo, f in [('A', 'ericjedha/pilier_a', 'best_pillar_a_e29_f1_0_9005.pth'),
+                       ('B', 'ericjedha/pilier_b', 'best_pillar_b_f1_09103.pth')]:
+        path = hf_hub_download(repo_id=repo, filename=f)
+        m = timm.create_model("vit_small_patch16_224", num_classes=len(CATEGORIES), in_chans=3)
+        m.load_state_dict(torch.load(path, map_location=DEVICE)['model_state_dict'])
+        audio_models[p] = m.to(DEVICE).eval()
+
+    path_c = hf_hub_download(repo_id="ericjedha/pilier_c", filename="best_pillar_c_ast_v95_2_f1_0_9109.pth")
+    model_c = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593", num_labels=len(CATEGORIES), ignore_mismatched_sizes=True)
+    sd = torch.load(path_c, map_location=DEVICE)['model_state_dict']
+    model_c.load_state_dict({k.replace('ast.', ''): v for k, v in sd.items()}, strict=False)
+    audio_models['C'] = model_c.to(DEVICE).eval()
+    audio_models['ast_ext'] = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
+
     return vlm_model, llm_tok, llm_model, audio_models
 
+# Global load
 vlm_model, llm_tok, llm_model, audio_models = load_models()
 
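The three audio pillars loaded above are combined later (in analyze_cat_v12_final) by a fixed-weight soft vote. A minimal standalone sketch of that combination rule, with the weights 0.3468 / 0.2762 / 0.3770 taken from the diff below and dummy two-class probabilities standing in for real model outputs:

import numpy as np

def soft_vote(pa, pb, pc, w=(0.3468, 0.2762, 0.3770)):
    # weighted average of per-pillar softmax outputs; the weights sum to 1.0
    return w[0] * pa + w[1] * pb + w[2] * pc

pa = np.array([0.10, 0.90])  # pillar A (ViT on mel-spectrogram)
pb = np.array([0.20, 0.80])  # pillar B (ViT on mel-spectrogram)
pc = np.array([0.60, 0.40])  # pillar C (AST on raw waveform)
print(soft_vote(pa, pb, pc))  # -> [0.31612 0.68388]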
 # ==========================================
+# 2. HYBRID JUDGE (rules + LLM)
+# ==========================================
+def call_peace_judge(audio_ctx, vlm_desc):
+    """
+    Deterministic + LLM hybrid judge.
+    AUDIO dominates when confidence > 30%.
+    Vision can refine but never neutralize strong audio signals.
+    """
+
+    vlm_lower = vlm_desc.lower()
+    audio_upper = audio_ctx.upper()
+
+    # =====================================================
+    # 1. HARD AUDIO GUARDRAILS (ABSOLUTE PRIORITY)
+    # =====================================================
+
+    if "PAIN" in audio_upper:
+        return "The cat is in pain."
+
+    if "ANGRY" in audio_upper:
+        return "The cat is angry."
+
+    if "DEFENSIVE" in audio_upper:
+        return "The cat is defensive."
+
+    if "BACK_OFF" in audio_upper or "BACKING_OFF" in audio_upper:
+        return "The cat is backing off."
+
+    # =====================================================
+    # 2. HARD VISUAL OVERRIDES (SAFETY FIRST)
+    # =====================================================
+
+    # Aggression / threat display
+    if any(x in vlm_lower for x in [
+        "front paws raised", "paws raised", "swiping",
+        "hissing", "mouth open and tense"
+    ]):
+        return "The cat is angry."
+
+    # Defensive posture
+    if any(x in vlm_lower for x in [
+        "arched back", "puffed fur", "ears flat",
+        "ears back", "sideways stance"
+    ]):
+        return "The cat is defensive."
+
+    # Pain indicators
+    if any(x in vlm_lower for x in [
+        "limping", "hunched", "crouched low",
+        "guarding", "withdrawn posture"
+    ]):
+        return "The cat is in pain."
+
+    # =====================================================
+    # 3. POSITIVE / LOW-RISK VISUAL STATES
+    # =====================================================
+
+    if any(x in vlm_lower for x in [
+        "kneading", "rubbing", "head bumping"
+    ]):
+        return "The cat is affectionate."
+
+    if any(x in vlm_lower for x in [
+        "playful", "rolling", "pouncing"
+    ]):
+        return "The cat is happy."
+
+    if any(x in vlm_lower for x in [
+        "stalking", "tail twitching", "low crawl"
+    ]):
+        return "The cat is hunting."
+
+    if any(x in vlm_lower for x in [
+        "approaching human", "following human",
+        "pawing at leg"
+    ]):
+        return "The cat is wanting attention."
+
+    if any(x in vlm_lower for x in [
+        "waiting posture", "looking at food",
+        "pacing near bowl"
+    ]):
+        return "The cat is hungry."
+
+    # =====================================================
+    # 4. LLM FALLBACK (NO CALM ALLOWED)
+    # =====================================================
+
+    messages = [
+        {
+            "role": "system",
+            "content": (
+                "You are a strict cat behavior decision engine.\n"
+                "Rules:\n"
+                "1. AUDIO has priority over vision.\n"
+                "2. You must choose the most conservative interpretation.\n"
+                "3. 'calm' is NOT a valid output.\n"
+                "4. If unsure, prefer defensive or in pain.\n\n"
+                "Allowed outputs ONLY:\n"
+                "affectionate, angry, backing off, defensive, hungry, happy, "
+                "hunting, in heat, calling kittens, in pain, wanting attention\n\n"
+                "Answer format EXACTLY:\n"
+                "The cat is [label]."
+            )
+        },
+        {
+            "role": "user",
+            "content": (
+                f"AUDIO SIGNAL (PRIMARY): {audio_ctx}\n"
+                f"VISION OBSERVATIONS (SECONDARY): {vlm_desc}\n\n"
+                "FINAL DECISION:"
+            )
+        }
+    ]
+
+    input_text = llm_tok.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+
+    inputs = llm_tok(input_text, return_tensors="pt").to(DEVICE)
+
+    with torch.no_grad():
+        outputs = llm_model.generate(
+            **inputs,
+            max_new_tokens=15,
+            do_sample=False,
+            temperature=0.0,
+            pad_token_id=llm_tok.eos_token_id,
+            eos_token_id=llm_tok.eos_token_id
+        )
+
+    generated = llm_tok.decode(
+        outputs[0][inputs["input_ids"].shape[1]:],
+        skip_special_tokens=True
+    ).lower()
+
+    for cat in CATEGORIES:
+        if cat.replace("_", " ") in generated:
+            return f"The cat is {cat.replace('_', ' ')}."
+
+    # =====================================================
+    # 5. FINAL FAILSAFE (NEVER CALM)
+    # =====================================================
+    return "The cat is defensive."
+
+
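Because the first two tiers of call_peace_judge are plain substring rules, they return before the LLM fallback is ever reached and can be sanity-checked cheaply. A hypothetical smoke test (inputs invented for illustration; assumes the module context above):

print(call_peace_judge("PAIN (62.0%)", "the cat is lying still"))
# -> "The cat is in pain."   (audio guardrail fires first)
print(call_peace_judge("HAPPY (41.2%)", "arched back, puffed fur, ears flat"))
# -> "The cat is defensive." (visual override outranks a low-risk audio label)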
+# ==========================================
+# 3. FULL ANALYSIS PIPELINE (FIXED)
 # ==========================================
 @spaces.GPU(duration=120)
+def analyze_cat_v12_final(video_path):
+    if not video_path:
+        return "❌ No video.", None
+
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    tmp_audio = f"temp_{os.getpid()}_{int(time.time())}.wav"
     start_total = time.time()
 
+    # --------------------------------------------------
+    # Helper: clean VLM repetitions (cheap & mobile-safe)
+    # --------------------------------------------------
+    def clean_vlm_output(text):
+        sentences = text.split(". ")
+        cleaned = []
+        seen = set()
+        for s in sentences:
+            key = s.strip().lower()
+            if key and key not in seen:
+                seen.add(key)
+                cleaned.append(s.strip())
+        return ". ".join(cleaned)
+
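For illustration, the helper above turns "The ears are flat. The ears are flat. The tail is low." into "The ears are flat. The tail is low.", deduplicating whole sentences case-insensitively.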
     try:
+        # =========================
+        # A. AUDIO
+        # =========================
         t_0 = time.time()
         clip = VideoFileClip(video_path)
         audio_probs = np.zeros(len(CATEGORIES))
+
         if clip.audio:
             clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
             w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
+            if len(w) < 48000:
+                w = np.pad(w, (0, 48000 - len(w)))
+
+            mel = librosa.feature.melspectrogram(y=w, sr=16000, n_mels=192)
+            mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
+            img = cv2.resize(
+                (np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))]) * 255).astype(np.uint8),
+                (224, 224)
+            )
+
+            img_t = (
+                torch.tensor(img)
+                .unsqueeze(0)
+                .repeat(1, 3, 1, 1)
+                .float()
+                .to(DEVICE) / 255.0
+            )
+
             with torch.no_grad():
+                pa = F.softmax(audio_models['A'](img_t), dim=1)
+                pb = F.softmax(audio_models['B'](img_t), dim=1)
+                ic = audio_models['ast_ext'](
+                    w, sampling_rate=16000, return_tensors="pt"
+                ).to(DEVICE)
+                pc = F.softmax(audio_models['C'](**ic).logits, dim=1)
 
+            audio_probs = (
+                pa * 0.3468 + pb * 0.2762 + pc * 0.3770
+            ).cpu().numpy()[0]
 
         clip.close()
         t_audio = time.time() - t_0
 
+        # =========================
+        # B. VISION (STABILIZED VLM)
+        # =========================
+        t_1 = time.time()
+        vlm_proc = AutoProcessor.from_pretrained(
+            "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
+        )
+
+        vlm_prompt = (
+            "You are a feline behavior expert.\n"
+            "Describe ONLY observable physical features:\n"
+            "- ears position\n"
+            "- mouth state (open/closed/tense)\n"
+            "- tail position or movement\n"
+            "- body posture\n"
+            "Use short factual sentences.\n"
+            "One observation per sentence.\n"
+            "Do NOT interpret mood."
+        )
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "video", "path": video_path},
+                    {"type": "text", "text": vlm_prompt}
+                ]
+            }
+        ]
+
+        vlm_inputs = vlm_proc.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt"
+        ).to(DEVICE)
+
+        input_length = vlm_inputs["input_ids"].shape[1]
+
+        with torch.no_grad():
+            vlm_out = vlm_model.generate(
+                **vlm_inputs,
+                max_new_tokens=80,
+                do_sample=False,
+                temperature=0.0,
+                repetition_penalty=1.15,   # 🔑 anti-loop
+                no_repeat_ngram_size=5,    # 🔑 anti repeated-phrase
+                pad_token_id=vlm_proc.tokenizer.eos_token_id,
+                eos_token_id=vlm_proc.tokenizer.eos_token_id
+            )
+
+        gen_tokens = vlm_out[0][input_length:]
+        vlm_clean = vlm_proc.batch_decode(
+            [gen_tokens], skip_special_tokens=True
+        )[0]
+
+        vlm_clean = vlm_clean.strip().split("\n")[0]
+        if vlm_clean.lower().startswith("assistant:"):
+            vlm_clean = vlm_clean.split(":", 1)[-1].strip()
+
+        # final anti-repetition cleanup
+        vlm_clean = clean_vlm_output(vlm_clean)
+
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        t_vlm = time.time() - t_1
+
+        # =========================
+        # C. JUDGE
+        # =========================
+        t_2 = time.time()
         top_idx = np.argmax(audio_probs)
         audio_ctx = f"{CATEGORIES[top_idx].upper()} ({audio_probs[top_idx]*100:.1f}%)"
+        judge_decision = call_peace_judge(audio_ctx, vlm_clean)
+        t_llm = time.time() - t_2
+
+        # =========================
+        # D. VISUALS
+        # =========================
+        top5 = np.argsort(audio_probs)[-5:][::-1]
+        fig = px.bar(
+            x=[audio_probs[i] * 100 for i in top5],
+            y=[CATEGORIES[i].upper() for i in top5],
+            orientation="h",
+            title="Top 5 Audio Scores",
+            labels={"x": "Probability (%)", "y": "Emotion"},
+            color=[audio_probs[i] * 100 for i in top5],
+            color_continuous_scale="Viridis"
+        )
+        fig.update_layout(height=400, showlegend=False)
+
+        # =========================
+        # E. FINAL REPORT
+        # =========================
+        t_total = time.time() - start_total
+        report = f"""⚖️ JUDGE VERDICT : {judge_decision}
+------------------------------------------
+👁️ VISION : {vlm_clean}
+📊 AUDIO : {audio_ctx}
+⏱️ TIME : Audio {t_audio:.2f}s | Vision {t_vlm:.2f}s | Judge {t_llm:.2f}s | Total {t_total:.2f}s"""
 
         return report, fig
 
     except Exception as e:
         return f"❌ Error : {str(e)}", None
+
     finally:
+        if os.path.exists(tmp_audio):
+            try:
+                os.remove(tmp_audio)
+            except:
+                pass
+
 
+# --- Gradio interface ---
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🐱 CatSense v12.13 - Vision Pure Mode")
+    gr.Markdown("✅ **SmolVLM2-256M** + **SmolLM2-135M Judge** + Audio Ensemble")
+
     with gr.Row():
         with gr.Column():
+            video_input = gr.Video(label="Cat video")
+            btn = gr.Button("🚀 ANALYZE", variant="primary", size="lg")
+        with gr.Column():
+            report_out = gr.Textbox(label="Full result", lines=12, interactive=False)
+            chart_out = gr.Plot(label="Emotion distribution (Audio)")
+
+    btn.click(analyze_cat_v12_final, inputs=video_input, outputs=[report_out, chart_out])
 
 demo.launch()
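For a quick check outside a Space (where the @spaces.GPU decorator should behave as a simple pass-through), one could call the pipeline directly instead of going through the UI. A minimal sketch under those assumptions, with cat_sample.mp4 as a hypothetical local clip and demo.launch() first moved under a __main__ guard:

if __name__ == "__main__":
    # direct call, bypassing Gradio; returns the text report and the Plotly figure
    report, fig = analyze_cat_v12_final("cat_sample.mp4")
    print(report)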