Update app.py
Browse files
app.py
CHANGED
|
@@ -1,17 +1,239 @@
|
|
| 1 |
-
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
------------------------------------------
|
| 5 |
-
👁️
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
|
|
|
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn.functional as F
|
| 3 |
+
import gradio as gr
|
| 4 |
+
import librosa
|
| 5 |
+
import numpy as np
|
| 6 |
+
import cv2
|
| 7 |
+
import timm
|
| 8 |
+
import os
|
| 9 |
+
import time
|
| 10 |
+
import spaces
|
| 11 |
+
import plotly.express as px
|
| 12 |
+
from huggingface_hub import hf_hub_download
|
| 13 |
+
from transformers import (
|
| 14 |
+
AutoProcessor,
|
| 15 |
+
AutoModelForImageTextToText,
|
| 16 |
+
ASTFeatureExtractor,
|
| 17 |
+
ASTForAudioClassification,
|
| 18 |
+
AutoModelForCausalLM,
|
| 19 |
+
AutoTokenizer
|
| 20 |
+
)
|
| 21 |
+
from moviepy import VideoFileClip
|
| 22 |
+
import subprocess
|
| 23 |
+
|
| 24 |
+
# --- Configuration ---
# Closed set of cat vocalization / mood classes the audio pillars were trained on.
# Order matters: classifier output indices map 1:1 onto this list.
CATEGORIES = ['affection', 'angry', 'back_off', 'defensive', 'feed_me', 'happy', 'hunt', 'in_heat', 'mother_call', 'pain', 'wants_attention']
# Single global device for all models; falls back to CPU when no GPU is visible.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 27 |
+
|
| 28 |
+
# ==========================================
# 1. LOADING THE TRINITY
# ==========================================
def load_models():
    """Load the three model components: vision (VLM), judge (LLM) and audio pillars.

    Returns:
        tuple: (vlm_proc, vlm_model, llm_tok, llm_model, audio_models) where
        ``audio_models`` maps 'A'/'B' to ViT spectrogram classifiers, 'C' to a
        fine-tuned AST model, and 'ast_ext' to the AST feature extractor.
    """
    print("📥 Initialisation de la Trinité (VLM + LLM + Audio)...")

    # Eyes: SmolVLM2 256M, video-capable vision-language model.
    vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
    vlm_proc = AutoProcessor.from_pretrained(vlm_id)
    vlm_model = AutoModelForImageTextToText.from_pretrained(
        vlm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
    ).to(DEVICE).eval()

    # Brain: SmolLM2 135M, used as the arbiter ("peace judge").
    llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
    llm_tok = AutoTokenizer.from_pretrained(llm_id)
    llm_model = AutoModelForCausalLM.from_pretrained(
        llm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
    ).to(DEVICE).eval()

    # Ears: audio pillars A and B — ViT-small classifiers over mel-spectrogram
    # images, loaded from project checkpoints on the Hub.
    audio_models = {}
    for p, repo, f in [('A', 'ericjedha/pilier_a', 'best_pillar_a_e29_f1_0_9005.pth'),
                       ('B', 'ericjedha/pilier_b', 'best_pillar_b_f1_09103.pth')]:
        path = hf_hub_download(repo_id=repo, filename=f)
        m = timm.create_model("vit_small_patch16_224", num_classes=len(CATEGORIES), in_chans=3)
        m.load_state_dict(torch.load(path, map_location=DEVICE)['model_state_dict'])
        audio_models[p] = m.to(DEVICE).eval()

    # Pillar C: AST (Audio Spectrogram Transformer) with a re-sized head for
    # our categories, then overlaid with the fine-tuned checkpoint.
    path_c = hf_hub_download(repo_id="ericjedha/pilier_c", filename="best_pillar_c_ast_v95_2_f1_0_9109.pth")
    model_c = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593", num_labels=len(CATEGORIES), ignore_mismatched_sizes=True)
    sd = torch.load(path_c, map_location=DEVICE)['model_state_dict']
    # Checkpoint keys carry an 'ast.' prefix; strip it so they match the HF
    # model's parameter names. strict=False tolerates any leftover mismatch.
    model_c.load_state_dict({k.replace('ast.', ''): v for k, v in sd.items()}, strict=False)
    audio_models['C'] = model_c.to(DEVICE).eval()
    audio_models['ast_ext'] = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

    return vlm_proc, vlm_model, llm_tok, llm_model, audio_models
|
| 65 |
+
|
| 66 |
+
# Load everything once at import time so every request reuses the same models.
vlm_proc, vlm_model, llm_tok, llm_model, audio_models = load_models()
|
| 67 |
+
|
| 68 |
+
# ==========================================
# 2. UTILITY FUNCTIONS
# ==========================================
def get_audio_probs(audio_path):
    """Run the three audio pillars on an audio file and return fused probabilities.

    Args:
        audio_path: path to an audio file readable by librosa (16 kHz mono is
            enforced at load time).

    Returns:
        np.ndarray of shape (len(CATEGORIES),): weighted average of the
        softmax outputs of pillars A, B and C.
    """
    # Load at 16 kHz, capped at 5 s of audio.
    w, _ = librosa.load(audio_path, sr=16000, duration=5.0)
    # Zero-pad to at least 48000 samples (3 s). NOTE(review): pad floor is 3 s
    # while the load cap is 5 s — confirm this asymmetry is intentional.
    if len(w) < 48000: w = np.pad(w, (0, 48000-len(w)))
    mel = librosa.feature.melspectrogram(y=w, sr=16000, n_mels=192)
    # Map the dB spectrogram's [-40, 0] range roughly into [0, 1].
    mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
    # Pad with 10 zero rows, quantize to uint8 and resize to the ViT input size.
    img = cv2.resize((np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))]) * 255).astype(np.uint8), (224, 224))
    # (224, 224) -> (1, 3, 224, 224): repeat() broadcasts the unsqueezed tensor
    # across 3 channels; values are normalized back into [0, 1].
    img_t = torch.tensor(img).unsqueeze(0).repeat(1, 3, 1, 1).float().to(DEVICE) / 255.0
    with torch.no_grad():
        pa = F.softmax(audio_models['A'](img_t), dim=1)
        pb = F.softmax(audio_models['B'](img_t), dim=1)
        # Pillar C consumes the raw waveform via the AST feature extractor.
        ic = audio_models['ast_ext'](w, sampling_rate=16000, return_tensors="pt").to(DEVICE)
        pc = F.softmax(audio_models['C'](**ic).logits, dim=1)
    # Fixed fusion weights — presumably tuned on a validation set; verify source.
    return (pa * 0.3468 + pb * 0.2762 + pc * 0.3770).cpu().numpy()[0]
|
| 84 |
+
|
| 85 |
+
def call_peace_judge(audio_top, vlm_desc):
    """Ask the judge LLM to reconcile audio and video evidence into a verdict.

    Args:
        audio_top: summary of the top audio prediction, e.g. "ANGRY (82.1%)".
        vlm_desc: free-text body-language description produced by the VLM.

    Returns:
        str: the model's raw decision text (expected to contain the
        "VERDICT:" / "REASON:" lines requested by the prompt).
    """
    prompt_text = f"""You are a feline behavior expert. Decide the final cat mood.
CONTEXT:
- Audio Sensor predicts: {audio_top}
- Video Sensor describes: {vlm_desc}

RULES:
- If Video describes 'ears back', 'teeth', or 'rigid', prioritize BACK_OFF/ANGRY.
- Be concise and avoid repetition.

VERDICT: [CATEGORY NAME]
REASON: [1 short sentence]"""

    messages = [{"role": "user", "content": prompt_text}]
    full_prompt_string = llm_tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = llm_tok(full_prompt_string, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        generated_ids = llm_model.generate(
            **model_inputs,
            max_new_tokens=80,
            # Greedy decoding. The original also passed temperature=0.1, which
            # is ignored when do_sample=False and triggers a transformers
            # warning — removed as dead configuration.
            do_sample=False,
            repetition_penalty=1.2,
            pad_token_id=llm_tok.eos_token_id
        )

    # Decode only the newly generated tokens, skipping the echoed prompt.
    decoded = llm_tok.decode(generated_ids[0][len(model_inputs["input_ids"][0]):], skip_special_tokens=True)
    return decoded.strip()
|
| 114 |
+
|
| 115 |
+
# ==========================================
# 3. ANALYSIS PIPELINE V12.1
# ==========================================
@spaces.GPU(duration=60)
def analyze_cat_v12_final(video_path):
    """Full multimodal pipeline: audio ensemble + VLM description + LLM judge.

    Args:
        video_path: path to the uploaded video, or falsy when nothing was
            uploaded.

    Returns:
        tuple: (report text, plotly bar chart of top-5 audio classes, path to
        the annotated output video). On failure: (error message, None, None).
    """
    if not video_path: return "❌ Aucune vidéo.", None, None
    tmp_audio = f"temp_{os.getpid()}.wav"
    tmp_no_audio = f"no_audio_{os.getpid()}.mp4"
    tmp_output_video = f"annotated_{os.getpid()}.mp4"
    start_total = time.time()

    try:
        # --- PHASE 1: AUDIO (the ears) ---
        t_audio_start = time.time()
        clip = VideoFileClip(video_path)
        audio_probs = np.zeros(len(CATEGORIES))
        has_audio = clip.audio is not None
        if has_audio:
            clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
            audio_probs = get_audio_probs(tmp_audio)
        clip.close()
        t_audio = time.time() - t_audio_start

        # --- PHASE 2: VISION (the eyes) ---
        t_vlm_start = time.time()
        vlm_prompt = (
            "Analyze the cat body language precisely.\n"
            "EXAMPLE:\nDescription: Ears back, mouth open.\nAvis: Defensive.\n\n"
            "YOUR TURN:\n1. Description: Describe ears and posture.\n2. Avis: Mood?"
        )
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video_path": video_path},
                    {"type": "text", "text": vlm_prompt}
                ]
            }
        ]

        # Official chat template path for SmolVLM2-Video inputs.
        vlm_inputs = vlm_proc.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt"
        ).to(DEVICE)

        with torch.no_grad():
            vlm_out = vlm_model.generate(**vlm_inputs, max_new_tokens=100, do_sample=False)

        vlm_res = vlm_proc.batch_decode(vlm_out, skip_special_tokens=True)[0]
        # Keep only the model's answer (text after the echoed prompt marker).
        vlm_clean = vlm_res.split("YOUR TURN:")[-1].strip() if "YOUR TURN:" in vlm_res else vlm_res.strip()
        t_vlm = time.time() - t_vlm_start

        # --- PHASE 3: JUDGE (the brain) ---
        t_llm_start = time.time()
        top_a_idx = np.argmax(audio_probs)
        audio_context = f"{CATEGORIES[top_a_idx].upper()} ({audio_probs[top_a_idx]*100:.1f}%)"

        judge_decision = call_peace_judge(audio_context, vlm_clean)
        t_llm = time.time() - t_llm_start

        # Extract the final verdict from the judge's free text; fall back to
        # the top audio class when no known category is mentioned.
        final_verdict = CATEGORIES[top_a_idx].upper()
        for cat in CATEGORIES:
            if cat.upper() in judge_decision.upper():
                final_verdict = cat.upper()
                break

        # --- PHASE 4: ANNOTATION & EXPORT ---
        top5 = np.argsort(audio_probs)[-5:][::-1]
        fig = px.bar(x=[audio_probs[i]*100 for i in top5], y=[CATEGORIES[i].upper() for i in top5], orientation='h', title='Entrée Audio')

        cap = cv2.VideoCapture(video_path)
        fps, w_v, h_v = cap.get(cv2.CAP_PROP_FPS), int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        out_v = cv2.VideoWriter(tmp_no_audio, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w_v, h_v))
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret: break
            # Black banner + verdict overlay on every frame.
            cv2.rectangle(frame, (0,0), (w_v, 65), (0,0,0), -1)
            cv2.putText(frame, f"JUDGE: {final_verdict}", (20, 45), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 255), 3)
            out_v.write(frame)
        cap.release(); out_v.release()
        if has_audio:
            # Re-mux the original audio track onto the annotated frames.
            subprocess.run(['ffmpeg', '-i', tmp_no_audio, '-i', video_path, '-c:v', 'copy', '-c:a', 'aac', '-map', '0:v:0', '-map', '1:a:0', '-y', tmp_output_video], capture_output=True)
        else:
            # Fix: the original always mapped '1:a:0'; with no source audio
            # stream ffmpeg fails silently and the output file never exists.
            subprocess.run(['ffmpeg', '-i', tmp_no_audio, '-c:v', 'copy', '-y', tmp_output_video], capture_output=True)

        # --- PHASE 5: FINAL REPORT ---
        t_total = time.time() - start_total
        report = f"""⚖️ DÉCISION DU JUGE DE PAIX :
{judge_decision}

------------------------------------------
👁️ ANALYSE VISUELLE (VLM) :
{vlm_clean}

📊 DONNÉES AUDIO :
{audio_context}

⏱️ CHRONOMÈTRES :
• Audio (Piliers A/B/C) : {t_audio:.2f}s
• Vision (SmolVLM) : {t_vlm:.2f}s
• Juge (SmolLM) : {t_llm:.2f}s
• TOTAL : {t_total:.2f}s"""

        return report, fig, tmp_output_video

    except Exception as e:
        return f"❌ Erreur : {str(e)}", None, None
    finally:
        # Fix: cleanup previously ran only on the success path, leaking temp
        # files whenever any phase raised.
        for tmp in (tmp_audio, tmp_no_audio):
            if os.path.exists(tmp): os.remove(tmp)
|
| 225 |
|
| 226 |
+
# --- Gradio interface ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🐱 CatSense POC v12.1 - Trinité Architecture")
    with gr.Row():
        with gr.Column():
            # Left column: video upload + analysis trigger.
            video_input = gr.Video()
            btn = gr.Button("🚀 ANALYSE MULTIMODALE", variant="primary")
        with gr.Column():
            # Right column: expert text report, audio-probability chart and
            # the annotated output video.
            report_out = gr.Textbox(label="Rapport Expert", lines=18)
            chart_out = gr.Plot()
            video_out = gr.Video(label="Vidéo Expertisée")
    btn.click(analyze_cat_v12_final, inputs=video_input, outputs=[report_out, chart_out, video_out])

demo.launch()
|