Update app.py
app.py
CHANGED
@@ -18,9 +18,8 @@ from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer
 )
-
-import decord
-decord.bridge.set_bridge('torch')  # Needed for SmolVLM video processing
+# MOVIEPY FIX: direct import to avoid the .editor error
+from moviepy import VideoFileClip
 
 # --- Configuration ---
 CATEGORIES = ['affection', 'angry', 'back_off', 'defensive', 'feed_me', 'happy', 'hunt', 'in_heat', 'mother_call', 'pain', 'wants_attention']

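The `.editor` error referenced in the new comment matches the MoviePy 2.0 reorganisation: the `moviepy.editor` module was removed and `VideoFileClip` is now imported from the top-level package. If the Space ever needs to run against either major version, a tolerant import along these lines is one option (a sketch, not part of this commit):

```python
try:
    from moviepy import VideoFileClip          # MoviePy >= 2.0
except ImportError:
    from moviepy.editor import VideoFileClip   # MoviePy 1.x fallback
```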
@@ -32,21 +31,18 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 def load_models():
     print("📥 Initialisation CatSense v12.9 (Pure Logic Mode)...")
 
-    # Eyes: SmolVLM 256M
     vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
     vlm_proc = AutoProcessor.from_pretrained(vlm_id)
     vlm_model = AutoModelForImageTextToText.from_pretrained(
         vlm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
     ).to(DEVICE).eval()
 
-    # Brain: SmolLM 135M
     llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
     llm_tok = AutoTokenizer.from_pretrained(llm_id)
     llm_model = AutoModelForCausalLM.from_pretrained(
         llm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
     ).to(DEVICE).eval()
 
-    # Ears: audio pillars
     audio_models = {}
     for p, repo, f in [('A', 'ericjedha/pilier_a', 'best_pillar_a_e29_f1_0_9005.pth'),
                        ('B', 'ericjedha/pilier_b', 'best_pillar_b_f1_09103.pth')]:

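The body of this loop sits outside the hunk; only the Hub repo names and checkpoint filenames are visible. A typical way to pull such `.pth` files is `hf_hub_download`; the snippet below only fetches and inspects one checkpoint, since the pillar model class that consumes it is not shown in this diff:

```python
from huggingface_hub import hf_hub_download
import torch

# Download one pillar checkpoint from the Hub (cached locally after the first call).
ckpt_path = hf_hub_download(repo_id="ericjedha/pilier_a",
                            filename="best_pillar_a_e29_f1_0_9005.pth")

# Load the raw object on CPU just to inspect it; the real model class lives in the loop body.
state = torch.load(ckpt_path, map_location="cpu")
print(type(state))
```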
@@ -70,7 +66,8 @@ vlm_proc, vlm_model, llm_tok, llm_model, audio_models = load_models()
 # 2. JUDGE LOGIC (NO ASSISTANT)
 # ==========================================
 def call_peace_judge(audio_top, vlm_desc):
-
+    # Raw prompt to keep the verdict from rambling
+    prompt_text = f"Audio: {audio_top}\nVideo Analysis: {vlm_desc}\nFinal Mood Verdict:"
     inputs = llm_tok(prompt_text, return_tensors="pt").to(DEVICE)
 
     with torch.no_grad():

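The remainder of `call_peace_judge` is outside this hunk. For orientation, a minimal greedy-decoding continuation of the visible code would look roughly like this; `max_new_tokens=40` and the prompt-slicing are assumptions, not the committed values:

```python
    with torch.no_grad():
        # max_new_tokens is illustrative only; the committed value is not shown in this hunk.
        out = llm_model.generate(**inputs, max_new_tokens=40, do_sample=False)
    # Decode only the tokens generated after the prompt, i.e. the verdict itself.
    verdict = llm_tok.decode(out[0][inputs["input_ids"].shape[-1]:],
                             skip_special_tokens=True)
    return verdict.strip()
```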
@@ -88,24 +85,21 @@ def call_peace_judge(audio_top, vlm_desc):
 # ==========================================
 @spaces.GPU(duration=120)
 def analyze_cat_v12_final(video_path):
-    if not video_path:
-
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
+    if not video_path: return "❌ Aucune vidéo.", None
+    if torch.cuda.is_available(): torch.cuda.empty_cache()
 
     tmp_audio = f"temp_{os.getpid()}.wav"
     start_total = time.time()
 
     try:
-        # --- A. AUDIO
+        # --- A. AUDIO ---
         t_0 = time.time()
         clip = VideoFileClip(video_path)
         audio_probs = np.zeros(len(CATEGORIES))
         if clip.audio:
             clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
             w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
-            if len(w) < 48000:
-                w = np.pad(w, (0, 48000 - len(w)))
+            if len(w) < 48000: w = np.pad(w, (0, 48000 - len(w)))
             mel = librosa.feature.melspectrogram(y=w, sr=16000, n_mels=192)
             mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
             img = cv2.resize((np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))]) * 255).astype(np.uint8), (224, 224))

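In the audio branch, short clips are padded to 48000 samples, which is 3 seconds at 16 kHz even though up to 5 seconds are loaded, and the mel spectrogram is then turned into a 224×224 8-bit image for the pillar classifiers. The same steps, pulled into a standalone helper for readability (a sketch mirroring the committed code, with a helper name of our choosing, not a replacement for it):

```python
import numpy as np
import librosa
import cv2

def audio_to_image(wav_path: str, sr: int = 16000, n_mels: int = 192) -> np.ndarray:
    # Load up to 5 s of audio and pad short clips to 48000 samples (3 s at 16 kHz).
    w, _ = librosa.load(wav_path, sr=sr, duration=5.0)
    if len(w) < 48000:
        w = np.pad(w, (0, 48000 - len(w)))
    # Mel spectrogram in dB, shifted and scaled to roughly the [0, 1] range.
    mel = librosa.feature.melspectrogram(y=w, sr=sr, n_mels=n_mels)
    mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
    # Append 10 zero rows, rescale to 8-bit, and resize to the 224x224 model input.
    stacked = np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))])
    return cv2.resize((stacked * 255).astype(np.uint8), (224, 224))
```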
@@ -119,57 +113,38 @@ def analyze_cat_v12_final(video_path):
         clip.close()
         t_audio = time.time() - t_0
 
-        # --- B. VISION (avec
+        # --- B. VISION (direct prompt with clean extraction) ---
         t_1 = time.time()
 
-
-
-
-
-
-
-
-
-
-
-
-            ]
-
-        vlm_inputs = vlm_proc.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            return_tensors="pt",
-            return_dict=True
-        ).to(DEVICE)
-
+        # Use the chat template to avoid the matching error, but ask for factual output
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "video", "path": video_path},
+                {"type": "text", "text": "Describe the cat: ears, mouth, tail and body posture. Based on this, what is the cat's mood?"}
+            ]
+        }]
+
+        vlm_inputs = vlm_proc.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(DEVICE)
+
         with torch.no_grad():
-            vlm_out = vlm_model.generate(
-
-
-
-
-
-        vlm_res = vlm_proc.batch_decode(vlm_out, skip_special_tokens=True)[0]
-        vlm_clean = vlm_res.strip()
+            vlm_out = vlm_model.generate(**vlm_inputs, max_new_tokens=80, do_sample=False)
+
+        # Decode, then strip everything before the assistant's reply to keep the raw analysis
+        full_text = vlm_proc.batch_decode(vlm_out, skip_special_tokens=True)[0]
+        vlm_clean = full_text.split("assistant")[-1].strip()
         t_vlm = time.time() - t_1
 
-        # --- C. JUGE
+        # --- C. JUDGE ---
         t_2 = time.time()
-
-        audio_ctx = f"{
-
+        top_idx = np.argmax(audio_probs)
+        audio_ctx = f"{CATEGORIES[top_idx].upper()} ({audio_probs[top_idx]*100:.1f}%)"
         judge_decision = call_peace_judge(audio_ctx, vlm_clean)
         t_llm = time.time() - t_2
 
         # --- D. VISUALS ---
         top5 = np.argsort(audio_probs)[-5:][::-1]
-        fig = px.bar(
-            x=[audio_probs[i]*100 for i in top5],
-            y=[CATEGORIES[i].upper() for i in top5],
-            orientation='h',
-            title='Top 5 Audio Scores',
-            labels={'x': 'Confidence (%)', 'y': 'Émotion'}
-        )
+        fig = px.bar(x=[audio_probs[i]*100 for i in top5], y=[CATEGORIES[i].upper() for i in top5], orientation='h', title='Audio Scores')
 
         # --- E. REPORT ---
         t_total = time.time() - start_total

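Splitting the decoded string on "assistant" relies on the chat template labelling the reply that way, and it would also truncate wrongly if the word appears inside the description itself. A common, more robust alternative is to decode only the newly generated tokens; the slicing below is that variant, not what this commit does:

```python
        # Keep only the tokens produced after the prompt, then decode just those.
        gen_only = vlm_out[:, vlm_inputs["input_ids"].shape[-1]:]
        vlm_clean = vlm_proc.batch_decode(gen_only, skip_special_tokens=True)[0].strip()
```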
@@ -177,23 +152,21 @@ def analyze_cat_v12_final(video_path):
 {judge_decision}
 
 ------------------------------------------
-👁️
+👁️ VISION (VLM) :
 {vlm_clean}
 
-📊 AUDIO
+📊 AUDIO :
 {audio_ctx}
 
-⏱️
+⏱️ CHRONOS :
 Audio: {t_audio:.2f}s | Vision: {t_vlm:.2f}s | Juge: {t_llm:.2f}s
 TOTAL: {t_total:.2f}s"""
 
-        if os.path.exists(tmp_audio):
-            os.remove(tmp_audio)
+        if os.path.exists(tmp_audio): os.remove(tmp_audio)
         return report, fig
 
     except Exception as e:
-        if os.path.exists(tmp_audio):
-            os.remove(tmp_audio)
+        if os.path.exists(tmp_audio): os.remove(tmp_audio)
         return f"❌ Erreur : {str(e)}", None
 
 # --- Gradio interface ---

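Both the success path and the `except` handler delete `tmp_audio` by hand. A `finally` block would cover the two paths with a single cleanup; the following is a sketch of that variant under the same `try` shown above, not the committed code:

```python
    try:
        ...  # audio, vision and judge steps as above
        return report, fig
    except Exception as e:
        return f"❌ Erreur : {str(e)}", None
    finally:
        # Runs after both the return and the except branch, so the temp file is always removed.
        if os.path.exists(tmp_audio):
            os.remove(tmp_audio)
```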
@@ -201,12 +174,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🐱 CatSense v12.9 - Trinité Simplifiée")
     with gr.Row():
         with gr.Column():
-            video_input = gr.Video(
-            btn = gr.Button("🚀 ANALYSER", variant="primary"
+            video_input = gr.Video()
+            btn = gr.Button("🚀 ANALYSER", variant="primary")
         with gr.Column():
-            report_out = gr.Textbox(label="
-            chart_out = gr.Plot(
-
+            report_out = gr.Textbox(label="Résultat", lines=12)
+            chart_out = gr.Plot()
+
     btn.click(analyze_cat_v12_final, inputs=video_input, outputs=[report_out, chart_out])
 
 demo.launch()