ericjedha committed on
Commit
27d8a18
·
verified ·
1 Parent(s): be73d82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -101
app.py CHANGED
@@ -29,7 +29,7 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
29
  # 1. CHARGEMENT DE LA TRINITÉ
30
  # ==========================================
31
  def load_models():
32
- print("📥 Initialisation de la Trinité (VLM + LLM + Audio)...")
33
 
34
  # Yeux : SmolVLM 256M
35
  vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
@@ -66,125 +66,98 @@ def load_models():
66
  vlm_proc, vlm_model, llm_tok, llm_model, audio_models = load_models()
67
 
68
  # ==========================================
69
- # 2. FONCTIONS UTILITAIRES
70
  # ==========================================
71
- def get_audio_probs(audio_path):
72
- w, _ = librosa.load(audio_path, sr=16000, duration=5.0)
73
- if len(w) < 48000: w = np.pad(w, (0, 48000-len(w)))
74
- mel = librosa.feature.melspectrogram(y=w, sr=16000, n_mels=192)
75
- mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
76
- img = cv2.resize((np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))]) * 255).astype(np.uint8), (224, 224))
77
- img_t = torch.tensor(img).unsqueeze(0).repeat(1, 3, 1, 1).float().to(DEVICE) / 255.0
78
- with torch.no_grad():
79
- pa = F.softmax(audio_models['A'](img_t), dim=1)
80
- pb = F.softmax(audio_models['B'](img_t), dim=1)
81
- ic = audio_models['ast_ext'](w, sampling_rate=16000, return_tensors="pt").to(DEVICE)
82
- pc = F.softmax(audio_models['C'](**ic).logits, dim=1)
83
- return (pa * 0.3468 + pb * 0.2762 + pc * 0.3770).cpu().numpy()[0]
84
-
85
  def call_peace_judge(audio_top, vlm_desc):
86
- prompt_text = f"""You are a feline behavior expert. Decide the final cat mood.
87
- CONTEXT:
88
- - Audio Sensor predicts: {audio_top}
89
- - Video Sensor describes: {vlm_desc}
90
-
91
- RULES:
92
- - If Video describes 'ears back', 'teeth', or 'rigid', prioritize BACK_OFF/ANGRY.
93
- - Be concise and avoid repetition.
94
-
95
- VERDICT: [CATEGORY NAME]
96
- REASON: [1 short sentence]"""
97
 
98
  messages = [{"role": "user", "content": prompt_text}]
99
- full_prompt_string = llm_tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
100
- model_inputs = llm_tok(full_prompt_string, return_tensors="pt").to(DEVICE)
101
 
102
  with torch.no_grad():
103
- generated_ids = llm_model.generate(
104
- **model_inputs,
105
- max_new_tokens=80,
106
- temperature=0.1,
107
- do_sample=False,
108
- repetition_penalty=1.2,
109
- pad_token_id=llm_tok.eos_token_id
110
  )
111
 
112
- decoded = llm_tok.decode(generated_ids[0][len(model_inputs["input_ids"][0]):], skip_special_tokens=True)
113
- return decoded.strip()
114
 
115
  # ==========================================
116
- # 3. PIPELINE ANALYSE V12.1
117
  # ==========================================
118
  @spaces.GPU(duration=60)
119
  def analyze_cat_v12_final(video_path):
120
  if not video_path: return "❌ Aucune vidéo.", None, None
 
 
121
  tmp_audio = f"temp_{os.getpid()}.wav"
122
  tmp_output_video = f"annotated_{os.getpid()}.mp4"
123
  start_total = time.time()
124
 
125
  try:
126
- # --- PHASE 1: AUDIO (Les Oreilles) ---
127
- t_audio_start = time.time()
128
  clip = VideoFileClip(video_path)
129
  audio_probs = np.zeros(len(CATEGORIES))
130
  if clip.audio:
131
  clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
132
- audio_probs = get_audio_probs(tmp_audio)
 
 
 
 
 
 
 
 
 
 
 
 
133
  clip.close()
134
- t_audio = time.time() - t_audio_start
135
-
136
- # --- PHASE 2: VISION (Les Yeux - FIX BY GROK) ---
137
- t_vlm_start = time.time()
138
- vlm_prompt = (
139
- "Analyze the cat body language precisely.\n"
140
- "EXAMPLE:\nDescription: Ears back, mouth open.\nAvis: Defensive.\n\n"
141
- "YOUR TURN:\n1. Description: Describe ears and posture.\n2. Avis: Mood?"
142
- )
143
- messages = [
144
- {
145
- "role": "user",
146
- "content": [
147
- {"type": "video", "video_path": video_path}, # FIX ICI
148
- {"type": "text", "text": vlm_prompt}
149
- ]
150
- }
151
- ]
152
 
153
- # Application du template officiel pour SmolVLM2-Video
154
- vlm_inputs = vlm_proc.apply_chat_template(
155
- messages,
156
- add_generation_prompt=True,
157
- tokenize=True,
158
- return_dict=True,
159
- return_tensors="pt"
160
- ).to(DEVICE)
161
 
162
  with torch.no_grad():
163
- vlm_out = vlm_model.generate(**vlm_inputs, max_new_tokens=100, do_sample=False)
164
 
165
  vlm_res = vlm_proc.batch_decode(vlm_out, skip_special_tokens=True)[0]
166
- vlm_clean = vlm_res.split("YOUR TURN:")[-1].strip() if "YOUR TURN:" in vlm_res else vlm_res.strip()
167
- t_vlm = time.time() - t_vlm_start
168
 
169
- # --- PHASE 3: JUGE (Le Cerveau) ---
170
- t_llm_start = time.time()
171
- top_a_idx = np.argmax(audio_probs)
172
- audio_context = f"{CATEGORIES[top_a_idx].upper()} ({audio_probs[top_a_idx]*100:.1f}%)"
173
 
174
- judge_decision = call_peace_judge(audio_context, vlm_clean)
175
- t_llm = time.time() - t_llm_start
176
 
177
- # Extraction du verdict final
178
- final_verdict = CATEGORIES[top_a_idx].upper()
179
- for cat in CATEGORIES:
180
- if cat.upper() in judge_decision.upper():
181
- final_verdict = cat.upper()
182
- break
183
-
184
- # --- PHASE 4: ANNOTATION & EXPORT ---
185
  top5 = np.argsort(audio_probs)[-5:][::-1]
186
- fig = px.bar(x=[audio_probs[i]*100 for i in top5], y=[CATEGORIES[i].upper() for i in top5], orientation='h', title='Entrée Audio')
187
 
 
 
 
 
 
188
  cap = cv2.VideoCapture(video_path)
189
  fps, w_v, h_v = cap.get(cv2.CAP_PROP_FPS), int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
190
  tmp_no_audio = f"no_audio_{os.getpid()}.mp4"
@@ -193,28 +166,26 @@ def analyze_cat_v12_final(video_path):
193
  ret, frame = cap.read()
194
  if not ret: break
195
  cv2.rectangle(frame, (0,0), (w_v, 65), (0,0,0), -1)
196
- cv2.putText(frame, f"JUDGE: {final_verdict}", (20, 45), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 255), 3)
197
  out_v.write(frame)
198
  cap.release(); out_v.release()
199
  subprocess.run(['ffmpeg', '-i', tmp_no_audio, '-i', video_path, '-c:v', 'copy', '-c:a', 'aac', '-map', '0:v:0', '-map', '1:a:0', '-y', tmp_output_video], capture_output=True)
200
 
201
- # --- PHASE 5: RAPPORT FINAL ---
202
  t_total = time.time() - start_total
203
- report = f"""⚖️ DÉCISION DU JUGE DE PAIX :
204
  {judge_decision}
205
 
206
  ------------------------------------------
207
- 👁️ ANALYSE VISUELLE (VLM) :
208
  {vlm_clean}
209
 
210
- 📊 DONNÉES AUDIO :
211
- {audio_context}
212
 
213
- ⏱️ CHRONOMÈTRES :
214
- Audio (Piliers A/B/C) : {t_audio:.2f}s
215
- • Vision (SmolVLM) : {t_vlm:.2f}s
216
- • Juge (SmolLM) : {t_llm:.2f}s
217
- • TOTAL : {t_total:.2f}s"""
218
 
219
  if os.path.exists(tmp_audio): os.remove(tmp_audio)
220
  if os.path.exists(tmp_no_audio): os.remove(tmp_no_audio)
@@ -225,15 +196,15 @@ def analyze_cat_v12_final(video_path):
225
 
226
  # --- Interface Gradio ---
227
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
228
- gr.Markdown("# 🐱 CatSense POC v12.1 - Trinité Architecture")
229
  with gr.Row():
230
  with gr.Column():
231
  video_input = gr.Video()
232
- btn = gr.Button("🚀 ANALYSE MULTIMODALE", variant="primary")
233
  with gr.Column():
234
- report_out = gr.Textbox(label="Rapport Expert", lines=18)
235
  chart_out = gr.Plot()
236
- video_out = gr.Video(label="Vidéo Expertisée")
237
  btn.click(analyze_cat_v12_final, inputs=video_input, outputs=[report_out, chart_out, video_out])
238
 
239
  demo.launch()
 
29
  # 1. CHARGEMENT DE LA TRINITÉ
30
  # ==========================================
31
  def load_models():
32
+ print("📥 Initialisation CatSense v12.2 (Stateless Mode)...")
33
 
34
  # Yeux : SmolVLM 256M
35
  vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
 
66
  vlm_proc, vlm_model, llm_tok, llm_model, audio_models = load_models()
67
 
68
  # ==========================================
69
+ # 2. LOGIQUE DU JUGE (FEW-SHOT & FAST)
70
  # ==========================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
def call_peace_judge(audio_top, vlm_desc):
    """Fuse the audio and vision signals into one final cat-mood verdict.

    Args:
        audio_top: Short string summarizing the top audio prediction,
            e.g. "HAPPY (87.3%)".
        vlm_desc: Free-text description of the cat produced by the VLM.

    Returns:
        The first line of the judge LLM's answer, stripped of whitespace —
        expected to contain the verdict category name.
    """
    # Ultra-short few-shot prompt so the tiny 135M judge model does not ramble.
    prompt_text = f"""Task: Decide final cat mood.
Example: Audio=HAPPY, Video=Ears back/Hissing -> Verdict: BACK_OFF.
Current: Audio={audio_top}, Video={vlm_desc}.
Verdict:"""

    messages = [{"role": "user", "content": prompt_text}]
    full_prompt = llm_tok.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = llm_tok(full_prompt, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        # Greedy decoding: do_sample=False is already deterministic, so no
        # temperature is passed (transformers ignores it and warns when
        # sampling is disabled). pad_token_id is set explicitly because
        # many small chat models ship without a pad token.
        outputs = llm_model.generate(
            **inputs,
            max_new_tokens=30,
            repetition_penalty=1.5,
            do_sample=False,
            pad_token_id=llm_tok.eos_token_id,
        )

    # Decode only the newly generated tokens (skip the echoed prompt).
    prompt_len = inputs["input_ids"].shape[1]
    res = llm_tok.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return res.strip().split('\n')[0]
93
 
94
  # ==========================================
95
+ # 3. PIPELINE ANALYSE (STATELESS)
96
  # ==========================================
97
  @spaces.GPU(duration=60)
98
  def analyze_cat_v12_final(video_path):
99
  if not video_path: return "❌ Aucune vidéo.", None, None
100
+ if torch.cuda.is_available(): torch.cuda.empty_cache() # Purge mémoire vive
101
+
102
  tmp_audio = f"temp_{os.getpid()}.wav"
103
  tmp_output_video = f"annotated_{os.getpid()}.mp4"
104
  start_total = time.time()
105
 
106
  try:
107
+ # --- A. AUDIO (Oreilles) ---
108
+ t_0 = time.time()
109
  clip = VideoFileClip(video_path)
110
  audio_probs = np.zeros(len(CATEGORIES))
111
  if clip.audio:
112
  clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
113
+ # Logique simplifiée get_audio_probs intégrée ici pour stabilité
114
+ w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
115
+ if len(w) < 48000: w = np.pad(w, (0, 48000-len(w)))
116
+ mel = librosa.feature.melspectrogram(y=w, sr=16000, n_mels=192)
117
+ mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
118
+ img = cv2.resize((np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))]) * 255).astype(np.uint8), (224, 224))
119
+ img_t = torch.tensor(img).unsqueeze(0).repeat(1, 3, 1, 1).float().to(DEVICE) / 255.0
120
+ with torch.no_grad():
121
+ pa = F.softmax(audio_models['A'](img_t), dim=1)
122
+ pb = F.softmax(audio_models['B'](img_t), dim=1)
123
+ ic = audio_models['ast_ext'](w, sampling_rate=16000, return_tensors="pt").to(DEVICE)
124
+ pc = F.softmax(audio_models['C'](**ic).logits, dim=1)
125
+ audio_probs = (pa * 0.3468 + pb * 0.2762 + pc * 0.3770).cpu().numpy()[0]
126
  clip.close()
127
+ t_audio = time.time() - t_0
128
+
129
+ # --- B. VISION (Yeux - Nouveau Prompt Direct) ---
130
+ t_1 = time.time()
131
+ # On ne donne plus d'exemple au VLM pour éviter qu'il ne les répète
132
+ vlm_prompt = "Describe the cat's ears and mouth. Then name the mood."
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
+ messages = [{"role": "user", "content": [{"type": "video", "video_path": video_path}, {"type": "text", "text": vlm_prompt}]}]
135
+ vlm_inputs = vlm_proc.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(DEVICE)
 
 
 
 
 
 
136
 
137
  with torch.no_grad():
138
+ vlm_out = vlm_model.generate(**vlm_inputs, max_new_tokens=50, do_sample=True, temperature=0.1)
139
 
140
  vlm_res = vlm_proc.batch_decode(vlm_out, skip_special_tokens=True)[0]
141
+ vlm_clean = vlm_res.split("assistant")[-1].strip()
142
+ t_vlm = time.time() - t_1
143
 
144
+ # --- C. JUGE (Cerveau) ---
145
+ t_2 = time.time()
146
+ top_a_label = CATEGORIES[np.argmax(audio_probs)].upper()
147
+ audio_ctx = f"{top_a_label} ({np.max(audio_probs)*100:.1f}%)"
148
 
149
+ judge_decision = call_peace_judge(audio_ctx, vlm_clean)
150
+ t_llm = time.time() - t_2
151
 
152
+ # --- D. VISUELS & EXPORT ---
 
 
 
 
 
 
 
153
  top5 = np.argsort(audio_probs)[-5:][::-1]
154
+ fig = px.bar(x=[audio_probs[i]*100 for i in top5], y=[CATEGORIES[i].upper() for i in top5], orientation='h', title='Audio Scores')
155
 
156
+ # Annotation vidéo simplifiée
157
+ final_v = top_a_label
158
+ for cat in CATEGORIES:
159
+ if cat.upper() in judge_decision.upper(): final_v = cat.upper(); break
160
+
161
  cap = cv2.VideoCapture(video_path)
162
  fps, w_v, h_v = cap.get(cv2.CAP_PROP_FPS), int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
163
  tmp_no_audio = f"no_audio_{os.getpid()}.mp4"
 
166
  ret, frame = cap.read()
167
  if not ret: break
168
  cv2.rectangle(frame, (0,0), (w_v, 65), (0,0,0), -1)
169
+ cv2.putText(frame, f"JUDGE: {final_v}", (20, 45), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 255), 3)
170
  out_v.write(frame)
171
  cap.release(); out_v.release()
172
  subprocess.run(['ffmpeg', '-i', tmp_no_audio, '-i', video_path, '-c:v', 'copy', '-c:a', 'aac', '-map', '0:v:0', '-map', '1:a:0', '-y', tmp_output_video], capture_output=True)
173
 
174
+ # --- E. RAPPORT ---
175
  t_total = time.time() - start_total
176
+ report = f"""⚖️ DÉCISION DU JUGE :
177
  {judge_decision}
178
 
179
  ------------------------------------------
180
+ 👁️ VISION (VLM) :
181
  {vlm_clean}
182
 
183
+ 📊 AUDIO :
184
+ {audio_ctx}
185
 
186
+ ⏱️ CHRONOS :
187
+ Audio: {t_audio:.2f}s | Vision: {t_vlm:.2f}s | Juge: {t_llm:.2f}s
188
+ TOTAL: {t_total:.2f}s"""
 
 
189
 
190
  if os.path.exists(tmp_audio): os.remove(tmp_audio)
191
  if os.path.exists(tmp_no_audio): os.remove(tmp_no_audio)
 
196
 
197
  # --- Interface Gradio ---
198
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
199
+ gr.Markdown("# 🐱 CatSense POC v12.2 - Final Trinité")
200
  with gr.Row():
201
  with gr.Column():
202
  video_input = gr.Video()
203
+ btn = gr.Button("🚀 ANALYSE EXPERTE", variant="primary")
204
  with gr.Column():
205
+ report_out = gr.Textbox(label="Rapport Expert", lines=12)
206
  chart_out = gr.Plot()
207
+ video_out = gr.Video(label="Vidéo Annotée")
208
  btn.click(analyze_cat_v12_final, inputs=video_input, outputs=[report_out, chart_out, video_out])
209
 
210
  demo.launch()