Spaces:

ericjedha
/

crazycat

Sleeping

App Files Community

ericjedha committed on Dec 30, 2025

Commit

adaf41a

verified ·

1 Parent(s): ce6cd2e

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -25

app.py CHANGED Viewed

@@ -77,8 +77,10 @@ def call_peace_judge(audio_top, vlm_desc):
 # ==========================================
 @spaces.GPU(duration=120)
 def analyze_cat_v12_final(video_path):
-    if not video_path: return "❌ Aucune vidéo.", None
-    if torch.cuda.is_available(): torch.cuda.empty_cache()
     tmp_audio = f"temp_{os.getpid()}.wav"
     start_total = time.time()
@@ -91,7 +93,8 @@ def analyze_cat_v12_final(video_path):
         if clip.audio:
             clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
             w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
-            if len(w) < 48000: w = np.pad(w, (0, 48000 - len(w)))
             mel = librosa.feature.melspectrogram(y=w, sr=16000, n_mels=192)
             mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
             img = cv2.resize((np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))]) * 255).astype(np.uint8), (224, 224))
@@ -105,7 +108,7 @@ def analyze_cat_v12_final(video_path):
         clip.close()
         t_audio = time.time() - t_0
-        # --- B. VISION (Correction Matching Frames) ---
         t_1 = time.time()
         vlm_prompt = (
@@ -114,14 +117,11 @@ def analyze_cat_v12_final(video_path):
             "Based on this, what is the cat's mood?"
         )
-        # On définit le message
         messages = [{"role": "user", "content": [{"type": "video", "path": video_path}, {"type": "text", "text": vlm_prompt}]}]
-        # LA SOLUTION : On utilise le processeur pour transformer les messages en tenseurs directement.
-        # Cela injecte automatiquement le bon nombre de jetons vidéo (93) dans le prompt.
         vlm_inputs = vlm_proc.apply_chat_template(
             messages,
-            add_generation_prompt=True, # Indispensable pour la structure interne du modèle
             tokenize=True,
             return_dict=True,
             return_tensors="pt"
@@ -132,25 +132,13 @@ def analyze_cat_v12_final(video_path):
         vlm_res = vlm_proc.batch_decode(vlm_out, skip_special_tokens=True)[0]
-        # On nettoie tout le bloc "User" et "Assistant" pour n'avoir QUE la réponse brute
         if "assistant" in vlm_res.lower():
             vlm_clean = vlm_res.split("assistant")[-1].strip()
         else:
-            vlm_clean = vlm_res.strip()
         t_vlm = time.time() - t_1
-        # On appelle le processeur directement avec le texte brut
-        vlm_inputs = vlm_proc(text=vlm_prompt, videos=video_path, return_tensors="pt").to(DEVICE)
-        with torch.no_grad():
-            vlm_out = vlm_model.generate(**vlm_inputs, max_new_tokens=100, do_sample=False)
-        vlm_res = vlm_proc.batch_decode(vlm_out, skip_special_tokens=True)[0]
-        # On nettoie juste le prompt de l'affichage
-        vlm_clean = vlm_res.replace(vlm_prompt, "").strip()
-        t_vlm = time.time() - t_1
         # --- C. JUGE ---
         t_2 = time.time()
@@ -161,7 +149,12 @@ def analyze_cat_v12_final(video_path):
         # --- D. VISUELS ---
         top5 = np.argsort(audio_probs)[-5:][::-1]
-        fig = px.bar(x=[audio_probs[i]*100 for i in top5], y=[CATEGORIES[i].upper() for i in top5], orientation='h', title='Scores Audio')
         # --- E. RAPPORT ---
         t_total = time.time() - start_total
@@ -171,11 +164,13 @@ def analyze_cat_v12_final(video_path):
 📊 AUDIO : {audio_ctx}
 ⏱️ TEMPS : Audio {t_audio:.2f}s | Vision {t_vlm:.2f}s | Total {t_total:.2f}s"""
-        if os.path.exists(tmp_audio): os.remove(tmp_audio)
         return report, fig
     except Exception as e:
-        if os.path.exists(tmp_audio): os.remove(tmp_audio)
         return f"❌ Erreur : {str(e)}", None
 # --- Interface Gradio ---

 # ==========================================
 @spaces.GPU(duration=120)
 def analyze_cat_v12_final(video_path):
+    if not video_path:
+        return "❌ Aucune vidéo.", None
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
     tmp_audio = f"temp_{os.getpid()}.wav"
     start_total = time.time()
         if clip.audio:
             clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
             w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
+            if len(w) < 48000:
+                w = np.pad(w, (0, 48000 - len(w)))
             mel = librosa.feature.melspectrogram(y=w, sr=16000, n_mels=192)
             mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
             img = cv2.resize((np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))]) * 255).astype(np.uint8), (224, 224))
         clip.close()
         t_audio = time.time() - t_0
+        # --- B. VISION (CORRIGÉ : une seule fois, via apply_chat_template) ---
         t_1 = time.time()
         vlm_prompt = (
             "Based on this, what is the cat's mood?"
         )
         messages = [{"role": "user", "content": [{"type": "video", "path": video_path}, {"type": "text", "text": vlm_prompt}]}]
         vlm_inputs = vlm_proc.apply_chat_template(
             messages,
+            add_generation_prompt=True,
             tokenize=True,
             return_dict=True,
             return_tensors="pt"
         vlm_res = vlm_proc.batch_decode(vlm_out, skip_special_tokens=True)[0]
+        # Nettoyage robuste de la réponse
         if "assistant" in vlm_res.lower():
             vlm_clean = vlm_res.split("assistant")[-1].strip()
         else:
+            vlm_clean = vlm_res.replace(vlm_prompt, "").strip()
         t_vlm = time.time() - t_1
         # --- C. JUGE ---
         t_2 = time.time()
         # --- D. VISUELS ---
         top5 = np.argsort(audio_probs)[-5:][::-1]
+        fig = px.bar(
+            x=[audio_probs[i]*100 for i in top5],
+            y=[CATEGORIES[i].upper() for i in top5],
+            orientation='h',
+            title='Scores Audio'
+        )
         # --- E. RAPPORT ---
         t_total = time.time() - start_total
 📊 AUDIO : {audio_ctx}
 ⏱️ TEMPS : Audio {t_audio:.2f}s | Vision {t_vlm:.2f}s | Total {t_total:.2f}s"""
+        if os.path.exists(tmp_audio):
+            os.remove(tmp_audio)
         return report, fig
     except Exception as e:
+        if os.path.exists(tmp_audio):
+            os.remove(tmp_audio)
         return f"❌ Erreur : {str(e)}", None
 # --- Interface Gradio ---