ericjedha committed on
Commit
bf64010
·
verified ·
1 Parent(s): 283a965

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -84
app.py CHANGED
@@ -11,14 +11,14 @@ import spaces
11
  import plotly.express as px
12
  from huggingface_hub import hf_hub_download
13
  from transformers import (
14
- AutoProcessor,
15
- AutoModelForImageTextToText,
16
- ASTFeatureExtractor,
17
  ASTForAudioClassification,
18
  AutoModelForCausalLM,
19
  AutoTokenizer
20
  )
21
- from moviepy import VideoFileClip
22
 
23
  # --- Configuration ---
24
  CATEGORIES = ['affection', 'angry', 'back_off', 'defensive', 'feed_me', 'happy', 'hunt', 'in_heat', 'mother_call', 'pain', 'wants_attention']
@@ -29,9 +29,9 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
29
  # ==========================================
30
  def load_models():
31
  print("📥 Initialisation CatSense v12.13 (Vision Pure Mode)...")
32
-
33
- # On charge SEULEMENT le modèle VLM (lourd), pas le processor
34
- vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
35
  vlm_model = AutoModelForImageTextToText.from_pretrained(
36
  vlm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
37
  ).to(DEVICE).eval()
@@ -45,23 +45,23 @@ def load_models():
45
 
46
  # Audio models
47
  audio_models = {}
48
- for p, repo, f in [('A', 'ericjedha/pilier_a', 'best_pillar_a_e29_f1_0_9005.pth'),
49
- ('B', 'ericjedha/pilier_b', 'best_pillar_b_f1_09103.pth')]:
50
  path = hf_hub_download(repo_id=repo, filename=f)
51
  m = timm.create_model("vit_small_patch16_224", num_classes=len(CATEGORIES), in_chans=3)
52
  m.load_state_dict(torch.load(path, map_location=DEVICE)['model_state_dict'])
53
  audio_models[p] = m.to(DEVICE).eval()
54
-
55
  path_c = hf_hub_download(repo_id="ericjedha/pilier_c", filename="best_pillar_c_ast_v95_2_f1_0_9109.pth")
56
  model_c = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593", num_labels=len(CATEGORIES), ignore_mismatched_sizes=True)
57
  sd = torch.load(path_c, map_location=DEVICE)['model_state_dict']
58
  model_c.load_state_dict({k.replace('ast.', ''): v for k, v in sd.items()}, strict=False)
59
  audio_models['C'] = model_c.to(DEVICE).eval()
60
  audio_models['ast_ext'] = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
61
-
62
  return vlm_model, llm_tok, llm_model, audio_models
63
 
64
- # Chargement global des modèles lourds (pas du processor VLM)
65
  vlm_model, llm_tok, llm_model, audio_models = load_models()
66
 
67
  # ==========================================
@@ -77,99 +77,112 @@ def call_peace_judge(audio_top, vlm_desc):
77
  inputs = llm_tok(prompt_text, return_tensors="pt").to(DEVICE)
78
  with torch.no_grad():
79
  outputs = llm_model.generate(
80
- **inputs,
81
- max_new_tokens=25,
82
  do_sample=True,
83
  temperature=0.4,
84
  top_p=0.9,
85
  pad_token_id=llm_tok.eos_token_id
86
  )
87
  res = llm_tok.decode(outputs[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
88
- # Nettoyer les sauts de ligne, points, et garder une seule phrase
89
  res = res.strip().split('\n')[0].split('.')[0].strip()
90
  if not res.startswith("The cat"):
91
  res = "The cat " + res.lower()
92
  return res
93
 
94
  # ==========================================
95
- # 3. PIPELINE ANALYSE (Processor VLM FRESH à chaque appel)
96
  # ==========================================
97
  @spaces.GPU(duration=120)
98
  def analyze_cat_v12_final(video_path):
99
- if not video_path:
100
  return "❌ Aucune vidéo.", None
101
- if torch.cuda.is_available():
102
- torch.cuda.empty_cache()
103
 
104
- tmp_audio = f"temp_{os.getpid()}.wav"
 
 
 
105
  start_total = time.time()
106
-
107
  try:
108
  # --- A. AUDIO ---
109
  t_0 = time.time()
110
  clip = VideoFileClip(video_path)
111
  audio_probs = np.zeros(len(CATEGORIES))
 
112
  if clip.audio:
113
- clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
114
  w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
115
- if len(w) < 48000:
116
  w = np.pad(w, (0, 48000 - len(w)))
 
117
  mel = librosa.feature.melspectrogram(y=w, sr=16000, n_mels=192)
118
  mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
119
- img = cv2.resize((np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))]) * 255).astype(np.uint8), (224, 224))
 
 
 
120
  img_t = torch.tensor(img).unsqueeze(0).repeat(1, 3, 1, 1).float().to(DEVICE) / 255.0
 
121
  with torch.no_grad():
122
  pa = F.softmax(audio_models['A'](img_t), dim=1)
123
  pb = F.softmax(audio_models['B'](img_t), dim=1)
124
  ic = audio_models['ast_ext'](w, sampling_rate=16000, return_tensors="pt").to(DEVICE)
125
  pc = F.softmax(audio_models['C'](**ic).logits, dim=1)
126
  audio_probs = (pa * 0.3468 + pb * 0.2762 + pc * 0.3770).cpu().numpy()[0]
 
127
  clip.close()
128
  t_audio = time.time() - t_0
129
 
130
- # --- B. VISION (Processor FRESH à chaque appel) ---
131
- t_1 = time.time()
132
-
133
- vlm_proc = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256VLM2-Video-Instruct")
134
-
135
- vlm_prompt = (
136
- "You are a feline behavior expert. "
137
- "Analyze precisely: number and position of ears, state of mouth (open/closed/tense), tail position and movement, and overall body posture. "
138
- "Do not interpret mood. Only describe observable features."
139
- )
140
-
141
- messages = [{"role": "user", "content": [{"type": "video", "path": video_path}, {"type": "text", "text": vlm_prompt}]}]
142
-
143
- # Tokenize avec retour des inputs
144
- vlm_inputs = vlm_proc.apply_chat_template(
145
- messages,
146
- add_generation_prompt=True,
147
- tokenize=True,
148
- return_dict=True,
149
- return_tensors="pt"
150
- ).to(DEVICE)
151
-
152
- input_length = vlm_inputs["input_ids"].shape[1] # 🔑 nombre de tokens du prompt
153
-
154
- with torch.no_grad():
155
- vlm_out = vlm_model.generate(
156
- **vlm_inputs,
157
- max_new_tokens=80,
158
- do_sample=True,
159
- temperature=0.7,
160
- top_p=0.9
161
- )
162
-
163
- # 🔑 DÉCODAGE SÉCURISÉ : uniquement les nouveaux tokens
164
- gen_tokens = vlm_out[0][input_length:]
165
- vlm_clean = vlm_proc.batch_decode(gen_tokens.unsqueeze(0), skip_special_tokens=True)[0]
166
-
167
- # Nettoyage final : une seule phrase, sans "Assistant:"
168
- vlm_clean = vlm_clean.strip().split('\n')[0]
169
- if vlm_clean.lower().startswith("assistant:"):
170
- vlm_clean = vlm_clean.split(":", 1)[-1].strip()
171
-
172
- t_vlm = time.time() - t_1
 
 
 
 
 
 
173
 
174
  # --- C. JUGE ---
175
  t_2 = time.time()
@@ -181,39 +194,48 @@ t_vlm = time.time() - t_1
181
  # --- D. VISUELS ---
182
  top5 = np.argsort(audio_probs)[-5:][::-1]
183
  fig = px.bar(
184
- x=[audio_probs[i]*100 for i in top5],
185
- y=[CATEGORIES[i].upper() for i in top5],
186
- orientation='h',
187
- title='Scores Audio'
 
 
 
188
  )
 
189
 
190
  # --- E. RAPPORT ---
191
  t_total = time.time() - start_total
192
  report = f"""⚖️ VERDICT JUGE : {judge_decision}
193
- ------------------------------------------
194
- 👁️ VISION : {vlm_clean}
195
- 📊 AUDIO : {audio_ctx}
196
- ⏱️ TEMPS : Audio {t_audio:.2f}s | Vision {t_vlm:.2f}s | Total {t_total:.2f}s"""
197
 
198
- if os.path.exists(tmp_audio):
199
- os.remove(tmp_audio)
 
 
 
200
  return report, fig
201
-
202
  except Exception as e:
203
- if os.path.exists(tmp_audio):
204
- os.remove(tmp_audio)
205
  return f"❌ Erreur : {str(e)}", None
 
 
 
 
 
 
 
206
 
207
  # --- Interface Gradio ---
208
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
209
  gr.Markdown("# 🐱 CatSense v12.13 - Vision Pure Mode")
210
  with gr.Row():
211
  with gr.Column():
212
- video_input = gr.Video()
213
- btn = gr.Button("🚀 ANALYSER", variant="primary")
214
  with gr.Column():
215
- report_out = gr.Textbox(label="Résultat", lines=12)
216
- chart_out = gr.Plot()
 
217
  btn.click(analyze_cat_v12_final, inputs=video_input, outputs=[report_out, chart_out])
218
 
219
  demo.launch()
 
11
  import plotly.express as px
12
  from huggingface_hub import hf_hub_download
13
  from transformers import (
14
+ AutoProcessor,
15
+ AutoModelForImageTextToText,
16
+ ASTFeatureExtractor,
17
  ASTForAudioClassification,
18
  AutoModelForCausalLM,
19
  AutoTokenizer
20
  )
21
+ from moviepy.editor import VideoFileClip # Correction : import correct
22
 
23
  # --- Configuration ---
24
  CATEGORIES = ['affection', 'angry', 'back_off', 'defensive', 'feed_me', 'happy', 'hunt', 'in_heat', 'mother_call', 'pain', 'wants_attention']
 
29
  # ==========================================
30
  def load_models():
31
  print("📥 Initialisation CatSense v12.13 (Vision Pure Mode)...")
32
+
33
+ # Modèle VLM (seulement le modèle, pas le processor)
34
+ vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
35
  vlm_model = AutoModelForImageTextToText.from_pretrained(
36
  vlm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
37
  ).to(DEVICE).eval()
 
45
 
46
  # Audio models
47
  audio_models = {}
48
+ for p, repo, f in [('A', 'ericjedha/pilier_a', 'best_pillar_a_e29_f1_0_9005.pth'),
49
+ ('B', 'ericjedha/pilier_b', 'best_pillar_b_f1_09103.pth')]:
50
  path = hf_hub_download(repo_id=repo, filename=f)
51
  m = timm.create_model("vit_small_patch16_224", num_classes=len(CATEGORIES), in_chans=3)
52
  m.load_state_dict(torch.load(path, map_location=DEVICE)['model_state_dict'])
53
  audio_models[p] = m.to(DEVICE).eval()
54
+
55
  path_c = hf_hub_download(repo_id="ericjedha/pilier_c", filename="best_pillar_c_ast_v95_2_f1_0_9109.pth")
56
  model_c = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593", num_labels=len(CATEGORIES), ignore_mismatched_sizes=True)
57
  sd = torch.load(path_c, map_location=DEVICE)['model_state_dict']
58
  model_c.load_state_dict({k.replace('ast.', ''): v for k, v in sd.items()}, strict=False)
59
  audio_models['C'] = model_c.to(DEVICE).eval()
60
  audio_models['ast_ext'] = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
61
+
62
  return vlm_model, llm_tok, llm_model, audio_models
63
 
64
+ # Chargement global des modèles lourds
65
  vlm_model, llm_tok, llm_model, audio_models = load_models()
66
 
67
  # ==========================================
 
77
  inputs = llm_tok(prompt_text, return_tensors="pt").to(DEVICE)
78
  with torch.no_grad():
79
  outputs = llm_model.generate(
80
+ **inputs,
81
+ max_new_tokens=25,
82
  do_sample=True,
83
  temperature=0.4,
84
  top_p=0.9,
85
  pad_token_id=llm_tok.eos_token_id
86
  )
87
  res = llm_tok.decode(outputs[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
 
88
  res = res.strip().split('\n')[0].split('.')[0].strip()
89
  if not res.startswith("The cat"):
90
  res = "The cat " + res.lower()
91
  return res
92
 
93
  # ==========================================
94
+ # 3. PIPELINE ANALYSE
95
  # ==========================================
96
  @spaces.GPU(duration=120)
97
  def analyze_cat_v12_final(video_path):
98
+ if not video_path:
99
  return "❌ Aucune vidéo.", None
 
 
100
 
101
+ if torch.cuda.is_available():
102
+ torch.cuda.empty_cache()
103
+
104
+ tmp_audio = f"temp_{os.getpid()}_{int(time.time())}.wav"
105
  start_total = time.time()
106
+
107
  try:
108
  # --- A. AUDIO ---
109
  t_0 = time.time()
110
  clip = VideoFileClip(video_path)
111
  audio_probs = np.zeros(len(CATEGORIES))
112
+
113
  if clip.audio:
114
+ clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None, verbose=False)
115
  w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
116
+ if len(w) < 48000:
117
  w = np.pad(w, (0, 48000 - len(w)))
118
+
119
  mel = librosa.feature.melspectrogram(y=w, sr=16000, n_mels=192)
120
  mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
121
+ img = cv2.resize(
122
+ (np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))]) * 255).astype(np.uint8),
123
+ (224, 224)
124
+ )
125
  img_t = torch.tensor(img).unsqueeze(0).repeat(1, 3, 1, 1).float().to(DEVICE) / 255.0
126
+
127
  with torch.no_grad():
128
  pa = F.softmax(audio_models['A'](img_t), dim=1)
129
  pb = F.softmax(audio_models['B'](img_t), dim=1)
130
  ic = audio_models['ast_ext'](w, sampling_rate=16000, return_tensors="pt").to(DEVICE)
131
  pc = F.softmax(audio_models['C'](**ic).logits, dim=1)
132
  audio_probs = (pa * 0.3468 + pb * 0.2762 + pc * 0.3770).cpu().numpy()[0]
133
+
134
  clip.close()
135
  t_audio = time.time() - t_0
136
 
137
+ # --- B. VISION (Processor chargé à chaque appel pour éviter les fuites mémoire) ---
138
+ t_1 = time.time()
139
+ vlm_proc = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct")
140
+
141
+ vlm_prompt = (
142
+ "You are a feline behavior expert. "
143
+ "Analyze precisely: number and position of ears, state of mouth (open/closed/tense), tail position and movement, and overall body posture. "
144
+ "Do not interpret mood. Only describe observable features."
145
+ )
146
+
147
+ messages = [
148
+ {
149
+ "role": "user",
150
+ "content": [
151
+ {"type": "video", "path": video_path},
152
+ {"type": "text", "text": vlm_prompt}
153
+ ]
154
+ }
155
+ ]
156
+
157
+ vlm_inputs = vlm_proc.apply_chat_template(
158
+ messages,
159
+ add_generation_prompt=True,
160
+ tokenize=True,
161
+ return_dict=True,
162
+ return_tensors="pt"
163
+ ).to(DEVICE)
164
+
165
+ input_length = vlm_inputs["input_ids"].shape[1]
166
+
167
+ with torch.no_grad():
168
+ vlm_out = vlm_model.generate(
169
+ **vlm_inputs,
170
+ max_new_tokens=80,
171
+ do_sample=True,
172
+ temperature=0.7,
173
+ top_p=0.9
174
+ )
175
+
176
+ gen_tokens = vlm_out[0][input_length:]
177
+ vlm_clean = vlm_proc.batch_decode([gen_tokens], skip_special_tokens=True)[0]
178
+ vlm_clean = vlm_clean.strip().split('\n')[0]
179
+ if vlm_clean.lower().startswith("assistant:"):
180
+ vlm_clean = vlm_clean.split(":", 1)[-1].strip()
181
+
182
+ if torch.cuda.is_available():
183
+ torch.cuda.empty_cache()
184
+
185
+ t_vlm = time.time() - t_1
186
 
187
  # --- C. JUGE ---
188
  t_2 = time.time()
 
194
  # --- D. VISUELS ---
195
  top5 = np.argsort(audio_probs)[-5:][::-1]
196
  fig = px.bar(
197
+ x=[audio_probs[i]*100 for i in top5],
198
+ y=[CATEGORIES[i].upper() for i in top5],
199
+ orientation='h',
200
+ title='Top 5 Scores Audio',
201
+ labels={'x': 'Probabilité (%)', 'y': 'Émotion'},
202
+ color=[audio_probs[i]*100 for i in top5],
203
+ color_continuous_scale='Viridis'
204
  )
205
+ fig.update_layout(height=400, showlegend=False)
206
 
207
  # --- E. RAPPORT ---
208
  t_total = time.time() - start_total
209
  report = f"""⚖️ VERDICT JUGE : {judge_decision}
 
 
 
 
210
 
211
+ ------------------------------------------
212
+ 👁️ VISION : {vlm_clean}
213
+ 📊 AUDIO : {audio_ctx}
214
+ ⏱️ TEMPS : Audio {t_audio:.2f}s | Vision {t_vlm:.2f}s | Juge {t_llm:.2f}s | Total {t_total:.2f}s"""
215
+
216
  return report, fig
217
+
218
  except Exception as e:
 
 
219
  return f"❌ Erreur : {str(e)}", None
220
+
221
+ finally:
222
+ if os.path.exists(tmp_audio):
223
+ try:
224
+ os.remove(tmp_audio)
225
+ except:
226
+ pass
227
 
228
  # --- Interface Gradio ---
229
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
230
  gr.Markdown("# 🐱 CatSense v12.13 - Vision Pure Mode")
231
  with gr.Row():
232
  with gr.Column():
233
+ video_input = gr.Video(label="Vidéo du chat")
234
+ btn = gr.Button("🚀 ANALYSER", variant="primary", size="lg")
235
  with gr.Column():
236
+ report_out = gr.Textbox(label="Résultat complet", lines=12, interactive=False)
237
+ chart_out = gr.Plot(label="Distribution des émotions (Audio)")
238
+
239
  btn.click(analyze_cat_v12_final, inputs=video_input, outputs=[report_out, chart_out])
240
 
241
  demo.launch()