Spaces:

ericjedha
/

crazycat

Sleeping

App Files Files Community

ericjedha committed on Dec 31, 2025

Commit

455bb26

verified ·

1 Parent(s): a7d8fd5

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -49

app.py CHANGED Viewed

@@ -18,25 +18,25 @@ from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer
 )
-from moviepy import VideoFileClip  # ← Changement principal : import direct depuis moviepy (v2.x)
 # --- Configuration ---
 CATEGORIES = ['affection', 'angry', 'back_off', 'defensive', 'feed_me', 'happy', 'hunt', 'in_heat', 'mother_call', 'pain', 'wants_attention']
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # ==========================================
-# 1. CHARGEMENT DES MODÈLES (sans le VLM processor)
 # ==========================================
 def load_models():
     print("📥 Initialisation CatSense v12.13 (Vision Pure Mode)...")
-    # Modèle VLM (seulement le modèle, pas le processor)
     vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
     vlm_model = AutoModelForImageTextToText.from_pretrained(
         vlm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
     ).to(DEVICE).eval()
-    # LLM
     llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
     llm_tok = AutoTokenizer.from_pretrained(llm_id)
     llm_model = AutoModelForCausalLM.from_pretrained(
@@ -61,70 +61,50 @@ def load_models():
     return vlm_model, llm_tok, llm_model, audio_models
-# Chargement global des modèles lourds
 vlm_model, llm_tok, llm_model, audio_models = load_models()
 # ==========================================
-# 2. LOGIQUE DU JUGE (avec stochasticité)
 # ==========================================
 def call_peace_judge(audio_top, vlm_desc):
-    # Utilise le format messages + chat template (recommandé par HF pour SmolLM2-Instruct)
     messages = [
         {
             "role": "system",
-            "content": """You are a cat behavior expert. Use audio result and video description to describe the behavior of the cat: Answer with ONLY one short sentence starting exactly with "The cat is" : use only these words: affectionate, angry, backing off, defensive, hungry, happy, hunting, in heat, calling kittens, in pain, wanting attention, calm.
-No explanation. No extra words."""
         },
         {
             "role": "user",
-            "content": f"Audio analysis (most reliable for vocalizations): {audio_top}\nVisual description (posture and body language): {vlm_desc}"
         }
     ]
-    # Applique le chat template correctement
-    input_text = llm_tok.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True  # Ajoute le token pour commencer la réponse assistant
-    )
     inputs = llm_tok(input_text, return_tensors="pt").to(DEVICE)
     with torch.no_grad():
         outputs = llm_model.generate(
-            **inputs,
-            max_new_tokens=30,      # Un peu plus pour être sûr
-            do_sample=True,
-            temperature=0.3,        # Augmente un peu pour plus de créativité
-            top_p=0.90,
             pad_token_id=llm_tok.eos_token_id,
             eos_token_id=llm_tok.eos_token_id
         )
-    # Décode seulement les nouveaux tokens
     generated = llm_tok.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
-    generated = generated.strip()
-    # Nettoyage final
-    if generated == "":
-        generated = "is displaying neutral behavior."
-    # Force le début si besoin
-    if not generated.lower().startswith("the cat"):
-        generated = "The cat " + generated.lower()
-    # Garde seulement la première phrase
-    generated = generated.split('\n')[0].split('.')[0].strip()
-    if not generated.endswith('.'):
-        generated += "."
-    # Capitalise correctement
-    if generated.startswith("The cat"):
-        generated = "The cat" + generated[7:].capitalize()
-    return generated
 # ==========================================
-# 3. PIPELINE ANALYSE
 # ==========================================
 @spaces.GPU(duration=120)
 def analyze_cat_v12_final(video_path):
@@ -167,7 +147,7 @@ def analyze_cat_v12_final(video_path):
         clip.close()
         t_audio = time.time() - t_0
-        # --- B. VISION (Processor chargé à chaque appel) ---
         t_1 = time.time()
         vlm_proc = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct")
@@ -201,9 +181,10 @@ def analyze_cat_v12_final(video_path):
             vlm_out = vlm_model.generate(
                 **vlm_inputs,
                 max_new_tokens=80,
-                do_sample=True,
-                temperature=0.7,
-                top_p=0.9
             )
         gen_tokens = vlm_out[0][input_length:]
@@ -217,7 +198,7 @@ def analyze_cat_v12_final(video_path):
         t_vlm = time.time() - t_1
-        # --- C. JUGE ---
         t_2 = time.time()
         top_idx = np.argmax(audio_probs)
         audio_ctx = f"{CATEGORIES[top_idx].upper()} ({audio_probs[top_idx]*100:.1f}%)"
@@ -237,7 +218,7 @@ def analyze_cat_v12_final(video_path):
         )
         fig.update_layout(height=400, showlegend=False)
-        # --- E. RAPPORT ---
         t_total = time.time() - start_total
         report = f"""⚖️ VERDICT JUGE : {judge_decision}
@@ -261,6 +242,8 @@ def analyze_cat_v12_final(video_path):
 # --- Interface Gradio ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🐱 CatSense v12.13 - Vision Pure Mode")
     with gr.Row():
         with gr.Column():
             video_input = gr.Video(label="Vidéo du chat")
@@ -271,4 +254,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     btn.click(analyze_cat_v12_final, inputs=video_input, outputs=[report_out, chart_out])
-demo.launch()

     AutoModelForCausalLM,
     AutoTokenizer
 )
+from moviepy import VideoFileClip
 # --- Configuration ---
 CATEGORIES = ['affection', 'angry', 'back_off', 'defensive', 'feed_me', 'happy', 'hunt', 'in_heat', 'mother_call', 'pain', 'wants_attention']
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # ==========================================
+# 1. CHARGEMENT DES MODÈLES
 # ==========================================
 def load_models():
     print("📥 Initialisation CatSense v12.13 (Vision Pure Mode)...")
+    # Modèle VLM
     vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
     vlm_model = AutoModelForImageTextToText.from_pretrained(
         vlm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
     ).to(DEVICE).eval()
+    # LLM Juge
     llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
     llm_tok = AutoTokenizer.from_pretrained(llm_id)
     llm_model = AutoModelForCausalLM.from_pretrained(
     return vlm_model, llm_tok, llm_model, audio_models
+# Chargement global
 vlm_model, llm_tok, llm_model, audio_models = load_models()
 # ==========================================
+# 2. JUGE OPTIMISÉ (nouveau)
 # ==========================================
 def call_peace_judge(audio_top, vlm_desc):
     messages = [
         {
             "role": "system",
+            "content": """You are a cat behavior expert. Match audio prediction with visual description.
+Answer ONLY: "The cat is [ONE WORD]: affectionate/angry/backing_off/defensive/hungry/happy/hunting/in_heat/calling_kittens/in_pain/wanting_attention/calm"
+No explanation. No extra text. Match exactly."""
         },
         {
             "role": "user",
+            "content": f"AUDIO: {audio_top}\nVISION: {vlm_desc}\n\nFINAL JUDGEMENT:"
         }
     ]
+    input_text = llm_tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = llm_tok(input_text, return_tensors="pt").to(DEVICE)
     with torch.no_grad():
         outputs = llm_model.generate(
+            **inputs,
+            max_new_tokens=20,
+            do_sample=False,
+            temperature=0.0,
             pad_token_id=llm_tok.eos_token_id,
             eos_token_id=llm_tok.eos_token_id
         )
     generated = llm_tok.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
+    # Extraction stricte du mot-clé
+    for cat in CATEGORIES + ['calm']:
+        if cat.replace('_', ' ') in generated.lower():
+            return f"The cat is {cat.replace('_', ' ')}."
+    return "The cat is calm."
 # ==========================================
+# 3. PIPELINE ANALYSE COMPLETE
 # ==========================================
 @spaces.GPU(duration=120)
 def analyze_cat_v12_final(video_path):
         clip.close()
         t_audio = time.time() - t_0
+        # --- B. VISION (Ton prompt parfait + params optimisés) ---
         t_1 = time.time()
         vlm_proc = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct")
             vlm_out = vlm_model.generate(
                 **vlm_inputs,
                 max_new_tokens=80,
+                do_sample=False,        # ✅ Greedy decoding
+                temperature=0.0,        # ✅ Zéro créativité
+                top_p=0.9,
+                pad_token_id=vlm_proc.tokenizer.eos_token_id
             )
         gen_tokens = vlm_out[0][input_length:]
         t_vlm = time.time() - t_1
+        # --- C. JUGE OPTIMISÉ ---
         t_2 = time.time()
         top_idx = np.argmax(audio_probs)
         audio_ctx = f"{CATEGORIES[top_idx].upper()} ({audio_probs[top_idx]*100:.1f}%)"
         )
         fig.update_layout(height=400, showlegend=False)
+        # --- E. RAPPORT FINAL ---
         t_total = time.time() - start_total
         report = f"""⚖️ VERDICT JUGE : {judge_decision}
 # --- Interface Gradio ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🐱 CatSense v12.13 - Vision Pure Mode")
+    gr.Markdown("✅ **SmolVLM2-256M** + **SmolLM2-135M Juge** + Audio Ensemble")
     with gr.Row():
         with gr.Column():
             video_input = gr.Video(label="Vidéo du chat")
     btn.click(analyze_cat_v12_final, inputs=video_input, outputs=[report_out, chart_out])
+demo.launch()