ericjedha commited on
Commit
4639733
·
verified ·
1 Parent(s): adaf41a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -13
app.py CHANGED
@@ -25,23 +25,25 @@ CATEGORIES = ['affection', 'angry', 'back_off', 'defensive', 'feed_me', 'happy',
25
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
26
 
27
  # ==========================================
28
- # 1. CHARGEMENT DE LA TRINITÉ
29
  # ==========================================
30
  def load_models():
31
- print("📥 Initialisation CatSense v12.12 (No-Assistant Fix)...")
32
 
 
33
  vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
34
- vlm_proc = AutoProcessor.from_pretrained(vlm_id)
35
  vlm_model = AutoModelForImageTextToText.from_pretrained(
36
  vlm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
37
  ).to(DEVICE).eval()
38
 
 
39
  llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
40
  llm_tok = AutoTokenizer.from_pretrained(llm_id)
41
  llm_model = AutoModelForCausalLM.from_pretrained(
42
  llm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
43
  ).to(DEVICE).eval()
44
 
 
45
  audio_models = {}
46
  for p, repo, f in [('A', 'ericjedha/pilier_a', 'best_pillar_a_e29_f1_0_9005.pth'),
47
  ('B', 'ericjedha/pilier_b', 'best_pillar_b_f1_09103.pth')]:
@@ -50,30 +52,38 @@ def load_models():
50
  m.load_state_dict(torch.load(path, map_location=DEVICE)['model_state_dict'])
51
  audio_models[p] = m.to(DEVICE).eval()
52
 
53
- path_c = hf_hub_download(repo_id="ericjedha/pilier_c", filename="best_pillar_c_ast_v95_2_f1_0_9109.pth")
54
  model_c = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593", num_labels=len(CATEGORIES), ignore_mismatched_sizes=True)
55
  sd = torch.load(path_c, map_location=DEVICE)['model_state_dict']
56
  model_c.load_state_dict({k.replace('ast.', ''): v for k, v in sd.items()}, strict=False)
57
  audio_models['C'] = model_c.to(DEVICE).eval()
58
  audio_models['ast_ext'] = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
59
 
60
- return vlm_proc, vlm_model, llm_tok, llm_model, audio_models
61
 
62
- vlm_proc, vlm_model, llm_tok, llm_model, audio_models = load_models()
 
63
 
64
  # ==========================================
65
- # 2. LOGIQUE DU JUGE
66
  # ==========================================
67
  def call_peace_judge(audio_top, vlm_desc):
68
  prompt_text = f"Audio Score: {audio_top}\nVisual Analysis: {vlm_desc}\nVerdict:"
69
  inputs = llm_tok(prompt_text, return_tensors="pt").to(DEVICE)
70
  with torch.no_grad():
71
- outputs = llm_model.generate(**inputs, max_new_tokens=20, temperature=0.01, do_sample=False)
 
 
 
 
 
 
 
72
  res = llm_tok.decode(outputs[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
73
  return res.strip().split('\n')[0]
74
 
75
  # ==========================================
76
- # 3. PIPELINE ANALYSE
77
  # ==========================================
78
  @spaces.GPU(duration=120)
79
  def analyze_cat_v12_final(video_path):
@@ -108,9 +118,12 @@ def analyze_cat_v12_final(video_path):
108
  clip.close()
109
  t_audio = time.time() - t_0
110
 
111
- # --- B. VISION (CORRIGÉ : une seule fois, via apply_chat_template) ---
112
  t_1 = time.time()
113
 
 
 
 
114
  vlm_prompt = (
115
  "Describe the cat in the video\n"
116
  "count ears, mouth, tail and body posture.\n"
@@ -128,11 +141,17 @@ def analyze_cat_v12_final(video_path):
128
  ).to(DEVICE)
129
 
130
  with torch.no_grad():
131
- vlm_out = vlm_model.generate(**vlm_inputs, max_new_tokens=100, do_sample=False)
 
 
 
 
 
 
132
 
133
  vlm_res = vlm_proc.batch_decode(vlm_out, skip_special_tokens=True)[0]
134
 
135
- # Nettoyage robuste de la réponse
136
  if "assistant" in vlm_res.lower():
137
  vlm_clean = vlm_res.split("assistant")[-1].strip()
138
  else:
@@ -175,7 +194,7 @@ def analyze_cat_v12_final(video_path):
175
 
176
  # --- Interface Gradio ---
177
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
178
- gr.Markdown("# 🐱 CatSense v12.12 - Raw Mode")
179
  with gr.Row():
180
  with gr.Column():
181
  video_input = gr.Video()
 
25
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
26
 
27
  # ==========================================
28
+ # 1. CHARGEMENT DES MODÈLES (sans le VLM processor)
29
  # ==========================================
30
  def load_models():
31
+ print("📥 Initialisation CatSense v12.12 (Fresh Processor Fix)...")
32
 
33
+ # On charge SEULEMENT le modèle VLM (lourd), pas le processor
34
  vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
 
35
  vlm_model = AutoModelForImageTextToText.from_pretrained(
36
  vlm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
37
  ).to(DEVICE).eval()
38
 
39
+ # LLM
40
  llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
41
  llm_tok = AutoTokenizer.from_pretrained(llm_id)
42
  llm_model = AutoModelForCausalLM.from_pretrained(
43
  llm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
44
  ).to(DEVICE).eval()
45
 
46
+ # Audio models
47
  audio_models = {}
48
  for p, repo, f in [('A', 'ericjedha/pilier_a', 'best_pillar_a_e29_f1_0_9005.pth'),
49
  ('B', 'ericjedha/pilier_b', 'best_pillar_b_f1_09103.pth')]:
 
52
  m.load_state_dict(torch.load(path, map_location=DEVICE)['model_state_dict'])
53
  audio_models[p] = m.to(DEVICE).eval()
54
 
55
+ path_c = hf_hub_download(repo_id="ericjedha/pilier_c", filename="best_pillar_c_ast_v95_2_f1_0_9109.pth")
56
  model_c = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593", num_labels=len(CATEGORIES), ignore_mismatched_sizes=True)
57
  sd = torch.load(path_c, map_location=DEVICE)['model_state_dict']
58
  model_c.load_state_dict({k.replace('ast.', ''): v for k, v in sd.items()}, strict=False)
59
  audio_models['C'] = model_c.to(DEVICE).eval()
60
  audio_models['ast_ext'] = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
61
 
62
+ return vlm_model, llm_tok, llm_model, audio_models
63
 
64
+ # Chargement global des modèles lourds (pas du processor VLM)
65
+ vlm_model, llm_tok, llm_model, audio_models = load_models()
66
 
67
  # ==========================================
68
+ # 2. LOGIQUE DU JUGE (avec stochasticité)
69
  # ==========================================
70
  def call_peace_judge(audio_top, vlm_desc):
71
  prompt_text = f"Audio Score: {audio_top}\nVisual Analysis: {vlm_desc}\nVerdict:"
72
  inputs = llm_tok(prompt_text, return_tensors="pt").to(DEVICE)
73
  with torch.no_grad():
74
+ outputs = llm_model.generate(
75
+ **inputs,
76
+ max_new_tokens=20,
77
+ do_sample=True,
78
+ temperature=0.4,
79
+ top_p=0.9,
80
+ pad_token_id=llm_tok.eos_token_id
81
+ )
82
  res = llm_tok.decode(outputs[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
83
  return res.strip().split('\n')[0]
84
 
85
  # ==========================================
86
+ # 3. PIPELINE ANALYSE (Processor VLM FRESH à chaque appel)
87
  # ==========================================
88
  @spaces.GPU(duration=120)
89
  def analyze_cat_v12_final(video_path):
 
118
  clip.close()
119
  t_audio = time.time() - t_0
120
 
121
+ # --- B. VISION (Processor FRESH à chaque appel) ---
122
  t_1 = time.time()
123
 
124
+ # 🔑 CORRECTION MAJEURE : on charge le processor ICI
125
+ vlm_proc = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct")
126
+
127
  vlm_prompt = (
128
  "Describe the cat in the video\n"
129
  "count ears, mouth, tail and body posture.\n"
 
141
  ).to(DEVICE)
142
 
143
  with torch.no_grad():
144
+ vlm_out = vlm_model.generate(
145
+ **vlm_inputs,
146
+ max_new_tokens=100,
147
+ do_sample=True, # ✅ Stochastic
148
+ temperature=0.7, # ✅ Variabilité
149
+ top_p=0.9
150
+ )
151
 
152
  vlm_res = vlm_proc.batch_decode(vlm_out, skip_special_tokens=True)[0]
153
 
154
+ # Nettoyage robuste
155
  if "assistant" in vlm_res.lower():
156
  vlm_clean = vlm_res.split("assistant")[-1].strip()
157
  else:
 
194
 
195
  # --- Interface Gradio ---
196
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
197
+ gr.Markdown("# 🐱 CatSense v12.12 - Fresh Processor Mode")
198
  with gr.Row():
199
  with gr.Column():
200
  video_input = gr.Video()