Spaces:

ericjedha
/

crazycat

Sleeping

App Files Files Community

ericjedha committed on Jan 4

Commit

2e0024f

verified ·

1 Parent(s): 9e37bc0

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -56

app.py CHANGED Viewed

@@ -28,37 +28,48 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # 1. CHARGEMENT DES MODÈLES
 # ==========================================
 def load_models():
-    print("📥 Initialisation CatSense v12.13 (Vision Pure Mode)...")
-    # Modèle VLM
     vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
-    vlm_model = AutoModelForImageTextToText.from_pretrained(
-        vlm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
-    ).to(DEVICE).eval()
-    # LLM Juge
     llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
     llm_tok = AutoTokenizer.from_pretrained(llm_id)
-    llm_model = AutoModelForCausalLM.from_pretrained(
-        llm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
-    ).to(DEVICE).eval()
-    # Audio models
     audio_models = {}
-    for p, repo, f in [('A', 'ericjedha/pilier_a', 'best_pillar_a_e29_f1_0_9005.pth'),
-                       ('B', 'ericjedha/pilier_b', 'best_pillar_b_f1_09103.pth')]:
-        path = hf_hub_download(repo_id=repo, filename=f)
-        m = timm.create_model("vit_small_patch16_224", num_classes=len(CATEGORIES), in_chans=3)
-        m.load_state_dict(torch.load(path, map_location=DEVICE)['model_state_dict'])
-        audio_models[p] = m.to(DEVICE).eval()
-    path_c = hf_hub_download(repo_id="ericjedha/pilier_c", filename="best_pillar_c_ast_v95_2_f1_0_9109.pth")
-    model_c = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593", num_labels=len(CATEGORIES), ignore_mismatched_sizes=True)
-    sd = torch.load(path_c, map_location=DEVICE)['model_state_dict']
-    model_c.load_state_dict({k.replace('ast.', ''): v for k, v in sd.items()}, strict=False)
-    audio_models['C'] = model_c.to(DEVICE).eval()
-    audio_models['ast_ext'] = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
     return vlm_model, llm_tok, llm_model, audio_models
 # Chargement global
@@ -243,44 +254,50 @@ def analyze_cat_v12_final(video_path):
     try:
         # =========================
-        # A. AUDIO
         # =========================
         t_0 = time.time()
         clip = VideoFileClip(video_path)
         audio_probs = np.zeros(len(CATEGORIES))
         if clip.audio:
             clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
             w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
-            if len(w) < 48000:
-                w = np.pad(w, (0, 48000 - len(w)))
-            mel = librosa.feature.melspectrogram(y=w, sr=16000, n_mels=192)
-            mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
-            img = cv2.resize(
-                (np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))]) * 255).astype(np.uint8),
-                (224, 224)
-            )
-            img_t = (
-                torch.tensor(img)
-                .unsqueeze(0)
-                .repeat(1, 3, 1, 1)
-                .float()
-                .to(DEVICE) / 255.0
-            )
             with torch.no_grad():
-                pa = F.softmax(audio_models['A'](img_t), dim=1)
-                pb = F.softmax(audio_models['B'](img_t), dim=1)
-                ic = audio_models['ast_ext'](
-                    w, sampling_rate=16000, return_tensors="pt"
-                ).to(DEVICE)
-                pc = F.softmax(audio_models['C'](**ic).logits, dim=1)
-                audio_probs = (
-                    pa * 0.3468 + pb * 0.2762 + pc * 0.3770
-                ).cpu().numpy()[0]
         clip.close()
         t_audio = time.time() - t_0

 # 1. CHARGEMENT DES MODÈLES
 # ==========================================
 def load_models():
     """Load the vision, language and audio models used by the app.

     Loads the SmolVLM2 and SmolLM2 models, then downloads three audio
     classifier checkpoints from the Hugging Face Hub and loads them into
     matching timm architectures. Two of the audio models have their first
     convolution replaced so that they accept 4-channel input.

     Returns:
         A ``(vlm_model, llm_tok, llm_model, audio_models)`` tuple, where
         ``audio_models`` maps ``'gen'``, ``'spec_eff'`` and ``'spec_mob'``
         to the corresponding classifier, in eval mode on ``DEVICE``.
     """
     print("📥 Initialisation CatSense v12.13 (Triumvirat Audio 0.95 Mode)...")

     num_classes = len(CATEGORIES)
     # bfloat16 when CUDA is available, float32 otherwise.
     dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

     def _load_weights(model, path, strict=True):
         """Load the state dict stored at *path* into *model*; return it in eval mode."""
         # NOTE: torch.load unpickles the file. The checkpoints come from the
         # Hub, so restrict unpickling to plain tensors/containers.
         state = torch.load(path, map_location=DEVICE, weights_only=True)
         result = model.load_state_dict(state, strict=strict)
         # With strict=False mismatched key names are not an error; report
         # them instead of silently running a partially initialised model.
         if result.missing_keys or result.unexpected_keys:
             print(
                 f"⚠️ Partial checkpoint load for {path}: "
                 f"missing={list(result.missing_keys)} "
                 f"unexpected={list(result.unexpected_keys)}"
             )
         return model.eval()

     # --- VLM & LLM (unchanged by the audio rework) ---
     vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
     vlm_model = AutoModelForImageTextToText.from_pretrained(
         vlm_id, torch_dtype=dtype
     ).to(DEVICE).eval()

     llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
     llm_tok = AutoTokenizer.from_pretrained(llm_id)
     llm_model = AutoModelForCausalLM.from_pretrained(
         llm_id, torch_dtype=dtype
     ).to(DEVICE).eval()

     # --- Audio ensemble ("triumvirat") ---
     audio_models = {}

     # 1. Generic model (EfficientFormerV2), default 3-channel input.
     path_gen = hf_hub_download(
         repo_id="ericjedha/best_generique_model_eff",
         filename="best_silent_model.pth",
     )
     m_gen = timm.create_model("efficientformerv2_s0", num_classes=num_classes).to(DEVICE)
     audio_models['gen'] = _load_weights(m_gen, path_gen)

     # 2. EfficientFormer specialist, 4-channel input.
     path_spec_eff = hf_hub_download(
         repo_id="ericjedha/best_specialist_modle_eff",
         filename="specialist_clean.pth",
     )
     m_spec_eff = timm.create_model("efficientformerv2_s0", num_classes=num_classes).to(DEVICE)
     # Swap the first Conv2d found in the module tree for a 4-channel one.
     # Kernel 3 / stride 2 / padding 1 / bias=True are fixed here rather than
     # copied from the replaced layer; presumably this mirrors the patch used
     # at training time -- TODO confirm.
     stem_name = next(
         (
             name
             for name, module in m_spec_eff.named_modules()
             if isinstance(module, torch.nn.Conv2d)
         ),
         None,
     )
     if stem_name is not None:
         *parent_path, attr = stem_name.split('.')
         parent = m_spec_eff
         for part in parent_path:
             parent = getattr(parent, part)
         old_stem = getattr(parent, attr)
         new_conv = torch.nn.Conv2d(4, old_stem.out_channels, 3, 2, 1, bias=True).to(DEVICE)
         setattr(parent, attr, new_conv)
     audio_models['spec_eff'] = _load_weights(m_spec_eff, path_spec_eff, strict=False)

     # 3. MobileNet specialist, 4-channel input.
     path_spec_mob = hf_hub_download(
         repo_id="ericjedha/best_mobilenet_spec_4ch",
         filename="best_mobilenet_spec_4ch.pth",
     )
     m_spec_mob = timm.create_model("mobilenetv3_small_100", num_classes=num_classes).to(DEVICE)
     # Replace the stem convolution with a 4-channel one of the same width.
     old_conv = m_spec_mob.conv_stem
     m_spec_mob.conv_stem = torch.nn.Conv2d(
         4, old_conv.out_channels, 3, stride=2, padding=1, bias=False
     ).to(DEVICE)
     audio_models['spec_mob'] = _load_weights(m_spec_mob, path_spec_mob)

     return vlm_model, llm_tok, llm_model, audio_models
 # Chargement global
     try:
         # =========================
+        # A. AUDIO (TRIUMVIRAT OPTIMISÉ)
         # =========================
         t_0 = time.time()
         clip = VideoFileClip(video_path)
         audio_probs = np.zeros(len(CATEGORIES))
         if clip.audio:
             clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
             w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
+            w = np.pad(w, (0, max(0, 80000 - len(w))))[:80000]
+            # --- Extraction Features ---
+            # 1. Mel & Rythme
+            mel_raw = librosa.power_to_db(librosa.feature.melspectrogram(y=w, sr=16000, n_mels=128), ref=np.max)
+            mel = cv2.resize((mel_raw + 40) / 40, (224, 224))
+            oenv = librosa.onset.onset_strength(y=w, sr=16000)
+            rhy = cv2.resize(librosa.feature.tempogram(onset_envelope=oenv, sr=16000), (224, 224))
+            rhy = (rhy - rhy.min()) / (rhy.max() - rhy.min() + 1e-6)
+            # 2. Timbre (ZCR, Centroid)
+            zcr = cv2.resize(librosa.feature.zero_crossing_rate(w), (224, 224))
+            cent = cv2.resize(librosa.feature.spectral_centroid(y=w, sr=16000), (224, 224))
+            d_cent = librosa.feature.delta(cent)
+            def norm(x): return (x - x.min()) / (x.max() - x.min() + 1e-6)
+            # --- Préparation des Tenseurs ---
+            x3 = torch.from_numpy(np.stack([mel, rhy, mel * rhy], 0)).float().unsqueeze(0).to(DEVICE)
+            x4 = torch.from_numpy(np.stack([mel, norm(zcr), norm(cent), norm(d_cent)], 0)).float().unsqueeze(0).to(DEVICE)
+            # --- Inférence ---
             with torch.no_grad():
+                # Générique (EfficientFormer 3ch)
+                p1 = F.softmax(audio_models['gen'](x3), dim=1).cpu().numpy()
+                # Spécialiste Transformer (4ch)
+                p2 = F.softmax(audio_models['spec_eff'](x4), dim=1).cpu().numpy()
+                # Spécialiste MobileNet (4ch)
+                p3 = F.softmax(audio_models['spec_mob'](x4), dim=1).cpu().numpy()
+            # --- Moyenne Géométrique Gagnante ---
+            # alpha=0.20 (Gen), beta=0.55 (Spec_Eff), gamma=0.25 (Spec_Mob)
+            eps = 1e-7
+            log_probs = (0.20 * np.log(p1 + eps)) + (0.55 * np.log(p2 + eps)) + (0.25 * np.log(p3 + eps))
+            audio_probs = np.exp(log_probs)[0]
+            audio_probs /= audio_probs.sum()
         clip.close()
         t_audio = time.time() - t_0