ericjedha committed on
Commit
2e0024f
·
verified ·
1 Parent(s): 9e37bc0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -56
app.py CHANGED
@@ -28,37 +28,48 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
28
  # 1. CHARGEMENT DES MODÈLES
29
  # ==========================================
30
  def load_models():
31
- print("📥 Initialisation CatSense v12.13 (Vision Pure Mode)...")
32
-
33
- # Modèle VLM
34
  vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
35
- vlm_model = AutoModelForImageTextToText.from_pretrained(
36
- vlm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
37
- ).to(DEVICE).eval()
38
-
39
- # LLM Juge
40
  llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
41
  llm_tok = AutoTokenizer.from_pretrained(llm_id)
42
- llm_model = AutoModelForCausalLM.from_pretrained(
43
- llm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
44
- ).to(DEVICE).eval()
45
 
46
- # Audio models
47
  audio_models = {}
48
- for p, repo, f in [('A', 'ericjedha/pilier_a', 'best_pillar_a_e29_f1_0_9005.pth'),
49
- ('B', 'ericjedha/pilier_b', 'best_pillar_b_f1_09103.pth')]:
50
- path = hf_hub_download(repo_id=repo, filename=f)
51
- m = timm.create_model("vit_small_patch16_224", num_classes=len(CATEGORIES), in_chans=3)
52
- m.load_state_dict(torch.load(path, map_location=DEVICE)['model_state_dict'])
53
- audio_models[p] = m.to(DEVICE).eval()
54
-
55
- path_c = hf_hub_download(repo_id="ericjedha/pilier_c", filename="best_pillar_c_ast_v95_2_f1_0_9109.pth")
56
- model_c = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593", num_labels=len(CATEGORIES), ignore_mismatched_sizes=True)
57
- sd = torch.load(path_c, map_location=DEVICE)['model_state_dict']
58
- model_c.load_state_dict({k.replace('ast.', ''): v for k, v in sd.items()}, strict=False)
59
- audio_models['C'] = model_c.to(DEVICE).eval()
60
- audio_models['ast_ext'] = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
61
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  return vlm_model, llm_tok, llm_model, audio_models
63
 
64
  # Chargement global
@@ -243,44 +254,50 @@ def analyze_cat_v12_final(video_path):
243
 
244
  try:
245
  # =========================
246
- # A. AUDIO
247
  # =========================
248
  t_0 = time.time()
249
  clip = VideoFileClip(video_path)
250
  audio_probs = np.zeros(len(CATEGORIES))
251
-
252
  if clip.audio:
253
  clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
254
  w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
255
- if len(w) < 48000:
256
- w = np.pad(w, (0, 48000 - len(w)))
257
-
258
- mel = librosa.feature.melspectrogram(y=w, sr=16000, n_mels=192)
259
- mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
260
- img = cv2.resize(
261
- (np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))]) * 255).astype(np.uint8),
262
- (224, 224)
263
- )
264
-
265
- img_t = (
266
- torch.tensor(img)
267
- .unsqueeze(0)
268
- .repeat(1, 3, 1, 1)
269
- .float()
270
- .to(DEVICE) / 255.0
271
- )
272
-
 
 
 
273
  with torch.no_grad():
274
- pa = F.softmax(audio_models['A'](img_t), dim=1)
275
- pb = F.softmax(audio_models['B'](img_t), dim=1)
276
- ic = audio_models['ast_ext'](
277
- w, sampling_rate=16000, return_tensors="pt"
278
- ).to(DEVICE)
279
- pc = F.softmax(audio_models['C'](**ic).logits, dim=1)
280
-
281
- audio_probs = (
282
- pa * 0.3468 + pb * 0.2762 + pc * 0.3770
283
- ).cpu().numpy()[0]
 
 
 
284
 
285
  clip.close()
286
  t_audio = time.time() - t_0
 
28
  # 1. CHARGEMENT DES MODÈLES
29
  # ==========================================
30
  def load_models():
31
+ print("📥 Initialisation CatSense v12.13 (Triumvirat Audio 0.95 Mode)...")
32
+
33
+ # --- VLM & LLM (Inchangés) ---
34
  vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
35
+ vlm_model = AutoModelForImageTextToText.from_pretrained(vlm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32).to(DEVICE).eval()
36
+
 
 
 
37
  llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
38
  llm_tok = AutoTokenizer.from_pretrained(llm_id)
39
+ llm_model = AutoModelForCausalLM.from_pretrained(llm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32).to(DEVICE).eval()
 
 
40
 
41
+ # --- TRIUMVIRAT AUDIO ---
42
  audio_models = {}
43
+
44
+ # 1. Modèle Générique (EfficientFormerV2) - Flux 3ch
45
+ path_gen = hf_hub_download(repo_id="ericjedha/best_generique_model_eff", filename="best_silent_model.pth")
46
+ m_gen = timm.create_model("efficientformerv2_s0", num_classes=len(CATEGORIES)).to(DEVICE)
47
+ m_gen.load_state_dict(torch.load(path_gen, map_location=DEVICE))
48
+ audio_models['gen'] = m_gen.eval()
49
+
50
+ # 2. Spécialiste EfficientFormer - Flux 4ch
51
+ path_spec_eff = hf_hub_download(repo_id="ericjedha/best_specialist_modle_eff", filename="specialist_clean.pth")
52
+ m_spec_eff = timm.create_model("efficientformerv2_s0", num_classes=len(CATEGORIES)).to(DEVICE)
53
+ # Patch 4 canaux pour EfficientFormer
54
+ for name, module in m_spec_eff.named_modules():
55
+ if isinstance(module, torch.nn.Conv2d):
56
+ new_conv = torch.nn.Conv2d(4, module.out_channels, 3, 2, 1, bias=True).to(DEVICE)
57
+ parts = name.split('.'); parent = m_spec_eff
58
+ for part in parts[:-1]: parent = getattr(parent, part)
59
+ setattr(parent, parts[-1], new_conv)
60
+ break
61
+ m_spec_eff.load_state_dict(torch.load(path_spec_eff, map_location=DEVICE), strict=False)
62
+ audio_models['spec_eff'] = m_spec_eff.eval()
63
+
64
+ # 3. Spécialiste MobileNet - Flux 4ch
65
+ path_spec_mob = hf_hub_download(repo_id="ericjedha/best_mobilenet_spec_4ch", filename="best_mobilenet_spec_4ch.pth")
66
+ m_spec_mob = timm.create_model("mobilenetv3_small_100", num_classes=len(CATEGORIES)).to(DEVICE)
67
+ # Patch 4 canaux pour MobileNet
68
+ old_conv = m_spec_mob.conv_stem
69
+ m_spec_mob.conv_stem = torch.nn.Conv2d(4, old_conv.out_channels, 3, stride=2, padding=1, bias=False).to(DEVICE)
70
+ m_spec_mob.load_state_dict(torch.load(path_spec_mob, map_location=DEVICE))
71
+ audio_models['spec_mob'] = m_spec_mob.eval()
72
+
73
  return vlm_model, llm_tok, llm_model, audio_models
74
 
75
  # Chargement global
 
254
 
255
  try:
256
  # =========================
257
+ # A. AUDIO (TRIUMVIRAT OPTIMISÉ)
258
  # =========================
259
  t_0 = time.time()
260
  clip = VideoFileClip(video_path)
261
  audio_probs = np.zeros(len(CATEGORIES))
262
+
263
  if clip.audio:
264
  clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
265
  w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
266
+ w = np.pad(w, (0, max(0, 80000 - len(w))))[:80000]
267
+
268
+ # --- Extraction Features ---
269
+ # 1. Mel & Rythme
270
+ mel_raw = librosa.power_to_db(librosa.feature.melspectrogram(y=w, sr=16000, n_mels=128), ref=np.max)
271
+ mel = cv2.resize((mel_raw + 40) / 40, (224, 224))
272
+ oenv = librosa.onset.onset_strength(y=w, sr=16000)
273
+ rhy = cv2.resize(librosa.feature.tempogram(onset_envelope=oenv, sr=16000), (224, 224))
274
+ rhy = (rhy - rhy.min()) / (rhy.max() - rhy.min() + 1e-6)
275
+
276
+ # 2. Timbre (ZCR, Centroid)
277
+ zcr = cv2.resize(librosa.feature.zero_crossing_rate(w), (224, 224))
278
+ cent = cv2.resize(librosa.feature.spectral_centroid(y=w, sr=16000), (224, 224))
279
+ d_cent = librosa.feature.delta(cent)
280
+ def norm(x): return (x - x.min()) / (x.max() - x.min() + 1e-6)
281
+
282
+ # --- Préparation des Tenseurs ---
283
+ x3 = torch.from_numpy(np.stack([mel, rhy, mel * rhy], 0)).float().unsqueeze(0).to(DEVICE)
284
+ x4 = torch.from_numpy(np.stack([mel, norm(zcr), norm(cent), norm(d_cent)], 0)).float().unsqueeze(0).to(DEVICE)
285
+
286
+ # --- Inférence ---
287
  with torch.no_grad():
288
+ # Générique (EfficientFormer 3ch)
289
+ p1 = F.softmax(audio_models['gen'](x3), dim=1).cpu().numpy()
290
+ # Spécialiste Transformer (4ch)
291
+ p2 = F.softmax(audio_models['spec_eff'](x4), dim=1).cpu().numpy()
292
+ # Spécialiste MobileNet (4ch)
293
+ p3 = F.softmax(audio_models['spec_mob'](x4), dim=1).cpu().numpy()
294
+
295
+ # --- Moyenne Géométrique Gagnante ---
296
+ # alpha=0.20 (Gen), beta=0.55 (Spec_Eff), gamma=0.25 (Spec_Mob)
297
+ eps = 1e-7
298
+ log_probs = (0.20 * np.log(p1 + eps)) + (0.55 * np.log(p2 + eps)) + (0.25 * np.log(p3 + eps))
299
+ audio_probs = np.exp(log_probs)[0]
300
+ audio_probs /= audio_probs.sum()
301
 
302
  clip.close()
303
  t_audio = time.time() - t_0