Update app.py
Browse files
app.py
CHANGED
|
@@ -28,37 +28,48 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
|
| 28 |
# 1. CHARGEMENT DES MODÈLES
|
| 29 |
# ==========================================
|
| 30 |
def load_models():
|
| 31 |
-
print("📥 Initialisation CatSense v12.13 (
|
| 32 |
-
|
| 33 |
-
#
|
| 34 |
vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
|
| 35 |
-
vlm_model = AutoModelForImageTextToText.from_pretrained(
|
| 36 |
-
|
| 37 |
-
).to(DEVICE).eval()
|
| 38 |
-
|
| 39 |
-
# LLM Juge
|
| 40 |
llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
|
| 41 |
llm_tok = AutoTokenizer.from_pretrained(llm_id)
|
| 42 |
-
llm_model = AutoModelForCausalLM.from_pretrained(
|
| 43 |
-
llm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
|
| 44 |
-
).to(DEVICE).eval()
|
| 45 |
|
| 46 |
-
#
|
| 47 |
audio_models = {}
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
return vlm_model, llm_tok, llm_model, audio_models
|
| 63 |
|
| 64 |
# Chargement global
|
|
@@ -243,44 +254,50 @@ def analyze_cat_v12_final(video_path):
|
|
| 243 |
|
| 244 |
try:
|
| 245 |
# =========================
|
| 246 |
-
# A. AUDIO
|
| 247 |
# =========================
|
| 248 |
t_0 = time.time()
|
| 249 |
clip = VideoFileClip(video_path)
|
| 250 |
audio_probs = np.zeros(len(CATEGORIES))
|
| 251 |
-
|
| 252 |
if clip.audio:
|
| 253 |
clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
|
| 254 |
w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
)
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
|
|
|
|
|
|
|
|
|
| 273 |
with torch.no_grad():
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
|
|
|
|
|
|
|
|
|
| 284 |
|
| 285 |
clip.close()
|
| 286 |
t_audio = time.time() - t_0
|
|
|
|
| 28 |
# 1. CHARGEMENT DES MODÈLES
|
| 29 |
# ==========================================
|
| 30 |
def load_models():
    """Load every inference model used by CatSense and return them.

    Returns:
        tuple: ``(vlm_model, llm_tok, llm_model, audio_models)`` where
            vlm_model    -- SmolVLM2 image/video-to-text model, eval mode on DEVICE
            llm_tok      -- tokenizer for the judge LLM
            llm_model    -- SmolLM2 causal LM ("judge"), eval mode on DEVICE
            audio_models -- dict with keys ``'gen'`` (3-channel generic
                            EfficientFormerV2), ``'spec_eff'`` (4-channel
                            EfficientFormerV2 specialist) and ``'spec_mob'``
                            (4-channel MobileNetV3 specialist)
    """
    print("📥 Initialisation CatSense v12.13 (Triumvirat Audio 0.95 Mode)...")

    # bf16 on GPU, fp32 on CPU — shared by both transformer models below.
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

    # --- VLM & LLM (unchanged) ---
    vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
    vlm_model = AutoModelForImageTextToText.from_pretrained(vlm_id, torch_dtype=dtype).to(DEVICE).eval()

    llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
    llm_tok = AutoTokenizer.from_pretrained(llm_id)
    llm_model = AutoModelForCausalLM.from_pretrained(llm_id, torch_dtype=dtype).to(DEVICE).eval()

    # --- AUDIO TRIUMVIRATE ---
    audio_models = {}

    # 1. Generic model (EfficientFormerV2) - 3-channel stream.
    path_gen = hf_hub_download(repo_id="ericjedha/best_generique_model_eff", filename="best_silent_model.pth")
    m_gen = timm.create_model("efficientformerv2_s0", num_classes=len(CATEGORIES)).to(DEVICE)
    # weights_only=True: the checkpoints are plain state dicts, and this
    # avoids arbitrary code execution via pickle on the downloaded files.
    m_gen.load_state_dict(torch.load(path_gen, map_location=DEVICE, weights_only=True))
    audio_models['gen'] = m_gen.eval()

    # 2. EfficientFormer specialist - 4-channel stream.
    # NOTE: "modle" is the actual repository name on the Hub — do not "fix" it.
    path_spec_eff = hf_hub_download(repo_id="ericjedha/best_specialist_modle_eff", filename="specialist_clean.pth")
    m_spec_eff = timm.create_model("efficientformerv2_s0", num_classes=len(CATEGORIES)).to(DEVICE)
    # Patch the first Conv2d found so the stem accepts 4 input channels.
    for name, module in m_spec_eff.named_modules():
        if isinstance(module, torch.nn.Conv2d):
            new_conv = torch.nn.Conv2d(4, module.out_channels, 3, 2, 1, bias=True).to(DEVICE)
            parent = m_spec_eff
            parts = name.split('.')
            for part in parts[:-1]:
                parent = getattr(parent, part)
            setattr(parent, parts[-1], new_conv)
            break  # only the very first conv (the stem) is replaced
    # strict=False: presumably because the replaced stem conv no longer
    # matches the checkpoint keys — TODO confirm against the training code.
    m_spec_eff.load_state_dict(torch.load(path_spec_eff, map_location=DEVICE, weights_only=True), strict=False)
    audio_models['spec_eff'] = m_spec_eff.eval()

    # 3. MobileNet specialist - 4-channel stream.
    path_spec_mob = hf_hub_download(repo_id="ericjedha/best_mobilenet_spec_4ch", filename="best_mobilenet_spec_4ch.pth")
    m_spec_mob = timm.create_model("mobilenetv3_small_100", num_classes=len(CATEGORIES)).to(DEVICE)
    # MobileNetV3 exposes its stem directly; swap it for a 4-channel version.
    old_conv = m_spec_mob.conv_stem
    m_spec_mob.conv_stem = torch.nn.Conv2d(4, old_conv.out_channels, 3, stride=2, padding=1, bias=False).to(DEVICE)
    m_spec_mob.load_state_dict(torch.load(path_spec_mob, map_location=DEVICE, weights_only=True))
    audio_models['spec_mob'] = m_spec_mob.eval()

    return vlm_model, llm_tok, llm_model, audio_models
|
| 74 |
|
| 75 |
# Chargement global
|
|
|
|
| 254 |
|
| 255 |
try:
    # =========================
    # A. AUDIO (OPTIMIZED TRIUMVIRATE)
    # =========================
    t_0 = time.time()
    clip = VideoFileClip(video_path)
    # Default: uniform-zero distribution, kept if the video has no audio track.
    audio_probs = np.zeros(len(CATEGORIES))

    if clip.audio:
        # Extract mono audio at 16 kHz, then pad/truncate to exactly
        # 5 s (80000 samples) so every downstream feature has a fixed size.
        clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
        w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
        w = np.pad(w, (0, max(0, 80000 - len(w))))[:80000]

        # --- Feature extraction ---
        # 1. Mel spectrogram & rhythm (tempogram), both resized to 224x224.
        mel_raw = librosa.power_to_db(librosa.feature.melspectrogram(y=w, sr=16000, n_mels=128), ref=np.max)
        mel = cv2.resize((mel_raw + 40) / 40, (224, 224))  # maps roughly [-40, 0] dB to [0, 1]
        oenv = librosa.onset.onset_strength(y=w, sr=16000)
        rhy = cv2.resize(librosa.feature.tempogram(onset_envelope=oenv, sr=16000), (224, 224))
        rhy = (rhy - rhy.min()) / (rhy.max() - rhy.min() + 1e-6)  # min-max to [0, 1]

        # 2. Timbre: zero-crossing rate, spectral centroid and its delta.
        # NOTE(review): the delta is taken on the RESIZED centroid image,
        # not the raw feature — presumably matching training; confirm.
        zcr = cv2.resize(librosa.feature.zero_crossing_rate(w), (224, 224))
        cent = cv2.resize(librosa.feature.spectral_centroid(y=w, sr=16000), (224, 224))
        d_cent = librosa.feature.delta(cent)
        # Min-max normalization to [0, 1]; epsilon guards constant inputs.
        def norm(x): return (x - x.min()) / (x.max() - x.min() + 1e-6)

        # --- Tensor preparation ---
        # x3: 3-channel input for the generic model (mel, rhythm, mel*rhythm).
        # x4: 4-channel input shared by both specialists.
        x3 = torch.from_numpy(np.stack([mel, rhy, mel * rhy], 0)).float().unsqueeze(0).to(DEVICE)
        x4 = torch.from_numpy(np.stack([mel, norm(zcr), norm(cent), norm(d_cent)], 0)).float().unsqueeze(0).to(DEVICE)

        # --- Inference ---
        with torch.no_grad():
            # Generic (EfficientFormer, 3ch)
            p1 = F.softmax(audio_models['gen'](x3), dim=1).cpu().numpy()
            # Transformer specialist (4ch)
            p2 = F.softmax(audio_models['spec_eff'](x4), dim=1).cpu().numpy()
            # MobileNet specialist (4ch)
            p3 = F.softmax(audio_models['spec_mob'](x4), dim=1).cpu().numpy()

            # --- Weighted geometric mean of the three distributions ---
            # alpha=0.20 (gen), beta=0.55 (spec_eff), gamma=0.25 (spec_mob)
            eps = 1e-7  # guards log(0)
            log_probs = (0.20 * np.log(p1 + eps)) + (0.55 * np.log(p2 + eps)) + (0.25 * np.log(p3 + eps))
            audio_probs = np.exp(log_probs)[0]
            audio_probs /= audio_probs.sum()  # renormalize to a proper distribution

    clip.close()
    t_audio = time.time() - t_0
|