ericjedha committed on
Commit
0642309
·
verified ·
1 Parent(s): a87f75c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +352 -79
app.py CHANGED
@@ -13,6 +13,8 @@ from huggingface_hub import hf_hub_download
13
  from transformers import (
14
  AutoProcessor,
15
  AutoModelForImageTextToText,
 
 
16
  AutoModelForCausalLM,
17
  AutoTokenizer
18
  )
@@ -20,127 +22,398 @@ from moviepy import VideoFileClip
20
 
21
  # --- Configuration ---
22
  CATEGORIES = ['affection', 'angry', 'back_off', 'defensive', 'feed_me', 'happy', 'hunt', 'in_heat', 'mother_call', 'pain', 'wants_attention']
23
- IDX = {c: i for i, c in enumerate(CATEGORIES)}
24
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
25
 
26
  # ==========================================
27
- # 1. CHARGEMENT DES MODÈLES (FOCUS DUO)
28
  # ==========================================
def load_models():
    """Load the VLM, the judge LLM and the two-model audio duo.

    Returns:
        tuple: ``(vlm_model, llm_tok, llm_model, audio_models)``.
        ``audio_models`` maps ``'master'`` and ``'student'`` to timm
        ``efficientformerv2_s0`` models in eval mode.
    """
    print("📥 Chargement du Duo Audio (Master 4ch + Student V3 192-mels)...")

    # --- VLM & LLM ---
    vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
    vlm_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    vlm_model = AutoModelForImageTextToText.from_pretrained(vlm_id, torch_dtype=vlm_dtype)
    vlm_model = vlm_model.to(DEVICE).eval()

    llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
    llm_tok = AutoTokenizer.from_pretrained(llm_id)
    llm_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    llm_model = AutoModelForCausalLM.from_pretrained(llm_id, torch_dtype=llm_dtype)
    llm_model = llm_model.to(DEVICE).eval()

    # --- AUDIO DUO ---
    audio_models = {}
    n_classes = len(CATEGORIES)

    # 1. Master specialist: the first Conv2d found is swapped for a
    #    4-input-channel convolution before the weights are loaded.
    master_path = hf_hub_download(
        repo_id="ericjedha/best_specialist_modle_eff",
        filename="best_specialist_modle_eff.pth",
    )
    master = timm.create_model("efficientformerv2_s0", num_classes=n_classes).to(DEVICE)
    first_conv = next(
        (
            (qualname, layer)
            for qualname, layer in master.named_modules()
            if isinstance(layer, torch.nn.Conv2d)
        ),
        None,
    )
    if first_conv is not None:
        qualname, layer = first_conv
        replacement = torch.nn.Conv2d(4, layer.out_channels, 3, 2, 1, bias=True).to(DEVICE)
        # Walk down to the module that owns the convolution, then rebind it.
        *owner_path, attr = qualname.split('.')
        owner = master
        for step in owner_path:
            owner = getattr(owner, step)
        setattr(owner, attr, replacement)
    master_weights = torch.load(master_path, map_location=DEVICE)
    master.load_state_dict(master_weights, strict=False)
    audio_models['master'] = master.eval()

    # 2. Student V3: stock 3-channel model.
    student_path = hf_hub_download(repo_id="ericjedha/best_student", filename="best_student.pth")
    student = timm.create_model("efficientformerv2_s0", num_classes=n_classes).to(DEVICE)
    checkpoint = torch.load(student_path, map_location=DEVICE)
    # Accept either a full checkpoint dict or a bare state dict.
    if 'model_state_dict' in checkpoint:
        student_weights = checkpoint['model_state_dict']
    else:
        student_weights = checkpoint
    student.load_state_dict(student_weights)
    audio_models['student'] = student.eval()

    return vlm_model, llm_tok, llm_model, audio_models
66
 
67
- # Chargement initial
68
  vlm_model, llm_tok, llm_model, audio_models = load_models()
69
 
70
  # ==========================================
71
- # 2. PIPELINE ANALYSE DUO
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  # ==========================================
@spaces.GPU(duration=120)
def analyze_cat_v13_duo(video_path):
    """Score a cat video with the master/student audio duo.

    Args:
        video_path: Path of the uploaded video; a falsy value short-circuits.

    Returns:
        tuple: ``(report, fig)`` where ``report`` is the text verdict and
        ``fig`` a Plotly bar chart of the five highest audio scores. On
        failure the report carries the error message and ``fig`` is ``None``.
    """
    if not video_path:
        return "❌ Aucune vidéo.", None

    tmp_audio = f"temp_audio_{os.getpid()}.wav"
    # NOTE: start_total / t_audio are not used by the code visible here; they
    # are kept for the vision/judge section this revision left as a placeholder.
    start_total = time.time()
    clip = None

    try:
        # --- AUDIO PHASE ---
        t_0 = time.time()
        clip = VideoFileClip(video_path)
        audio_probs = np.zeros(len(CATEGORIES))

        if clip.audio:
            clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
            w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
            # Pad or truncate to exactly 80000 samples (5 s at 16 kHz).
            w = np.pad(w, (0, max(0, 80000 - len(w))))[:80000]

            def norm(x):
                """Min-max scale *x* into [0, 1]."""
                return (x - x.min()) / (x.max() - x.min() + 1e-6)

            # A. Master input: 128-mel image + three timbre channels (4ch).
            mel_128 = librosa.power_to_db(
                librosa.feature.melspectrogram(y=w, sr=16000, n_mels=128), ref=np.max
            )
            m_img_128 = cv2.resize((mel_128 + 40) / 40, (224, 224))
            zcr = cv2.resize(librosa.feature.zero_crossing_rate(w), (224, 224))
            cent = cv2.resize(librosa.feature.spectral_centroid(y=w, sr=16000), (224, 224))
            d_cent = librosa.feature.delta(cent)
            x4 = torch.from_numpy(
                np.stack([m_img_128, norm(zcr), norm(cent), norm(d_cent)], 0)
            ).float().unsqueeze(0).to(DEVICE)

            # B. Student input: 192-mel image replicated on 3 channels.
            mel_192 = librosa.power_to_db(
                librosa.feature.melspectrogram(y=w, sr=16000, n_mels=192, fmax=8000),
                ref=np.max,
            )
            m_img_192 = cv2.resize((mel_192 + 40) / 40, (224, 224))
            xst = torch.from_numpy(np.stack([m_img_192] * 3, 0)).float().unsqueeze(0).to(DEVICE)

            # C. Inference and 60/40 fusion in log space, then renormalise.
            with torch.no_grad():
                p_m = F.softmax(audio_models['master'](x4), dim=1).cpu().numpy()
                p_s = F.softmax(audio_models['student'](xst), dim=1).cpu().numpy()

            eps = 1e-7
            log_probs = (0.60 * np.log(p_m + eps)) + (0.40 * np.log(p_s + eps))
            audio_probs = np.exp(log_probs)[0]
            audio_probs /= audio_probs.sum()

        clip.close()
        clip = None  # already released; the finally block must not close it again
        t_audio = time.time() - t_0

        # --- VISION & JUDGE PHASE ---
        # Placeholder in this revision: the vision and judge steps are not
        # part of this function body.
        top_idx = np.argmax(audio_probs)
        audio_ctx = f"{CATEGORIES[top_idx].upper()} ({audio_probs[top_idx]*100:.1f}%)"

        # Final report: the top-5 ranking is computed once and reused for both axes.
        top5 = np.argsort(audio_probs)[-5:][::-1]
        fig = px.bar(
            x=[audio_probs[i] * 100 for i in top5],
            y=[CATEGORIES[i].upper() for i in top5],
            orientation='h',
            title="Duo Score 60/40",
        )

        report = f"⚖️ DUO VERDICT : {CATEGORIES[top_idx].upper()}\n📊 Score Audio : {audio_ctx}"
        return report, fig

    except Exception as e:
        # UI boundary: surface the error text instead of crashing the app.
        return f"❌ Erreur : {str(e)}", None

    finally:
        # Release the video handle even when the audio phase raised.
        if clip is not None:
            try:
                clip.close()
            except Exception:
                pass  # best-effort cleanup; the real error was already reported
        if os.path.exists(tmp_audio):
            os.remove(tmp_audio)
 
 
 
 
 
135
 
136
- # --- Gradio UI ---
# --- Gradio UI ---
# NOTE(review): the original indentation is not recoverable from this view;
# the button is assumed to sit at Blocks level, below the row — confirm.
demo = gr.Blocks()
with demo:
    gr.Markdown("# 🐱 CatSense Duo Tester")
    with gr.Row():
        video_input = gr.Video()
        with gr.Column():
            report_out = gr.Textbox(label="Verdict")
            chart_out = gr.Plot()
    analyze_btn = gr.Button("Analyser")
    analyze_btn.click(
        analyze_cat_v13_duo,
        inputs=video_input,
        outputs=[report_out, chart_out],
    )

demo.launch()
 
13
  from transformers import (
14
  AutoProcessor,
15
  AutoModelForImageTextToText,
16
+ ASTFeatureExtractor,
17
+ ASTForAudioClassification,
18
  AutoModelForCausalLM,
19
  AutoTokenizer
20
  )
 
22
 
23
  # --- Configuration ---
24
  CATEGORIES = ['affection', 'angry', 'back_off', 'defensive', 'feed_me', 'happy', 'hunt', 'in_heat', 'mother_call', 'pain', 'wants_attention']
 
25
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
26
 
27
  # ==========================================
28
+ # 1. CHARGEMENT DES MODÈLES
29
  # ==========================================
def load_models():
    """Download and initialise every model used by the app.

    Loads the SmolVLM2 vision-language model, the SmolLM2 judge (tokenizer
    and model) and the three-member audio ensemble. Each model is moved to
    ``DEVICE`` and switched to eval mode.

    Returns:
        tuple: ``(vlm_model, llm_tok, llm_model, audio_models)``.
        ``audio_models`` holds the timm ViT classifiers under ``'A'`` and
        ``'B'``, the AST classifier under ``'C'`` and the AST feature
        extractor under ``'ast_ext'``.
    """
    print("📥 Initialisation CatSense v12.13 (Vision Pure Mode)...")

    # bfloat16 on GPU, float32 on CPU (same choice for the VLM and the LLM).
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

    def _read_state_dict(path):
        """Return the weights stored at *path*.

        Accepts a training checkpoint (weights under ``'model_state_dict'``)
        as well as a bare state dict.
        """
        # SECURITY NOTE: torch.load unpickles the file; only load checkpoints
        # from repositories you trust.
        checkpoint = torch.load(path, map_location=DEVICE)
        if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
            return checkpoint['model_state_dict']
        return checkpoint

    # VLM
    vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
    vlm_model = AutoModelForImageTextToText.from_pretrained(
        vlm_id, torch_dtype=dtype
    ).to(DEVICE).eval()

    # Judge LLM
    llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
    llm_tok = AutoTokenizer.from_pretrained(llm_id)
    llm_model = AutoModelForCausalLM.from_pretrained(
        llm_id, torch_dtype=dtype
    ).to(DEVICE).eval()

    # Audio pillars A and B: ViT-small classifiers fed spectrogram images.
    audio_models = {}
    vit_pillars = [
        ('A', 'ericjedha/pilier_a', 'best_pillar_a_e29_f1_0_9005.pth'),
        ('B', 'ericjedha/pilier_b', 'best_pillar_b_f1_09103.pth'),
    ]
    for pillar, repo, filename in vit_pillars:
        path = hf_hub_download(repo_id=repo, filename=filename)
        model = timm.create_model(
            "vit_small_patch16_224", num_classes=len(CATEGORIES), in_chans=3
        )
        model.load_state_dict(_read_state_dict(path))
        audio_models[pillar] = model.to(DEVICE).eval()

    # Audio pillar C: AST classifier with a re-sized classification head.
    ast_id = "MIT/ast-finetuned-audioset-10-10-0.4593"
    path_c = hf_hub_download(
        repo_id="ericjedha/pilier_c",
        filename="best_pillar_c_ast_v95_2_f1_0_9109.pth",
    )
    model_c = ASTForAudioClassification.from_pretrained(
        ast_id, num_labels=len(CATEGORIES), ignore_mismatched_sizes=True
    )
    # Strip only a leading 'ast.' wrapper prefix; replacing every occurrence
    # could corrupt a key that merely contains that substring.
    prefix = 'ast.'
    weights_c = {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in _read_state_dict(path_c).items()
    }
    load_result = model_c.load_state_dict(weights_c, strict=False)
    # strict=False tolerates mismatches; report them so a checkpoint that did
    # not actually load is visible instead of silently scoring at random.
    if load_result.missing_keys or load_result.unexpected_keys:
        print(
            "⚠️ Pillar C checkpoint mismatch - "
            f"missing keys: {len(load_result.missing_keys)}, "
            f"unexpected keys: {len(load_result.unexpected_keys)}"
        )
    audio_models['C'] = model_c.to(DEVICE).eval()
    audio_models['ast_ext'] = ASTFeatureExtractor.from_pretrained(ast_id)

    return vlm_model, llm_tok, llm_model, audio_models
63
 
64
+ # Chargement global
65
  vlm_model, llm_tok, llm_model, audio_models = load_models()
66
 
67
  # ==========================================
68
+ # 2. JUGE HYBRIDE (règles + LLM)
69
+ # ==========================================
def call_peace_judge(audio_ctx, vlm_desc):
    """Return the final verdict sentence from the audio and vision signals.

    Decision order:
      1. Audio guardrails: an audio label of pain / angry / defensive /
         back-off decides immediately.
      2. Visual overrides for aggression, defensive posture and pain cues.
      3. Positive / low-risk visual cues.
      4. LLM fallback, constrained to a fixed label list.
      5. Failsafe: "The cat is defensive."

    NOTE(review): no confidence threshold is applied to the audio label; the
    guardrails fire on the label text alone, whatever percentage follows it.

    Args:
        audio_ctx: Audio summary, e.g. ``"PAIN (80.0%)"``.
        vlm_desc: Free-text description produced by the vision model.

    Returns:
        str: A sentence of the form ``"The cat is <label>."``.
    """
    vlm_lower = (vlm_desc or "").lower()
    audio_upper = (audio_ctx or "").upper()

    # 1. Hard audio guardrails (checked first, in this order).
    audio_rules = (
        (("PAIN",), "The cat is in pain."),
        (("ANGRY",), "The cat is angry."),
        (("DEFENSIVE",), "The cat is defensive."),
        (("BACK_OFF", "BACKING_OFF"), "The cat is backing off."),
    )
    for keywords, verdict in audio_rules:
        if any(keyword in audio_upper for keyword in keywords):
            return verdict

    # 2. Hard visual overrides, then 3. positive / low-risk visual states.
    #    Order matters: the first rule with a matching cue wins.
    visual_rules = (
        # Aggression / threat display
        (("front paws raised", "paws raised", "swiping",
          "hissing", "mouth open and tense"), "The cat is angry."),
        # Defensive posture
        (("arched back", "puffed fur", "ears flat",
          "ears back", "sideways stance"), "The cat is defensive."),
        # Pain indicators
        (("limping", "hunched", "crouched low",
          "guarding", "withdrawn posture"), "The cat is in pain."),
        (("kneading", "rubbing", "head bumping"), "The cat is affectionate."),
        (("playful", "rolling", "pouncing"), "The cat is happy."),
        (("stalking", "tail twitching", "low crawl"), "The cat is hunting."),
        (("approaching human", "following human",
          "pawing at leg"), "The cat is wanting attention."),
        (("waiting posture", "looking at food",
          "pacing near bowl"), "The cat is hungry."),
    )
    for cues, verdict in visual_rules:
        if any(cue in vlm_lower for cue in cues):
            return verdict

    # 4. LLM fallback ("calm" is deliberately not an allowed answer).
    messages = [
        {
            "role": "system",
            "content": (
                "You are a strict cat behavior decision engine.\n"
                "Rules:\n"
                "1. AUDIO has priority over vision.\n"
                "2. You must choose the most conservative interpretation.\n"
                "3. 'calm' is NOT a valid output.\n"
                "4. If unsure, prefer defensive or in pain.\n\n"
                "Allowed outputs ONLY:\n"
                "affectionate, angry, backing off, defensive, hungry, happy, "
                "hunting, in heat, calling kittens, in pain, wanting attention\n\n"
                "Answer format EXACTLY:\n"
                "The cat is [label]."
            )
        },
        {
            "role": "user",
            "content": (
                f"AUDIO SIGNAL (PRIMARY): {audio_ctx}\n"
                f"VISION OBSERVATIONS (SECONDARY): {vlm_desc}\n\n"
                "FINAL DECISION:"
            )
        }
    ]

    input_text = llm_tok.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = llm_tok(input_text, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        # Greedy decoding: with do_sample=False a temperature has no effect,
        # so none is passed.
        outputs = llm_model.generate(
            **inputs,
            max_new_tokens=15,
            do_sample=False,
            pad_token_id=llm_tok.eos_token_id,
            eos_token_id=llm_tok.eos_token_id
        )

    # Decode only the newly generated tokens, not the prompt.
    generated = llm_tok.decode(
        outputs[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=True
    ).lower()

    # (phrase the prompt allows, category name spelled with spaces), listed in
    # category order. The prompt asks for the phrase form ("backing off",
    # "hungry", ...), so matching the raw category names alone misses valid
    # answers and yields sentences such as "The cat is hunt.".
    label_forms = (
        ("affectionate", "affection"),
        ("angry", "angry"),
        ("backing off", "back off"),
        ("defensive", "defensive"),
        ("hungry", "feed me"),
        ("happy", "happy"),
        ("hunting", "hunt"),
        ("in heat", "in heat"),
        ("calling kittens", "mother call"),
        ("in pain", "pain"),
        ("wanting attention", "wants attention"),
    )
    for phrase, category_form in label_forms:
        if phrase in generated or category_form in generated:
            return f"The cat is {phrase}."

    # 5. Final failsafe (never "calm").
    return "The cat is defensive."
214
+
215
+
216
+ # ==========================================
217
+ # 3. PIPELINE ANALYSE COMPLETE (CORRIGÉ)
218
  # ==========================================
_VLM_PROCESSOR_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
_vlm_processor_cache = {}


def _get_vlm_processor():
    """Return the SmolVLM2 processor, loading it on first use only."""
    if "processor" not in _vlm_processor_cache:
        _vlm_processor_cache["processor"] = AutoProcessor.from_pretrained(_VLM_PROCESSOR_ID)
    return _vlm_processor_cache["processor"]


def _clean_vlm_output(text):
    """Drop repeated sentences (case-insensitive) while keeping their order."""
    kept = []
    seen = set()
    for sentence in text.split(". "):
        stripped = sentence.strip()
        key = stripped.lower()
        if key and key not in seen:
            seen.add(key)
            kept.append(stripped)
    return ". ".join(kept)


@spaces.GPU(duration=120)
def analyze_cat_v12_final(video_path):
    """Run the full audio + vision + judge pipeline on one video.

    Args:
        video_path: Path of the uploaded video; a falsy value short-circuits.

    Returns:
        tuple: ``(report, fig)`` where ``report`` is the multi-line text
        report and ``fig`` a Plotly bar chart of the five highest audio
        scores. On failure the report carries the error message and ``fig``
        is ``None``.
    """
    # Local imports keep this block self-contained.
    import tempfile
    import uuid

    if not video_path:
        return "❌ Aucune vidéo.", None

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # A random suffix avoids collisions between requests that start within
    # the same second in the same process.
    tmp_audio = os.path.join(
        tempfile.gettempdir(), f"temp_{os.getpid()}_{uuid.uuid4().hex}.wav"
    )
    start_total = time.time()
    clip = None

    try:
        # =========================
        # A. AUDIO
        # =========================
        t_0 = time.time()
        clip = VideoFileClip(video_path)
        audio_probs = np.zeros(len(CATEGORIES))
        has_audio = bool(clip.audio)

        if has_audio:
            clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
            w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
            # Pad short clips up to 48000 samples (3 s at 16 kHz).
            if len(w) < 48000:
                w = np.pad(w, (0, 48000 - len(w)))

            mel = librosa.feature.melspectrogram(y=w, sr=16000, n_mels=192)
            mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
            # NOTE(review): values below -40 dB become negative here and wrap
            # in the uint8 cast. Left unchanged on the assumption that the
            # checkpoints were trained with this exact preprocessing — confirm.
            padded = np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))])
            img = cv2.resize((padded * 255).astype(np.uint8), (224, 224))

            # Replicate the single-channel image to (1, 3, 224, 224) in [0, 1].
            img_t = (
                torch.tensor(img)
                .unsqueeze(0)
                .repeat(1, 3, 1, 1)
                .float()
                .to(DEVICE) / 255.0
            )

            with torch.no_grad():
                pa = F.softmax(audio_models['A'](img_t), dim=1)
                pb = F.softmax(audio_models['B'](img_t), dim=1)
                ic = audio_models['ast_ext'](
                    w, sampling_rate=16000, return_tensors="pt"
                ).to(DEVICE)
                pc = F.softmax(audio_models['C'](**ic).logits, dim=1)

            # Weighted average of the three pillars.
            audio_probs = (
                pa * 0.3468 + pb * 0.2762 + pc * 0.3770
            ).cpu().numpy()[0]

        clip.close()
        clip = None  # already released; the finally block must not close it again
        t_audio = time.time() - t_0

        # =========================
        # B. VISION
        # =========================
        t_1 = time.time()
        # Cached: reloading the processor on every request is wasted work.
        vlm_proc = _get_vlm_processor()

        vlm_prompt = (
            "You are a feline behavior expert.\n"
            "Describe ONLY observable physical features:\n"
            "- ears position\n"
            "- mouth state (open/closed/tense)\n"
            "- tail position or movement\n"
            "- body posture\n"
            "Use short factual sentences.\n"
            "One observation per sentence.\n"
            "Do NOT interpret mood."
        )

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "path": video_path},
                    {"type": "text", "text": vlm_prompt}
                ]
            }
        ]

        vlm_inputs = vlm_proc.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt"
        ).to(DEVICE)

        input_length = vlm_inputs["input_ids"].shape[1]

        with torch.no_grad():
            # Greedy decoding: with do_sample=False a temperature has no
            # effect, so none is passed.
            vlm_out = vlm_model.generate(
                **vlm_inputs,
                max_new_tokens=80,
                do_sample=False,
                repetition_penalty=1.15,   # discourages loops
                no_repeat_ngram_size=5,    # blocks repeated phrases
                pad_token_id=vlm_proc.tokenizer.eos_token_id,
                eos_token_id=vlm_proc.tokenizer.eos_token_id
            )

        # Keep only the newly generated tokens, not the prompt.
        gen_tokens = vlm_out[0][input_length:]
        vlm_clean = vlm_proc.batch_decode(
            [gen_tokens], skip_special_tokens=True
        )[0]

        # Only the first line of the answer is kept.
        vlm_clean = vlm_clean.strip().split("\n")[0]
        if vlm_clean.lower().startswith("assistant:"):
            vlm_clean = vlm_clean.split(":", 1)[-1].strip()

        # Final pass: remove repeated sentences.
        vlm_clean = _clean_vlm_output(vlm_clean)

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        t_vlm = time.time() - t_1

        # =========================
        # C. JUDGE
        # =========================
        t_2 = time.time()
        if has_audio:
            top_idx = np.argmax(audio_probs)
            audio_ctx = f"{CATEGORIES[top_idx].upper()} ({audio_probs[top_idx]*100:.1f}%)"
        else:
            # With no audio track the scores are all zero; argmax would
            # otherwise report the first category at 0.0% as the audio signal.
            audio_ctx = "NO AUDIO TRACK"
        judge_decision = call_peace_judge(audio_ctx, vlm_clean)
        t_llm = time.time() - t_2

        # =========================
        # D. CHART
        # =========================
        top5 = np.argsort(audio_probs)[-5:][::-1]
        top5_scores = [audio_probs[i] * 100 for i in top5]
        fig = px.bar(
            x=top5_scores,
            y=[CATEGORIES[i].upper() for i in top5],
            orientation="h",
            title="Top 5 Scores Audio",
            labels={"x": "Probabilité (%)", "y": "Émotion"},
            color=top5_scores,
            color_continuous_scale="Viridis"
        )
        fig.update_layout(height=400, showlegend=False)

        # =========================
        # E. FINAL REPORT
        # =========================
        t_total = time.time() - start_total
        report = "\n".join([
            f"⚖️ VERDICT JUGE : {judge_decision}",
            "------------------------------------------",
            f"👁️ VISION : {vlm_clean}",
            f"📊 AUDIO : {audio_ctx}",
            f"⏱️ TEMPS : Audio {t_audio:.2f}s | Vision {t_vlm:.2f}s | "
            f"Juge {t_llm:.2f}s | Total {t_total:.2f}s",
        ])

        return report, fig

    except Exception as e:
        # UI boundary: surface the error text instead of crashing the app.
        return f"❌ Erreur : {str(e)}", None

    finally:
        # Release the video handle even when an earlier step raised.
        if clip is not None:
            try:
                clip.close()
            except Exception:
                pass  # best-effort cleanup; the real error was already reported
        # Best-effort removal of the temporary WAV file.
        try:
            os.remove(tmp_audio)
        except OSError:
            pass
402
+
403
 
404
+ # --- Interface Gradio ---
# --- Gradio interface ---
demo = gr.Blocks(theme=gr.themes.Soft())
with demo:
    gr.Markdown("# 🐱 CatSense v12.13 - Vision Pure Mode")
    gr.Markdown("✅ **SmolVLM2-256M** + **SmolLM2-135M Juge** + Audio Ensemble")

    with gr.Row():
        # Left column: input video and the trigger button.
        with gr.Column():
            video_input = gr.Video(label="Vidéo du chat")
            btn = gr.Button("🚀 ANALYSER", variant="primary", size="lg")
        # Right column: text report and audio score chart.
        with gr.Column():
            report_out = gr.Textbox(label="Résultat complet", lines=12, interactive=False)
            chart_out = gr.Plot(label="Distribution des émotions (Audio)")

    # Run the pipeline on click and route its two outputs to the widgets.
    btn.click(
        fn=analyze_cat_v12_final,
        inputs=video_input,
        outputs=[report_out, chart_out],
    )

demo.launch()