ericjedha committed on
Commit
be73d82
·
verified ·
1 Parent(s): 281d274

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +233 -11
app.py CHANGED
@@ -1,17 +1,239 @@
1
- ⚖️ DÉCISION DU JUGE (SmolLM) :
2
- This sequence of actions indicates that the cat is either defensive or alert. The ears are back, mouth open, and the ears are back, which is a defensive posture. The ears are back, which is a defensive posture, and the mouth is open, which is a relaxed posture. This sequence of actions suggests that the cat is either preparing to defend itself or is in a relaxed state.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  ------------------------------------------
5
- 👁️ VISION (Yeux) :
6
- 1. Description: Describe ears and posture.
7
- 2. Avis: Mood?
8
- Assistant: Ears back, mouth open.
 
 
 
 
 
 
 
9
 
10
- Ears back, mouth open.
 
 
11
 
12
- This sequence of actions indicates that the cat is either defensive or alert. The ears are back, which is a defensive posture, and the mouth is open, which is a relaxed posture. The ears are back, which is a defensive posture, and the mouth is open, which is a relaxed posture. This sequence of actions suggests that the cat is either preparing to defend itself or is in a relaxed state.
 
13
 
14
- 📊 AUDIO (Oreilles) :
15
- HUNT (62.6%)
 
 
 
 
 
 
 
 
 
 
16
 
17
- ⏱️ TOTAL : 44.85s
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import gradio as gr
4
+ import librosa
5
+ import numpy as np
6
+ import cv2
7
+ import timm
8
+ import os
9
+ import time
10
+ import spaces
11
+ import plotly.express as px
12
+ from huggingface_hub import hf_hub_download
13
+ from transformers import (
14
+ AutoProcessor,
15
+ AutoModelForImageTextToText,
16
+ ASTFeatureExtractor,
17
+ ASTForAudioClassification,
18
+ AutoModelForCausalLM,
19
+ AutoTokenizer
20
+ )
21
+ from moviepy import VideoFileClip
22
+ import subprocess
23
+
24
# --- Configuration ---
# Class labels, in the index order used by the audio classifiers' outputs.
CATEGORIES = [
    'affection',
    'angry',
    'back_off',
    'defensive',
    'feed_me',
    'happy',
    'hunt',
    'in_heat',
    'mother_call',
    'pain',
    'wants_attention',
]
# Run on the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
27
+
28
+ # ==========================================
29
+ # 1. CHARGEMENT DE LA TRINITÉ
30
+ # ==========================================
31
def load_models():
    """Load every model used by the pipeline and move each one to DEVICE.

    Returns:
        A 5-tuple ``(vlm_proc, vlm_model, llm_tok, llm_model, audio_models)``.
        ``audio_models`` maps ``'A'`` and ``'B'`` to timm ViT classifiers,
        ``'C'`` to the AST classifier and ``'ast_ext'`` to the AST feature
        extractor.
    """
    print("📥 Initialisation de la Trinité (VLM + LLM + Audio)...")

    # bfloat16 on GPU, float32 on CPU; shared by the VLM and the LLM.
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    n_classes = len(CATEGORIES)

    # Eyes: SmolVLM 256M
    vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
    vlm_proc = AutoProcessor.from_pretrained(vlm_id)
    vlm_model = AutoModelForImageTextToText.from_pretrained(vlm_id, torch_dtype=dtype)
    vlm_model = vlm_model.to(DEVICE).eval()

    # Brain: SmolLM 135M (the judge)
    llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
    llm_tok = AutoTokenizer.from_pretrained(llm_id)
    llm_model = AutoModelForCausalLM.from_pretrained(llm_id, torch_dtype=dtype)
    llm_model = llm_model.to(DEVICE).eval()

    # Ears: audio pillars A and B are ViT classifiers fed spectrogram images.
    vit_checkpoints = {
        'A': ('ericjedha/pilier_a', 'best_pillar_a_e29_f1_0_9005.pth'),
        'B': ('ericjedha/pilier_b', 'best_pillar_b_f1_09103.pth'),
    }
    audio_models = {}
    for pillar, (repo, filename) in vit_checkpoints.items():
        ckpt_path = hf_hub_download(repo_id=repo, filename=filename)
        vit = timm.create_model("vit_small_patch16_224", num_classes=n_classes, in_chans=3)
        checkpoint = torch.load(ckpt_path, map_location=DEVICE)
        vit.load_state_dict(checkpoint['model_state_dict'])
        audio_models[pillar] = vit.to(DEVICE).eval()

    # Pillar C: AST backbone with a re-sized classification head.
    ast_id = "MIT/ast-finetuned-audioset-10-10-0.4593"
    path_c = hf_hub_download(repo_id="ericjedha/pilier_c", filename="best_pillar_c_ast_v95_2_f1_0_9109.pth")
    model_c = ASTForAudioClassification.from_pretrained(
        ast_id, num_labels=n_classes, ignore_mismatched_sizes=True
    )
    raw_state = torch.load(path_c, map_location=DEVICE)['model_state_dict']
    # The checkpoint keys carry an 'ast.' segment that the HF model does not use.
    # NOTE(review): strict=False silently ignores any key that still mismatches.
    renamed_state = {key.replace('ast.', ''): tensor for key, tensor in raw_state.items()}
    model_c.load_state_dict(renamed_state, strict=False)
    audio_models['C'] = model_c.to(DEVICE).eval()
    audio_models['ast_ext'] = ASTFeatureExtractor.from_pretrained(ast_id)

    return vlm_proc, vlm_model, llm_tok, llm_model, audio_models


# Load everything once at import time so each request reuses the same models.
vlm_proc, vlm_model, llm_tok, llm_model, audio_models = load_models()
67
+
68
+ # ==========================================
69
+ # 2. FONCTIONS UTILITAIRES
70
+ # ==========================================
71
def get_audio_probs(audio_path):
    """Return the ensemble class probabilities for an audio file.

    The waveform is loaded at 16 kHz (at most 5 s), zero-padded to 48000
    samples when shorter, and scored by the three audio pillars. Pillars A
    and B receive a 224x224 mel-spectrogram image; pillar C receives the raw
    waveform through the AST feature extractor. The three softmax outputs
    are combined with fixed weights.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        A NumPy array holding the weighted probabilities of the first (only)
        batch item.
    """
    sample_rate = 16000
    min_samples = 48000

    wave, _ = librosa.load(audio_path, sr=sample_rate, duration=5.0)
    if len(wave) < min_samples:
        wave = np.pad(wave, (0, min_samples - len(wave)))

    # Spectrogram image shared by pillars A and B.
    mel = librosa.feature.melspectrogram(y=wave, sr=sample_rate, n_mels=192)
    mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
    blank_rows = np.zeros((10, mel_db.shape[1]))
    stacked = np.vstack([mel_db, blank_rows])
    # NOTE(review): values of mel_db below 0 are cast to uint8 without
    # clipping; kept as-is because it may match the training preprocessing.
    img = cv2.resize((stacked * 255).astype(np.uint8), (224, 224))
    img_t = torch.tensor(img).unsqueeze(0).repeat(1, 3, 1, 1).float().to(DEVICE) / 255.0

    with torch.no_grad():
        pa = F.softmax(audio_models['A'](img_t), dim=1)
        pb = F.softmax(audio_models['B'](img_t), dim=1)
        ast_inputs = audio_models['ast_ext'](wave, sampling_rate=sample_rate, return_tensors="pt").to(DEVICE)
        pc = F.softmax(audio_models['C'](**ast_inputs).logits, dim=1)

    ensemble = pa * 0.3468 + pb * 0.2762 + pc * 0.3770
    return ensemble.cpu().numpy()[0]
84
+
85
def call_peace_judge(audio_top, vlm_desc):
    """Ask the judge LLM to reconcile the audio prediction with the video description.

    Args:
        audio_top: Text summary of the audio prediction, inserted verbatim
            into the prompt.
        vlm_desc: Text description produced by the vision model, inserted
            verbatim into the prompt.

    Returns:
        The text generated by the LLM (prompt excluded), stripped of
        surrounding whitespace.
    """
    prompt_text = f"""You are a feline behavior expert. Decide the final cat mood.
CONTEXT:
- Audio Sensor predicts: {audio_top}
- Video Sensor describes: {vlm_desc}

RULES:
- If Video describes 'ears back', 'teeth', or 'rigid', prioritize BACK_OFF/ANGRY.
- Be concise and avoid repetition.

VERDICT: [CATEGORY NAME]
REASON: [1 short sentence]"""

    messages = [{"role": "user", "content": prompt_text}]
    full_prompt_string = llm_tok.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = llm_tok(full_prompt_string, return_tensors="pt").to(DEVICE)
    prompt_len = model_inputs["input_ids"].shape[1]

    with torch.no_grad():
        # Greedy decoding: no `temperature` here, because it is ignored when
        # do_sample=False and only makes transformers emit a warning.
        generated_ids = llm_model.generate(
            **model_inputs,
            max_new_tokens=80,
            do_sample=False,
            repetition_penalty=1.2,
            pad_token_id=llm_tok.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    decoded = llm_tok.decode(generated_ids[0][prompt_len:], skip_special_tokens=True)
    return decoded.strip()
114
+
115
+ # ==========================================
116
+ # 3. PIPELINE ANALYSE V12.1
117
+ # ==========================================
118
def _pick_verdict(judge_text, fallback):
    """Return the category the judge names first in its answer, or *fallback*.

    Matching is case-insensitive and treats '_' and ' ' as the same
    character, so "back off" matches the 'back_off' category.
    """
    normalized = judge_text.upper().replace("_", " ")
    best_pos, best_label = None, fallback
    for cat in CATEGORIES:
        pos = normalized.find(cat.upper().replace("_", " "))
        if pos != -1 and (best_pos is None or pos < best_pos):
            best_pos, best_label = pos, cat.upper()
    return best_label


@spaces.GPU(duration=60)
def analyze_cat_v12_final(video_path):
    """Run the audio -> vision -> judge pipeline on a video and build the report.

    Args:
        video_path: Path of the uploaded video; a falsy value means that no
            video was provided.

    Returns:
        A tuple ``(report, fig, annotated_video_path)``. When no video is
        given or any step fails, the first element is an error message and
        the other two are ``None``.
    """
    if not video_path:
        return "❌ Aucune vidéo.", None, None

    pid = os.getpid()
    tmp_audio = f"temp_{pid}.wav"
    tmp_no_audio = f"no_audio_{pid}.mp4"
    tmp_output_video = f"annotated_{pid}.mp4"
    start_total = time.time()

    try:
        # --- PHASE 1: AUDIO (the ears) ---
        t_audio_start = time.time()
        audio_probs = np.zeros(len(CATEGORIES))
        has_audio = False
        clip = VideoFileClip(video_path)
        try:
            if clip.audio:
                clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
                audio_probs = get_audio_probs(tmp_audio)
                has_audio = True
        finally:
            # Release the reader even when extraction or inference fails.
            clip.close()
        t_audio = time.time() - t_audio_start

        # --- PHASE 2: VISION (the eyes) ---
        t_vlm_start = time.time()
        vlm_prompt = (
            "Analyze the cat body language precisely.\n"
            "EXAMPLE:\nDescription: Ears back, mouth open.\nAvis: Defensive.\n\n"
            "YOUR TURN:\n1. Description: Describe ears and posture.\n2. Avis: Mood?"
        )
        messages = [
            {
                "role": "user",
                "content": [
                    # The processor's chat template reads the video from "path".
                    {"type": "video", "path": video_path},
                    {"type": "text", "text": vlm_prompt},
                ],
            }
        ]

        # Cast floating-point inputs to the model dtype (bfloat16 on GPU).
        vlm_inputs = vlm_proc.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(DEVICE, dtype=vlm_model.dtype)

        with torch.no_grad():
            vlm_out = vlm_model.generate(**vlm_inputs, max_new_tokens=100, do_sample=False)

        # Decode only the new tokens so the echoed prompt stays out of the report.
        prompt_len = vlm_inputs["input_ids"].shape[1]
        vlm_res = vlm_proc.batch_decode(vlm_out[:, prompt_len:], skip_special_tokens=True)[0]
        # Safety net in case the model repeats the prompt marker itself.
        vlm_clean = vlm_res.split("YOUR TURN:")[-1].strip()
        t_vlm = time.time() - t_vlm_start

        # --- PHASE 3: JUDGE (the brain) ---
        t_llm_start = time.time()
        if has_audio:
            top_a_idx = np.argmax(audio_probs)
            fallback_verdict = CATEGORIES[top_a_idx].upper()
            audio_context = f"{fallback_verdict} ({audio_probs[top_a_idx]*100:.1f}%)"
        else:
            # Without an audio track the all-zero vector would otherwise be
            # reported as a confident-looking first category.
            fallback_verdict = "UNKNOWN"
            audio_context = "NO AUDIO TRACK"

        judge_decision = call_peace_judge(audio_context, vlm_clean)
        t_llm = time.time() - t_llm_start

        # Final verdict: first category named by the judge, else the audio top-1.
        final_verdict = _pick_verdict(judge_decision, fallback_verdict)

        # --- PHASE 4: ANNOTATION & EXPORT ---
        top5 = np.argsort(audio_probs)[-5:][::-1]
        fig = px.bar(
            x=[audio_probs[i] * 100 for i in top5],
            y=[CATEGORIES[i].upper() for i in top5],
            orientation='h',
            title='Entrée Audio',
        )

        cap = cv2.VideoCapture(video_path)
        out_v = None
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            w_v = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            h_v = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            out_v = cv2.VideoWriter(tmp_no_audio, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w_v, h_v))
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                # Black banner at the top of the frame with the verdict on it.
                cv2.rectangle(frame, (0, 0), (w_v, 65), (0, 0, 0), -1)
                cv2.putText(frame, f"JUDGE: {final_verdict}", (20, 45),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 255), 3)
                out_v.write(frame)
        finally:
            cap.release()
            if out_v is not None:
                out_v.release()

        # Re-attach the original audio. When there is none, or ffmpeg fails,
        # ship the silent annotated video instead of a path that does not exist.
        muxed = False
        if has_audio:
            try:
                mux = subprocess.run(
                    ['ffmpeg', '-i', tmp_no_audio, '-i', video_path,
                     '-c:v', 'copy', '-c:a', 'aac',
                     '-map', '0:v:0', '-map', '1:a:0', '-y', tmp_output_video],
                    capture_output=True,
                )
                muxed = mux.returncode == 0
            except OSError:
                muxed = False
        if not muxed:
            os.replace(tmp_no_audio, tmp_output_video)

        # --- PHASE 5: FINAL REPORT ---
        t_total = time.time() - start_total
        report = f"""⚖️ DÉCISION DU JUGE DE PAIX :
{judge_decision}

------------------------------------------
👁️ ANALYSE VISUELLE (VLM) :
{vlm_clean}

📊 DONNÉES AUDIO :
{audio_context}

⏱️ CHRONOMÈTRES :
• Audio (Piliers A/B/C) : {t_audio:.2f}s
• Vision (SmolVLM) : {t_vlm:.2f}s
• Juge (SmolLM) : {t_llm:.2f}s
• TOTAL : {t_total:.2f}s"""

        return report, fig, tmp_output_video

    except Exception as e:
        # UI boundary: surface the failure in the report box instead of crashing.
        return f"❌ Erreur : {str(e)}", None, None

    finally:
        # Intermediate files are removed on success and on failure alike.
        for leftover in (tmp_audio, tmp_no_audio):
            if os.path.exists(leftover):
                os.remove(leftover)
225
 
226
# --- Gradio interface ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🐱 CatSense POC v12.1 - Trinité Architecture")
    with gr.Row():
        # Input side: the video to analyse and the trigger button.
        with gr.Column():
            video_input = gr.Video()
            btn = gr.Button("🚀 ANALYSE MULTIMODALE", variant="primary")
        # Output side: text report, audio probability chart, annotated video.
        with gr.Column():
            report_out = gr.Textbox(label="Rapport Expert", lines=18)
            chart_out = gr.Plot()
            video_out = gr.Video(label="Vidéo Expertisée")
    # The three return values of the pipeline map onto the three outputs.
    btn.click(
        fn=analyze_cat_v12_final,
        inputs=video_input,
        outputs=[report_out, chart_out, video_out],
    )

demo.launch()