ericjedha committed on
Commit
27d8a18
·
verified ·
1 Parent(s): be73d82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -101
app.py CHANGED
@@ -29,7 +29,7 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
29
  # 1. CHARGEMENT DE LA TRINITÉ
30
  # ==========================================
31
  def load_models():
32
- print("📥 Initialisation de la Trinité (VLM + LLM + Audio)...")
33
 
34
  # Yeux : SmolVLM 256M
35
  vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
@@ -66,125 +66,98 @@ def load_models():
66
  vlm_proc, vlm_model, llm_tok, llm_model, audio_models = load_models()
67
 
68
  # ==========================================
69
- # 2. FONCTIONS UTILITAIRES
70
  # ==========================================
71
- def get_audio_probs(audio_path):
72
- w, _ = librosa.load(audio_path, sr=16000, duration=5.0)
73
- if len(w) < 48000: w = np.pad(w, (0, 48000-len(w)))
74
- mel = librosa.feature.melspectrogram(y=w, sr=16000, n_mels=192)
75
- mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
76
- img = cv2.resize((np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))]) * 255).astype(np.uint8), (224, 224))
77
- img_t = torch.tensor(img).unsqueeze(0).repeat(1, 3, 1, 1).float().to(DEVICE) / 255.0
78
- with torch.no_grad():
79
- pa = F.softmax(audio_models['A'](img_t), dim=1)
80
- pb = F.softmax(audio_models['B'](img_t), dim=1)
81
- ic = audio_models['ast_ext'](w, sampling_rate=16000, return_tensors="pt").to(DEVICE)
82
- pc = F.softmax(audio_models['C'](**ic).logits, dim=1)
83
- return (pa * 0.3468 + pb * 0.2762 + pc * 0.3770).cpu().numpy()[0]
84
-
85
  def call_peace_judge(audio_top, vlm_desc):
86
- prompt_text = f"""You are a feline behavior expert. Decide the final cat mood.
87
- CONTEXT:
88
- - Audio Sensor predicts: {audio_top}
89
- - Video Sensor describes: {vlm_desc}
90
-
91
- RULES:
92
- - If Video describes 'ears back', 'teeth', or 'rigid', prioritize BACK_OFF/ANGRY.
93
- - Be concise and avoid repetition.
94
-
95
- VERDICT: [CATEGORY NAME]
96
- REASON: [1 short sentence]"""
97
 
98
  messages = [{"role": "user", "content": prompt_text}]
99
- full_prompt_string = llm_tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
100
- model_inputs = llm_tok(full_prompt_string, return_tensors="pt").to(DEVICE)
101
 
102
  with torch.no_grad():
103
- generated_ids = llm_model.generate(
104
- **model_inputs,
105
- max_new_tokens=80,
106
- temperature=0.1,
107
- do_sample=False,
108
- repetition_penalty=1.2,
109
- pad_token_id=llm_tok.eos_token_id
110
  )
111
 
112
- decoded = llm_tok.decode(generated_ids[0][len(model_inputs["input_ids"][0]):], skip_special_tokens=True)
113
- return decoded.strip()
114
 
115
  # ==========================================
116
- # 3. PIPELINE ANALYSE V12.1
117
  # ==========================================
118
  @spaces.GPU(duration=60)
119
  def analyze_cat_v12_final(video_path):
120
  if not video_path: return "❌ Aucune vidéo.", None, None
 
 
121
  tmp_audio = f"temp_{os.getpid()}.wav"
122
  tmp_output_video = f"annotated_{os.getpid()}.mp4"
123
  start_total = time.time()
124
 
125
  try:
126
- # --- PHASE 1: AUDIO (Les Oreilles) ---
127
- t_audio_start = time.time()
128
  clip = VideoFileClip(video_path)
129
  audio_probs = np.zeros(len(CATEGORIES))
130
  if clip.audio:
131
  clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
132
- audio_probs = get_audio_probs(tmp_audio)
 
 
 
 
 
 
 
 
 
 
 
 
133
  clip.close()
134
- t_audio = time.time() - t_audio_start
135
-
136
- # --- PHASE 2: VISION (Les Yeux - FIX BY GROK) ---
137
- t_vlm_start = time.time()
138
- vlm_prompt = (
139
- "Analyze the cat body language precisely.\n"
140
- "EXAMPLE:\nDescription: Ears back, mouth open.\nAvis: Defensive.\n\n"
141
- "YOUR TURN:\n1. Description: Describe ears and posture.\n2. Avis: Mood?"
142
- )
143
- messages = [
144
- {
145
- "role": "user",
146
- "content": [
147
- {"type": "video", "video_path": video_path}, # FIX ICI
148
- {"type": "text", "text": vlm_prompt}
149
- ]
150
- }
151
- ]
152
 
153
- # Application du template officiel pour SmolVLM2-Video
154
- vlm_inputs = vlm_proc.apply_chat_template(
155
- messages,
156
- add_generation_prompt=True,
157
- tokenize=True,
158
- return_dict=True,
159
- return_tensors="pt"
160
- ).to(DEVICE)
161
 
162
  with torch.no_grad():
163
- vlm_out = vlm_model.generate(**vlm_inputs, max_new_tokens=100, do_sample=False)
164
 
165
  vlm_res = vlm_proc.batch_decode(vlm_out, skip_special_tokens=True)[0]
166
- vlm_clean = vlm_res.split("YOUR TURN:")[-1].strip() if "YOUR TURN:" in vlm_res else vlm_res.strip()
167
- t_vlm = time.time() - t_vlm_start
168
 
169
- # --- PHASE 3: JUGE (Le Cerveau) ---
170
- t_llm_start = time.time()
171
- top_a_idx = np.argmax(audio_probs)
172
- audio_context = f"{CATEGORIES[top_a_idx].upper()} ({audio_probs[top_a_idx]*100:.1f}%)"
173
 
174
- judge_decision = call_peace_judge(audio_context, vlm_clean)
175
- t_llm = time.time() - t_llm_start
176
 
177
- # Extraction du verdict final
178
- final_verdict = CATEGORIES[top_a_idx].upper()
179
- for cat in CATEGORIES:
180
- if cat.upper() in judge_decision.upper():
181
- final_verdict = cat.upper()
182
- break
183
-
184
- # --- PHASE 4: ANNOTATION & EXPORT ---
185
  top5 = np.argsort(audio_probs)[-5:][::-1]
186
- fig = px.bar(x=[audio_probs[i]*100 for i in top5], y=[CATEGORIES[i].upper() for i in top5], orientation='h', title='Entrée Audio')
187
 
 
 
 
 
 
188
  cap = cv2.VideoCapture(video_path)
189
  fps, w_v, h_v = cap.get(cv2.CAP_PROP_FPS), int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
190
  tmp_no_audio = f"no_audio_{os.getpid()}.mp4"
@@ -193,28 +166,26 @@ def analyze_cat_v12_final(video_path):
193
  ret, frame = cap.read()
194
  if not ret: break
195
  cv2.rectangle(frame, (0,0), (w_v, 65), (0,0,0), -1)
196
- cv2.putText(frame, f"JUDGE: {final_verdict}", (20, 45), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 255), 3)
197
  out_v.write(frame)
198
  cap.release(); out_v.release()
199
  subprocess.run(['ffmpeg', '-i', tmp_no_audio, '-i', video_path, '-c:v', 'copy', '-c:a', 'aac', '-map', '0:v:0', '-map', '1:a:0', '-y', tmp_output_video], capture_output=True)
200
 
201
- # --- PHASE 5: RAPPORT FINAL ---
202
  t_total = time.time() - start_total
203
- report = f"""⚖️ DÉCISION DU JUGE DE PAIX :
204
  {judge_decision}
205
 
206
  ------------------------------------------
207
- 👁️ ANALYSE VISUELLE (VLM) :
208
  {vlm_clean}
209
 
210
- 📊 DONNÉES AUDIO :
211
- {audio_context}
212
 
213
- ⏱️ CHRONOMÈTRES :
214
- Audio (Piliers A/B/C) : {t_audio:.2f}s
215
- • Vision (SmolVLM) : {t_vlm:.2f}s
216
- • Juge (SmolLM) : {t_llm:.2f}s
217
- • TOTAL : {t_total:.2f}s"""
218
 
219
  if os.path.exists(tmp_audio): os.remove(tmp_audio)
220
  if os.path.exists(tmp_no_audio): os.remove(tmp_no_audio)
@@ -225,15 +196,15 @@ def analyze_cat_v12_final(video_path):
225
 
226
  # --- Interface Gradio ---
227
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
228
- gr.Markdown("# 🐱 CatSense POC v12.1 - Trinité Architecture")
229
  with gr.Row():
230
  with gr.Column():
231
  video_input = gr.Video()
232
- btn = gr.Button("🚀 ANALYSE MULTIMODALE", variant="primary")
233
  with gr.Column():
234
- report_out = gr.Textbox(label="Rapport Expert", lines=18)
235
  chart_out = gr.Plot()
236
- video_out = gr.Video(label="Vidéo Expertisée")
237
  btn.click(analyze_cat_v12_final, inputs=video_input, outputs=[report_out, chart_out, video_out])
238
 
239
  demo.launch()
 
29
  # 1. CHARGEMENT DE LA TRINITÉ
30
  # ==========================================
31
  def load_models():
32
+ print("📥 Initialisation CatSense v12.2 (Stateless Mode)...")
33
 
34
  # Yeux : SmolVLM 256M
35
  vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
 
66
  vlm_proc, vlm_model, llm_tok, llm_model, audio_models = load_models()
67
 
68
  # ==========================================
69
+ # 2. LOGIQUE DU JUGE (FEW-SHOT & FAST)
70
  # ==========================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
def call_peace_judge(audio_top, vlm_desc):
    """Fuse the audio and vision signals into one final cat-mood verdict.

    Args:
        audio_top: Short string summarizing the top audio prediction,
            e.g. "HAPPY (87.3%)".
        vlm_desc: Free-text description of the cat produced by the VLM.

    Returns:
        The first line of the judge LLM's answer, stripped of whitespace —
        expected to contain the verdict category name.
    """
    # Ultra-short few-shot prompt so the tiny 135M judge model does not ramble.
    prompt_text = f"""Task: Decide final cat mood.
Example: Audio=HAPPY, Video=Ears back/Hissing -> Verdict: BACK_OFF.
Current: Audio={audio_top}, Video={vlm_desc}.
Verdict:"""

    messages = [{"role": "user", "content": prompt_text}]
    full_prompt = llm_tok.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = llm_tok(full_prompt, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        # Greedy decoding: do_sample=False is already deterministic, so no
        # temperature is passed (transformers ignores it and warns when
        # sampling is disabled). pad_token_id is set explicitly because
        # many small chat models ship without a pad token.
        outputs = llm_model.generate(
            **inputs,
            max_new_tokens=30,
            repetition_penalty=1.5,
            do_sample=False,
            pad_token_id=llm_tok.eos_token_id,
        )

    # Decode only the newly generated tokens (skip the echoed prompt).
    prompt_len = inputs["input_ids"].shape[1]
    res = llm_tok.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return res.strip().split('\n')[0]
93
 
94
  # ==========================================
95
+ # 3. PIPELINE ANALYSE (STATELESS)
96
  # ==========================================
97
  @spaces.GPU(duration=60)
98
  def analyze_cat_v12_final(video_path):
99
  if not video_path: return "❌ Aucune vidéo.", None, None
100
+ if torch.cuda.is_available(): torch.cuda.empty_cache() # Purge mémoire vive
101
+
102
  tmp_audio = f"temp_{os.getpid()}.wav"
103
  tmp_output_video = f"annotated_{os.getpid()}.mp4"
104
  start_total = time.time()
105
 
106
  try:
107
+ # --- A. AUDIO (Oreilles) ---
108
+ t_0 = time.time()
109
  clip = VideoFileClip(video_path)
110
  audio_probs = np.zeros(len(CATEGORIES))
111
  if clip.audio:
112
  clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
113
+ # Logique simplifiée get_audio_probs intégrée ici pour stabilité
114
+ w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
115
+ if len(w) < 48000: w = np.pad(w, (0, 48000-len(w)))
116
+ mel = librosa.feature.melspectrogram(y=w, sr=16000, n_mels=192)
117
+ mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
118
+ img = cv2.resize((np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))]) * 255).astype(np.uint8), (224, 224))
119
+ img_t = torch.tensor(img).unsqueeze(0).repeat(1, 3, 1, 1).float().to(DEVICE) / 255.0
120
+ with torch.no_grad():
121
+ pa = F.softmax(audio_models['A'](img_t), dim=1)
122
+ pb = F.softmax(audio_models['B'](img_t), dim=1)
123
+ ic = audio_models['ast_ext'](w, sampling_rate=16000, return_tensors="pt").to(DEVICE)
124
+ pc = F.softmax(audio_models['C'](**ic).logits, dim=1)
125
+ audio_probs = (pa * 0.3468 + pb * 0.2762 + pc * 0.3770).cpu().numpy()[0]
126
  clip.close()
127
+ t_audio = time.time() - t_0
128
+
129
+ # --- B. VISION (Yeux - Nouveau Prompt Direct) ---
130
+ t_1 = time.time()
131
+ # On ne donne plus d'exemple au VLM pour éviter qu'il ne les répète
132
+ vlm_prompt = "Describe the cat's ears and mouth. Then name the mood."
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
+ messages = [{"role": "user", "content": [{"type": "video", "video_path": video_path}, {"type": "text", "text": vlm_prompt}]}]
135
+ vlm_inputs = vlm_proc.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(DEVICE)
 
 
 
 
 
 
136
 
137
  with torch.no_grad():
138
+ vlm_out = vlm_model.generate(**vlm_inputs, max_new_tokens=50, do_sample=True, temperature=0.1)
139
 
140
  vlm_res = vlm_proc.batch_decode(vlm_out, skip_special_tokens=True)[0]
141
+ vlm_clean = vlm_res.split("assistant")[-1].strip()
142
+ t_vlm = time.time() - t_1
143
 
144
+ # --- C. JUGE (Cerveau) ---
145
+ t_2 = time.time()
146
+ top_a_label = CATEGORIES[np.argmax(audio_probs)].upper()
147
+ audio_ctx = f"{top_a_label} ({np.max(audio_probs)*100:.1f}%)"
148
 
149
+ judge_decision = call_peace_judge(audio_ctx, vlm_clean)
150
+ t_llm = time.time() - t_2
151
 
152
+ # --- D. VISUELS & EXPORT ---
 
 
 
 
 
 
 
153
  top5 = np.argsort(audio_probs)[-5:][::-1]
154
+ fig = px.bar(x=[audio_probs[i]*100 for i in top5], y=[CATEGORIES[i].upper() for i in top5], orientation='h', title='Audio Scores')
155
 
156
+ # Annotation vidéo simplifiée
157
+ final_v = top_a_label
158
+ for cat in CATEGORIES:
159
+ if cat.upper() in judge_decision.upper(): final_v = cat.upper(); break
160
+
161
  cap = cv2.VideoCapture(video_path)
162
  fps, w_v, h_v = cap.get(cv2.CAP_PROP_FPS), int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
163
  tmp_no_audio = f"no_audio_{os.getpid()}.mp4"
 
166
  ret, frame = cap.read()
167
  if not ret: break
168
  cv2.rectangle(frame, (0,0), (w_v, 65), (0,0,0), -1)
169
+ cv2.putText(frame, f"JUDGE: {final_v}", (20, 45), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 255), 3)
170
  out_v.write(frame)
171
  cap.release(); out_v.release()
172
  subprocess.run(['ffmpeg', '-i', tmp_no_audio, '-i', video_path, '-c:v', 'copy', '-c:a', 'aac', '-map', '0:v:0', '-map', '1:a:0', '-y', tmp_output_video], capture_output=True)
173
 
174
+ # --- E. RAPPORT ---
175
  t_total = time.time() - start_total
176
+ report = f"""⚖️ DÉCISION DU JUGE :
177
  {judge_decision}
178
 
179
  ------------------------------------------
180
+ 👁️ VISION (VLM) :
181
  {vlm_clean}
182
 
183
+ 📊 AUDIO :
184
+ {audio_ctx}
185
 
186
+ ⏱️ CHRONOS :
187
+ Audio: {t_audio:.2f}s | Vision: {t_vlm:.2f}s | Juge: {t_llm:.2f}s
188
+ TOTAL: {t_total:.2f}s"""
 
 
189
 
190
  if os.path.exists(tmp_audio): os.remove(tmp_audio)
191
  if os.path.exists(tmp_no_audio): os.remove(tmp_no_audio)
 
196
 
197
  # --- Interface Gradio ---
198
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
199
+ gr.Markdown("# 🐱 CatSense POC v12.2 - Final Trinité")
200
  with gr.Row():
201
  with gr.Column():
202
  video_input = gr.Video()
203
+ btn = gr.Button("🚀 ANALYSE EXPERTE", variant="primary")
204
  with gr.Column():
205
+ report_out = gr.Textbox(label="Rapport Expert", lines=12)
206
  chart_out = gr.Plot()
207
+ video_out = gr.Video(label="Vidéo Annotée")
208
  btn.click(analyze_cat_v12_final, inputs=video_input, outputs=[report_out, chart_out, video_out])
209
 
210
  demo.launch()