ericjedha commited on
Commit
283a965
·
verified ·
1 Parent(s): 04b404d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -49
app.py CHANGED
@@ -128,55 +128,48 @@ def analyze_cat_v12_final(video_path):
128
  t_audio = time.time() - t_0
129
 
130
  # --- B. VISION (Processor FRESH à chaque appel) ---
131
- t_1 = time.time()
132
-
133
- # 🔑 CORRECTION : on charge le processor ICI
134
- vlm_proc = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct")
135
-
136
- # NOUVEAU PROMPT : pur, factuel, sans humeur ni assistant
137
- vlm_prompt = (
138
- "You are a feline behavior expert. "
139
- "Analyze precisely: number and position of ears, state of mouth (open/closed/tense), tail position and movement, and overall body posture. "
140
- "Do not interpret mood. Only describe observable features."
141
- )
142
-
143
- messages = [{"role": "user", "content": [{"type": "video", "path": video_path}, {"type": "text", "text": vlm_prompt}]}]
144
-
145
- vlm_inputs = vlm_proc.apply_chat_template(
146
- messages,
147
- add_generation_prompt=True,
148
- tokenize=True,
149
- return_dict=True,
150
- return_tensors="pt"
151
- ).to(DEVICE)
152
-
153
- with torch.no_grad():
154
- vlm_out = vlm_model.generate(
155
- **vlm_inputs,
156
- max_new_tokens=100,
157
- do_sample=True,
158
- temperature=0.7,
159
- top_p=0.9
160
- )
161
-
162
- vlm_res = vlm_proc.batch_decode(vlm_out, skip_special_tokens=True)[0]
163
-
164
- # Supprimer systématiquement le prompt ET le rôle "Assistant:"
165
- cleaned = vlm_res
166
- if vlm_prompt in cleaned:
167
- cleaned = cleaned.split(vlm_prompt, 1)[-1]
168
- # Supprimer "Assistant:" (même avec majuscules ou espaces)
169
- if "assistant:" in cleaned.lower():
170
- # Trouver la première occurrence de "assistant:" de façon insensible à la casse
171
- idx = cleaned.lower().find("assistant:")
172
- if idx != -1:
173
- cleaned = cleaned[idx + len("assistant:"):].strip()
174
-
175
- # ✅ ICI : utiliser 'cleaned', pas 'vlm_clean'
176
- vlm_clean = cleaned.split('\n')[0].strip()
177
-
178
-
179
- t_vlm = time.time() - t_1
180
 
181
  # --- C. JUGE ---
182
  t_2 = time.time()
 
128
  t_audio = time.time() - t_0
129
 
130
  # --- B. VISION (Processor FRESH à chaque appel) ---
131
+ t_1 = time.time()
132
+
133
+ vlm_proc = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct")
134
+
135
+ vlm_prompt = (
136
+ "You are a feline behavior expert. "
137
+ "Analyze precisely: number and position of ears, state of mouth (open/closed/tense), tail position and movement, and overall body posture. "
138
+ "Do not interpret mood. Only describe observable features."
139
+ )
140
+
141
+ messages = [{"role": "user", "content": [{"type": "video", "path": video_path}, {"type": "text", "text": vlm_prompt}]}]
142
+
143
+ # Tokenize avec retour des inputs
144
+ vlm_inputs = vlm_proc.apply_chat_template(
145
+ messages,
146
+ add_generation_prompt=True,
147
+ tokenize=True,
148
+ return_dict=True,
149
+ return_tensors="pt"
150
+ ).to(DEVICE)
151
+
152
+ input_length = vlm_inputs["input_ids"].shape[1] # 🔑 nombre de tokens du prompt
153
+
154
+ with torch.no_grad():
155
+ vlm_out = vlm_model.generate(
156
+ **vlm_inputs,
157
+ max_new_tokens=80,
158
+ do_sample=True,
159
+ temperature=0.7,
160
+ top_p=0.9
161
+ )
162
+
163
+ # 🔑 DÉCODAGE SÉCURISÉ : uniquement les nouveaux tokens
164
+ gen_tokens = vlm_out[0][input_length:]
165
+ vlm_clean = vlm_proc.batch_decode(gen_tokens.unsqueeze(0), skip_special_tokens=True)[0]
166
+
167
+ # Nettoyage final : une seule phrase, sans "Assistant:"
168
+ vlm_clean = vlm_clean.strip().split('\n')[0]
169
+ if vlm_clean.lower().startswith("assistant:"):
170
+ vlm_clean = vlm_clean.split(":", 1)[-1].strip()
171
+
172
+ t_vlm = time.time() - t_1
 
 
 
 
 
 
 
173
 
174
  # --- C. JUGE ---
175
  t_2 = time.time()