Update app.py
app.py
CHANGED
@@ -128,55 +128,48 @@ def analyze_cat_v12_final(video_path):
     t_audio = time.time() - t_0

     # --- B. VISION (fresh Processor on every call) ---
-    # … (earlier lines of the old vision block, 131-172, not recoverable from the diff view)
-    cleaned = cleaned[idx + len("assistant:"):].strip()
-
-    # ✅ HERE: use 'cleaned', not 'vlm_clean'
-    vlm_clean = cleaned.split('\n')[0].strip()
-
-
-    t_vlm = time.time() - t_1
+    t_1 = time.time()
+
+    vlm_proc = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct")
+
+    vlm_prompt = (
+        "You are a feline behavior expert. "
+        "Analyze precisely: number and position of ears, state of mouth (open/closed/tense), tail position and movement, and overall body posture. "
+        "Do not interpret mood. Only describe observable features."
+    )
+
+    messages = [{"role": "user", "content": [{"type": "video", "path": video_path}, {"type": "text", "text": vlm_prompt}]}]
+
+    # Tokenize and return the model inputs as tensors
+    vlm_inputs = vlm_proc.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt"
+    ).to(DEVICE)
+
+    input_length = vlm_inputs["input_ids"].shape[1]  # 🔑 number of prompt tokens
+
+    with torch.no_grad():
+        vlm_out = vlm_model.generate(
+            **vlm_inputs,
+            max_new_tokens=80,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9
+        )
+
+    # 🔑 SAFE DECODING: keep only the newly generated tokens
+    gen_tokens = vlm_out[0][input_length:]
+    vlm_clean = vlm_proc.batch_decode(gen_tokens.unsqueeze(0), skip_special_tokens=True)[0]
+
+    # Final cleanup: a single sentence, without "Assistant:"
+    vlm_clean = vlm_clean.strip().split('\n')[0]
+    if vlm_clean.lower().startswith("assistant:"):
+        vlm_clean = vlm_clean.split(":", 1)[-1].strip()
+
+    t_vlm = time.time() - t_1

     # --- C. JUDGE ---
     t_2 = time.time()
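
The core of this change is the decoding step: the old code decoded the full output sequence and then tried to strip an "assistant:" prefix by string matching, while the new code records the prompt length up front and decodes only the tokens generated after it. Below is a minimal self-contained sketch of that pattern, assuming a transformers build with SmolVLM2 support; the AutoModelForImageTextToText loader, the text-only prompt, and the local DEVICE variable are illustrative stand-ins for the app's own vlm_model and DEVICE, which are defined elsewhere in app.py.

# Sketch: decode only the newly generated tokens (assumption: a recent
# transformers release with SmolVLM2 support; prompt and loader are
# illustrative, not the app's exact setup).
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"

proc = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID).to(DEVICE)

messages = [{"role": "user", "content": [{"type": "text", "text": "Describe a cat's body language."}]}]
inputs = proc.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(DEVICE)

input_length = inputs["input_ids"].shape[1]  # tokens in the prompt
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=80)

# Slicing at input_length drops the echoed prompt, so the decoded text is
# the answer alone -- no brittle split on an "Assistant:" prefix required.
answer = proc.batch_decode(out[:, input_length:], skip_special_tokens=True)[0].strip()
print(answer)

Slicing by input_length stays correct even if the chat template or its role markers change, which is exactly where the old "assistant:" string split was fragile.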