ericjedha committed on
Commit
fab092e
·
verified ·
1 Parent(s): 6bf2afb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -66
app.py CHANGED
@@ -18,9 +18,8 @@ from transformers import (
18
  AutoModelForCausalLM,
19
  AutoTokenizer
20
  )
21
- from moviepy.editor import VideoFileClip
22
- import decord
23
- decord.bridge.set_bridge('torch') # Nécessaire pour le traitement vidéo de SmolVLM
24
 
25
  # --- Configuration ---
26
  CATEGORIES = ['affection', 'angry', 'back_off', 'defensive', 'feed_me', 'happy', 'hunt', 'in_heat', 'mother_call', 'pain', 'wants_attention']
@@ -32,21 +31,18 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
32
  def load_models():
33
  print("📥 Initialisation CatSense v12.9 (Pure Logic Mode)...")
34
 
35
- # Yeux : SmolVLM 256M
36
  vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
37
  vlm_proc = AutoProcessor.from_pretrained(vlm_id)
38
  vlm_model = AutoModelForImageTextToText.from_pretrained(
39
  vlm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
40
  ).to(DEVICE).eval()
41
 
42
- # Cerveau : SmolLM 135M
43
  llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
44
  llm_tok = AutoTokenizer.from_pretrained(llm_id)
45
  llm_model = AutoModelForCausalLM.from_pretrained(
46
  llm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
47
  ).to(DEVICE).eval()
48
 
49
- # Oreilles : Piliers Audio
50
  audio_models = {}
51
  for p, repo, f in [('A', 'ericjedha/pilier_a', 'best_pillar_a_e29_f1_0_9005.pth'),
52
  ('B', 'ericjedha/pilier_b', 'best_pillar_b_f1_09103.pth')]:
@@ -70,7 +66,8 @@ vlm_proc, vlm_model, llm_tok, llm_model, audio_models = load_models()
70
  # 2. LOGIQUE DU JUGE (SANS ASSISTANT)
71
  # ==========================================
72
  def call_peace_judge(audio_top, vlm_desc):
73
- prompt_text = f"Audio: {audio_top}\nVideo: {vlm_desc}\nVerdict:"
 
74
  inputs = llm_tok(prompt_text, return_tensors="pt").to(DEVICE)
75
 
76
  with torch.no_grad():
@@ -88,24 +85,21 @@ def call_peace_judge(audio_top, vlm_desc):
88
  # ==========================================
89
  @spaces.GPU(duration=120)
90
  def analyze_cat_v12_final(video_path):
91
- if not video_path:
92
- return "❌ Aucune vidéo.", None
93
- if torch.cuda.is_available():
94
- torch.cuda.empty_cache()
95
 
96
  tmp_audio = f"temp_{os.getpid()}.wav"
97
  start_total = time.time()
98
 
99
  try:
100
- # --- A. AUDIO (Oreilles) ---
101
  t_0 = time.time()
102
  clip = VideoFileClip(video_path)
103
  audio_probs = np.zeros(len(CATEGORIES))
104
  if clip.audio:
105
  clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
106
  w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
107
- if len(w) < 48000:
108
- w = np.pad(w, (0, 48000 - len(w)))
109
  mel = librosa.feature.melspectrogram(y=w, sr=16000, n_mels=192)
110
  mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
111
  img = cv2.resize((np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))]) * 255).astype(np.uint8), (224, 224))
@@ -119,57 +113,38 @@ def analyze_cat_v12_final(video_path):
119
  clip.close()
120
  t_audio = time.time() - t_0
121
 
122
- # --- B. VISION (avec chat template correct) ---
123
  t_1 = time.time()
124
 
125
- messages = [
126
- {
127
- "role": "user",
128
- "content": [
129
- {"type": "video", "path": video_path},
130
- {
131
- "type": "text",
132
- "text": "Describe the cat in the video focusing on ears, mouth, tail and body posture. Based on this, what is the cat's mood?"
133
- }
134
- ]
135
- }
136
- ]
137
-
138
- vlm_inputs = vlm_proc.apply_chat_template(
139
- messages,
140
- add_generation_prompt=True,
141
- return_tensors="pt",
142
- return_dict=True
143
- ).to(DEVICE)
144
-
145
  with torch.no_grad():
146
- vlm_out = vlm_model.generate(
147
- **vlm_inputs,
148
- max_new_tokens=80,
149
- do_sample=False
150
- )
151
-
152
- vlm_res = vlm_proc.batch_decode(vlm_out, skip_special_tokens=True)[0]
153
- vlm_clean = vlm_res.strip()
154
  t_vlm = time.time() - t_1
155
 
156
- # --- C. JUGE (Cerveau) ---
157
  t_2 = time.time()
158
- top_a_label = CATEGORIES[np.argmax(audio_probs)].upper()
159
- audio_ctx = f"{top_a_label} ({np.max(audio_probs)*100:.1f}%)"
160
-
161
  judge_decision = call_peace_judge(audio_ctx, vlm_clean)
162
  t_llm = time.time() - t_2
163
 
164
  # --- D. VISUELS ---
165
  top5 = np.argsort(audio_probs)[-5:][::-1]
166
- fig = px.bar(
167
- x=[audio_probs[i]*100 for i in top5],
168
- y=[CATEGORIES[i].upper() for i in top5],
169
- orientation='h',
170
- title='Top 5 Audio Scores',
171
- labels={'x': 'Confidence (%)', 'y': 'Émotion'}
172
- )
173
 
174
  # --- E. RAPPORT ---
175
  t_total = time.time() - start_total
@@ -177,23 +152,21 @@ def analyze_cat_v12_final(video_path):
177
  {judge_decision}
178
 
179
  ------------------------------------------
180
- 👁️ DESCRIPTION VISUELLE (VLM) :
181
  {vlm_clean}
182
 
183
- 📊 AUDIO DOMINANT :
184
  {audio_ctx}
185
 
186
- ⏱️ TEMPS DE TRAITEMENT :
187
  Audio: {t_audio:.2f}s | Vision: {t_vlm:.2f}s | Juge: {t_llm:.2f}s
188
  TOTAL: {t_total:.2f}s"""
189
 
190
- if os.path.exists(tmp_audio):
191
- os.remove(tmp_audio)
192
  return report, fig
193
 
194
  except Exception as e:
195
- if os.path.exists(tmp_audio):
196
- os.remove(tmp_audio)
197
  return f"❌ Erreur : {str(e)}", None
198
 
199
  # --- Interface Gradio ---
@@ -201,12 +174,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
201
  gr.Markdown("# 🐱 CatSense v12.9 - Trinité Simplifiée")
202
  with gr.Row():
203
  with gr.Column():
204
- video_input = gr.Video(label="Uploader une vidéo de chat")
205
- btn = gr.Button("🚀 ANALYSER", variant="primary", size="lg")
206
  with gr.Column():
207
- report_out = gr.Textbox(label="Rapport d'expertise", lines=15)
208
- chart_out = gr.Plot(label="Scores audio (top 5)")
209
-
210
  btn.click(analyze_cat_v12_final, inputs=video_input, outputs=[report_out, chart_out])
211
 
212
  demo.launch()
 
18
  AutoModelForCausalLM,
19
  AutoTokenizer
20
  )
21
+ # CORRECTION MOVIEPY : Import direct pour éviter l'erreur .editor
22
+ from moviepy import VideoFileClip
 
23
 
24
  # --- Configuration ---
25
  CATEGORIES = ['affection', 'angry', 'back_off', 'defensive', 'feed_me', 'happy', 'hunt', 'in_heat', 'mother_call', 'pain', 'wants_attention']
 
31
  def load_models():
32
  print("📥 Initialisation CatSense v12.9 (Pure Logic Mode)...")
33
 
 
34
  vlm_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
35
  vlm_proc = AutoProcessor.from_pretrained(vlm_id)
36
  vlm_model = AutoModelForImageTextToText.from_pretrained(
37
  vlm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
38
  ).to(DEVICE).eval()
39
 
 
40
  llm_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
41
  llm_tok = AutoTokenizer.from_pretrained(llm_id)
42
  llm_model = AutoModelForCausalLM.from_pretrained(
43
  llm_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
44
  ).to(DEVICE).eval()
45
 
 
46
  audio_models = {}
47
  for p, repo, f in [('A', 'ericjedha/pilier_a', 'best_pillar_a_e29_f1_0_9005.pth'),
48
  ('B', 'ericjedha/pilier_b', 'best_pillar_b_f1_09103.pth')]:
 
66
  # 2. LOGIQUE DU JUGE (SANS ASSISTANT)
67
  # ==========================================
68
  def call_peace_judge(audio_top, vlm_desc):
69
+ # Prompt brut pour éviter le blabla
70
+ prompt_text = f"Audio: {audio_top}\nVideo Analysis: {vlm_desc}\nFinal Mood Verdict:"
71
  inputs = llm_tok(prompt_text, return_tensors="pt").to(DEVICE)
72
 
73
  with torch.no_grad():
 
85
  # ==========================================
86
  @spaces.GPU(duration=120)
87
  def analyze_cat_v12_final(video_path):
88
+ if not video_path: return "❌ Aucune vidéo.", None
89
+ if torch.cuda.is_available(): torch.cuda.empty_cache()
 
 
90
 
91
  tmp_audio = f"temp_{os.getpid()}.wav"
92
  start_total = time.time()
93
 
94
  try:
95
+ # --- A. AUDIO ---
96
  t_0 = time.time()
97
  clip = VideoFileClip(video_path)
98
  audio_probs = np.zeros(len(CATEGORIES))
99
  if clip.audio:
100
  clip.audio.write_audiofile(tmp_audio, fps=16000, logger=None)
101
  w, _ = librosa.load(tmp_audio, sr=16000, duration=5.0)
102
+ if len(w) < 48000: w = np.pad(w, (0, 48000 - len(w)))
 
103
  mel = librosa.feature.melspectrogram(y=w, sr=16000, n_mels=192)
104
  mel_db = (librosa.power_to_db(mel, ref=np.max) + 40) / 40
105
  img = cv2.resize((np.vstack([mel_db, np.zeros((10, mel_db.shape[1]))]) * 255).astype(np.uint8), (224, 224))
 
113
  clip.close()
114
  t_audio = time.time() - t_0
115
 
116
+ # --- B. VISION (Prompt Direct avec extraction propre) ---
117
  t_1 = time.time()
118
 
119
+ # On utilise le template pour éviter l'erreur de matching, mais on demande du factuel
120
+ messages = [{
121
+ "role": "user",
122
+ "content": [
123
+ {"type": "video", "path": video_path},
124
+ {"type": "text", "text": "Describe the cat: ears, mouth, tail and body posture. Based on this, what is the cat's mood?"}
125
+ ]
126
+ }]
127
+
128
+ vlm_inputs = vlm_proc.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(DEVICE)
129
+
 
 
 
 
 
 
 
 
 
130
  with torch.no_grad():
131
+ vlm_out = vlm_model.generate(**vlm_inputs, max_new_tokens=80, do_sample=False)
132
+
133
+ # On décode et on retire tout ce qui précède le texte de l'assistant pour avoir l'analyse brute
134
+ full_text = vlm_proc.batch_decode(vlm_out, skip_special_tokens=True)[0]
135
+ vlm_clean = full_text.split("assistant")[-1].strip()
 
 
 
136
  t_vlm = time.time() - t_1
137
 
138
+ # --- C. JUGE ---
139
  t_2 = time.time()
140
+ top_idx = np.argmax(audio_probs)
141
+ audio_ctx = f"{CATEGORIES[top_idx].upper()} ({audio_probs[top_idx]*100:.1f}%)"
 
142
  judge_decision = call_peace_judge(audio_ctx, vlm_clean)
143
  t_llm = time.time() - t_2
144
 
145
  # --- D. VISUELS ---
146
  top5 = np.argsort(audio_probs)[-5:][::-1]
147
+ fig = px.bar(x=[audio_probs[i]*100 for i in top5], y=[CATEGORIES[i].upper() for i in top5], orientation='h', title='Audio Scores')
 
 
 
 
 
 
148
 
149
  # --- E. RAPPORT ---
150
  t_total = time.time() - start_total
 
152
  {judge_decision}
153
 
154
  ------------------------------------------
155
+ 👁️ VISION (VLM) :
156
  {vlm_clean}
157
 
158
+ 📊 AUDIO :
159
  {audio_ctx}
160
 
161
+ ⏱️ CHRONOS :
162
  Audio: {t_audio:.2f}s | Vision: {t_vlm:.2f}s | Juge: {t_llm:.2f}s
163
  TOTAL: {t_total:.2f}s"""
164
 
165
+ if os.path.exists(tmp_audio): os.remove(tmp_audio)
 
166
  return report, fig
167
 
168
  except Exception as e:
169
+ if os.path.exists(tmp_audio): os.remove(tmp_audio)
 
170
  return f"❌ Erreur : {str(e)}", None
171
 
172
  # --- Interface Gradio ---
 
174
  gr.Markdown("# 🐱 CatSense v12.9 - Trinité Simplifiée")
175
  with gr.Row():
176
  with gr.Column():
177
+ video_input = gr.Video()
178
+ btn = gr.Button("🚀 ANALYSER", variant="primary")
179
  with gr.Column():
180
+ report_out = gr.Textbox(label="Résultat", lines=12)
181
+ chart_out = gr.Plot()
182
+
183
  btn.click(analyze_cat_v12_final, inputs=video_input, outputs=[report_out, chart_out])
184
 
185
  demo.launch()