hrlima committed on
Commit
79f2787
·
verified ·
1 Parent(s): a1238de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +229 -42
app.py CHANGED
@@ -3,6 +3,8 @@ import json
3
  import base64
4
  import tempfile
5
  import requests
 
 
6
  import firebase_admin
7
  from firebase_admin import credentials, firestore
8
  from flask import Flask, request, jsonify
@@ -24,23 +26,48 @@ try:
24
  except Exception as e:
25
  print(f"❌ Erro ao inicializar Firebase: {e}")
26
 
27
- # ====== MODELO (AUDIO) ======
28
- # Usamos pipeline de audio-classification com o modelo Whisper fine-tuned fornecido
29
  try:
30
  audio_pipeline = pipeline(
31
  task="audio-classification",
32
  model="firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"
33
  )
34
- print("✅ Modelo de reconhecimento de emoção por voz carregado com sucesso!")
35
  except Exception as e:
36
- print(f"❌ Erro ao carregar modelo de áudio: {e}")
37
  audio_pipeline = None
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  # ====== MAPEAMENTO DE EMOÇÕES (ING->PT) ======
40
  emotion_labels = {
41
  "angry": "raiva",
42
  "disgust": "insegurança",
43
  "fearful": "ansiedade",
 
44
  "happy": "alegria",
45
  "neutral": "neutro",
46
  "sad": "tristeza",
@@ -74,7 +101,7 @@ EMOTION_KEYWORDS = {
74
  }
75
 
76
  def fallback_emotion(text):
77
- text_lower = text.lower()
78
  match_counts = {k: sum(1 for w in v if w in text_lower) for k, v in EMOTION_KEYWORDS.items()}
79
  emotion = max(match_counts, key=match_counts.get)
80
  if match_counts[emotion] == 0:
@@ -88,7 +115,7 @@ def fallback_emotion(text):
88
  "debug": "Fallback ativado"
89
  }
90
 
91
- # ====== AJUSTE HÍBRIDO ======
92
  def hybrid_emotion(text, result):
93
  text_lower = (text or "").lower()
94
  detected = result.get("emotion", "neutro")
@@ -133,94 +160,252 @@ def fetch_url_to_tempfile(url):
133
  suffix = ".mp3"
134
  return save_bytes_to_tempfile(r.content, suffix=suffix)
135
 
136
- # ====== ROTA DE ANÁLISE ======
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  @app.route("/analyze", methods=["POST"])
138
  def analyze():
139
  try:
140
- # suportar multipart/form-data com file
141
  audio_path = None
142
  audio_bytes = None
143
  data = None
144
 
145
- # prioridade: arquivos enviados via multipart/form-data
146
  if "file" in request.files:
147
  f = request.files["file"]
148
  audio_bytes = f.read()
149
-
150
  else:
151
- # tentar JSON
152
  try:
153
  data = request.get_json(silent=True)
154
  except Exception:
155
  data = None
156
 
157
  if data:
158
- # base64
159
  if "audio_base64" in data:
160
  audio_bytes = base64.b64decode(data["audio_base64"])
161
- # url
162
  elif "audio_url" in data:
163
  audio_path = fetch_url_to_tempfile(data["audio_url"])
164
- # se vier apenas 'text', usar fallback textual
165
  elif "text" in data and (not audio_bytes and not audio_path):
166
- text = data["text"]
167
- return jsonify(fallback_emotion(text))
168
 
169
- # se temos bytes, salva como tempfile
170
  if audio_bytes:
171
  audio_path = save_bytes_to_tempfile(audio_bytes, suffix=".wav")
172
 
173
- # se não há áudio, retornar erro ou fallback
174
  if not audio_path:
175
- # se data com text foi tratado acima; aqui devolvemos erro pedindo áudio/text
176
- return jsonify({"error": "Nenhum áudio foi enviado. Envie 'file' (multipart/form-data), ou 'audio_base64'/'audio_url', ou 'text' para fallback."}), 400
177
 
178
- # ====== Chamar pipeline de áudio ======
179
  if not audio_pipeline:
180
- # pipeline indisponível -> tentar extrair texto (se disponível) ou fallback
181
- # se houver 'text' em JSON, use fallback_emotion
182
  if data and "text" in data:
183
  return jsonify(fallback_emotion(data["text"]))
184
  return jsonify({"error": "Modelo de áudio indisponível no momento."}), 500
185
 
186
- # A pipeline aceita caminho para arquivo
187
- raw_result = audio_pipeline(audio_path, top_k=10) # lista de dicts: [{'label':..., 'score':...}, ...]
188
- # Exemplo: raw_result = [{'label': 'Happy', 'score': 0.9}, ...]
189
- # Normalizar labels para minúsculas
190
- scores = {}
191
  for item in raw_result:
192
  label = item.get("label", "").lower()
193
- # alguns modelos usam 'fear' vs 'fearful' etc. padronizar
194
  if label == "fear":
195
  label = "fearful"
196
- scores[label] = float(item.get("score", 0.0))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
- if not scores:
199
- return jsonify({"error": "Nenhum rótulo retornado pelo modelo."}), 500
 
 
200
 
201
- top_label = max(scores, key=scores.get)
202
- confidence = round(scores[top_label], 2)
 
 
203
  emotion_pt = emotion_labels.get(top_label, "desconhecido")
204
 
205
- # Ajuste especial: se for tristeza muito forte -> 'depressão'
206
- if emotion_pt == "tristeza" and confidence >= 0.9:
207
  emotion_pt = "depressão"
208
 
209
  # montar probabilidades mapeadas para pt (mantendo somente rótulos conhecidos)
210
- probabilities_pt = { emotion_labels.get(k, k): round(v, 3) for k, v in scores.items() }
 
 
211
 
 
212
  base_result = {
213
  "status": "ok",
214
  "emotion": emotion_pt,
215
  "emode": [emotion_pt],
216
- "confidence": confidence,
217
  "probabilities": probabilities_pt,
218
  "suggestion": gerar_sugestao(emotion_pt),
219
- "debug": "Modelo de áudio utilizado"
 
 
 
 
 
 
220
  }
221
 
222
- # Ler (tentar) a transcrição de texto se o modelo retornar (muitos pipelines de audio-classification não transcrevem)
223
- # Como fallback híbrido, se o usuário mandou também 'text' no JSON, usaremos isso para o híbrido.
224
  text_for_hybrid = None
225
  if data and "text" in data:
226
  text_for_hybrid = data["text"]
@@ -230,6 +415,7 @@ def analyze():
230
  return jsonify(final_result)
231
 
232
  except Exception as e:
 
233
  return jsonify({"error": str(e)}), 500
234
  finally:
235
  # limpar tempfiles (se existirem)
@@ -240,4 +426,5 @@ def analyze():
240
  pass
241
 
242
  if __name__ == "__main__":
243
- app.run(host="0.0.0.0", port=int(os.getenv("PORT", 7860)))
 
 
3
  import base64
4
  import tempfile
5
  import requests
6
+ import math
7
+ import numpy as np
8
  import firebase_admin
9
  from firebase_admin import credentials, firestore
10
  from flask import Flask, request, jsonify
 
26
  except Exception as e:
27
  print(f"❌ Erro ao inicializar Firebase: {e}")
28
 
29
+ # ====== PIPELINES ======
30
+ # 1) Pipeline de classificação de áudio (modelo Whisper fine-tuned)
31
  try:
32
  audio_pipeline = pipeline(
33
  task="audio-classification",
34
  model="firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"
35
  )
36
+ print("✅ audio_pipeline carregado.")
37
  except Exception as e:
38
+ print(f"❌ Erro ao carregar audio_pipeline: {e}")
39
  audio_pipeline = None
40
 
41
+ # 2) Pipeline ASR (transcrição) - usar Whisper para obter texto que ajudará no texto-classifier
42
+ # Note: dependendo do ambiente, carregar whisper-large-v3 pode ser pesado.
43
+ try:
44
+ asr_pipeline = pipeline(
45
+ task="automatic-speech-recognition",
46
+ model="openai/whisper-large-v3"
47
+ )
48
+ print("✅ asr_pipeline carregado.")
49
+ except Exception as e:
50
+ print(f"⚠️ ASR indisponível: {e}")
51
+ asr_pipeline = None
52
+
53
+ # 3) Pipeline de classificação de texto (para multimodal ensemble)
54
+ try:
55
+ text_pipeline = pipeline(
56
+ task="text-classification",
57
+ model="pysentimiento/robertuito-emotion-analysis",
58
+ return_all_scores=True
59
+ )
60
+ print("✅ text_pipeline carregado.")
61
+ except Exception as e:
62
+ print(f"⚠️ text_pipeline indisponível: {e}")
63
+ text_pipeline = None
64
+
65
  # ====== MAPEAMENTO DE EMOÇÕES (ING->PT) ======
66
  emotion_labels = {
67
  "angry": "raiva",
68
  "disgust": "insegurança",
69
  "fearful": "ansiedade",
70
+ "fear": "ansiedade",
71
  "happy": "alegria",
72
  "neutral": "neutro",
73
  "sad": "tristeza",
 
101
  }
102
 
103
  def fallback_emotion(text):
104
+ text_lower = (text or "").lower()
105
  match_counts = {k: sum(1 for w in v if w in text_lower) for k, v in EMOTION_KEYWORDS.items()}
106
  emotion = max(match_counts, key=match_counts.get)
107
  if match_counts[emotion] == 0:
 
115
  "debug": "Fallback ativado"
116
  }
117
 
118
+ # ====== AJUSTE HÍBRIDO (mantido) ======
119
  def hybrid_emotion(text, result):
120
  text_lower = (text or "").lower()
121
  detected = result.get("emotion", "neutro")
 
160
  suffix = ".mp3"
161
  return save_bytes_to_tempfile(r.content, suffix=suffix)
162
 
163
# ====== UTIL: temperature-scaled softmax for probability calibration ======
def tempered_softmax(scores_dict, temperature=1.0):
    """Re-calibrate a ``{label: probability}`` mapping with a tempered softmax.

    Incoming scores are treated as probabilities, mapped to logits via the
    log-odds transform, divided by *temperature*, and renormalized with a
    numerically stable softmax. Temperatures below 1 sharpen the
    distribution; above 1 they flatten it.

    Returns a dict over the same labels whose values sum to 1.
    """
    keys = list(scores_dict)
    # Clip away exact 0/1 so the log-odds transform stays finite.
    clipped = np.clip(
        np.array([scores_dict[k] for k in keys], dtype=float), 1e-8, 1 - 1e-8
    )
    # Probabilities -> approximate logits (log-odds).
    logits = np.log(clipped / (1 - clipped))
    # Guard against a zero/negative temperature blowing up the division.
    scaled = logits / max(temperature, 1e-6)
    # Subtract the max before exponentiating for numerical stability.
    stabilized = np.exp(scaled - np.max(scaled))
    weights = stabilized / np.sum(stabilized)
    return dict(zip(keys, weights))
177
+
178
# ====== UTIL: element-wise mean of several probability dicts ======
def average_probabilities(list_of_prob_dicts):
    """Average a list of ``{label: prob}`` dicts over the union of labels.

    A label missing from a particular dict contributes 0.0 for that dict.
    The averaged values are renormalized to sum to 1 (with a guard against
    an all-zero total). An empty input list yields an empty dict.
    """
    count = len(list_of_prob_dicts)
    if count == 0:
        # Nothing to average: mirror the empty-input contract.
        return {}
    # Union of every label seen across the predictions.
    union = set().union(*(d.keys() for d in list_of_prob_dicts))
    averaged = {
        label: sum(d.get(label, 0.0) for d in list_of_prob_dicts) / count
        for label in union
    }
    # Renormalize so the result is a proper distribution; `or 1.0`
    # avoids division by zero when every value is 0.
    norm = sum(averaged.values()) or 1.0
    return {label: value / norm for label, value in averaged.items()}
199
+
200
+ # ====== ROTA DE ANÁLISE (melhorias de precisão multimodal) ======
201
  @app.route("/analyze", methods=["POST"])
202
  def analyze():
203
  try:
 
204
  audio_path = None
205
  audio_bytes = None
206
  data = None
207
 
208
+ # prioridade: arquivo multipart 'file'
209
  if "file" in request.files:
210
  f = request.files["file"]
211
  audio_bytes = f.read()
 
212
  else:
 
213
  try:
214
  data = request.get_json(silent=True)
215
  except Exception:
216
  data = None
217
 
218
  if data:
 
219
  if "audio_base64" in data:
220
  audio_bytes = base64.b64decode(data["audio_base64"])
 
221
  elif "audio_url" in data:
222
  audio_path = fetch_url_to_tempfile(data["audio_url"])
 
223
  elif "text" in data and (not audio_bytes and not audio_path):
224
+ # apenas texto -> fallback textual
225
+ return jsonify(fallback_emotion(data["text"]))
226
 
 
227
  if audio_bytes:
228
  audio_path = save_bytes_to_tempfile(audio_bytes, suffix=".wav")
229
 
 
230
  if not audio_path:
231
+ return jsonify({"error": "Nenhum áudio foi enviado. Envie 'file', 'audio_base64' ou 'audio_url', ou 'text' para fallback."}), 400
 
232
 
 
233
  if not audio_pipeline:
 
 
234
  if data and "text" in data:
235
  return jsonify(fallback_emotion(data["text"]))
236
  return jsonify({"error": "Modelo de áudio indisponível no momento."}), 500
237
 
238
+ # ====== 1) Classificação de áudio (obter top_k mais completo) ======
239
+ # aumentamos top_k para capturar incertezas e depois re-calibramos
240
+ raw_result = audio_pipeline(audio_path, top_k=15)
241
+ # raw_result geralmente é lista de dicts: [{'label': 'Happy', 'score': 0.9}, ...]
242
+ audio_scores = {}
243
  for item in raw_result:
244
  label = item.get("label", "").lower()
 
245
  if label == "fear":
246
  label = "fearful"
247
+ # some models return labels like 'Happy' or 'HAPPY' etc.
248
+ audio_scores[label] = float(item.get("score", 0.0))
249
+
250
+ if not audio_scores:
251
+ return jsonify({"error": "Nenhum rótulo retornado pelo modelo de áudio."}), 500
252
+
253
+ # ====== 2) Calibrar probabilidades de áudio com temperatura (ajustável) ======
254
+ # temperatura menor -> mais confiante; ajustar conforme necessidade (ex.: 0.7)
255
+ temp = float(os.getenv("AUDIO_SOFTMAX_TEMP", 0.7))
256
+ calibrated_audio_probs = tempered_softmax(audio_scores, temperature=temp)
257
+
258
+ # ====== 3) Tentar transcrever (ASR) e classificar texto (se disponível) ======
259
+ text_probs_list = []
260
+ transcription = None
261
+ if asr_pipeline:
262
+ try:
263
+ asr_out = asr_pipeline(audio_path)
264
+ # asr_out pode ser string ou dict dependendo da versão da pipeline
265
+ if isinstance(asr_out, dict):
266
+ transcription = asr_out.get("text", "") or asr_out.get("transcription", "")
267
+ else:
268
+ transcription = str(asr_out)
269
+ transcription = (transcription or "").strip()
270
+ # split into sentences for per-sentence classification (if long)
271
+ if transcription:
272
+ sentences = [s.strip() for s in transcription.replace("\n", " ").split(".") if s.strip()]
273
+ # limit to first N sentences to avoid long processing
274
+ max_sentences = 6
275
+ for s in sentences[:max_sentences]:
276
+ if text_pipeline:
277
+ text_scores = text_pipeline(s, return_all_scores=True)
278
+ # text_scores often returns a list with one element (list of label/score)
279
+ if isinstance(text_scores, list) and len(text_scores) > 0:
280
+ scores_list = text_scores[0]
281
+ # convert to map label->score
282
+ tmap = {}
283
+ for it in scores_list:
284
+ lbl = it.get("label", "").lower()
285
+ # map textual labels to our english subset if needed
286
+ tmap[lbl] = float(it.get("score", 0.0))
287
+ # normalize softmax (already probs, but ensure normalization and map labels to english keys)
288
+ # keep original labels (e.g., 'joy','sadness','anger','fear','others')
289
+ text_probs_list.append(tmap)
290
+ # if no sentences or classifier missing, attempt single-shot classify entire transcription
291
+ if not text_probs_list and text_pipeline and transcription:
292
+ text_scores = text_pipeline(transcription, return_all_scores=True)
293
+ if isinstance(text_scores, list) and len(text_scores) > 0:
294
+ scores_list = text_scores[0]
295
+ tmap = {}
296
+ for it in scores_list:
297
+ tmap[it.get("label", "").lower()] = float(it.get("score", 0.0))
298
+ text_probs_list.append(tmap)
299
+ except Exception as e:
300
+ # ASR failing shouldn't break the pipeline; apenas logar e seguir com áudio
301
+ print(f"⚠️ ASR falhou: {e}")
302
+
303
+ # agregue as probabilidades de texto (média)
304
+ combined_text_probs = {}
305
+ if text_probs_list:
306
+ combined_text_probs = average_probabilities(text_probs_list)
307
+ # dobrar a confiabilidade de texto se houver muitas sentenças -> confiabilidade maior
308
+ # map text labels (example: pysentimiento uses 'joy','sadness','anger','fear','others')
309
+ # convert to our english labels set used in audio if possible
310
+ # build a mapped version of text probs to common labels
311
+ text_to_common = {}
312
+ for k, v in combined_text_probs.items():
313
+ kl = k.lower()
314
+ # tenta mapear palavras comuns
315
+ if "joy" in kl or "happy" in kl or "alegr" in kl:
316
+ text_to_common["happy"] = v
317
+ elif "sad" in kl or "sadness" in kl:
318
+ text_to_common["sad"] = v
319
+ elif "anger" in kl or "angry" in kl:
320
+ text_to_common["angry"] = v
321
+ elif "fear" in kl or "anx" in kl:
322
+ text_to_common["fearful"] = v
323
+ elif "disgust" in kl:
324
+ text_to_common["disgust"] = v
325
+ elif "others" in kl or "neutral" in kl:
326
+ text_to_common["neutral"] = v
327
+ else:
328
+ # keep as-is for potential mapping later
329
+ text_to_common[kl] = v
330
+
331
+ # normalize mapped text_to_common
332
+ if text_to_common:
333
+ total = sum(text_to_common.values()) or 1.0
334
+ for k in list(text_to_common.keys()):
335
+ text_to_common[k] = text_to_common[k] / total
336
+
337
+ # ====== 4) Ensemble multimodal: combinar probabilidades de áudio e texto
338
+ # pesos base — ajustar conforme experimento (audio tende a carregar sinal prosódico)
339
+ base_weight_audio = float(os.getenv("WEIGHT_AUDIO", 0.65))
340
+ base_weight_text = float(os.getenv("WEIGHT_TEXT", 0.35))
341
+
342
+ # ajustar pesos dinamicamente pela confiança: se ASR/text forte -> aumentar peso text
343
+ # compute confidence proxies
344
+ audio_conf_proxy = max(calibrated_audio_probs.values()) # [0..1]
345
+ text_conf_proxy = max(text_to_common.values()) if text_to_common else 0.0
346
+
347
+ # scale weights
348
+ # quanto maior a confiança relativa, maior o peso
349
+ if (audio_conf_proxy + text_conf_proxy) > 0:
350
+ weight_audio = base_weight_audio * (audio_conf_proxy / (audio_conf_proxy + text_conf_proxy))
351
+ weight_text = base_weight_text * (text_conf_proxy / (audio_conf_proxy + text_conf_proxy))
352
+ # renormalize to sum to 1 if both non-zero, otherwise fallback
353
+ s = weight_audio + weight_text
354
+ if s > 0:
355
+ weight_audio = weight_audio / s
356
+ weight_text = weight_text / s
357
+ else:
358
+ # fallback para pesos base
359
+ weight_audio = base_weight_audio
360
+ weight_text = base_weight_text
361
+
362
+ # Build unified set of labels
363
+ all_labels = set(list(calibrated_audio_probs.keys()) + list(text_to_common.keys()))
364
+ merged_probs = {}
365
+ for lbl in all_labels:
366
+ a = calibrated_audio_probs.get(lbl, 0.0)
367
+ t = text_to_common.get(lbl, 0.0)
368
+ merged = a * weight_audio + t * weight_text
369
+ merged_probs[lbl] = merged
370
 
371
+ # normalize merged
372
+ total_m = sum(merged_probs.values()) or 1.0
373
+ for k in merged_probs:
374
+ merged_probs[k] = merged_probs[k] / total_m
375
 
376
+ # ====== 5) Escolher rótulo final e montar resposta ======
377
+ top_label = max(merged_probs, key=merged_probs.get)
378
+ top_score = merged_probs[top_label]
379
+ # map to portuguese
380
  emotion_pt = emotion_labels.get(top_label, "desconhecido")
381
 
382
+ # ajuste para tristeza muito forte
383
+ if emotion_pt == "tristeza" and top_score >= 0.92:
384
  emotion_pt = "depressão"
385
 
386
  # montar probabilidades mapeadas para pt (mantendo somente rótulos conhecidos)
387
+ probabilities_pt = {}
388
+ for k, v in merged_probs.items():
389
+ probabilities_pt[emotion_labels.get(k, k)] = round(float(v), 3)
390
 
391
+ # construir resultado base
392
  base_result = {
393
  "status": "ok",
394
  "emotion": emotion_pt,
395
  "emode": [emotion_pt],
396
+ "confidence": round(float(top_score), 3),
397
  "probabilities": probabilities_pt,
398
  "suggestion": gerar_sugestao(emotion_pt),
399
+ "debug": {
400
+ "audio_raw": audio_scores,
401
+ "audio_calibrated": {k: round(float(v), 3) for k, v in calibrated_audio_probs.items()},
402
+ "text_transcription": transcription,
403
+ "text_mapped_probs": {k: round(float(v), 3) for k, v in text_to_common.items()},
404
+ "weights": {"audio": round(weight_audio, 3), "text": round(weight_text, 3)}
405
+ }
406
  }
407
 
408
+ # aplicar híbrido com fallback textual se houver 'text' no JSON
 
409
  text_for_hybrid = None
410
  if data and "text" in data:
411
  text_for_hybrid = data["text"]
 
415
  return jsonify(final_result)
416
 
417
  except Exception as e:
418
+ print(f"❌ Erro na rota /analyze: {e}")
419
  return jsonify({"error": str(e)}), 500
420
  finally:
421
  # limpar tempfiles (se existirem)
 
426
  pass
427
 
428
  if __name__ == "__main__":
429
+ # porta padrão ou PORT env var
430
+ app.run(host="0.0.0.0", port=int(os.getenv("PORT", 7860)))