hrlima committed on
Commit
79f2787
·
verified ·
1 Parent(s): a1238de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +229 -42
app.py CHANGED
@@ -3,6 +3,8 @@ import json
3
  import base64
4
  import tempfile
5
  import requests
 
 
6
  import firebase_admin
7
  from firebase_admin import credentials, firestore
8
  from flask import Flask, request, jsonify
@@ -24,23 +26,48 @@ try:
24
  except Exception as e:
25
  print(f"❌ Erro ao inicializar Firebase: {e}")
26
 
27
- # ====== MODELO (AUDIO) ======
28
- # Usamos pipeline de audio-classification com o modelo Whisper fine-tuned fornecido
29
  try:
30
  audio_pipeline = pipeline(
31
  task="audio-classification",
32
  model="firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"
33
  )
34
- print("✅ Modelo de reconhecimento de emoção por voz carregado com sucesso!")
35
  except Exception as e:
36
- print(f"❌ Erro ao carregar modelo de áudio: {e}")
37
  audio_pipeline = None
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  # ====== MAPEAMENTO DE EMOÇÕES (ING->PT) ======
40
  emotion_labels = {
41
  "angry": "raiva",
42
  "disgust": "insegurança",
43
  "fearful": "ansiedade",
 
44
  "happy": "alegria",
45
  "neutral": "neutro",
46
  "sad": "tristeza",
@@ -74,7 +101,7 @@ EMOTION_KEYWORDS = {
74
  }
75
 
76
  def fallback_emotion(text):
77
- text_lower = text.lower()
78
  match_counts = {k: sum(1 for w in v if w in text_lower) for k, v in EMOTION_KEYWORDS.items()}
79
  emotion = max(match_counts, key=match_counts.get)
80
  if match_counts[emotion] == 0:
@@ -88,7 +115,7 @@ def fallback_emotion(text):
88
  "debug": "Fallback ativado"
89
  }
90
 
91
- # ====== AJUSTE HÍBRIDO ======
92
  def hybrid_emotion(text, result):
93
  text_lower = (text or "").lower()
94
  detected = result.get("emotion", "neutro")
@@ -133,94 +160,252 @@ def fetch_url_to_tempfile(url):
133
  suffix = ".mp3"
134
  return save_bytes_to_tempfile(r.content, suffix=suffix)
135
 
136
- # ====== ROTA DE ANÁLISE ======
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  @app.route("/analyze", methods=["POST"])
138
  def analyze():
139
  try:
140
- # suportar multipart/form-data com file
141
  audio_path = None
142
  audio_bytes = None
143
  data = None
144
 
145
- # prioridade: arquivos enviados via multipart/form-data
146
  if "file" in request.files:
147
  f = request.files["file"]
148
  audio_bytes = f.read()
149
-
150
  else:
151
- # tentar JSON
152
  try:
153
  data = request.get_json(silent=True)
154
  except Exception:
155
  data = None
156
 
157
  if data:
158
- # base64
159
  if "audio_base64" in data:
160
  audio_bytes = base64.b64decode(data["audio_base64"])
161
- # url
162
  elif "audio_url" in data:
163
  audio_path = fetch_url_to_tempfile(data["audio_url"])
164
- # se vier apenas 'text', usar fallback textual
165
  elif "text" in data and (not audio_bytes and not audio_path):
166
- text = data["text"]
167
- return jsonify(fallback_emotion(text))
168
 
169
- # se temos bytes, salva como tempfile
170
  if audio_bytes:
171
  audio_path = save_bytes_to_tempfile(audio_bytes, suffix=".wav")
172
 
173
- # se não há áudio, retornar erro ou fallback
174
  if not audio_path:
175
- # se data com text foi tratado acima; aqui devolvemos erro pedindo áudio/text
176
- return jsonify({"error": "Nenhum áudio foi enviado. Envie 'file' (multipart/form-data), ou 'audio_base64'/'audio_url', ou 'text' para fallback."}), 400
177
 
178
- # ====== Chamar pipeline de áudio ======
179
  if not audio_pipeline:
180
- # pipeline indisponível -> tentar extrair texto (se disponível) ou fallback
181
- # se houver 'text' em JSON, use fallback_emotion
182
  if data and "text" in data:
183
  return jsonify(fallback_emotion(data["text"]))
184
  return jsonify({"error": "Modelo de áudio indisponível no momento."}), 500
185
 
186
- # A pipeline aceita caminho para arquivo
187
- raw_result = audio_pipeline(audio_path, top_k=10) # lista de dicts: [{'label':..., 'score':...}, ...]
188
- # Exemplo: raw_result = [{'label': 'Happy', 'score': 0.9}, ...]
189
- # Normalizar labels para minúsculas
190
- scores = {}
191
  for item in raw_result:
192
  label = item.get("label", "").lower()
193
- # alguns modelos usam 'fear' vs 'fearful' etc. padronizar
194
  if label == "fear":
195
  label = "fearful"
196
- scores[label] = float(item.get("score", 0.0))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
- if not scores:
199
- return jsonify({"error": "Nenhum rótulo retornado pelo modelo."}), 500
 
 
200
 
201
- top_label = max(scores, key=scores.get)
202
- confidence = round(scores[top_label], 2)
 
 
203
  emotion_pt = emotion_labels.get(top_label, "desconhecido")
204
 
205
- # Ajuste especial: se for tristeza muito forte -> 'depressão'
206
- if emotion_pt == "tristeza" and confidence >= 0.9:
207
  emotion_pt = "depressão"
208
 
209
  # montar probabilidades mapeadas para pt (mantendo somente rótulos conhecidos)
210
- probabilities_pt = { emotion_labels.get(k, k): round(v, 3) for k, v in scores.items() }
 
 
211
 
 
212
  base_result = {
213
  "status": "ok",
214
  "emotion": emotion_pt,
215
  "emode": [emotion_pt],
216
- "confidence": confidence,
217
  "probabilities": probabilities_pt,
218
  "suggestion": gerar_sugestao(emotion_pt),
219
- "debug": "Modelo de áudio utilizado"
 
 
 
 
 
 
220
  }
221
 
222
- # Ler (tentar) a transcrição de texto se o modelo retornar (muitos pipelines de audio-classification não transcrevem)
223
- # Como fallback híbrido, se o usuário mandou também 'text' no JSON, usaremos isso para o híbrido.
224
  text_for_hybrid = None
225
  if data and "text" in data:
226
  text_for_hybrid = data["text"]
@@ -230,6 +415,7 @@ def analyze():
230
  return jsonify(final_result)
231
 
232
  except Exception as e:
 
233
  return jsonify({"error": str(e)}), 500
234
  finally:
235
  # limpar tempfiles (se existirem)
@@ -240,4 +426,5 @@ def analyze():
240
  pass
241
 
242
  if __name__ == "__main__":
243
- app.run(host="0.0.0.0", port=int(os.getenv("PORT", 7860)))
 
 
3
  import base64
4
  import tempfile
5
  import requests
6
+ import math
7
+ import numpy as np
8
  import firebase_admin
9
  from firebase_admin import credentials, firestore
10
  from flask import Flask, request, jsonify
 
26
  except Exception as e:
27
  print(f"❌ Erro ao inicializar Firebase: {e}")
28
 
29
+ # ====== PIPELINES ======
30
+ # 1) Pipeline de classificação de áudio (modelo Whisper fine-tuned)
31
  try:
32
  audio_pipeline = pipeline(
33
  task="audio-classification",
34
  model="firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"
35
  )
36
+ print("✅ audio_pipeline carregado.")
37
  except Exception as e:
38
+ print(f"❌ Erro ao carregar audio_pipeline: {e}")
39
  audio_pipeline = None
40
 
41
+ # 2) Pipeline ASR (transcrição) - usar Whisper para obter texto que ajudará no texto-classifier
42
+ # Note: dependendo do ambiente, carregar whisper-large-v3 pode ser pesado.
43
+ try:
44
+ asr_pipeline = pipeline(
45
+ task="automatic-speech-recognition",
46
+ model="openai/whisper-large-v3"
47
+ )
48
+ print("✅ asr_pipeline carregado.")
49
+ except Exception as e:
50
+ print(f"⚠️ ASR indisponível: {e}")
51
+ asr_pipeline = None
52
+
53
+ # 3) Pipeline de classificação de texto (para multimodal ensemble)
54
+ try:
55
+ text_pipeline = pipeline(
56
+ task="text-classification",
57
+ model="pysentimiento/robertuito-emotion-analysis",
58
+ return_all_scores=True
59
+ )
60
+ print("✅ text_pipeline carregado.")
61
+ except Exception as e:
62
+ print(f"⚠️ text_pipeline indisponível: {e}")
63
+ text_pipeline = None
64
+
65
  # ====== MAPEAMENTO DE EMOÇÕES (ING->PT) ======
66
  emotion_labels = {
67
  "angry": "raiva",
68
  "disgust": "insegurança",
69
  "fearful": "ansiedade",
70
+ "fear": "ansiedade",
71
  "happy": "alegria",
72
  "neutral": "neutro",
73
  "sad": "tristeza",
 
101
  }
102
 
103
  def fallback_emotion(text):
104
+ text_lower = (text or "").lower()
105
  match_counts = {k: sum(1 for w in v if w in text_lower) for k, v in EMOTION_KEYWORDS.items()}
106
  emotion = max(match_counts, key=match_counts.get)
107
  if match_counts[emotion] == 0:
 
115
  "debug": "Fallback ativado"
116
  }
117
 
118
+ # ====== AJUSTE HÍBRIDO (mantido) ======
119
  def hybrid_emotion(text, result):
120
  text_lower = (text or "").lower()
121
  detected = result.get("emotion", "neutro")
 
160
  suffix = ".mp3"
161
  return save_bytes_to_tempfile(r.content, suffix=suffix)
162
 
163
# ====== UTIL: temperature-scaled softmax for probability calibration ======
def tempered_softmax(scores_dict, temperature=1.0):
    """Re-calibrate a ``{label: probability}`` mapping with a tempered softmax.

    Incoming scores are treated as probabilities, mapped to logits via the
    log-odds transform, divided by *temperature*, and renormalized with a
    numerically stable softmax. Temperatures below 1 sharpen the
    distribution; above 1 they flatten it.

    Returns a dict over the same labels whose values sum to 1.
    """
    keys = list(scores_dict)
    # Clip away exact 0/1 so the log-odds transform stays finite.
    clipped = np.clip(
        np.array([scores_dict[k] for k in keys], dtype=float), 1e-8, 1 - 1e-8
    )
    # Probabilities -> approximate logits (log-odds).
    logits = np.log(clipped / (1 - clipped))
    # Guard against a zero/negative temperature blowing up the division.
    scaled = logits / max(temperature, 1e-6)
    # Subtract the max before exponentiating for numerical stability.
    stabilized = np.exp(scaled - np.max(scaled))
    weights = stabilized / np.sum(stabilized)
    return dict(zip(keys, weights))
177
+
178
# ====== UTIL: element-wise mean of several probability dicts ======
def average_probabilities(list_of_prob_dicts):
    """Average a list of ``{label: prob}`` dicts over the union of labels.

    A label missing from a particular dict contributes 0.0 for that dict.
    The averaged values are renormalized to sum to 1 (with a guard against
    an all-zero total). An empty input list yields an empty dict.
    """
    count = len(list_of_prob_dicts)
    if count == 0:
        # Nothing to average: mirror the empty-input contract.
        return {}
    # Union of every label seen across the predictions.
    union = set().union(*(d.keys() for d in list_of_prob_dicts))
    averaged = {
        label: sum(d.get(label, 0.0) for d in list_of_prob_dicts) / count
        for label in union
    }
    # Renormalize so the result is a proper distribution; `or 1.0`
    # avoids division by zero when every value is 0.
    norm = sum(averaged.values()) or 1.0
    return {label: value / norm for label, value in averaged.items()}
199
+
200
+ # ====== ROTA DE ANÁLISE (melhorias de precisão multimodal) ======
201
  @app.route("/analyze", methods=["POST"])
202
  def analyze():
203
  try:
 
204
  audio_path = None
205
  audio_bytes = None
206
  data = None
207
 
208
+ # prioridade: arquivo multipart 'file'
209
  if "file" in request.files:
210
  f = request.files["file"]
211
  audio_bytes = f.read()
 
212
  else:
 
213
  try:
214
  data = request.get_json(silent=True)
215
  except Exception:
216
  data = None
217
 
218
  if data:
 
219
  if "audio_base64" in data:
220
  audio_bytes = base64.b64decode(data["audio_base64"])
 
221
  elif "audio_url" in data:
222
  audio_path = fetch_url_to_tempfile(data["audio_url"])
 
223
  elif "text" in data and (not audio_bytes and not audio_path):
224
+ # apenas texto -> fallback textual
225
+ return jsonify(fallback_emotion(data["text"]))
226
 
 
227
  if audio_bytes:
228
  audio_path = save_bytes_to_tempfile(audio_bytes, suffix=".wav")
229
 
 
230
  if not audio_path:
231
+ return jsonify({"error": "Nenhum áudio foi enviado. Envie 'file', 'audio_base64' ou 'audio_url', ou 'text' para fallback."}), 400
 
232
 
 
233
  if not audio_pipeline:
 
 
234
  if data and "text" in data:
235
  return jsonify(fallback_emotion(data["text"]))
236
  return jsonify({"error": "Modelo de áudio indisponível no momento."}), 500
237
 
238
+ # ====== 1) Classificação de áudio (obter top_k mais completo) ======
239
+ # aumentamos top_k para capturar incertezas e depois re-calibramos
240
+ raw_result = audio_pipeline(audio_path, top_k=15)
241
+ # raw_result geralmente é lista de dicts: [{'label': 'Happy', 'score': 0.9}, ...]
242
+ audio_scores = {}
243
  for item in raw_result:
244
  label = item.get("label", "").lower()
 
245
  if label == "fear":
246
  label = "fearful"
247
+ # some models return labels like 'Happy' or 'HAPPY' etc.
248
+ audio_scores[label] = float(item.get("score", 0.0))
249
+
250
+ if not audio_scores:
251
+ return jsonify({"error": "Nenhum rótulo retornado pelo modelo de áudio."}), 500
252
+
253
+ # ====== 2) Calibrar probabilidades de áudio com temperatura (ajustável) ======
254
+ # temperatura menor -> mais confiante; ajustar conforme necessidade (ex.: 0.7)
255
+ temp = float(os.getenv("AUDIO_SOFTMAX_TEMP", 0.7))
256
+ calibrated_audio_probs = tempered_softmax(audio_scores, temperature=temp)
257
+
258
+ # ====== 3) Tentar transcrever (ASR) e classificar texto (se disponível) ======
259
+ text_probs_list = []
260
+ transcription = None
261
+ if asr_pipeline:
262
+ try:
263
+ asr_out = asr_pipeline(audio_path)
264
+ # asr_out pode ser string ou dict dependendo da versão da pipeline
265
+ if isinstance(asr_out, dict):
266
+ transcription = asr_out.get("text", "") or asr_out.get("transcription", "")
267
+ else:
268
+ transcription = str(asr_out)
269
+ transcription = (transcription or "").strip()
270
+ # split into sentences for per-sentence classification (if long)
271
+ if transcription:
272
+ sentences = [s.strip() for s in transcription.replace("\n", " ").split(".") if s.strip()]
273
+ # limit to first N sentences to avoid long processing
274
+ max_sentences = 6
275
+ for s in sentences[:max_sentences]:
276
+ if text_pipeline:
277
+ text_scores = text_pipeline(s, return_all_scores=True)
278
+ # text_scores often returns a list with one element (list of label/score)
279
+ if isinstance(text_scores, list) and len(text_scores) > 0:
280
+ scores_list = text_scores[0]
281
+ # convert to map label->score
282
+ tmap = {}
283
+ for it in scores_list:
284
+ lbl = it.get("label", "").lower()
285
+ # map textual labels to our english subset if needed
286
+ tmap[lbl] = float(it.get("score", 0.0))
287
+ # normalize softmax (already probs, but ensure normalization and map labels to english keys)
288
+ # keep original labels (e.g., 'joy','sadness','anger','fear','others')
289
+ text_probs_list.append(tmap)
290
+ # if no sentences or classifier missing, attempt single-shot classify entire transcription
291
+ if not text_probs_list and text_pipeline and transcription:
292
+ text_scores = text_pipeline(transcription, return_all_scores=True)
293
+ if isinstance(text_scores, list) and len(text_scores) > 0:
294
+ scores_list = text_scores[0]
295
+ tmap = {}
296
+ for it in scores_list:
297
+ tmap[it.get("label", "").lower()] = float(it.get("score", 0.0))
298
+ text_probs_list.append(tmap)
299
+ except Exception as e:
300
+ # ASR failing shouldn't break the pipeline; apenas logar e seguir com áudio
301
+ print(f"⚠️ ASR falhou: {e}")
302
+
303
+ # agregue as probabilidades de texto (média)
304
+ combined_text_probs = {}
305
+ if text_probs_list:
306
+ combined_text_probs = average_probabilities(text_probs_list)
307
+ # dobrar a confiabilidade de texto se houver muitas sentenças -> confiabilidade maior
308
+ # map text labels (example: pysentimiento uses 'joy','sadness','anger','fear','others')
309
+ # convert to our english labels set used in audio if possible
310
+ # build a mapped version of text probs to common labels
311
+ text_to_common = {}
312
+ for k, v in combined_text_probs.items():
313
+ kl = k.lower()
314
+ # tenta mapear palavras comuns
315
+ if "joy" in kl or "happy" in kl or "alegr" in kl:
316
+ text_to_common["happy"] = v
317
+ elif "sad" in kl or "sadness" in kl:
318
+ text_to_common["sad"] = v
319
+ elif "anger" in kl or "angry" in kl:
320
+ text_to_common["angry"] = v
321
+ elif "fear" in kl or "anx" in kl:
322
+ text_to_common["fearful"] = v
323
+ elif "disgust" in kl:
324
+ text_to_common["disgust"] = v
325
+ elif "others" in kl or "neutral" in kl:
326
+ text_to_common["neutral"] = v
327
+ else:
328
+ # keep as-is for potential mapping later
329
+ text_to_common[kl] = v
330
+
331
+ # normalize mapped text_to_common
332
+ if text_to_common:
333
+ total = sum(text_to_common.values()) or 1.0
334
+ for k in list(text_to_common.keys()):
335
+ text_to_common[k] = text_to_common[k] / total
336
+
337
+ # ====== 4) Ensemble multimodal: combinar probabilidades de áudio e texto
338
+ # pesos base — ajustar conforme experimento (audio tende a carregar sinal prosódico)
339
+ base_weight_audio = float(os.getenv("WEIGHT_AUDIO", 0.65))
340
+ base_weight_text = float(os.getenv("WEIGHT_TEXT", 0.35))
341
+
342
+ # ajustar pesos dinamicamente pela confiança: se ASR/text forte -> aumentar peso text
343
+ # compute confidence proxies
344
+ audio_conf_proxy = max(calibrated_audio_probs.values()) # [0..1]
345
+ text_conf_proxy = max(text_to_common.values()) if text_to_common else 0.0
346
+
347
+ # scale weights
348
+ # quanto maior a confiança relativa, maior o peso
349
+ if (audio_conf_proxy + text_conf_proxy) > 0:
350
+ weight_audio = base_weight_audio * (audio_conf_proxy / (audio_conf_proxy + text_conf_proxy))
351
+ weight_text = base_weight_text * (text_conf_proxy / (audio_conf_proxy + text_conf_proxy))
352
+ # renormalize to sum to 1 if both non-zero, otherwise fallback
353
+ s = weight_audio + weight_text
354
+ if s > 0:
355
+ weight_audio = weight_audio / s
356
+ weight_text = weight_text / s
357
+ else:
358
+ # fallback para pesos base
359
+ weight_audio = base_weight_audio
360
+ weight_text = base_weight_text
361
+
362
+ # Build unified set of labels
363
+ all_labels = set(list(calibrated_audio_probs.keys()) + list(text_to_common.keys()))
364
+ merged_probs = {}
365
+ for lbl in all_labels:
366
+ a = calibrated_audio_probs.get(lbl, 0.0)
367
+ t = text_to_common.get(lbl, 0.0)
368
+ merged = a * weight_audio + t * weight_text
369
+ merged_probs[lbl] = merged
370
 
371
+ # normalize merged
372
+ total_m = sum(merged_probs.values()) or 1.0
373
+ for k in merged_probs:
374
+ merged_probs[k] = merged_probs[k] / total_m
375
 
376
+ # ====== 5) Escolher rótulo final e montar resposta ======
377
+ top_label = max(merged_probs, key=merged_probs.get)
378
+ top_score = merged_probs[top_label]
379
+ # map to portuguese
380
  emotion_pt = emotion_labels.get(top_label, "desconhecido")
381
 
382
+ # ajuste para tristeza muito forte
383
+ if emotion_pt == "tristeza" and top_score >= 0.92:
384
  emotion_pt = "depressão"
385
 
386
  # montar probabilidades mapeadas para pt (mantendo somente rótulos conhecidos)
387
+ probabilities_pt = {}
388
+ for k, v in merged_probs.items():
389
+ probabilities_pt[emotion_labels.get(k, k)] = round(float(v), 3)
390
 
391
+ # construir resultado base
392
  base_result = {
393
  "status": "ok",
394
  "emotion": emotion_pt,
395
  "emode": [emotion_pt],
396
+ "confidence": round(float(top_score), 3),
397
  "probabilities": probabilities_pt,
398
  "suggestion": gerar_sugestao(emotion_pt),
399
+ "debug": {
400
+ "audio_raw": audio_scores,
401
+ "audio_calibrated": {k: round(float(v), 3) for k, v in calibrated_audio_probs.items()},
402
+ "text_transcription": transcription,
403
+ "text_mapped_probs": {k: round(float(v), 3) for k, v in text_to_common.items()},
404
+ "weights": {"audio": round(weight_audio, 3), "text": round(weight_text, 3)}
405
+ }
406
  }
407
 
408
+ # aplicar híbrido com fallback textual se houver 'text' no JSON
 
409
  text_for_hybrid = None
410
  if data and "text" in data:
411
  text_for_hybrid = data["text"]
 
415
  return jsonify(final_result)
416
 
417
  except Exception as e:
418
+ print(f"❌ Erro na rota /analyze: {e}")
419
  return jsonify({"error": str(e)}), 500
420
  finally:
421
  # limpar tempfiles (se existirem)
 
426
  pass
427
 
428
  if __name__ == "__main__":
429
+ # porta padrão ou PORT env var
430
+ app.run(host="0.0.0.0", port=int(os.getenv("PORT", 7860)))