Spaces:

hrlima
/

emotion-api

Sleeping

App Files Files Community

hrlima committed on Nov 28, 2025

Commit

a44c51b

verified ·

1 Parent(s): 79f2787

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -193

app.py CHANGED Viewed

@@ -26,8 +26,7 @@ try:
 except Exception as e:
     print(f"❌ Erro ao inicializar Firebase: {e}")
-# ====== PIPELINES ======
-# 1) Pipeline de classificação de áudio (modelo Whisper fine-tuned)
 try:
     audio_pipeline = pipeline(
         task="audio-classification",
@@ -38,30 +37,6 @@ except Exception as e:
     print(f"❌ Erro ao carregar audio_pipeline: {e}")
     audio_pipeline = None
-# 2) Pipeline ASR (transcrição) - usar Whisper para obter texto que ajudará no texto-classifier
-#    Note: dependendo do ambiente, carregar whisper-large-v3 pode ser pesado.
-try:
-    asr_pipeline = pipeline(
-        task="automatic-speech-recognition",
-        model="openai/whisper-large-v3"
-    )
-    print("✅ asr_pipeline carregado.")
-except Exception as e:
-    print(f"⚠️ ASR indisponível: {e}")
-    asr_pipeline = None
-# 3) Pipeline de classificação de texto (para multimodal ensemble)
-try:
-    text_pipeline = pipeline(
-        task="text-classification",
-        model="pysentimiento/robertuito-emotion-analysis",
-        return_all_scores=True
-    )
-    print("✅ text_pipeline carregado.")
-except Exception as e:
-    print(f"⚠️ text_pipeline indisponível: {e}")
-    text_pipeline = None
 # ====== MAPEAMENTO DE EMOÇÕES (ING->PT) ======
 emotion_labels = {
     "angry": "raiva",
@@ -72,7 +47,6 @@ emotion_labels = {
     "neutral": "neutro",
     "sad": "tristeza",
     "surprised": "surpreso",
-    # fallback caso o label seja diferente
 }
 # ====== SUGESTÕES ======
@@ -89,7 +63,7 @@ def gerar_sugestao(emotion_pt):
     }
     return sugestoes.get(emotion_pt, "Mantenha o equilíbrio emocional e cuide de você mesmo.")
-# ====== FALLBACK APRIMORADO COM PALAVRAS-CHAVE ======
 EMOTION_KEYWORDS = {
     "tristeza": ["triste","desanimado","melancólico","chateado","solitário","deprimido","abatido","infeliz","desmotivado"],
     "ansiedade": ["ansioso","preocupado","nervoso","tenso","inquieto","aflito","alarmado","sobrecarregado","inseguro","apreensivo"],
@@ -162,42 +136,35 @@ def fetch_url_to_tempfile(url):
 # ====== UTIL: Softmax com temperatura para calibrar probabilidades ======
 def tempered_softmax(scores_dict, temperature=1.0):
-    # scores_dict: {label: score} (scores raw in [0..1] but we re-calibrate)
-    # convert to logit-like by -log(1-score) as proxy if scores are probs; fallback simple rescale
     labels = list(scores_dict.keys())
     vals = np.array([scores_dict[l] for l in labels], dtype=float)
-    # small smoothing to avoid zeros
     vals = np.clip(vals, 1e-8, 1-1e-8)
-    # convert probabilities -> logits approximately
     logits = np.log(vals / (1 - vals))
     scaled = logits / max(temperature, 1e-6)
     exps = np.exp(scaled - np.max(scaled))
     probs = exps / np.sum(exps)
     return dict(zip(labels, probs))
-# ====== UTIL: média de probabilidades de várias predições (normalização) ======
 def average_probabilities(list_of_prob_dicts):
-    # all dicts share same keys (or not) - unify keys
     all_keys = set()
     for d in list_of_prob_dicts:
         all_keys.update(d.keys())
     avg = {k: 0.0 for k in all_keys}
     for d in list_of_prob_dicts:
-        # treat missing as 0
         for k in all_keys:
             avg[k] += d.get(k, 0.0)
     n = len(list_of_prob_dicts)
-    if n == 0:
-        return avg
     for k in avg:
         avg[k] = avg[k] / n
-    # normalize
     total = sum(avg.values()) or 1.0
     for k in avg:
         avg[k] = avg[k] / total
     return avg
-# ====== ROTA DE ANÁLISE (melhorias de precisão multimodal) ======
 @app.route("/analyze", methods=["POST"])
 def analyze():
     try:
@@ -235,160 +202,62 @@ def analyze():
                 return jsonify(fallback_emotion(data["text"]))
             return jsonify({"error": "Modelo de áudio indisponível no momento."}), 500
-        # ====== 1) Classificação de áudio (obter top_k mais completo) ======
-        # aumentamos top_k para capturar incertezas e depois re-calibramos
-        raw_result = audio_pipeline(audio_path, top_k=15)
-        # raw_result geralmente é lista de dicts: [{'label': 'Happy', 'score': 0.9}, ...]
-        audio_scores = {}
-        for item in raw_result:
-            label = item.get("label", "").lower()
-            if label == "fear":
-                label = "fearful"
-            # some models return labels like 'Happy' or 'HAPPY' etc.
-            audio_scores[label] = float(item.get("score", 0.0))
-        if not audio_scores:
-            return jsonify({"error": "Nenhum rótulo retornado pelo modelo de áudio."}), 500
-        # ====== 2) Calibrar probabilidades de áudio com temperatura (ajustável) ======
-        # temperatura menor -> mais confiante; ajustar conforme necessidade (ex.: 0.7)
-        temp = float(os.getenv("AUDIO_SOFTMAX_TEMP", 0.7))
-        calibrated_audio_probs = tempered_softmax(audio_scores, temperature=temp)
-        # ====== 3) Tentar transcrever (ASR) e classificar texto (se disponível) ======
-        text_probs_list = []
-        transcription = None
-        if asr_pipeline:
             try:
-                asr_out = asr_pipeline(audio_path)
-                # asr_out pode ser string ou dict dependendo da versão da pipeline
-                if isinstance(asr_out, dict):
-                    transcription = asr_out.get("text", "") or asr_out.get("transcription", "")
-                else:
-                    transcription = str(asr_out)
-                transcription = (transcription or "").strip()
-                # split into sentences for per-sentence classification (if long)
-                if transcription:
-                    sentences = [s.strip() for s in transcription.replace("\n", " ").split(".") if s.strip()]
-                    # limit to first N sentences to avoid long processing
-                    max_sentences = 6
-                    for s in sentences[:max_sentences]:
-                        if text_pipeline:
-                            text_scores = text_pipeline(s, return_all_scores=True)
-                            # text_scores often returns a list with one element (list of label/score)
-                            if isinstance(text_scores, list) and len(text_scores) > 0:
-                                scores_list = text_scores[0]
-                                # convert to map label->score
-                                tmap = {}
-                                for it in scores_list:
-                                    lbl = it.get("label", "").lower()
-                                    # map textual labels to our english subset if needed
-                                    tmap[lbl] = float(it.get("score", 0.0))
-                                # normalize softmax (already probs, but ensure normalization and map labels to english keys)
-                                # keep original labels (e.g., 'joy','sadness','anger','fear','others')
-                                text_probs_list.append(tmap)
-                    # if no sentences or classifier missing, attempt single-shot classify entire transcription
-                    if not text_probs_list and text_pipeline and transcription:
-                        text_scores = text_pipeline(transcription, return_all_scores=True)
-                        if isinstance(text_scores, list) and len(text_scores) > 0:
-                            scores_list = text_scores[0]
-                            tmap = {}
-                            for it in scores_list:
-                                tmap[it.get("label", "").lower()] = float(it.get("score", 0.0))
-                            text_probs_list.append(tmap)
             except Exception as e:
-                # ASR failing shouldn't break the pipeline; apenas logar e seguir com áudio
-                print(f"⚠️ ASR falhou: {e}")
-        # agregue as probabilidades de texto (média)
-        combined_text_probs = {}
-        if text_probs_list:
-            combined_text_probs = average_probabilities(text_probs_list)
-            # dobrar a confiabilidade de texto se houver muitas sentenças -> confiabilidade maior
-        # map text labels (example: pysentimiento uses 'joy','sadness','anger','fear','others')
-        # convert to our english labels set used in audio if possible
-        # build a mapped version of text probs to common labels
-        text_to_common = {}
-        for k, v in combined_text_probs.items():
-            kl = k.lower()
-            # tenta mapear palavras comuns
-            if "joy" in kl or "happy" in kl or "alegr" in kl:
-                text_to_common["happy"] = v
-            elif "sad" in kl or "sadness" in kl:
-                text_to_common["sad"] = v
-            elif "anger" in kl or "angry" in kl:
-                text_to_common["angry"] = v
-            elif "fear" in kl or "anx" in kl:
-                text_to_common["fearful"] = v
-            elif "disgust" in kl:
-                text_to_common["disgust"] = v
-            elif "others" in kl or "neutral" in kl:
-                text_to_common["neutral"] = v
-            else:
-                # keep as-is for potential mapping later
-                text_to_common[kl] = v
-        # normalize mapped text_to_common
-        if text_to_common:
-            total = sum(text_to_common.values()) or 1.0
-            for k in list(text_to_common.keys()):
-                text_to_common[k] = text_to_common[k] / total
-        # ====== 4) Ensemble multimodal: combinar probabilidades de áudio e texto
-        # pesos base — ajustar conforme experimento (audio tende a carregar sinal prosódico)
-        base_weight_audio = float(os.getenv("WEIGHT_AUDIO", 0.65))
-        base_weight_text = float(os.getenv("WEIGHT_TEXT", 0.35))
-        # ajustar pesos dinamicamente pela confiança: se ASR/text forte -> aumentar peso text
-        # compute confidence proxies
-        audio_conf_proxy = max(calibrated_audio_probs.values())  # [0..1]
-        text_conf_proxy = max(text_to_common.values()) if text_to_common else 0.0
-        # scale weights
-        # quanto maior a confiança relativa, maior o peso
-        if (audio_conf_proxy + text_conf_proxy) > 0:
-            weight_audio = base_weight_audio * (audio_conf_proxy / (audio_conf_proxy + text_conf_proxy))
-            weight_text = base_weight_text * (text_conf_proxy / (audio_conf_proxy + text_conf_proxy))
-            # renormalize to sum to 1 if both non-zero, otherwise fallback
-            s = weight_audio + weight_text
-            if s > 0:
-                weight_audio = weight_audio / s
-                weight_text = weight_text / s
-        else:
-            # fallback para pesos base
-            weight_audio = base_weight_audio
-            weight_text = base_weight_text
-        # Build unified set of labels
-        all_labels = set(list(calibrated_audio_probs.keys()) + list(text_to_common.keys()))
-        merged_probs = {}
-        for lbl in all_labels:
-            a = calibrated_audio_probs.get(lbl, 0.0)
-            t = text_to_common.get(lbl, 0.0)
-            merged = a * weight_audio + t * weight_text
-            merged_probs[lbl] = merged
-        # normalize merged
-        total_m = sum(merged_probs.values()) or 1.0
-        for k in merged_probs:
-            merged_probs[k] = merged_probs[k] / total_m
-        # ====== 5) Escolher rótulo final e montar resposta ======
-        top_label = max(merged_probs, key=merged_probs.get)
-        top_score = merged_probs[top_label]
         # map to portuguese
         emotion_pt = emotion_labels.get(top_label, "desconhecido")
-        # ajuste para tristeza muito forte
-        if emotion_pt == "tristeza" and top_score >= 0.92:
             emotion_pt = "depressão"
-        # montar probabilidades mapeadas para pt (mantendo somente rótulos conhecidos)
-        probabilities_pt = {}
-        for k, v in merged_probs.items():
-            probabilities_pt[emotion_labels.get(k, k)] = round(float(v), 3)
-        # construir resultado base
         base_result = {
             "status": "ok",
             "emotion": emotion_pt,
@@ -397,15 +266,14 @@ def analyze():
             "probabilities": probabilities_pt,
             "suggestion": gerar_sugestao(emotion_pt),
             "debug": {
-                "audio_raw": audio_scores,
-                "audio_calibrated": {k: round(float(v), 3) for k, v in calibrated_audio_probs.items()},
-                "text_transcription": transcription,
-                "text_mapped_probs": {k: round(float(v), 3) for k, v in text_to_common.items()},
-                "weights": {"audio": round(weight_audio, 3), "text": round(weight_text, 3)}
             }
         }
-        # aplicar híbrido com fallback textual se houver 'text' no JSON
         text_for_hybrid = None
         if data and "text" in data:
             text_for_hybrid = data["text"]
@@ -418,7 +286,6 @@ def analyze():
         print(f"❌ Erro na rota /analyze: {e}")
         return jsonify({"error": str(e)}), 500
     finally:
-        # limpar tempfiles (se existirem)
         try:
             if 'audio_path' in locals() and audio_path and os.path.exists(audio_path):
                 os.remove(audio_path)
@@ -426,5 +293,4 @@ def analyze():
             pass
 if __name__ == "__main__":
-    # porta padrão ou PORT env var
     app.run(host="0.0.0.0", port=int(os.getenv("PORT", 7860)))

 except Exception as e:
     print(f"❌ Erro ao inicializar Firebase: {e}")
+# ====== PIPELINE: Apenas o modelo de áudio (Whisper fine-tuned) ======
 try:
     audio_pipeline = pipeline(
         task="audio-classification",
     print(f"❌ Erro ao carregar audio_pipeline: {e}")
     audio_pipeline = None
 # ====== MAPEAMENTO DE EMOÇÕES (ING->PT) ======
 emotion_labels = {
     "angry": "raiva",
     "neutral": "neutro",
     "sad": "tristeza",
     "surprised": "surpreso",
 }
 # ====== SUGESTÕES ======
     }
     return sugestoes.get(emotion_pt, "Mantenha o equilíbrio emocional e cuide de você mesmo.")
+# ====== FALLBACK APRIMORADO COM PALAVRAS-CHAVE (mantido) ======
 EMOTION_KEYWORDS = {
     "tristeza": ["triste","desanimado","melancólico","chateado","solitário","deprimido","abatido","infeliz","desmotivado"],
     "ansiedade": ["ansioso","preocupado","nervoso","tenso","inquieto","aflito","alarmado","sobrecarregado","inseguro","apreensivo"],
 # ====== UTIL: Softmax com temperatura para calibrar probabilidades ======
 def tempered_softmax(scores_dict, temperature=1.0):
     """Re-calibrate a ``{label: score}`` mapping with a tempered softmax.

     Each score is clipped to ``[1e-8, 1 - 1e-8]``, converted to a logit
     (``log(p / (1 - p))``), divided by the temperature and pushed through
     a numerically stable softmax.

     Args:
         scores_dict: Mapping of label to a numeric score. Values are
             clipped into the open interval (0, 1) before the logit step.
         temperature: Divisor applied to the logits. Values below 1 sharpen
             the resulting distribution, values above 1 flatten it. Anything
             below ``1e-6`` is floored to ``1e-6``.

     Returns:
         A dict with the same labels, in the same order, whose values sum
         to 1. An empty input yields an empty dict.
     """
     if not scores_dict:
         # np.max() raises ValueError on a zero-size array, so an empty
         # mapping must be short-circuited instead of reaching the softmax.
         return {}
     labels = list(scores_dict.keys())
     vals = np.array([scores_dict[l] for l in labels], dtype=float)
     # Keep values strictly inside (0, 1) so the logit below stays finite.
     vals = np.clip(vals, 1e-8, 1-1e-8)
     logits = np.log(vals / (1 - vals))
     scaled = logits / max(temperature, 1e-6)
     # Subtract the max before exponentiating to avoid overflow.
     exps = np.exp(scaled - np.max(scaled))
     probs = exps / np.sum(exps)
     return dict(zip(labels, probs))
+# ====== UTIL: média/união de probabilidades ======
 def average_probabilities(list_of_prob_dicts):
     """Average several ``{label: probability}`` dicts and renormalize.

     The result is keyed by the union of all labels seen in the inputs; a
     label missing from a given dict contributes 0.0 for that dict. Returns
     an empty dict when no label is present at all (empty list, or a list
     containing only empty dicts).
     """
     # Union of every label present in any input dict.
     all_keys = set()
     for d in list_of_prob_dicts:
         all_keys.update(d.keys())
+    if not all_keys:
+        return {}
     avg = {k: 0.0 for k in all_keys}
     for d in list_of_prob_dicts:
         for k in all_keys:
             avg[k] += d.get(k, 0.0)
     # n >= 1 here: a non-empty key set implies at least one input dict.
     n = len(list_of_prob_dicts)
     for k in avg:
         avg[k] = avg[k] / n
     # Rescale so the values sum to 1; `or 1.0` avoids dividing by zero
     # when every accumulated value is 0.
     total = sum(avg.values()) or 1.0
     for k in avg:
         avg[k] = avg[k] / total
     return avg
+# ====== ROTA DE ANÁLISE (apenas modelo firdhokk, precisão aumentada por ensemble interno) ======
 @app.route("/analyze", methods=["POST"])
 def analyze():
     try:
                 return jsonify(fallback_emotion(data["text"]))
             return jsonify({"error": "Modelo de áudio indisponível no momento."}), 500
+        # -------------------------
+        # EXECUTAR VÁRIAS PASSAGENS (ensemble interno)
+        # -------------------------
+        # lista de top_k para executar o pipeline (captura incertezas)
+        topk_list = [10, 15, 20]
+        run_probs = []  # armazenará dicts label->score para cada run (antes de softmax)
+        raw_runs = []   # debug: guardar raw_result para inspeção
+        for topk in topk_list:
             try:
+                raw_result = audio_pipeline(audio_path, top_k=topk)
+                # normalizar formato: raw_result é lista de dicts [{'label':..., 'score':...}, ...]
+                probs = {}
+                for item in raw_result:
+                    label = item.get("label", "").lower()
+                    if label == "fear":
+                        label = "fearful"
+                    probs[label] = float(item.get("score", 0.0))
+                if probs:
+                    run_probs.append(probs)
+                    raw_runs.append({"top_k": topk, "raw": raw_result})
             except Exception as e:
+                # log e seguir para próximas tentativas (não interromper totalmente)
+                print(f"⚠️ audio_pipeline falhou no top_k={topk}: {e}")
+        if not run_probs:
+            return jsonify({"error": "Modelo não retornou rótulos em nenhuma tentativa."}), 500
+        # 1) média das probabilidades (por rótulo) entre as execuções
+        avg_probs = average_probabilities(run_probs)
+        # 2) recalibrar com temperatura (temperatura menor -> mais "afiado")
+        temp = float(os.getenv("AUDIO_SOFTMAX_TEMP", 0.6))  # default 0.6 para maior precisão
+        calibrated_probs = tempered_softmax(avg_probs, temperature=temp)
+        # 3) opcional: aplicar pequena regra de confiança mínima para reduzir rótulos com prob insignificante
+        # (zero out labels abaixo threshold then renormalize)
+        min_prob_threshold = float(os.getenv("MIN_LABEL_PROB", 0.02))  # 2% por padrão
+        filtered = {k: v if v >= min_prob_threshold else 0.0 for k, v in calibrated_probs.items()}
+        totalf = sum(filtered.values()) or 1.0
+        normalized = {k: (v / totalf) for k, v in filtered.items()}
+        # escolher rótulo final
+        top_label = max(normalized, key=normalized.get)
+        top_score = normalized[top_label]
         # map to portuguese
         emotion_pt = emotion_labels.get(top_label, "desconhecido")
+        # regra de negócio: tristeza muito forte -> depressão
+        if emotion_pt == "tristeza" and top_score >= float(os.getenv("DEPRESSION_THRESHOLD", 0.92)):
             emotion_pt = "depressão"
+        # montar probabilidades para output (mapeadas p/ pt)
+        probabilities_pt = { emotion_labels.get(k, k): round(float(v), 3) for k, v in normalized.items() }
         base_result = {
             "status": "ok",
             "emotion": emotion_pt,
             "probabilities": probabilities_pt,
             "suggestion": gerar_sugestao(emotion_pt),
             "debug": {
+                "runs": raw_runs,
+                "avg_probs": {k: round(float(v), 4) for k, v in avg_probs.items()},
+                "calibrated_probs": {k: round(float(v), 4) for k, v in calibrated_probs.items()},
+                "normalized_probs": {k: round(float(v), 4) for k, v in normalized.items()}
             }
         }
+        # permitir que cliente envie 'text' (override/híbrido) — mantido como opção leve
         text_for_hybrid = None
         if data and "text" in data:
             text_for_hybrid = data["text"]
         print(f"❌ Erro na rota /analyze: {e}")
         return jsonify({"error": str(e)}), 500
     finally:
         try:
             if 'audio_path' in locals() and audio_path and os.path.exists(audio_path):
                 os.remove(audio_path)
             pass
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=int(os.getenv("PORT", 7860)))