Spaces:

LingoJr
/

emotion-recognition

Runtime error

App Files Files Community

LingoJr committed on Nov 18, 2025

Commit

2de24ed

verified ·

1 Parent(s): 9b7ce52

Create app.py

Browse files

Files changed (1) hide show

app.py +172 -0

app.py ADDED Viewed

	@@ -0,0 +1,172 @@

+# -------------------------------------------------------------
+# app.py — Multimodal Emotion Recognition System
+# Speech (Wav2Vec2) + Text (EmoBERTa)
+# FastAPI + Gradio integrated into one application
+# -------------------------------------------------------------
+from fastapi import FastAPI, UploadFile, File, Form
+from fastapi.responses import JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
+import torchaudio
+import io
+import torch
+import gradio as gr
+from typing import Optional
+# -------------------------------------------------------------
+# 1️⃣ Initialize FastAPI
+# -------------------------------------------------------------
+app = FastAPI(
+    title="Multimodal Emotion Recognition API",
+    description="Detect emotions from Speech or Text using AI",
+    version="1.0.0"
+)
+# Allow any frontend to access the API: every origin, method, and header is accepted.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,  # NOTE(review): FastAPI's CORS docs advise against pairing credentials with "*" origins — confirm this is needed
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# -------------------------------------------------------------
+# 2️⃣ Load Models (Global = Faster)
+# -------------------------------------------------------------
+# Speech Emotion Model: built once at import time and shared by /predict and the Gradio handler
+speech_classifier = pipeline(
+    "audio-classification",
+    model="superb/wav2vec2-base-superb-er"
+)
+# Text Emotion Model: tokenizer and classifier come from the same checkpoint name
+text_tokenizer = AutoTokenizer.from_pretrained("tae898/emoberta-base")
+text_model = AutoModelForSequenceClassification.from_pretrained("tae898/emoberta-base")
+# -------------------------------------------------------------
+# 3️⃣ FastAPI Endpoint → Multimodal /predict
+# -------------------------------------------------------------
+@app.post("/predict")
+async def predict(
+    file: Optional[UploadFile] = File(None),
+    text: Optional[str] = Form(None)
+):
+    """
+    Classify the emotion expressed in an uploaded audio clip or a text snippet.
+
+    Accepts (multipart form fields):
+    - file: audio file (wav/mp3)
+    - OR text: plain text
+    - OR both (audio takes priority)
+
+    Returns:
+    - audio: {"mode", "filename", "emotion", "top_predictions"}
+    - text:  {"mode", "text", "emotion", "probabilities"}
+    - HTTP 400 when neither a non-empty audio file nor non-blank text is given
+    - HTTP 500 with an "error" message when decoding or inference fails
+    """
+    # ----------------------------------------
+    # Case 1 — If audio is provided
+    # ----------------------------------------
+    if file is not None:
+        try:
+            audio_bytes = await file.read()
+            # An untouched file field in a browser form arrives as an empty
+            # part; treat it as "no audio" so the text field can still be used.
+            if audio_bytes:
+                waveform, sr = torchaudio.load(io.BytesIO(audio_bytes))
+                # torchaudio yields (channels, frames). The pipeline only takes
+                # single-channel input, so average the channels down to mono.
+                mono = waveform.mean(dim=0)
+                # Pass the clip's real sampling rate in the dict form so the
+                # pipeline resamples to the rate the model expects; a
+                # `sampling_rate=` call kwarg is not consumed by the pipeline.
+                preds = speech_classifier(
+                    {"raw": mono.numpy(), "sampling_rate": sr},
+                    top_k=3
+                )
+                return {
+                    "mode": "audio",
+                    "filename": file.filename,
+                    "emotion": preds[0]["label"],
+                    "top_predictions": preds
+                }
+        except Exception as e:
+            return JSONResponse({"error": f"Audio error: {e}"}, status_code=500)
+    # ----------------------------------------
+    # Case 2 — If text is provided
+    # ----------------------------------------
+    if text and text.strip():
+        try:
+            inputs = text_tokenizer(text, return_tensors="pt", truncation=True)
+            with torch.no_grad():
+                outputs = text_model(**inputs)
+            # Single input, so row 0 of the softmax holds the class probabilities.
+            probs = torch.nn.functional.softmax(outputs.logits, dim=1)[0]
+            id2label = text_model.config.id2label
+            return {
+                "mode": "text",
+                "text": text,
+                "emotion": id2label[torch.argmax(probs).item()],
+                "probabilities": {
+                    id2label[i]: round(p, 4)
+                    for i, p in enumerate(probs.tolist())
+                }
+            }
+        except Exception as e:
+            return JSONResponse({"error": f"Text error: {e}"}, status_code=500)
+    # ----------------------------------------
+    # Case 3 — Nothing provided
+    # ----------------------------------------
+    return JSONResponse(
+        {"error": "Provide an audio file or text."},
+        status_code=400
+    )
+# -------------------------------------------------------------
+# 4️⃣ Gradio Interface (Single Tab: Audio + Text)
+# -------------------------------------------------------------
+def gradio_combined(audio_file, text):
+    """
+    Gradio handler: classify emotion from an audio clip or from typed text.
+
+    audio_file: value handed over by the audio input (passed straight to
+        torchaudio.load), or None when no clip was given.
+    text: textbox contents; may be blank or None.
+
+    Audio takes priority when both are supplied. Returns a dict with
+    "Detected Emotion", "Top Predictions" and "Source", or a dict with an
+    "Error" entry when neither input is usable.
+    """
+    # Case 1 — Audio provided
+    if audio_file is not None:
+        waveform, sr = torchaudio.load(audio_file)
+        # torchaudio yields (channels, frames). The pipeline only takes
+        # single-channel input, so average the channels down to mono.
+        mono = waveform.mean(dim=0)
+        # Pass the clip's real sampling rate in the dict form so the pipeline
+        # resamples to the rate the model expects; a `sampling_rate=` call
+        # kwarg is not consumed by the pipeline.
+        preds = speech_classifier(
+            {"raw": mono.numpy(), "sampling_rate": sr},
+            top_k=3
+        )
+        return {
+            "Detected Emotion": preds[0]["label"],
+            "Top Predictions": {p["label"]: round(p["score"], 3) for p in preds},
+            "Source": "Audio"
+        }
+    # Case 2 — Text provided (the guard also covers a None textbox value)
+    if text and text.strip():
+        inputs = text_tokenizer(text, return_tensors="pt", truncation=True)
+        with torch.no_grad():
+            outputs = text_model(**inputs)
+        # Single input, so row 0 of the softmax holds the class probabilities.
+        probs = torch.nn.functional.softmax(outputs.logits, dim=1)[0]
+        id2label = text_model.config.id2label
+        return {
+            "Detected Emotion": id2label[torch.argmax(probs).item()],
+            "Top Predictions": {
+                id2label[i]: round(p, 3)
+                for i, p in enumerate(probs.tolist())
+            },
+            "Source": "Text"
+        }
+    return {"Error": "Please provide audio or text input."}
+# Building the UI: one audio input and one textbox feed gradio_combined; the result is shown as JSON
+gradio_ui = gr.Interface(
+    fn=gradio_combined,
+    inputs=[
+        gr.Audio(label="🎤 Upload or Record Speech", sources=["microphone", "upload"], type="filepath"),
+        gr.Textbox(label="💬 Enter Text Emotion", placeholder="Type something...")
+    ],
+    outputs="json",
+    title="🎭 Multimodal Emotion Recognizer",
+    description="Use either speech or text — the model detects the emotion automatically!"
+)
+# Mount Gradio at /gradio; the returned application is rebound to the module-level name `app`
+app = gr.mount_gradio_app(app, gradio_ui, path="/gradio")