Spaces:
Sleeping
Sleeping
Upload 7 files
Browse files- app.py +35 -0
- audio_utils.py +24 -0
- best_clstm.pt +3 -0
- config.py +19 -0
- inference.py +36 -0
- model.py +81 -0
- requirements.txt +6 -0
app.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, UploadFile, File
|
| 2 |
+
import shutil
|
| 3 |
+
import uuid
|
| 4 |
+
import os
|
| 5 |
+
from inference import predict
|
| 6 |
+
|
| 7 |
+
# FastAPI application exposing the emotion-recognition model over HTTP.
app = FastAPI(title="Audio Emotion Recognition API")

# Uploaded audio is written here under a random name before inference and
# removed afterwards by the /predict handler.
UPLOAD_DIR = "/tmp"
os.makedirs(UPLOAD_DIR, exist_ok=True)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@app.get("/")
def root():
    """Return a static liveness message for the API root."""
    payload = {"message": "Audio Emotion Recognition API is running"}
    return payload
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@app.get("/health")
def health():
    """Return a static OK status."""
    status_payload = {"status": "ok"}
    return status_payload
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@app.post("/predict")
async def predict_emotion(file: UploadFile = File(...)):
    """Accept an uploaded audio file and return the predicted emotion.

    The upload is spooled to a uniquely named temp file under UPLOAD_DIR,
    passed to inference.predict, and always cleaned up afterwards.
    Returns whatever predict() returns (a dict with emotion/confidence).
    """
    # Random UUID name avoids collisions between concurrent requests.
    file_path = os.path.join(UPLOAD_DIR, f"{uuid.uuid4()}.wav")

    with open(file_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    try:
        result = predict(file_path)
    finally:
        # Fix: remove the temp file even when predict() raises; previously a
        # failed inference leaked the file into UPLOAD_DIR on every error.
        os.remove(file_path)

    return result
|
audio_utils.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import librosa
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
from config import CONFIG
|
| 5 |
+
|
| 6 |
+
def preprocess_audio(path, device):
    """Load an audio file and convert it to a normalized log-mel tensor.

    Returns a float tensor of shape (1, 1, n_mels, time) on *device*.
    """
    sr = CONFIG["sample_rate"]
    signal, _ = librosa.load(path, sr=sr)

    # Pad or truncate to a fixed duration so every clip has the same width.
    target_len = int(sr * CONFIG["duration"])
    if len(signal) > target_len:
        signal = signal[:target_len]
    else:
        signal = np.pad(signal, (0, target_len - len(signal)))

    mel_spec = librosa.feature.melspectrogram(
        y=signal,
        sr=sr,
        n_fft=CONFIG["n_fft"],
        hop_length=CONFIG["hop_length"],
        n_mels=CONFIG["n_mels"]
    )

    # Log scale, then per-clip standardization (epsilon guards zero std).
    log_mel = librosa.power_to_db(mel_spec, ref=np.max)
    log_mel = (log_mel - log_mel.mean()) / (log_mel.std() + 1e-9)

    # Add batch and channel dims: (n_mels, t) -> (1, 1, n_mels, t).
    return torch.from_numpy(log_mel).unsqueeze(0).unsqueeze(0).to(device)
|
best_clstm.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:10dfec9f10188b7bfc8663e36f2baa9d52b6d7a2819ba9b05ac5172d49775b1f
|
| 3 |
+
size 16568874
|
config.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Shared audio / model settings used by preprocessing and inference.
CONFIG = dict(
    model_path="best_clstm.pt",  # checkpoint file loaded at import time
    sample_rate=16000,           # Hz; audio is resampled to this rate
    duration=3.0,                # seconds; clips padded/truncated to this
    n_mels=40,                   # mel bands in the spectrogram
    n_fft=512,                   # FFT window size
    hop_length=256,              # hop between STFT frames
)
|
| 9 |
+
|
| 10 |
+
# Emotion label -> emoji. NOTE: key order matters — inference.py falls back
# to list(EMOTION_CONFIG.keys()) as the class-index order when the checkpoint
# carries no label_map, so do not reorder these entries.
_EMOTION_EMOJI_PAIRS = [
    ("angry", "😠"),
    ("calm", "😌"),
    ("disgust", "🤢"),
    ("fearful", "😨"),
    ("happy", "😊"),
    ("neutral", "😐"),
    ("sad", "😢"),
    ("surprised", "😲"),
]
EMOTION_CONFIG = dict(_EMOTION_EMOJI_PAIRS)
|
inference.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from model import CLSTMModel
|
| 3 |
+
from config import CONFIG, EMOTION_CONFIG
|
| 4 |
+
from audio_utils import preprocess_audio
|
| 5 |
+
|
| 6 |
+
# Select GPU when available; the model and all input tensors live here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint is loaded once at import time so every request reuses it.
checkpoint = torch.load(CONFIG["model_path"], map_location=device)

# Build the class-index -> label list. Prefer the label_map stored in the
# checkpoint (label -> index, inverted here); otherwise fall back to
# EMOTION_CONFIG's key order.
# NOTE(review): the fallback assumes EMOTION_CONFIG's key order matches the
# training label order — confirm against the training script.
if "label_map" in checkpoint:
    inv = {v: k for k, v in checkpoint["label_map"].items()}
    emotions = [inv[i] for i in range(len(inv))]
else:
    emotions = list(EMOTION_CONFIG.keys())

model = CLSTMModel(
    n_mels=CONFIG["n_mels"],
    n_classes=len(emotions)
).to(device)

model.load_state_dict(checkpoint["model_state_dict"])
model.eval()  # inference mode: disables dropout
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def predict(path):
    """Run the emotion classifier on the audio file at *path*.

    Returns a dict with the predicted emotion label and its softmax
    confidence.
    """
    features = preprocess_audio(path, device)

    with torch.no_grad():
        scores = model(features)
        distribution = torch.softmax(scores, dim=1)
        best = torch.argmax(distribution, dim=1).item()

    return {
        "emotion": emotions[best],
        "confidence": float(distribution[0][best]),
    }
|
model.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import math
|
| 4 |
+
|
| 5 |
+
class ConvBlock(nn.Module):
    """Conv2d -> BatchNorm -> ReLU -> MaxPool -> Dropout2d unit.

    'Same' padding keeps spatial size through the convolution, so each block
    halves height and width (with the default (2, 2) pool).
    """

    def __init__(self, in_ch, out_ch, kernel_size=(3, 3), pool=(2, 2)):
        super().__init__()
        pad = (kernel_size[0] // 2, kernel_size[1] // 2)
        layers = [
            nn.Conv2d(in_ch, out_ch, kernel_size, padding=pad),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(),
            nn.MaxPool2d(pool),
            nn.Dropout2d(0.2),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the block to a (batch, in_ch, H, W) tensor."""
        return self.net(x)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class AttentionLayer(nn.Module):
    """Additive attention pooling over the time axis.

    Scores each timestep with a linear layer, softmax-normalizes the scores
    over time, and returns the weighted sum: (B, T, H) -> (B, H).
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.attention = nn.Linear(hidden_dim, 1)

    def forward(self, lstm_out):
        scores = self.attention(lstm_out)           # (B, T, 1)
        weights = torch.softmax(scores, dim=1)      # normalize over time
        context = (weights * lstm_out).sum(dim=1)   # (B, H)
        return context
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class CLSTMModel(nn.Module):
    """CNN + BiLSTM + attention classifier for log-mel spectrograms.

    Input:  (batch, 1, n_mels, time) tensor.
    Output: (batch, n_classes) logits.

    Three ConvBlocks each halve both axes; the frequency axis is folded into
    the LSTM feature dimension, a bidirectional LSTM models time, attention
    pools over time, and an MLP head produces class logits.
    """

    def __init__(
        self,
        n_mels=40,
        n_classes=8,
        conv_channels=(32, 64, 128),
        lstm_hidden=128,
        lstm_layers=2,
        dropout=0.4
    ):
        super().__init__()

        self.conv1 = ConvBlock(1, conv_channels[0])
        self.conv2 = ConvBlock(conv_channels[0], conv_channels[1])
        self.conv3 = ConvBlock(conv_channels[1], conv_channels[2])

        # Fix: nn.MaxPool2d floors each halving, so the frequency size after
        # three blocks is floor(floor(floor(n_mels/2)/2)/2), NOT
        # ceil(n_mels / 8). The previous math.ceil overestimated lstm_input
        # for n_mels not divisible by 8 (e.g. 44 -> 22 -> 11 -> 5, but
        # ceil(44/8) = 6), making the LSTM reject the conv output at runtime.
        # Identical result for multiples of 8 (including the default 40), so
        # existing checkpoints still load.
        freq_after = n_mels
        for _ in range(3):
            freq_after //= 2
        self.lstm_input = conv_channels[2] * freq_after

        self.lstm = nn.LSTM(
            self.lstm_input,
            lstm_hidden,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=True,
            # nn.LSTM warns if dropout is set with a single layer.
            dropout=dropout if lstm_layers > 1 else 0
        )

        # Bidirectional LSTM doubles the hidden size.
        self.attention = AttentionLayer(lstm_hidden * 2)

        self.classifier = nn.Sequential(
            nn.Linear(lstm_hidden * 2, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, n_classes)
        )

    def forward(self, x):
        # (B, 1, n_mels, T) -> (B, C, F, T') through the conv stack.
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)

        # Fold channels and frequency into one feature axis: (B, T', C*F).
        b, c, f, t = x.size()
        x = x.permute(0, 3, 1, 2).contiguous().view(b, t, c * f)

        out, _ = self.lstm(x)
        out = self.attention(out)  # pool over time -> (B, 2*lstm_hidden)
        return self.classifier(out)
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
torch
|
| 3 |
+
librosa
|
| 4 |
+
numpy
|
| 5 |
+
python-multipart
|
| 6 |
+
soundfile
|