import torch
import torch.nn as nn
import torch.nn.functional as F
import librosa
import numpy as np
import gradio as gr
import openai  # uses the pre-1.0 ChatCompletion API; pin openai<1.0
import os

# Emotion categories (index order must match the trained model's labels)
emotions = ["Neutral", "Happy", "Angry", "Sad", "Surprise"]
# CNN model definition: three Conv1d blocks over 768-channel frame features,
# pooled to a fixed length of 96 before the classifier head
class CNN(nn.Module):
    def __init__(self, num_classes):
        super(CNN, self).__init__()
        self.name = "CNN"
        self.conv1 = nn.Conv1d(in_channels=768, out_channels=256, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(256)
        self.pool = nn.AdaptiveMaxPool1d(output_size=96)
        self.conv2 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(128)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(64)
        self.fc1 = nn.Linear(64 * 96, 128)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        # x arrives as (batch, time, 768); Conv1d wants (batch, channels, time)
        x = x.permute(0, 2, 1)
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x)
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.pool(x)
        x = x.view(x.size(0), -1)  # flatten to (batch, 64 * 96)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# Load the trained model
model = CNN(num_classes=5)
model.load_state_dict(torch.load("best_model.pth", map_location="cpu"))
model.eval()
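
# Quick shape sanity check (illustrative addition; assumes the (batch, time, 768)
# input convention used above): one dummy clip should yield one logit per emotion
with torch.no_grad():
    assert model(torch.zeros(1, 200, 768)).shape == (1, len(emotions))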

# Extract features from an audio file: 40 MFCCs, padded/truncated to 200 frames,
# then tiled up to the 768 channels the CNN was built for
def extract_feature(audio_path):
    y, sr = librosa.load(audio_path, sr=16000)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    max_len = 200
    if mfcc.shape[1] > max_len:
        mfcc = mfcc[:, :max_len]
    else:
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, ((0, 0), (0, pad_width)), mode='constant')
    # 40 does not divide 768 (40 * 19 = 760), so tile one extra copy and trim
    feature = np.tile(mfcc, (768 // 40 + 1, 1))[:768, :]
    # (768, 200) -> (1, 200, 768): batch of one, time-major, as forward() expects
    feature = torch.tensor(feature.T, dtype=torch.float32).unsqueeze(0)
    return feature
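
# Illustrative usage (not part of the app flow; "sample.wav" is a placeholder path):
#   feats = extract_feature("sample.wav")
#   feats.shape  # torch.Size([1, 200, 768]) -- one clip, 200 frames, 768 channels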

# Full pipeline: emotion detection + GPT response
def predict_and_reply(audio_path):
    feature = extract_feature(audio_path)
    with torch.no_grad():
        output = model(feature)
        pred = torch.argmax(output, dim=1).item()
    emotion = emotions[pred]
    prompt = f"The user sounds {emotion.lower()}. What would you like to say to them?"
    try:
        openai.api_key = os.getenv("OPENAI_API_KEY", "your-openai-api-key")  # Replace with real key or env var
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an empathetic AI assistant."},
                {"role": "user", "content": prompt}
            ]
        )
        reply = response['choices'][0]['message']['content']
    except Exception as e:
        reply = f"❌ GPT Error: {str(e)}"
    return f"🎧 Detected Emotion: **{emotion}**\n\n💬 GPT Says:\n{reply}"
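
# Illustrative usage (hypothetical path; needs OPENAI_API_KEY set for a real GPT reply):
#   print(predict_and_reply("sample.wav"))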

# Gradio app layout
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🎙️ Emotion Detection + Chatbot")
    gr.Markdown("Upload or record a short voice clip; I'll detect your emotion and ask GPT for an empathetic reply.")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="🎧 Voice Input", type="filepath", format="wav")
            submit_btn = gr.Button("🚀 Submit")
        with gr.Column():
            output_text = gr.Markdown(label="💬 GPT Reply")
    submit_btn.click(fn=predict_and_reply, inputs=audio_input, outputs=output_text)

demo.launch()
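
# To run locally (assumes this file is saved as app.py with best_model.pth alongside
# it; openai is pinned below 1.0 because the code uses the ChatCompletion API):
#   pip install torch librosa numpy gradio "openai<1.0"
#   OPENAI_API_KEY=sk-... python app.py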