import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import librosa
import numpy as np
import gradio as gr
import openai
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
# ----------------- Setup ---------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained Wav2Vec2 backbone and its matching feature extractor
model_name = "facebook/wav2vec2-base"
wav2vec2_model = Wav2Vec2Model.from_pretrained(model_name).to(device)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
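# wav2vec2-base emits one 768-dim hidden vector roughly every 20 ms of audio
# (~50 frames/sec), so a 5-second clip yields a feature tensor of about
# [1, 249, 768]. The classifier below takes these 768 channels as its Conv1d input.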
# --------------- Load Emotion Classification Model -----------------
class CNN(nn.Module):
    def __init__(self, num_classes):
        super(CNN, self).__init__()
        self.name = "CNN"
        # Three Conv1d blocks over the 768 Wav2Vec2 feature channels
        self.conv1 = nn.Conv1d(in_channels=768, out_channels=256, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(256)
        # Adaptive pooling fixes the temporal dimension at 96 regardless of clip length
        self.pool = nn.AdaptiveMaxPool1d(output_size=96)
        self.conv2 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(128)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(64)
        self.fc1 = nn.Linear(64 * 96, 128)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, num_classes)
    def forward(self, x):
        # [batch, seq_len, 768] -> [batch, 768, seq_len] for Conv1d
        x = x.permute(0, 2, 1)
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x)
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.pool(x)
        x = x.view(x.size(0), -1)  # flatten to [batch, 64 * 96]
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x
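# Forward-pass shape trace for a 5-second clip (assuming ~249 Wav2Vec2 frames):
# [1, 249, 768] -> permute -> [1, 768, 249] -> conv1+pool -> [1, 256, 96]
# -> conv2+pool -> [1, 128, 96] -> conv3+pool -> [1, 64, 96]
# -> flatten -> [1, 6144] -> fc1 -> [1, 128] -> fc2 -> [1, num_classes]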
model = CNN(5)
# Checkpoint file name records the training run (batch size 32, lr 5e-4, epoch 12)
model.load_state_dict(torch.load("best_model_CNN_bs32_lr0.0005_epoch12_acc0.9248.pth", map_location=torch.device("cpu")))
model.to(device)  # keep the classifier on the same device as the Wav2Vec2 features
model.eval()
wav2vec2_model.eval()

# Index order must match the label encoding used during training
label_map = {0: "Neutral", 1: "Happy", 2: "Angry", 3: "Sad", 4: "Surprise"}
# ------------------ ChatGPT API Setup ---------------------
openai.api_key = os.environ.get("OPENAI_API_KEY", "")  # read from an env variable; never hard-code keys

def create_prompt_from_label(label):
    return f"""
The user is currently feeling {label.lower()}. Start by briefly and thoughtfully acknowledging how someone might feel when experiencing this emotion.
Then, as a recommendation system, suggest 3 pieces of entertainment content, such as movies, music, or shows, that align with or help support this mood.
Keep your tone friendly and supportive, and make the recommendations short, engaging, and tailored to the {label.lower()} emotional state.
Add a few friendly emoji to keep the tone warm.
"""
def get_recommendations(label):
    prompt = create_prompt_from_label(label)
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that provides entertainment recommendations."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=500,
            temperature=0.7
        )
        return response['choices'][0]['message']['content'].strip()
    except Exception as e:
        return f"An error occurred: {e}"
# ----------------- Inference Pipeline ---------------------
def process_audio_and_recommend(file_path):
    # Resample to 16 kHz, the rate Wav2Vec2 was trained on
    audio, sr = librosa.load(file_path, sr=16000)

    # Truncate to at most 5 seconds
    max_duration = 5
    max_samples = int(max_duration * sr)
    if len(audio) > max_samples:
        audio = audio[:max_samples]

    inputs = feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
    input_values = inputs["input_values"].to(device)

    with torch.no_grad():
        # Wav2Vec2 embeddings, shape [1, seq_len, 768]
        features = wav2vec2_model(input_values).last_hidden_state
        outputs = model(features)  # the CNN permutes internally; no extra dim needed

    pred_idx = torch.argmax(outputs, dim=1).item()
    emotion = label_map[pred_idx]
    recommendations = get_recommendations(emotion)
    return f"🧠 Detected Emotion: {emotion}", recommendations
# ----------------- Gradio UI ---------------------
with gr.Blocks(theme=gr.themes.Soft()) as interface:
    gr.Markdown("## 🎙️ Emotion Detection + Chatbot")
    gr.Markdown("Upload or record a short voice clip. I'll detect your emotion and ask GPT for an empathetic response.")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="🎧 Voice Input", type="filepath", format="wav")
            submit_btn = gr.Button("🚀 Submit")
        with gr.Column():
            output_text_1 = gr.Text(label="🧠 Detected Emotion")
            output_text_2 = gr.Text(label="💬 GPT Response")
    submit_btn.click(fn=process_audio_and_recommend, inputs=audio_input, outputs=[output_text_1, output_text_2])

interface.launch()
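# To run locally (assuming this Space's dependencies are installed):
#   python app.py
# Gradio serves the UI at http://127.0.0.1:7860 by default.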