Spaces:

Dddrl
/

emotion_detection_test

Sleeping

App Files Files Community

Dddrl committed on Mar 26

Commit

f2da24d

verified ·

1 Parent(s): e273214

Upload 2 files

Browse files

Files changed (1) hide show

app.py +89 -51

app.py CHANGED Viewed

@@ -6,98 +6,134 @@ import numpy as np
 import gradio as gr
 import openai
 import os
# Emotion categories; the position in this list is the classifier's class id.
emotions = ["Neutral", "Happy", "Angry", "Sad", "Surprise"]


# CNN model definition
class CNN(nn.Module):
    """1-D convolutional classifier over 768-dimensional feature vectors.

    Three Conv1d + BatchNorm1d + ReLU stages (768 -> 256 -> 128 -> 64
    channels) are each followed by the same adaptive max-pool, which always
    resizes the sequence axis to 96 positions.  The flattened ``64 * 96``
    activations then go through a 128-unit hidden layer with dropout and a
    final linear layer that emits ``num_classes`` raw logits.

    Attribute names are part of the checkpoint format (``state_dict`` keys)
    and must not be renamed.
    """

    def __init__(self, num_classes):
        """Build the layers; ``num_classes`` is the size of the logit vector."""
        super().__init__()
        self.name = "CNN"
        self.conv1 = nn.Conv1d(in_channels=768, out_channels=256, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(256)
        # One parameter-free pooling module is shared by all three stages.
        self.pool = nn.AdaptiveMaxPool1d(output_size=96)
        self.conv2 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(128)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(64)
        self.fc1 = nn.Linear(64 * 96, 128)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)``.

        Args:
            x: either ``(batch, 768)`` -- one feature vector per sample, the
                only layout the previous implementation accepted -- or
                ``(batch, seq_len, 768)``.

        Raises:
            ValueError: if ``x`` is neither 2-D nor 3-D.
        """
        if x.dim() == 2:
            # A single vector per sample is treated as a length-1 sequence.
            x = x.unsqueeze(1)
        elif x.dim() != 3:
            raise ValueError(
                "CNN expects input of shape (batch, 768) or (batch, seq_len, 768), "
                f"got {tuple(x.shape)}"
            )
        # Conv1d wants channels first: (batch, 768, seq_len).
        x = x.permute(0, 2, 1)
        for conv, norm in (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        ):
            x = self.pool(F.relu(norm(conv(x))))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)
# Load the trained model
# The output size is derived from `emotions` so the label list and the
# network head cannot drift apart (it evaluates to 5 for the current list).
model = CNN(num_classes=len(emotions))
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.  Weights are mapped to the CPU regardless of where they
# were saved.
model.load_state_dict(torch.load("best_model_CNN_bs32_lr0.0005_epoch9_acc0.9238.pth", map_location="cpu"))
# Inference mode: disables dropout and makes batch-norm use running statistics.
model.eval()
# Extract features from audio file
def extract_feature(audio_path):
    """Turn an audio file into a fixed-size MFCC tensor with a batch axis.

    The clip is loaded at 16 kHz, 40 MFCCs are computed, the second axis is
    cropped or zero-padded to exactly 200 columns, the matrix is repeated
    ``768 // 40 == 19`` times along the first axis, and a leading batch
    dimension is added.

    Args:
        audio_path: path of the audio file handed to ``librosa.load``.

    Returns:
        A ``float32`` tensor with a leading batch dimension of 1.
    """
    target_frames = 200
    waveform, sample_rate = librosa.load(audio_path, sr=16000)
    coeffs = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=40)

    missing = target_frames - coeffs.shape[1]
    if missing < 0:
        # Too long: keep only the first `target_frames` columns.
        coeffs = coeffs[:, :target_frames]
    else:
        # Too short (or exact): right-pad the second axis with zeros.
        coeffs = np.pad(coeffs, ((0, 0), (0, missing)), mode='constant')

    # NOTE(review): 19 repeats of 40 coefficients give 760 rows, not the 768
    # channels the classifier's first convolution declares -- confirm intent.
    repeats = 768 // 40
    stacked = np.tile(coeffs, (repeats, 1))
    return torch.tensor(stacked, dtype=torch.float32).unsqueeze(0)
# Full pipeline: emotion detection + GPT response
def predict_and_reply(audio_path):
    """Classify the emotion in an audio clip and ask GPT for a reply.

    Args:
        audio_path: path of the audio file to analyse.

    Returns:
        A Markdown string containing the detected emotion followed by either
        the GPT reply or, if the API call fails for any reason, an error line.
    """
    model.eval()
    # Load and preprocess audio
    feature = extract_feature(audio_path)
    # Move model and input to the same device before inference
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    feature = feature.to(device)
    # Predict
    with torch.no_grad():
        output = model(feature)
        pred = torch.argmax(output, dim=1).item()
        emotion = emotions[pred]
    prompt = f"The user sounds {emotion.lower()}. What would you like to say to them?"
    # The key is taken from the environment only; credentials must never be
    # embedded in source.  NOTE(review): the literal key that used to be the
    # fallback here is public in the commit history and should be revoked.
    openai.api_key = os.getenv("OPENAI_API_KEY")
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that provides entertainment recommendations."},
                {"role": "user", "content": prompt}
            ]
        )
        reply = response['choices'][0]['message']['content']
    except Exception as e:
        # Best effort for the UI: surface the failure (including a missing
        # key) in the reply instead of crashing the request.
        reply = f"❌ GPT Error: {str(e)}"
    return f"🎧 Detected Emotion: **{emotion}**\n\n💬 GPT Says:\n{reply}"
-#️ Gradio app layout
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("## 🎙️ 情绪检测 + 聊天机器人")
     gr.Markdown("上传或录制一段简短的语音片段，我会识别你的情绪，并请求 GPT 做出共情的回应。")
@@ -106,8 +142,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             audio_input = gr.Audio(label="🎧 语音输入", type="filepath", format="wav")
             submit_btn = gr.Button("🚀 提交")
         with gr.Column():
-            output_text = gr.Markdown(label="💬 GPT 回复")
-    submit_btn.click(fn=predict_and_reply, inputs=audio_input, outputs=output_text)
-demo.launch()

 import gradio as gr
 import openai
 import os
+from transformers import Wav2Vec2FeatureExtractor
+from transformers import Wav2Vec2Model
# ----------------- Setup ---------------------
# Hugging Face checkpoint id shared by the encoder and its feature extractor.
model_name = "facebook/wav2vec2-base"
# Prefer the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Pretrained Wav2Vec2 encoder, placed on the selected device.
wav2vec2_model = Wav2Vec2Model.from_pretrained(model_name).to(device)
# Load Wav2Vec2 feature extractor
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
# --------------- Load Emotion Classification Model -----------------
class CNN(nn.Module):
    """1-D convolutional classifier over sequences of 768-dim feature frames.

    Three Conv1d + BatchNorm1d + ReLU stages (768 -> 256 -> 128 -> 64
    channels) are each followed by the same adaptive max-pool, which always
    resizes the sequence axis to 96 positions.  The flattened ``64 * 96``
    activations then go through a 128-unit hidden layer with dropout and a
    final linear layer that emits ``num_classes`` raw logits.

    Attribute names are part of the checkpoint format (``state_dict`` keys)
    and must not be renamed.
    """

    def __init__(self, num_classes):
        """Build the layers; ``num_classes`` is the size of the logit vector."""
        super().__init__()
        self.name = "CNN"
        self.conv1 = nn.Conv1d(in_channels=768, out_channels=256, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(256)
        # One parameter-free pooling module is shared by all three stages.
        self.pool = nn.AdaptiveMaxPool1d(output_size=96)
        self.conv2 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(128)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(64)
        self.fc1 = nn.Linear(64 * 96, 128)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)``.

        Args:
            x: ``(batch, seq_len, 768)`` feature frames.  A 2-D
                ``(batch, 768)`` tensor is also accepted and treated as a
                length-1 sequence per sample.

        Raises:
            ValueError: if ``x`` is neither 2-D nor 3-D.
        """
        if x.dim() == 2:
            x = x.unsqueeze(1)
        elif x.dim() != 3:
            raise ValueError(
                "CNN expects input of shape (batch, seq_len, 768) or (batch, 768), "
                f"got {tuple(x.shape)}"
            )
        # Conv1d wants channels first: (batch, 768, seq_len).
        x = x.permute(0, 2, 1)
        for conv, norm in (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        ):
            x = self.pool(F.relu(norm(conv(x))))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)
model = CNN(5)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.  Weights are first materialised on the CPU.
model.load_state_dict(torch.load("best_model_CNN_bs32_lr0.0005_epoch9_acc0.9238.pth", map_location=torch.device("cpu")))
# The Wav2Vec2 encoder and its inputs live on `device`; the classifier must be
# there too, otherwise feeding it the encoder output fails on CUDA hosts.
model.to(device)
model.eval()
wav2vec2_model.eval()
# Class id (argmax of the classifier logits) -> human-readable emotion.
label_map = {0: "Neutral", 1: "Happy", 2: "Angry", 3: "Sad", 4: "Surprise"}
# ------------------ ChatGPT API Setup ---------------------
# Read the key from the environment (e.g. a Space secret).  A blank literal
# makes every request fail authentication, and a real key must never be
# committed to source.
openai.api_key = os.getenv("OPENAI_API_KEY", "")
def create_prompt_from_label(label):
    """Compose the user prompt sent to the chat model for an emotion label.

    Args:
        label: emotion name; it is lower-cased before being inserted.

    Returns:
        A four-line instruction block wrapped in a leading and a trailing
        newline.
    """
    mood = label.lower()
    sentences = (
        f"The user is currently feeling {mood}. Start by briefly and thoughtfully acknowledging how someone might feel when experiencing this emotion.",
        "Then, as a recommendation system, suggest 3 pieces of entertainment content—such as movies, music, or shows—that align with or help support this mood.",
        f"Ensure your tone is friendly and supportive, and make the recommendations short, engaging, and tailored to the {mood} emotional state.",
        "You can add some lovely emoji to let it become warm.",
    )
    return "\n" + "\n".join(sentences) + "\n"
def get_recommendations(label):
    """Ask the chat model for entertainment suggestions matching an emotion.

    Args:
        label: emotion name used to build the prompt.

    Returns:
        The model's reply with surrounding whitespace removed, or the text
        ``"An error occurred: ..."`` if the API call or response parsing
        raises.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant that provides entertainment recommendations."},
        {"role": "user", "content": create_prompt_from_label(label)},
    ]
    try:
        completion = openai.ChatCompletion.create(
            model="gpt-4",
            messages=conversation,
            max_tokens=500,
            temperature=0.7,
        )
        answer = completion['choices'][0]['message']['content']
        return answer.strip()
    except Exception as e:
        # Best effort for the UI: report the failure as the reply text.
        return f"An error occurred: {e}"
# ----------------- Inference Pipeline ---------------------
def process_audio_and_recommend(file_path):
    """Detect the emotion in an audio clip and fetch matching recommendations.

    The clip is loaded at 16 kHz, trimmed to its first 5 seconds, encoded with
    Wav2Vec2, classified by the CNN, and the predicted label is passed to
    ``get_recommendations``.

    Args:
        file_path: path of the audio file; may be ``None`` or empty when the
            caller has no audio to offer.

    Returns:
        A ``(status, recommendations)`` pair of strings.  When there is no
        usable audio, ``status`` is a short notice and ``recommendations`` is
        the empty string.
    """
    no_audio = ("⚠️ No audio received. Please record or upload a clip first.", "")
    # NOTE(review): the UI component presumably hands over None when submit is
    # pressed without a recording -- guard so the handler does not crash.
    if not file_path:
        return no_audio

    audio, sr = librosa.load(file_path, sr=16000)
    if len(audio) == 0:
        return no_audio

    max_duration = 5
    # Slicing past the end is a no-op, so short clips are left untouched.
    audio = audio[: int(max_duration * sr)]

    inputs = feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
    input_values = inputs["input_values"].to(device)
    with torch.no_grad():
        # Get real Wav2Vec2 embeddings
        features = wav2vec2_model(input_values).last_hidden_state  # shape: [1, seq_len, 768]
        # Hand the features to the classifier on whatever device it lives on,
        # so the call works even if only the encoder was moved to the GPU.
        classifier_device = next(model.parameters()).device
        outputs = model(features.to(classifier_device))
    pred_idx = torch.argmax(outputs, dim=1).item()
    emotion = label_map[pred_idx]
    recommendations = get_recommendations(emotion)
    return f"🧠 Detected Emotion: {emotion}", recommendations
+# ----------------- Gradio UI ---------------------
+# interface = gr.Interface(
+#     fn=process_audio_and_recommend,
+#     inputs=gr.Audio(type="filepath"),
+#     outputs=["text", "text"],
+#     title="🎙️ Emotion-Based Entertainment Bot",
+#     description="Upload your voice. We'll detect your emotion and ChatGPT will suggest entertainment!"
+# )
+# interface.launch()
+with gr.Blocks(theme=gr.themes.Soft()) as interface:
     gr.Markdown("## 🎙️ 情绪检测 + 聊天机器人")
     gr.Markdown("上传或录制一段简短的语音片段，我会识别你的情绪，并请求 GPT 做出共情的回应。")
             audio_input = gr.Audio(label="🎧 语音输入", type="filepath", format="wav")
             submit_btn = gr.Button("🚀 提交")
         with gr.Column():
+            output_text_1 = gr.Text(label="🧠 检测情绪")
+            output_text_2 = gr.Text(label="💬 GPT 回复")
+    submit_btn.click(fn=process_audio_and_recommend, inputs=audio_input, outputs=[output_text_1, output_text_2])
+interface.launch()