Spaces:
Sleeping
Sleeping
| import os | |
| os.environ["GRADIO_LANGUAGE"] = "en" | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| import librosa | |
| import numpy as np | |
| import openai | |
| from transformers import Wav2Vec2FeatureExtractor | |
| from transformers import Wav2Vec2Model | |
| import gradio as gr | |
| # ----------------- Setup --------------------- | |
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One checkpoint name drives both the encoder and its feature extractor,
# so the two can never drift apart.
model_name = "facebook/wav2vec2-base"

# Pretrained Wav2Vec2 encoder, placed on the selected device.
wav2vec2_model = Wav2Vec2Model.from_pretrained(model_name)
wav2vec2_model = wav2vec2_model.to(device)

# Feature extractor that prepares raw waveforms for the encoder above.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
| # --------------- Load Emotion Classification Model ----------------- | |
class CNN(nn.Module):
    """1-D convolutional classifier over sequences of 768-dim feature frames.

    The network stacks three Conv1d + BatchNorm + ReLU stages (768 -> 256 ->
    128 -> 64 channels), pools the time axis to a fixed length of 96, and
    finishes with two fully connected layers that emit ``num_classes`` logits.

    Attribute names (``conv1``, ``bn1``, ``fc1``, ...) are part of the
    checkpoint format: they are the keys of the saved ``state_dict``.
    """

    def __init__(self, num_classes):
        """Build the layers.

        Args:
            num_classes: number of output logits produced by the last layer.
        """
        super().__init__()
        self.name = "CNN"

        # Convolutional stages; kernel_size=3 with padding=1 keeps the
        # sequence length unchanged.
        self.conv1 = nn.Conv1d(in_channels=768, out_channels=256, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(256)
        self.conv2 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(128)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(64)

        # Shared pooling layer: forces the time axis to exactly 96 steps so
        # the flattened size fed to fc1 is always 64 * 96.
        self.pool = nn.AdaptiveMaxPool1d(output_size=96)

        # Classification head.
        self.fc1 = nn.Linear(64 * 96, 128)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return class logits for a batch of feature sequences.

        Args:
            x: tensor laid out as (batch, seq_len, 768).

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalised logits.
        """
        # Conv1d wants channels before time: (batch, 768, seq_len).
        x = x.permute(0, 2, 1)

        # Each stage is conv -> batch norm -> ReLU -> pool. After the first
        # stage the sequence is already 96 long, so the later pool calls
        # receive length-96 input.
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        for conv, bn in stages:
            x = self.pool(F.relu(bn(conv(x))))

        # Collapse (channels, time) into one feature vector per sample.
        x = x.flatten(start_dim=1)
        hidden = self.dropout(F.relu(self.fc1(x)))
        return self.fc2(hidden)
# Five-way emotion classifier restored from the trained checkpoint.
model = CNN(5)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True where the installed PyTorch
# supports it). The weights are first materialised on the CPU so loading works
# on hosts without a GPU.
model.load_state_dict(
    torch.load(
        "best_model_CNN_bs32_lr0.001_epoch14_acc0.9244.pth",
        map_location=torch.device("cpu"),
    )
)
# The Wav2Vec2 features fed to this classifier live on `device`; the classifier
# has to be on the same device or the forward pass fails on CUDA hosts.
model.to(device)

# Inference only: switch both networks to eval mode (disables dropout and uses
# the stored batch-norm statistics).
model.eval()
wav2vec2_model.eval()

# Maps the classifier's output index to a display name.
# NOTE(review): the index order presumably mirrors the training labels — confirm.
label_map = {0: "Neutral", 1: "Happy", 2: "Angry", 3: "Sad", 4: "Surprise"}
from openai import OpenAI
import os

# The API key is read from the environment only (e.g. a Space/host secret).
# SECURITY: a literal fallback key used to be embedded here. Because it was
# published together with the source it must be treated as compromised and
# revoked; never commit credentials to the repository.
_openai_api_key = os.getenv("OPENAI_API_KEY")
if not _openai_api_key:
    # Fail fast with an actionable message instead of starting an app whose
    # recommendation step can never succeed.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Configure it as an environment variable "
        "or secret before starting the app."
    )
client = OpenAI(api_key=_openai_api_key)
def create_prompt_from_label(label):
    """Build the user prompt asking for recommendations for an emotion.

    Args:
        label: emotion name (for example ``"Happy"``); it is lower-cased
            before being inserted into the prompt.

    Returns:
        The multi-line prompt text mentioning the emotion twice.
    """
    mood = label.lower()
    return f"""
The user is currently feeling {mood}. Start by briefly and thoughtfully acknowledging how someone might feel when experiencing this emotion.
Then, as a recommendation system, suggest 3 pieces of entertainment content—such as movies, music, or shows—that align with or help support this mood.
Ensure your tone is friendly and supportive, and make the recommendations short, engaging, and tailored to the {mood} emotional state.
You can add some lovely emoji to let it become warm. Please include the website links for your recommondations.
"""
def get_recommendations(label):
    """Ask the chat model for entertainment suggestions matching an emotion.

    Args:
        label: emotion name passed on to ``create_prompt_from_label``.

    Returns:
        The model's reply with surrounding whitespace stripped, or a string
        starting with "❌ GPT Error:" when the request or the parsing of its
        response raises.
    """
    prompt = create_prompt_from_label(label)
    conversation = [
        {"role": "system", "content": "You are a helpful assistant that provides entertainment recommendations."},
        {"role": "user", "content": prompt},
    ]
    # This is the UI boundary: any failure is turned into a readable message
    # rather than propagating to the caller.
    try:
        completion = client.chat.completions.create(
            model="gpt-4",
            messages=conversation,
            max_tokens=500,
            temperature=0.7,
        )
        reply = completion.choices[0].message.content
        return reply.strip()
    except Exception as e:
        return f"❌ GPT Error: {e}"
| # ----------------- Inference Pipeline --------------------- | |
def process_audio_and_recommend(file_path):
    """Detect the emotion in an audio file and fetch matching recommendations.

    The clip is loaded at 16 kHz, truncated to its first 5 seconds, encoded
    with Wav2Vec2, and classified by the CNN; the predicted label is then
    passed to ``get_recommendations``.

    Args:
        file_path: path of the audio file to analyse. ``None`` or an empty
            string (nothing recorded/uploaded) is handled gracefully.

    Returns:
        A ``(emotion, recommendations)`` pair of strings. When there is no
        usable audio, ``emotion`` is empty and the second element is a short
        hint for the user.
    """
    # Submitting without a recording hands us no path at all; answer with a
    # hint instead of crashing inside the audio loader.
    if not file_path:
        return "", "⚠️ Please record or upload an audio clip first."

    audio, sr = librosa.load(file_path, sr=16000)

    # Keep at most the first 5 seconds; slicing is a no-op for shorter clips.
    max_duration = 5
    audio = audio[: int(max_duration * sr)]
    if len(audio) == 0:
        return "", "⚠️ The audio clip is empty. Please try again."

    inputs = feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
    input_values = inputs["input_values"].to(device)

    with torch.no_grad():
        # Wav2Vec2 embeddings, shape: [1, seq_len, 768]
        features = wav2vec2_model(input_values).last_hidden_state
        # Make sure the features sit on the classifier's device; otherwise a
        # GPU encoder combined with a CPU classifier raises a device mismatch.
        features = features.to(next(model.parameters()).device)
        outputs = model(features)

    pred_idx = torch.argmax(outputs, dim=1).item()
    emotion = label_map[pred_idx]
    return emotion, get_recommendations(emotion)
| # ----------------- Gradio UI --------------------- | |
| # interface = gr.Interface( | |
| # fn=process_audio_and_recommend, | |
| # inputs=gr.Audio(type="filepath"), | |
| # outputs=["text", "text"], | |
| # title="🎙️ Emotion-Based Entertainment Bot", | |
| # description="Upload your voice. We'll detect your emotion and ChatGPT will suggest entertainment!" | |
| # ) | |
| # # interface.launch() | |
| # with gr.Blocks(theme=gr.themes.Soft()) as interface: | |
| # gr.Markdown("## Your Personal Emotion Assistant 😇") | |
| # gr.Markdown("Whisper me your thoughts, and I’ll wrap them in a better mood 🧸 ") | |
| # with gr.Row(): | |
| # audio_input = gr.Audio(label="🎙️ Audio Input", type="filepath", format="wav") | |
| # submit_btn = gr.Button("Submit") | |
| # with gr.Column(): | |
| # output_text_1 = gr.Text(label="🪄 I feel that you are...:") | |
| # output_text_2 = gr.Text(label="💬 Entertainment You May Like: ") | |
| # submit_btn.click(fn=process_audio_and_recommend, inputs=audio_input, outputs=[output_text_1, output_text_2]) | |
| # interface.launch() | |
# Centered page header rendered at the top of the app.
HEADER_HTML = """
<div style="text-align: center;">
<h2 style="font-size: 2em;"> Your Personal Emotion Assistant 😇</h2>
<p style="font-size: 1.1em; color: gray;">
Whisper me your thoughts, and I’ll wrap them in a better mood 🧸
</p>
</div>
"""

with gr.Blocks(theme=gr.themes.Soft()) as interface:
    gr.Markdown(HEADER_HTML, elem_id="header")

    with gr.Row(equal_height=True):
        # Left column: audio capture plus the submit button.
        with gr.Column(scale=1):
            gr.Markdown("#### Upload or record your voice below:")
            audio_input = gr.Audio(
                label="🎙️ Audio Input",
                type="filepath",
                format="wav",
            )
            submit_btn = gr.Button("Submit", scale=1)

        # Right column: detected emotion and the generated suggestions.
        with gr.Column(scale=2):
            gr.Markdown("#### Emotion & Suggestions")
            output_text_1 = gr.Text(label="🪄 I feel that you are...:", show_label=True)
            output_text_2 = gr.Textbox(
                label="💬 Entertainment You May Like:",
                lines=10,
                max_lines=20,
            )

    # Run the full pipeline on click and fill both output boxes.
    submit_btn.click(
        fn=process_audio_and_recommend,
        inputs=audio_input,
        outputs=[output_text_1, output_text_2],
    )

interface.launch()