Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| import torchaudio | |
| import numpy as np | |
| from transformers import ( | |
| Wav2Vec2ForCTC, | |
| Wav2Vec2Tokenizer, | |
| Wav2Vec2FeatureExtractor, | |
| AutoModelForAudioClassification, | |
| AutoFeatureExtractor, | |
| T5ForConditionalGeneration, | |
| T5Tokenizer, | |
| Wav2Vec2ForSequenceClassification | |
| ) | |
| import librosa | |
| import warnings | |
| warnings.filterwarnings("ignore") | |
| # Initialize models and tokenizers | |
| print("Loading models...") | |
| # Speech-to-Text Model | |
| stt_tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h") | |
| stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") | |
| stt_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h") | |
| # Emotion Recognition Model - using a more reliable model | |
| try: | |
| from transformers import Wav2Vec2ForSequenceClassification | |
| emotion_feature_extractor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er") | |
| emotion_model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er") | |
| except: | |
| # Fallback to a simpler approach using audio features | |
| emotion_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h") | |
| emotion_model = None | |
| print("Using fallback emotion detection method") | |
| # Personality Generation Model | |
| personality_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base") | |
| personality_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base") | |
| print("Models loaded successfully!") | |
| # Emotion labels mapping (updated for broader coverage) | |
| EMOTION_LABELS = { | |
| 0: "angry", | |
| 1: "happy", | |
| 2: "sad", | |
| 3: "neutral", | |
| 4: "excited", | |
| 5: "calm", | |
| 6: "surprised" | |
| } | |
| def preprocess_audio(audio_path, target_sr=16000): | |
| """Load and preprocess audio for model input""" | |
| try: | |
| # Load audio file | |
| audio, sr = librosa.load(audio_path, sr=target_sr) | |
| # Ensure audio is not too short | |
| if len(audio) < target_sr * 0.5: # Less than 0.5 seconds | |
| audio = np.pad(audio, (0, int(target_sr * 0.5) - len(audio)), mode='constant') | |
| return audio, sr | |
| except Exception as e: | |
| print(f"Error preprocessing audio: {e}") | |
| return None, None | |
| def transcribe_audio(audio_path): | |
| """Convert speech to text using Wav2Vec2""" | |
| try: | |
| audio, sr = preprocess_audio(audio_path) | |
| if audio is None: | |
| return "Error: Could not process audio file" | |
| # Extract features | |
| inputs = stt_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True) | |
| # Get model predictions | |
| with torch.no_grad(): | |
| logits = stt_model(inputs.input_values).logits | |
| # Decode predictions | |
| predicted_ids = torch.argmax(logits, dim=-1) | |
| transcription = stt_tokenizer.batch_decode(predicted_ids)[0] | |
| return transcription.strip() | |
| except Exception as e: | |
| return f"Transcription error: {str(e)}" | |
| def detect_emotion(audio_path): | |
| """Detect emotion from audio using audio features analysis""" | |
| try: | |
| audio, sr = preprocess_audio(audio_path) | |
| if audio is None: | |
| return "Error: Could not process audio file", 0.0 | |
| if emotion_model is not None: | |
| # Use the wav2vec2 emotion model if available | |
| inputs = emotion_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True) | |
| with torch.no_grad(): | |
| outputs = emotion_model(**inputs) | |
| predictions = torch.nn.functional.softmax(outputs.logits, dim=-1) | |
| emotion_id = torch.argmax(predictions, dim=-1).item() | |
| confidence = torch.max(predictions).item() | |
| emotion_label = EMOTION_LABELS.get(emotion_id, "neutral") | |
| else: | |
| # Fallback: Simple audio feature-based emotion detection | |
| # Analyze audio characteristics | |
| rms_energy = np.sqrt(np.mean(audio**2)) | |
| zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(audio)[0]) | |
| spectral_centroid = np.mean(librosa.feature.spectral_centroid(audio, sr=sr)[0]) | |
| # Simple heuristic-based emotion classification | |
| if rms_energy > 0.02 and zero_crossing_rate > 0.1: | |
| emotion_label = "excited" | |
| confidence = 0.75 | |
| elif rms_energy < 0.005: | |
| emotion_label = "calm" | |
| confidence = 0.70 | |
| elif spectral_centroid > 2000: | |
| emotion_label = "happy" | |
| confidence = 0.65 | |
| else: | |
| emotion_label = "neutral" | |
| confidence = 0.60 | |
| return emotion_label, confidence | |
| except Exception as e: | |
| return "neutral", 0.50 # Default fallback | |
| def generate_personality(transcription, emotion, confidence): | |
| """Generate personality description using FLAN-T5""" | |
| try: | |
| # Create a comprehensive prompt for personality analysis | |
| prompt = f"""Analyze this person's personality based on their speech: | |
| Speech content: "{transcription}" | |
| Detected emotion: {emotion} (confidence: {confidence:.2f}) | |
| Based on the way they speak, their word choice, emotional tone, and overall communication style, provide a detailed personality analysis. Consider their potential traits, communication style, emotional intelligence, and social characteristics. Write this as a natural, engaging personality profile in 3-4 sentences.""" | |
| # Tokenize and generate | |
| inputs = personality_tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True) | |
| with torch.no_grad(): | |
| outputs = personality_model.generate( | |
| inputs, | |
| max_length=200, | |
| min_length=50, | |
| temperature=0.7, | |
| do_sample=True, | |
| top_p=0.9, | |
| pad_token_id=personality_tokenizer.eos_token_id | |
| ) | |
| personality_description = personality_tokenizer.decode(outputs[0], skip_special_tokens=True) | |
| return personality_description | |
| except Exception as e: | |
| return f"Personality generation error: {str(e)}" | |
| def create_confidence_bar(emotion, confidence): | |
| """Create a visual representation of emotion confidence""" | |
| bar_length = int(confidence * 20) # Scale to 20 characters | |
| bar = "β" * bar_length + "β" * (20 - bar_length) | |
| return f"{emotion.upper()} {bar} {confidence:.1%}" | |
| def analyze_voice(audio_file): | |
| """Main function that orchestrates the entire analysis pipeline""" | |
| if audio_file is None: | |
| return "Please upload or record an audio file.", "", "", "" | |
| try: | |
| # Step 1: Transcribe speech | |
| transcription = transcribe_audio(audio_file) | |
| # Step 2: Detect emotion | |
| emotion, confidence = detect_emotion(audio_file) | |
| # Step 3: Generate personality description | |
| personality = generate_personality(transcription, emotion, confidence) | |
| # Create formatted output | |
| confidence_display = create_confidence_bar(emotion, confidence) | |
| # Format results | |
| results_summary = f""" | |
| π― **VOICE ANALYSIS COMPLETE** | |
| **What they said:** {transcription} | |
| **How they felt:** {confidence_display} | |
| **Who they might be:** {personality} | |
| """ | |
| return transcription, confidence_display, personality, results_summary | |
| except Exception as e: | |
| error_msg = f"Analysis failed: {str(e)}" | |
| return error_msg, "", "", error_msg | |
| # Create the Gradio interface | |
| def create_interface(): | |
| with gr.Blocks( | |
| theme=gr.themes.Soft(), | |
| title="Voice2Persona AI", | |
| css=""" | |
| .main-header { | |
| text-align: center; | |
| background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); | |
| -webkit-background-clip: text; | |
| -webkit-text-fill-color: transparent; | |
| font-size: 2.5em; | |
| font-weight: bold; | |
| margin-bottom: 0.5em; | |
| } | |
| .description { | |
| text-align: center; | |
| font-size: 1.1em; | |
| color: #666; | |
| margin-bottom: 2em; | |
| } | |
| .result-box { | |
| border-radius: 10px; | |
| padding: 20px; | |
| margin: 10px 0; | |
| } | |
| """ | |
| ) as interface: | |
| gr.HTML(""" | |
| <div class="main-header">ποΈ Voice2Persona AI</div> | |
| <div class="description"> | |
| Discover your voice's hidden story! Upload or record audio to uncover what you said, | |
| how you felt, and insights into your personality. | |
| </div> | |
| """) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| gr.Markdown("### π΅ Audio Input") | |
| audio_input = gr.Audio( | |
| label="Record or Upload Audio", | |
| type="filepath", | |
| sources=["microphone", "upload"] | |
| ) | |
| analyze_btn = gr.Button( | |
| "π Analyze Voice", | |
| variant="primary", | |
| size="lg" | |
| ) | |
| gr.Markdown(""" | |
| **Tips for best results:** | |
| - Speak clearly for 3-10 seconds | |
| - Use a quiet environment | |
| - Express yourself naturally | |
| """) | |
| with gr.Column(scale=2): | |
| gr.Markdown("### π Analysis Results") | |
| with gr.Tab("π Complete Analysis"): | |
| results_display = gr.Markdown( | |
| label="Full Analysis", | |
| value="Upload audio to see your voice analysis here..." | |
| ) | |
| with gr.Tab("π Detailed Breakdown"): | |
| transcription_output = gr.Textbox( | |
| label="π¬ Speech Content (What you said)", | |
| placeholder="Transcription will appear here...", | |
| lines=3 | |
| ) | |
| emotion_output = gr.Textbox( | |
| label="π Emotional State (How you felt)", | |
| placeholder="Emotion analysis will appear here...", | |
| lines=2 | |
| ) | |
| personality_output = gr.Textbox( | |
| label="π§ Personality Insights (Who you might be)", | |
| placeholder="Personality analysis will appear here...", | |
| lines=5 | |
| ) | |
| # Connect the analyze button to the main function | |
| analyze_btn.click( | |
| fn=analyze_voice, | |
| inputs=[audio_input], | |
| outputs=[transcription_output, emotion_output, personality_output, results_display] | |
| ) | |
| gr.Markdown(""" | |
| --- | |
| ### About Voice2Persona AI | |
| This AI system combines three powerful models: | |
| - **Speech-to-Text**: Facebook's Wav2Vec2 for accurate transcription | |
| - **Emotion Detection**: Specialized model for voice emotion recognition | |
| - **Personality Analysis**: Google's FLAN-T5 for generating personality insights | |
| *Built with β€οΈ using Hugging Face Transformers and Gradio* | |
| """) | |
| return interface | |
| # Launch the app | |
| if __name__ == "__main__": | |
| app = create_interface() | |
| app.launch( | |
| share=True, | |
| show_error=True, | |
| server_name="0.0.0.0", | |
| server_port=7860 | |
| ) |