Spaces:
Build error
Build error
| import streamlit as st | |
| import whisper | |
| import librosa | |
| import numpy as np | |
| from pydub import AudioSegment | |
| from pydub.effects import normalize, speedup | |
| from pydub.silence import split_on_silence | |
| import tempfile | |
| import os | |
| from gtts import gTTS | |
| import io | |
| from audio_recorder_streamlit import audio_recorder | |
| import torch | |
| from auralis import TTS, TTSRequest | |
| import random | |
| import time | |
| # Streamlit Page Config and CSS omitted for brevity — use your existing styles | |
# Load Whisper model once
@st.cache_resource
def load_whisper_model():
    """Load the Whisper "base" speech-recognition model, cached across reruns.

    Streamlit re-executes the script on every user interaction, so an
    uncached loader would reload the model each time.  ``st.cache_resource``
    keeps one shared instance alive so the load really happens once.

    Returns:
        The object returned by ``whisper.load_model("base")``.
    """
    return whisper.load_model("base")
# Load Hugging Face XTTS2 voice cloning model once
@st.cache_resource
def load_xtts_model():
    """Load the XTTS2 voice-cloning model, cached across reruns.

    Streamlit re-executes the script on every user interaction, so an
    uncached loader would rebuild the model each time.
    ``st.cache_resource`` keeps one shared instance alive instead.

    Returns:
        The object returned by
        ``TTS().from_pretrained("AstraMindAI/xtts2-gpt")``.
    """
    return TTS().from_pretrained("AstraMindAI/xtts2-gpt")
# Module-level model handles used by transcribe_audio() and
# generate_cloned_voice_xtts(); Whisper is loaded first, then XTTS.
whisper_model, xtts_model = load_whisper_model(), load_xtts_model()
def create_tts(text):
    """Synthesize *text* with gTTS into an in-memory buffer.

    Args:
        text: The English text to speak (gTTS is called with ``lang='en'``
            and ``slow=False``).

    Returns:
        An ``io.BytesIO`` holding the audio written by gTTS, rewound to
        position 0 so it can be read from the start.
    """
    buffer = io.BytesIO()
    gTTS(text, lang='en', slow=False).write_to_fp(buffer)
    buffer.seek(0)
    return buffer
def preprocess_audio(file_obj):
    """Normalize a recording, strip its silence, and export it as a WAV file.

    Args:
        file_obj: Anything ``AudioSegment.from_file`` accepts (the caller in
            this file passes a path).

    Returns:
        Path of a new temporary ``.wav`` file.  The file is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    cleaned = normalize(AudioSegment.from_file(file_obj)).strip_silence(
        silence_thresh=-40,
        silence_len=500,
    )
    # Export options forwarded by pydub: 16000 Hz sample rate, one channel.
    export_options = ['-ar', '16000', '-ac', '1']
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as wav_file:
        cleaned.export(wav_file.name, format='wav', parameters=export_options)
        return wav_file.name
def transcribe_audio(audio_path):
    """Transcribe *audio_path* with Whisper and estimate recognition confidence.

    Whisper segments do not carry a ``confidence`` field, so looking only for
    that key always produced the 0.5 fallback.  Confidence is now taken from
    each segment's ``avg_logprob`` (mean token log-probability) converted to
    a probability with ``exp``.  An explicit ``confidence`` value is still
    honoured when present, and 0.5 remains the fallback when neither exists.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        A ``(text, segments, avg_confidence)`` tuple: the full transcript,
        the list of Whisper segment dicts, and the mean per-segment
        confidence (0.5 when there are no segments).
    """
    result = whisper_model.transcribe(audio_path, word_timestamps=True)
    text = result["text"]
    segments = result["segments"]

    confidences = []
    for seg in segments:
        if 'confidence' in seg:
            confidences.append(float(seg['confidence']))
        elif 'avg_logprob' in seg:
            # exp(log-probability) maps the score back into the 0..1 range.
            confidences.append(float(np.exp(seg['avg_logprob'])))
        else:
            confidences.append(0.5)

    avg_confidence = sum(confidences) / len(confidences) if confidences else 0.5
    return text, segments, avg_confidence
def analyze_prosody(audio_path, transcript, segments, confidence):
    """Measure pitch, speaking pace and pause ratio of a recording.

    Args:
        audio_path: Path of the audio file; it is loaded at 16 kHz.
        transcript: Transcribed text, used only for its word count.
        segments: Accepted for interface compatibility; not used here.
        confidence: Recognition confidence, passed through unchanged.

    Returns:
        A dict with:
            ``pitch_mean``: mean of the positive pitch estimates whose
                magnitude exceeds the median magnitude (150 when none).
            ``pace_wpm``: words per minute (0 for zero-length audio).
            ``pause_ratio``: fraction of samples outside the non-silent
                intervals found by ``librosa.effects.split`` (0..1).
            ``confidence``: the value passed in.
    """
    y, sr = librosa.load(audio_path, sr=16000)
    n_samples = len(y)
    if n_samples == 0:
        # Nothing to analyze: report fallback pitch, zero pace, all pause.
        return {
            'pitch_mean': 150,
            'pace_wpm': 0,
            'pause_ratio': 1.0,
            'confidence': confidence
        }

    total_duration = librosa.get_duration(y=y, sr=sr)

    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, fmin=75, fmax=300)
    pitch_values = pitches[magnitudes > np.median(magnitudes)]
    voiced_pitches = pitch_values[pitch_values > 0]
    pitch_mean = float(np.mean(voiced_pitches)) if voiced_pitches.size else 150

    words = len(transcript.split())
    pace_wpm = (words / total_duration) * 60 if total_duration > 0 else 0

    # Interval bounds are sample indices, so the non-silent share is
    # samples / total samples.  The previous formula divided by `sr` as
    # well, which pushed the pause ratio to ~1 for every recording.
    intervals = librosa.effects.split(y, top_db=20)
    non_silent_samples = sum(int(end - start) for start, end in intervals)
    pause_ratio = 1 - non_silent_samples / n_samples

    return {
        'pitch_mean': pitch_mean,
        'pace_wpm': pace_wpm,
        'pause_ratio': pause_ratio,
        'confidence': confidence
    }
def pronunciation_feedback(transcript, segments, prosody):
    """Score pronunciation from confidence, pace and segment regularity.

    The score is ``confidence * 100`` scaled by how close the pace is to
    120 words per minute, minus ten times the variance of the segment
    durations, clamped to the range 0..100.

    Args:
        transcript: Accepted for interface compatibility; not used here.
        segments: Dicts with ``start`` and ``end`` times.  May be empty.
        prosody: Dict providing ``confidence`` and ``pace_wpm``.

    Returns:
        The pronunciation score as a float between 0 and 100.
    """
    durations = [seg['end'] - seg['start'] for seg in segments]
    # np.var([]) is NaN, which the clamp below used to turn into a perfect
    # 100; with no segments there is simply no variance penalty.
    pace_var = float(np.var(durations)) if durations else 0.0

    pace_fit = 1 - abs(prosody['pace_wpm'] - 120) / 120
    raw_score = (prosody['confidence'] * 100) * pace_fit - (pace_var * 10)
    return float(max(0, min(100, raw_score)))
def calculate_score(prosody, pronun_score, transcript):
    """Combine delivery, pronunciation and content into one score (max 100).

    Delivery is the mean of four sub-scores (pitch, pace, pauses,
    confidence) and weighs 50%; pronunciation weighs 30%; content, based on
    word count and confidence, weighs 20%.

    Args:
        prosody: Dict with ``pitch_mean``, ``pace_wpm``, ``pause_ratio`` and
            ``confidence``.
        pronun_score: Pronunciation score on a 0..100 scale.
        transcript: Transcribed text; only its word count is used.

    Returns:
        The weighted total, capped at 100.
    """
    # Pitch: 100 Hz maps to 0 and 150 Hz to 100, clamped on both ends.
    pitch_component = (prosody['pitch_mean'] - 100) / 50 * 100
    pitch_component = max(0, pitch_component)
    pitch_component = min(100, pitch_component)

    # Pace: full marks strictly between 100 and 150 WPM, otherwise 70.
    if 100 < prosody['pace_wpm'] < 150:
        pace_component = 100
    else:
        pace_component = 70

    pause_component = 100 * (1 - prosody['pause_ratio'])
    confidence_component = prosody['confidence'] * 100
    delivery = (
        pitch_component + pace_component + pause_component + confidence_component
    ) / 4

    word_count = len(transcript.split())
    content = min(100, word_count * 0.5 + (prosody['confidence'] * 50))

    weighted = (delivery * 0.5) + (pronun_score * 0.3) + (content * 0.2)
    return min(100, weighted)
def generate_voice_feedback(score, prosody, pronun_score):
    """Build the coaching message for one analysis.

    The message is five sentences joined by spaces: an opening chosen from
    the overall score, a remark on pace, a remark on pauses, a remark on
    pronunciation, and a fixed closing sentence.

    Args:
        score: Overall score; thresholds are > 90, > 80 and > 60.
        prosody: Dict with ``pace_wpm``, ``pause_ratio`` and ``pitch_mean``.
        pronun_score: Pronunciation score; below 80 triggers practice advice.

    Returns:
        The feedback text as a single string.
    """
    words_per_minute = prosody['pace_wpm']
    pause_share = prosody['pause_ratio']
    # Not used in any sentence; the lookup is kept so a prosody dict
    # without 'pitch_mean' still raises KeyError here.
    _ = prosody['pitch_mean']

    def opening_line():
        tiers = (
            (90, "Excellent work! Your speech was outstanding."),
            (80, "Great job! You have strong communication skills."),
            (60, "Good effort! You're making solid progress."),
        )
        for floor, line in tiers:
            if score > floor:
                return line
        return "Nice try! Keep practicing to improve."

    def pace_line():
        if words_per_minute < 100:
            return f"Your pace was {words_per_minute:.0f} words per minute. Try speaking faster, aiming for 120 to 140 words per minute."
        if words_per_minute > 160:
            return f"You spoke at {words_per_minute:.0f} words per minute, which is quite fast. Slow down to around 140 words per minute for better clarity."
        return f"Your pace of {words_per_minute:.0f} words per minute is excellent."

    def pause_line():
        if pause_share > 0.20:
            return f"You paused {pause_share:.0%} of the time. Try reducing pauses to 10 to 15 percent for smoother flow."
        if pause_share < 0.05:
            return "Consider adding brief pauses between ideas for better comprehension."
        return "Your use of pauses is well balanced."

    def pronunciation_line():
        if pronun_score < 80:
            return "Work on clearer pronunciation by practicing tongue twisters and speaking more slowly."
        return "Your pronunciation is clear and articulate."

    sentences = [
        opening_line(),
        pace_line(),
        pause_line(),
        pronunciation_line(),
        "I've prepared an enhanced version of your speech with optimized pacing. Keep practicing!",
    ]
    return " ".join(sentences)
def generate_cloned_voice_xtts(audio_path, cleaned_text):
    """Speak *cleaned_text* with XTTS, using *audio_path* as the speaker file.

    Args:
        audio_path: Path of the recording passed as the speaker reference.
        cleaned_text: The text to synthesize (requested with language "en").

    Returns:
        Path of a new temporary ``.wav`` file holding the generated speech.
        The caller is responsible for deleting it.
    """
    request = TTSRequest(
        text=cleaned_text,
        speaker_files=[audio_path],
        language="en"
    )
    out = xtts_model.generate_speech(request)

    # tempfile.mktemp() only returns a name and is deprecated because another
    # process can claim that name first; create the file atomically instead.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    try:
        out.save(output_path)
    except Exception:
        # Do not leave an orphaned temp file behind when saving fails.
        os.unlink(output_path)
        raise
    return output_path
# Page header.  The CSS classes used here are expected to come from the
# app's stylesheet, which is not part of this section.
st.markdown(
    '<div class="main-header"><h1><span class="status-indicator"></span>🎤 FLUENTRA AI</h1><h3>Your Voice-Activated Speech Coach</h3></div>',
    unsafe_allow_html=True,
)

# Greet the user once per session; the 'greeted' flag in session state
# suppresses the greeting on later reruns.
if not st.session_state.get('greeted', False):
    greeting_text = "Hello! I am Fluentra, your personal speech coach. Click the microphone button and speak for 20 to 60 seconds. I will analyze your speech and help you improve."
    st.markdown(f'<div class="voice-message">🔊 {greeting_text}</div>', unsafe_allow_html=True)
    greeting_audio = create_tts(greeting_text)
    st.audio(greeting_audio, format="audio/mp3")
    st.session_state['greeted'] = True
# Recording widget: `audio_bytes` holds the captured audio once the user has
# recorded something and is falsy until then (see the check that follows).
st.markdown("---")
st.subheader("🎙️ Ready to Record")
audio_bytes = audio_recorder(
    text="Click to Start Recording",
    recording_color="#00f7ff",
    neutral_color="#4a5568",
    icon_size="3x",
    pause_threshold=2.0
)
# Single-word fillers dropped from the transcript before voice cloning.
_FILLER_WORDS = frozenset({"um", "uh", "like", "er", "ah", "so", "well"})
# Two-word filler; it needs pair matching because the transcript is split
# into single words and could never equal a set entry containing a space.
_FILLER_PHRASE = ("you", "know")
# Punctuation stripped from a word before comparing it with the fillers.
_TOKEN_PUNCTUATION = ".,!?;:"


def _remove_fillers(transcript):
    """Return *transcript* without filler words and the phrase "you know"."""
    words = transcript.split()
    normalized = [word.lower().strip(_TOKEN_PUNCTUATION) for word in words]
    kept = []
    index = 0
    while index < len(words):
        if tuple(normalized[index:index + 2]) == _FILLER_PHRASE:
            index += 2
        elif normalized[index] in _FILLER_WORDS:
            index += 1
        else:
            kept.append(words[index])
            index += 1
    return " ".join(kept)


def _ordinal(number):
    """Return *number* with its English ordinal suffix (1st, 2nd, 3rd, 4th)."""
    if 10 <= number % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")
    return f"{number}{suffix}"


def _remove_temp_files(paths):
    """Delete every file in *paths*, ignoring files that cannot be removed."""
    for path in paths:
        try:
            os.unlink(path)
        except OSError:
            # Best-effort cleanup: a leftover temp file must not mask the
            # error (if any) that is already propagating.
            pass


# Full analysis pipeline; runs on every rerun in which a recording exists.
if audio_bytes:
    st.success("✅ Recording captured!")
    st.audio(audio_bytes, format="audio/wav")

    # Every temp file created below is tracked here and removed in `finally`,
    # so files are cleaned up even when a processing step raises.
    temp_paths = []
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
            temp_paths.append(tmp.name)
            tmp.write(audio_bytes)
            recorded_path = tmp.name

        processing_msg = "Processing your speech. Please wait."
        st.markdown(f'<div class="voice-message">🔊 {processing_msg}</div>', unsafe_allow_html=True)
        processing_audio = create_tts(processing_msg)
        st.audio(processing_audio, format="audio/mp3", autoplay=True)

        with st.spinner("🧠 Analyzing..."):
            audio_path = preprocess_audio(recorded_path)
            temp_paths.append(audio_path)
            transcript, segments, confidence = transcribe_audio(audio_path)
            has_speech = bool(transcript.strip())
            if has_speech:
                prosody = analyze_prosody(audio_path, transcript, segments, confidence)
                pronun_score = pronunciation_feedback(transcript, segments, prosody)
                score = calculate_score(prosody, pronun_score, transcript)
                feedback_text = generate_voice_feedback(score, prosody, pronun_score)
                feedback_audio = create_tts(feedback_text)
                # Fall back to the raw transcript if it was nothing but fillers.
                cleaned_text = _remove_fillers(transcript) or transcript
                cloned_voice_path = generate_cloned_voice_xtts(audio_path, cleaned_text)
                temp_paths.append(cloned_voice_path)
                # Read the audio now so the temp file can be deleted safely.
                with open(cloned_voice_path, "rb") as cloned_file:
                    cloned_voice_bytes = cloned_file.read()

        if not has_speech:
            st.warning("No speech was detected in your recording. Please try again.")
        else:
            st.session_state['analysis_count'] = st.session_state.get('analysis_count', 0) + 1

            st.markdown("---")
            st.subheader("💬 Fluentra's Feedback")
            st.markdown(f'<div class="voice-message">🔊 {feedback_text}</div>', unsafe_allow_html=True)
            st.audio(feedback_audio, format="audio/mp3")

            st.markdown("---")
            st.subheader("📊 Analysis Results")
            metric_cards = (
                ("Overall Score", f"{score:.1f}/100"),
                ("Pace", f"{prosody['pace_wpm']:.0f} WPM"),
                ("Pitch", f"{prosody['pitch_mean']:.0f} Hz"),
                ("Confidence", f"{confidence:.0%}"),
            )
            for column, (label, value) in zip(st.columns(4), metric_cards):
                with column:
                    st.markdown(f'<div class="metric-card"><h3>{label}</h3><h1>{value}</h1></div>', unsafe_allow_html=True)

            st.markdown("---")
            st.subheader("✨ Your Enhanced Voice")
            enhanced_msg = "Here is your speech with fillers removed and pace optimized."
            st.markdown(f'<div class="voice-message">🔊 {enhanced_msg}</div>', unsafe_allow_html=True)
            st.audio(cloned_voice_bytes, format="audio/wav")

            st.markdown("---")
            with st.expander("📝 View Transcription"):
                st.info(transcript)

            analysis_count = st.session_state['analysis_count']
            if analysis_count == 1:
                closing = "Great start! Feel free to record again to track your improvement."
            else:
                closing = f"This is your {_ordinal(analysis_count)} analysis. You're making progress!"
            st.markdown(f'<div class="voice-message">🔊 {closing}</div>', unsafe_allow_html=True)
            closing_audio = create_tts(closing)
            st.audio(closing_audio, format="audio/mp3")
    finally:
        _remove_temp_files(temp_paths)
# Footer
st.markdown("---")
# The HTML lines stay unindented: Markdown would render lines indented by
# four or more spaces as a code block instead of as HTML.
st.markdown("""
<div style='text-align: center; color: #00f7ff; padding: 2rem;'>
<p>🎤 <strong>FLUENTRA AI</strong> - Voice-Activated Speech Coach</p>
<p>Powered by Whisper AI, Librosa & Google TTS | © 2025</p>
</div>
""", unsafe_allow_html=True)