"""Fluentra AI: a voice-activated speech coach built on Streamlit.

The app records speech in the browser, transcribes it with Whisper, measures
simple prosody features with librosa, speaks feedback through gTTS and
re-synthesises a filler-free version of the speech with an XTTS voice clone.
"""

import contextlib
import io
import os
import random
import re
import string
import tempfile
import time

import librosa
import numpy as np
import streamlit as st
import torch
import whisper
from audio_recorder_streamlit import audio_recorder
from auralis import TTS, TTSRequest
from gtts import gTTS
from pydub import AudioSegment
from pydub.effects import normalize, speedup
from pydub.silence import split_on_silence

# Streamlit Page Config and CSS omitted for brevity — use your existing styles

# Single-word fillers dropped from the transcript before voice cloning.
FILLER_WORDS = frozenset({"um", "uh", "like", "er", "ah", "so", "well"})
# Multi-word fillers need a phrase match: a per-word filter can never see them.
FILLER_PHRASE_RE = re.compile(r"\byou know\b,?", flags=re.IGNORECASE)


def _remove_quietly(path):
    """Delete *path* if it is set, ignoring files that are already gone."""
    if path:
        with contextlib.suppress(OSError):
            os.unlink(path)


def _ordinal(n):
    """Return *n* with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th...)."""
    if 10 <= n % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    return f"{n}{suffix}"


def remove_fillers(transcript):
    """Return *transcript* with filler words and phrases removed.

    Tokens are compared case-insensitively with surrounding punctuation
    stripped, so "Um," and "so." are recognised as fillers too.
    """
    without_phrases = FILLER_PHRASE_RE.sub(" ", transcript)
    kept = [
        word
        for word in without_phrases.split()
        if word.strip(string.punctuation).lower() not in FILLER_WORDS
    ]
    return " ".join(kept)


def _show_spoken(text):
    """Render a line of text that is also played back as audio."""
    st.markdown(f"\n🔊 {text}\n", unsafe_allow_html=True)


def _metric_card(label, value):
    """Render one labelled metric inside the current Streamlit container."""
    st.markdown(f"\n\n{label}\n\n{value}\n\n", unsafe_allow_html=True)


# Load Whisper model once
@st.cache_resource
def load_whisper_model():
    """Load the Whisper "base" model; cached by Streamlit across reruns."""
    return whisper.load_model("base")


# Load Hugging Face XTTS2 voice cloning model once
@st.cache_resource
def load_xtts_model():
    """Load the Auralis XTTS2 voice-cloning model; cached across reruns.

    The base checkpoint and its GPT component live in separate repositories:
    "AstraMindAI/xtts2-gpt" is only the GPT part, so it is passed as
    ``gpt_model`` rather than as the base model.
    """
    return TTS().from_pretrained(
        "AstraMindAI/xttsv2", gpt_model="AstraMindAI/xtts2-gpt"
    )


whisper_model = load_whisper_model()
xtts_model = load_xtts_model()


def create_tts(text):
    """Create TTS audio using gTTS.

    Returns an ``io.BytesIO`` holding the MP3 data, rewound to the start.
    """
    tts = gTTS(text, lang='en', slow=False)
    audio_buffer = io.BytesIO()
    tts.write_to_fp(audio_buffer)
    audio_buffer.seek(0)
    return audio_buffer


def preprocess_audio(file_obj):
    """Normalise, silence-strip and re-export a recording as 16 kHz mono WAV.

    Args:
        file_obj: Path or file-like object accepted by
            ``AudioSegment.from_file``.

    Returns:
        Path of a new temporary WAV file. The caller owns it and must delete
        it when done.

    This function is intentionally not wrapped in ``st.cache_data``: the
    result is a path to a file the caller deletes, so a cache hit would hand
    back a path that no longer exists.
    """
    audio = AudioSegment.from_file(file_obj)
    audio = normalize(audio)
    audio = audio.strip_silence(silence_thresh=-40, silence_len=500)

    # Reserve the name and close the handle before exporting; writing to a
    # file that is still open elsewhere fails on Windows.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        audio_path = tmp.name
    try:
        exported = audio.export(
            audio_path, format='wav', parameters=['-ar', '16000', '-ac', '1']
        )
        # export() returns the open output handle; close it so the file can
        # be removed later.
        exported.close()
    except Exception:
        _remove_quietly(audio_path)
        raise
    return audio_path


def _segment_confidence(segment):
    """Estimate a 0..1 confidence for one Whisper segment.

    Prefers an explicit ``confidence`` key if one is present, then the mean
    per-word ``probability`` (available with ``word_timestamps=True``), then
    ``exp(avg_logprob)``. Falls back to 0.5 when none is available.
    """
    if "confidence" in segment:
        return float(segment["confidence"])
    word_probs = [
        word["probability"]
        for word in (segment.get("words") or [])
        if "probability" in word
    ]
    if word_probs:
        return float(np.mean(word_probs))
    if "avg_logprob" in segment:
        return float(min(1.0, np.exp(segment["avg_logprob"])))
    return 0.5


def transcribe_audio(audio_path):
    """Transcribe *audio_path* with Whisper.

    Returns:
        ``(text, segments, avg_confidence)``, where ``avg_confidence`` is the
        mean per-segment confidence, or 0.5 when there are no segments.
    """
    result = whisper_model.transcribe(audio_path, word_timestamps=True)
    text = result["text"]
    segments = result["segments"]
    confidences = [_segment_confidence(seg) for seg in segments]
    avg_confidence = float(np.mean(confidences)) if confidences else 0.5
    return text, segments, avg_confidence


def analyze_prosody(audio_path, transcript, segments, confidence):
    """Measure pitch, pace and pause ratio of a recording.

    Args:
        audio_path: Path of the audio file to analyse.
        transcript: Transcribed text, used for the word count.
        segments: Whisper segments (accepted for interface compatibility;
            not used by the current measurements).
        confidence: Transcription confidence, passed through unchanged.

    Returns:
        Dict with ``pitch_mean`` (Hz), ``pace_wpm``, ``pause_ratio`` (0..1)
        and ``confidence``.
    """
    y, sr = librosa.load(audio_path, sr=16000)
    if len(y) == 0:
        # Nothing left after silence stripping: no measurements are possible.
        return {
            'pitch_mean': 150.0,
            'pace_wpm': 0,
            'pause_ratio': 0.0,
            'confidence': confidence
        }

    total_duration = librosa.get_duration(y=y, sr=sr)

    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, fmin=75, fmax=300)
    pitch_values = pitches[magnitudes > np.median(magnitudes)]
    voiced_pitches = pitch_values[pitch_values > 0]
    pitch_mean = float(np.mean(voiced_pitches)) if voiced_pitches.size else 150.0

    words = len(transcript.split())
    pace_wpm = (words / total_duration) * 60 if total_duration > 0 else 0

    # split() returns non-silent intervals in samples, so dividing by the
    # total sample count already yields a ratio; no sample-rate factor.
    intervals = librosa.effects.split(y, top_db=20)
    voiced_samples = sum(int(end - start) for start, end in intervals)
    pause_ratio = 1 - voiced_samples / len(y)

    return {
        'pitch_mean': pitch_mean,
        'pace_wpm': pace_wpm,
        'pause_ratio': pause_ratio,
        'confidence': confidence
    }


def pronunciation_feedback(transcript, segments, prosody):
    """Return a 0..100 pronunciation score.

    The score is the confidence scaled by how close the pace is to 120 WPM,
    minus ten times the variance of the segment durations (in seconds).
    """
    durations = [seg['end'] - seg['start'] for seg in segments]
    # np.var([]) is NaN, which the clamp below would silently turn into 100.
    pace_var = float(np.var(durations)) if durations else 0.0
    pronun_score = (prosody['confidence'] * 100) * (1 - abs(prosody['pace_wpm'] - 120) / 120)
    pronun_score = max(0, min(100, pronun_score - (pace_var * 10)))
    return pronun_score


def calculate_score(prosody, pronun_score, transcript):
    """Combine prosody, pronunciation and content into a score capped at 100.

    Weights: prosody 50 %, pronunciation 30 %, content 20 %.
    """
    pitch_score = min(100, max(0, (prosody['pitch_mean'] - 100) / 50 * 100))
    pace_score = 100 if 100 < prosody['pace_wpm'] < 150 else 70
    pause_score = 100 * (1 - prosody['pause_ratio'])
    conf_score = prosody['confidence'] * 100
    prosody_total = (pitch_score + pace_score + pause_score + conf_score) / 4
    content_score = min(100, len(transcript.split()) * 0.5 + (prosody['confidence'] * 50))
    total = (prosody_total * 0.5) + (pronun_score * 0.3) + (content_score * 0.2)
    return min(100, total)


def generate_voice_feedback(score, prosody, pronun_score):
    """Build the spoken feedback text from the score and prosody metrics."""
    pace = prosody['pace_wpm']
    pauses = prosody['pause_ratio']

    if score > 90:
        opening = "Excellent work! Your speech was outstanding."
    elif score > 80:
        opening = "Great job! You have strong communication skills."
    elif score > 60:
        opening = "Good effort! You're making solid progress."
    else:
        opening = "Nice try! Keep practicing to improve."
    feedback_parts = [opening]

    if pace < 100:
        feedback_parts.append(f"Your pace was {pace:.0f} words per minute. Try speaking faster, aiming for 120 to 140 words per minute.")
    elif pace > 160:
        feedback_parts.append(f"You spoke at {pace:.0f} words per minute, which is quite fast. Slow down to around 140 words per minute for better clarity.")
    else:
        feedback_parts.append(f"Your pace of {pace:.0f} words per minute is excellent.")

    if pauses > 0.20:
        feedback_parts.append(f"You paused {pauses:.0%} of the time. Try reducing pauses to 10 to 15 percent for smoother flow.")
    elif pauses < 0.05:
        feedback_parts.append("Consider adding brief pauses between ideas for better comprehension.")
    else:
        feedback_parts.append("Your use of pauses is well balanced.")

    if pronun_score < 80:
        feedback_parts.append("Work on clearer pronunciation by practicing tongue twisters and speaking more slowly.")
    else:
        feedback_parts.append("Your pronunciation is clear and articulate.")

    feedback_parts.append("I've prepared an enhanced version of your speech with optimized pacing. Keep practicing!")
    return " ".join(feedback_parts)


def generate_cloned_voice_xtts(audio_path, cleaned_text):
    """Synthesise *cleaned_text* in the voice heard in *audio_path*.

    Returns:
        Path of a new temporary WAV file. The caller owns it and must delete
        it when done.
    """
    request = TTSRequest(
        text=cleaned_text,
        speaker_files=[audio_path],
        language="en"
    )
    out = xtts_model.generate_speech(request)
    # NamedTemporaryFile creates the file atomically; tempfile.mktemp is
    # deprecated because the name can be claimed by another process first.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    try:
        out.save(output_path)
    except Exception:
        _remove_quietly(output_path)
        raise
    return output_path


def run_analysis(recorded_path):
    """Analyse the recording at *recorded_path* and render the results.

    Temporary files created during the analysis are removed even when a step
    fails. *recorded_path* itself is left for the caller to delete.
    """
    audio_path = None
    cloned_voice_path = None
    try:
        with st.spinner("🧠 Analyzing..."):
            audio_path = preprocess_audio(recorded_path)
            transcript, segments, confidence = transcribe_audio(audio_path)
            if not transcript.strip():
                st.warning("No speech was detected. Please record again and speak clearly.")
                return
            prosody = analyze_prosody(audio_path, transcript, segments, confidence)
            pronun_score = pronunciation_feedback(transcript, segments, prosody)
            score = calculate_score(prosody, pronun_score, transcript)
            feedback_text = generate_voice_feedback(score, prosody, pronun_score)
            feedback_audio = create_tts(feedback_text)
            # Fall back to the raw transcript if it consisted only of fillers.
            cleaned_text = remove_fillers(transcript) or transcript
            cloned_voice_path = generate_cloned_voice_xtts(audio_path, cleaned_text)
            # Read the clip into memory so the temp file can be removed safely.
            with open(cloned_voice_path, "rb") as cloned_file:
                cloned_audio = cloned_file.read()

        st.session_state['analysis_count'] = st.session_state.get('analysis_count', 0) + 1

        st.markdown("---")
        st.subheader("💬 Fluentra's Feedback")
        _show_spoken(feedback_text)
        st.audio(feedback_audio, format="audio/mp3")

        st.markdown("---")
        st.subheader("📊 Analysis Results")
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            _metric_card("Overall Score", f"{score:.1f}/100")
        with col2:
            _metric_card("Pace", f'{prosody["pace_wpm"]:.0f} WPM')
        with col3:
            _metric_card("Pitch", f'{prosody["pitch_mean"]:.0f} Hz')
        with col4:
            _metric_card("Confidence", f"{confidence:.0%}")

        st.markdown("---")
        st.subheader("✨ Your Enhanced Voice")
        enhanced_msg = "Here is your speech with fillers removed and pace optimized."
        _show_spoken(enhanced_msg)
        st.audio(cloned_audio, format="audio/wav")

        st.markdown("---")
        with st.expander("📝 View Transcription"):
            st.info(transcript)

        if st.session_state['analysis_count'] == 1:
            closing = "Great start! Feel free to record again to track your improvement."
        else:
            closing = f"This is your {_ordinal(st.session_state['analysis_count'])} analysis. You're making progress!"
        _show_spoken(closing)
        closing_audio = create_tts(closing)
        st.audio(closing_audio, format="audio/mp3")
    finally:
        _remove_quietly(audio_path)
        _remove_quietly(cloned_voice_path)


st.markdown(
    "\n\n🎤 FLUENTRA AI\n\nYour Voice-Activated Speech Coach\n\n",
    unsafe_allow_html=True,
)

# Greet only once per browser session; Streamlit reruns this script on every
# interaction.
if not st.session_state.get('greeted', False):
    greeting_text = "Hello! I am Fluentra, your personal speech coach. Click the microphone button and speak for 20 to 60 seconds. I will analyze your speech and help you improve."
    _show_spoken(greeting_text)
    greeting_audio = create_tts(greeting_text)
    st.audio(greeting_audio, format="audio/mp3")
    st.session_state['greeted'] = True

st.markdown("---")
st.subheader("🎙️ Ready to Record")
audio_bytes = audio_recorder(
    text="Click to Start Recording",
    recording_color="#00f7ff",
    neutral_color="#4a5568",
    icon_size="3x",
    pause_threshold=2.0
)

if audio_bytes:
    st.success("✅ Recording captured!")
    st.audio(audio_bytes, format="audio/wav")
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
        tmp.write(audio_bytes)
        recorded_path = tmp.name
    try:
        processing_msg = "Processing your speech. Please wait."
        _show_spoken(processing_msg)
        processing_audio = create_tts(processing_msg)
        st.audio(processing_audio, format="audio/mp3", autoplay=True)
        run_analysis(recorded_path)
    finally:
        _remove_quietly(recorded_path)

# Footer
st.markdown("---")
st.markdown(
    "\n\n🎤 FLUENTRA AI - Voice-Activated Speech Coach\n\n"
    "Powered by Whisper AI, Librosa & Google TTS | © 2025\n\n",
    unsafe_allow_html=True,
)