"""FLUENTRA AI — Streamlit speech-coaching app.

Pipeline: record/upload audio -> preprocess (normalize, trim silence) ->
Whisper transcription -> librosa prosody analysis -> scoring -> spoken
feedback (gTTS) and a voice-cloned "enhanced" rendition (XTTS2 via auralis).
"""

import io
import os
import random
import tempfile
import time

import librosa
import numpy as np
import streamlit as st
import torch
import whisper
from audio_recorder_streamlit import audio_recorder
from auralis import TTS, TTSRequest
from gtts import gTTS
from pydub import AudioSegment
from pydub.effects import normalize, speedup
from pydub.silence import split_on_silence

# Streamlit Page Config and CSS omitted for brevity — use your existing styles


@st.cache_resource
def load_whisper_model():
    """Load the Whisper ASR model once per Streamlit session."""
    return whisper.load_model("base")


@st.cache_resource
def load_xtts_model():
    """Load the XTTS2 voice-cloning model once per Streamlit session."""
    return TTS().from_pretrained("AstraMindAI/xtts2-gpt")


whisper_model = load_whisper_model()
xtts_model = load_xtts_model()


def create_tts(text):
    """Synthesize `text` with gTTS and return an in-memory MP3 buffer.

    Returns a BytesIO positioned at the start, ready for st.audio().
    """
    tts = gTTS(text, lang='en', slow=False)
    audio_buffer = io.BytesIO()
    tts.write_to_fp(audio_buffer)
    audio_buffer.seek(0)
    return audio_buffer


@st.cache_data
def preprocess_audio(file_obj):
    """Normalize and silence-trim the uploaded audio; return a 16 kHz mono WAV path.

    The temp file is created with delete=False so the path outlives this
    function; downstream steps (Whisper, librosa) read it by path.
    NOTE(review): nothing visible here deletes the temp file afterwards —
    callers should os.remove() it when done.
    """
    audio = AudioSegment.from_file(file_obj)
    audio = normalize(audio)
    audio = audio.strip_silence(silence_thresh=-40, silence_len=500)
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        # -ar 16000 -ac 1: resample to 16 kHz mono, Whisper's expected input
        audio.export(tmp.name, format='wav', parameters=['-ar', '16000', '-ac', '1'])
        audio_path = tmp.name
    return audio_path


def transcribe_audio(audio_path):
    """Run Whisper on the file; return (text, segments, average confidence).

    Whisper segments do not normally expose a 'confidence' key, so each
    segment falls back to a neutral 0.5; the average is 0.5 when there are
    no segments at all (empty/silent audio).
    """
    result = whisper_model.transcribe(audio_path, word_timestamps=True)
    text = result["text"]
    segments = result["segments"]
    confidences = [seg.get('confidence', 0.5) for seg in segments]
    avg_confidence = sum(confidences) / len(confidences) if confidences else 0.5
    return text, segments, avg_confidence


def analyze_prosody(audio_path, transcript, segments, confidence):
    """Measure pitch, speaking pace, and pause ratio of the recording.

    Returns a dict with:
      pitch_mean  — mean voiced pitch in Hz (150 fallback if none detected)
      pace_wpm    — words per minute over the full duration
      pause_ratio — fraction of the signal that is below the 20 dB threshold
      confidence  — passed-through transcription confidence
    """
    y, sr = librosa.load(audio_path, sr=16000)
    total_duration = librosa.get_duration(y=y, sr=sr)

    # Pitch: keep only bins with above-median magnitude, then voiced (>0) values.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, fmin=75, fmax=300)
    pitch_values = pitches[magnitudes > np.median(magnitudes)]
    voiced = pitch_values[pitch_values > 0]  # hoisted: was computed twice
    pitch_mean = np.mean(voiced) if len(voiced) > 0 else 150

    words = len(transcript.split())
    pace_wpm = (words / total_duration) * 60 if total_duration > 0 else 0

    # librosa.effects.split returns (start, end) in SAMPLES, so the summed
    # interval length divided by len(y) is already the voiced fraction.
    # BUG FIX: the original divided by sr a second time, which made
    # pause_ratio ~1.0 regardless of the audio.
    intervals = librosa.effects.split(y, top_db=20)
    voiced_fraction = sum(end - start for start, end in intervals) / len(y) if len(y) > 0 else 0
    pause_ratio = 1 - voiced_fraction

    return {
        'pitch_mean': pitch_mean,
        'pace_wpm': pace_wpm,
        'pause_ratio': pause_ratio,
        'confidence': confidence,
    }


def pronunciation_feedback(transcript, segments, prosody):
    """Heuristic pronunciation score in [0, 100].

    Base score is transcription confidence scaled by how close the pace is
    to 120 wpm, then penalized by variance in segment durations (uneven
    pacing). Guarded against an empty segment list (np.var([]) is NaN).
    """
    durations = [seg['end'] - seg['start'] for seg in segments]
    pace_var = np.var(durations) if durations else 0.0  # FIX: avoid NaN on empty input
    pronun_score = (prosody['confidence'] * 100) * (1 - abs(prosody['pace_wpm'] - 120) / 120)
    pronun_score = max(0, min(100, pronun_score - (pace_var * 10)))
    return pronun_score


def calculate_score(prosody, pronun_score, transcript):
    """Combine prosody (50%), pronunciation (30%) and content (20%) into 0–100."""
    pitch_score = min(100, max(0, (prosody['pitch_mean'] - 100) / 50 * 100))
    pace_score = 100 if 100 < prosody['pace_wpm'] < 150 else 70
    pause_score = 100 * (1 - prosody['pause_ratio'])
    conf_score = prosody['confidence'] * 100
    prosody_total = (pitch_score + pace_score + pause_score + conf_score) / 4
    # Content: rough proxy from word count, boosted by transcription confidence.
    content_score = min(100, len(transcript.split()) * 0.5 + (prosody['confidence'] * 50))
    total = (prosody_total * 0.5) + (pronun_score * 0.3) + (content_score * 0.2)
    return min(100, total)


def generate_voice_feedback(score, prosody, pronun_score):
    """Compose the spoken coaching message from the score and prosody metrics."""
    pace = prosody['pace_wpm']
    pauses = prosody['pause_ratio']
    pitch = prosody['pitch_mean']

    if score > 90:
        opening = "Excellent work! Your speech was outstanding."
    elif score > 80:
        opening = "Great job! You have strong communication skills."
    elif score > 60:
        opening = "Good effort! You're making solid progress."
    else:
        opening = "Nice try! Keep practicing to improve."

    feedback_parts = [opening]

    if pace < 100:
        feedback_parts.append(f"Your pace was {pace:.0f} words per minute. Try speaking faster, aiming for 120 to 140 words per minute.")
    elif pace > 160:
        feedback_parts.append(f"You spoke at {pace:.0f} words per minute, which is quite fast. Slow down to around 140 words per minute for better clarity.")
    else:
        feedback_parts.append(f"Your pace of {pace:.0f} words per minute is excellent.")

    if pauses > 0.20:
        feedback_parts.append(f"You paused {pauses:.0%} of the time. Try reducing pauses to 10 to 15 percent for smoother flow.")
    elif pauses < 0.05:
        feedback_parts.append("Consider adding brief pauses between ideas for better comprehension.")
    else:
        feedback_parts.append("Your use of pauses is well balanced.")

    if pronun_score < 80:
        feedback_parts.append("Work on clearer pronunciation by practicing tongue twisters and speaking more slowly.")
    else:
        feedback_parts.append("Your pronunciation is clear and articulate.")

    feedback_parts.append("I've prepared an enhanced version of your speech with optimized pacing. Keep practicing!")
    return " ".join(feedback_parts)


def generate_cloned_voice_xtts(audio_path, cleaned_text):
    """Re-speak `cleaned_text` in the user's own voice via XTTS2.

    Uses the original recording as the speaker reference. Returns the path
    to the generated WAV file (caller is responsible for cleanup).
    """
    request = TTSRequest(
        text=cleaned_text,
        speaker_files=[audio_path],
        language="en",
    )
    out = xtts_model.generate_speech(request)
    # FIX: tempfile.mktemp is deprecated and race-prone; create the file
    # atomically and keep it on disk for the save() call.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    out.save(output_path)
    return output_path


# Footer. NOTE(review): the original source was truncated mid-call here
# (`st.markdown('` with unterminated string) — reconstructed as a plain
# markdown footer; restore your original HTML/CSS wrapper if it differed.
st.markdown(
    "🎤 FLUENTRA AI - Voice-Activated Speech Coach\n\n"
    "Powered by Whisper AI, Librosa & Google TTS | © 2025"
)