Spaces:
Build error
Build error
Upload appg.py
Browse files
appg.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import whisper
|
| 3 |
+
import librosa
|
| 4 |
+
import numpy as np
|
| 5 |
+
from pydub import AudioSegment
|
| 6 |
+
from pydub.effects import normalize, speedup
|
| 7 |
+
from pydub.silence import split_on_silence
|
| 8 |
+
import tempfile
|
| 9 |
+
import os
|
| 10 |
+
from gtts import gTTS
|
| 11 |
+
import io
|
| 12 |
+
from audio_recorder_streamlit import audio_recorder
|
| 13 |
+
import torch
|
| 14 |
+
from auralis import TTS, TTSRequest
|
| 15 |
+
import random
|
| 16 |
+
import time
|
| 17 |
+
|
| 18 |
+
# Streamlit Page Config and CSS omitted for brevity β use your existing styles
|
| 19 |
+
|
| 20 |
+
# Load Whisper model once
|
| 21 |
+
@st.cache_resource
def load_whisper_model(model_name: str = "base"):
    """Load and cache an OpenAI Whisper model.

    Args:
        model_name: Whisper checkpoint name (default "base", matching the
            original hard-coded value; parameterized for reuse).

    Returns:
        The loaded Whisper model (cached once per Streamlit session/resource).
    """
    return whisper.load_model(model_name)
|
| 24 |
+
|
| 25 |
+
# Load Hugging Face XTTS2 voice cloning model once
|
| 26 |
+
@st.cache_resource
def load_xtts_model(repo_id: str = "AstraMindAI/xtts2-gpt"):
    """Load and cache the Auralis XTTS2 voice-cloning model.

    Args:
        repo_id: Hugging Face repo id (default matches the original
            hard-coded value; parameterized for reuse).

    Returns:
        The loaded TTS model (cached once via st.cache_resource).
    """
    return TTS().from_pretrained(repo_id)
|
| 29 |
+
|
| 30 |
+
# Instantiate the cached models at import time so the first user interaction
# does not pay the model-load cost inside a callback. Both names are read by
# transcribe_audio() and generate_cloned_voice_xtts() below.
whisper_model = load_whisper_model()
xtts_model = load_xtts_model()
|
| 32 |
+
|
| 33 |
+
def create_tts(text, lang='en'):
    """Synthesize *text* with gTTS and return an in-memory MP3 buffer.

    Args:
        text: the text to speak.
        lang: gTTS language code (default 'en', preserving the original
            hard-coded behavior; parameterized for reuse).

    Returns:
        io.BytesIO positioned at the start of the MP3 data, ready for st.audio.
    """
    tts = gTTS(text, lang=lang, slow=False)
    audio_buffer = io.BytesIO()
    tts.write_to_fp(audio_buffer)
    audio_buffer.seek(0)  # rewind so the consumer reads from the beginning
    return audio_buffer
|
| 40 |
+
|
| 41 |
+
def preprocess_audio(file_obj):
    """Normalize a recording, strip silence, and resample to 16 kHz mono WAV.

    Args:
        file_obj: path or file-like object readable by pydub.

    Returns:
        Path to a new temporary WAV file; the CALLER is responsible for
        deleting it when done.
    """
    # BUG FIX: deliberately NOT decorated with @st.cache_data. The function
    # returns a temp-file path that the caller later os.unlink()s, so a cache
    # hit would hand back a dangling path to a deleted file; file-like
    # arguments are also not reliably hashable for the cache key.
    audio = AudioSegment.from_file(file_obj)
    audio = normalize(audio)
    audio = audio.strip_silence(silence_thresh=-40, silence_len=500)
    # Reserve a temp path, then close our handle so ffmpeg (via pydub) can
    # reopen it by name — required on Windows, harmless elsewhere.
    tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
    tmp.close()
    audio.export(tmp.name, format='wav', parameters=['-ar', '16000', '-ac', '1'])
    return tmp.name
|
| 50 |
+
|
| 51 |
+
def transcribe_audio(audio_path):
    """Transcribe a WAV file with Whisper.

    Args:
        audio_path: path to an audio file readable by Whisper.

    Returns:
        (text, segments, avg_confidence) where avg_confidence is in [0, 1].
    """
    result = whisper_model.transcribe(audio_path, word_timestamps=True)
    text = result["text"]
    segments = result["segments"]
    # BUG FIX: openai-whisper segments carry 'avg_logprob', not 'confidence',
    # so the original seg.get('confidence', 0.5) ALWAYS fell back to 0.5.
    # Derive a pseudo-confidence from the mean token log-probability instead,
    # keeping 'confidence' as a preferred key in case a fork provides it.
    confidences = []
    for seg in segments:
        if 'confidence' in seg:
            confidences.append(seg['confidence'])
        elif 'avg_logprob' in seg:
            # exp(avg logprob) maps mean token probability into (0, 1].
            confidences.append(float(min(1.0, max(0.0, np.exp(seg['avg_logprob'])))))
        else:
            confidences.append(0.5)
    avg_confidence = sum(confidences) / len(confidences) if confidences else 0.5
    return text, segments, avg_confidence
|
| 58 |
+
|
| 59 |
+
def analyze_prosody(audio_path, transcript, segments, confidence):
    """Extract pitch, pace, and pause statistics from the recording.

    Args:
        audio_path: path to a WAV file (loaded at 16 kHz mono).
        transcript: transcript text, used only for the word count.
        segments: Whisper segments (unused here; kept for interface stability).
        confidence: average transcription confidence, passed through.

    Returns:
        dict with 'pitch_mean' (Hz), 'pace_wpm', 'pause_ratio' in [0, 1],
        and 'confidence'.
    """
    y, sr = librosa.load(audio_path, sr=16000)
    total_duration = librosa.get_duration(y=y, sr=sr)

    # Mean voiced pitch over frames whose magnitude beats the median;
    # fall back to 150 Hz when nothing voiced is detected.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, fmin=75, fmax=300)
    pitch_values = pitches[magnitudes > np.median(magnitudes)]
    voiced = pitch_values[pitch_values > 0]
    pitch_mean = float(np.mean(voiced)) if voiced.size > 0 else 150

    words = len(transcript.split())
    pace_wpm = (words / total_duration) * 60 if total_duration > 0 else 0

    # BUG FIX: the original computed sum/len(y)/sr — dividing the voiced
    # FRACTION by the sample rate a second time, which drove pause_ratio to
    # ~1.0 for any input. The sample counts already cancel without sr.
    intervals = librosa.effects.split(y, top_db=20)
    voiced_samples = sum(end - start for start, end in intervals)
    pause_ratio = 1 - (voiced_samples / len(y)) if len(y) > 0 else 0.0

    return {
        'pitch_mean': pitch_mean,
        'pace_wpm': pace_wpm,
        'pause_ratio': pause_ratio,
        'confidence': confidence
    }
|
| 79 |
+
|
| 80 |
+
def pronunciation_feedback(transcript, segments, prosody):
    """Heuristic pronunciation score in [0, 100].

    Combines transcription confidence, distance from a 120-WPM target pace,
    and a penalty for uneven segment durations. `transcript` is accepted for
    interface stability but not used in the computation.
    """
    durations = [s['end'] - s['start'] for s in segments]
    duration_variance = np.var(durations)
    # Penalize pace proportionally to its distance from the 120-WPM sweet spot.
    pace_penalty = abs(prosody['pace_wpm'] - 120) / 120
    raw = prosody['confidence'] * 100 * (1 - pace_penalty)
    raw = raw - duration_variance * 10
    return max(0, min(100, raw))
|
| 85 |
+
|
| 86 |
+
def calculate_score(prosody, pronun_score, transcript):
    """Blend prosody, pronunciation, and content heuristics into a 0-100 score.

    Weights: prosody 50%, pronunciation 30%, content 20%; capped at 100.
    """
    pitch = prosody['pitch_mean']
    pace = prosody['pace_wpm']
    sub_scores = [
        min(100, max(0, (pitch - 100) / 50 * 100)),  # full marks at >= 150 Hz
        100 if 100 < pace < 150 else 70,             # pace sweet spot
        100 * (1 - prosody['pause_ratio']),          # fewer pauses -> higher
        prosody['confidence'] * 100,                 # ASR confidence
    ]
    prosody_total = (sub_scores[0] + sub_scores[1] + sub_scores[2] + sub_scores[3]) / 4

    word_count = len(transcript.split())
    content_score = min(100, word_count * 0.5 + (prosody['confidence'] * 50))

    weighted = (prosody_total * 0.5) + (pronun_score * 0.3) + (content_score * 0.2)
    return min(100, weighted)
|
| 97 |
+
|
| 98 |
+
def generate_voice_feedback(score, prosody, pronun_score):
    """Compose a spoken-feedback paragraph from the analysis results.

    Args:
        score: overall 0-100 score from calculate_score.
        prosody: dict with 'pace_wpm' and 'pause_ratio' (read here).
        pronun_score: 0-100 pronunciation score.

    Returns:
        A single string of sentences suitable for TTS playback.
    """
    pace = prosody['pace_wpm']
    pauses = prosody['pause_ratio']
    # (removed the original's unused local read of prosody['pitch_mean'])

    # Opening line keyed to the overall score band.
    if score > 90:
        opening = "Excellent work! Your speech was outstanding."
    elif score > 80:
        opening = "Great job! You have strong communication skills."
    elif score > 60:
        opening = "Good effort! You're making solid progress."
    else:
        opening = "Nice try! Keep practicing to improve."

    feedback_parts = [opening]

    # Pace advice: 100-160 WPM is treated as the acceptable band.
    if pace < 100:
        feedback_parts.append(f"Your pace was {pace:.0f} words per minute. Try speaking faster, aiming for 120 to 140 words per minute.")
    elif pace > 160:
        feedback_parts.append(f"You spoke at {pace:.0f} words per minute, which is quite fast. Slow down to around 140 words per minute for better clarity.")
    else:
        feedback_parts.append(f"Your pace of {pace:.0f} words per minute is excellent.")

    # Pause advice: 5-20% silence is treated as balanced.
    if pauses > 0.20:
        feedback_parts.append(f"You paused {pauses:.0%} of the time. Try reducing pauses to 10 to 15 percent for smoother flow.")
    elif pauses < 0.05:
        feedback_parts.append("Consider adding brief pauses between ideas for better comprehension.")
    else:
        feedback_parts.append("Your use of pauses is well balanced.")

    if pronun_score < 80:
        feedback_parts.append("Work on clearer pronunciation by practicing tongue twisters and speaking more slowly.")
    else:
        feedback_parts.append("Your pronunciation is clear and articulate.")

    feedback_parts.append("I've prepared an enhanced version of your speech with optimized pacing. Keep practicing!")
    return " ".join(feedback_parts)
|
| 135 |
+
|
| 136 |
+
def generate_cloned_voice_xtts(audio_path, cleaned_text):
    """Re-synthesize *cleaned_text* in the speaker's own voice via XTTS2.

    Args:
        audio_path: path to the reference recording used for voice cloning.
        cleaned_text: the text to speak (e.g. transcript with fillers removed).

    Returns:
        Path to a temporary WAV file with the cloned speech; the caller is
        responsible for deleting it.
    """
    request = TTSRequest(
        text=cleaned_text,
        speaker_files=[audio_path],
        language="en"
    )
    out = xtts_model.generate_speech(request)
    # BUG FIX: tempfile.mktemp is deprecated and racy (the name can be taken
    # between creation and use); reserve the file atomically instead.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    out.save(tmp.name)
    return tmp.name
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# App title banner.
_HEADER_HTML = (
    '<div class="main-header"><h1><span class="status-indicator"></span>'
    'π€ FLUENTRA AI</h1><h3>Your Voice-Activated Speech Coach</h3></div>'
)
st.markdown(_HEADER_HTML, unsafe_allow_html=True)

# One-time spoken greeting per browser session.
if not st.session_state.get('greeted', False):
    greeting_text = (
        "Hello! I am Fluentra, your personal speech coach. Click the microphone "
        "button and speak for 20 to 60 seconds. I will analyze your speech and "
        "help you improve."
    )
    st.markdown(f'<div class="voice-message">π {greeting_text}</div>', unsafe_allow_html=True)
    st.audio(create_tts(greeting_text), format="audio/mp3")
    st.session_state['greeted'] = True
|
| 156 |
+
|
| 157 |
+
st.markdown("---")
st.subheader("ποΈ Ready to Record")

# Capture microphone audio in the browser; `audio_bytes` is None until a
# take exists, then holds raw WAV bytes for the analysis section below.
_recorder_options = dict(
    text="Click to Start Recording",
    recording_color="#00f7ff",
    neutral_color="#4a5568",
    icon_size="3x",
    pause_threshold=2.0,
)
audio_bytes = audio_recorder(**_recorder_options)
|
| 166 |
+
|
| 167 |
+
if audio_bytes:
    # ---- Main analysis pipeline: runs once per captured recording. ----
    st.success("β Recording captured!")
    st.audio(audio_bytes, format="audio/wav")

    # Persist the raw recording so the disk-based helpers can read it.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
        tmp.write(audio_bytes)
        recorded_path = tmp.name

    processing_msg = "Processing your speech. Please wait."
    st.markdown(f'<div class="voice-message">π {processing_msg}</div>', unsafe_allow_html=True)
    processing_audio = create_tts(processing_msg)
    st.audio(processing_audio, format="audio/mp3", autoplay=True)

    audio_path = None
    try:
        with st.spinner("π§ Analyzing..."):
            # Clean audio -> transcribe -> prosody -> scores -> feedback.
            audio_path = preprocess_audio(recorded_path)
            transcript, segments, confidence = transcribe_audio(audio_path)
            prosody = analyze_prosody(audio_path, transcript, segments, confidence)
            pronun_score = pronunciation_feedback(transcript, segments, prosody)
            score = calculate_score(prosody, pronun_score, transcript)

            feedback_text = generate_voice_feedback(score, prosody, pronun_score)
            feedback_audio = create_tts(feedback_text)

            # Drop filler words before re-synthesizing in the user's voice.
            # NOTE(review): "you know" is two tokens, so this per-word test can
            # never match it — confirm whether multi-word fillers are intended.
            fillers = {"um", "uh", "like", "you know", "er", "ah", "so", "well"}
            cleaned_text = " ".join(w for w in transcript.split() if w.lower() not in fillers)
            cloned_voice_path = generate_cloned_voice_xtts(audio_path, cleaned_text)

        st.session_state['analysis_count'] = st.session_state.get('analysis_count', 0) + 1

        st.markdown("---")
        st.subheader("π¬ Fluentra's Feedback")
        st.markdown(f'<div class="voice-message">π {feedback_text}</div>', unsafe_allow_html=True)
        st.audio(feedback_audio, format="audio/mp3")

        st.markdown("---")
        st.subheader("π Analysis Results")
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.markdown(f'<div class="metric-card"><h3>Overall Score</h3><h1>{score:.1f}/100</h1></div>', unsafe_allow_html=True)
        with col2:
            st.markdown(f'<div class="metric-card"><h3>Pace</h3><h1>{prosody["pace_wpm"]:.0f} WPM</h1></div>', unsafe_allow_html=True)
        with col3:
            st.markdown(f'<div class="metric-card"><h3>Pitch</h3><h1>{prosody["pitch_mean"]:.0f} Hz</h1></div>', unsafe_allow_html=True)
        with col4:
            st.markdown(f'<div class="metric-card"><h3>Confidence</h3><h1>{confidence:.0%}</h1></div>', unsafe_allow_html=True)

        st.markdown("---")
        st.subheader("β¨ Your Enhanced Voice")
        enhanced_msg = "Here is your speech with fillers removed and pace optimized."
        st.markdown(f'<div class="voice-message">π {enhanced_msg}</div>', unsafe_allow_html=True)
        st.audio(cloned_voice_path, format="audio/wav")

        st.markdown("---")
        with st.expander("π View Transcription"):
            st.info(transcript)

        count = st.session_state['analysis_count']
        if count == 1:
            closing = "Great start! Feel free to record again to track your improvement."
        else:
            # BUG FIX: the original hard-coded "th" and produced "2th"/"3th";
            # build the correct English ordinal suffix (11th-13th special-cased).
            if 11 <= count % 100 <= 13:
                suffix = "th"
            else:
                suffix = {1: "st", 2: "nd", 3: "rd"}.get(count % 10, "th")
            closing = f"This is your {count}{suffix} analysis. You're making progress!"

        st.markdown(f'<div class="voice-message">π {closing}</div>', unsafe_allow_html=True)
        closing_audio = create_tts(closing)
        st.audio(closing_audio, format="audio/mp3")
    finally:
        # BUG FIX: clean up scratch files even when a pipeline step raises
        # (the original only unlinked on the success path).
        if audio_path is not None and os.path.exists(audio_path):
            os.unlink(audio_path)
        if os.path.exists(recorded_path):
            os.unlink(recorded_path)
|
| 233 |
+
|
| 234 |
+
# Footer
st.markdown("---")
_FOOTER_HTML = """
<div style='text-align: center; color: #00f7ff; padding: 2rem;'>
<p>π€ <strong>FLUENTRA AI</strong> - Voice-Activated Speech Coach</p>
<p>Powered by Whisper AI, Librosa & Google TTS | Β© 2025</p>
</div>
"""
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)
|