Khushi1612 commited on
Commit
894a53d
Β·
verified Β·
1 Parent(s): 9c54e1a

Upload appg.py

Browse files
Files changed (1) hide show
  1. appg.py +241 -0
appg.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import whisper
3
+ import librosa
4
+ import numpy as np
5
+ from pydub import AudioSegment
6
+ from pydub.effects import normalize, speedup
7
+ from pydub.silence import split_on_silence
8
+ import tempfile
9
+ import os
10
+ from gtts import gTTS
11
+ import io
12
+ from audio_recorder_streamlit import audio_recorder
13
+ import torch
14
+ from auralis import TTS, TTSRequest
15
+ import random
16
+ import time
17
+
18
+ # Streamlit Page Config and CSS omitted for brevity β€” use your existing styles
19
+
20
# Load Whisper model once
@st.cache_resource
def load_whisper_model():
    """Load and cache the Whisper "base" speech-to-text model for the app's lifetime."""
    model = whisper.load_model("base")
    return model
24
+
25
# Load Hugging Face XTTS2 voice cloning model once
@st.cache_resource
def load_xtts_model():
    """Load and cache the AstraMindAI XTTS2 voice-cloning model."""
    engine = TTS()
    return engine.from_pretrained("AstraMindAI/xtts2-gpt")
29
+
30
# Materialize both cached models at import time so the first recording does
# not pay the model-load cost inside the analysis path.
whisper_model = load_whisper_model()
xtts_model = load_xtts_model()
32
+
33
def create_tts(text):
    """Synthesize *text* to speech with Google TTS.

    Returns an in-memory MP3 as a BytesIO rewound to position 0, ready to be
    handed straight to st.audio().
    """
    buffer = io.BytesIO()
    gTTS(text, lang='en', slow=False).write_to_fp(buffer)
    buffer.seek(0)
    return buffer
40
+
41
def preprocess_audio(file_obj):
    """Normalize and trim a recording, exporting a 16 kHz mono WAV for analysis.

    Loads *file_obj* with pydub, normalizes loudness, strips leading/trailing
    silence (-40 dBFS threshold, 500 ms minimum), and writes the result to a
    fresh temp file.

    Returns the temp WAV path; the caller owns the file and deletes it later.

    Bug fix: this function was wrapped in @st.cache_data, but it returns a
    temp-file *path* that the caller removes with os.unlink().  A cache hit on
    a repeated input would then return a path to a file that no longer exists,
    so the cache decorator was removed.
    """
    audio = AudioSegment.from_file(file_obj)
    audio = normalize(audio)
    audio = audio.strip_silence(silence_thresh=-40, silence_len=500)
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        # 16 kHz mono matches what Whisper and librosa expect downstream.
        audio.export(tmp.name, format='wav', parameters=['-ar', '16000', '-ac', '1'])
        audio_path = tmp.name
    return audio_path
50
+
51
def transcribe_audio(audio_path):
    """Transcribe a WAV file with Whisper and estimate a confidence score.

    Returns (text, segments, avg_confidence) with avg_confidence in [0, 1].

    Bug fix: Whisper segments do not contain a 'confidence' key (they expose
    'avg_logprob'), so the old seg.get('confidence', 0.5) always returned the
    0.5 default and the reported confidence was constant.  We now derive a
    per-segment confidence as exp(avg_logprob) clamped to [0, 1], falling back
    to 0.5 only when neither field is present.
    """
    import math  # local import keeps the module's top-level import block untouched

    result = whisper_model.transcribe(audio_path, word_timestamps=True)
    text = result["text"]
    segments = result["segments"]
    confidences = []
    for seg in segments:
        if 'confidence' in seg:
            confidences.append(seg['confidence'])
        elif 'avg_logprob' in seg:
            confidences.append(min(1.0, max(0.0, math.exp(seg['avg_logprob']))))
        else:
            confidences.append(0.5)
    avg_confidence = sum(confidences) / len(confidences) if confidences else 0.5
    return text, segments, avg_confidence
58
+
59
def analyze_prosody(audio_path, transcript, segments, confidence):
    """Extract prosodic features (pitch, pace, pauses) from the recording.

    Returns a dict with 'pitch_mean' (Hz), 'pace_wpm', 'pause_ratio' (fraction
    of non-speech time, 0-1) and the pass-through 'confidence'.
    """
    y, sr = librosa.load(audio_path, sr=16000)
    total_duration = librosa.get_duration(y=y, sr=sr)

    # Pitch: keep only confidently tracked frames (magnitude above median)
    # within a typical speech range of 75-300 Hz; default to 150 Hz if none.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, fmin=75, fmax=300)
    pitch_values = pitches[magnitudes > np.median(magnitudes)]
    voiced_pitches = pitch_values[pitch_values > 0]
    pitch_mean = np.mean(voiced_pitches) if len(voiced_pitches) > 0 else 150

    words = len(transcript.split())
    pace_wpm = (words / total_duration) * 60 if total_duration > 0 else 0

    # librosa.effects.split returns voiced intervals as SAMPLE indices, so the
    # voiced fraction is simply voiced_samples / total_samples.
    # Bug fix: the old code divided that fraction by sr a second time
    # (sum/len(y)/sr), which made pause_ratio ~= 1.0 for every recording.
    intervals = librosa.effects.split(y, top_db=20)
    voiced_samples = sum(end - start for start, end in intervals)
    pause_ratio = 1 - (voiced_samples / len(y)) if len(y) > 0 else 0.0

    return {
        'pitch_mean': pitch_mean,
        'pace_wpm': pace_wpm,
        'pause_ratio': pause_ratio,
        'confidence': confidence
    }
79
+
80
def pronunciation_feedback(transcript, segments, prosody):
    """Score pronunciation clarity on a 0-100 scale.

    Combines the transcription confidence with a penalty for deviating from a
    120 WPM target pace, then subtracts a term proportional to the variance of
    segment durations (jerky segment lengths suggest unclear articulation).
    """
    durations = [seg['end'] - seg['start'] for seg in segments]
    duration_variance = np.var(durations)
    pace_penalty = 1 - abs(prosody['pace_wpm'] - 120) / 120
    raw_score = prosody['confidence'] * 100 * pace_penalty
    clamped = min(100, raw_score - duration_variance * 10)
    return max(0, clamped)
85
+
86
def calculate_score(prosody, pronun_score, transcript):
    """Blend prosody (50%), pronunciation (30%) and content length (20%)
    into a single overall score capped at 100."""
    pitch = prosody['pitch_mean']
    pace = prosody['pace_wpm']
    confidence = prosody['confidence']

    # Each prosody component is mapped onto its own 0-100 scale.
    pitch_score = min(100, max(0, (pitch - 100) / 50 * 100))
    pace_score = 100 if 100 < pace < 150 else 70
    pause_score = 100 * (1 - prosody['pause_ratio'])
    conf_score = confidence * 100
    prosody_total = (pitch_score + pace_score + pause_score + conf_score) / 4

    # Longer transcripts earn more content credit, boosted by confidence.
    word_count = len(transcript.split())
    content_score = min(100, word_count * 0.5 + confidence * 50)

    weighted = prosody_total * 0.5 + pronun_score * 0.3 + content_score * 0.2
    return min(100, weighted)
97
+
98
def generate_voice_feedback(score, prosody, pronun_score):
    """Compose the spoken coaching message for one analysis run.

    Builds an opening line from the overall score band, then appends advice on
    pace, pausing and pronunciation, and returns one space-joined paragraph.
    """
    pace = prosody['pace_wpm']
    pauses = prosody['pause_ratio']
    pitch = prosody['pitch_mean']  # read for parity with the pipeline; not used below

    # Opening line keyed off the overall score band.
    if score > 90:
        parts = ["Excellent work! Your speech was outstanding."]
    elif score > 80:
        parts = ["Great job! You have strong communication skills."]
    elif score > 60:
        parts = ["Good effort! You're making solid progress."]
    else:
        parts = ["Nice try! Keep practicing to improve."]

    # Pace advice: target band is roughly 100-160 WPM.
    if pace < 100:
        parts.append(f"Your pace was {pace:.0f} words per minute. Try speaking faster, aiming for 120 to 140 words per minute.")
    elif pace > 160:
        parts.append(f"You spoke at {pace:.0f} words per minute, which is quite fast. Slow down to around 140 words per minute for better clarity.")
    else:
        parts.append(f"Your pace of {pace:.0f} words per minute is excellent.")

    # Pause advice: aim for a 5-20% pause ratio.
    if pauses > 0.20:
        parts.append(f"You paused {pauses:.0%} of the time. Try reducing pauses to 10 to 15 percent for smoother flow.")
    elif pauses < 0.05:
        parts.append("Consider adding brief pauses between ideas for better comprehension.")
    else:
        parts.append("Your use of pauses is well balanced.")

    # Pronunciation advice.
    if pronun_score < 80:
        parts.append("Work on clearer pronunciation by practicing tongue twisters and speaking more slowly.")
    else:
        parts.append("Your pronunciation is clear and articulate.")

    parts.append("I've prepared an enhanced version of your speech with optimized pacing. Keep practicing!")
    return " ".join(parts)
135
+
136
def generate_cloned_voice_xtts(audio_path, cleaned_text):
    """Re-synthesize *cleaned_text* in the speaker's own voice via XTTS2.

    Uses the recording at *audio_path* as the voice-cloning reference and
    returns the path of a temp WAV containing the generated speech.  The
    caller owns (and should eventually delete) the returned file.
    """
    request = TTSRequest(
        text=cleaned_text,
        speaker_files=[audio_path],
        language="en"
    )
    out = xtts_model.generate_speech(request)
    # Bug fix: tempfile.mktemp() is deprecated and race-prone (another process
    # can claim the name before we write it); reserve the file atomically and
    # keep it on disk with delete=False so out.save() can overwrite it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    out.save(output_path)
    return output_path
146
+
147
+
148
# ---------------------------------------------------------------------------
# Streamlit page flow: greeting -> recorder -> analysis pipeline -> results.
# ---------------------------------------------------------------------------

st.markdown('<div class="main-header"><h1><span class="status-indicator"></span>🎤 FLUENTRA AI</h1><h3>Your Voice-Activated Speech Coach</h3></div>', unsafe_allow_html=True)

# One-time spoken greeting; the 'greeted' flag survives Streamlit reruns.
if not st.session_state.get('greeted', False):
    greeting_text = "Hello! I am Fluentra, your personal speech coach. Click the microphone button and speak for 20 to 60 seconds. I will analyze your speech and help you improve."
    st.markdown(f'<div class="voice-message">🔊 {greeting_text}</div>', unsafe_allow_html=True)
    greeting_audio = create_tts(greeting_text)
    st.audio(greeting_audio, format="audio/mp3")
    st.session_state['greeted'] = True

st.markdown("---")
st.subheader("🎙️ Ready to Record")
audio_bytes = audio_recorder(
    text="Click to Start Recording",
    recording_color="#00f7ff",
    neutral_color="#4a5568",
    icon_size="3x",
    pause_threshold=2.0
)

if audio_bytes:
    st.success("✅ Recording captured!")
    st.audio(audio_bytes, format="audio/wav")

    # Persist the raw recording so pydub/Whisper can read it from disk.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
        tmp.write(audio_bytes)
        recorded_path = tmp.name

    processing_msg = "Processing your speech. Please wait."
    st.markdown(f'<div class="voice-message">🔊 {processing_msg}</div>', unsafe_allow_html=True)
    processing_audio = create_tts(processing_msg)
    st.audio(processing_audio, format="audio/mp3", autoplay=True)

    with st.spinner("🧠 Analyzing..."):
        # Full pipeline: clean audio -> transcribe -> prosody -> scores.
        audio_path = preprocess_audio(recorded_path)
        transcript, segments, confidence = transcribe_audio(audio_path)
        prosody = analyze_prosody(audio_path, transcript, segments, confidence)
        pronun_score = pronunciation_feedback(transcript, segments, prosody)
        score = calculate_score(prosody, pronun_score, transcript)

        feedback_text = generate_voice_feedback(score, prosody, pronun_score)
        feedback_audio = create_tts(feedback_text)

        # Strip filler words, then re-synthesize in the user's own voice.
        cleaned_text = " ".join([w for w in transcript.split() if w.lower() not in {"um", "uh", "like", "you know", "er", "ah", "so", "well"}])
        cloned_voice_path = generate_cloned_voice_xtts(audio_path, cleaned_text)

    st.session_state['analysis_count'] = st.session_state.get('analysis_count', 0) + 1

    st.markdown("---")
    st.subheader("💬 Fluentra's Feedback")
    st.markdown(f'<div class="voice-message">🔊 {feedback_text}</div>', unsafe_allow_html=True)
    st.audio(feedback_audio, format="audio/mp3")

    st.markdown("---")
    st.subheader("📊 Analysis Results")
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.markdown(f'<div class="metric-card"><h3>Overall Score</h3><h1>{score:.1f}/100</h1></div>', unsafe_allow_html=True)
    with col2:
        st.markdown(f'<div class="metric-card"><h3>Pace</h3><h1>{prosody["pace_wpm"]:.0f} WPM</h1></div>', unsafe_allow_html=True)
    with col3:
        st.markdown(f'<div class="metric-card"><h3>Pitch</h3><h1>{prosody["pitch_mean"]:.0f} Hz</h1></div>', unsafe_allow_html=True)
    with col4:
        st.markdown(f'<div class="metric-card"><h3>Confidence</h3><h1>{confidence:.0%}</h1></div>', unsafe_allow_html=True)

    st.markdown("---")
    st.subheader("✨ Your Enhanced Voice")
    enhanced_msg = "Here is your speech with fillers removed and pace optimized."
    st.markdown(f'<div class="voice-message">🔊 {enhanced_msg}</div>', unsafe_allow_html=True)
    st.audio(cloned_voice_path, format="audio/wav")

    st.markdown("---")
    with st.expander("📝 View Transcription"):
        st.info(transcript)

    if st.session_state['analysis_count'] == 1:
        closing = "Great start! Feel free to record again to track your improvement."
    else:
        count = st.session_state['analysis_count']
        # Bug fix: the message previously always said "{count}th", producing
        # "2th"/"3th"; pick the correct English ordinal suffix instead.
        if 10 <= count % 100 <= 20:
            suffix = "th"
        else:
            suffix = {1: "st", 2: "nd", 3: "rd"}.get(count % 10, "th")
        closing = f"This is your {count}{suffix} analysis. You're making progress!"

    st.markdown(f'<div class="voice-message">🔊 {closing}</div>', unsafe_allow_html=True)
    closing_audio = create_tts(closing)
    st.audio(closing_audio, format="audio/mp3")

    # Clean up the temp files created for this analysis run.
    os.unlink(audio_path)
    os.unlink(recorded_path)

# Footer
st.markdown("---")
st.markdown("""
<div style='text-align: center; color: #00f7ff; padding: 2rem;'>
    <p>🎤 <strong>FLUENTRA AI</strong> - Voice-Activated Speech Coach</p>
    <p>Powered by Whisper AI, Librosa & Google TTS | © 2025</p>
</div>
""", unsafe_allow_html=True)