Spaces:

Khushi1612
/

FinalYearProject

Build error

File size: 10,046 Bytes

894a53d

import streamlit as st
import whisper
import librosa
import numpy as np
from pydub import AudioSegment
from pydub.effects import normalize, speedup
from pydub.silence import split_on_silence
import tempfile
import os
from gtts import gTTS
import io
from audio_recorder_streamlit import audio_recorder
import torch
from auralis import TTS, TTSRequest
import random
import time

# Streamlit Page Config and CSS omitted for brevity — use your existing styles

# Load Whisper model once
@st.cache_resource
def load_whisper_model():
    return whisper.load_model("base")

# Load Hugging Face XTTS2 voice cloning model once
@st.cache_resource
def load_xtts_model():
    return TTS().from_pretrained("AstraMindAI/xtts2-gpt")

whisper_model = load_whisper_model()
xtts_model = load_xtts_model()

def create_tts(text):
    """Create TTS audio using gTTS."""
    tts = gTTS(text, lang='en', slow=False)
    audio_buffer = io.BytesIO()
    tts.write_to_fp(audio_buffer)
    audio_buffer.seek(0)
    return audio_buffer

@st.cache_data
def preprocess_audio(file_obj):
    audio = AudioSegment.from_file(file_obj)
    audio = normalize(audio)
    audio = audio.strip_silence(silence_thresh=-40, silence_len=500)
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        audio.export(tmp.name, format='wav', parameters=['-ar', '16000', '-ac', '1'])
        audio_path = tmp.name
    return audio_path

def transcribe_audio(audio_path):
    result = whisper_model.transcribe(audio_path, word_timestamps=True)
    text = result["text"]
    segments = result["segments"]
    confidences = [seg.get('confidence', 0.5) for seg in segments]
    avg_confidence = sum(confidences) / len(confidences) if confidences else 0.5
    return text, segments, avg_confidence

def analyze_prosody(audio_path, transcript, segments, confidence):
    y, sr = librosa.load(audio_path, sr=16000)
    total_duration = librosa.get_duration(y=y, sr=sr)

    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, fmin=75, fmax=300)
    pitch_values = pitches[magnitudes > np.median(magnitudes)]
    pitch_mean = np.mean(pitch_values[pitch_values > 0]) if len(pitch_values[pitch_values > 0]) > 0 else 150

    words = len(transcript.split())
    pace_wpm = (words / total_duration) * 60 if total_duration > 0 else 0

    intervals = librosa.effects.split(y, top_db=20)
    pause_ratio = 1 - (sum(end - start for start, end in intervals) / len(y) / sr)

    return {
        'pitch_mean': pitch_mean,
        'pace_wpm': pace_wpm,
        'pause_ratio': pause_ratio,
        'confidence': confidence
    }

def pronunciation_feedback(transcript, segments, prosody):
    pace_var = np.var([seg['end'] - seg['start'] for seg in segments])
    pronun_score = (prosody['confidence'] * 100) * (1 - abs(prosody['pace_wpm'] - 120) / 120)
    pronun_score = max(0, min(100, pronun_score - (pace_var * 10)))
    return pronun_score

def calculate_score(prosody, pronun_score, transcript):
    pitch_score = min(100, max(0, (prosody['pitch_mean'] - 100) / 50 * 100))
    pace_score = 100 if 100 < prosody['pace_wpm'] < 150 else 70
    pause_score = 100 * (1 - prosody['pause_ratio'])
    conf_score = prosody['confidence'] * 100
    prosody_total = (pitch_score + pace_score + pause_score + conf_score) / 4

    content_score = min(100, len(transcript.split()) * 0.5 + (prosody['confidence'] * 50))

    total = (prosody_total * 0.5) + (pronun_score * 0.3) + (content_score * 0.2)
    return min(100, total)

def generate_voice_feedback(score, prosody, pronun_score):
    pace = prosody['pace_wpm']
    pauses = prosody['pause_ratio']
    pitch = prosody['pitch_mean']

    if score > 90:
        opening = "Excellent work! Your speech was outstanding."
    elif score > 80:
        opening = "Great job! You have strong communication skills."
    elif score > 60:
        opening = "Good effort! You're making solid progress."
    else:
        opening = "Nice try! Keep practicing to improve."

    feedback_parts = [opening]

    if pace < 100:
        feedback_parts.append(f"Your pace was {pace:.0f} words per minute. Try speaking faster, aiming for 120 to 140 words per minute.")
    elif pace > 160:
        feedback_parts.append(f"You spoke at {pace:.0f} words per minute, which is quite fast. Slow down to around 140 words per minute for better clarity.")
    else:
        feedback_parts.append(f"Your pace of {pace:.0f} words per minute is excellent.")

    if pauses > 0.20:
        feedback_parts.append(f"You paused {pauses:.0%} of the time. Try reducing pauses to 10 to 15 percent for smoother flow.")
    elif pauses < 0.05:
        feedback_parts.append("Consider adding brief pauses between ideas for better comprehension.")
    else:
        feedback_parts.append("Your use of pauses is well balanced.")

    if pronun_score < 80:
        feedback_parts.append("Work on clearer pronunciation by practicing tongue twisters and speaking more slowly.")
    else:
        feedback_parts.append("Your pronunciation is clear and articulate.")

    feedback_parts.append("I've prepared an enhanced version of your speech with optimized pacing. Keep practicing!")
    return " ".join(feedback_parts)

def generate_cloned_voice_xtts(audio_path, cleaned_text):
    request = TTSRequest(
        text=cleaned_text,
        speaker_files=[audio_path],
        language="en"
    )
    out = xtts_model.generate_speech(request)
    output_path = tempfile.mktemp(suffix=".wav")
    out.save(output_path)
    return output_path


st.markdown('<div class="main-header"><h1><span class="status-indicator"></span>🎤 FLUENTRA AI</h1><h3>Your Voice-Activated Speech Coach</h3></div>', unsafe_allow_html=True)

if not st.session_state.get('greeted', False):
    greeting_text = "Hello! I am Fluentra, your personal speech coach. Click the microphone button and speak for 20 to 60 seconds. I will analyze your speech and help you improve."
    st.markdown(f'<div class="voice-message">🔊 {greeting_text}</div>', unsafe_allow_html=True)
    greeting_audio = create_tts(greeting_text)
    st.audio(greeting_audio, format="audio/mp3")
    st.session_state['greeted'] = True

st.markdown("---")
st.subheader("🎙️ Ready to Record")
audio_bytes = audio_recorder(
    text="Click to Start Recording",
    recording_color="#00f7ff",
    neutral_color="#4a5568",
    icon_size="3x",
    pause_threshold=2.0
)

if audio_bytes:
    st.success("✅ Recording captured!")
    st.audio(audio_bytes, format="audio/wav")

    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
        tmp.write(audio_bytes)
        recorded_path = tmp.name

    processing_msg = "Processing your speech. Please wait."
    st.markdown(f'<div class="voice-message">🔊 {processing_msg}</div>', unsafe_allow_html=True)
    processing_audio = create_tts(processing_msg)
    st.audio(processing_audio, format="audio/mp3", autoplay=True)

    with st.spinner("🧠 Analyzing..."):
        audio_path = preprocess_audio(recorded_path)
        transcript, segments, confidence = transcribe_audio(audio_path)
        prosody = analyze_prosody(audio_path, transcript, segments, confidence)
        pronun_score = pronunciation_feedback(transcript, segments, prosody)
        score = calculate_score(prosody, pronun_score, transcript)

        feedback_text = generate_voice_feedback(score, prosody, pronun_score)
        feedback_audio = create_tts(feedback_text)

        cleaned_text = " ".join([w for w in transcript.split() if w.lower() not in {"um", "uh", "like", "you know", "er", "ah", "so", "well"}])
        cloned_voice_path = generate_cloned_voice_xtts(audio_path, cleaned_text)

    st.session_state['analysis_count'] = st.session_state.get('analysis_count', 0) + 1

    st.markdown("---")
    st.subheader("💬 Fluentra's Feedback")
    st.markdown(f'<div class="voice-message">🔊 {feedback_text}</div>', unsafe_allow_html=True)
    st.audio(feedback_audio, format="audio/mp3")

    st.markdown("---")
    st.subheader("📊 Analysis Results")
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.markdown(f'<div class="metric-card"><h3>Overall Score</h3><h1>{score:.1f}/100</h1></div>', unsafe_allow_html=True)
    with col2:
        st.markdown(f'<div class="metric-card"><h3>Pace</h3><h1>{prosody["pace_wpm"]:.0f} WPM</h1></div>', unsafe_allow_html=True)
    with col3:
        st.markdown(f'<div class="metric-card"><h3>Pitch</h3><h1>{prosody["pitch_mean"]:.0f} Hz</h1></div>', unsafe_allow_html=True)
    with col4:
        st.markdown(f'<div class="metric-card"><h3>Confidence</h3><h1>{confidence:.0%}</h1></div>', unsafe_allow_html=True)

    st.markdown("---")
    st.subheader("✨ Your Enhanced Voice")
    enhanced_msg = "Here is your speech with fillers removed and pace optimized."
    st.markdown(f'<div class="voice-message">🔊 {enhanced_msg}</div>', unsafe_allow_html=True)
    st.audio(cloned_voice_path, format="audio/wav")

    st.markdown("---")
    with st.expander("📝 View Transcription"):
        st.info(transcript)

    if st.session_state['analysis_count'] == 1:
        closing = "Great start! Feel free to record again to track your improvement."
    else:
        closing = f"This is your {st.session_state['analysis_count']}th analysis. You're making progress!"

    st.markdown(f'<div class="voice-message">🔊 {closing}</div>', unsafe_allow_html=True)
    closing_audio = create_tts(closing)
    st.audio(closing_audio, format="audio/mp3")

    os.unlink(audio_path)
    os.unlink(recorded_path)

# Footer
st.markdown("---")
st.markdown("""

<div style='text-align: center; color: #00f7ff; padding: 2rem;'>

    <p>🎤 <strong>FLUENTRA AI</strong> - Voice-Activated Speech Coach</p>

    <p>Powered by Whisper AI, Librosa & Google TTS | © 2025</p>

</div>

""", unsafe_allow_html=True)