Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import time | |
| import os | |
| from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer | |
| import pandas as pd | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| import nltk | |
| from nltk.tokenize import word_tokenize | |
| import re | |
# Download necessary NLTK data.
#
# Both the legacy resource names and the names used by current NLTK releases
# are requested: newer versions look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng', older ones 'punkt' and
# 'averaged_perceptron_tagger'. Requesting a name the installed version does
# not know is reported by nltk.download() as a failed download, not raised.
nltk_data_dir = '/home/user/nltk_data'
_NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)

try:
    # Make the download more reliable by specifying the download directory.
    os.makedirs(nltk_data_dir, exist_ok=True)
    # Register the directory first so data that is already present there is
    # found even when a later download attempt fails.
    nltk.data.path.insert(0, nltk_data_dir)
    for _resource in _NLTK_RESOURCES:
        nltk.download(_resource, download_dir=nltk_data_dir)
except Exception as e:
    print(f"NLTK download issue: {e}")
    # Fallback: let NLTK choose its default location. This is best effort;
    # the analysis functions have their own non-NLTK fallbacks, so a failure
    # here must not abort application start-up.
    for _resource in _NLTK_RESOURCES:
        try:
            nltk.download(_resource)
        except Exception as fallback_error:
            print(f"NLTK fallback download failed for {_resource}: {fallback_error}")
# Load every model once at import time. MODELS_LOADED records whether the
# whole set was created so request handlers can refuse to run otherwise.
try:
    # Speech-to-text (Whisper)
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-large-v3",
    )

    # Grammar scoring (CoLA classifier built from an explicit model/tokenizer pair)
    cola_model = AutoModelForSequenceClassification.from_pretrained("textattack/roberta-base-CoLA")
    cola_tokenizer = AutoTokenizer.from_pretrained("textattack/roberta-base-CoLA")
    grammar_pipeline = pipeline(
        "text-classification",
        model=cola_model,
        tokenizer=cola_tokenizer,
    )

    # Grammar correction (T5)
    correction_pipeline = pipeline(
        "text2text-generation",
        model="vennify/t5-base-grammar-correction",
    )

    # Sentiment analysis
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
    )

    # Fluency analysis (BERT)
    fluency_pipeline = pipeline(
        "text-classification",
        model="textattack/bert-base-uncased-CoLA",
    )
except Exception as load_error:
    print(f"Error loading models: {load_error}")
    # At least one model is missing; mark the whole set as unusable.
    MODELS_LOADED = False
else:
    # Every pipeline above was constructed without raising.
    MODELS_LOADED = True
# Words and short phrases treated as fillers (all lowercase).
FILLER_WORDS = [
    "um", "uh", "like", "you know", "actually", "basically", "literally",
    "sort of", "kind of", "i mean", "so", "well", "right", "okay", "yeah",
]


def count_filler_words(text):
    """Count filler words and phrases in *text*.

    Matching is case-insensitive and anchored on word boundaries, so "so"
    does not match inside "sort".

    Returns:
        A ``(count, ratio)`` tuple: the total number of filler matches, and
        that number divided by the whitespace-separated word count (the
        divisor is at least 1, so empty text yields a ratio of 0.0).
    """
    lowered = text.lower()
    hits = sum(
        len(re.findall(r'\b' + filler + r'\b', lowered))
        for filler in FILLER_WORDS
    )
    word_total = max(len(lowered.split()), 1)
    return hits, hits / word_total
def calculate_speaking_rate(text, duration):
    """Return the speaking rate of *text* in words per minute.

    Args:
        text: Transcribed speech; words are counted by whitespace splitting.
        duration: Length of the recording in seconds.

    Returns:
        Words per minute, or 0 when *duration* is zero or negative.
    """
    if duration <= 0:
        return 0
    word_total = len(text.split())
    words_per_second = word_total / duration
    return words_per_second * 60
def analyze_vocabulary_richness(text):
    """Measure vocabulary diversity and part-of-speech usage in *text*.

    Args:
        text: The transcription to analyse.

    Returns:
        A ``(richness, pos_counts)`` tuple. ``richness`` is the number of
        distinct words divided by the total number of words (0 for text
        without words). ``pos_counts`` maps each POS tag to its frequency,
        or holds the keys ``"WORD"`` and ``"UNIQUE"`` when tagging is
        unavailable; it is empty for text without words.
    """
    lowered = text.lower()
    try:
        # Try using word_tokenize first
        tokens = word_tokenize(lowered)
    except LookupError:
        # Fallback to simple regex-based tokenization if NLTK data is missing
        tokens = re.findall(r'\b\w+\b', lowered)

    # word_tokenize emits punctuation marks as separate tokens. Counting them
    # as words skews the ratio (a repeated "." or "," lowers richness) and
    # makes the two tokenization paths disagree, so keep only tokens that
    # contain at least one letter or digit.
    words = [token for token in tokens if any(ch.isalnum() for ch in token)]
    if not words:
        return 0, {}

    # Vocabulary richness (unique words / total words)
    unique_words = set(words)
    richness = len(unique_words) / len(words)

    # Use POS tagging when available; any tagger failure degrades to a
    # simplified count instead of failing the whole analysis.
    try:
        pos_counts = {}
        for _, tag in nltk.pos_tag(words):
            pos_counts[tag] = pos_counts.get(tag, 0) + 1
    except Exception:
        pos_counts = {"WORD": len(words), "UNIQUE": len(unique_words)}

    return richness, pos_counts
def analyze_sentence_complexity(text):
    """Summarise sentence length in *text*.

    Sentences are delimited by runs of ``.``, ``!`` or ``?``.

    Returns:
        A ``(average_words, variation)`` tuple: the mean number of words per
        sentence and the standard deviation of the sentence lengths (0 when
        there is only one sentence). Text without sentences yields ``(0, 0)``.
    """
    try:
        stripped_parts = (part.strip() for part in re.split(r'[.!?]+', text))
        lengths = [len(part.split()) for part in stripped_parts if part]
        if not lengths:
            return 0, 0

        mean_length = sum(lengths) / len(lengths)
        # A spread is only meaningful with at least two sentences.
        spread = 0 if len(lengths) <= 1 else np.std(lengths)
        return mean_length, spread
    except Exception:
        # Last-resort estimate: total words divided by the number of
        # sentence-ending punctuation marks (at least 1), with no variation.
        total_words = len(text.split())
        terminators = text.count('.') + text.count('!') + text.count('?')
        return total_words / max(1, terminators), 0
def create_detailed_feedback(transcription, grammar_score, corrected_text,
                             sentiment, fluency, filler_ratio, speaking_rate,
                             vocabulary_richness, avg_words_per_sentence):
    """Create detailed feedback based on all metrics.

    Args:
        transcription: The transcribed speech (accepted for interface
            compatibility; not used when building the feedback).
        grammar_score: Grammar verdict text that contains the classifier
            label, e.g. ``"acceptable (0.97)"``.
        corrected_text: The corrected transcription (accepted for interface
            compatibility; not used when building the feedback).
        sentiment: Sentiment label; ``"POSITIVE"`` is treated as positive.
        fluency: Fluency score; values above 0.7 count as fluent.
        filler_ratio: Fraction of words that are fillers; above 0.1 is flagged.
        speaking_rate: Words per minute; 120-160 is considered a good pace.
        vocabulary_richness: Unique/total word ratio.
        avg_words_per_sentence: Mean sentence length; 10-20 is considered good.

    Returns:
        One feedback line per metric, joined with newlines.
    """
    feedback = []

    # Grammar feedback.
    # The label must be matched as a whole word: a plain substring test for
    # "acceptable" is also true for "unacceptable" and would praise bad
    # grammar. "label_1" covers classifiers that report generic LABEL_n
    # names (assumes the CoLA convention that class 1 means acceptable).
    if re.search(r'\b(?:acceptable|label_1)\b', grammar_score.lower()):
        feedback.append("✅ Your grammar is good!")
    else:
        feedback.append("❗ Your grammar needs improvement. Check the corrections provided.")

    # Fluency feedback
    if fluency > 0.7:
        feedback.append("✅ Your speech flows naturally.")
    else:
        feedback.append("❗ Work on making your speech more fluid and natural.")

    # Filler words feedback
    if filler_ratio > 0.1:
        feedback.append(f"❗ You used too many filler words ({filler_ratio:.1%} of your words).")
    else:
        feedback.append("✅ Good job minimizing filler words!")

    # Speaking rate feedback
    if 120 <= speaking_rate <= 160:
        feedback.append(f"✅ Your speaking pace is good ({speaking_rate:.0f} words/min).")
    elif speaking_rate < 120:
        feedback.append(f"❗ Try speaking a bit faster ({speaking_rate:.0f} words/min is slower than ideal).")
    else:
        feedback.append(f"❗ Try speaking a bit slower ({speaking_rate:.0f} words/min is faster than ideal).")

    # Vocabulary feedback
    if vocabulary_richness > 0.6:
        feedback.append("✅ Excellent vocabulary diversity!")
    elif vocabulary_richness > 0.4:
        feedback.append("✅ Good vocabulary usage.")
    else:
        feedback.append("❗ Try using more varied vocabulary.")

    # Sentence complexity feedback
    if 10 <= avg_words_per_sentence <= 20:
        feedback.append("✅ Good sentence structure and length.")
    elif avg_words_per_sentence < 10:
        feedback.append("❗ Try using more complex sentences occasionally.")
    else:
        feedback.append("❗ Your sentences are quite long. Consider varying your sentence length.")

    # Overall sentiment feedback
    if sentiment == "POSITIVE":
        feedback.append("✅ Your tone is positive and engaging.")
    else:
        feedback.append("ℹ️ Your tone is neutral/negative. Consider if this matches your intent.")

    return "\n".join(feedback)
# Fallbacks used when the audio duration cannot be measured directly.
_DEFAULT_SAMPLE_RATE = 16000
_FALLBACK_DURATION_SECONDS = 10
# Rough estimate: 16kHz, 16-bit audio is about 32KB per second
_ESTIMATED_BYTES_PER_SECOND = 32000


def _is_acceptable_label(label):
    """Return True when a CoLA-style classifier label means "acceptable".

    Besides the literal "acceptable", the generic "LABEL_1" is accepted for
    checkpoints that report LABEL_n names (assumes the CoLA convention that
    class 1 means acceptable).
    """
    return str(label).strip().lower() in {"acceptable", "label_1"}


def _estimate_duration(audio):
    """Return the duration of *audio* in seconds, estimating when necessary."""
    if isinstance(audio, str):
        # This is a file path
        try:
            import librosa  # optional; handled below when unavailable
            y, sr = librosa.load(audio, sr=None)
            return librosa.get_duration(y=y, sr=sr)
        except Exception as e:
            print(f"Error getting duration: {e}")
            # Estimate duration based on file size
            try:
                return os.path.getsize(audio) / _ESTIMATED_BYTES_PER_SECOND
            except OSError:
                return _FALLBACK_DURATION_SECONDS

    # In-memory audio: a pair holding the samples and the sample rate.
    try:
        sample_rate = _DEFAULT_SAMPLE_RATE
        samples = audio[0]
        if isinstance(audio, tuple) and len(audio) > 1:
            if np.isscalar(audio[0]):
                # (sample_rate, samples) ordering
                sample_rate, samples = audio[0], audio[1]
            else:
                # (samples, sample_rate) ordering
                sample_rate = audio[1]
        return len(samples) / sample_rate if sample_rate > 0 else 0
    except Exception:
        return _FALLBACK_DURATION_SECONDS


def _build_metrics_figure(scores):
    """Draw a radar chart for *scores* (category name -> value in 0..1).

    Returns the figure; when plotting fails a small placeholder figure is
    returned instead.
    """
    fig = None
    try:
        categories = list(scores)
        values = [float(max(0.0, min(1.0, scores[name]))) for name in categories]

        # One angle per category; the polygon is closed by repeating the
        # first point exactly once so angles and values stay equally long.
        angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
        closed_angles = angles + angles[:1]
        closed_values = values + values[:1]

        # Angles are in radians, so the axes must use the polar projection.
        fig, ax = plt.subplots(figsize=(10, 6), subplot_kw={"projection": "polar"})
        ax.plot(closed_angles, closed_values, linewidth=2, linestyle='solid')
        ax.fill(closed_angles, closed_values, alpha=0.25)
        ax.set_ylim(0, 1)
        ax.set_yticklabels([])
        ax.set_xticks(angles)
        ax.set_xticklabels(categories)
        ax.grid(True)
        ax.set_title('Speaking Performance Metrics', size=15, color='navy', y=1.1)
    except Exception as e:
        print(f"Visualization error: {e}")
        if fig is not None:
            plt.close(fig)
        # Create a simple error figure
        fig, ax = plt.subplots(figsize=(6, 3))
        ax.text(0.5, 0.5, "Error creating visualization",
                horizontalalignment='center', verticalalignment='center')
        ax.axis('off')

    # Unregister the figure from pyplot so repeated requests do not pile up
    # open figures; the Figure object itself is still returned to the caller.
    plt.close(fig)
    return fig


def process_audio(audio):
    """Transcribe *audio* and assess the speech.

    Args:
        audio: A path to an audio file, an in-memory (samples, rate) pair,
            or None.

    Returns:
        A 6-tuple ``(transcription, grammar_score, corrected_text, feedback,
        figure, detailed_analysis_markdown)``. On failure the text fields
        carry error messages and the figure is None.
    """
    if audio is None:
        return "No audio provided.", "", "", "", None, ""

    start_time = time.time()

    # Check if models loaded properly (a missing flag is treated as loaded)
    if not globals().get('MODELS_LOADED', True):
        return ("Models failed to load. Please check the logs for details.",
                "Error", "Error", "Unable to process audio due to model loading issues.",
                None, "## Error\nThe required models couldn't be loaded. Please check the system configuration.")

    try:
        duration = _estimate_duration(audio)

        # Step 1: Transcription
        try:
            transcription = asr_pipeline(audio)["text"]
        except Exception as e:
            print(f"Transcription error: {e}")
            return ("Error in speech recognition. Please try again.",
                    "Error", "Error", "There was an error processing your audio.",
                    None, f"## Error\nError in speech recognition: {str(e)[:100]}...")

        if not transcription or not transcription.strip():
            return ("No speech detected. Please speak louder or check your microphone.",
                    "N/A", "N/A", "No speech detected in the audio.",
                    None, "## No Speech Detected\nPlease try recording again with clearer speech.")

        # Step 2: Grammar Scoring
        try:
            score_output = grammar_pipeline(transcription)[0]
            label = score_output["label"]
            confidence = score_output["score"]
            grammar_score = f"{label} ({confidence:.2f})"
        except Exception as e:
            print(f"Grammar scoring error: {e}")
            label = "UNKNOWN"
            confidence = 0.5
            grammar_score = "Could not analyze grammar"

        # Step 3: Grammar Correction
        try:
            corrected = correction_pipeline(transcription, max_length=128)[0]["generated_text"]
        except Exception as e:
            print(f"Grammar correction error: {e}")
            corrected = transcription

        # Step 4: Sentiment Analysis
        try:
            sentiment_result = sentiment_pipeline(transcription)[0]
            sentiment = sentiment_result["label"]
            sentiment_score = sentiment_result["score"]
        except Exception as e:
            print(f"Sentiment analysis error: {e}")
            sentiment = "NEUTRAL"
            sentiment_score = 0.5

        # Step 5: Fluency Analysis
        try:
            fluency_result = fluency_pipeline(transcription)[0]
            # The score is the confidence in the predicted class; convert it
            # into "probability of acceptable".
            if _is_acceptable_label(fluency_result["label"]):
                fluency_score = fluency_result["score"]
            else:
                fluency_score = 1 - fluency_result["score"]
        except Exception as e:
            print(f"Fluency analysis error: {e}")
            fluency_score = 0.5

        # Step 6: Filler Words Analysis
        try:
            filler_count, filler_ratio = count_filler_words(transcription)
        except Exception as e:
            print(f"Filler word analysis error: {e}")
            filler_count, filler_ratio = 0, 0

        # Step 7: Speaking Rate
        try:
            speaking_rate = calculate_speaking_rate(transcription, duration)
        except Exception as e:
            print(f"Speaking rate calculation error: {e}")
            speaking_rate = 0

        # Step 8: Vocabulary Richness
        try:
            vocab_richness, pos_counts = analyze_vocabulary_richness(transcription)
        except Exception as e:
            print(f"Vocabulary analysis error: {e}")
            vocab_richness, pos_counts = 0.5, {"N/A": 1}

        # Step 9: Sentence Complexity
        try:
            avg_words, sentence_variation = analyze_sentence_complexity(transcription)
        except Exception as e:
            print(f"Sentence complexity analysis error: {e}")
            avg_words, sentence_variation = 0, 0

        # Create feedback
        try:
            feedback = create_detailed_feedback(
                transcription, grammar_score, corrected, sentiment,
                fluency_score, filler_ratio, speaking_rate, vocab_richness, avg_words
            )
        except Exception as e:
            print(f"Feedback creation error: {e}")
            feedback = "Error generating detailed feedback."

        # Create metrics visualization (all scores normalized to 0..1)
        grammar_norm = confidence if _is_acceptable_label(label) else 1 - confidence
        # Optimal pace is around 140 wpm; the score falls off linearly.
        speaking_rate_norm = max(0, min(1, 1 - abs((speaking_rate - 140) / 100)))
        fig = _build_metrics_figure({
            'Grammar': grammar_norm,
            'Fluency': fluency_score,
            'Vocabulary': vocab_richness,
            'Speaking Rate': speaking_rate_norm,
            'Clarity': 1 - filler_ratio,  # Lower filler ratio is better
        })

        # Create detailed analysis text
        processing_time = time.time() - start_time
        try:
            top_tags = sorted(pos_counts.items(), key=lambda item: item[1], reverse=True)[:5]
            pos_counts_str = ', '.join(f"{k}: {v}" for k, v in top_tags)
        except Exception:
            pos_counts_str = "N/A"

        detailed_analysis = "\n".join([
            "",
            "## Detailed Speech Analysis",
            f"**Processing Time:** {processing_time:.2f} seconds",
            f"**Audio Duration:** {duration:.2f} seconds",
            "### Metrics:",
            f"- **Grammar Score:** {confidence:.2f} ({label})",
            f"- **Fluency Score:** {fluency_score:.2f}",
            f"- **Speaking Rate:** {speaking_rate:.1f} words per minute",
            f"- **Vocabulary Richness:** {vocab_richness:.2f} (higher is better)",
            f"- **Filler Words:** {filler_count} occurrences ({filler_ratio:.1%} of speech)",
            f"- **Avg Words Per Sentence:** {avg_words:.1f}",
            f"- **Sentiment:** {sentiment} ({sentiment_score:.2f})",
            "### Word Types Used:",
            pos_counts_str,
            "",
        ])

        return transcription, grammar_score, corrected, feedback, fig, detailed_analysis
    except Exception as e:
        print(f"Unexpected error in process_audio: {e}")
        return ("An unexpected error occurred during processing.",
                "Error", "Error", "There was an unexpected error processing your audio.",
                None, f"## Unexpected Error\n\nAn error occurred: {str(e)[:200]}...")
# Create theme
# Starts from Gradio's Soft theme with blue/indigo hues, then overrides
# individual theme variables through .set(): primary-button colours, block
# title weight, block border width and block shadow.
# NOTE(review): values beginning with "*" presumably reference entries of the
# theme's own colour palette - confirm against the Gradio theming guide.
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="indigo",
).set(
    button_primary_background_fill="*primary_500",
    button_primary_background_fill_hover="*primary_600",
    button_primary_text_color="white",
    block_title_text_weight="600",
    block_border_width="2px",
    block_shadow="0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1)",
)
# Build the user interface: a header, the audio input with collapsible tips
# and the submit button, the text results, the metrics chart next to the
# feedback box, a collapsible detailed analysis, and a footer.
# NOTE(review): the nesting of the layout blocks below was reconstructed from
# a flattened copy of the file - confirm it matches the intended layout.
with gr.Blocks(theme=theme, css="""
.container { max-width: 1000px; margin: auto; }
.header { text-align: center; margin-bottom: 20px; }
.header h1 { color: #1e40af; font-size: 2.5rem; }
.header p { color: #6b7280; font-size: 1.1rem; }
.footer { text-align: center; margin-top: 30px; color: #6b7280; }
.tips-box { background-color: #f0f9ff; border-radius: 10px; padding: 15px; margin: 10px 0; }
.score-card { border: 2px solid #dbeafe; border-radius: 10px; padding: 10px; }
""") as demo:
    # Page header
    gr.HTML("""
<div class="header">
<h1>🎙️ Advanced ENGLISH Speaking Assessment</h1>
<p>Record or upload your speech to receive comprehensive feedback on your English speaking skills</p>
</div>
""")
    # Input area: recorder/uploader, speaking tips and the submit button
    with gr.Row():
        with gr.Column():
            # type="filepath" means process_audio receives a path string
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="🎤 Speak or Upload Audio"
            )
            with gr.Accordion("Speaking Tips", open=False):
                gr.HTML("""
<div class="tips-box">
<h4>Tips for Better Results:</h4>
<ul>
<li>Speak clearly and at a moderate pace</li>
<li>Minimize background noise</li>
<li>Try to speak for at least 20-30 seconds</li>
<li>Avoid filler words like "um", "uh", "like"</li>
<li>Practice with both prepared and impromptu topics</li>
</ul>
</div>
""")
            submit_btn = gr.Button("Analyze Speech", variant="primary")
    # Text results
    with gr.Row():
        with gr.Column():
            transcription_output = gr.Textbox(label="📝 Transcription", lines=3)
            corrected_output = gr.Textbox(label="✍️ Grammar Correction", lines=3)
            grammar_score_output = gr.Textbox(label="✅ Grammar Score")
    # Chart and written feedback side by side
    with gr.Row():
        with gr.Column():
            metrics_chart = gr.Plot(label="Performance Metrics")
        with gr.Column():
            feedback_output = gr.Textbox(label="💬 Feedback", lines=8)
    with gr.Accordion("Detailed Analysis", open=False):
        detailed_analysis = gr.Markdown()
    # Page footer
    gr.HTML("""
<div class="footer">
<p>This tool provides an assessment of your spoken English. For professional evaluation, consult a qualified language instructor.</p>
</div>
""")
    # The outputs are listed in the same order as the tuple returned by
    # process_audio: transcription, grammar score, correction, feedback,
    # figure, detailed analysis.
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[
            transcription_output,
            grammar_score_output,
            corrected_output,
            feedback_output,
            metrics_chart,
            detailed_analysis
        ]
    )
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()