# NOTE: the "Spaces: Sleeping" lines were Hugging Face page-status residue
# from the web scrape, not part of the source file; removed.
| # app.py | |
| import gradio as gr | |
| import os | |
| from openai import OpenAI | |
| import json | |
| import librosa | |
| import numpy as np | |
| import soundfile as sf | |
| import whisper | |
| import pandas as pd | |
# --- 0. INITIAL SETUP ---
# BUG FIX: the original wrapped OpenAI(...) in `except TypeError`, but the
# openai client raises OpenAIError (not TypeError) when the key is missing,
# so `api_key_found` could never become False — the app crashed at import
# instead. Check the environment variable explicitly.
_openai_api_key = os.environ.get("OPENAI_API_KEY")
api_key_found = _openai_api_key is not None
# `client` stays None when the key is absent; run_evaluation() guards on
# api_key_found before ever touching it.
client = OpenAI(api_key=_openai_api_key) if api_key_found else None

print("Loading Whisper model...")
# "base" model on CPU: small footprint; fp16 is disabled at transcribe() time.
whisper_model = whisper.load_model("base", device="cpu")
print("Whisper model loaded.")
# --- EXPERT EXAMINER PROMPT ---
# System prompt sent verbatim to the chat model by run_evaluation(). It pins
# the model to an English-only, JSON-only response whose schema (overall
# score, CEFR level, holistic feedback, word-by-word table) is exactly what
# the response-parsing code and the Gradio outputs expect — do not edit the
# schema here without updating that parsing code.
SYSTEM_PROMPT = """
You are an expert English language examiner specializing in phonetics and accent reduction for ESL learners. Your task is to provide a detailed, diagnostic assessment of a student's spoken English based on a reference sentence and detailed word-level audio analysis.
**Input You Will Receive:**
You will be given a JSON object containing:
1. `reference_transcript`: The correct sentence the student was supposed to say.
2. `spoken_words`: A list of words detected by Whisper, each with:
- `word`: The word as transcribed by Whisper.
- `start`: The start time of the word in seconds.
- `end`: The end time of the word in seconds.
- `energy`: A numeric value (RMS) indicating the pronunciation's energy/loudness.
**Your Analysis and Output:**
Your entire response MUST be in English. You must return a single, valid JSON object with the following structure. Do not include any text outside of this JSON object.
**JSON Output Structure:**
{
"overall_score_100": integer,
"cefr_level": "string (A1, A2, B1, B2, C1, or C2)",
"holistic_feedback": {
"strengths": "string (A paragraph in English summarizing the student's strong points in pronunciation, rhythm, and clarity.)",
"areas_for_improvement": "string (A paragraph in English detailing the main patterns of error and what to focus on.)"
},
"word_by_word_analysis": [
{
"reference_word": "string (The word from the correct sentence)",
"spoken_word": "string (The word Whisper transcribed, or 'OMITTED')",
"word_score_100": integer,
"correct_ipa": "string (The correct IPA transcription)",
"feedback": "string (Specific phonetic feedback for this word. If correct, simply state 'Excellent pronunciation.')"
}
]
}
"""
# --- 1. DETAILED FEATURE EXTRACTION (WHISPER + LIBROSA) ---
def extract_word_level_features(audio_path):
    """Transcribe an audio file and compute per-word timing/energy features.

    Uses Whisper for word-level timestamps and librosa to slice the waveform
    and measure each word's RMS energy.

    Parameters
    ----------
    audio_path : str
        Path to an audio file readable by both librosa and Whisper.

    Returns
    -------
    list[dict]
        One dict per detected word with keys ``word`` (stripped text),
        ``start``/``end`` (seconds, rounded to 2 decimals) and ``energy``
        (mean RMS of the word's slice, rounded to 4 decimals). An empty
        list signals failure or silence to the caller.
    """
    try:
        # 16 kHz matches Whisper's native sample rate, so timestamp-to-sample
        # conversion below is consistent with the transcription.
        y, sr = librosa.load(audio_path, sr=16000)
        result = whisper_model.transcribe(audio_path, word_timestamps=True, fp16=False)
        # BUG FIX: the original read only result["segments"][0]["words"],
        # silently dropping every word after the first segment boundary
        # (i.e. after any pause). Flatten words from ALL segments.
        word_segments = [
            word
            for segment in result.get("segments", [])
            for word in segment.get("words", [])
        ]
        features_list = []
        for word_info in word_segments:
            start_sample = int(word_info['start'] * sr)
            end_sample = int(word_info['end'] * sr)
            word_audio = y[start_sample:end_sample]
            # Guard: rounding can yield an empty slice, and np.mean over an
            # empty RMS array would produce NaN and poison the JSON payload.
            if word_audio.size == 0:
                rms_energy = 0.0
            else:
                # Root Mean Square (RMS) energy as a loudness proxy.
                rms_energy = np.mean(librosa.feature.rms(y=word_audio))
            features_list.append({
                "word": word_info['word'].strip(),
                "start": round(word_info['start'], 2),
                "end": round(word_info['end'], 2),
                "energy": round(float(rms_energy), 4),
            })
        return features_list
    except Exception as e:
        # Best-effort boundary: the caller treats [] as "could not process
        # the audio" and shows a friendly message instead of a traceback.
        print(f"Error during feature extraction: {e}")
        return []
# --- 2. MAIN EVALUATION FUNCTION ---
def run_evaluation(audio_input, reference_transcript):
    """Gradio click handler: score a recording against a reference sentence.

    Parameters
    ----------
    audio_input : tuple[int, numpy.ndarray] | None
        ``(sample_rate, samples)`` as produced by ``gr.Audio(type="numpy")``.
    reference_transcript : str
        The sentence the student was supposed to say.

    Returns
    -------
    tuple
        ``(score, cefr_level, feedback_markdown, word_table)`` matching the
        four Gradio output components; ``word_table`` is None on errors.

    Raises
    ------
    gr.Error
        When no OpenAI API key was configured at startup.
    """
    if not api_key_found:
        raise gr.Error("OpenAI API key not found.")
    if audio_input is None or not reference_transcript:
        return 0, "N/A", "Please provide both an audio file and the reference text.", None

    sr, y = audio_input
    temp_audio_path = "temp_audio.wav"
    sf.write(temp_audio_path, y, sr)
    try:
        # Step 1: Extract detailed features using Whisper and Librosa.
        word_features = extract_word_level_features(temp_audio_path)
    finally:
        # BUG FIX: the original never deleted temp_audio.wav, leaking a file
        # (and stale audio) on every request. Best-effort cleanup.
        try:
            os.remove(temp_audio_path)
        except OSError:
            pass
    if not word_features:
        return 0, "N/A", "Could not process the audio. Please try recording again.", None

    # Step 2: Construct the detailed prompt for the OpenAI API.
    prompt_data = {
        "reference_transcript": reference_transcript,
        "spoken_words": word_features
    }
    print("Sending detailed data to GPT-4o for analysis...")
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            # Forces the model to emit a single valid JSON object.
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": json.dumps(prompt_data)}
            ]
        )
    except Exception as e:
        # BUG FIX: an API/network failure previously escaped as a raw
        # traceback; return the same 4-tuple shape the UI expects instead.
        print(f"OpenAI API call failed: {e}")
        return 0, "Error", "The request to the AI examiner failed. Please try again.", None

    # Step 3: Process the API response and format it for display.
    try:
        result = json.loads(response.choices[0].message.content)
        # Format the holistic report as Markdown for the gr.Markdown output.
        holistic_feedback_md = f"### Strengths\n{result['holistic_feedback']['strengths']}\n\n"
        holistic_feedback_md += f"### Areas for Improvement\n{result['holistic_feedback']['areas_for_improvement']}"
        # Pandas DataFrame gives a clean tabular word-by-word display.
        word_analysis_df = pd.DataFrame(result['word_by_word_analysis'])
        return (
            result.get("overall_score_100", 0),
            result.get("cefr_level", "N/A"),
            holistic_feedback_md,
            gr.DataFrame(value=word_analysis_df, headers=["Reference Word", "Spoken Word", "Score", "Correct IPA", "Feedback"], interactive=False)
        )
    except (json.JSONDecodeError, KeyError) as e:
        # The model violated the JSON schema pinned by SYSTEM_PROMPT.
        print(f"Error processing API response: {e}")
        error_msg = "The API response was not in the expected format. Please try again."
        return 0, "Error", error_msg, None
# --- 3. GRADIO INTERFACE ---
# Declarative UI: a two-column layout with recording inputs on the left and
# the assessment report on the right, wired to run_evaluation() on click.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🇬🇧 Expert Pronunciation Assessment")
    gr.Markdown("Record yourself saying the reference sentence. Our AI examiner will provide a detailed diagnostic report on your performance.")
    # Default reference sentence shown in the textbox (editable by the user).
    frase_ejemplo = "The rainbow is a division of white light into many beautiful colors."
    with gr.Row():
        with gr.Column(scale=1):
            # type="numpy" delivers (sample_rate, ndarray) to run_evaluation.
            audio_in = gr.Audio(sources=["microphone"], type="numpy", label="1. Record Your Voice")
            text_in = gr.Textbox(lines=3, label="2. Reference Sentence", value=frase_ejemplo)
            submit_btn = gr.Button("Get Assessment", variant="primary")
        with gr.Column(scale=2):
            gr.Markdown("### Assessment Summary")
            with gr.Row():
                score_out = gr.Number(label="Overall Score (0-100)", interactive=False)
                level_out = gr.Textbox(label="Estimated CEFR Level", interactive=False)
            holistic_feedback_out = gr.Markdown(label="Examiner's Feedback")
            gr.Markdown("--- \n ### Detailed Word-by-Word Analysis")
            # Headers mirror the JSON schema enforced by SYSTEM_PROMPT.
            word_analysis_out = gr.DataFrame(headers=["Reference Word", "Spoken Word", "Score", "Correct IPA", "Feedback"], label="Phonetic Breakdown")
    # Output order must match run_evaluation's 4-tuple return.
    submit_btn.click(
        fn=run_evaluation,
        inputs=[audio_in, text_in],
        outputs=[score_out, level_out, holistic_feedback_out, word_analysis_out]
    )
if __name__ == "__main__":
    # Only start the server when a usable API key was detected at import time.
    if api_key_found:
        # debug=True surfaces handler tracebacks in the console/UI.
        demo.launch(debug=True)
    else:
        print("\nFATAL: OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")