# EvaluadorOpenAI / app.py
# Source: Hugging Face Space upload by mramirez2001 (commit 4ece3bb, 7.57 kB)
# app.py
import json
import os
import tempfile

import gradio as gr
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import whisper
from openai import OpenAI
# --- 0. INITIAL CONFIGURATION ---
# BUG FIX: the original wrapped OpenAI(...) in try/except TypeError, but the
# openai>=1.x client does not raise TypeError when api_key is None (it fails
# later, at request time), so api_key_found was effectively always True.
# Detect the missing key explicitly from the environment instead.
_api_key = os.environ.get("OPENAI_API_KEY")
api_key_found = _api_key is not None
# client stays None when no key is configured; run_evaluation guards on
# api_key_found before ever touching it.
client = OpenAI(api_key=_api_key) if api_key_found else None
print("Loading Whisper model...")
# "base" model on CPU so the Space runs without a GPU.
whisper_model = whisper.load_model("base", device="cpu")
print("Whisper model loaded.")
# --- EXPERT EXAMINER PROMPT ---
# System prompt sent to GPT-4o. It instructs the model to act as an English
# pronunciation examiner and to return a single JSON object (enforced via the
# API's response_format={"type": "json_object"}) whose keys are consumed by
# run_evaluation: overall_score_100, cefr_level, holistic_feedback
# (strengths / areas_for_improvement), and word_by_word_analysis.
# NOTE: this literal is part of runtime behavior — do not edit casually.
SYSTEM_PROMPT = """
You are an expert English language examiner specializing in phonetics and accent reduction for ESL learners. Your task is to provide a detailed, diagnostic assessment of a student's spoken English based on a reference sentence and detailed word-level audio analysis.
**Input You Will Receive:**
You will be given a JSON object containing:
1. `reference_transcript`: The correct sentence the student was supposed to say.
2. `spoken_words`: A list of words detected by Whisper, each with:
- `word`: The word as transcribed by Whisper.
- `start`: The start time of the word in seconds.
- `end`: The end time of the word in seconds.
- `energy`: A numeric value (RMS) indicating the pronunciation's energy/loudness.
**Your Analysis and Output:**
Your entire response MUST be in English. You must return a single, valid JSON object with the following structure. Do not include any text outside of this JSON object.
**JSON Output Structure:**
{
"overall_score_100": integer,
"cefr_level": "string (A1, A2, B1, B2, C1, or C2)",
"holistic_feedback": {
"strengths": "string (A paragraph in English summarizing the student's strong points in pronunciation, rhythm, and clarity.)",
"areas_for_improvement": "string (A paragraph in English detailing the main patterns of error and what to focus on.)"
},
"word_by_word_analysis": [
{
"reference_word": "string (The word from the correct sentence)",
"spoken_word": "string (The word Whisper transcribed, or 'OMITTED')",
"word_score_100": integer,
"correct_ipa": "string (The correct IPA transcription)",
"feedback": "string (Specific phonetic feedback for this word. If correct, simply state 'Excellent pronunciation.')"
}
]
}
"""
# --- 1. EXTRACCIÓN DETALLADA DE CARACTERÍSTICAS (WHISPER + LIBROSA) ---
def extract_word_level_features(audio_path):
    """Extract per-word timing and energy features from an audio file.

    Uses Whisper to obtain word-level timestamps, then Librosa to compute
    RMS energy over each word's audio slice.

    Args:
        audio_path: Path to a WAV file to analyze.

    Returns:
        A list of dicts with keys ``word``, ``start``, ``end`` (seconds,
        rounded to 2 decimals) and ``energy`` (mean RMS, rounded to 4
        decimals). Returns an empty list when nothing was transcribed or
        on any processing error.
    """
    try:
        # Load at 16 kHz to match Whisper's expected sample rate.
        y, sr = librosa.load(audio_path, sr=16000)
        result = whisper_model.transcribe(audio_path, word_timestamps=True, fp16=False)
        # BUG FIX: the original read only result["segments"][0]["words"],
        # silently dropping every word past the first segment in longer
        # recordings. Flatten words across ALL segments instead.
        word_segments = [
            word
            for seg in result.get("segments", [])
            for word in seg.get("words", [])
        ]
        features_list = []
        for segment in word_segments:
            start_sample = int(segment['start'] * sr)
            end_sample = int(segment['end'] * sr)
            word_audio = y[start_sample:end_sample]
            # Guard against zero-length slices (timestamp rounding can make
            # start == end), where rms() would produce NaN.
            if word_audio.size == 0:
                rms_energy = 0.0
            else:
                rms_energy = float(np.mean(librosa.feature.rms(y=word_audio)))
            features_list.append({
                "word": segment['word'].strip(),
                "start": round(segment['start'], 2),
                "end": round(segment['end'], 2),
                "energy": round(rms_energy, 4)
            })
        return features_list
    except Exception as e:
        # Best-effort: the caller treats [] as "could not process audio".
        print(f"Error during feature extraction: {e}")
        return []
# --- 2. FUNCIÓN PRINCIPAL DE EVALUACIÓN ---
def run_evaluation(audio_input, reference_transcript):
    """Run the full pronunciation assessment pipeline for one recording.

    Args:
        audio_input: Tuple ``(sample_rate, samples)`` from the Gradio
            microphone component, or None when nothing was recorded.
        reference_transcript: The sentence the student was asked to read.

    Returns:
        A 4-tuple for the Gradio outputs: (overall score 0-100, CEFR level
        string, holistic feedback markdown, word-by-word gr.DataFrame or
        None on error).

    Raises:
        gr.Error: If the OpenAI API key was not configured at startup.
    """
    if not api_key_found: raise gr.Error("OpenAI API key not found.")
    if audio_input is None or not reference_transcript:
        return 0, "N/A", "Please provide both an audio file and the reference text.", None
    sr, y = audio_input
    # BUG FIX: the original wrote a fixed "temp_audio.wav" (clobbered by
    # concurrent Gradio sessions) and never deleted it. Use a unique temp
    # file and clean it up in a finally block.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        temp_audio_path = tmp.name
    try:
        sf.write(temp_audio_path, y, sr)
        # Step 1: Extract detailed per-word features (Whisper + Librosa).
        word_features = extract_word_level_features(temp_audio_path)
    finally:
        # Remove the temp file even if feature extraction raised.
        if os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)
    if not word_features:
        return 0, "N/A", "Could not process the audio. Please try recording again.", None
    # Step 2: Construct the detailed prompt for the OpenAI API.
    prompt_data = {
        "reference_transcript": reference_transcript,
        "spoken_words": word_features
    }
    print("Sending detailed data to GPT-4o for analysis...")
    # json_object response_format guarantees the reply parses as JSON,
    # but not that it matches the schema — hence the KeyError handling below.
    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": json.dumps(prompt_data)}
        ]
    )
    # Step 3: Process the API response and format it for display.
    try:
        result = json.loads(response.choices[0].message.content)
        # Format the holistic report as markdown for Gradio.
        holistic_feedback_md = f"### Strengths\n{result['holistic_feedback']['strengths']}\n\n"
        holistic_feedback_md += f"### Areas for Improvement\n{result['holistic_feedback']['areas_for_improvement']}"
        # Tabular word-by-word breakdown via pandas.
        word_analysis_df = pd.DataFrame(result['word_by_word_analysis'])
        return (
            result.get("overall_score_100", 0),
            result.get("cefr_level", "N/A"),
            holistic_feedback_md,
            gr.DataFrame(value=word_analysis_df, headers=["Reference Word", "Spoken Word", "Score", "Correct IPA", "Feedback"], interactive=False)
        )
    except (json.JSONDecodeError, KeyError) as e:
        print(f"Error processing API response: {e}")
        error_msg = "The API response was not in the expected format. Please try again."
        return 0, "Error", error_msg, None
# --- 3. INTERFAZ DE GRADIO ---
# --- 3. GRADIO INTERFACE ---
# Two-column layout: inputs (mic + reference text) on the left, the
# assessment report (score, CEFR level, feedback, word table) on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🇬🇧 Expert Pronunciation Assessment")
    gr.Markdown("Record yourself saying the reference sentence. Our AI examiner will provide a detailed diagnostic report on your performance.")
    # Default reference sentence pre-filled in the textbox.
    frase_ejemplo = "The rainbow is a division of white light into many beautiful colors."
    with gr.Row():
        with gr.Column(scale=1):
            # type="numpy" makes the component yield (sample_rate, samples),
            # which run_evaluation unpacks.
            audio_in = gr.Audio(sources=["microphone"], type="numpy", label="1. Record Your Voice")
            text_in = gr.Textbox(lines=3, label="2. Reference Sentence", value=frase_ejemplo)
            submit_btn = gr.Button("Get Assessment", variant="primary")
        with gr.Column(scale=2):
            gr.Markdown("### Assessment Summary")
            with gr.Row():
                score_out = gr.Number(label="Overall Score (0-100)", interactive=False)
                level_out = gr.Textbox(label="Estimated CEFR Level", interactive=False)
            holistic_feedback_out = gr.Markdown(label="Examiner's Feedback")
            gr.Markdown("--- \n ### Detailed Word-by-Word Analysis")
            word_analysis_out = gr.DataFrame(headers=["Reference Word", "Spoken Word", "Score", "Correct IPA", "Feedback"], label="Phonetic Breakdown")
    # Wire the button: outputs must match run_evaluation's 4-tuple order.
    submit_btn.click(
        fn=run_evaluation,
        inputs=[audio_in, text_in],
        outputs=[score_out, level_out, holistic_feedback_out, word_analysis_out]
    )
if __name__ == "__main__":
    # Start the app only when credentials are configured; otherwise abort
    # with a clear message instead of serving a broken evaluator.
    if api_key_found:
        demo.launch(debug=True)
    else:
        print("\nFATAL: OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")