# EvaluadorOpenAI / app.py
# Hugging Face Space by mramirez2001 — commit b2021cd (verified)
# app.py
import gradio as gr
import os
from openai import OpenAI
import json
import librosa
import numpy as np
import soundfile as sf
import whisper
import pandas as pd
from gtts import gTTS
import re
import base64
import io
# --- 0. INITIAL CONFIGURATION ---
# BUGFIX: in openai>=1.0, OpenAI(api_key=None) raises OpenAIError (not
# TypeError), so the original `except TypeError` never fired: a missing key
# crashed the app at import time and `client` could be left undefined.
# Detect the key explicitly instead of relying on the constructor to fail.
_openai_api_key = os.environ.get("OPENAI_API_KEY")
api_key_found = _openai_api_key is not None
# `client` stays None when the key is absent; every handler below guards on
# `api_key_found` before touching it.
client = OpenAI(api_key=_openai_api_key) if api_key_found else None

print("Loading Whisper for transcription...")
# Local Whisper model used for word-level timestamps (transcribe() is called
# with word_timestamps=True later). "base" on CPU keeps the Space runnable on
# free hardware.
whisper_model = whisper.load_model("base", device="cpu")
print("Whisper model loaded.")
# --- 1. PROMPT DEFINITIONS FOR THE AI ---

# System prompt for the conversational tutor ("Alex") used in Tab 1.
# Plain conversational English replies — explicitly NOT JSON.
CONVERSATION_SYSTEM_PROMPT = """
You are a friendly and encouraging English language tutor named Alex.
A student will speak to you. Your task is to keep a natural, simple conversation going.
1. Briefly analyze the user's previous response to estimate their CEFR level (A1, A2, B1, etc.).
2. Formulate a simple, open-ended follow-up question that is appropriate for THAT estimated level.
3. Your entire response must be a single, short paragraph in natural, conversational English. DO NOT use JSON.
"""

# Prompt for the end-of-conversation report (Tab 1, after the final turn).
# The model must return a single JSON object; chat_interaction() parses it
# and expects exactly the keys listed in the structure below.
FINAL_EVALUATION_SYSTEM_PROMPT = """
You are an expert English language examiner providing a final report. Analyze the entire conversation history provided.
Your task is to return a single, valid JSON object with the following structure. Do not include any text outside this JSON object.
JSON Output Structure:
{
"cefr_level": "string (e.g., A2, B1)",
"feedback_en": { "strengths": "string", "areas_for_improvement": "string", "word_by_word_feedback": [{"word": "string", "feedback": "string"}] },
"feedback_es": { "fortalezas": "string", "areas_a_mejorar": "string", "feedback_por_palabra": [{"palabra": "string", "feedback": "string"}] }
}
"""

# Prompt for the per-sentence pronunciation report (Tab 2).
# run_sentence_evaluation() sends reference text plus per-word timestamps and
# energy, and parses the JSON structure described below.
SENTENCE_EVALUATION_SYSTEM_PROMPT = """
You are an expert English language examiner specializing in phonetics. Your task is to provide a detailed, diagnostic assessment of a student's spoken English based on a reference sentence and detailed word-level audio analysis.
Input You Will Receive: A JSON object with `reference_transcript` and a list of `spoken_words` with timestamps and energy.
Your entire response MUST be a single, valid JSON object with the following structure. Do not include any text outside this JSON object.
JSON Output Structure:
{
"overall_score_100": integer,
"cefr_level": "string (A1, A2, B1, B2, C1, or C2)",
"holistic_feedback": { "strengths": "string", "areas_for_improvement": "string" },
"word_by_word_analysis": [ { "reference_word": "string", "spoken_word": "string", "word_score_100": integer, "correct_ipa": "string", "feedback_en": "string", "feedback_es": "string" } ]
}
"""
# --- 2. LOGIC FUNCTIONS ---
def extract_word_level_features(audio_path):
    """Transcribe *audio_path* with the local Whisper model and return
    per-word features.

    Returns a list of dicts ``{"word", "start", "end", "energy"}`` where
    ``energy`` is the mean RMS of that word's audio slice (a crude
    loudness/stress proxy). Returns ``[]`` on any failure — callers treat an
    empty list as "could not process the audio".
    """
    try:
        # Load at 16 kHz so sample indices computed from Whisper's
        # second-based timestamps line up with the waveform.
        y, sr = librosa.load(audio_path, sr=16000)
        result = whisper_model.transcribe(audio_path, word_timestamps=True, fp16=False)
        features_list = []
        # BUGFIX: the original read only result["segments"][0], silently
        # dropping every word after the first segment on longer recordings.
        # Iterate all segments instead.
        for segment in result.get("segments", []):
            for word_info in segment.get("words", []):
                start_sample = int(word_info['start'] * sr)
                end_sample = int(word_info['end'] * sr)
                word_audio = y[start_sample:end_sample]
                rms_energy = np.mean(librosa.feature.rms(y=word_audio)) if len(word_audio) > 0 else 0
                features_list.append({
                    "word": word_info['word'].strip(),
                    "start": round(word_info['start'], 2),
                    "end": round(word_info['end'], 2),
                    "energy": round(float(rms_energy), 4),
                })
        return features_list
    except Exception as e:
        # Best-effort: log and degrade to "no features" rather than crash the UI.
        print(f"Error during feature extraction: {e}")
        return []
def chat_interaction(audio_input, history_state):
    """Handle one turn of the Tab-1 conversation.

    Parameters
    ----------
    audio_input : tuple[int, np.ndarray] | None
        (sample_rate, samples) from the Gradio microphone, or None.
    history_state : list[dict]
        OpenAI-style message history; index 0 is the system prompt once the
        conversation has started. User/assistant entries also carry a
        ``display_content`` key used only for rendering.

    Returns
    -------
    tuple
        (chatbot pairs, new history, counter text, EN report update,
        ES report update). After the final turn the history is reset to []
        and the two report panels are made visible.
    """
    if not api_key_found:
        raise gr.Error("OpenAI API key not found.")

    def _display_pairs(state):
        # Render history (minus the system prompt) as Chatbot tuples:
        # user turns on the left, assistant turns on the right.
        return [(msg['display_content'], None) if msg['role'] == 'user'
                else (None, msg['display_content'])
                for msg in state[1:]]

    def _api_messages(state):
        # CONSISTENCY FIX: strip the UI-only "display_content" key so only
        # fields the chat-completions API understands are sent.
        return [{"role": m["role"], "content": m["content"]} for m in state]

    if audio_input is None:
        # No audio: re-render the current state without advancing the turn.
        user_turns = len(history_state[1:]) // 2 if history_state else 0
        responses_remaining = 5 - user_turns
        # CONSISTENCY FIX: the original built (user, assistant) pairs from
        # 'content' here, unlike every other render path; use the shared
        # display_content rendering instead.
        chat_display = _display_pairs(history_state) if history_state else []
        return chat_display, history_state, f"Responses remaining: {responses_remaining}", gr.update(visible=False), gr.update(visible=False)

    # Persist the mic capture so the OpenAI transcription API can read it.
    sr, y = audio_input
    temp_audio_path = "temp_audio_chat.wav"
    sf.write(temp_audio_path, y, sr)
    try:
        # BUGFIX: the original leaked the open file handle; also clean up
        # the temp WAV instead of leaving it on disk.
        with open(temp_audio_path, "rb") as audio_file:
            user_text = client.audio.transcriptions.create(model="whisper-1", file=audio_file).text
    finally:
        try:
            os.remove(temp_audio_path)
        except OSError:
            pass

    if not history_state:
        history_state = [{"role": "system", "content": CONVERSATION_SYSTEM_PROMPT}]
    history_state.append({"role": "user", "content": user_text, "display_content": user_text})
    # After appending the new user message, (len-1)//2 counts completed
    # prior exchanges.
    user_turns = (len(history_state) - 1) // 2
    responses_remaining = 5 - user_turns

    if user_turns < 5:
        response = client.chat.completions.create(model="gpt-4o", messages=_api_messages(history_state), temperature=0.7)
        ai_response_text = response.choices[0].message.content
        try:
            # Synthesize the tutor's reply and embed it as an inline,
            # autoplaying base64 <audio> tag next to the text.
            tts = gTTS(text=ai_response_text, lang='en')
            mp3_fp = io.BytesIO()
            tts.write_to_fp(mp3_fp)
            mp3_fp.seek(0)
            audio_base64 = base64.b64encode(mp3_fp.read()).decode('utf-8')
            audio_player = f'<audio src="data:audio/mpeg;base64,{audio_base64}" controls autoplay></audio>'
            ai_display_content = f"{ai_response_text}<br>{audio_player}"
        except Exception as e:
            # TTS is best-effort; fall back to text-only display.
            print(f"Error al generar TTS para la respuesta del chat: {e}")
            ai_display_content = ai_response_text
        history_state.append({"role": "assistant", "content": ai_response_text, "display_content": ai_display_content})
        chat_display = _display_pairs(history_state)
        return chat_display, history_state, f"Responses remaining: {responses_remaining}", gr.update(visible=False), gr.update(visible=False)

    # Final turn: produce the end-of-conversation report.
    print("Generating final evaluation...")
    final_messages = [{"role": "system", "content": FINAL_EVALUATION_SYSTEM_PROMPT}] + _api_messages(history_state[1:])
    response = client.chat.completions.create(model="gpt-4o", response_format={"type": "json_object"}, messages=final_messages)
    try:
        result = json.loads(response.choices[0].message.content)
        fb_en = result.get('feedback_en', {})
        md_en = f"## Final Report (CEFR Level: {result.get('cefr_level', 'N/A')})\n### Strengths\n{fb_en.get('strengths', '')}\n### Areas for Improvement\n{fb_en.get('areas_for_improvement', '')}\n### Word-by-Word Feedback\n"
        for item in fb_en.get('word_by_word_feedback', []):
            md_en += f"- **{item['word']}**: {item['feedback']}\n"
        fb_es = result.get('feedback_es', {})
        md_es = f"## Reporte Final (Nivel MCERL: {result.get('cefr_level', 'N/A')})\n### Fortalezas\n{fb_es.get('fortalezas', '')}\n### 脕reas a Mejorar\n{fb_es.get('areas_a_mejorar', '')}\n### Retroalimentaci贸n por Palabra\n"
        for item in fb_es.get('feedback_por_palabra', []):
            md_es += f"- **{item['palabra']}**: {item['feedback']}\n"
        chat_display = _display_pairs(history_state)
        chat_display[-1] = (chat_display[-1][0], "Thank you! Your final report is now available on the right.")
        # Reset the history so the next recording starts a fresh conversation.
        return chat_display, [], "Conversation finished!", gr.update(value=md_en, visible=True), gr.update(value=md_es, visible=True)
    except Exception as e:
        print(f"Error parsing final report: {e}")
        return history_state[1:], [], "Error!", gr.update(value="Error generating report.", visible=True), gr.update(visible=False)
def run_sentence_evaluation(audio_input, reference_transcript):
    """Grade a recorded sentence against *reference_transcript* (Tab 2).

    Parameters
    ----------
    audio_input : tuple[int, np.ndarray] | None
        (sample_rate, samples) from the Gradio microphone, or None.
    reference_transcript : str
        The sentence the user was supposed to say.

    Returns
    -------
    tuple
        (overall score 0-100, CEFR level string, holistic feedback markdown,
        word-by-word markdown table with embedded reference audio).
    """
    if not api_key_found:
        raise gr.Error("OpenAI API key not found.")
    if audio_input is None or not reference_transcript:
        return 0, "N/A", "Please provide both an audio file and the reference text.", ""

    sr, y = audio_input
    temp_audio_path = "temp_audio_sentence.wav"
    sf.write(temp_audio_path, y, sr)
    try:
        word_features = extract_word_level_features(temp_audio_path)
    finally:
        # BUGFIX: remove the temp WAV (the original left it on disk).
        try:
            os.remove(temp_audio_path)
        except OSError:
            pass
    if not word_features:
        return 0, "N/A", "Could not process the audio.", ""

    prompt_data = {"reference_transcript": reference_transcript, "spoken_words": word_features}
    print("Sending detailed data to GPT-4o for sentence analysis...")
    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": SENTENCE_EVALUATION_SYSTEM_PROMPT},
            {"role": "user", "content": json.dumps(prompt_data)},
        ],
    )
    try:
        result = json.loads(response.choices[0].message.content)
        holistic_feedback_md = f"### Strengths\n{result['holistic_feedback']['strengths']}\n\n### Areas for Improvement\n{result['holistic_feedback']['areas_for_improvement']}"
        md_table = "| Reference Word | Spoken Word | Score | Feedback (EN) | Feedback (ES) | Reference Audio |\n| :--- | :--- | :---: | :--- | :--- | :---: |\n"
        # Idiom fix: the original used enumerate() but never used the index.
        for item in result['word_by_word_analysis']:
            word_to_speak = item['reference_word']
            try:
                # Per-word reference pronunciation as an inline base64 <audio> tag.
                tts = gTTS(text=word_to_speak, lang='en')
                mp3_fp = io.BytesIO()
                tts.write_to_fp(mp3_fp)
                mp3_fp.seek(0)
                audio_base64 = base64.b64encode(mp3_fp.read()).decode('utf-8')
                audio_player = f'<audio src="data:audio/mpeg;base64,{audio_base64}" controls></audio>'
            except Exception as e:
                # TTS is best-effort; show a placeholder cell on failure.
                print(f"Error al generar TTS para '{word_to_speak}': {e}")
                audio_player = "Error"
            md_table += (f"| **{item['reference_word']}** | {item['spoken_word']} | {item['word_score_100']} | {item['feedback_en']} | {item['feedback_es']} | {audio_player} |\n")
        return (result.get("overall_score_100", 0), result.get("cefr_level", "N/A"), holistic_feedback_md, md_table)
    except (json.JSONDecodeError, KeyError) as e:
        print(f"Error processing API response: {e}")
        error_msg = "The API response was not in the expected format."
        return 0, "Error", error_msg, ""
# --- 3. GRADIO UI WITH TABS (tweaks in Tab 1) ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 馃嚞馃嚙 AI English Speaking Practice & Assessment")
    with gr.Tabs():
        # --- TAB 1: AI CHAT PRACTICE ---
        with gr.TabItem("Pr谩ctica Conversacional (Chat AI)"):
            with gr.Row():
                with gr.Column(scale=2):
                    # Seeded with Alex's greeting; the `history` State stays
                    # empty until chat_interaction() adds the system prompt
                    # on the first user turn.
                    chatbot = gr.Chatbot(value=[(None, "Hi there! I'm Alex. How are you doing today?")], label="Conversation with your AI Tutor", height=500)
                    audio_in_chat = gr.Audio(sources=["microphone"], type="numpy", label="Record your response")
                    with gr.Row():
                        counter_out = gr.Textbox(value="Responses remaining: 5", label="Conversation Progress", interactive=False)
                        # Button that restarts the whole conversation.
                        new_conversation_btn = gr.Button("New Conversation")
                with gr.Column(scale=1):
                    gr.Markdown("### Final Report")
                    # Report panels stay hidden until the conversation ends.
                    feedback_en_out = gr.Markdown(label="English Feedback", visible=False)
                    feedback_es_out = gr.Markdown(label="Retroalimentaci贸n en Espa帽ol", visible=False)
            # Full OpenAI-format message history carried across turns.
            history = gr.State([])

            # Clears the mic component after each submission (chained via
            # .then below — an action, not a button).
            def clear_audio_input():
                return None

            # Resets history, chatbot display, counter, both report panels
            # and the mic — the full conversation restart.
            def clear_conversation():
                return [], [(None, "Hi there! I'm Alex. How are you doing today?")], "Responses remaining: 5", gr.update(visible=False), gr.update(visible=False), None

            # "New Conversation" button wired to the full reset above.
            new_conversation_btn.click(
                fn=clear_conversation,
                inputs=[],
                outputs=[history, chatbot, counter_out, feedback_en_out, feedback_es_out, audio_in_chat]
            )
            # Each finished recording drives one chat turn, then the mic is
            # cleared so the same clip is not resubmitted.
            audio_in_chat.stop_recording(
                fn=chat_interaction,
                inputs=[audio_in_chat, history],
                outputs=[chatbot, history, counter_out, feedback_en_out, feedback_es_out]
            ).then(
                fn=clear_audio_input,
                inputs=[],
                outputs=[audio_in_chat]
            )
        # --- TAB 2: PER-SENTENCE EVALUATION ---
        with gr.TabItem("Evaluaci贸n por Frase"):
            # NOTE(review): "鈥檚" in the fourth twister looks like mojibake for
            # an apostrophe — verify the file's encoding before shipping.
            TONGUE_TWISTERS = ["Peter Piper picked a peck of pickled peppers.", "She sells seashells by the seashore.", "How much wood would a woodchuck chuck if a woodchuck could chuck wood?", "Betty Botter bought some butter but she said the butter鈥檚 bitter.", "A proper copper coffee pot."]
            gr.Markdown("Choose a tongue twister or write your own sentence. Record yourself, and our AI examiner will provide a detailed diagnostic report.")
            tongue_twister_selector = gr.Dropdown(choices=TONGUE_TWISTERS, label="Or Choose a Tongue Twister to Practice")
            with gr.Row():
                with gr.Column(scale=1):
                    audio_in_sentence = gr.Audio(sources=["microphone"], type="numpy", label="1. Record Your Voice")
                    text_in_sentence = gr.Textbox(lines=3, label="2. Reference Sentence", value=TONGUE_TWISTERS[0])
                    submit_btn_sentence = gr.Button("Get Assessment", variant="primary")
                with gr.Column(scale=2):
                    gr.Markdown("### Assessment Summary")
                    with gr.Row():
                        score_out_sentence = gr.Number(label="Overall Score (0-100)", interactive=False)
                        level_out_sentence = gr.Textbox(label="Estimated CEFR Level", interactive=False)
                    holistic_feedback_out_sentence = gr.Markdown(label="Examiner's Feedback")
                    gr.Markdown("--- \n ### Detailed Word-by-Word Analysis")
                    word_analysis_out_sentence = gr.Markdown(label="Phonetic Breakdown")

            # Copy the selected tongue twister into the reference textbox.
            def update_text(choice): return gr.Textbox(value=choice)
            tongue_twister_selector.change(fn=update_text, inputs=tongue_twister_selector, outputs=text_in_sentence)
            submit_btn_sentence.click(fn=run_sentence_evaluation, inputs=[audio_in_sentence, text_in_sentence], outputs=[score_out_sentence, level_out_sentence, holistic_feedback_out_sentence, word_analysis_out_sentence])
if __name__ == "__main__":
    # Launch only when the API key was detected at import time; otherwise
    # fail loudly instead of serving a broken UI.
    if api_key_found:
        demo.launch(debug=True)
    else:
        print("\nFATAL: OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")