# app.py
import gradio as gr
import os
from openai import OpenAI
import json
import librosa
import numpy as np
import soundfile as sf
import whisper
import pandas as pd
from gtts import gTTS
import re
import base64
import io
# --- 0. INITIAL SETUP ---
# BUGFIX: the modern OpenAI client raises openai.OpenAIError (not TypeError)
# when no API key is available, so the original try/except TypeError could
# never set api_key_found to False.  Check the environment variable directly.
_api_key = os.environ.get("OPENAI_API_KEY")
api_key_found = _api_key is not None
# client stays None when the key is missing; every handler guards on
# api_key_found before touching it.
client = OpenAI(api_key=_api_key) if api_key_found else None

print("Loading Whisper for transcription...")
# Force CPU so the app also runs on hosts without a GPU.
whisper_model = whisper.load_model("base", device="cpu")
print("Whisper model loaded.")
# --- 1. PROMPT DEFINITIONS FOR THE AI ---

# Persona prompt for the conversational-practice tab.  Used on every
# intermediate turn; the model must answer in plain English, never JSON.
CONVERSATION_SYSTEM_PROMPT = """
You are a friendly and encouraging English language tutor named Alex.
A student will speak to you. Your task is to keep a natural, simple conversation going.
1. Briefly analyze the user's previous response to estimate their CEFR level (A1, A2, B1, etc.).
2. Formulate a simple, open-ended follow-up question that is appropriate for THAT estimated level.
3. Your entire response must be a single, short paragraph in natural, conversational English. DO NOT use JSON.
"""

# One-shot prompt used after the final user turn to produce the bilingual
# (EN/ES) end-of-conversation report as a strict JSON object.
FINAL_EVALUATION_SYSTEM_PROMPT = """
You are an expert English language examiner providing a final report. Analyze the entire conversation history provided.
Your task is to return a single, valid JSON object with the following structure. Do not include any text outside this JSON object.
JSON Output Structure:
{
"cefr_level": "string (e.g., A2, B1)",
"feedback_en": { "strengths": "string", "areas_for_improvement": "string", "word_by_word_feedback": [{"word": "string", "feedback": "string"}] },
"feedback_es": { "fortalezas": "string", "areas_a_mejorar": "string", "feedback_por_palabra": [{"palabra": "string", "feedback": "string"}] }
}
"""

# Prompt for the per-sentence pronunciation tab: consumes the reference
# sentence plus word-level timestamps/energy features and returns a
# strict-JSON phonetic diagnosis.
SENTENCE_EVALUATION_SYSTEM_PROMPT = """
You are an expert English language examiner specializing in phonetics. Your task is to provide a detailed, diagnostic assessment of a student's spoken English based on a reference sentence and detailed word-level audio analysis.
Input You Will Receive: A JSON object with `reference_transcript` and a list of `spoken_words` with timestamps and energy.
Your entire response MUST be a single, valid JSON object with the following structure. Do not include any text outside this JSON object.
JSON Output Structure:
{
"overall_score_100": integer,
"cefr_level": "string (A1, A2, B1, B2, C1, or C2)",
"holistic_feedback": { "strengths": "string", "areas_for_improvement": "string" },
"word_by_word_analysis": [ { "reference_word": "string", "spoken_word": "string", "word_score_100": integer, "correct_ipa": "string", "feedback_en": "string", "feedback_es": "string" } ]
}
"""
# --- 2. FUNCIONES LÓGICAS ---
def extract_word_level_features(audio_path):
    """Transcribe ``audio_path`` with word timestamps and attach a per-word
    RMS-energy estimate.

    Args:
        audio_path: path to a WAV file readable by librosa/Whisper.

    Returns:
        A list of dicts ``{"word", "start", "end", "energy"}`` (times in
        seconds, rounded to 2 dp; energy rounded to 4 dp), or an empty list
        on any failure so callers can degrade gracefully.
    """
    try:
        # 16 kHz matches Whisper's native sample rate.
        y, sr = librosa.load(audio_path, sr=16000)
        result = whisper_model.transcribe(audio_path, word_timestamps=True, fp16=False)
        features_list = []
        # BUGFIX: the original read only result["segments"][0]["words"],
        # silently dropping every word after the first segment in longer
        # recordings.  Aggregate words across ALL segments instead.
        for segment in result.get("segments", []):
            for word_info in segment.get("words", []):
                start_sample = int(word_info["start"] * sr)
                end_sample = int(word_info["end"] * sr)
                word_audio = y[start_sample:end_sample]
                # Mean RMS energy over the word's samples; 0 for empty slices.
                rms_energy = (
                    np.mean(librosa.feature.rms(y=word_audio)) if len(word_audio) > 0 else 0
                )
                features_list.append({
                    "word": word_info["word"].strip(),
                    "start": round(word_info["start"], 2),
                    "end": round(word_info["end"], 2),
                    "energy": round(float(rms_energy), 4),
                })
        return features_list
    except Exception as e:
        # Best-effort helper: extraction failures must not crash the UI.
        print(f"Error during feature extraction: {e}")
        return []
def chat_interaction(audio_input, history_state):
    """Run one turn of the conversational-practice tab.

    Transcribes the user's recording, then either continues the chat
    (turns 1-4) or generates the final bilingual report (turn 5).

    Args:
        audio_input: ``(sample_rate, ndarray)`` from gr.Audio, or None.
        history_state: OpenAI-style message dicts (plus a UI-only
            "display_content" key); index 0 is the system prompt once the
            conversation has started.

    Returns:
        (chat_display, history_state, counter_text, feedback_en_update,
        feedback_es_update) matching the registered Gradio outputs.
    """
    if not api_key_found:
        raise gr.Error("OpenAI API key not found.")

    def render(history):
        # Chatbot tuple format: (user_msg, None) / (None, assistant_msg).
        return [
            (msg["display_content"], None) if msg["role"] == "user"
            else (None, msg["display_content"])
            for msg in history[1:]
        ]

    def api_messages(messages):
        # Strip the UI-only "display_content" key before calling the API;
        # the original sent it along with every message.
        return [{"role": m["role"], "content": m["content"]} for m in messages]

    if audio_input is None:
        # Nothing recorded: refresh the current view without using a turn.
        user_turns = len(history_state[1:]) // 2 if history_state else 0
        responses_remaining = 5 - user_turns
        # BUGFIX: the original rebuilt this display from msg['content'] with
        # a (user, assistant) pairing, dropping embedded audio players and
        # mismatching the rendering used everywhere else.
        chat_display = render(history_state) if history_state else []
        return (chat_display, history_state,
                f"Responses remaining: {responses_remaining}",
                gr.update(visible=False), gr.update(visible=False))

    sr, y = audio_input
    temp_audio_path = "temp_audio_chat.wav"
    sf.write(temp_audio_path, y, sr)
    # BUGFIX: the original leaked the file handle via a bare open().
    with open(temp_audio_path, "rb") as audio_file:
        user_text = client.audio.transcriptions.create(
            model="whisper-1", file=audio_file
        ).text

    if not history_state:
        history_state = [{"role": "system", "content": CONVERSATION_SYSTEM_PROMPT}]
    history_state.append({"role": "user", "content": user_text, "display_content": user_text})

    user_turns = (len(history_state) - 1) // 2
    responses_remaining = 5 - user_turns

    if user_turns < 5:
        response = client.chat.completions.create(
            model="gpt-4o", messages=api_messages(history_state), temperature=0.7
        )
        ai_response_text = response.choices[0].message.content
        try:
            tts = gTTS(text=ai_response_text, lang='en')
            mp3_fp = io.BytesIO()
            tts.write_to_fp(mp3_fp)
            mp3_fp.seek(0)
            audio_base64 = base64.b64encode(mp3_fp.read()).decode('utf-8')
            # BUGFIX: the original assigned an EMPTY f-string here, so the
            # generated base64 TTS audio was never rendered.  Embed an
            # inline HTML5 player instead.
            audio_player = f'<audio controls src="data:audio/mpeg;base64,{audio_base64}"></audio>'
            ai_display_content = f"{ai_response_text} {audio_player}"
        except Exception as e:
            print(f"Error al generar TTS para la respuesta del chat: {e}")
            ai_display_content = ai_response_text
        history_state.append({"role": "assistant", "content": ai_response_text,
                              "display_content": ai_display_content})
        return (render(history_state), history_state,
                f"Responses remaining: {responses_remaining}",
                gr.update(visible=False), gr.update(visible=False))

    # Turn 5: generate the final bilingual evaluation report.
    print("Generating final evaluation...")
    final_messages = ([{"role": "system", "content": FINAL_EVALUATION_SYSTEM_PROMPT}]
                      + api_messages(history_state[1:]))
    response = client.chat.completions.create(
        model="gpt-4o", response_format={"type": "json_object"}, messages=final_messages
    )
    try:
        result = json.loads(response.choices[0].message.content)
        fb_en = result.get('feedback_en', {})
        md_en = (f"## Final Report (CEFR Level: {result.get('cefr_level', 'N/A')})\n"
                 f"### Strengths\n{fb_en.get('strengths', '')}\n"
                 f"### Areas for Improvement\n{fb_en.get('areas_for_improvement', '')}\n"
                 "### Word-by-Word Feedback\n")
        for item in fb_en.get('word_by_word_feedback', []):
            md_en += f"- **{item['word']}**: {item['feedback']}\n"
        fb_es = result.get('feedback_es', {})
        md_es = (f"## Reporte Final (Nivel MCERL: {result.get('cefr_level', 'N/A')})\n"
                 f"### Fortalezas\n{fb_es.get('fortalezas', '')}\n"
                 f"### Áreas a Mejorar\n{fb_es.get('areas_a_mejorar', '')}\n"
                 "### Retroalimentación por Palabra\n")
        for item in fb_es.get('feedback_por_palabra', []):
            md_es += f"- **{item['palabra']}**: {item['feedback']}\n"
        chat_display = render(history_state)
        chat_display[-1] = (chat_display[-1][0],
                            "Thank you! Your final report is now available on the right.")
        # Reset the stored history so the next conversation starts cleanly.
        return (chat_display, [], "Conversation finished!",
                gr.update(value=md_en, visible=True),
                gr.update(value=md_es, visible=True))
    except Exception as e:
        print(f"Error parsing final report: {e}")
        # BUGFIX: the original returned raw message dicts as the chat display;
        # render them in the tuple format the Chatbot component expects.
        return (render(history_state), [], "Error!",
                gr.update(value="Error generating report.", visible=True),
                gr.update(visible=False))
def run_sentence_evaluation(audio_input, reference_transcript):
    """Score a recorded sentence against ``reference_transcript``.

    Extracts word-level features from the recording, sends them with the
    reference text to GPT-4o, and formats the JSON verdict for the UI.

    Args:
        audio_input: ``(sample_rate, ndarray)`` from gr.Audio, or None.
        reference_transcript: the sentence the student was asked to read.

    Returns:
        (overall_score, cefr_level, holistic_feedback_md, word_table_md)
        matching the four registered Gradio outputs; sentinel values on
        missing input or failure.
    """
    if not api_key_found:
        raise gr.Error("OpenAI API key not found.")
    if audio_input is None or not reference_transcript:
        return 0, "N/A", "Please provide both an audio file and the reference text.", ""

    sr, y = audio_input
    temp_audio_path = "temp_audio_sentence.wav"
    sf.write(temp_audio_path, y, sr)
    word_features = extract_word_level_features(temp_audio_path)
    if not word_features:
        return 0, "N/A", "Could not process the audio.", ""

    prompt_data = {"reference_transcript": reference_transcript, "spoken_words": word_features}
    print("Sending detailed data to GPT-4o for sentence analysis...")
    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": SENTENCE_EVALUATION_SYSTEM_PROMPT},
            {"role": "user", "content": json.dumps(prompt_data)},
        ],
    )
    try:
        result = json.loads(response.choices[0].message.content)
        holistic_feedback_md = (
            f"### Strengths\n{result['holistic_feedback']['strengths']}\n\n"
            f"### Areas for Improvement\n{result['holistic_feedback']['areas_for_improvement']}"
        )
        md_table = ("| Reference Word | Spoken Word | Score | Feedback (EN) | Feedback (ES) | Reference Audio |\n"
                    "| :--- | :--- | :---: | :--- | :--- | :---: |\n")
        # Note: unused enumerate() index removed from the original loop.
        for item in result['word_by_word_analysis']:
            word_to_speak = item['reference_word']
            try:
                tts = gTTS(text=word_to_speak, lang='en')
                mp3_fp = io.BytesIO()
                tts.write_to_fp(mp3_fp)
                mp3_fp.seek(0)
                audio_base64 = base64.b64encode(mp3_fp.read()).decode('utf-8')
                # BUGFIX: the original assigned an EMPTY f-string here, so
                # the per-word reference audio was never rendered.
                audio_player = f'<audio controls src="data:audio/mpeg;base64,{audio_base64}"></audio>'
            except Exception as e:
                print(f"Error al generar TTS para '{word_to_speak}': {e}")
                audio_player = "Error"
            md_table += (f"| **{item['reference_word']}** | {item['spoken_word']} | "
                         f"{item['word_score_100']} | {item['feedback_en']} | "
                         f"{item['feedback_es']} | {audio_player} |\n")
        return (result.get("overall_score_100", 0), result.get("cefr_level", "N/A"),
                holistic_feedback_md, md_table)
    except (json.JSONDecodeError, KeyError) as e:
        print(f"Error processing API response: {e}")
        return 0, "Error", "The API response was not in the expected format.", ""
# --- 3. GRADIO INTERFACE WITH TABS ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🇬🇧 AI English Speaking Practice & Assessment")
    with gr.Tabs():
        # --- TAB 1: CONVERSATIONAL PRACTICE (Chat AI) ---
        with gr.TabItem("Práctica Conversacional (Chat AI)"):
            with gr.Row():
                with gr.Column(scale=2):
                    # Seeded with the tutor's greeting so the chat never starts empty.
                    chatbot = gr.Chatbot(value=[(None, "Hi there! I'm Alex. How are you doing today?")], label="Conversation with your AI Tutor", height=500)
                    audio_in_chat = gr.Audio(sources=["microphone"], type="numpy", label="Record your response")
                    with gr.Row():
                        counter_out = gr.Textbox(value="Responses remaining: 5", label="Conversation Progress", interactive=False)
                        # Button that restarts the whole conversation.
                        new_conversation_btn = gr.Button("New Conversation")
                with gr.Column(scale=1):
                    # Final-report panels stay hidden until turn 5 completes.
                    gr.Markdown("### Final Report")
                    feedback_en_out = gr.Markdown(label="English Feedback", visible=False)
                    feedback_es_out = gr.Markdown(label="Retroalimentación en Español", visible=False)
            # Per-session message history (OpenAI-style dicts); [] = not started.
            history = gr.State([])

            # Clears the audio widget after a recording is submitted
            # (an action chained after chat_interaction, not a button).
            def clear_audio_input():
                return None

            # Resets history, chat view, counter, report panels and mic
            # back to their initial state.
            def clear_conversation():
                return [], [(None, "Hi there! I'm Alex. How are you doing today?")], "Responses remaining: 5", gr.update(visible=False), gr.update(visible=False), None

            new_conversation_btn.click(
                fn=clear_conversation,
                inputs=[],
                outputs=[history, chatbot, counter_out, feedback_en_out, feedback_es_out, audio_in_chat]
            )
            # Each finished recording drives one chat turn, then the mic is cleared.
            audio_in_chat.stop_recording(
                fn=chat_interaction,
                inputs=[audio_in_chat, history],
                outputs=[chatbot, history, counter_out, feedback_en_out, feedback_es_out]
            ).then(
                fn=clear_audio_input,
                inputs=[],
                outputs=[audio_in_chat]
            )
        # --- TAB 2: PER-SENTENCE EVALUATION ---
        with gr.TabItem("Evaluación por Frase"):
            TONGUE_TWISTERS = ["Peter Piper picked a peck of pickled peppers.", "She sells seashells by the seashore.", "How much wood would a woodchuck chuck if a woodchuck could chuck wood?", "Betty Botter bought some butter but she said the butter’s bitter.", "A proper copper coffee pot."]
            gr.Markdown("Choose a tongue twister or write your own sentence. Record yourself, and our AI examiner will provide a detailed diagnostic report.")
            tongue_twister_selector = gr.Dropdown(choices=TONGUE_TWISTERS, label="Or Choose a Tongue Twister to Practice")
            with gr.Row():
                with gr.Column(scale=1):
                    audio_in_sentence = gr.Audio(sources=["microphone"], type="numpy", label="1. Record Your Voice")
                    text_in_sentence = gr.Textbox(lines=3, label="2. Reference Sentence", value=TONGUE_TWISTERS[0])
                    submit_btn_sentence = gr.Button("Get Assessment", variant="primary")
                with gr.Column(scale=2):
                    gr.Markdown("### Assessment Summary")
                    with gr.Row():
                        score_out_sentence = gr.Number(label="Overall Score (0-100)", interactive=False)
                        level_out_sentence = gr.Textbox(label="Estimated CEFR Level", interactive=False)
                    holistic_feedback_out_sentence = gr.Markdown(label="Examiner's Feedback")
                    gr.Markdown("--- \n ### Detailed Word-by-Word Analysis")
                    word_analysis_out_sentence = gr.Markdown(label="Phonetic Breakdown")

            # Copies the selected tongue twister into the reference textbox.
            def update_text(choice): return gr.Textbox(value=choice)
            tongue_twister_selector.change(fn=update_text, inputs=tongue_twister_selector, outputs=text_in_sentence)
            submit_btn_sentence.click(fn=run_sentence_evaluation, inputs=[audio_in_sentence, text_in_sentence], outputs=[score_out_sentence, level_out_sentence, holistic_feedback_out_sentence, word_analysis_out_sentence])
if __name__ == "__main__":
    # Launch the UI only when an API key is configured; otherwise abort
    # with an explanatory message.
    if api_key_found:
        demo.launch(debug=True)
    else:
        print("\nFATAL: OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")