# NOTE: the "Spaces: Sleeping" lines were Hugging Face page-status residue
# from the web scrape, not part of the source file; removed.
| # app.py | |
| import gradio as gr | |
| import os | |
| from openai import OpenAI | |
| import json | |
| import librosa | |
| import numpy as np | |
| import soundfile as sf | |
| import whisper | |
| import pandas as pd | |
# --- 0. INITIAL SETUP ---
# BUG FIX: the original wrapped OpenAI(...) in `except TypeError`, but the
# openai client raises OpenAIError (not TypeError) when the key is missing,
# so `api_key_found` could never become False — the app crashed at import
# instead. Check the environment variable explicitly.
_openai_api_key = os.environ.get("OPENAI_API_KEY")
api_key_found = _openai_api_key is not None
# `client` stays None when the key is absent; run_evaluation() guards on
# api_key_found before ever touching it.
client = OpenAI(api_key=_openai_api_key) if api_key_found else None

print("Loading Whisper model...")
# "base" model on CPU: small footprint; fp16 is disabled at transcribe() time.
whisper_model = whisper.load_model("base", device="cpu")
print("Whisper model loaded.")
# --- EXPERT EXAMINER PROMPT ---
# System prompt sent verbatim to the chat model by run_evaluation(). It pins
# the model to an English-only, JSON-only response whose schema (overall
# score, CEFR level, holistic feedback, word-by-word table) is exactly what
# the response-parsing code and the Gradio outputs expect — do not edit the
# schema here without updating that parsing code.
SYSTEM_PROMPT = """
You are an expert English language examiner specializing in phonetics and accent reduction for ESL learners. Your task is to provide a detailed, diagnostic assessment of a student's spoken English based on a reference sentence and detailed word-level audio analysis.
**Input You Will Receive:**
You will be given a JSON object containing:
1. `reference_transcript`: The correct sentence the student was supposed to say.
2. `spoken_words`: A list of words detected by Whisper, each with:
- `word`: The word as transcribed by Whisper.
- `start`: The start time of the word in seconds.
- `end`: The end time of the word in seconds.
- `energy`: A numeric value (RMS) indicating the pronunciation's energy/loudness.
**Your Analysis and Output:**
Your entire response MUST be in English. You must return a single, valid JSON object with the following structure. Do not include any text outside of this JSON object.
**JSON Output Structure:**
{
"overall_score_100": integer,
"cefr_level": "string (A1, A2, B1, B2, C1, or C2)",
"holistic_feedback": {
"strengths": "string (A paragraph in English summarizing the student's strong points in pronunciation, rhythm, and clarity.)",
"areas_for_improvement": "string (A paragraph in English detailing the main patterns of error and what to focus on.)"
},
"word_by_word_analysis": [
{
"reference_word": "string (The word from the correct sentence)",
"spoken_word": "string (The word Whisper transcribed, or 'OMITTED')",
"word_score_100": integer,
"correct_ipa": "string (The correct IPA transcription)",
"feedback": "string (Specific phonetic feedback for this word. If correct, simply state 'Excellent pronunciation.')"
}
]
}
"""
# --- 1. DETAILED FEATURE EXTRACTION (WHISPER + LIBROSA) ---
def extract_word_level_features(audio_path):
    """Transcribe an audio file and compute per-word timing/energy features.

    Uses Whisper for word-level timestamps and librosa to slice the waveform
    and measure each word's RMS energy.

    Parameters
    ----------
    audio_path : str
        Path to an audio file readable by both librosa and Whisper.

    Returns
    -------
    list[dict]
        One dict per detected word with keys ``word`` (stripped text),
        ``start``/``end`` (seconds, rounded to 2 decimals) and ``energy``
        (mean RMS of the word's slice, rounded to 4 decimals). An empty
        list signals failure or silence to the caller.
    """
    try:
        # 16 kHz matches Whisper's native sample rate, so timestamp-to-sample
        # conversion below is consistent with the transcription.
        y, sr = librosa.load(audio_path, sr=16000)
        result = whisper_model.transcribe(audio_path, word_timestamps=True, fp16=False)
        # BUG FIX: the original read only result["segments"][0]["words"],
        # silently dropping every word after the first segment boundary
        # (i.e. after any pause). Flatten words from ALL segments.
        word_segments = [
            word
            for segment in result.get("segments", [])
            for word in segment.get("words", [])
        ]
        features_list = []
        for word_info in word_segments:
            start_sample = int(word_info['start'] * sr)
            end_sample = int(word_info['end'] * sr)
            word_audio = y[start_sample:end_sample]
            # Guard: rounding can yield an empty slice, and np.mean over an
            # empty RMS array would produce NaN and poison the JSON payload.
            if word_audio.size == 0:
                rms_energy = 0.0
            else:
                # Root Mean Square (RMS) energy as a loudness proxy.
                rms_energy = np.mean(librosa.feature.rms(y=word_audio))
            features_list.append({
                "word": word_info['word'].strip(),
                "start": round(word_info['start'], 2),
                "end": round(word_info['end'], 2),
                "energy": round(float(rms_energy), 4),
            })
        return features_list
    except Exception as e:
        # Best-effort boundary: the caller treats [] as "could not process
        # the audio" and shows a friendly message instead of a traceback.
        print(f"Error during feature extraction: {e}")
        return []
# --- 2. MAIN EVALUATION FUNCTION ---
def run_evaluation(audio_input, reference_transcript):
    """Gradio click handler: score a recording against a reference sentence.

    Parameters
    ----------
    audio_input : tuple[int, numpy.ndarray] | None
        ``(sample_rate, samples)`` as produced by ``gr.Audio(type="numpy")``.
    reference_transcript : str
        The sentence the student was supposed to say.

    Returns
    -------
    tuple
        ``(score, cefr_level, feedback_markdown, word_table)`` matching the
        four Gradio output components; ``word_table`` is None on errors.

    Raises
    ------
    gr.Error
        When no OpenAI API key was configured at startup.
    """
    if not api_key_found:
        raise gr.Error("OpenAI API key not found.")
    if audio_input is None or not reference_transcript:
        return 0, "N/A", "Please provide both an audio file and the reference text.", None

    sr, y = audio_input
    temp_audio_path = "temp_audio.wav"
    sf.write(temp_audio_path, y, sr)
    try:
        # Step 1: Extract detailed features using Whisper and Librosa.
        word_features = extract_word_level_features(temp_audio_path)
    finally:
        # BUG FIX: the original never deleted temp_audio.wav, leaking a file
        # (and stale audio) on every request. Best-effort cleanup.
        try:
            os.remove(temp_audio_path)
        except OSError:
            pass
    if not word_features:
        return 0, "N/A", "Could not process the audio. Please try recording again.", None

    # Step 2: Construct the detailed prompt for the OpenAI API.
    prompt_data = {
        "reference_transcript": reference_transcript,
        "spoken_words": word_features
    }
    print("Sending detailed data to GPT-4o for analysis...")
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            # Forces the model to emit a single valid JSON object.
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": json.dumps(prompt_data)}
            ]
        )
    except Exception as e:
        # BUG FIX: an API/network failure previously escaped as a raw
        # traceback; return the same 4-tuple shape the UI expects instead.
        print(f"OpenAI API call failed: {e}")
        return 0, "Error", "The request to the AI examiner failed. Please try again.", None

    # Step 3: Process the API response and format it for display.
    try:
        result = json.loads(response.choices[0].message.content)
        # Format the holistic report as Markdown for the gr.Markdown output.
        holistic_feedback_md = f"### Strengths\n{result['holistic_feedback']['strengths']}\n\n"
        holistic_feedback_md += f"### Areas for Improvement\n{result['holistic_feedback']['areas_for_improvement']}"
        # Pandas DataFrame gives a clean tabular word-by-word display.
        word_analysis_df = pd.DataFrame(result['word_by_word_analysis'])
        return (
            result.get("overall_score_100", 0),
            result.get("cefr_level", "N/A"),
            holistic_feedback_md,
            gr.DataFrame(value=word_analysis_df, headers=["Reference Word", "Spoken Word", "Score", "Correct IPA", "Feedback"], interactive=False)
        )
    except (json.JSONDecodeError, KeyError) as e:
        # The model violated the JSON schema pinned by SYSTEM_PROMPT.
        print(f"Error processing API response: {e}")
        error_msg = "The API response was not in the expected format. Please try again."
        return 0, "Error", error_msg, None
# --- 3. GRADIO INTERFACE ---
# Declarative UI: a two-column layout with recording inputs on the left and
# the assessment report on the right, wired to run_evaluation() on click.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🇬🇧 Expert Pronunciation Assessment")
    gr.Markdown("Record yourself saying the reference sentence. Our AI examiner will provide a detailed diagnostic report on your performance.")
    # Default reference sentence shown in the textbox (editable by the user).
    frase_ejemplo = "The rainbow is a division of white light into many beautiful colors."
    with gr.Row():
        with gr.Column(scale=1):
            # type="numpy" delivers (sample_rate, ndarray) to run_evaluation.
            audio_in = gr.Audio(sources=["microphone"], type="numpy", label="1. Record Your Voice")
            text_in = gr.Textbox(lines=3, label="2. Reference Sentence", value=frase_ejemplo)
            submit_btn = gr.Button("Get Assessment", variant="primary")
        with gr.Column(scale=2):
            gr.Markdown("### Assessment Summary")
            with gr.Row():
                score_out = gr.Number(label="Overall Score (0-100)", interactive=False)
                level_out = gr.Textbox(label="Estimated CEFR Level", interactive=False)
            holistic_feedback_out = gr.Markdown(label="Examiner's Feedback")
            gr.Markdown("--- \n ### Detailed Word-by-Word Analysis")
            # Headers mirror the JSON schema enforced by SYSTEM_PROMPT.
            word_analysis_out = gr.DataFrame(headers=["Reference Word", "Spoken Word", "Score", "Correct IPA", "Feedback"], label="Phonetic Breakdown")
    # Output order must match run_evaluation's 4-tuple return.
    submit_btn.click(
        fn=run_evaluation,
        inputs=[audio_in, text_in],
        outputs=[score_out, level_out, holistic_feedback_out, word_analysis_out]
    )
if __name__ == "__main__":
    # Only start the server when a usable API key was detected at import time.
    if api_key_found:
        # debug=True surfaces handler tracebacks in the console/UI.
        demo.launch(debug=True)
    else:
        print("\nFATAL: OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")