mramirez2001 committed on
Commit
2db2b9e
verified
1 Parent(s): 0c8505c

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -76
app.py CHANGED
@@ -9,11 +9,9 @@ import numpy as np
9
  import soundfile as sf
10
  import whisper
11
  import pandas as pd
12
- # --- CAMBIO: Importamos la librer铆a gTTS ---
13
  from gtts import gTTS
14
 
15
  # --- 0. CONFIGURACI脫N INICIAL ---
16
- # (Sin cambios)
17
  try:
18
  client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
19
  api_key_found = True
@@ -26,125 +24,156 @@ print("Whisper model loaded.")
26
 
27
 
28
  # --- 1. DEFINICI脫N DE PROMPTS PARA LA IA ---
29
- # (Sin cambios, puedes copiar tus prompts aqu铆)
30
- CONVERSATION_SYSTEM_PROMPT = "..."
31
- FINAL_EVALUATION_SYSTEM_PROMPT = "..."
32
- SENTENCE_EVALUATION_SYSTEM_PROMPT = "..." # El que ya tienes que pide feedback biling眉e
33
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  # --- 2. FUNCIONES L脫GICAS ---
36
 
37
- # (La funci贸n 'extract_word_level_features' y 'chat_interaction' se mantienen igual)
38
  def extract_word_level_features(audio_path):
39
- # ... (c贸digo de la funci贸n sin cambios)
40
- pass
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  def chat_interaction(audio_input, history_state):
43
- # ... (c贸digo de la funci贸n sin cambios)
44
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- # --- CAMBIO: La funci贸n de evaluaci贸n de frase ahora genera audios de referencia ---
47
  def run_sentence_evaluation(audio_input, reference_transcript):
48
  if not api_key_found: raise gr.Error("OpenAI API key not found.")
49
  if audio_input is None or not reference_transcript:
50
  return 0, "N/A", "Please provide both an audio file and the reference text.", None
51
-
52
- # --- Procesamiento del audio del usuario (sin cambios) ---
53
  sr, y = audio_input; temp_audio_path = "temp_audio_sentence.wav"; sf.write(temp_audio_path, y, sr)
54
  word_features = extract_word_level_features(temp_audio_path)
55
  if not word_features:
56
  return 0, "N/A", "Could not process the audio.", None
57
  prompt_data = {"reference_transcript": reference_transcript, "spoken_words": word_features}
58
-
59
  print("Sending detailed data to GPT-4o for sentence analysis...")
60
  response = client.chat.completions.create(model="gpt-4o", response_format={"type": "json_object"}, messages=[{"role": "system", "content": SENTENCE_EVALUATION_SYSTEM_PROMPT}, {"role": "user", "content": json.dumps(prompt_data)}])
61
-
62
  try:
63
  result = json.loads(response.choices[0].message.content)
64
  holistic_feedback_md = f"### Strengths\n{result['holistic_feedback']['strengths']}\n\n### Areas for Improvement\n{result['holistic_feedback']['areas_for_improvement']}"
65
  word_analysis_df = pd.DataFrame(result['word_by_word_analysis'])
66
-
67
- # --- NUEVO: Generar audio de referencia para cada palabra ---
68
- print("Generando audios de referencia...")
69
  reference_audio_paths = []
70
- # Crear una carpeta temporal para los audios de referencia si no existe
71
- os.makedirs("reference_audio", exist_ok=True)
72
-
73
  for index, row in word_analysis_df.iterrows():
74
- word_to_speak = row['reference_word']
75
- audio_path = f"reference_audio/{index}_{word_to_speak.lower()}.mp3"
76
  try:
77
- tts = gTTS(text=word_to_speak, lang='en')
78
- tts.save(audio_path)
79
- reference_audio_paths.append(audio_path)
80
- except Exception as e:
81
- print(f"Error al generar TTS para '{word_to_speak}': {e}")
82
- reference_audio_paths.append(None)
83
-
84
  word_analysis_df['reference_audio'] = reference_audio_paths
85
-
86
- # Seleccionar y renombrar columnas para la tabla de Gradio
87
- df_for_display = word_analysis_df[[
88
- 'reference_word', 'spoken_word', 'word_score_100',
89
- 'feedback_en', 'feedback_es', 'reference_audio'
90
- ]]
91
-
92
- return (
93
- result.get("overall_score_100", 0),
94
- result.get("cefr_level", "N/A"),
95
- holistic_feedback_md,
96
- gr.DataFrame(value=df_for_display)
97
- )
98
  except (json.JSONDecodeError, KeyError) as e:
99
  print(f"Error processing API response: {e}"); error_msg = "The API response was not in the expected format."
100
  return 0, "Error", error_msg, None
101
 
102
 
103
- # --- 3. INTERFAZ DE GRADIO CON PESTA脩AS (Con la tabla actualizada) ---
104
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
105
  gr.Markdown("# 馃嚞馃嚙 AI English Speaking Practice & Assessment")
106
  with gr.Tabs():
107
- # --- PESTA脩A 1: CHAT AI (sin cambios) ---
108
  with gr.TabItem("Pr谩ctica Conversacional (Chat AI)"):
109
- # ... (c贸digo de la interfaz del chatbot sin cambios)
110
- pass
 
 
 
 
 
 
111
 
112
  # --- PESTA脩A 2: EVALUACI脫N POR FRASE ---
113
  with gr.TabItem("Evaluaci贸n por Frase"):
114
- TONGUE_TWISTERS = ["Peter Piper picked a peck of pickled peppers.", "She sells seashells by the seashore.", "How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
115
- gr.Markdown("Choose a tongue twister or write your own sentence...")
116
  tongue_twister_selector = gr.Dropdown(choices=TONGUE_TWISTERS, label="Or Choose a Tongue Twister to Practice")
117
-
118
  with gr.Row():
119
  with gr.Column(scale=1):
120
- # ... (componentes de entrada sin cambios)
121
- audio_in_sentence = gr.Audio(...)
122
- text_in_sentence = gr.Textbox(...)
123
- submit_btn_sentence = gr.Button(...)
124
-
125
  with gr.Column(scale=2):
126
- # ... (componentes de resumen sin cambios)
127
- score_out_sentence = gr.Number(...)
128
- level_out_sentence = gr.Textbox(...)
129
- holistic_feedback_out_sentence = gr.Markdown(...)
130
-
131
  gr.Markdown("--- \n ### Detailed Word-by-Word Analysis")
132
-
133
- # --- CAMBIO: Actualizamos las cabeceras de la tabla para incluir el audio ---
134
- word_analysis_out_sentence = gr.DataFrame(
135
- headers=["Reference Word", "Spoken Word", "Score (0-100)", "Feedback (English)", "Feedback (Espa帽ol)", "Reference Audio"],
136
- label="Phonetic Breakdown",
137
- wrap=True
138
- )
139
-
140
  def update_text(choice): return gr.Textbox(value=choice)
141
  tongue_twister_selector.change(fn=update_text, inputs=tongue_twister_selector, outputs=text_in_sentence)
142
-
143
- submit_btn_sentence.click(
144
- fn=run_sentence_evaluation,
145
- inputs=[audio_in_sentence, text_in_sentence],
146
- outputs=[score_out_sentence, level_out_sentence, holistic_feedback_out_sentence, word_analysis_out_sentence]
147
- )
148
 
149
  if __name__ == "__main__":
150
  if not api_key_found: print("\nFATAL: OpenAI API key not found.")
 
9
  import soundfile as sf
10
  import whisper
11
  import pandas as pd
 
12
  from gtts import gTTS
13
 
14
  # --- 0. CONFIGURACIÓN INICIAL ---
 
15
  try:
16
  client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
17
  api_key_found = True
 
24
 
25
 
26
# --- 1. SYSTEM PROMPT DEFINITIONS FOR THE AI ---
# (Complete prompts, one per feature: chat turns, final chat report,
#  and per-sentence phonetic evaluation.)

# Persona prompt for the conversational tab: the model replies in plain
# conversational English (explicitly NOT JSON) and adapts its follow-up
# question to the student's estimated CEFR level.
CONVERSATION_SYSTEM_PROMPT = """
You are a friendly and encouraging English language tutor named Alex.
A student will speak to you. Your task is to keep a natural, simple conversation going.
1. Briefly analyze the user's previous response to estimate their CEFR level (A1, A2, B1, etc.).
2. Formulate a simple, open-ended follow-up question that is appropriate for THAT estimated level.
3. Your entire response must be a single, short paragraph in natural, conversational English. DO NOT use JSON.
"""

# End-of-conversation report prompt: used with response_format json_object;
# the keys below are consumed by chat_interaction (feedback_en / feedback_es).
FINAL_EVALUATION_SYSTEM_PROMPT = """
You are an expert English language examiner providing a final report. Analyze the entire conversation history provided.
Your task is to return a single, valid JSON object with the following structure. Do not include any text outside this JSON object.
JSON Output Structure:
{
"cefr_level": "string (e.g., A2, B1)",
"feedback_en": { "strengths": "string", "areas_for_improvement": "string", "word_by_word_feedback": [{"word": "string", "feedback": "string"}] },
"feedback_es": { "fortalezas": "string", "areas_a_mejorar": "string", "feedback_por_palabra": [{"palabra": "string", "feedback": "string"}] }
}
"""

# Per-sentence phonetic assessment prompt: run_sentence_evaluation relies on
# the exact keys overall_score_100, cefr_level, holistic_feedback and
# word_by_word_analysis in the returned JSON.
SENTENCE_EVALUATION_SYSTEM_PROMPT = """
You are an expert English language examiner specializing in phonetics. Your task is to provide a detailed, diagnostic assessment of a student's spoken English based on a reference sentence and detailed word-level audio analysis.
Input You Will Receive: A JSON object with `reference_transcript` and a list of `spoken_words` with timestamps and energy.
Your entire response MUST be a single, valid JSON object with the following structure. Do not include any text outside this JSON object.
JSON Output Structure:
{
"overall_score_100": integer,
"cefr_level": "string (A1, A2, B1, B2, C1, or C2)",
"holistic_feedback": { "strengths": "string", "areas_for_improvement": "string" },
"word_by_word_analysis": [ { "reference_word": "string", "spoken_word": "string", "word_score_100": integer, "correct_ipa": "string", "feedback_en": "string", "feedback_es": "string" } ]
}
"""
60
 
61
  # --- 2. FUNCIONES L脫GICAS ---
62
 
 
63
def extract_word_level_features(audio_path):
    """Transcribe ``audio_path`` with Whisper and compute per-word features.

    Args:
        audio_path: Path to an audio file on disk (written by the callers
            as 16-bit WAV via soundfile).

    Returns:
        A list of dicts, one per recognized word, with keys:
        ``word`` (stripped text), ``start``/``end`` (seconds, 2 decimals)
        and ``energy`` (mean RMS over the word's samples, 4 decimals).
        Returns ``[]`` on any failure — callers treat that as
        "could not process the audio".
    """
    try:
        # Resample to 16 kHz so sample indices line up with Whisper timestamps.
        y, sr = librosa.load(audio_path, sr=16000)
        result = whisper_model.transcribe(audio_path, word_timestamps=True, fp16=False)
        features_list = []
        # BUGFIX: iterate over ALL segments. The previous code only looked at
        # result["segments"][0], silently dropping every word that Whisper
        # placed in a later segment (longer recordings span several segments).
        for segment_info in result.get("segments", []):
            for segment in segment_info.get("words", []):
                start_sample = int(segment['start'] * sr)
                end_sample = int(segment['end'] * sr)
                word_audio = y[start_sample:end_sample]
                # Guard against zero-length slices (timestamp rounding can
                # make start == end); RMS of an empty array would be NaN.
                rms_energy = np.mean(librosa.feature.rms(y=word_audio)) if len(word_audio) > 0 else 0
                features_list.append({
                    "word": segment['word'].strip(),
                    "start": round(segment['start'], 2),
                    "end": round(segment['end'], 2),
                    "energy": round(float(rms_energy), 4),
                })
        return features_list
    except Exception as e:
        # Best-effort by design: log and return [] so the UI shows a
        # friendly error instead of crashing the Gradio callback.
        print(f"Error during feature extraction: {e}")
        return []
 
79
def chat_interaction(audio_input, history_state):
    """Handle one turn of the conversational-practice tab.

    Args:
        audio_input: Gradio numpy audio tuple ``(sample_rate, samples)`` or None.
        history_state: List of OpenAI-style ``{"role", "content"}`` messages.

    Returns:
        ``(chat_display, history_state, feedback_en_md, feedback_es_md)`` —
        the Chatbot pair list, the updated history, and two Markdown
        components (hidden until the final report is generated).
    """
    if not api_key_found:
        raise gr.Error("OpenAI API key not found.")
    if audio_input is None:
        return history_state, history_state, gr.Markdown(visible=False), gr.Markdown(visible=False)

    sr, y = audio_input
    temp_audio_path = "temp_audio_chat.wav"
    sf.write(temp_audio_path, y, sr)
    # BUGFIX: close the audio file handle — the original passed a bare
    # open(...) into the API call and leaked the descriptor every turn.
    with open(temp_audio_path, "rb") as audio_file:
        user_text = client.audio.transcriptions.create(model="whisper-1", file=audio_file).text

    if not history_state:
        history_state = []
    history_state.append({"role": "user", "content": user_text})
    # Rebuild the (user, assistant) tuple list that gr.Chatbot expects from
    # the strictly alternating user/assistant history.
    chat_display = [(history_state[i]['content'], history_state[i + 1]['content'])
                    for i in range(0, len(history_state) - 1, 2)]
    chat_display.append((user_text, None))

    if len(history_state) < 9:
        # Conversation still running: ask the tutor persona for a reply.
        messages_to_send = [{"role": "system", "content": CONVERSATION_SYSTEM_PROMPT}] + history_state
        response = client.chat.completions.create(model="gpt-4o", messages=messages_to_send, temperature=0.7)
        ai_response = response.choices[0].message.content
        history_state.append({"role": "assistant", "content": ai_response})
        chat_display[-1] = (chat_display[-1][0], ai_response)
        return chat_display, history_state, gr.Markdown(visible=False), gr.Markdown(visible=False)
    else:
        # 5th user turn: switch to the examiner prompt and emit the report.
        print("Generating final evaluation...")
        messages_to_send = [{"role": "system", "content": FINAL_EVALUATION_SYSTEM_PROMPT}] + history_state
        response = client.chat.completions.create(model="gpt-4o", response_format={"type": "json_object"}, messages=messages_to_send)
        try:
            result = json.loads(response.choices[0].message.content)
            fb_en = result.get('feedback_en', {})
            md_en = f"## Final Report (CEFR Level: {result.get('cefr_level', 'N/A')})\n### Strengths\n{fb_en.get('strengths', '')}\n### Areas for Improvement\n{fb_en.get('areas_for_improvement', '')}\n### Word-by-Word Feedback\n"
            for item in fb_en.get('word_by_word_feedback', []):
                md_en += f"- **{item['word']}**: {item['feedback']}\n"
            # BUGFIX: restored correctly encoded Spanish headings (the file
            # had mojibake such as "脕reas" for "Áreas" in these UI strings).
            fb_es = result.get('feedback_es', {})
            md_es = f"## Reporte Final (Nivel MCERL: {result.get('cefr_level', 'N/A')})\n### Fortalezas\n{fb_es.get('fortalezas', '')}\n### Áreas a Mejorar\n{fb_es.get('areas_a_mejorar', '')}\n### Retroalimentación por Palabra\n"
            for item in fb_es.get('feedback_por_palabra', []):
                md_es += f"- **{item['palabra']}**: {item['feedback']}\n"
            chat_display[-1] = (chat_display[-1][0], "Thank you for the conversation! Here is your final report.")
            return chat_display, history_state, gr.Markdown(value=md_en, visible=True), gr.Markdown(value=md_es, visible=True)
        except (json.JSONDecodeError, KeyError) as e:
            print(f"Error parsing final report: {e}")
            return chat_display, history_state, gr.Markdown(value="Error generating report.", visible=True), gr.Markdown(visible=False)
109
 
 
110
def run_sentence_evaluation(audio_input, reference_transcript):
    """Score a recorded sentence against ``reference_transcript``.

    Pipeline: save the recording → extract per-word features → ask GPT-4o
    for a JSON assessment → generate a gTTS reference clip per word.

    Args:
        audio_input: Gradio numpy audio tuple ``(sample_rate, samples)`` or None.
        reference_transcript: The sentence the student was asked to read.

    Returns:
        ``(overall_score, cefr_level, holistic_feedback_md, dataframe)``;
        on validation/processing failure the last element is ``None`` and
        the third element is an error message.
    """
    if not api_key_found:
        raise gr.Error("OpenAI API key not found.")
    if audio_input is None or not reference_transcript:
        return 0, "N/A", "Please provide both an audio file and the reference text.", None

    sr, y = audio_input
    temp_audio_path = "temp_audio_sentence.wav"
    sf.write(temp_audio_path, y, sr)
    word_features = extract_word_level_features(temp_audio_path)
    if not word_features:
        return 0, "N/A", "Could not process the audio.", None

    prompt_data = {"reference_transcript": reference_transcript, "spoken_words": word_features}
    print("Sending detailed data to GPT-4o for sentence analysis...")
    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        messages=[{"role": "system", "content": SENTENCE_EVALUATION_SYSTEM_PROMPT},
                  {"role": "user", "content": json.dumps(prompt_data)}])

    try:
        result = json.loads(response.choices[0].message.content)
        holistic_feedback_md = f"### Strengths\n{result['holistic_feedback']['strengths']}\n\n### Areas for Improvement\n{result['holistic_feedback']['areas_for_improvement']}"
        word_analysis_df = pd.DataFrame(result['word_by_word_analysis'])

        # Generate a gTTS reference pronunciation clip for each word.
        os.makedirs("reference_audio", exist_ok=True)
        reference_audio_paths = []
        for index, row in word_analysis_df.iterrows():
            word_to_speak = row['reference_word']
            # BUGFIX: sanitize the word before embedding it in a file name —
            # punctuation, spaces or non-ASCII characters in the raw word
            # previously produced invalid/ugly paths.
            safe_word = "".join(ch for ch in word_to_speak.lower() if ch.isalnum()) or "word"
            audio_path = f"reference_audio/{index}_{safe_word}.mp3"
            try:
                tts = gTTS(text=word_to_speak, lang='en')
                tts.save(audio_path)
                reference_audio_paths.append(audio_path)
            except Exception as e:
                # BUGFIX: log the failure (it was silently swallowed) but keep
                # going — a missing clip just shows as None in the table.
                print(f"Error generating TTS for '{word_to_speak}': {e}")
                reference_audio_paths.append(None)

        word_analysis_df['reference_audio'] = reference_audio_paths
        # Column order matches the headers declared on the gr.DataFrame.
        df_for_display = word_analysis_df[['reference_word', 'spoken_word', 'word_score_100', 'feedback_en', 'feedback_es', 'reference_audio']]
        return (result.get("overall_score_100", 0),
                result.get("cefr_level", "N/A"),
                holistic_feedback_md,
                gr.DataFrame(value=df_for_display))
    except (json.JSONDecodeError, KeyError) as e:
        print(f"Error processing API response: {e}")
        error_msg = "The API response was not in the expected format."
        return 0, "Error", error_msg, None
138
 
139
 
140
# --- 3. GRADIO INTERFACE WITH TABS ---
# BUGFIX: user-facing strings below restore the correctly encoded characters
# (the file contained mojibake, e.g. "Pr谩ctica" for "Práctica", "馃嚞馃嚙"
# for the 🇬🇧 emoji, "butter鈥檚" for "butter's").
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🇬🇧 AI English Speaking Practice & Assessment")
    with gr.Tabs():
        # --- TAB 1: CONVERSATIONAL PRACTICE (CHAT AI) ---
        with gr.TabItem("Práctica Conversacional (Chat AI)"):
            with gr.Row():
                with gr.Column(scale=2):
                    chatbot = gr.Chatbot(value=[(None, "Hi there! I'm Alex. How are you doing today?")], label="Conversation with your AI Tutor")
                    audio_in_chat = gr.Audio(sources=["microphone"], type="numpy", label="Record your response")
                with gr.Column(scale=1):
                    gr.Markdown("### Final Report")
                    feedback_en_out = gr.Markdown(label="English Feedback", visible=False)
                    feedback_es_out = gr.Markdown(label="Retroalimentación en Español", visible=False)
            # Per-session message history, threaded through chat_interaction.
            history = gr.State([])
            audio_in_chat.stop_recording(fn=chat_interaction, inputs=[audio_in_chat, history], outputs=[chatbot, history, feedback_en_out, feedback_es_out])

        # --- TAB 2: PER-SENTENCE EVALUATION ---
        with gr.TabItem("Evaluación por Frase"):
            TONGUE_TWISTERS = [
                "Peter Piper picked a peck of pickled peppers.",
                "She sells seashells by the seashore.",
                "How much wood would a woodchuck chuck if a woodchuck could chuck wood?",
                "Betty Botter bought some butter but she said the butter's bitter.",
                "A proper copper coffee pot.",
            ]
            gr.Markdown("Choose a tongue twister or write your own sentence. Record yourself, and our AI examiner will provide a detailed diagnostic report.")
            tongue_twister_selector = gr.Dropdown(choices=TONGUE_TWISTERS, label="Or Choose a Tongue Twister to Practice")
            with gr.Row():
                with gr.Column(scale=1):
                    audio_in_sentence = gr.Audio(sources=["microphone"], type="numpy", label="1. Record Your Voice")
                    text_in_sentence = gr.Textbox(lines=3, label="2. Reference Sentence", value=TONGUE_TWISTERS[0])
                    submit_btn_sentence = gr.Button("Get Assessment", variant="primary")
                with gr.Column(scale=2):
                    gr.Markdown("### Assessment Summary")
                    with gr.Row():
                        score_out_sentence = gr.Number(label="Overall Score (0-100)", interactive=False)
                        level_out_sentence = gr.Textbox(label="Estimated CEFR Level", interactive=False)
                    holistic_feedback_out_sentence = gr.Markdown(label="Examiner's Feedback")
            gr.Markdown("--- \n ### Detailed Word-by-Word Analysis")
            # Headers mirror the column selection made in run_sentence_evaluation.
            word_analysis_out_sentence = gr.DataFrame(headers=["Reference Word", "Spoken Word", "Score (0-100)", "Feedback (English)", "Feedback (Español)", "Reference Audio"], label="Phonetic Breakdown", wrap=True)

            def update_text(choice):
                # Mirror the dropdown selection into the editable reference textbox.
                return gr.Textbox(value=choice)

            tongue_twister_selector.change(fn=update_text, inputs=tongue_twister_selector, outputs=text_in_sentence)
            submit_btn_sentence.click(fn=run_sentence_evaluation, inputs=[audio_in_sentence, text_in_sentence], outputs=[score_out_sentence, level_out_sentence, holistic_feedback_out_sentence, word_analysis_out_sentence])
177
 
178
if __name__ == "__main__":
    # Warn loudly at startup: without the key every API callback raises gr.Error.
    # NOTE(review): demo.launch() presumably follows past this excerpt — confirm.
    if not api_key_found: print("\nFATAL: OpenAI API key not found.")