mramirez2001 committed on
Commit
5e0b8e4
verified
1 Parent(s): 4ece3bb

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -52
app.py CHANGED
@@ -21,7 +21,16 @@ print("Loading Whisper model...")
21
  whisper_model = whisper.load_model("base", device="cpu")
22
  print("Whisper model loaded.")
23
 
24
- # --- PROMPT DEL EXAMINADOR EXPERTO ---
 
 
 
 
 
 
 
 
 
25
  SYSTEM_PROMPT = """
26
  You are an expert English language examiner specializing in phonetics and accent reduction for ESL learners. Your task is to provide a detailed, diagnostic assessment of a student's spoken English based on a reference sentence and detailed word-level audio analysis.
27
 
@@ -57,42 +66,27 @@ Your entire response MUST be in English. You must return a single, valid JSON ob
57
  }
58
  """
59
 
60
- # --- 1. EXTRACCIÓN DETALLADA DE CARACTERÍSTICAS (WHISPER + LIBROSA) ---
61
  def extract_word_level_features(audio_path):
62
- """
63
- This function uses Whisper to get word timestamps and Librosa to get
64
- features for each word's audio segment.
65
- """
66
  try:
67
  y, sr = librosa.load(audio_path, sr=16000)
68
-
69
  result = whisper_model.transcribe(audio_path, word_timestamps=True, fp16=False)
70
- if not result["segments"] or not result["segments"][0]["words"]:
71
  return []
72
-
73
  word_segments = result["segments"][0]["words"]
74
-
75
  features_list = []
76
  for segment in word_segments:
77
  start_sample = int(segment['start'] * sr)
78
  end_sample = int(segment['end'] * sr)
79
  word_audio = y[start_sample:end_sample]
80
-
81
- # Calculate Root Mean Square (RMS) energy for the word
82
- rms_energy = np.mean(librosa.feature.rms(y=word_audio))
83
-
84
- features_list.append({
85
- "word": segment['word'].strip(),
86
- "start": round(segment['start'], 2),
87
- "end": round(segment['end'], 2),
88
- "energy": round(float(rms_energy), 4)
89
- })
90
  return features_list
91
  except Exception as e:
92
  print(f"Error during feature extraction: {e}")
93
  return []
94
 
95
- # --- 2. FUNCIÓN PRINCIPAL DE EVALUACIÓN ---
96
  def run_evaluation(audio_input, reference_transcript):
97
  if not api_key_found: raise gr.Error("OpenAI API key not found.")
98
  if audio_input is None or not reference_transcript:
@@ -102,62 +96,47 @@ def run_evaluation(audio_input, reference_transcript):
102
  temp_audio_path = "temp_audio.wav"
103
  sf.write(temp_audio_path, y, sr)
104
 
105
- # Step 1: Extract detailed features using Whisper and Librosa
106
  word_features = extract_word_level_features(temp_audio_path)
107
  if not word_features:
108
  return 0, "N/A", "Could not process the audio. Please try recording again.", None
109
 
110
- # Step 2: Construct the detailed prompt for the OpenAI API
111
- prompt_data = {
112
- "reference_transcript": reference_transcript,
113
- "spoken_words": word_features
114
- }
115
 
116
  print("Sending detailed data to GPT-4o for analysis...")
117
  response = client.chat.completions.create(
118
- model="gpt-4o",
119
- response_format={"type": "json_object"},
120
- messages=[
121
- {"role": "system", "content": SYSTEM_PROMPT},
122
- {"role": "user", "content": json.dumps(prompt_data)}
123
- ]
124
  )
125
 
126
- # Step 3: Process the API response and format it for display
127
  try:
128
  result = json.loads(response.choices[0].message.content)
129
-
130
- # Format the detailed report for Gradio
131
  holistic_feedback_md = f"### Strengths\n{result['holistic_feedback']['strengths']}\n\n"
132
  holistic_feedback_md += f"### Areas for Improvement\n{result['holistic_feedback']['areas_for_improvement']}"
133
-
134
- # Create a pandas DataFrame for better display
135
  word_analysis_df = pd.DataFrame(result['word_by_word_analysis'])
136
 
137
- return (
138
- result.get("overall_score_100", 0),
139
- result.get("cefr_level", "N/A"),
140
- holistic_feedback_md,
141
- gr.DataFrame(value=word_analysis_df, headers=["Reference Word", "Spoken Word", "Score", "Correct IPA", "Feedback"], interactive=False)
142
- )
143
-
144
  except (json.JSONDecodeError, KeyError) as e:
145
  print(f"Error processing API response: {e}")
146
  error_msg = "The API response was not in the expected format. Please try again."
147
  return 0, "Error", error_msg, None
148
 
149
-
150
- # --- 3. INTERFAZ DE GRADIO ---
151
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
152
  gr.Markdown("# 🇬🇧 Expert Pronunciation Assessment")
153
- gr.Markdown("Record yourself saying the reference sentence. Our AI examiner will provide a detailed diagnostic report on your performance.")
154
-
155
- frase_ejemplo = "The rainbow is a division of white light into many beautiful colors."
156
 
 
 
 
 
 
 
 
157
  with gr.Row():
158
  with gr.Column(scale=1):
159
  audio_in = gr.Audio(sources=["microphone"], type="numpy", label="1. Record Your Voice")
160
- text_in = gr.Textbox(lines=3, label="2. Reference Sentence", value=frase_ejemplo)
161
  submit_btn = gr.Button("Get Assessment", variant="primary")
162
 
163
  with gr.Column(scale=2):
@@ -169,7 +148,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
169
  holistic_feedback_out = gr.Markdown(label="Examiner's Feedback")
170
 
171
  gr.Markdown("--- \n ### Detailed Word-by-Word Analysis")
172
- word_analysis_out = gr.DataFrame(headers=["Reference Word", "Spoken Word", "Score", "Correct IPA", "Feedback"], label="Phonetic Breakdown")
 
 
 
 
 
 
173
 
174
  submit_btn.click(
175
  fn=run_evaluation,
 
21
  whisper_model = whisper.load_model("base", device="cpu")
22
  print("Whisper model loaded.")
23
 
24
+ # --- NUEVO: Lista de Trabalenguas ---
25
+ TONGUE_TWISTERS = [
26
+ "Peter Piper picked a peck of pickled peppers.",
27
+ "She sells seashells by the seashore.",
28
+ "How much wood would a woodchuck chuck if a woodchuck could chuck wood?",
29
+ "Betty Botter bought some butter but she said the butter’s bitter.",
30
+ "A proper copper coffee pot."
31
+ ]
32
+
33
+ # --- PROMPT DEL EXAMINADOR EXPERTO (Sin cambios) ---
34
  SYSTEM_PROMPT = """
35
  You are an expert English language examiner specializing in phonetics and accent reduction for ESL learners. Your task is to provide a detailed, diagnostic assessment of a student's spoken English based on a reference sentence and detailed word-level audio analysis.
36
 
 
66
  }
67
  """
68
 
69
+ # --- 1. EXTRACCIÓN DE CARACTERÍSTICAS (Sin cambios) ---
70
  def extract_word_level_features(audio_path):
 
 
 
 
71
  try:
72
  y, sr = librosa.load(audio_path, sr=16000)
 
73
  result = whisper_model.transcribe(audio_path, word_timestamps=True, fp16=False)
74
+ if not result["segments"] or 'words' not in result["segments"][0]:
75
  return []
 
76
  word_segments = result["segments"][0]["words"]
 
77
  features_list = []
78
  for segment in word_segments:
79
  start_sample = int(segment['start'] * sr)
80
  end_sample = int(segment['end'] * sr)
81
  word_audio = y[start_sample:end_sample]
82
+ rms_energy = np.mean(librosa.feature.rms(y=word_audio)) if len(word_audio) > 0 else 0
83
+ features_list.append({"word": segment['word'].strip(), "start": round(segment['start'], 2), "end": round(segment['end'], 2), "energy": round(float(rms_energy), 4)})
 
 
 
 
 
 
 
 
84
  return features_list
85
  except Exception as e:
86
  print(f"Error during feature extraction: {e}")
87
  return []
88
 
89
+ # --- 2. FUNCIÓN PRINCIPAL DE EVALUACIÓN (Sin cambios) ---
90
  def run_evaluation(audio_input, reference_transcript):
91
  if not api_key_found: raise gr.Error("OpenAI API key not found.")
92
  if audio_input is None or not reference_transcript:
 
96
  temp_audio_path = "temp_audio.wav"
97
  sf.write(temp_audio_path, y, sr)
98
 
 
99
  word_features = extract_word_level_features(temp_audio_path)
100
  if not word_features:
101
  return 0, "N/A", "Could not process the audio. Please try recording again.", None
102
 
103
+ prompt_data = {"reference_transcript": reference_transcript, "spoken_words": word_features}
 
 
 
 
104
 
105
  print("Sending detailed data to GPT-4o for analysis...")
106
  response = client.chat.completions.create(
107
+ model="gpt-4o", response_format={"type": "json_object"},
108
+ messages=[{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": json.dumps(prompt_data)}]
 
 
 
 
109
  )
110
 
 
111
  try:
112
  result = json.loads(response.choices[0].message.content)
 
 
113
  holistic_feedback_md = f"### Strengths\n{result['holistic_feedback']['strengths']}\n\n"
114
  holistic_feedback_md += f"### Areas for Improvement\n{result['holistic_feedback']['areas_for_improvement']}"
 
 
115
  word_analysis_df = pd.DataFrame(result['word_by_word_analysis'])
116
 
117
+ return (result.get("overall_score_100", 0), result.get("cefr_level", "N/A"), holistic_feedback_md,
118
+ gr.DataFrame(value=word_analysis_df, headers=["Reference Word", "Spoken Word", "Score", "Correct IPA", "Feedback"], interactive=False))
 
 
 
 
 
119
  except (json.JSONDecodeError, KeyError) as e:
120
  print(f"Error processing API response: {e}")
121
  error_msg = "The API response was not in the expected format. Please try again."
122
  return 0, "Error", error_msg, None
123
 
124
+ # --- 3. INTERFAZ DE GRADIO (Con las nuevas adecuaciones) ---
 
125
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
126
  gr.Markdown("# 🇬🇧 Expert Pronunciation Assessment")
127
+ gr.Markdown("Choose a tongue twister or write your own sentence. Record yourself, and our AI examiner will provide a detailed diagnostic report.")
 
 
128
 
129
+ # --- NUEVO: Selector de Trabalenguas ---
130
+ tongue_twister_selector = gr.Dropdown(
131
+ choices=TONGUE_TWISTERS,
132
+ label="Or Choose a Tongue Twister to Practice",
133
+ info="Selecting one will automatically fill the reference sentence below."
134
+ )
135
+
136
  with gr.Row():
137
  with gr.Column(scale=1):
138
  audio_in = gr.Audio(sources=["microphone"], type="numpy", label="1. Record Your Voice")
139
+ text_in = gr.Textbox(lines=3, label="2. Reference Sentence", value=TONGUE_TWISTERS[0])
140
  submit_btn = gr.Button("Get Assessment", variant="primary")
141
 
142
  with gr.Column(scale=2):
 
148
  holistic_feedback_out = gr.Markdown(label="Examiner's Feedback")
149
 
150
  gr.Markdown("--- \n ### Detailed Word-by-Word Analysis")
151
+ word_analysis_out = gr.DataFrame(headers=["Reference Word", "Spoken Word", "Score", "Correct IPA", "Feedback"], label="Phonetic Breakdown", wrap=True)
152
+
153
+ # --- NUEVO: Lógica de Interacción ---
154
+ # Cuando el dropdown cambia, actualiza el campo de texto.
155
+ def update_text(choice):
156
+ return gr.Textbox(value=choice)
157
+ tongue_twister_selector.change(fn=update_text, inputs=tongue_twister_selector, outputs=text_in)
158
 
159
  submit_btn.click(
160
  fn=run_evaluation,