AK97GAMERZ committed on
Commit 1cc081e · verified · 1 Parent(s): 85be1e4

Update app.py

Files changed (1)
  1. app.py +183 -186
app.py CHANGED
@@ -4,50 +4,41 @@ import os
4
  import fitz # PyMuPDF
5
  import tempfile
6
  import subprocess # For calling Piper TTS
7
- import wave # For saving WAV files
8
  import pathlib
9
  import whisper # For Speech-to-Text
10
  import numpy as np
11
  import soundfile as sf # To read audio data for Whisper
12
 
13
  # --- Configuration ---
14
- # 1. Hugging Face Spaces Secrets:
15
- # - GOOGLE_API_KEY: Your Gemini API Key
16
- # - PIPER_VOICE_PATH: Path to the piper voice model (.onnx file).
17
- # You'll need to upload the voice model and its .json config to your Space.
18
- # Example: "voices/en_US-lessac-medium.onnx"
19
- # Download voices from: https://huggingface.co/rhasspy/piper-voices/tree/main
20
-
21
  try:
22
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
23
- PIPER_VOICE_PATH_ONNX = os.environ.get("PIPER_VOICE_PATH_ONNX") # e.g., voices/en_US-lessac-medium.onnx
24
  PIPER_VOICE_PATH_JSON = PIPER_VOICE_PATH_ONNX + ".json" if PIPER_VOICE_PATH_ONNX else None
25
 
26
  if not GOOGLE_API_KEY:
27
  print("Warning: GOOGLE_API_KEY not found in secrets.")
28
  if not PIPER_VOICE_PATH_ONNX or not os.path.exists(PIPER_VOICE_PATH_ONNX):
29
  print(f"Warning: Piper voice ONNX model not found at specified path: {PIPER_VOICE_PATH_ONNX}. TTS will not work.")
30
- PIPER_VOICE_PATH_ONNX = None # Disable TTS if model not found
31
- if PIPER_VOICE_PATH_JSON and not os.path.exists(PIPER_VOICE_PATH_JSON):
32
- print(f"Warning: Piper voice JSON config not found at specified path: {PIPER_VOICE_PATH_JSON}. TTS might have issues.")
33
-
34
 
35
  except KeyError as e:
36
  print(f"Please set the following environment variables in Hugging Face Space secrets: {e}")
37
  GOOGLE_API_KEY = None
38
  PIPER_VOICE_PATH_ONNX = None
 
39
 
40
 
41
  # Initialize Gemini
42
  if GOOGLE_API_KEY:
43
  genai.configure(api_key=GOOGLE_API_KEY)
44
- gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest') # Using 1.5 Flash
45
  else:
46
  gemini_model = None
47
 
48
- # Initialize Whisper STT model (load it once)
49
- # You can choose model size: "tiny", "base", "small", "medium", "large"
50
- # Smaller models are faster but less accurate. "base" or "small" is a good start.
51
  try:
52
  stt_model = whisper.load_model("base")
53
  print("Whisper STT model loaded successfully.")
@@ -58,19 +49,22 @@ except Exception as e:
58
  # --- Helper Functions ---
59
 
60
  def pdf_to_text(pdf_file_path):
61
- """Extracts text from a PDF file."""
62
  if not pdf_file_path:
63
  return ""
64
- doc = fitz.open(pdf_file_path)
65
- text = ""
66
- for page_num in range(len(doc)):
67
- page = doc.load_page(page_num)
68
- text += page.get_text()
69
- doc.close()
70
  - return text
71
 
72
  def generate_lecture_prompt(chapter_text):
73
- """Creates a detailed prompt for Gemini."""
74
  prompt = f"""
75
  You are an expert, engaging, and slightly humorous AI tutor, like the best human teacher one could ask for.
76
  Your goal is to generate a comprehensive and interactive lecture based on the following PDF chapter text.
@@ -115,162 +109,176 @@ def text_to_speech_piper(text, output_filename="lecture_audio.wav"):
115
  print("Piper TTS model not available or no text provided. Skipping TTS.")
116
  return None
117
 
118
- # Ensure piper executable is in PATH or provide full path
119
- # On Hugging Face Spaces, 'piper' might need to be installed via packages.txt or built.
120
- # Assuming 'piper' is available:
121
- piper_executable = "piper" # Or full path if not in PATH
122
-
123
- # Create a temporary file for the text input if text is very long
124
- with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".txt", encoding='utf-8') as tmp_text_file:
125
- tmp_text_file.write(text)
126
- text_input_path = tmp_text_file.name
127
 
128
  command = [
129
  piper_executable,
130
  "--model", PIPER_VOICE_PATH_ONNX,
131
- "--output_file", output_filename,
132
- "--text_file", text_input_path # Using text file for potentially long inputs
133
  ]
134
- # If your voice has a JSON config, Piper usually finds it if it's next to the ONNX file.
135
- # If not, you might need to add: "--config", PIPER_VOICE_PATH_JSON
 
 
136
 
137
- print(f"Running Piper TTS command: {' '.join(command)}")
138
  try:
139
- process = subprocess.run(command, capture_output=True, text=True, check=True, encoding='utf-8', errors='ignore')
140
- print("Piper TTS STDOUT:", process.stdout)
141
- print("Piper TTS STDERR:", process.stderr)
142
  - os.remove(text_input_path) # Clean up temp file
143
  if os.path.exists(output_filename) and os.path.getsize(output_filename) > 0:
 
 
 
144
  return output_filename
145
  else:
146
- print(f"Piper TTS failed to create or created an empty output file: {output_filename}")
147
- if process.stderr: print("Piper Error:", process.stderr)
 
148
  return None
149
- except subprocess.CalledProcessError as e:
150
- print(f"Error during Piper TTS execution: {e}")
151
- print("Piper STDOUT:", e.stdout)
152
- print("Piper STDERR:", e.stderr)
153
- os.remove(text_input_path) # Clean up temp file
154
- return None
155
  except FileNotFoundError:
156
  print(f"Error: '{piper_executable}' command not found. Make sure Piper is installed and in your PATH.")
157
- print("On Hugging Face Spaces, you might need to add 'piper-tts' to requirements.txt (if it installs CLI) or install via packages.txt.")
158
- os.remove(text_input_path) # Clean up temp file
 
 
159
  return None
160
 
161
 
162
- def transcribe_audio(audio_filepath):
163
- """Transcribes audio file to text using Whisper."""
164
- if not stt_model or not audio_filepath:
165
- print("Whisper STT model not available or no audio file. Skipping transcription.")
166
  return "Error: STT not available."
 
 
 
 
167
  try:
168
- # Whisper expects a NumPy array or path to file.
169
- # Gradio mic input provides (sample_rate, data_numpy_array) or a filepath string
170
-
171
- if isinstance(audio_filepath, tuple): # If (rate, data) format
172
- sample_rate, data = audio_filepath
173
- # Ensure data is float32, as whisper expects
174
- if data.dtype != np.float32:
175
- data = data.astype(np.float32) / np.iinfo(data.dtype).max # Normalize if int
176
-
177
- # Save to a temporary WAV file because whisper.transcribe() is easier with file paths
178
- # for some backend configurations or if there are issues with direct array processing.
179
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio_file:
180
- sf.write(tmp_audio_file.name, data, sample_rate)
181
- temp_audio_path = tmp_audio_file.name
182
 
183
- result = stt_model.transcribe(temp_audio_path)
184
- os.remove(temp_audio_path) # Clean up temp audio file
185
-
186
- elif isinstance(audio_filepath, str): # If it's already a file path
187
- result = stt_model.transcribe(audio_filepath)
 
188
  else:
189
  return "Error: Invalid audio input format for transcription."
190
 
 
 
 
 
 
191
  return result["text"]
192
  except Exception as e:
193
  print(f"Error during audio transcription: {e}")
 
 
 
 
 
194
  return f"Error during transcription: {str(e)}"
195
 
196
  # --- Main Gradio App Logic ---
197
 
198
- lecture_state = {"full_lecture_text": "", "current_segment_index": 0, "segments": []}
199
-
200
  def process_pdf_and_generate_lecture(pdf_file_obj, progress=gr.Progress(track_tqdm=True)):
201
- """Processes PDF, generates lecture text, and converts to speech."""
202
  if not gemini_model:
203
- return "Gemini API not configured.", None, "Error: Gemini API key missing.", None
204
  if not pdf_file_obj:
205
- return "Please upload a PDF file.", None, "No PDF uploaded.", None
206
 
207
  progress(0.1, desc="Extracting text from PDF...")
208
- pdf_text = pdf_to_text(pdf_file_obj.name) # .name gives the temp path of uploaded file
 
209
 
210
  if not pdf_text.strip():
211
- return "Could not extract text from PDF or PDF is empty.", None, "Empty PDF content.", None
 
 
 
 
 
 
212
 
213
  progress(0.3, desc="Generating lecture script with Gemini...")
214
- lecture_prompt = generate_lecture_prompt(pdf_text[:15000]) # Limit context window for safety
215
 
 
216
  try:
217
  response = gemini_model.generate_content(lecture_prompt)
218
  lecture_text = response.text
219
  except Exception as e:
220
  print(f"Error calling Gemini API: {e}")
221
- return f"Error generating lecture: {e}", None, "Gemini API Error.", None
222
 
223
- lecture_state["full_lecture_text"] = lecture_text
224
- # Simple segmentation for now (e.g., by paragraphs) for potential future "resume"
225
- lecture_state["segments"] = [s.strip() for s in lecture_text.split("\n\n") if s.strip()]
226
- lecture_state["current_segment_index"] = 0
227
-
228
- # For whiteboard: extract parts starting with "Imagine on our whiteboard:"
229
  whiteboard_content = ""
230
  for line in lecture_text.split('\n'):
231
  if line.lower().startswith("imagine on our whiteboard:"):
232
  whiteboard_content += line.replace("Imagine on our whiteboard:", "").strip() + "\n\n"
233
  if not whiteboard_content:
234
- whiteboard_content = "No specific whiteboard content described for this section. The AI will verbally describe visuals."
235
 
236
  progress(0.7, desc="Converting lecture to speech (TTS)...")
237
  # Create a unique filename for audio to avoid caching issues if files are static
238
- audio_output_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
 
 
239
 
240
- lecture_audio_path = text_to_speech_piper(lecture_text, output_filename=audio_output_path)
241
 
242
- if not lecture_audio_path:
243
  progress(1.0, desc="TTS failed. Displaying text only.")
244
- return lecture_text, None, whiteboard_content, "TTS failed. Audio not available."
 
245
 
246
  progress(1.0, desc="Lecture ready!")
247
- return lecture_text, lecture_audio_path, whiteboard_content, "Lecture generated successfully!"
 
248
 
249
 
250
- def handle_student_doubt(student_audio_query, lecture_context_text, progress=gr.Progress(track_tqdm=True)):
251
- if not student_audio_query:
252
- return "No doubt recorded. Please record your question.", None, lecture_context_text
253
  if not gemini_model:
254
- return "Gemini API not configured. Cannot answer doubt.", None, lecture_context_text
255
 
256
  progress(0.2, desc="Transcribing your question...")
257
- # student_audio_query from gr.Audio is a tuple (sample_rate, numpy_array) or filepath
258
- # For Whisper, we often save it to a temp file if it's raw data.
259
- # The `transcribe_audio` function handles this.
260
-
261
- # Gradio's audio input (mic) typically gives a filepath to a temp WAV
262
- student_question_text = transcribe_audio(student_audio_query)
263
 
264
  if student_question_text.startswith("Error:"):
265
- return f"Could not understand your question: {student_question_text}", None, lecture_context_text
266
 
267
  progress(0.5, desc="Thinking about your question...")
268
 
 
 
 
269
  doubt_prompt = f"""
270
  A student has a doubt regarding the lecture.
271
- Current Lecture Context:
272
  ---
273
- {lecture_context_text[-2000:]}
274
  ---
275
  Student's Question: "{student_question_text}"
276
 
@@ -278,44 +286,56 @@ def handle_student_doubt(student_audio_query, lecture_context_text, progress=gr.
278
  1. Acknowledge the question.
279
  2. Provide a clear, concise, and helpful answer.
280
  3. Use analogies if helpful. Maintain your encouraging and slightly humorous tone.
281
- 4. After answering, gently prompt if they understood or if they'd like to continue the lecture. For example: "Does that make sense? Shall we get back to where we left off in the lecture?"
282
 
283
  Keep your answer focused on the question.
284
  """
 
285
  try:
286
  response = gemini_model.generate_content(doubt_prompt)
287
  answer_text = response.text
288
  except Exception as e:
289
  print(f"Error calling Gemini API for doubt: {e}")
290
- return f"Error processing doubt: {e}", None, lecture_context_text
291
 
292
  progress(0.8, desc="Preparing audio for the answer...")
293
- answer_audio_path_temp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
294
- answer_audio_path = text_to_speech_piper(answer_text, output_filename=answer_audio_path_temp)
 
 
 
 
295
 
296
  if not answer_audio_path:
297
  progress(1.0, desc="TTS for answer failed.")
298
- # Fallback: return text only if TTS fails
299
- return f"AI Tutor (Text): {answer_text}\n(Audio for answer failed to generate)", None, lecture_context_text
300
 
301
  progress(1.0, desc="Answer ready!")
302
- return f"Your Question: {student_question_text}\n\nAI Tutor: {answer_text}", answer_audio_path, lecture_context_text
303
 
304
 
305
  # --- Gradio UI ---
306
  css = """
307
  body { font-family: 'Arial', sans-serif; }
308
- .gradio-container { max-width: 900px !important; margin: auto !important; }
309
- .gr-button { background-color: #4CAF50; color: white; border-radius: 8px; }
310
- .gr-button:hover { background-color: #45a049; }
 
 
 
311
  .panel_description { padding: 10px; margin-bottom:10px; border-radius:5px; background-color:#f0f0f0; }
312
  .important_text { color: #D32F2F; font-weight: bold; }
313
  .markdown-output h1, .markdown-output h2 { color: #1976D2; }
314
- .markdown-output strong { color: #555; }
315
  - .whiteboard-display { border: 2px dashed #ccc; padding: 15px; margin-top: 15px; background-color: #f9f9f9; min-height: 150px; }
316
  """
317
 
318
- with gr.Blocks(css=css, theme=gr.themes.Soft()) as app:
319
  gr.Markdown("# 🤖 AI Human-Like Tutor", elem_id="app_title")
320
  gr.Markdown(
321
  "Upload a PDF chapter, and the AI will generate an engaging lecture with voice, "
@@ -324,42 +344,34 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as app:
324
  )
325
 
326
  if not GOOGLE_API_KEY or not gemini_model:
327
- gr.Markdown("<p class='important_text'>🔴 Configuration Error: GOOGLE_API_KEY for Gemini is not set in Hugging Face Secrets. The app will not function correctly.</p>")
328
  if not PIPER_VOICE_PATH_ONNX:
329
- gr.Markdown("<p class='important_text'>🟡 Configuration Warning: PIPER_VOICE_PATH not set or model not found. TTS (Text-to-Speech) will be disabled.</p>")
330
  if not stt_model:
331
- gr.Markdown("<p class='important_text'>🟡 Configuration Warning: Whisper STT model failed to load. Mic input for doubts cannot be transcribed.</p>")
332
 
333
-
334
- # Store lecture context for doubt handling
335
  lecture_context_state = gr.State(value="")
336
 
337
  with gr.Row():
338
  with gr.Column(scale=1):
339
  pdf_upload = gr.File(label="Upload PDF Chapter", file_types=[".pdf"])
340
- generate_button = gr.Button("🚀 Generate Lecture", variant="primary")
341
- status_message = gr.Textbox(label="Status", interactive=False)
342
 
343
  gr.Markdown("---")
344
  gr.Markdown("### 🤔 Ask a Doubt")
345
- # "Raise Hand" button could toggle visibility of mic_input and ask_doubt_button
346
- # For simplicity, they are always visible here.
347
  raise_hand_button = gr.Button("✋ Raise Hand / Prepare to Ask")
348
-
349
- # Mic input for student's doubt
350
- # Using type="filepath" as Whisper model prefers file paths for robust processing.
351
- # Gradio will save the recorded audio to a temporary file and pass its path.
352
  student_mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Your Doubt (after clicking Raise Hand)")
353
-
354
  ask_doubt_button = gr.Button("💬 Ask My Doubt", variant="secondary")
 
355
 
356
  with gr.Column(scale=2):
357
- gr.Markdown("## giảng Lecture Output")
358
  lecture_display = gr.Markdown(label="Lecture Script")
359
  lecture_audio = gr.Audio(label="🎧 Listen to Lecture", type="filepath", autoplay=False)
360
 
361
- gr.Markdown("### 칠판 Whiteboard Area")
362
- whiteboard_display = gr.Textbox(label="Visuals & Notes (as described by AI)", lines=8, interactive=False, elem_classes=["whiteboard-display"])
363
 
364
  gr.Markdown("---")
365
  gr.Markdown("### 💡 Doubt Resolution")
@@ -372,63 +384,48 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as app:
372
  inputs=[pdf_upload],
373
  outputs=[lecture_display, lecture_audio, whiteboard_display, status_message, lecture_context_state],
374
  api_name="generate_lecture"
375
- ).then(
376
- fn=lambda lecture_text: lecture_text, # Store full lecture text for context
377
- inputs=[lecture_display],
378
- outputs=[lecture_context_state]
379
  )
380
 
381
  - # Simple "Raise Hand" action - could be expanded
382
  raise_hand_button.click(
383
- lambda: gr.Info("Mic enabled! Record your question and click 'Ask My Doubt'.")
384
  )
385
 
386
  ask_doubt_button.click(
387
  fn=handle_student_doubt,
388
- inputs=[student_mic_input, lecture_context_state], # Pass current lecture context
389
- outputs=[doubt_answer_display, doubt_answer_audio, status_message], # status_message can be updated here too
390
  api_name="ask_doubt"
391
  )
392
 
393
- # --- Instructions for Hugging Face Spaces ---
394
- # 1. Create a new Space on Hugging Face.
395
- # 2. Choose "Gradio" as the SDK.
396
- # 3. Add your GOOGLE_API_KEY to the Space secrets (Settings -> Secrets).
397
- # - Name: GOOGLE_API_KEY
398
- # - Value: sk-yourActualGeminiApiKey...
399
- # 4. Download a Piper voice model:
400
- # - Go to: https://huggingface.co/rhasspy/piper-voices/tree/main
401
- # - Choose a voice, e.g., en_US-lessac-medium.onnx and its corresponding en_US-lessac-medium.onnx.json file.
402
- # - Upload these two files to a folder in your Space, e.g., create a `voices` folder and put them there.
403
- # 5. Add PIPER_VOICE_PATH_ONNX to Space secrets:
404
- # - Name: PIPER_VOICE_PATH_ONNX
405
- # - Value: voices/en_US-lessac-medium.onnx (or whatever path you used)
406
- # 6. Create a `requirements.txt` file in your Space repository with the content provided above.
407
- # 7. Create an `app.py` file in your Space repository with the Python code above.
408
- # 8. (Optional but Recommended) If `ffmpeg` is needed by Whisper or Piper for audio conversion/handling on Spaces:
409
- # Create a `packages.txt` file in your Space repository with the line:
410
- # ffmpeg
411
- # 9. The app should build and run. You might need to wait a bit for models (like Whisper) to download on first run.
412
-
413
  if __name__ == "__main__":
414
- # This part is for local execution, not strictly needed for HF Spaces
415
  # For local run, you'd set env vars:
416
  # os.environ['GOOGLE_API_KEY'] = 'YOUR_LOCAL_KEY'
417
  # os.environ['PIPER_VOICE_PATH_ONNX'] = 'path/to/your/local/voice.onnx'
418
- # Make sure 'piper' executable is in your PATH or adjust `piper_executable` variable.
419
- # And Whisper model will download to default cache.
 
 
420
 
421
- # Check if secrets are loaded for local testing (if you want to emulate HF secrets)
422
- if not GOOGLE_API_KEY:
423
- print("Local Run: GOOGLE_API_KEY not set as env variable.")
424
- if not PIPER_VOICE_PATH_ONNX:
425
- print("Local Run: PIPER_VOICE_PATH_ONNX not set as env variable. TTS might fail if piper executable is not found or model path is incorrect.")
426
- else:
427
- # Check if the local piper model files actually exist
428
- if not os.path.exists(PIPER_VOICE_PATH_ONNX):
429
- print(f"Local Run Warning: Piper ONNX model not found at {PIPER_VOICE_PATH_ONNX}")
430
- if not os.path.exists(PIPER_VOICE_PATH_ONNX + ".json"):
431
- print(f"Local Run Warning: Piper JSON config not found at {PIPER_VOICE_PATH_ONNX + '.json'}")
432
-
433
-
434
- app.launch(debug=True, share=False) # Set share=True to get a public link if running locally
 
4
  import fitz # PyMuPDF
5
  import tempfile
6
  import subprocess # For calling Piper TTS
7
+ # import wave # Not directly used now with Popen, but good to have if manipulating WAVs
8
  import pathlib
9
  import whisper # For Speech-to-Text
10
  import numpy as np
11
  import soundfile as sf # To read audio data for Whisper
12
 
13
  # --- Configuration ---
14
  try:
15
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
16
+ PIPER_VOICE_PATH_ONNX = os.environ.get("PIPER_VOICE_PATH_ONNX")
17
  PIPER_VOICE_PATH_JSON = PIPER_VOICE_PATH_ONNX + ".json" if PIPER_VOICE_PATH_ONNX else None
18
 
19
  if not GOOGLE_API_KEY:
20
  print("Warning: GOOGLE_API_KEY not found in secrets.")
21
  if not PIPER_VOICE_PATH_ONNX or not os.path.exists(PIPER_VOICE_PATH_ONNX):
22
  print(f"Warning: Piper voice ONNX model not found at specified path: {PIPER_VOICE_PATH_ONNX}. TTS will not work.")
23
+ PIPER_VOICE_PATH_ONNX = None
24
+ if PIPER_VOICE_PATH_ONNX and (not PIPER_VOICE_PATH_JSON or not os.path.exists(PIPER_VOICE_PATH_JSON)):
25
+ print(f"Warning: Piper voice JSON config not found at specified path: {PIPER_VOICE_PATH_JSON}. TTS might have issues if model needs explicit config.")
 
26
 
27
  except KeyError as e:
28
  print(f"Please set the following environment variables in Hugging Face Space secrets: {e}")
29
  GOOGLE_API_KEY = None
30
  PIPER_VOICE_PATH_ONNX = None
31
+ PIPER_VOICE_PATH_JSON = None
32
 
33
 
34
  # Initialize Gemini
35
  if GOOGLE_API_KEY:
36
  genai.configure(api_key=GOOGLE_API_KEY)
37
+ gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')
38
  else:
39
  gemini_model = None
40
 
41
+ # Initialize Whisper STT model
 
 
42
  try:
43
  stt_model = whisper.load_model("base")
44
  print("Whisper STT model loaded successfully.")
 
49
  # --- Helper Functions ---
50
 
51
  def pdf_to_text(pdf_file_path):
 
52
  if not pdf_file_path:
53
  return ""
54
+ try:
55
+ doc = fitz.open(pdf_file_path)
56
+ text = ""
57
+ for page_num in range(len(doc)):
58
+ page = doc.load_page(page_num)
59
+ text += page.get_text()
60
+ doc.close()
61
+ return text
62
+ except Exception as e:
63
+ print(f"Error reading PDF {pdf_file_path}: {e}")
64
+ return ""
65
+
66
 
67
  def generate_lecture_prompt(chapter_text):
 
68
  prompt = f"""
69
  You are an expert, engaging, and slightly humorous AI tutor, like the best human teacher one could ask for.
70
  Your goal is to generate a comprehensive and interactive lecture based on the following PDF chapter text.
 
109
  print("Piper TTS model not available or no text provided. Skipping TTS.")
110
  return None
111
 
112
  + piper_executable = "piper"
113
 
114
  command = [
115
  piper_executable,
116
  "--model", PIPER_VOICE_PATH_ONNX,
117
+ "--output_file", output_filename
 
118
  ]
119
+ # If your voice explicitly needs the .json config file passed (usually not if named correctly)
120
+ # and PIPER_VOICE_PATH_JSON and os.path.exists(PIPER_VOICE_PATH_JSON):
121
+ # command.extend(["--config", PIPER_VOICE_PATH_JSON])
122
+
123
 
124
+ print(f"Running Piper TTS command: {' '.join(command)} (text will be piped via stdin)")
125
  try:
126
+ process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, encoding='utf-8', errors='ignore')
127
+ stdout, stderr = process.communicate(input=text)
128
+
129
+ if process.returncode != 0:
130
+ print(f"Error during Piper TTS execution. Return code: {process.returncode}")
131
+ print("Piper STDOUT:", stdout)
132
+ print("Piper STDERR:", stderr)
133
+ # Attempt to remove potentially empty/corrupted output file
134
+ if os.path.exists(output_filename):
135
+ try:
136
+ os.remove(output_filename)
137
+ except OSError as e_rm:
138
+ print(f"Could not remove potentially corrupted output file {output_filename}: {e_rm}")
139
+ return None
140
+
141
  if os.path.exists(output_filename) and os.path.getsize(output_filename) > 0:
142
+ print("Piper TTS successful.")
143
+ if stdout: print("Piper TTS STDOUT:", stdout)
144
+ if stderr: print("Piper TTS STDERR:", stderr) # Should ideally be empty on success
145
  return output_filename
146
  else:
147
+ print(f"Piper TTS created an empty or no output file: {output_filename}")
148
+ if stdout: print("Piper STDOUT:", stdout)
149
+ if stderr: print("Piper Error:", stderr)
150
  return None
151
  +
152
  except FileNotFoundError:
153
  print(f"Error: '{piper_executable}' command not found. Make sure Piper is installed and in your PATH.")
154
+ print("On Hugging Face Spaces, ensure 'piper-tts' is in requirements.txt and properly installed, and 'piper' is available in the environment.")
155
+ return None
156
+ except Exception as e:
157
+ print(f"An unexpected error occurred during Piper TTS execution: {e}")
158
  return None
159
 
160
 
161
+ def transcribe_audio(audio_input):
162
+ if not stt_model:
163
+ print("Whisper STT model not available. Skipping transcription.")
 
164
  return "Error: STT not available."
165
+ if not audio_input:
166
+ print("No audio input provided for transcription.")
167
+ return "Error: No audio provided."
168
+
169
  try:
170
+ # Gradio audio input can be a filepath string or (sample_rate, numpy_array)
171
+ # Whisper's transcribe method directly accepts filepaths.
172
+ # If it's (rate, data), we need to save it to a temp file.
173
+ temp_audio_path = None
174
+ if isinstance(audio_input, tuple):
175
+ sample_rate, data = audio_input
176
+ if data.dtype != np.float32: # Whisper expects float32
177
  + data = data.astype(np.float32) / np.iinfo(data.dtype).max if np.issubdtype(data.dtype, np.integer) else data.astype(np.float32)
178
 
179
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_f:
180
+ sf.write(tmp_f.name, data, sample_rate)
181
+ temp_audio_path = tmp_f.name
182
+ audio_filepath_to_transcribe = temp_audio_path
183
+ elif isinstance(audio_input, str) and os.path.exists(audio_input):
184
+ audio_filepath_to_transcribe = audio_input
185
  else:
186
  return "Error: Invalid audio input format for transcription."
187
 
188
+ result = stt_model.transcribe(audio_filepath_to_transcribe, fp16=False) # fp16=False for CPU
189
+
190
+ if temp_audio_path: # Clean up temp file if we created one
191
+ os.remove(temp_audio_path)
192
+
193
  return result["text"]
194
  except Exception as e:
195
  print(f"Error during audio transcription: {e}")
196
+ if temp_audio_path and os.path.exists(temp_audio_path): # Ensure cleanup on error too
197
+ try:
198
+ os.remove(temp_audio_path)
199
+ except OSError:
200
+ pass
201
  return f"Error during transcription: {str(e)}"
202
 
203
  # --- Main Gradio App Logic ---
204
 
 
 
205
  def process_pdf_and_generate_lecture(pdf_file_obj, progress=gr.Progress(track_tqdm=True)):
 
206
  if not gemini_model:
207
+ return "Gemini API not configured. Please check secrets.", None, "Error: Gemini API key missing.", "API Error", ""
208
  if not pdf_file_obj:
209
+ return "Please upload a PDF file.", None, "No PDF uploaded.", "Input Error", ""
210
 
211
  progress(0.1, desc="Extracting text from PDF...")
212
+ # pdf_file_obj.name is the temporary path of the uploaded file
213
+ pdf_text = pdf_to_text(pdf_file_obj.name)
214
 
215
  if not pdf_text.strip():
216
+ return "Could not extract text from PDF or PDF is empty.", None, "Empty PDF content.", "PDF Error", ""
217
+
218
+ # Limit context window for safety/cost, adjust as needed
219
+ max_text_length = 25000 # Increased slightly, but be mindful of API limits/costs
220
+ if len(pdf_text) > max_text_length:
221
+ print(f"PDF text truncated from {len(pdf_text)} to {max_text_length} characters for Gemini prompt.")
222
+ pdf_text = pdf_text[:max_text_length]
223
 
224
  progress(0.3, desc="Generating lecture script with Gemini...")
225
+ lecture_prompt = generate_lecture_prompt(pdf_text)
226
 
227
+ lecture_text = "" # Initialize to ensure it's always defined
228
  try:
229
  response = gemini_model.generate_content(lecture_prompt)
230
  lecture_text = response.text
231
  except Exception as e:
232
  print(f"Error calling Gemini API: {e}")
233
+ return f"Error generating lecture: {e}", None, "Gemini API Error.", "API Error", "" # Return 5 values
234
235
  whiteboard_content = ""
236
  for line in lecture_text.split('\n'):
237
  if line.lower().startswith("imagine on our whiteboard:"):
238
  whiteboard_content += line.replace("Imagine on our whiteboard:", "").strip() + "\n\n"
239
  if not whiteboard_content:
240
+ whiteboard_content = "No specific whiteboard content described for this section. AI will verbally describe visuals."
241
 
242
  progress(0.7, desc="Converting lecture to speech (TTS)...")
243
  # Create a unique filename for audio to avoid caching issues if files are static
244
+ # Using a temporary file that Gradio will handle for serving
245
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio_file:
246
+ audio_output_path_for_piper = tmp_audio_file.name
247
 
248
+ lecture_audio_path = text_to_speech_piper(lecture_text, output_filename=audio_output_path_for_piper)
249
 
250
+ if not lecture_audio_path: # lecture_audio_path will be None if TTS failed
251
  progress(1.0, desc="TTS failed. Displaying text only.")
252
+ # Return 5 values, including the lecture_text for the context state
253
+ return lecture_text, None, whiteboard_content, "TTS failed. Audio not available.", lecture_text
254
 
255
  progress(1.0, desc="Lecture ready!")
256
+ # Return 5 values, including the lecture_text for the context state
257
+ return lecture_text, lecture_audio_path, whiteboard_content, "Lecture generated successfully!", lecture_text
258
 
259
 
260
+ def handle_student_doubt(student_audio_query, lecture_context_text_from_state, progress=gr.Progress(track_tqdm=True)):
261
+ if not student_audio_query: # student_audio_query is a filepath from gr.Audio
262
+ return "No doubt recorded. Please record your question.", None, "Please record your question first."
263
  if not gemini_model:
264
+ return "Gemini API not configured. Cannot answer doubt.", None, "API Error."
265
 
266
  progress(0.2, desc="Transcribing your question...")
267
  + student_question_text = transcribe_audio(student_audio_query) # student_audio_query is already filepath
268
 
269
  if student_question_text.startswith("Error:"):
270
+ return f"Could not understand your question: {student_question_text}", None, "STT Error."
271
 
272
  progress(0.5, desc="Thinking about your question...")
273
 
274
+ # Use only the last part of the lecture context to keep the prompt manageable
275
+ context_for_doubt = lecture_context_text_from_state[-3000:] if lecture_context_text_from_state else "No prior lecture context available."
276
+
277
  doubt_prompt = f"""
278
  A student has a doubt regarding the lecture.
279
+ Current Lecture Context (last part):
280
  ---
281
+ {context_for_doubt}
282
  ---
283
  Student's Question: "{student_question_text}"
284
 
 
286
  1. Acknowledge the question.
287
  2. Provide a clear, concise, and helpful answer.
288
  3. Use analogies if helpful. Maintain your encouraging and slightly humorous tone.
289
+ 4. After answering, gently prompt if they understood or if they'd like to continue the lecture. For example: "Does that make sense? Let me know if you have more questions or if we should resume!"
290
 
291
  Keep your answer focused on the question.
292
  """
293
+ answer_text = "" # Initialize
294
  try:
295
  response = gemini_model.generate_content(doubt_prompt)
296
  answer_text = response.text
297
  except Exception as e:
298
  print(f"Error calling Gemini API for doubt: {e}")
299
+ return f"Error processing doubt: {e}", None, "API Error."
300
 
301
  progress(0.8, desc="Preparing audio for the answer...")
302
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_ans_audio_f:
303
+ answer_audio_path_for_piper = tmp_ans_audio_f.name
304
+
305
+ answer_audio_path = text_to_speech_piper(answer_text, output_filename=answer_audio_path_for_piper)
306
+
307
+ full_response_text = f"**Your Question:** {student_question_text}\n\n**AI Tutor:** {answer_text}"
308
 
309
  if not answer_audio_path:
310
  progress(1.0, desc="TTS for answer failed.")
311
+ return full_response_text, None, "TTS for answer failed. Text only."
 
312
 
313
  progress(1.0, desc="Answer ready!")
314
+ return full_response_text, answer_audio_path, "Answer provided."
315
 
316
 
317
  # --- Gradio UI ---
318
  css = """
319
  body { font-family: 'Arial', sans-serif; }
320
+ .gradio-container { max-width: 950px !important; margin: auto !important; }
321
+ .gr-button { border-radius: 8px; }
322
+ .gr-button.gr-button-primary { background-color: #4CAF50; color: white; }
323
+ .gr-button.gr-button-primary:hover { background-color: #45a049; }
324
+ .gr-button.gr-button-secondary { background-color: #008CBA; color: white; }
325
+ .gr-button.gr-button-secondary:hover { background-color: #007ba7; }
326
  .panel_description { padding: 10px; margin-bottom:10px; border-radius:5px; background-color:#f0f0f0; }
327
  .important_text { color: #D32F2F; font-weight: bold; }
328
  .markdown-output h1, .markdown-output h2 { color: #1976D2; }
329
+ .markdown-output strong { color: #444; }
330
+ .whiteboard-display { border: 2px dashed #ccc; padding: 15px; margin-top: 15px; background-color: #f9f9f9; min-height: 150px; font-family: 'Courier New', Courier, monospace; white-space: pre-wrap;}
331
+ #app_title { text-align: center; color: #2c3e50; margin-bottom: 20px;}
332
+ .status-box { font-weight: bold; padding: 8px; border-radius: 4px; margin-top: 5px; text-align: center;}
333
+ .status-box-success { background-color: #e7f7e7; color: #28a745; }
334
+ .status-box-error { background-color: #fdecea; color: #dc3545; }
335
+ .status-box-info { background-color: #e7f3fe; color: #007bff; }
336
  """
337
 
338
+ with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky)) as app:
339
  gr.Markdown("# 🤖 AI Human-Like Tutor", elem_id="app_title")
340
  gr.Markdown(
341
  "Upload a PDF chapter, and the AI will generate an engaging lecture with voice, "
 
344
  )
345
 
346
  if not GOOGLE_API_KEY or not gemini_model:
347
+ gr.Markdown("<p class='important_text panel_description'>🔴 Configuration Error: GOOGLE_API_KEY for Gemini is not set in Hugging Face Secrets. The app will not function correctly.</p>")
348
  if not PIPER_VOICE_PATH_ONNX:
349
+ gr.Markdown("<p class='important_text panel_description'>🟡 Configuration Warning: PIPER_VOICE_PATH_ONNX not set or model not found. TTS (Text-to-Speech) will be disabled.</p>")
350
  if not stt_model:
351
+ gr.Markdown("<p class='important_text panel_description'>🟡 Configuration Warning: Whisper STT model failed to load. Mic input for doubts cannot be transcribed.</p>")
352
 
 
 
353
  lecture_context_state = gr.State(value="")
354
 
355
  with gr.Row():
356
  with gr.Column(scale=1):
357
  pdf_upload = gr.File(label="Upload PDF Chapter", file_types=[".pdf"])
358
+ generate_button = gr.Button("🚀 Generate Lecture", variant="primary", elem_id="generate_button")
359
+ status_message = gr.Textbox(label="Status", interactive=False, elem_classes=["status-box"]) # For general status
360
 
361
  gr.Markdown("---")
362
  gr.Markdown("### 🤔 Ask a Doubt")
 
 
363
  raise_hand_button = gr.Button("✋ Raise Hand / Prepare to Ask")
 
 
 
 
364
  student_mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Your Doubt (after clicking Raise Hand)")
 
365
  ask_doubt_button = gr.Button("💬 Ask My Doubt", variant="secondary")
366
+ doubt_status_message = gr.Textbox(label="Doubt Status", interactive=False, elem_classes=["status-box"]) # Specific for doubt
367
 
368
  with gr.Column(scale=2):
369
+ gr.Markdown("## giảng Lecture Output") # Kept your original header text
370
  lecture_display = gr.Markdown(label="Lecture Script")
371
  lecture_audio = gr.Audio(label="🎧 Listen to Lecture", type="filepath", autoplay=False)
372
 
373
+ gr.Markdown("### 칠판 Whiteboard Area") # Kept your original header text
374
+ whiteboard_display = gr.Textbox(label="Visuals & Notes (as described by AI)", lines=10, interactive=False, elem_classes=["whiteboard-display"])
375
 
376
  gr.Markdown("---")
377
  gr.Markdown("### 💡 Doubt Resolution")
 
384
  inputs=[pdf_upload],
385
  outputs=[lecture_display, lecture_audio, whiteboard_display, status_message, lecture_context_state],
386
  api_name="generate_lecture"
 
 
 
 
387
  )
388
 
389
+ # Update status_message based on the text content for better visual feedback
390
  + @status_message.change(inputs=status_message, outputs=status_message)
391
+ def update_status_styling(status_text):
392
+ if "Error" in status_text or "failed" in status_text or "not configured" in status_text or "not found" in status_text:
393
+ return gr.Textbox(value=status_text, elem_classes=["status-box", "status-box-error"])
394
+ elif "successfully" in status_text or "ready" in status_text:
395
+ return gr.Textbox(value=status_text, elem_classes=["status-box", "status-box-success"])
396
+ else:
397
+ return gr.Textbox(value=status_text, elem_classes=["status-box", "status-box-info"])
398
+
399
  + @doubt_status_message.change(inputs=doubt_status_message, outputs=doubt_status_message)
400
+ def update_doubt_status_styling(status_text):
401
+ if "Error" in status_text or "failed" in status_text or "not configured" in status_text or "not found" in status_text:
402
+ return gr.Textbox(value=status_text, elem_classes=["status-box", "status-box-error"])
403
+ elif "successfully" in status_text or "ready" in status_text or "provided" in status_text:
404
+ return gr.Textbox(value=status_text, elem_classes=["status-box", "status-box-success"])
405
+ else:
406
+ return gr.Textbox(value=status_text, elem_classes=["status-box", "status-box-info"])
407
+
408
+
409
  raise_hand_button.click(
410
+ lambda: gr.Info("Mic enabled! Record your question then click 'Ask My Doubt'.")
411
  )
412
 
413
  ask_doubt_button.click(
414
  fn=handle_student_doubt,
415
+ inputs=[student_mic_input, lecture_context_state],
416
+ outputs=[doubt_answer_display, doubt_answer_audio, doubt_status_message],
417
  api_name="ask_doubt"
418
  )
419
 
420
  if __name__ == "__main__":
 
421
  # For local run, you'd set env vars:
422
  # os.environ['GOOGLE_API_KEY'] = 'YOUR_LOCAL_KEY'
423
  # os.environ['PIPER_VOICE_PATH_ONNX'] = 'path/to/your/local/voice.onnx'
424
+ if not GOOGLE_API_KEY: print("Local Run: GOOGLE_API_KEY not set.")
425
+ if not PIPER_VOICE_PATH_ONNX: print("Local Run: PIPER_VOICE_PATH_ONNX not set.")
426
+ elif not os.path.exists(PIPER_VOICE_PATH_ONNX) or (PIPER_VOICE_PATH_JSON and not os.path.exists(PIPER_VOICE_PATH_JSON)):
427
+ print("Local Run: Piper voice model/config files not found at specified path(s).")
428
 
429
  + # If issues arise with Gradio's experimental SSR, you can pass ssr_mode=False to launch() for broader compatibility
430
  + # Set share=True to get a public Gradio link; to reach the app from other machines on your local network, pass server_name="0.0.0.0"
431
+ app.launch(debug=True, share=False, server_port=7860) # Explicitly setting server_port
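
Note on deployment files: the setup comments removed in this commit reference a requirements.txt ("with the content provided above") and a packages.txt containing ffmpeg, but neither file appears in this diff. A plausible sketch of both, inferred only from the imports and CLI calls in app.py (exact package pins and the original file contents are not shown here):

requirements.txt (sketch)
    gradio
    google-generativeai
    PyMuPDF
    openai-whisper
    numpy
    soundfile
    piper-tts

packages.txt (sketch)
    ffmpeg

The Piper voice files themselves (the .onnx model plus its .json config) still have to be downloaded from https://huggingface.co/rhasspy/piper-voices/tree/main and uploaded to the Space at the path configured in the PIPER_VOICE_PATH_ONNX secret, as the removed notes describe.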