Nguyen5 committed on
Commit
cedda96
·
1 Parent(s): ea73680
Files changed (2) hide show
  1. app.py +12 -41
  2. speech_io.py +0 -40
app.py CHANGED
@@ -180,17 +180,12 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
180
  "Du kannst Text eingeben oder direkt ins Mikrofon sprechen."
181
  )
182
 
183
- # State für nahtlose Konversation
184
- state = gr.State(AppState(conversation=[], recording_state="idle", mode="Audio", last_record_path=None, status_text="Bereit"))
185
-
186
  # Einspaltiges Layout, alles untereinander (verhindert abgeschnittene Bereiche)
187
  with gr.Column(elem_id="chat-wrap"):
188
  chatbot = gr.Chatbot(
189
  label="Chat",
190
  height=280,
191
  )
192
- spoken_out = gr.Textbox(label="Gesprochener Text", interactive=False)
193
- status_md = gr.Markdown("Bereit")
194
 
195
  # Eingabezeile à la ChatGPT: Plus + Text + Mikro + Senden
196
  with gr.Row(elem_id="chat-input-row"):
@@ -216,55 +211,31 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
216
  show_label=False,
217
  )
218
  send_btn = gr.Button("➤", elem_classes=["compact-btn", "send-btn"], scale=1)
219
- lang_dd = gr.Dropdown(choices=["auto","de","en","vi"], value="auto", label="Sprache")
220
- mode_radio = gr.Radio(choices=["Audio","Text"], value="Audio", label="Eingabemodus")
221
- record_player = gr.Audio(label="Letzte Aufnahme", type="filepath", interactive=False)
222
- stop_rec_btn = gr.Button("⏹ Aufnahme löschen")
223
 
224
  # Senden bei Enter
225
  chat_text.submit(
226
  chat_fn,
227
- [chat_text, chat_audio, chatbot, state, lang_dd],
228
- [chatbot, chat_text, chat_audio, spoken_out, status_md],
229
  )
230
- def transcribe_to_textbox(audio_path, lang, app_state: AppState):
231
- if audio_path:
232
- app_state.recording_state = "processing"
233
- app_state.last_record_path = audio_path
234
- s = transcribe_audio(audio_path, language=lang)
235
- app_state.status_text = "✅ Verarbeitung abgeschlossen"
236
- return s, s, audio_path, app_state.status_text
237
- chat_audio.stream(
238
  transcribe_to_textbox,
239
- [chat_audio, lang_dd, state],
240
- [chat_text, spoken_out, record_player, status_md],
241
  )
242
- chat_audio.change(
243
  transcribe_to_textbox,
244
- [chat_audio, lang_dd, state],
245
- [chat_text, spoken_out, record_player, status_md],
246
  )
247
  send_btn.click(
248
  chat_fn,
249
- [chat_text, chat_audio, chatbot, state, lang_dd],
250
- [chatbot, chat_text, chat_audio, spoken_out, status_md],
251
  )
252
 
253
- def toggle_mode(m, app_state: AppState):
254
- app_state.mode = m
255
- status = "Audio-Modus aktiv" if m == "Audio" else "Text-Modus aktiv"
256
- return gr.update(visible=(m == "Text")), gr.update(visible=(m == "Audio")), status
257
- mode_radio.change(toggle_mode, [mode_radio, state], [chat_text, chat_audio, status_md])
258
-
259
- def clear_record(p):
260
- try:
261
- if isinstance(p, str) and os.path.exists(p):
262
- os.remove(p)
263
- except:
264
- pass
265
- return None
266
- stop_rec_btn.click(clear_record, [record_player], [record_player])
267
-
268
  # Quellen & Dokumente kompakt unterhalb
269
  with gr.Accordion("Quellen & Dokumente", open=False):
270
  gr.Markdown("### 📄 Prüfungsordnung (PDF)")
 
180
  "Du kannst Text eingeben oder direkt ins Mikrofon sprechen."
181
  )
182
 
 
 
 
183
  # Einspaltiges Layout, alles untereinander (verhindert abgeschnittene Bereiche)
184
  with gr.Column(elem_id="chat-wrap"):
185
  chatbot = gr.Chatbot(
186
  label="Chat",
187
  height=280,
188
  )
 
 
189
 
190
  # Eingabezeile à la ChatGPT: Plus + Text + Mikro + Senden
191
  with gr.Row(elem_id="chat-input-row"):
 
211
  show_label=False,
212
  )
213
  send_btn = gr.Button("➤", elem_classes=["compact-btn", "send-btn"], scale=1)
 
 
 
 
214
 
215
  # Senden bei Enter
216
  chat_text.submit(
217
  chat_fn,
218
+ [chat_text, chat_audio, chatbot],
219
+ [chatbot, chat_text, chat_audio],
220
  )
221
+ def transcribe_to_textbox(audio_path):
222
+ return transcribe_audio(audio_path, language=ASR_LANGUAGE_HINT)
223
+ chat_audio.change(
 
 
 
 
 
224
  transcribe_to_textbox,
225
+ [chat_audio],
226
+ [chat_text],
227
  )
228
+ chat_audio.stream(
229
  transcribe_to_textbox,
230
+ [chat_audio],
231
+ [chat_text],
232
  )
233
  send_btn.click(
234
  chat_fn,
235
+ [chat_text, chat_audio, chatbot],
236
+ [chatbot, chat_text, chat_audio],
237
  )
238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  # Quellen & Dokumente kompakt unterhalb
240
  with gr.Accordion("Quellen & Dokumente", open=False):
241
  gr.Markdown("### 📄 Prüfungsordnung (PDF)")
speech_io.py CHANGED
@@ -25,8 +25,6 @@ ASR_DEFAULT_LANGUAGE = os.getenv("ASR_LANGUAGE", "de") # "auto" um Auto-Detect
25
  TTS_ENABLED = os.getenv("TTS_ENABLED", "1").lower() not in ("0", "false", "no")
26
  ASR_PROMPT = os.getenv("ASR_PROMPT", "Dies ist ein Diktat in deutscher Sprache.")
27
  ASR_MAX_DURATION_S = int(os.getenv("ASR_MAX_DURATION_S", "30"))
28
- ASR_BACKEND = os.getenv("ASR_BACKEND", "local") # local | groq
29
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
30
 
31
  _asr = None
32
  _tts = None
@@ -102,15 +100,6 @@ def transcribe_audio(audio_path: str, language: Optional[str] = None, max_durati
102
  print(">>> Kein Audio gefunden.")
103
  return ""
104
 
105
- # Groq-Backend optional
106
- if ASR_BACKEND.lower() == "groq" and GROQ_API_KEY:
107
- try:
108
- txt = transcribe_with_groq(audio_path)
109
- if isinstance(txt, str) and txt.strip():
110
- return txt.strip()
111
- except Exception as e:
112
- print(f">>> Groq-Backend Fehler: {e}. Fallback auf lokalen Whisper.")
113
-
114
  # WAV einlesen (soundfile garantiert PCM korrekt)
115
  data, sr = sf.read(audio_path, always_2d=False)
116
 
@@ -227,35 +216,6 @@ def transcribe_audio(audio_path: str, language: Optional[str] = None, max_durati
227
  print("ASR:", text)
228
  return text
229
 
230
- def transcribe_with_groq(file_path: str) -> Optional[str]:
231
- """Transkription via Groq Whisper large v3 turbo (verbose_json)."""
232
- try:
233
- import groq
234
- except Exception:
235
- return None
236
- if not (isinstance(file_path, str) and os.path.exists(file_path) and GROQ_API_KEY):
237
- return None
238
- client = groq.Client(api_key=GROQ_API_KEY)
239
- with open(file_path, "rb") as f:
240
- try:
241
- resp = client.audio.transcriptions.with_raw_response.create(
242
- model="whisper-large-v3-turbo",
243
- file=("audio.wav", f),
244
- response_format="verbose_json",
245
- )
246
- data = resp.parse()
247
- # verbose_json enthält segments mit no_speech_prob
248
- segments = getattr(data, "segments", None) or data.get("segments") if isinstance(data, dict) else None
249
- if segments and len(segments) > 0:
250
- ns = segments[0].get("no_speech_prob", 0)
251
- if ns and ns > 0.7:
252
- return ""
253
- text = getattr(data, "text", None) or data.get("text") if isinstance(data, dict) else None
254
- return (text or "").strip()
255
- except Exception as e:
256
- print(f">>> Groq Transkription fehlgeschlagen: {e}")
257
- return None
258
-
259
  # ========================================================
260
  # TEXT-TO-SPEECH (TTS)
261
  # ========================================================
 
25
  TTS_ENABLED = os.getenv("TTS_ENABLED", "1").lower() not in ("0", "false", "no")
26
  ASR_PROMPT = os.getenv("ASR_PROMPT", "Dies ist ein Diktat in deutscher Sprache.")
27
  ASR_MAX_DURATION_S = int(os.getenv("ASR_MAX_DURATION_S", "30"))
 
 
28
 
29
  _asr = None
30
  _tts = None
 
100
  print(">>> Kein Audio gefunden.")
101
  return ""
102
 
 
 
 
 
 
 
 
 
 
103
  # WAV einlesen (soundfile garantiert PCM korrekt)
104
  data, sr = sf.read(audio_path, always_2d=False)
105
 
 
216
  print("ASR:", text)
217
  return text
218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  # ========================================================
220
  # TEXT-TO-SPEECH (TTS)
221
  # ========================================================