commit
Browse files
- app.py +12 -41
- speech_io.py +0 -40
app.py
CHANGED
|
@@ -180,17 +180,12 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
|
|
| 180 |
"Du kannst Text eingeben oder direkt ins Mikrofon sprechen."
|
| 181 |
)
|
| 182 |
|
| 183 |
-
# State für nahtlose Konversation
|
| 184 |
-
state = gr.State(AppState(conversation=[], recording_state="idle", mode="Audio", last_record_path=None, status_text="Bereit"))
|
| 185 |
-
|
| 186 |
# Einspaltiges Layout, alles untereinander (verhindert abgeschnittene Bereiche)
|
| 187 |
with gr.Column(elem_id="chat-wrap"):
|
| 188 |
chatbot = gr.Chatbot(
|
| 189 |
label="Chat",
|
| 190 |
height=280,
|
| 191 |
)
|
| 192 |
-
spoken_out = gr.Textbox(label="Gesprochener Text", interactive=False)
|
| 193 |
-
status_md = gr.Markdown("Bereit")
|
| 194 |
|
| 195 |
# Eingabezeile à la ChatGPT: Plus + Text + Mikro + Senden
|
| 196 |
with gr.Row(elem_id="chat-input-row"):
|
|
@@ -216,55 +211,31 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
|
|
| 216 |
show_label=False,
|
| 217 |
)
|
| 218 |
send_btn = gr.Button("➤", elem_classes=["compact-btn", "send-btn"], scale=1)
|
| 219 |
-
lang_dd = gr.Dropdown(choices=["auto","de","en","vi"], value="auto", label="Sprache")
|
| 220 |
-
mode_radio = gr.Radio(choices=["Audio","Text"], value="Audio", label="Eingabemodus")
|
| 221 |
-
record_player = gr.Audio(label="Letzte Aufnahme", type="filepath", interactive=False)
|
| 222 |
-
stop_rec_btn = gr.Button("⏹ Aufnahme löschen")
|
| 223 |
|
| 224 |
# Senden bei Enter
|
| 225 |
chat_text.submit(
|
| 226 |
chat_fn,
|
| 227 |
-
[chat_text, chat_audio, chatbot
|
| 228 |
-
[chatbot, chat_text, chat_audio
|
| 229 |
)
|
| 230 |
-
def transcribe_to_textbox(audio_path
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
app_state.last_record_path = audio_path
|
| 234 |
-
s = transcribe_audio(audio_path, language=lang)
|
| 235 |
-
app_state.status_text = "✅ Verarbeitung abgeschlossen"
|
| 236 |
-
return s, s, audio_path, app_state.status_text
|
| 237 |
-
chat_audio.stream(
|
| 238 |
transcribe_to_textbox,
|
| 239 |
-
[chat_audio
|
| 240 |
-
[chat_text
|
| 241 |
)
|
| 242 |
-
chat_audio.
|
| 243 |
transcribe_to_textbox,
|
| 244 |
-
[chat_audio
|
| 245 |
-
[chat_text
|
| 246 |
)
|
| 247 |
send_btn.click(
|
| 248 |
chat_fn,
|
| 249 |
-
[chat_text, chat_audio, chatbot
|
| 250 |
-
[chatbot, chat_text, chat_audio
|
| 251 |
)
|
| 252 |
|
| 253 |
-
def toggle_mode(m, app_state: AppState):
|
| 254 |
-
app_state.mode = m
|
| 255 |
-
status = "Audio-Modus aktiv" if m == "Audio" else "Text-Modus aktiv"
|
| 256 |
-
return gr.update(visible=(m == "Text")), gr.update(visible=(m == "Audio")), status
|
| 257 |
-
mode_radio.change(toggle_mode, [mode_radio, state], [chat_text, chat_audio, status_md])
|
| 258 |
-
|
| 259 |
-
def clear_record(p):
|
| 260 |
-
try:
|
| 261 |
-
if isinstance(p, str) and os.path.exists(p):
|
| 262 |
-
os.remove(p)
|
| 263 |
-
except:
|
| 264 |
-
pass
|
| 265 |
-
return None
|
| 266 |
-
stop_rec_btn.click(clear_record, [record_player], [record_player])
|
| 267 |
-
|
| 268 |
# Quellen & Dokumente kompakt unterhalb
|
| 269 |
with gr.Accordion("Quellen & Dokumente", open=False):
|
| 270 |
gr.Markdown("### 📄 Prüfungsordnung (PDF)")
|
|
|
|
| 180 |
"Du kannst Text eingeben oder direkt ins Mikrofon sprechen."
|
| 181 |
)
|
| 182 |
|
|
|
|
|
|
|
|
|
|
| 183 |
# Einspaltiges Layout, alles untereinander (verhindert abgeschnittene Bereiche)
|
| 184 |
with gr.Column(elem_id="chat-wrap"):
|
| 185 |
chatbot = gr.Chatbot(
|
| 186 |
label="Chat",
|
| 187 |
height=280,
|
| 188 |
)
|
|
|
|
|
|
|
| 189 |
|
| 190 |
# Eingabezeile à la ChatGPT: Plus + Text + Mikro + Senden
|
| 191 |
with gr.Row(elem_id="chat-input-row"):
|
|
|
|
| 211 |
show_label=False,
|
| 212 |
)
|
| 213 |
send_btn = gr.Button("➤", elem_classes=["compact-btn", "send-btn"], scale=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
|
| 215 |
# Senden bei Enter
|
| 216 |
chat_text.submit(
|
| 217 |
chat_fn,
|
| 218 |
+
[chat_text, chat_audio, chatbot],
|
| 219 |
+
[chatbot, chat_text, chat_audio],
|
| 220 |
)
|
| 221 |
+
def transcribe_to_textbox(audio_path):
|
| 222 |
+
return transcribe_audio(audio_path, language=ASR_LANGUAGE_HINT)
|
| 223 |
+
chat_audio.change(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
transcribe_to_textbox,
|
| 225 |
+
[chat_audio],
|
| 226 |
+
[chat_text],
|
| 227 |
)
|
| 228 |
+
chat_audio.stream(
|
| 229 |
transcribe_to_textbox,
|
| 230 |
+
[chat_audio],
|
| 231 |
+
[chat_text],
|
| 232 |
)
|
| 233 |
send_btn.click(
|
| 234 |
chat_fn,
|
| 235 |
+
[chat_text, chat_audio, chatbot],
|
| 236 |
+
[chatbot, chat_text, chat_audio],
|
| 237 |
)
|
| 238 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
# Quellen & Dokumente kompakt unterhalb
|
| 240 |
with gr.Accordion("Quellen & Dokumente", open=False):
|
| 241 |
gr.Markdown("### 📄 Prüfungsordnung (PDF)")
|
speech_io.py
CHANGED
|
@@ -25,8 +25,6 @@ ASR_DEFAULT_LANGUAGE = os.getenv("ASR_LANGUAGE", "de") # "auto" um Auto-Detect
|
|
| 25 |
TTS_ENABLED = os.getenv("TTS_ENABLED", "1").lower() not in ("0", "false", "no")
|
| 26 |
ASR_PROMPT = os.getenv("ASR_PROMPT", "Dies ist ein Diktat in deutscher Sprache.")
|
| 27 |
ASR_MAX_DURATION_S = int(os.getenv("ASR_MAX_DURATION_S", "30"))
|
| 28 |
-
ASR_BACKEND = os.getenv("ASR_BACKEND", "local") # local | groq
|
| 29 |
-
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
|
| 30 |
|
| 31 |
_asr = None
|
| 32 |
_tts = None
|
|
@@ -102,15 +100,6 @@ def transcribe_audio(audio_path: str, language: Optional[str] = None, max_durati
|
|
| 102 |
print(">>> Kein Audio gefunden.")
|
| 103 |
return ""
|
| 104 |
|
| 105 |
-
# Groq-Backend optional
|
| 106 |
-
if ASR_BACKEND.lower() == "groq" and GROQ_API_KEY:
|
| 107 |
-
try:
|
| 108 |
-
txt = transcribe_with_groq(audio_path)
|
| 109 |
-
if isinstance(txt, str) and txt.strip():
|
| 110 |
-
return txt.strip()
|
| 111 |
-
except Exception as e:
|
| 112 |
-
print(f">>> Groq-Backend Fehler: {e}. Fallback auf lokalen Whisper.")
|
| 113 |
-
|
| 114 |
# WAV einlesen (soundfile garantiert PCM korrekt)
|
| 115 |
data, sr = sf.read(audio_path, always_2d=False)
|
| 116 |
|
|
@@ -227,35 +216,6 @@ def transcribe_audio(audio_path: str, language: Optional[str] = None, max_durati
|
|
| 227 |
print("ASR:", text)
|
| 228 |
return text
|
| 229 |
|
| 230 |
-
def transcribe_with_groq(file_path: str) -> Optional[str]:
|
| 231 |
-
"""Transkription via Groq Whisper large v3 turbo (verbose_json)."""
|
| 232 |
-
try:
|
| 233 |
-
import groq
|
| 234 |
-
except Exception:
|
| 235 |
-
return None
|
| 236 |
-
if not (isinstance(file_path, str) and os.path.exists(file_path) and GROQ_API_KEY):
|
| 237 |
-
return None
|
| 238 |
-
client = groq.Client(api_key=GROQ_API_KEY)
|
| 239 |
-
with open(file_path, "rb") as f:
|
| 240 |
-
try:
|
| 241 |
-
resp = client.audio.transcriptions.with_raw_response.create(
|
| 242 |
-
model="whisper-large-v3-turbo",
|
| 243 |
-
file=("audio.wav", f),
|
| 244 |
-
response_format="verbose_json",
|
| 245 |
-
)
|
| 246 |
-
data = resp.parse()
|
| 247 |
-
# verbose_json enthält segments mit no_speech_prob
|
| 248 |
-
segments = getattr(data, "segments", None) or data.get("segments") if isinstance(data, dict) else None
|
| 249 |
-
if segments and len(segments) > 0:
|
| 250 |
-
ns = segments[0].get("no_speech_prob", 0)
|
| 251 |
-
if ns and ns > 0.7:
|
| 252 |
-
return ""
|
| 253 |
-
text = getattr(data, "text", None) or data.get("text") if isinstance(data, dict) else None
|
| 254 |
-
return (text or "").strip()
|
| 255 |
-
except Exception as e:
|
| 256 |
-
print(f">>> Groq Transkription fehlgeschlagen: {e}")
|
| 257 |
-
return None
|
| 258 |
-
|
| 259 |
# ========================================================
|
| 260 |
# TEXT-TO-SPEECH (TTS)
|
| 261 |
# ========================================================
|
|
|
|
| 25 |
TTS_ENABLED = os.getenv("TTS_ENABLED", "1").lower() not in ("0", "false", "no")
|
| 26 |
ASR_PROMPT = os.getenv("ASR_PROMPT", "Dies ist ein Diktat in deutscher Sprache.")
|
| 27 |
ASR_MAX_DURATION_S = int(os.getenv("ASR_MAX_DURATION_S", "30"))
|
|
|
|
|
|
|
| 28 |
|
| 29 |
_asr = None
|
| 30 |
_tts = None
|
|
|
|
| 100 |
print(">>> Kein Audio gefunden.")
|
| 101 |
return ""
|
| 102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
# WAV einlesen (soundfile garantiert PCM korrekt)
|
| 104 |
data, sr = sf.read(audio_path, always_2d=False)
|
| 105 |
|
|
|
|
| 216 |
print("ASR:", text)
|
| 217 |
return text
|
| 218 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
# ========================================================
|
| 220 |
# TEXT-TO-SPEECH (TTS)
|
| 221 |
# ========================================================
|