mgokg committed on
Commit
d0b84c0
·
verified ·
1 Parent(s): b33b22e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -35
app.py CHANGED
@@ -1,26 +1,27 @@
1
  import asyncio
 
2
  import numpy as np
 
3
  import gradio as gr
 
4
  from fastrtc import AsyncStreamHandler, Stream, wait_for_item
5
- from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
6
- import torch
7
 
8
- # 1. Initialize Open Source Models
9
- # Using Whisper for STT, Phi-3 for LLM (fast), and a local TTS pipeline
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
11
 
12
- # STT: Whisper Tiny is fast enough for real-time
13
  stt_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=device)
14
 
15
- # LLM: Phi-3-mini is lightweight and powerful
16
  llm_model = "microsoft/Phi-3-mini-4k-instruct"
17
  tokenizer = AutoTokenizer.from_pretrained(llm_model)
18
- llm_pipe = pipeline("text-generation", model=llm_model, tokenizer=tokenizer, device=device)
19
 
20
- # TTS: We use a simple gTTS or local pipeline (using a placeholder for logic here)
21
- # For a pure local OS experience, Bark or Parler-TTS are great but heavy.
22
- # Here we use a standard TTS pipeline.
23
- tts_pipe = pipeline("text-to-speech", model="facebook/mms-tts-deu", device=device) # German
24
 
25
  class OpenSourceHandler(AsyncStreamHandler):
26
  def __init__(self, expected_layout="mono", output_sample_rate=24000):
@@ -29,24 +30,34 @@ class OpenSourceHandler(AsyncStreamHandler):
29
 
30
  async def receive(self, frame: tuple[int, np.ndarray]) -> None:
31
  rate, array = frame
32
- # 1. STT: Convert Audio to Text
33
- # Whisper expects float32
 
 
 
 
 
34
  audio_fp32 = array.astype(np.float32) / 32768.0
35
  text_result = stt_pipe({"sampling_rate": rate, "raw": audio_fp32})["text"]
36
 
37
- if len(text_result.strip()) > 2:
38
- # 2. LLM: Generate Response
 
 
39
  prompt = f"<|user|>\n{text_result}<|end|>\n<|assistant|>"
40
- response = llm_pipe(prompt, max_new_tokens=50, do_sample=True)[0]["generated_text"]
41
- answer = response.split("<|assistant|>")[-1].strip()
42
 
43
- # 3. TTS: Convert Text back to Audio
 
44
  audio_out = tts_pipe(answer)
45
- audio_data = np.frombuffer(audio_out["audio"], dtype=np.float32)
46
 
47
- # Convert to int16 for the output stream
48
  audio_int16 = (audio_data * 32767).astype(np.int16)
49
- self.output_queue.put_nowait((self.output_sample_rate, audio_int16))
 
 
50
 
51
  async def emit(self) -> tuple[int, np.ndarray] | None:
52
  return await wait_for_item(self.output_queue)
@@ -54,28 +65,23 @@ class OpenSourceHandler(AsyncStreamHandler):
54
  def copy(self) -> "OpenSourceHandler":
55
  return OpenSourceHandler(output_sample_rate=self.output_sample_rate)
56
 
57
- # 2. Define the Gradio Stream
 
 
58
  stream = Stream(
59
  modality="audio",
60
  mode="send-receive",
61
  handler=OpenSourceHandler(),
62
  additional_inputs=[
63
- gr.Dropdown(
64
- label="System Language",
65
- choices=["de-DE", "en-US"],
66
- value="de-DE",
67
- ),
68
  ],
69
  )
70
 
 
 
 
71
  if __name__ == "__main__":
72
  import uvicorn
73
- import os
74
-
75
- # Hugging Face Spaces provides the port via an environment variable,
76
- # but defaults to 7860.
77
  port = int(os.getenv("PORT", 7860))
78
-
79
- # We bind to 0.0.0.0 to make the app accessible within the HF container
80
- #uvicorn.run(app, host="0.0.0.0", port=port)
81
- stream.launch(server_name="0.0.0.0", server_port=port)
 
1
  import asyncio
2
+ import os
3
  import numpy as np
4
+ import torch
5
  import gradio as gr
6
+ from fastapi import FastAPI
7
  from fastrtc import AsyncStreamHandler, Stream, wait_for_item
8
+ from transformers import pipeline, AutoTokenizer
 
9
 
10
# 1. Hardware & model initialization
device = "cuda" if torch.cuda.is_available() else "cpu"

# Use 4-bit quantization when CUDA is available to cut latency.
# NOTE(review): passing `load_in_4bit` through model_kwargs is deprecated in
# newer transformers (BitsAndBytesConfig is the supported route) and requires
# bitsandbytes; combining it with an explicit `device=` may conflict with
# accelerate's device placement — confirm against the pinned versions.
if device == "cuda":
    model_kwargs = {"torch_dtype": torch.float16, "load_in_4bit": True}
else:
    model_kwargs = {}

# STT: whisper-tiny for minimal latency
stt_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=device)

# LLM: Phi-3-mini (instruction-tuned)
llm_model = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(llm_model)
llm_pipe = pipeline(
    "text-generation",
    model=llm_model,
    tokenizer=tokenizer,
    device=device,
    model_kwargs=model_kwargs,
)

# TTS: MMS-TTS for German speech output
tts_pipe = pipeline("text-to-speech", model="facebook/mms-tts-deu", device=device)
 
 
25
 
26
  class OpenSourceHandler(AsyncStreamHandler):
27
  def __init__(self, expected_layout="mono", output_sample_rate=24000):
 
30
 
31
  async def receive(self, frame: tuple[int, np.ndarray]) -> None:
32
  rate, array = frame
33
+
34
+ # Zugriff auf die Gradio-Zusatzeingaben (z.B. Sprache oder System-Prompt)
35
+ # latest_args[0] ist der erste zusätzliche Input nach dem Audio
36
+ # Hier beispielhaft für zukünftige Erweiterungen:
37
+ # system_msg = self.latest_args[0] if self.latest_args else "Du bist ein Assistent."
38
+
39
+ # 1. STT: Audio -> Text (Whisper erwartet Float32)
40
  audio_fp32 = array.astype(np.float32) / 32768.0
41
  text_result = stt_pipe({"sampling_rate": rate, "raw": audio_fp32})["text"]
42
 
43
+ # Rausch-Filter: Nur antworten, wenn wirklich Text erkannt wurde
44
+ if len(text_result.strip()) > 3:
45
+ # 2. LLM: Antwort generieren
46
+ # return_full_text=False verhindert, dass der Prompt mit ausgegeben wird
47
  prompt = f"<|user|>\n{text_result}<|end|>\n<|assistant|>"
48
+ outputs = llm_pipe(prompt, max_new_tokens=64, do_sample=True, return_full_text=False)
49
+ answer = outputs[0]["generated_text"].strip()
50
 
51
+ # 3. TTS: Text -> Audio
52
+ # MMS-TTS gibt ein Dict zurück: {'audio': ndarray, 'sampling_rate': int}
53
  audio_out = tts_pipe(answer)
54
+ audio_data = audio_out["audio"] # Das ist bereits ein numpy array
55
 
56
+ # Resampling / Konvertierung zu Int16 für den Stream
57
  audio_int16 = (audio_data * 32767).astype(np.int16)
58
+
59
+ # Wir nutzen await für die Queue, um sauberes Async-Verhalten zu garantieren
60
+ await self.output_queue.put((self.output_sample_rate, audio_int16))
61
 
62
    async def emit(self) -> tuple[int, np.ndarray] | None:
        """Yield the next queued (sample_rate, int16 audio) frame for playback.

        Delegates to fastrtc's wait_for_item; per the annotation it may
        return None when no item is available — exact timeout semantics are
        defined by fastrtc, not here.
        """
        return await wait_for_item(self.output_queue)
 
65
  def copy(self) -> "OpenSourceHandler":
66
  return OpenSourceHandler(output_sample_rate=self.output_sample_rate)
67
 
68
# 2. FastAPI & Stream setup
app = FastAPI()  # the ASGI app that uvicorn serves

# Editable system-prompt input surfaced in the Gradio UI
# (readable from the handler via self.latest_args).
system_prompt_box = gr.Textbox(label="System Message", value="Du bist ein hilfreicher KI-Assistent.")

stream = Stream(
    modality="audio",
    mode="send-receive",
    handler=OpenSourceHandler(),
    additional_inputs=[system_prompt_box],
)

# IMPORTANT: mount the WebRTC stream endpoints onto the FastAPI app.
stream.mount(app)
82
+
83
if __name__ == "__main__":
    import uvicorn

    # Hugging Face Spaces supplies PORT; fall back to 7860 locally.
    port = int(os.environ.get("PORT", "7860"))
    # Bind to 0.0.0.0 so the app is reachable from outside the container.
    uvicorn.run(app, host="0.0.0.0", port=port)