ai-tomoni committed on
Commit
5f63e76
·
verified ·
1 Parent(s): a977820

Create app_with_audio.py

Browse files
Files changed (1) hide show
  1. app_with_audio.py +84 -0
app_with_audio.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from huggingface_hub import InferenceClient
3
+ import os
4
+ import whisper
5
+ from gtts import gTTS
6
+ import time
7
+
8
# --- Configuration ---------------------------------------------------------

# Hugging Face access token, read from the "tomoniaccess" environment variable.
HF_TOKEN = os.getenv("tomoniaccess")

# Hosted inference client for the Mistral-7B instruct model.
client = InferenceClient(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    token=HF_TOKEN,
)

# Local Whisper "base" checkpoint, used below for German speech-to-text.
whisper_model = whisper.load_model("base")

# System prompt (German): empathetic-supporter persona for people with
# depression — validate feelings, offer small concrete help, never diagnose,
# and gently refer to professional help when needed.
SYSTEM_MESSAGE = (
    "Du bist ein einfühlsamer Unterstützer für Menschen mit Depressionen. "
    "Sprich sanft, validiere ihre Gefühle und biete kleine, konkrete Hilfestellungen an. "
    "Mach keine Diagnosen und verweise bei Bedarf freundlich auf professionelle Hilfe."
)
24
def full_pipeline(audio_path, max_tokens, temperature, top_p):
    """Transcribe German speech, generate an empathetic reply, and voice it.

    Args:
        audio_path: Path to the recorded input audio, or None/"" when the
            user submitted without recording anything.
        max_tokens: Maximum number of tokens the chat model may generate.
        temperature: Sampling temperature for the chat model.
        top_p: Nucleus-sampling probability mass for the chat model.

    Returns:
        Tuple of (transcribed user text, bot reply text, path to reply MP3
        or None when no audio reply was produced).
    """
    import tempfile  # local import: only needed for the TTS output file

    t0 = time.time()

    # Gradio passes None when the user hits submit without a recording;
    # Whisper would crash on it, so answer with a gentle prompt instead.
    if not audio_path:
        return "", "Bitte zuerst eine Aufnahme machen.", None

    # 1. Speech-to-text with Whisper, forced to German.
    t1 = time.time()
    result = whisper_model.transcribe(audio_path, language="de")
    user_input = result["text"].strip()
    t2 = time.time()
    print(f"⏱️ Transcription took {t2 - t1:.2f} sec")

    # 2. Streaming chat completion against the hosted Mistral model.
    messages = [
        {"role": "system", "content": SYSTEM_MESSAGE},
        {"role": "user", "content": user_input},
    ]
    response_text = ""
    t3 = time.time()
    for message in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = message.choices[0].delta.content
        if token:
            response_text += token
    t4 = time.time()
    print(f"🤖 Mistral response took {t4 - t3:.2f} sec")

    # 3. Text-to-speech. A unique temp file replaces the fixed
    # "response.mp3" so concurrent requests cannot overwrite each other.
    # Skip TTS entirely on an empty reply — gTTS raises on empty text.
    audio_output_path = None
    if response_text.strip():
        fd, audio_output_path = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)  # gTTS writes by path; release the open descriptor
        tts = gTTS(response_text, lang="de")
        tts.save(audio_output_path)
    t5 = time.time()
    print(f"🔊 TTS took {t5 - t4:.2f} sec")

    print(f"✅ Total processing time: {t5 - t0:.2f} sec")

    return user_input, response_text, audio_output_path
64
+
65
# ---------------------------------------------------------------------------
# Gradio UI: microphone in; transcript, reply text, and spoken reply out.
# ---------------------------------------------------------------------------

# NOTE(review): gr.Audio(source=...) is the Gradio 3.x keyword; Gradio 4.x
# renamed it to sources=["microphone"] — confirm against the pinned version.
_inputs = [
    gr.Audio(source="microphone", type="filepath", label="Sprich hier"),
    gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max neue Tokens"),
    gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperatur"),
    gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
]

_outputs = [
    gr.Textbox(label="Dein gesprochener Input"),
    gr.Textbox(label="Antwort des Bots"),
    gr.Audio(type="filepath", label="Antwort als Audio"),
]

demo = gr.Interface(
    fn=full_pipeline,
    inputs=_inputs,
    outputs=_outputs,
    title="Einfühlsamer Chatbot für emotionale Unterstützung",
    description="Sprich ins Mikrofon. Der Bot antwortet auf Deutsch, einfühlsam und unterstützend."
)

if __name__ == "__main__":
    demo.launch()