File size: 2,617 Bytes
5f63e76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# Standard library
import os
import tempfile
import time

# Third-party
import gradio as gr
import whisper
from gtts import gTTS
from huggingface_hub import InferenceClient

# Load token and model.
# NOTE(review): "tomoniaccess" looks like a Space/secret name rather than the
# conventional HF_TOKEN variable — confirm it actually holds a valid
# Hugging Face access token; os.getenv returns None if it is unset.
HF_TOKEN = os.getenv("tomoniaccess")
# Remote inference client for the Mistral instruct model (all generation
# happens on the Hugging Face Inference API, not locally).
client = InferenceClient(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    token=HF_TOKEN
)

# Load Whisper base model once at import time; this downloads the weights on
# first run and keeps the model resident for all requests.
whisper_model = whisper.load_model("base")

# System prompt (German): empathetic supporter persona for people with
# depression — validate feelings, offer small concrete help, no diagnoses,
# point to professional help when needed.
SYSTEM_MESSAGE = (
    "Du bist ein einfühlsamer Unterstützer für Menschen mit Depressionen. "
    "Sprich sanft, validiere ihre Gefühle und biete kleine, konkrete Hilfestellungen an. "
    "Mach keine Diagnosen und verweise bei Bedarf freundlich auf professionelle Hilfe."
)

def full_pipeline(audio_path, max_tokens, temperature, top_p):
    """Transcribe German speech, generate an empathetic reply, and voice it.

    Pipeline: Whisper STT -> Mistral chat completion (streamed) -> gTTS.

    Parameters
    ----------
    audio_path : str | None
        Path to the recorded audio file. Gradio passes ``None`` when the
        user submits without recording anything.
    max_tokens : int
        Maximum number of tokens the chat model may generate.
    temperature : float
        Sampling temperature for the chat model.
    top_p : float
        Nucleus-sampling cutoff for the chat model.

    Returns
    -------
    tuple[str, str, str | None]
        (transcribed user input, bot reply text, path to the reply MP3;
        ``None`` when no audio was recorded).
    """
    t0 = time.time()

    # Guard: without a recording, Whisper would raise on a None path.
    if not audio_path:
        return "", "Bitte zuerst eine Aufnahme machen.", None

    # 1. Transcription (language pinned to German so Whisper skips detection).
    t1 = time.time()
    result = whisper_model.transcribe(audio_path, language="de")
    user_input = result["text"].strip()  # Whisper pads with leading whitespace
    t2 = time.time()
    print(f"⏱️ Transcription took {t2 - t1:.2f} sec")

    # 2. Chat completion — stream deltas and accumulate the full reply.
    messages = [
        {"role": "system", "content": SYSTEM_MESSAGE},
        {"role": "user", "content": user_input},
    ]
    response_text = ""
    t3 = time.time()
    for message in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = message.choices[0].delta.content
        if token:  # delta.content can be None on role/metadata chunks
            response_text += token
    t4 = time.time()
    print(f"🤖 Mistral response took {t4 - t3:.2f} sec")

    # 3. Text to speech. Use a unique temp file instead of a fixed
    # "response.mp3" so concurrent requests cannot overwrite each other.
    tts = gTTS(response_text, lang="de")
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        audio_output_path = tmp.name
    tts.save(audio_output_path)
    t5 = time.time()
    print(f"🔊 TTS took {t5 - t4:.2f} sec")

    print(f"✅ Total processing time: {t5 - t0:.2f} sec")

    return user_input, response_text, audio_output_path

# Gradio UI: microphone input plus the three sampling sliders that are passed
# straight through to full_pipeline; outputs mirror its 3-tuple return value.
demo = gr.Interface(
    fn=full_pipeline,
    inputs=[
        # NOTE(review): `source=` was renamed to `sources=["microphone"]` in
        # Gradio 4.x — confirm the pinned Gradio version still accepts this.
        gr.Audio(source="microphone", type="filepath", label="Sprich hier"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max neue Tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperatur"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
    outputs=[
        gr.Textbox(label="Dein gesprochener Input"),
        gr.Textbox(label="Antwort des Bots"),
        gr.Audio(type="filepath", label="Antwort als Audio"),
    ],
    title="Einfühlsamer Chatbot für emotionale Unterstützung",
    description="Sprich ins Mikrofon. Der Bot antwortet auf Deutsch, einfühlsam und unterstützend."
)

# Start the Gradio server only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()