bdstar committed
Commit eb9f5ce · 1 Parent(s): 9adb73b

init commit

Files changed (4)
  1. app.py +170 -0
  2. apt.txt +1 -0
  3. requirements.txt +31 -0
  4. runtime.txt +1 -0
app.py ADDED
@@ -0,0 +1,170 @@
+ import gradio as gr
+ import os, tempfile
+ from faster_whisper import WhisperModel
+ from ollama import Client as OllamaClient
+
+ # ---- CONFIG ----
+ LLM_MODEL = "llama3.2:3b"   # or "mistral:7b", "qwen2.5:3b"
+ WHISPER_SIZE = "small"      # "base", "small", "medium"
+ USE_SILERO = True           # set False to use Coqui XTTS v2
+ USE_CONTEXT = False         # conversational-memory toggle (not wired up yet)
+
+ USE_REMOTE_OLLAMA = bool(os.getenv("OLLAMA_HOST"))
+
+ if USE_REMOTE_OLLAMA:
+     ollama = OllamaClient(host=os.environ["OLLAMA_HOST"])
+ else:
+     # Transformers fallback for Spaces (CPU-friendly small instruct model).
+     # Import the pipeline factory under an alias so the Gradio callback
+     # named `pipeline` below does not shadow it.
+     from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline as hf_pipeline
+     HF_CHAT_MODEL = os.getenv("HF_CHAT_MODEL", "google/gemma-2-2b-it")  # small instruct model that runs on CPU
+     HF_TOKEN = os.getenv("HF_TOKEN")
+
+     _tok = AutoTokenizer.from_pretrained(HF_CHAT_MODEL, token=HF_TOKEN)
+     _mdl = AutoModelForCausalLM.from_pretrained(HF_CHAT_MODEL, token=HF_TOKEN, torch_dtype="auto", device_map="auto")
+     gen = hf_pipeline("text-generation", model=_mdl, tokenizer=_tok, max_new_tokens=256)
+
+ # ---- STT (faster-whisper) ----
+ # Runs on GPU (float16) when CUDA is visible; otherwise CPU with int8.
+ stt_model = WhisperModel(WHISPER_SIZE,
+                          device="cuda" if os.environ.get("CUDA_VISIBLE_DEVICES") else "cpu",
+                          compute_type="float16" if os.environ.get("CUDA_VISIBLE_DEVICES") else "int8")
+
+ def speech_to_text(audio_path: str) -> str:
+     segments, _info = stt_model.transcribe(audio_path, beam_size=1, vad_filter=True, language="en")
+     text = "".join(seg.text for seg in segments).strip()
+     return text
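+
+ # Illustrative usage (hypothetical path): speech_to_text("clip.wav") -> "hello there"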
+
+ # ---- LLM (remote Ollama, or the Transformers fallback) ----
+ SYSTEM_PROMPT = """You are a friendly AI voice assistant.
+ Reply in one short, natural sentence only.
+ Sound warm and conversational, never formal.
+ Avoid multi-sentence or paragraph answers."""
+
+ def chat_with_llm(history_messages, user_text):
+     if USE_REMOTE_OLLAMA:
+         # Only system + current user (no conversational memory)
+         messages = [
+             {"role": "system", "content": SYSTEM_PROMPT},
+             {"role": "user", "content": user_text},
+         ]
+         resp = ollama.chat(model=LLM_MODEL, messages=messages)
+         return resp["message"]["content"]
+     else:
+         # Only system + current user (no conversational memory)
+         prompt = f"{SYSTEM_PROMPT}\nUser: {user_text}\nAssistant:"
+         out = gen(prompt, return_full_text=False, max_new_tokens=25,
+                   temperature=0.8, repetition_penalty=1.1)[0]["generated_text"]
+         return out.split("\n")[0].strip()
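+
+ # Sketch (assumption; USE_CONTEXT is not wired up yet): to enable memory,
+ # recent turns could be threaded in ahead of the current user message.
+ # pipeline() below already stores history as {"role", "content"} dicts:
+ #
+ #     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+ #     if USE_CONTEXT and history_messages:
+ #         messages += history_messages[-6:]  # keep only the last few turns
+ #     messages.append({"role": "user", "content": user_text})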
+
+
+ # near top-level (global singletons)
+ _SILERO_TTS = None
+
+ def tts_silero(text: str) -> str:
+     """
+     Return the path to a WAV file synthesized by Silero TTS.
+     Uses a cached model instance to avoid re-downloading on each request.
+     """
+     import torch
+     import soundfile as sf
+
+     global _SILERO_TTS
+     if _SILERO_TTS is None:
+         obj = torch.hub.load(
+             repo_or_dir="snakers4/silero-models",
+             model="silero_tts",
+             language="en",
+             speaker="v3_en",
+             trust_repo=True,  # avoids the interactive trust prompt
+         )
+         _SILERO_TTS = obj[0] if isinstance(obj, (list, tuple)) else obj
+
+     model = _SILERO_TTS
+     sample_rate = 48000
+     speaker = "en_0"
+     audio = model.apply_tts(text=text, speaker=speaker, sample_rate=sample_rate)
+
+     out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name  # mktemp is race-prone
+     sf.write(out_wav, audio, sample_rate)
+     return out_wav
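+
+ # Usage sketch: tts_silero("Hello!") returns a temp .wav path; other v3_en
+ # voices (e.g. "en_1", "en_2") can likely be swapped in via `speaker`.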
+
+
+ def tts_coqui_xtts(text: str) -> str:
+     """
+     Return the path to a WAV file synthesized by Coqui XTTS v2 (higher quality; GPU-friendly).
+     """
+     from TTS.api import TTS
+     tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
+     out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
+     tts.tts_to_file(text=text, file_path=out_wav, speaker="female-en-5", language="en")
+     return out_wav
+
+ def text_to_speech(text: str) -> str:
+     return tts_silero(text) if USE_SILERO else tts_coqui_xtts(text)
+
+ # ---- Gradio pipeline ----
+ def pipeline(audio, history):
+     # audio is (sample_rate, np.ndarray) or a filepath, depending on the
+     # Gradio version; normalize it to a WAV file on disk.
+     if audio is None:
+         return history or [], None, "Please speak something."
+
+     if isinstance(audio, tuple):
+         # (sr, data) -> write wav
+         import soundfile as sf
+         sr, data = audio
+         tmp_in = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
+         sf.write(tmp_in, data.astype("float32"), sr)
+         audio_path = tmp_in
+     else:
+         audio_path = audio  # already a filepath
+
+     user_text = speech_to_text(audio_path)
+     if not user_text:
+         return history or [], None, "Didn't catch that. Could you repeat?"
+
+     reply = chat_with_llm(history, user_text)
+
+     # Speak only the conversational part: if the model emitted a "Reply:" block,
+     # extract it for TTS; otherwise speak the whole reply.
+     speak_text = reply
+     if "Reply:" in reply:
+         speak_text = reply.split("Reply:", 1)[1].strip()
+
+     wav_path = text_to_speech(speak_text)
+     updated = (history or []) + [
+         {"role": "user", "content": user_text},
+         {"role": "assistant", "content": reply},
+     ]
+     return updated, wav_path, ""
+
+ with gr.Blocks(title="Voice Coach") as demo:
+     gr.Markdown("## 🎙️ Interactive Voice Chat")
+     with gr.Row():
+         audio_in = gr.Audio(sources=["microphone"], type="filepath", label="Speak")
+         audio_out = gr.Audio(label="Assistant (TTS)", autoplay=True)
+     chatbox = gr.Chatbot(type="messages", height=300)
+     status = gr.Markdown()
+     btn = gr.Button("Send")
+
+     # Fires when a recording finishes (change) or when "Send" is pressed
+     audio_in.change(pipeline, inputs=[audio_in, chatbox], outputs=[chatbox, audio_out, status])
+     btn.click(pipeline, inputs=[audio_in, chatbox], outputs=[chatbox, audio_out, status])
+
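+ # Run note (assumption): `python app.py` launches the UI locally; set OLLAMA_HOST
+ # to use a remote Ollama server, otherwise the Transformers fallback is loaded.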
+ if __name__ == "__main__":
+     demo.launch()
apt.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,31 @@
+ # --- Core app / UI ---
+ gradio==5.49.1
+ gradio-client==1.13.3
+ huggingface_hub==0.33.5  # NOTE: <1.0 to satisfy transformers, >=0.33.5 to satisfy gradio
+
+ # --- STT ---
+ faster-whisper==1.2.0
+ ctranslate2==4.6.0
+ onnxruntime==1.23.2
+
+ # --- LLM fallback for Spaces (since no local Ollama) ---
+ transformers==4.44.2
+ accelerate
+ safetensors
+ sentencepiece
+ einops
+ protobuf==3.20.3  # old protobuf pin; avoids incompatibilities with prebuilt wheels
+
+ # --- TTS & audio IO ---
+ torch==2.4.0  # CPU build; OK on Spaces' CPU machines
+ soundfile==0.13.1
+ pydub==0.25.1
+ numpy==1.26.4  # final NumPy 1.x release; the pins above predate NumPy 2
+
+ # --- HTTP server ---
+ uvicorn==0.38.0
+
+ ollama  # client for a remote Ollama server (used when OLLAMA_HOST is set)
+
+ omegaconf==2.3.0  # Silero TTS dependency
+ num2words==0.5.13  # Silero TTS dependency
runtime.txt ADDED
@@ -0,0 +1 @@
+ python-3.11