raichemathew1 committed on
Commit
de2ad9c
·
0 Parent(s):

Initial local S2S shell starter

Browse files
Files changed (7) hide show
  1. .gitignore +15 -0
  2. README.md +54 -0
  3. config.json +20 -0
  4. download_models.py +38 -0
  5. requirements.txt +7 -0
  6. run_shell_s2s.bat +5 -0
  7. shell_s2s.py +262 -0
.gitignore ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+
5
+ logs/
6
+ output/
7
+
8
+ models/llm/*.gguf
9
+ models/llm/*.bin
10
+ models/llm/*.safetensors
11
+
12
+ *.wav
13
+ *.mp3
14
+
15
+ .env
README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Local S2S Shell Starter
2
+
3
+ A simple local speech-to-speech assistant that runs from a Windows terminal.
4
+
5
+ ## Stack
6
+
7
+ - STT: faster-whisper medium
8
+ - LLM: Qwen2.5 3B Instruct GGUF Q4_K_M
9
+ - TTS: Windows SAPI voice
10
+ - UI: terminal only
11
+
12
+ ## Pipeline
13
+
14
+ microphone -> faster-whisper -> Qwen2.5 3B GGUF -> Windows SAPI speech
15
+
16
+ ## Hardware Target
17
+
18
+ - CPU fallback supported
19
+ - NVIDIA GPU auto-used when available
20
+ - 8GB+ VRAM recommended for smoother local use
21
+
22
+ ## Setup
23
+
24
+ Run from PowerShell:
25
+
26
+ py -3.11 -m venv .venv
27
+ .\.venv\Scripts\python.exe -m pip install --upgrade pip setuptools wheel
28
+ .\.venv\Scripts\python.exe -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
29
+ .\.venv\Scripts\python.exe -m pip install -r requirements.txt
30
+ .\.venv\Scripts\python.exe download_models.py
31
+
32
+ ## Run
33
+
34
+ .\run_shell_s2s.bat
35
+
36
+ ## Shell Commands
37
+
38
+ Enter = record mic and run speech-to-speech
39
+ t = type text and hear reply
40
+ d = list audio devices
41
+ q = quit
42
+
43
+ ## Model Download
44
+
45
+ The downloader fetches:
46
+
47
+ Repo: bartowski/Qwen2.5-3B-Instruct-GGUF
48
+ File: Qwen2.5-3B-Instruct-Q4_K_M.gguf
49
+
50
+ The GGUF model file is not committed to this repository.
51
+
52
+ ## Scope
53
+
54
+ This is a local voice-chat starter. It does not control the computer, run tools, or perform system automation.
config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "stt_model": "medium",
3
+ "stt_device": "auto",
4
+ "stt_compute_type": "auto",
5
+
6
+ "record_seconds": 4,
7
+ "sample_rate": 16000,
8
+
9
+ "llm_repo_id": "bartowski/Qwen2.5-3B-Instruct-GGUF",
10
+ "llm_filename": "Qwen2.5-3B-Instruct-Q4_K_M.gguf",
11
+ "llm_model_path": "models/llm/qwen2.5-3b-instruct-q4_k_m.gguf",
12
+ "llm_context_size": 2048,
13
+ "llm_gpu_layers": "auto",
14
+ "llm_temperature": 0.35,
15
+ "llm_max_tokens": 140,
16
+
17
+ "tts_model": "tts_models/en/ljspeech/vits",
18
+
19
+ "system_prompt": "You are a concise local voice assistant. Answer in one or two short sentences."
20
+ }
download_models.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Download the GGUF model named in config.json and place it at the configured
# llm_model_path. Exits early (status 0) when the target file already exists.
from pathlib import Path
import json
from huggingface_hub import hf_hub_download

ROOT = Path(__file__).resolve().parent
# "utf-8-sig" also accepts a config file saved with a UTF-8 byte-order mark.
CFG = json.loads((ROOT / "config.json").read_text(encoding="utf-8-sig"))

llm_dir = ROOT / "models" / "llm"
llm_dir.mkdir(parents=True, exist_ok=True)

target = ROOT / CFG["llm_model_path"]
# llm_model_path is configurable and is not guaranteed to live under
# models/llm, so make sure its folder exists before moving the file there.
target.parent.mkdir(parents=True, exist_ok=True)
repo_id = CFG["llm_repo_id"]
filename = CFG["llm_filename"]

print("LOCAL S2S SHELL - MODEL DOWNLOAD")
print("Repo:", repo_id)
print("File:", filename)
print("Target:", target)

if target.exists():
    print("GREEN: LLM already exists.")
    raise SystemExit(0)

# NOTE(review): local_dir_use_symlinks is deprecated in current
# huggingface_hub releases (files are written directly into local_dir), so it
# is no longer passed - confirm against the installed version.
downloaded = hf_hub_download(
    repo_id=repo_id,
    filename=filename,
    local_dir=str(llm_dir),
)

downloaded_path = Path(downloaded)

if downloaded_path.resolve() != target.resolve():
    # replace() overwrites an existing destination in a single step. Unlinking
    # the target first is unsafe: on a case-insensitive filesystem the target
    # name can refer to the very file that was just downloaded (the repo
    # filename and the configured path differ only by letter case).
    downloaded_path.replace(target)

print("GREEN: downloaded", target)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ faster-whisper
2
+ llama-cpp-python
3
+ sounddevice
4
+ soundfile
5
+ numpy
6
+ huggingface-hub
7
+ pywin32
run_shell_s2s.bat ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
@echo off
rem Launch the local speech-to-speech shell with the project's virtual environment.
cd /d "%~dp0"

rem Fail loudly instead of silently falling back to whatever "python" is on PATH.
if not exist ".venv\Scripts\python.exe" (
    echo Missing .venv - follow the Setup steps in README.md first.
    pause
    exit /b 1
)

".venv\Scripts\python.exe" shell_s2s.py
pause
shell_s2s.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import time
5
+ import traceback
6
+ from pathlib import Path
7
+
8
+ import sounddevice as sd
9
+ import soundfile as sf
10
+ from faster_whisper import WhisperModel
11
+ from llama_cpp import Llama
12
+ import win32com.client
13
+
14
+
15
# Folder that contains this script and config.json.
ROOT = Path(__file__).resolve().parent

# "utf-8-sig" also accepts a config file saved with a UTF-8 byte-order mark.
_config_text = (ROOT / "config.json").read_text(encoding="utf-8-sig")
CFG = json.loads(_config_text)

# Working folders for recorded audio and the log file; created on import.
OUTPUT = ROOT / "output"
LOGS = ROOT / "logs"
for _folder in (OUTPUT, LOGS):
    _folder.mkdir(exist_ok=True)

# Shared model handles; each stays None until its load_* function fills it in.
STT = None
LLM = None
SPEAKER = None
25
+
26
+
27
def log(msg: str) -> None:
    """Print ``msg`` with a timestamp and append the same line to the log file.

    The line is written to ``shell_s2s.log`` inside the ``LOGS`` folder;
    characters that cannot be encoded are replaced rather than raising.
    """
    entry = "[{}] {}".format(time.strftime("%Y-%m-%d %H:%M:%S"), msg)
    print(entry)

    log_file = LOGS / "shell_s2s.log"
    with log_file.open("a", encoding="utf-8", errors="replace") as handle:
        handle.write(entry + "\n")
33
+
34
+
35
def resolve_torch_cuda() -> bool:
    """Return True when torch can be imported and reports CUDA as available.

    Any exception raised while importing or querying torch counts as
    "no CUDA" and yields False.
    """
    try:
        import torch

        has_cuda = bool(torch.cuda.is_available())
    except Exception:
        has_cuda = False
    return has_cuda
41
+
42
+
43
def resolve_stt_device() -> str:
    """Pick the speech-to-text device: ``"cpu"`` or ``"cuda"``.

    An explicit ``stt_device`` of "cpu" or "cuda" in the config wins; any
    other value (including the default "auto") defers to CUDA detection.
    """
    choice = str(CFG.get("stt_device", "auto")).lower().strip()
    if choice == "cpu" or choice == "cuda":
        return choice

    if resolve_torch_cuda():
        return "cuda"
    return "cpu"
48
+
49
+
50
def resolve_stt_compute(device: str) -> str:
    """Pick the speech-to-text compute type for ``device``.

    A configured ``stt_compute_type`` other than "auto" is returned as-is
    (lower-cased and stripped). With "auto", "float16" is used on "cuda"
    and "int8" on anything else.
    """
    configured = str(CFG.get("stt_compute_type", "auto")).lower().strip()
    if configured == "auto":
        return "int8" if device != "cuda" else "float16"
    return configured
55
+
56
+
57
def resolve_llm_gpu_layers() -> int:
    """Translate the ``llm_gpu_layers`` config value into a GPU layer count.

    Accepted values:
      * an integer or numeric string - used as given;
      * ``true`` or ``"gpu"`` - returns ``-1``;
      * ``false`` or ``"cpu"`` - returns ``0``;
      * ``"auto"`` (the default) - ``-1`` when CUDA is detected, else ``0``.

    Any other value is reported through ``log`` and falls back to ``0``.
    """
    requested = CFG.get("llm_gpu_layers", "auto")

    # bool is a subclass of int, so it must be checked first: a JSON ``true``
    # should behave like "gpu" instead of being passed through as True (== 1).
    if isinstance(requested, bool):
        return -1 if requested else 0

    if isinstance(requested, int):
        return requested

    requested = str(requested).lower().strip()

    if requested == "cpu":
        return 0

    if requested == "gpu":
        return -1

    if requested == "auto":
        return -1 if resolve_torch_cuda() else 0

    try:
        return int(requested)
    except ValueError:
        # Keep the CPU fallback, but make the bad setting visible.
        log(f"WARNING: unrecognised llm_gpu_layers={requested!r}; using 0 GPU layers")
        return 0
78
+
79
+
80
def load_stt() -> None:
    """Create the global faster-whisper model on first call; later calls do nothing."""
    global STT
    if STT is None:
        model_name = CFG.get("stt_model", "medium")
        target_device = resolve_stt_device()
        compute_type = resolve_stt_compute(target_device)

        log(
            f"Loading STT: faster-whisper {model_name} "
            f"device={target_device} compute={compute_type}"
        )
        STT = WhisperModel(
            model_name,
            device=target_device,
            compute_type=compute_type,
        )
91
+
92
+
93
def load_llm() -> None:
    """Create the global llama.cpp model on first call; later calls do nothing.

    Raises:
        FileNotFoundError: if the configured GGUF file is not on disk.
    """
    global LLM
    if LLM is not None:
        return

    gguf_path = ROOT / CFG["llm_model_path"]
    if not gguf_path.exists():
        raise FileNotFoundError(f"Missing LLM model: {gguf_path}. Run download_models.py first.")

    layers = resolve_llm_gpu_layers()
    log(f"Loading LLM: {gguf_path.name} gpu_layers={layers}")

    llama_options = {
        "model_path": str(gguf_path),
        "n_ctx": int(CFG.get("llm_context_size", 2048)),
        "n_gpu_layers": layers,
        "verbose": False,
    }
    LLM = Llama(**llama_options)
111
+
112
+
113
def load_sapi() -> None:
    """Create the global Windows SAPI voice on first call; later calls do nothing.

    Tuning the voice is best-effort: if setting the rate or volume fails, the
    failure is logged and the voice is kept with whatever settings it has.
    """
    global SPEAKER
    if SPEAKER is not None:
        return

    log("Loading Windows SAPI voice")
    SPEAKER = win32com.client.Dispatch("SAPI.SpVoice")

    try:
        # Slightly faster than default.
        SPEAKER.Rate = 1
        SPEAKER.Volume = 100
    except Exception as exc:
        # Do not abort start-up over cosmetics, but do not hide the problem.
        log(f"WARNING: could not set SAPI rate/volume: {exc!r}")
127
+
128
+
129
def load_all() -> None:
    """Load speech-to-text, the LLM and the SAPI voice, then log the total time."""
    started = time.perf_counter()
    for loader in (load_stt, load_llm, load_sapi):
        loader()
    elapsed = time.perf_counter() - started
    log(f"GREEN: all models loaded in {elapsed:.2f}s")
135
+
136
+
137
def record_audio() -> Path:
    """Record a fixed-length mono clip from the microphone and save it as a WAV.

    Duration and sample rate come from ``record_seconds`` and ``sample_rate``
    in the config. The clip is written to ``input.wav`` in the ``OUTPUT``
    folder, overwriting any previous recording.

    Returns:
        Path of the written WAV file.
    """
    duration = float(CFG.get("record_seconds", 4))
    rate = int(CFG.get("sample_rate", 16000))
    wav_path = OUTPUT / "input.wav"

    print("")
    print(f"Recording {duration:.1f}s. Speak now.")

    frame_count = int(duration * rate)
    samples = sd.rec(frame_count, samplerate=rate, channels=1, dtype="float32")
    # Wait for the recording to finish before the samples are written out.
    sd.wait()

    sf.write(str(wav_path), samples, rate)
    return wav_path
154
+
155
+
156
def transcribe(audio_path: Path) -> str:
    """Transcribe ``audio_path`` with the loaded STT model and log text and timing.

    Returns:
        The segment texts joined by single spaces, or "" when there are none.
    """
    started = time.perf_counter()

    segments, _info = STT.transcribe(
        str(audio_path),
        beam_size=1,
        vad_filter=True,
        condition_on_previous_text=False,
    )

    # The segments are iterated inside the timed region.
    pieces = [segment.text.strip() for segment in segments]
    text = " ".join(pieces).strip()

    elapsed = time.perf_counter() - started
    log(f"STT {elapsed:.2f}s: {text}")
    return text
169
+
170
+
171
def generate_reply(user_text: str) -> str:
    """Ask the loaded LLM for a reply to ``user_text`` and log text and timing.

    The prompt is a single-turn ChatML conversation: the configured
    ``system_prompt`` followed by the user's text. Generation settings come
    from ``llm_max_tokens`` and ``llm_temperature`` in the config.

    Returns:
        The generated reply with surrounding whitespace removed.
    """
    t0 = time.perf_counter()

    system = CFG.get("system_prompt", "You are concise.")
    # ChatML closes each turn with <|im_end|> directly after the content; the
    # newline belongs after the end tag, not before it.
    prompt = (
        "<|im_start|>system\n"
        f"{system}<|im_end|>\n"
        "<|im_start|>user\n"
        f"{user_text}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    result = LLM(
        prompt,
        max_tokens=int(CFG.get("llm_max_tokens", 140)),
        temperature=float(CFG.get("llm_temperature", 0.35)),
        # Stop at the end of the assistant turn or if a new turn is started.
        stop=["<|im_end|>", "<|im_start|>"],
    )

    reply = result["choices"][0]["text"].strip()
    log(f"LLM {time.perf_counter() - t0:.2f}s: {reply}")
    return reply
195
+
196
+
197
def speak_text(reply: str) -> None:
    """Send ``reply`` to the SAPI voice and log how long the call took."""
    started = time.perf_counter()
    SPEAKER.Speak(reply)
    elapsed = time.perf_counter() - started
    log(f"SAPI SPEAK {elapsed:.2f}s")
201
+
202
+
203
def show_devices() -> None:
    """Print the audio devices reported by sounddevice."""
    devices = sd.query_devices()
    print(devices)
205
+
206
+
207
def one_turn_from_text(text: str) -> None:
    """Generate a reply to ``text`` and speak it; blank input is ignored."""
    cleaned = text.strip()
    if cleaned:
        speak_text(generate_reply(cleaned))
212
+
213
+
214
def one_turn_from_mic() -> None:
    """Record from the microphone, transcribe it, and answer what was heard.

    When the transcription is empty, a "No speech detected." line is logged
    and no reply is generated.
    """
    wav_path = record_audio()
    heard = transcribe(wav_path)
    if heard:
        one_turn_from_text(heard)
    else:
        log("No speech detected.")
221
+
222
+
223
def main() -> int:
    """Run the interactive speech-to-speech shell.

    Prints the command help, loads all models, then loops on the ``S2S>``
    prompt. Errors raised while handling a command are logged and the loop
    continues.

    Returns:
        0 after a normal quit (``q``/``quit``/``exit``, Ctrl+C, or end of
        input); 1 when the models could not be loaded.
    """
    print("LOCAL S2S SHELL - SAPI LOW LATENCY")
    print("")
    print("Commands:")
    print("  Enter = record mic and run speech-to-speech")
    print("  t     = type text and hear reply")
    print("  d     = list audio devices")
    print("  q     = quit")
    print("")

    try:
        load_all()
    except Exception as e:
        # Record start-up failures (e.g. a missing model file) in the log
        # as well as on screen, and exit with a failure status.
        log("ERROR: " + repr(e))
        traceback.print_exc()
        return 1

    while True:
        try:
            # The prompt is inside the try so Ctrl+C or a closed stdin at
            # "S2S>" exits cleanly instead of ending in a traceback.
            cmd = input("\nS2S> ").strip().lower()

            if cmd in ("q", "quit", "exit"):
                print("bye")
                return 0

            if cmd == "d":
                show_devices()
            elif cmd == "t":
                text = input("TEXT> ")
                one_turn_from_text(text)
            else:
                # Plain Enter (or anything unrecognised) starts a mic turn.
                one_turn_from_mic()

        except (KeyboardInterrupt, EOFError):
            print("")
            return 0
        except Exception as e:
            log("ERROR: " + repr(e))
            traceback.print_exc()
259
+
260
+
261
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    exit_code = main()
    raise SystemExit(exit_code)