AIencoder committed on
Commit
5696928
·
verified ·
1 Parent(s): 50c695a

v3: install llama-cpp-python from prebuilt wheel (HF dataset, no source build)

Browse files
Files changed (3) hide show
  1. README.md +8 -4
  2. app.py +4 -13
  3. requirements.txt +7 -1
README.md CHANGED
@@ -25,9 +25,13 @@ Two tabs:
25
  Hadamard rotation Gaussianizes per-block weight distributions and
26
  reduces the per-block max-abs that drives Q4 / Q4_K rounding error.
27
 
28
- ## Notes
29
 
30
- - Python pinned to 3.12 (3.13 dropped stdlib `audioop` which Gradio's
31
  pydub dep needs).
32
- - First call cold-starts the model (~668 MB GGUF download). Subsequent
33
- calls are fast.
 
 
 
 
 
25
  Hadamard rotation Gaussianizes per-block weight distributions and
26
  reduces the per-block max-abs that drives Q4 / Q4_K rounding error.
27
 
28
+ ## Build details
29
 
30
+ - **Python 3.12** pinned (3.13 dropped stdlib `audioop` which Gradio's
31
  pydub dep needs).
32
+ - **llama-cpp-python** installed from a **prebuilt wheel** at
33
+ [AIencoder/llama-cpp-wheels](https://huggingface.co/datasets/AIencoder/llama-cpp-wheels)
34
+ (variant `0.3.16+basic_avx2_fma_f16c-cp312`). HF Spaces don't reliably
35
+ build this from source, so we ship the binary.
36
+ - The first `generate` call cold-starts the model (~668 MB GGUF download). Subsequent calls
37
+ are fast (model stays loaded in memory).
app.py CHANGED
@@ -1,7 +1,8 @@
1
  """TurboCPP — llama.cpp + TurboQuant — HuggingFace Space.
2
 
3
  Two tabs:
4
- 1. Run inference: live llama.cpp on TinyLlama-1.1B-Chat-Q4_K_M.
 
5
  2. TurboQuant math viz: shows what the offline rotation does to the
6
  weight distribution that quantization sees.
7
  """
@@ -23,9 +24,6 @@ from hadamard import block_hadamard_inplace
23
  from bench import heavy_tailed_weight, measure
24
 
25
 
26
- # ---------------------------------------------------------------------------
27
- # Inference tab — lazy-load llama-cpp-python + a small GGUF.
28
- # ---------------------------------------------------------------------------
29
  _llm = None
30
  _load_error = None
31
 
@@ -91,9 +89,6 @@ def chat(prompt: str, max_tokens: int, temperature: float):
91
  return text or "(empty)", stats
92
 
93
 
94
- # ---------------------------------------------------------------------------
95
- # Visualization tab
96
- # ---------------------------------------------------------------------------
97
  def _plot(W_raw, W_rot, block):
98
  fig, axes = plt.subplots(1, 3, figsize=(13, 3.6))
99
  raw = W_raw.flatten().numpy()
@@ -149,16 +144,12 @@ def visualize(rows, cols, block, seed):
149
  return _plot(W, W_rot, int(block)), summary
150
 
151
 
152
- # ---------------------------------------------------------------------------
153
- # UI
154
- # ---------------------------------------------------------------------------
155
  with gr.Blocks(title="turbocpp - llama.cpp + TurboQuant",
156
  theme=gr.themes.Soft()) as demo:
157
  gr.Markdown("# turbocpp - llama.cpp + TurboQuant")
158
  gr.Markdown(
159
- "Live llama.cpp running TinyLlama-1.1B-Chat (Q4_K_M) plus an "
160
- "interactive math visualizer for the Hadamard-rotation "
161
- "preprocessor. "
162
  "Code: [github.com/Ary5272/turbocpp](https://github.com/Ary5272/turbocpp)"
163
  )
164
 
 
1
  """TurboCPP — llama.cpp + TurboQuant — HuggingFace Space.
2
 
3
  Two tabs:
4
+ 1. Run inference: live llama.cpp on TinyLlama-1.1B-Chat-Q4_K_M via the
5
+ prebuilt llama-cpp-python wheel from AIencoder/llama-cpp-wheels.
6
  2. TurboQuant math viz: shows what the offline rotation does to the
7
  weight distribution that quantization sees.
8
  """
 
24
  from bench import heavy_tailed_weight, measure
25
 
26
 
 
 
 
27
  _llm = None
28
  _load_error = None
29
 
 
89
  return text or "(empty)", stats
90
 
91
 
 
 
 
92
  def _plot(W_raw, W_rot, block):
93
  fig, axes = plt.subplots(1, 3, figsize=(13, 3.6))
94
  raw = W_raw.flatten().numpy()
 
144
  return _plot(W, W_rot, int(block)), summary
145
 
146
 
 
 
 
147
  with gr.Blocks(title="turbocpp - llama.cpp + TurboQuant",
148
  theme=gr.themes.Soft()) as demo:
149
  gr.Markdown("# turbocpp - llama.cpp + TurboQuant")
150
  gr.Markdown(
151
+ "Live llama.cpp running TinyLlama-1.1B-Chat (Q4_K_M) via a "
152
+ "prebuilt wheel + interactive Hadamard-rotation visualizer. "
 
153
  "Code: [github.com/Ary5272/turbocpp](https://github.com/Ary5272/turbocpp)"
154
  )
155
 
requirements.txt CHANGED
@@ -4,5 +4,11 @@ numpy>=1.24
4
  torch>=2.0
5
  pillow>=10.0
6
  huggingface_hub>=0.24
7
- llama-cpp-python>=0.3.2
 
 
 
 
 
 
8
  audioop-lts; python_version >= "3.13"
 
4
  torch>=2.0
5
  pillow>=10.0
6
  huggingface_hub>=0.24
7
+
8
+ # llama-cpp-python: prebuilt wheel from AIencoder/llama-cpp-wheels.
9
+ # CPU-only (no BLAS dep), AVX2 + FMA + F16C — the right baseline for HF
10
+ # Spaces' x86_64 CPU. Avoids source-build, which is unreliable on Spaces.
11
+ https://huggingface.co/datasets/AIencoder/llama-cpp-wheels/resolve/main/llama_cpp_python-0.3.16%2Bbasic_avx2_fma_f16c-cp312-cp312-manylinux_2_31_x86_64.whl
12
+
13
+ # Backport for stdlib audioop, removed in Python 3.13.
14
  audioop-lts; python_version >= "3.13"