Spaces:
Sleeping
Sleeping
v3: install llama-cpp-python from prebuilt wheel (HF dataset, no source build)
Browse files- README.md +8 -4
- app.py +4 -13
- requirements.txt +7 -1
README.md
CHANGED
|
@@ -25,9 +25,13 @@ Two tabs:
|
|
| 25 |
Hadamard rotation Gaussianizes per-block weight distributions and
|
| 26 |
reduces the per-block max-abs that drives Q4 / Q4_K rounding error.
|
| 27 |
|
| 28 |
-
##
|
| 29 |
|
| 30 |
-
- Python
|
| 31 |
pydub dep needs).
|
| 32 |
-
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
Hadamard rotation Gaussianizes per-block weight distributions and
|
| 26 |
reduces the per-block max-abs that drives Q4 / Q4_K rounding error.
|
| 27 |
|
| 28 |
+
## Build details
|
| 29 |
|
| 30 |
+
- **Python 3.12** pinned (3.13 dropped the stdlib `audioop` module, which Gradio's
|
| 31 |
pydub dep needs).
|
| 32 |
+
- **llama-cpp-python** installed from a **prebuilt wheel** at
|
| 33 |
+
[AIencoder/llama-cpp-wheels](https://huggingface.co/datasets/AIencoder/llama-cpp-wheels)
|
| 34 |
+
(variant `0.3.16+basic_avx2_fma_f16c-cp312`). HF Spaces don't reliably
|
| 35 |
+
build this from source, so we ship the binary.
|
| 36 |
+
- The first `generate` call is a cold start (~668 MB GGUF download). Subsequent calls
|
| 37 |
+
are fast (model stays loaded in memory).
|
app.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
| 1 |
"""TurboCPP — llama.cpp + TurboQuant — HuggingFace Space.
|
| 2 |
|
| 3 |
Two tabs:
|
| 4 |
-
1. Run inference: live llama.cpp on TinyLlama-1.1B-Chat-Q4_K_M
|
|
|
|
| 5 |
2. TurboQuant math viz: shows what the offline rotation does to the
|
| 6 |
weight distribution that quantization sees.
|
| 7 |
"""
|
|
@@ -23,9 +24,6 @@ from hadamard import block_hadamard_inplace
|
|
| 23 |
from bench import heavy_tailed_weight, measure
|
| 24 |
|
| 25 |
|
| 26 |
-
# ---------------------------------------------------------------------------
|
| 27 |
-
# Inference tab — lazy-load llama-cpp-python + a small GGUF.
|
| 28 |
-
# ---------------------------------------------------------------------------
|
| 29 |
_llm = None
|
| 30 |
_load_error = None
|
| 31 |
|
|
@@ -91,9 +89,6 @@ def chat(prompt: str, max_tokens: int, temperature: float):
|
|
| 91 |
return text or "(empty)", stats
|
| 92 |
|
| 93 |
|
| 94 |
-
# ---------------------------------------------------------------------------
|
| 95 |
-
# Visualization tab
|
| 96 |
-
# ---------------------------------------------------------------------------
|
| 97 |
def _plot(W_raw, W_rot, block):
|
| 98 |
fig, axes = plt.subplots(1, 3, figsize=(13, 3.6))
|
| 99 |
raw = W_raw.flatten().numpy()
|
|
@@ -149,16 +144,12 @@ def visualize(rows, cols, block, seed):
|
|
| 149 |
return _plot(W, W_rot, int(block)), summary
|
| 150 |
|
| 151 |
|
| 152 |
-
# ---------------------------------------------------------------------------
|
| 153 |
-
# UI
|
| 154 |
-
# ---------------------------------------------------------------------------
|
| 155 |
with gr.Blocks(title="turbocpp - llama.cpp + TurboQuant",
|
| 156 |
theme=gr.themes.Soft()) as demo:
|
| 157 |
gr.Markdown("# turbocpp - llama.cpp + TurboQuant")
|
| 158 |
gr.Markdown(
|
| 159 |
-
"Live llama.cpp running TinyLlama-1.1B-Chat (Q4_K_M)
|
| 160 |
-
"
|
| 161 |
-
"preprocessor. "
|
| 162 |
"Code: [github.com/Ary5272/turbocpp](https://github.com/Ary5272/turbocpp)"
|
| 163 |
)
|
| 164 |
|
|
|
|
| 1 |
"""TurboCPP — llama.cpp + TurboQuant — HuggingFace Space.
|
| 2 |
|
| 3 |
Two tabs:
|
| 4 |
+
1. Run inference: live llama.cpp on TinyLlama-1.1B-Chat-Q4_K_M via the
|
| 5 |
+
prebuilt llama-cpp-python wheel from AIencoder/llama-cpp-wheels.
|
| 6 |
2. TurboQuant math viz: shows what the offline rotation does to the
|
| 7 |
weight distribution that quantization sees.
|
| 8 |
"""
|
|
|
|
| 24 |
from bench import heavy_tailed_weight, measure
|
| 25 |
|
| 26 |
|
|
|
|
|
|
|
|
|
|
| 27 |
_llm = None
|
| 28 |
_load_error = None
|
| 29 |
|
|
|
|
| 89 |
return text or "(empty)", stats
|
| 90 |
|
| 91 |
|
|
|
|
|
|
|
|
|
|
| 92 |
def _plot(W_raw, W_rot, block):
|
| 93 |
fig, axes = plt.subplots(1, 3, figsize=(13, 3.6))
|
| 94 |
raw = W_raw.flatten().numpy()
|
|
|
|
| 144 |
return _plot(W, W_rot, int(block)), summary
|
| 145 |
|
| 146 |
|
|
|
|
|
|
|
|
|
|
| 147 |
with gr.Blocks(title="turbocpp - llama.cpp + TurboQuant",
|
| 148 |
theme=gr.themes.Soft()) as demo:
|
| 149 |
gr.Markdown("# turbocpp - llama.cpp + TurboQuant")
|
| 150 |
gr.Markdown(
|
| 151 |
+
"Live llama.cpp running TinyLlama-1.1B-Chat (Q4_K_M) via a "
|
| 152 |
+
"prebuilt wheel + interactive Hadamard-rotation visualizer. "
|
|
|
|
| 153 |
"Code: [github.com/Ary5272/turbocpp](https://github.com/Ary5272/turbocpp)"
|
| 154 |
)
|
| 155 |
|
requirements.txt
CHANGED
|
@@ -4,5 +4,11 @@ numpy>=1.24
|
|
| 4 |
torch>=2.0
|
| 5 |
pillow>=10.0
|
| 6 |
huggingface_hub>=0.24
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
audioop-lts; python_version >= "3.13"
|
|
|
|
| 4 |
torch>=2.0
|
| 5 |
pillow>=10.0
|
| 6 |
huggingface_hub>=0.24
|
| 7 |
+
|
| 8 |
+
# llama-cpp-python: prebuilt wheel from AIencoder/llama-cpp-wheels.
|
| 9 |
+
# CPU-only (no BLAS dep), AVX2 + FMA + F16C — the right baseline for HF
|
| 10 |
+
# Spaces' x86_64 CPU. Avoids source-build, which is unreliable on Spaces.
|
| 11 |
+
https://huggingface.co/datasets/AIencoder/llama-cpp-wheels/resolve/main/llama_cpp_python-0.3.16%2Bbasic_avx2_fma_f16c-cp312-cp312-manylinux_2_31_x86_64.whl
|
| 12 |
+
|
| 13 |
+
# Backport for stdlib audioop, removed in Python 3.13.
|
| 14 |
audioop-lts; python_version >= "3.13"
|