Spaces:
Running on Zero
Running on Zero
pr/2
#2
by sriharsha-cr - opened
- README.md +3 -2
- app.py +6 -0
- core/tokenizer_utils.py +3 -3
- models/model_loader.py +12 -1
- requirements.txt +0 -2
README.md
CHANGED
|
@@ -35,6 +35,7 @@ No cloud. No API bill. Two small models running quietly on your machine.
|
|
| 35 |
|
| 36 |
[](https://youtu.be/hDbIDtjjiB0)
|
| 37 |
|
|
|
|
| 38 |
---
|
| 39 |
|
| 40 |
## Why this fits Thousand Token Wood
|
|
@@ -134,7 +135,7 @@ Full docs: [Architecture](docs/architecture.md) · [Setup](docs/setup.md) · [Ge
|
|
| 134 |
|
| 135 |
Built by **[Sriharsha C R](https://www.linkedin.com/in/sriharsha-cr)** — AI Engineer and Cloud Native developer.
|
| 136 |
|
| 137 |
-
[](https://www.linkedin.com/
|
| 138 |
-
[](https://x.com/sriharsha_cr)
|
| 139 |
[](https://huggingface.co/sriharsha-cr)
|
| 140 |
[](https://github.com/SriharshaCR)
|
|
|
|
| 35 |
|
| 36 |
[](https://youtu.be/hDbIDtjjiB0)
|
| 37 |
|
| 38 |
+
|
| 39 |
---
|
| 40 |
|
| 41 |
## Why this fits Thousand Token Wood
|
|
|
|
| 135 |
|
| 136 |
Built by **[Sriharsha C R](https://www.linkedin.com/in/sriharsha-cr)** — AI Engineer and Cloud Native developer.
|
| 137 |
|
| 138 |
+
[](https://www.linkedin.com/posts/sriharsha-cr_tinypress-prompt-compression-engine-activity-7471426128331624448-aKfe)
|
| 139 |
+
[](https://x.com/sriharsha_cr/status/2065662576684650879)
|
| 140 |
[](https://huggingface.co/sriharsha-cr)
|
| 141 |
[](https://github.com/SriharshaCR)
|
app.py
CHANGED
|
@@ -2,6 +2,12 @@ import gradio as gr
|
|
| 2 |
import config
|
| 3 |
from ui.compress_tab import build_compress_tab
|
| 4 |
from ui.history_tab import build_history_tab
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
def build_app() -> gr.Blocks:
|
|
|
|
| 2 |
import config
|
| 3 |
from ui.compress_tab import build_compress_tab
|
| 4 |
from ui.history_tab import build_history_tab
|
| 5 |
+
from models.model_loader import get_tokenizer_only
|
| 6 |
+
|
| 7 |
+
try:
|
| 8 |
+
get_tokenizer_only() # pre-warm; first keystroke is instant
|
| 9 |
+
except Exception:
|
| 10 |
+
pass # falls back to lazy load on first keystroke
|
| 11 |
|
| 12 |
|
| 13 |
def build_app() -> gr.Blocks:
|
core/tokenizer_utils.py
CHANGED
|
@@ -1,13 +1,13 @@
|
|
| 1 |
-
from models.model_loader import
|
| 2 |
|
| 3 |
|
| 4 |
def count_tokens(text: str) -> int:
|
| 5 |
-
|
| 6 |
return len(tokenizer.encode(text, add_special_tokens=False))
|
| 7 |
|
| 8 |
|
| 9 |
def get_token_strings(text: str) -> list[str]:
|
| 10 |
"""Return the decoded surface string for every token in text."""
|
| 11 |
-
|
| 12 |
ids = tokenizer.encode(text, add_special_tokens=False)
|
| 13 |
return [tokenizer.decode([i]) for i in ids]
|
|
|
|
| 1 |
+
from models.model_loader import get_tokenizer_only
|
| 2 |
|
| 3 |
|
| 4 |
def count_tokens(text: str) -> int:
|
| 5 |
+
tokenizer = get_tokenizer_only()
|
| 6 |
return len(tokenizer.encode(text, add_special_tokens=False))
|
| 7 |
|
| 8 |
|
| 9 |
def get_token_strings(text: str) -> list[str]:
|
| 10 |
"""Return the decoded surface string for every token in text."""
|
| 11 |
+
tokenizer = get_tokenizer_only()
|
| 12 |
ids = tokenizer.encode(text, add_special_tokens=False)
|
| 13 |
return [tokenizer.decode([i]) for i in ids]
|
models/model_loader.py
CHANGED
|
@@ -6,6 +6,7 @@ import config
|
|
| 6 |
|
| 7 |
_llm = None
|
| 8 |
_tokenizer = None
|
|
|
|
| 9 |
_embedder = None
|
| 10 |
_current_model_id = None
|
| 11 |
_current_embedder_id = None
|
|
@@ -24,6 +25,15 @@ def get_current_embedder_id() -> str | None:
|
|
| 24 |
return _current_embedder_id
|
| 25 |
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
def get_llm():
|
| 28 |
global _llm, _tokenizer
|
| 29 |
if _llm is None:
|
|
@@ -55,12 +65,13 @@ def _load_llm(model_id: str):
|
|
| 55 |
|
| 56 |
def _unload_llm():
|
| 57 |
"""Free GPU/CPU memory before loading a different model."""
|
| 58 |
-
global _llm, _tokenizer, _current_model_id
|
| 59 |
del _llm
|
| 60 |
del _tokenizer
|
| 61 |
_llm = None
|
| 62 |
_tokenizer = None
|
| 63 |
_current_model_id = None
|
|
|
|
| 64 |
gc.collect()
|
| 65 |
if torch.cuda.is_available():
|
| 66 |
torch.cuda.empty_cache()
|
|
|
|
| 6 |
|
| 7 |
_llm = None
|
| 8 |
_tokenizer = None
|
| 9 |
+
_tokenizer_only = None
|
| 10 |
_embedder = None
|
| 11 |
_current_model_id = None
|
| 12 |
_current_embedder_id = None
|
|
|
|
| 25 |
return _current_embedder_id
|
| 26 |
|
| 27 |
|
| 28 |
+
def get_tokenizer_only():
|
| 29 |
+
global _tokenizer_only
|
| 30 |
+
if _tokenizer is not None:
|
| 31 |
+
return _tokenizer
|
| 32 |
+
if _tokenizer_only is None:
|
| 33 |
+
_tokenizer_only = AutoTokenizer.from_pretrained(config.LLM_MODEL)
|
| 34 |
+
return _tokenizer_only
|
| 35 |
+
|
| 36 |
+
|
| 37 |
def get_llm():
|
| 38 |
global _llm, _tokenizer
|
| 39 |
if _llm is None:
|
|
|
|
| 65 |
|
| 66 |
def _unload_llm():
|
| 67 |
"""Free GPU/CPU memory before loading a different model."""
|
| 68 |
+
global _llm, _tokenizer, _current_model_id, _tokenizer_only
|
| 69 |
del _llm
|
| 70 |
del _tokenizer
|
| 71 |
_llm = None
|
| 72 |
_tokenizer = None
|
| 73 |
_current_model_id = None
|
| 74 |
+
_tokenizer_only = None
|
| 75 |
gc.collect()
|
| 76 |
if torch.cuda.is_available():
|
| 77 |
torch.cuda.empty_cache()
|
requirements.txt
CHANGED
|
@@ -1,8 +1,6 @@
|
|
| 1 |
-
--extra-index-url https://download.pytorch.org/whl/cpu
|
| 2 |
gradio==6.18.0
|
| 3 |
transformers>=4.40.0
|
| 4 |
sentence-transformers>=3.0.0
|
| 5 |
-
torch>=2.2.0
|
| 6 |
numpy>=1.26.0
|
| 7 |
pandas>=2.0.0
|
| 8 |
accelerate>=0.30.0
|
|
|
|
|
|
|
| 1 |
gradio==6.18.0
|
| 2 |
transformers>=4.40.0
|
| 3 |
sentence-transformers>=3.0.0
|
|
|
|
| 4 |
numpy>=1.26.0
|
| 5 |
pandas>=2.0.0
|
| 6 |
accelerate>=0.30.0
|