import os
# Model settings
# Each value is read from the environment variable of the same name; the
# literal is the fallback used when that variable is not set.
LLM_MODEL = os.environ.get(
    "LLM_MODEL",
    "Qwen/Qwen2.5-0.5B-Instruct",
)
EMBEDDER_MODEL = os.environ.get(
    "EMBEDDER_MODEL",
    "sentence-transformers/all-MiniLM-L6-v2",
)
# Curated <32B open-weight causal LMs for local inference (shown in the UI dropdown).
# Order is significant only in that it is preserved exactly as authored.
AVAILABLE_MODELS = [
    "Qwen/Qwen2.5-0.5B-Instruct",
    "HuggingFaceTB/SmolLM2-135M-Instruct",
    "HuggingFaceTB/SmolLM2-360M-Instruct",
    "Qwen/Qwen2.5-1.5B-Instruct",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    "microsoft/Phi-3.5-mini-instruct",
]
# Markdown blurb for each language-model id, keyed by the model id string.
# Layout of every blurb:
#   "<icon> **<tier · size>** — <load time>" + hard line break + description.
# The hard line break is two trailing spaces followed by "\n"; a single space
# before the newline is only a soft break and would run the title and the
# description together when rendered as Markdown.
#
# Icons, middle dots and em dashes are stored as real Unicode characters
# (previously they were mis-decoded UTF-8 byte sequences).
# NOTE(review): the rocket and trophy icons stand in for a glyph that could not
# be recovered from the garbled text; they were picked to fit the tier label —
# confirm against the intended design.
MODEL_INFO = {
    "HuggingFaceTB/SmolLM2-135M-Instruct": (
        "⚡ **Fastest · 135M params** — Loads in ~10 s.  \n"
        "Good for exploring the interface. May produce looser compressions; "
        "switch up for high-fidelity output."
    ),
    "HuggingFaceTB/SmolLM2-360M-Instruct": (
        "🚀 **Fast · 360M params** — Loads in ~20 s.  \n"
        "Noticeably better compression than 135M with a small speed trade-off."
    ),
    "Qwen/Qwen2.5-0.5B-Instruct": (
        "⚡ **Fast · 500M params · Default** — Loads in ~25 s.  \n"
        "Strong instruction-following for its size; reliably respects token budgets. "
        "Best balance of speed and quality."
    ),
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0": (
        "🚀 **Fast · 1.1B params** — Loads in ~40 s.  \n"
        "Fully open, no licence required. Good general-purpose compression at 1B scale."
    ),
    "Qwen/Qwen2.5-1.5B-Instruct": (
        "⚖️ **Balanced · 1.5B params** — Loads in ~60 s.  \n"
        "Solid all-rounder; strong semantic fidelity and budget adherence."
    ),
    "HuggingFaceTB/SmolLM2-1.7B-Instruct": (
        "⚖️ **Balanced · 1.7B params** — Loads in ~60 s.  \n"
        "Designed for edge inference; efficient on CPU."
    ),
    "microsoft/Phi-3.5-mini-instruct": (
        "🏆 **Best quality · 3.8B params** — Loads in ~2 min.  \n"
        "Strongest reasoning and fidelity in this list. GPU strongly recommended."
    ),
}
# Curated sentence-transformer embedding models for quality scoring.
# Order is preserved exactly as authored.
AVAILABLE_EMBEDDER_MODELS = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-transformers/all-mpnet-base-v2",
    "BAAI/bge-small-en-v1.5",
    "BAAI/bge-base-en-v1.5",
    "mixedbread-ai/mxbai-embed-large-v1",
    "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
]
# Markdown blurb for each embedding-model id, keyed by the model id string.
# Layout of every blurb:
#   "<icon> **<tier · size>**" + hard line break + description.
# The hard line break is two trailing spaces followed by "\n"; a single space
# before the newline is only a soft break and would run the title and the
# description together when rendered as Markdown.
#
# Icons, middle dots and em dashes are stored as real Unicode characters
# (previously they were mis-decoded UTF-8 byte sequences).
# NOTE(review): the trophy icon stands in for a glyph that could not be
# recovered from the garbled text; it was picked to fit the tier label —
# confirm against the intended design.
EMBEDDER_INFO = {
    "sentence-transformers/all-MiniLM-L6-v2": (
        "⚡ **Fast · 22M params · Default**  \n"
        "Great baseline. Scores are reliable for typical compression ratios. "
        "Runs comfortably on CPU — minimal overhead."
    ),
    "sentence-transformers/all-mpnet-base-v2": (
        "⚖️ **Balanced · 110M params**  \n"
        "Noticeably sharper quality scores than MiniLM, especially on longer texts. "
        "Small speed trade-off; fine on CPU."
    ),
    "BAAI/bge-small-en-v1.5": (
        "⚡ **Fast · 33M params**  \n"
        "Strong quality-to-size ratio — often matches MiniLM on accuracy while being "
        "slightly more sensitive to meaning shifts. Good CPU option."
    ),
    "BAAI/bge-base-en-v1.5": (
        "⚖️ **Balanced · 109M params**  \n"
        "Consistently strong on semantic similarity benchmarks. "
        "Scores will be more discriminating — small differences in compression quality show up more clearly."
    ),
    "mixedbread-ai/mxbai-embed-large-v1": (
        "🏆 **High quality · 335M params**  \n"
        "Top-tier similarity scores. Quality readings will be the most accurate here, "
        "but slower to load and run. GPU recommended."
    ),
    "Alibaba-NLP/gte-Qwen2-1.5B-instruct": (
        "🔬 **Best quality · 1.5B params**  \n"
        "Strongest semantic understanding in this list. Scores will reflect subtle meaning loss "
        "that smaller models miss. Requires significant RAM/VRAM — GPU strongly recommended."
    ),
}
# Compression settings
# NOTE(review): presumably the token budget pre-selected for a compressed
# output — confirm against the code that reads it.
DEFAULT_TARGET_TOKENS: int = 500
# NOTE(review): presumably the generation cap passed to the LLM — confirm
# against the code that reads it.
MAX_NEW_TOKENS: int = 1024
# Gradio
APP_TITLE = "TinyPress"
# The PORT environment variable takes precedence when set; otherwise 7860 is
# used. The value is always converted to an int, so a non-numeric PORT raises
# ValueError at import time.
SERVER_PORT = int(
    os.environ.get("PORT", 7860)
)