import os

# ---------------------------------------------------------------------------
# Model settings
# ---------------------------------------------------------------------------
# Each identifier can be overridden with an environment variable of the same
# name; the second argument is the fallback used when the variable is unset.
LLM_MODEL = os.environ.get("LLM_MODEL", "Qwen/Qwen2.5-0.5B-Instruct")
EMBEDDER_MODEL = os.environ.get(
    "EMBEDDER_MODEL",
    "sentence-transformers/all-MiniLM-L6-v2",
)

# Curated <32B open-weight causal LMs for local inference (shown in the UI
# dropdown).
AVAILABLE_MODELS = [
    "Qwen/Qwen2.5-0.5B-Instruct",
    "HuggingFaceTB/SmolLM2-135M-Instruct",
    "HuggingFaceTB/SmolLM2-360M-Instruct",
    "Qwen/Qwen2.5-1.5B-Instruct",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    "microsoft/Phi-3.5-mini-instruct",
]

# Markdown description for every entry of AVAILABLE_MODELS, keyed by model id.
# Each value is a bold headline, a newline, then a short free-text note.
MODEL_INFO = {
    "HuggingFaceTB/SmolLM2-135M-Instruct": (
        "⚡ **Fastest · 135M params** — Loads in ~10 s. \n"
        "Good for exploring the interface. "
        "May produce looser compressions; switch up for high-fidelity output."
    ),
    "HuggingFaceTB/SmolLM2-360M-Instruct": (
        "🚀 **Fast · 360M params** — Loads in ~20 s. \n"
        "Noticeably better compression than 135M "
        "with a small speed trade-off."
    ),
    "Qwen/Qwen2.5-0.5B-Instruct": (
        "⚡ **Fast · 500M params · Default** — Loads in ~25 s. \n"
        "Strong instruction-following for its size; "
        "reliably respects token budgets. "
        "Best balance of speed and quality."
    ),
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0": (
        "🚀 **Fast · 1.1B params** — Loads in ~40 s. \n"
        "Fully open, no licence required. "
        "Good general-purpose compression at 1B scale."
    ),
    "Qwen/Qwen2.5-1.5B-Instruct": (
        "⚖️ **Balanced · 1.5B params** — Loads in ~60 s. \n"
        "Solid all-rounder; "
        "strong semantic fidelity and budget adherence."
    ),
    "HuggingFaceTB/SmolLM2-1.7B-Instruct": (
        "⚖️ **Balanced · 1.7B params** — Loads in ~60 s. \n"
        "Designed for edge inference; "
        "efficient on CPU."
    ),
    "microsoft/Phi-3.5-mini-instruct": (
        "🏆 **Best quality · 3.8B params** — Loads in ~2 min. \n"
        "Strongest reasoning and fidelity in this list. "
        "GPU strongly recommended."
    ),
}

# Curated sentence-transformer embedding models for quality scoring.
AVAILABLE_EMBEDDER_MODELS = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-transformers/all-mpnet-base-v2",
    "BAAI/bge-small-en-v1.5",
    "BAAI/bge-base-en-v1.5",
    "mixedbread-ai/mxbai-embed-large-v1",
    "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
]

# Markdown description for every entry of AVAILABLE_EMBEDDER_MODELS, keyed by
# model id and listed in the same order as that list.
EMBEDDER_INFO = {
    "sentence-transformers/all-MiniLM-L6-v2": (
        "⚡ **Fast · 22M params · Default** \n"
        "Great baseline. Scores are reliable for typical compression ratios. "
        "Runs comfortably on CPU — minimal overhead."
    ),
    "sentence-transformers/all-mpnet-base-v2": (
        "⚖️ **Balanced · 110M params** \n"
        "Noticeably sharper quality scores than MiniLM, especially on longer texts. "
        "Small speed trade-off; fine on CPU."
    ),
    "BAAI/bge-small-en-v1.5": (
        "⚡ **Fast · 33M params** \n"
        "Strong quality-to-size ratio — often matches MiniLM on accuracy while being "
        "slightly more sensitive to meaning shifts. Good CPU option."
    ),
    "BAAI/bge-base-en-v1.5": (
        "⚖️ **Balanced · 109M params** \n"
        "Consistently strong on semantic similarity benchmarks. "
        "Scores will be more discriminating — small differences in compression quality show up more clearly."
    ),
    "mixedbread-ai/mxbai-embed-large-v1": (
        "🏆 **High quality · 335M params** \n"
        "Top-tier similarity scores. Quality readings will be the most accurate here, "
        "but slower to load and run. GPU recommended."
    ),
    "Alibaba-NLP/gte-Qwen2-1.5B-instruct": (
        "🔬 **Best quality · 1.5B params** \n"
        "Strongest semantic understanding in this list. Scores will reflect subtle meaning loss "
        "that smaller models miss. Requires significant RAM/VRAM — GPU strongly recommended."
    ),
}

# Compression settings
DEFAULT_TARGET_TOKENS = 500
MAX_NEW_TOKENS = 1024

# Gradio
APP_TITLE = "TinyPress"
# Port is read from the PORT environment variable, falling back to 7860.
# A variable that is set but blank (empty or whitespace-only) is treated the
# same as unset: os.getenv only applies its default when the variable is
# absent, so int("") would otherwise raise ValueError at import time.
# A non-numeric value still raises ValueError so misconfiguration stays loud.
SERVER_PORT = int(os.getenv("PORT", "").strip() or 7860)