#!/usr/bin/env python3
"""
Script to set Qwen2.5-7B-Instruct Q4_K_M on Hugging Face Space.
Upgrade from Gemma 2-2B-it for better Vietnamese legal understanding.

Usage:
    python3 set_hf_space_env_qwen.py
"""

import os
import sys
from contextlib import suppress
from pathlib import Path

try:
    from huggingface_hub import HfApi
except ImportError:
    print("āŒ huggingface_hub not installed. Install with: pip install huggingface_hub")
    sys.exit(1)

# Space configuration
SPACE_ID = "davidtran999/hue-portal-backend"

# Environment variables for Qwen2.5-7B-Instruct Q4_K_M
# Optimized for 2 vCPU + 16GB RAM free tier
ENV_VARS = {
    "DEFAULT_LLM_PROVIDER": "llama_cpp",
    "LLM_PROVIDER": "llama_cpp",
    # Qwen2.5-7B-Instruct Q4_K_M (~4GB, best balance for free tier)
    "LLAMA_CPP_MODEL_REPO": "bartowski/Qwen2.5-7B-Instruct-GGUF",
    "LLAMA_CPP_MODEL_FILE": "Qwen2.5-7B-Instruct-Q4_K_M.gguf",
    # Context: 2048 tokens (reduced from 4096 to speed up inference on free tier CPU)
    # Qwen2.5-7B Q4_K_M supports up to 8192, but 2048 is optimal for 2 vCPU free tier
    "LLAMA_CPP_CONTEXT": "2048",
    "LLAMA_CPP_THREADS": "2",
    "LLAMA_CPP_BATCH": "512",  # Increased to 512 for faster prompt eval (was 256)
    "LLAMA_CPP_MAX_TOKENS": "512",
    "LLAMA_CPP_TEMPERATURE": "0.35",
    "LLAMA_CPP_TOP_P": "0.85",
    "LLAMA_CPP_REPEAT_PENALTY": "1.1",
    "LLAMA_CPP_USE_MMAP": "true",
    "LLAMA_CPP_USE_MLOCK": "true",
    "RUN_HEAVY_STARTUP_TASKS": "0",
}


def _resolve_token() -> str | None:
    """Return an HF API token, or None if one cannot be found.

    Lookup order: HF_TOKEN env var, HUGGINGFACE_HUB_TOKEN env var, then the
    token file written by `huggingface-cli login`.
    """
    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
    if token:
        return token
    token_file = Path.home() / ".cache" / "huggingface" / "token"
    if token_file.exists():
        # An empty/whitespace-only file is treated as "no token".
        return token_file.read_text(encoding="utf-8").strip() or None
    return None


def _set_variable(api: HfApi, key: str, value: str) -> None:
    """Replace a single Space variable, printing a per-key status marker.

    The stale variable is deleted first on a best-effort basis (it may not
    exist yet); only the add step reports failure.
    """
    print(f"Setting {key}={value}...", end=" ")
    # Best-effort delete: ignore failures when the variable doesn't exist yet.
    with suppress(Exception):
        api.delete_space_variable(repo_id=SPACE_ID, key=key)
    try:
        api.add_space_variable(repo_id=SPACE_ID, key=key, value=str(value))
        print("āœ…")
    except Exception as exc:
        print(f"āŒ {exc}")


def main():
    """Push the Qwen2.5 env config to the Space and restart it."""
    # Get HF token
    hf_token = _resolve_token()
    if not hf_token:
        print("āŒ HF_TOKEN not found.")
        print("\nšŸ’” Option 1: Set token as environment variable")
        print("   export HF_TOKEN=your_token_here")
        print("   python3 set_hf_space_env_qwen.py")
        print("\nšŸ’” Option 2: Login with Hugging Face CLI")
        print("   huggingface-cli login")
        print("   python3 set_hf_space_env_qwen.py")
        sys.exit(1)

    # Initialize API
    api = HfApi(token=hf_token)

    print(f"šŸš€ Upgrading to Qwen2.5-7B-Instruct Q4_K_M on Space: {SPACE_ID}")
    print("=" * 60)
    print("šŸ“Š Model specs:")
    print("   - Size: ~4GB (downloads from HF, no storage limit)")
    print("   - RAM: ~6-8GB (fits 16GB free tier)")
    print("   - Expected latency: 7-9s on 2 vCPU")
    print("   - Vietnamese legal: Excellent")
    print("=" * 60)

    for key, value in ENV_VARS.items():
        _set_variable(api, key, value)

    print("=" * 60)
    print("āœ… Config updated! Restarting Space...")
    try:
        api.restart_space(repo_id=SPACE_ID)
        print("āœ… Space restarted. Wait 2-3 minutes for model download & load.")
        print("\nšŸ’” Monitor logs at:")
        print(f"   https://huggingface.co/spaces/{SPACE_ID}/logs")
    except Exception as exc:
        print(f"āš ļø Config saved but restart failed: {exc}")
        print("   Please restart Space manually from HF dashboard.")


if __name__ == "__main__":
    main()