#!/usr/bin/env python3
"""
Script to set Qwen2.5-7B-Instruct Q4_K_M on Hugging Face Space.
Upgrade from Gemma 2-2B-it for better Vietnamese legal understanding.

Usage:
    python3 set_hf_space_env_qwen.py
"""
import os
import sys
from pathlib import Path

try:
    from huggingface_hub import HfApi
except ImportError:
    print("❌ huggingface_hub not installed. Install with: pip install huggingface_hub")
    sys.exit(1)

# Space configuration
SPACE_ID = "davidtran999/hue-portal-backend"

# Environment variables for Qwen2.5-7B-Instruct Q4_K_M
# Optimized for 2 vCPU + 16GB RAM free tier
ENV_VARS = {
    "DEFAULT_LLM_PROVIDER": "llama_cpp",
    "LLM_PROVIDER": "llama_cpp",
    # Qwen2.5-7B-Instruct Q4_K_M (~4GB, best balance for free tier)
    "LLAMA_CPP_MODEL_REPO": "bartowski/Qwen2.5-7B-Instruct-GGUF",
    "LLAMA_CPP_MODEL_FILE": "Qwen2.5-7B-Instruct-Q4_K_M.gguf",
    # Context: 2048 tokens (reduced from 4096 to speed up inference on free-tier CPU)
    # Qwen2.5-7B supports much longer contexts, but 2048 keeps prompt evaluation
    # fast enough on the 2 vCPU free tier
    "LLAMA_CPP_CONTEXT": "2048",
    "LLAMA_CPP_THREADS": "2",
    "LLAMA_CPP_BATCH": "512",  # Increased to 512 for faster prompt eval (was 256)
    "LLAMA_CPP_MAX_TOKENS": "512",
    "LLAMA_CPP_TEMPERATURE": "0.35",
    "LLAMA_CPP_TOP_P": "0.85",
    "LLAMA_CPP_REPEAT_PENALTY": "1.1",
    "LLAMA_CPP_USE_MMAP": "true",
    "LLAMA_CPP_USE_MLOCK": "true",
    "RUN_HEAVY_STARTUP_TASKS": "0",
}
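
# Note: these are Space *variables* (plain, non-secret config). The backend is
# assumed to read them from os.environ at startup, e.g. (hypothetical usage,
# variable names n_ctx/n_threads are illustrative only):
#     n_ctx = int(os.environ.get("LLAMA_CPP_CONTEXT", "2048"))
#     n_threads = int(os.environ.get("LLAMA_CPP_THREADS", "2"))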

def main():
    # Get HF token
    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
    if not hf_token:
        token_file = Path.home() / ".cache" / "huggingface" / "token"
        if token_file.exists():
            hf_token = token_file.read_text(encoding="utf-8").strip()
    if not hf_token:
        print("❌ HF_TOKEN not found.")
        print("\n💡 Option 1: Set token as environment variable")
        print("   export HF_TOKEN=your_token_here")
        print("   python3 set_hf_space_env_qwen.py")
        print("\n💡 Option 2: Login with Hugging Face CLI")
        print("   huggingface-cli login")
        print("   python3 set_hf_space_env_qwen.py")
        sys.exit(1)
    
    # Initialize API
    api = HfApi(token=hf_token)
    
    print(f"🚀 Upgrading to Qwen2.5-7B-Instruct Q4_K_M on Space: {SPACE_ID}")
    print("=" * 60)
    print("📊 Model specs:")
    print("   - Size: ~4GB (downloads from HF, no storage limit)")
    print("   - RAM: ~6-8GB (fits 16GB free tier)")
    print("   - Expected latency: 7-9s on 2 vCPU")
    print("   - Vietnamese legal: Excellent")
    print("=" * 60)
    
    for key, value in ENV_VARS.items():
        print(f"Setting {key}={value}...", end=" ", flush=True)
        try:
            # Delete first so the new value cleanly replaces any existing variable
            api.delete_space_variable(repo_id=SPACE_ID, key=key)
        except Exception:
            pass  # Ignore if the variable doesn't exist yet
        try:
            api.add_space_variable(repo_id=SPACE_ID, key=key, value=str(value))
            print("✅")
        except Exception as exc:
            print(f"❌ {exc}")
    
    print("=" * 60)
    print("✅ Config updated! Restarting Space...")
    
    try:
        api.restart_space(repo_id=SPACE_ID)
        print("✅ Space restarted. Wait 2-3 minutes for model download & load.")
        print("\n💡 Monitor logs at:")
        print(f"   https://huggingface.co/spaces/{SPACE_ID}/logs")
    except Exception as exc:
        print(f"⚠️  Config saved but restart failed: {exc}")
        print("   Please restart Space manually from HF dashboard.")

if __name__ == "__main__":
    main()