# hue-portal-backend / set_hf_space_env_qwen.py
# Author: Davidtran99 — chore: sync with main repo (commit 3718c84)
#!/usr/bin/env python3
"""
Script to set Qwen2.5-7B-Instruct Q4_K_M on Hugging Face Space.
Upgrade from Gemma 2-2B-it for better Vietnamese legal understanding.
Usage:
python3 set_hf_space_env_qwen.py
"""
import os
import sys
from pathlib import Path
# huggingface_hub is the only third-party dependency; exit early with an
# actionable install hint instead of a raw ImportError traceback.
try:
    from huggingface_hub import HfApi
except ImportError:
    print("❌ huggingface_hub not installed. Install with: pip install huggingface_hub")
    sys.exit(1)
# Target Space to configure (owner/name on Hugging Face).
SPACE_ID = "davidtran999/hue-portal-backend"

# Space variables selecting Qwen2.5-7B-Instruct Q4_K_M served via llama.cpp.
# All values are strings because the HF Space variable API stores text only.
# Tuned for the free tier: 2 vCPU / 16 GB RAM.
ENV_VARS = {
    "DEFAULT_LLM_PROVIDER": "llama_cpp",
    "LLM_PROVIDER": "llama_cpp",
    # Q4_K_M quant (~4 GB on disk) — best quality/size trade-off for free tier.
    "LLAMA_CPP_MODEL_REPO": "bartowski/Qwen2.5-7B-Instruct-GGUF",
    "LLAMA_CPP_MODEL_FILE": "Qwen2.5-7B-Instruct-Q4_K_M.gguf",
    # 2048-token context (model supports up to 8192): smaller context keeps
    # CPU prompt evaluation fast enough on 2 vCPUs.
    "LLAMA_CPP_CONTEXT": "2048",
    "LLAMA_CPP_THREADS": "2",
    # Larger batch speeds up prompt eval (previously 256).
    "LLAMA_CPP_BATCH": "512",
    "LLAMA_CPP_MAX_TOKENS": "512",
    "LLAMA_CPP_TEMPERATURE": "0.35",
    "LLAMA_CPP_TOP_P": "0.85",
    "LLAMA_CPP_REPEAT_PENALTY": "1.1",
    "LLAMA_CPP_USE_MMAP": "true",
    "LLAMA_CPP_USE_MLOCK": "true",
    "RUN_HEAVY_STARTUP_TASKS": "0",
}
def _resolve_hf_token():
    """Return a Hugging Face token, or a falsy value when none is available.

    Checks, in order: the HF_TOKEN and HUGGINGFACE_HUB_TOKEN environment
    variables, then the token file written by `huggingface-cli login`.
    May return an empty string if the cached token file is empty; callers
    should treat any falsy result as "no token".
    """
    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
    if token:
        return token
    token_file = Path.home() / ".cache" / "huggingface" / "token"
    if token_file.exists():
        return token_file.read_text(encoding="utf-8").strip()
    return None


def main():
    """Apply the Qwen ENV_VARS to the Space and trigger a restart.

    Exits with status 1 when no HF token can be found. Per-variable API
    failures are reported but do not abort the run; a failed restart is
    reported with a manual-restart hint.
    """
    hf_token = _resolve_hf_token()
    if not hf_token:
        print("❌ HF_TOKEN not found.")
        print("\n💡 Option 1: Set token as environment variable")
        print(" export HF_TOKEN=your_token_here")
        print(" python3 set_hf_space_env_qwen.py")
        print("\n💡 Option 2: Login with Hugging Face CLI")
        print(" huggingface-cli login")
        print(" python3 set_hf_space_env_qwen.py")
        sys.exit(1)

    api = HfApi(token=hf_token)
    print(f"🚀 Upgrading to Qwen2.5-7B-Instruct Q4_K_M on Space: {SPACE_ID}")
    print("=" * 60)
    print("📊 Model specs:")
    print(" - Size: ~4GB (downloads from HF, no storage limit)")
    print(" - RAM: ~6-8GB (fits 16GB free tier)")
    print(" - Expected latency: 7-9s on 2 vCPU")
    print(" - Vietnamese legal: Excellent")
    print("=" * 60)

    for key, value in ENV_VARS.items():
        # Status print hoisted out of the try: the broad except below is only
        # meant to swallow "variable doesn't exist yet" delete failures, not
        # hide problems with the status line itself.
        print(f"Setting {key}={value}...", end=" ")
        try:
            # Best-effort delete so re-adding the variable cannot conflict.
            api.delete_space_variable(repo_id=SPACE_ID, key=key)
        except Exception:
            pass  # Ignore if variable doesn't exist yet
        try:
            api.add_space_variable(repo_id=SPACE_ID, key=key, value=str(value))
            print("✅")
        except Exception as exc:
            print(f"❌ {exc}")

    print("=" * 60)
    print("✅ Config updated! Restarting Space...")
    try:
        api.restart_space(repo_id=SPACE_ID)
        print("✅ Space restarted. Wait 2-3 minutes for model download & load.")
        print("\n💡 Monitor logs at:")
        print(f" https://huggingface.co/spaces/{SPACE_ID}/logs")
    except Exception as exc:
        # Variables were already saved; only the restart step failed.
        print(f"⚠️ Config saved but restart failed: {exc}")
        print(" Please restart Space manually from HF dashboard.")


if __name__ == "__main__":
    main()