Spaces:

Forol
/

gemma3-api-backend

Running

Update app.py

9aff491 verified about 9 hours ago

1.45 kB

	import os
	import subprocess
	import sys

	print("--- STEP 1: Installing Pre-Compiled llama-cpp-python Wheel (Fast Track) ---")
	# This forces pip to pull a ready-made binary instead of compiling it from C++ source
	subprocess.run([
	sys.executable, "-m", "pip", "install", "llama-cpp-python[server]",
	"--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cpu"
	])

	# Now that it's installed, we can safely import huggingface tools
	from huggingface_hub import hf_hub_download

	# =========================================================================
	# CONFIGURATION: Targets the exact repository and 4-bit model file
	# =========================================================================
	REPO_ID = "bartowski/google_gemma-3-4b-it-GGUF"
	FILENAME = "google_gemma-3-4b-it-Q4_K_M.gguf"

	print("--- STEP 2: Downloading Gemma 3 4B Model Weights ---")
	model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
	print(f"Model successfully saved to cache area: {model_path}")

	print("--- STEP 3: Initializing OpenAI-Compatible Server ---")
	cmd = [
	"python3", "-m", "llama_cpp.server",
	"--model", model_path,
	"--model_alias", "gemma-3",
	"--host", "0.0.0.0",
	"--port", "7860", # Mandatory port required by Hugging Face
	"--n_ctx", "2048", # Context limit optimized for RAM protection
	"--n_threads", "2" # Uses exactly the 2 free vCPUs allocated
	]

	# Run server engine
	subprocess.run(cmd)