import subprocess
import sys

print("--- STEP 1: Installing Pre-Compiled llama-cpp-python Wheel (Fast Track) ---")

# This forces pip to pull a ready-made binary instead of compiling it from C++ source
subprocess.run([
    sys.executable, "-m", "pip", "install",
    "llama-cpp-python[server]",
    "--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cpu"
], check=True)  # fail loudly if the wheel cannot be installed

# Now that it's installed, we can safely import the Hugging Face tools
from huggingface_hub import hf_hub_download

# =========================================================================
# CONFIGURATION: Targets the exact repository and 4-bit model file
# =========================================================================
REPO_ID = "bartowski/google_gemma-3-4b-it-GGUF"
FILENAME = "google_gemma-3-4b-it-Q4_K_M.gguf"

print("--- STEP 2: Downloading Gemma 3 4B Model Weights ---")
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
print(f"Model saved to cache at: {model_path}")

print("--- STEP 3: Initializing OpenAI-Compatible Server ---")
cmd = [
    sys.executable, "-m", "llama_cpp.server",  # same interpreter pip installed into above
    "--model", model_path,
    "--model_alias", "gemma-3",
    "--host", "0.0.0.0",
    "--port", "7860",    # the port Hugging Face Spaces expects the app to listen on
    "--n_ctx", "2048",   # conservative context window to keep RAM usage in check
    "--n_threads", "2",  # match the 2 vCPUs allocated on the free tier
]

# Run the server engine (blocks for the lifetime of the process)
subprocess.run(cmd)
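
# -------------------------------------------------------------------------
# Usage sketch (run from a separate process once the server is up).
# A minimal example against the OpenAI-compatible /v1/chat/completions
# endpoint that llama_cpp.server exposes, using the "gemma-3" alias set
# above. The localhost URL is an assumption: adjust the host if you are
# querying from outside the container.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/v1/chat/completions",
#       json={
#           "model": "gemma-3",
#           "messages": [{"role": "user", "content": "Hello!"}],
#           "max_tokens": 64,
#       },
#   )
#   print(resp.json()["choices"][0]["message"]["content"])
# -------------------------------------------------------------------------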