"""
local_llm.py
------------
LLM configuration for deployment on Hugging Face Spaces (free CPU tier).
Strategy: Instead of loading model weights locally (too slow on CPU),
we call the HuggingFace Serverless Inference API. The model runs on
HF's GPU infrastructure; your Space just makes API calls.
Required HF Space Secret:
HF_TOKEN β†’ your HuggingFace token (read access is enough)
Set it in: Space Settings β†’ Variables and Secrets
"""
import os
from crewai import LLM
# ---------------------------------------------------------------------------
# Model options (ranked by capability vs. availability tradeoff)
# ---------------------------------------------------------------------------
# PRIMARY: Llama 3.1 8B β€” best reasoning for agentic tasks, gated model
# (user must accept license at huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)
# FALLBACK: Phi-3 Mini β€” no gating, slightly weaker but very reliable
# ---------------------------------------------------------------------------
PRIMARY_MODEL = "huggingface/meta-llama/Llama-3.1-8B-Instruct"
FALLBACK_MODEL = "huggingface/microsoft/Phi-3-mini-4k-instruct"
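
# The active model can also be overridden at deploy time through the LLM_MODEL
# environment variable (read inside get_llm). Illustrative sketch only; the
# model below is an arbitrary example of the litellm "huggingface/<org>/<repo>"
# format, not one this project has tested:
#
#   export LLM_MODEL="huggingface/Qwen/Qwen2.5-7B-Instruct"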


def get_llm(
    model: str | None = None,
    temperature: float = 0.3,
    max_tokens: int = 1024,
) -> LLM:
    """
    Return a CrewAI LLM instance backed by the HF Serverless Inference API.

    Parameters
    ----------
    model : Override the default model string (full litellm format).
    temperature : Sampling temperature (lower = more deterministic).
    max_tokens : Max tokens in the model response.

    Raises
    ------
    EnvironmentError if HF_TOKEN is not set.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise EnvironmentError(
            "HF_TOKEN environment variable is not set.\n"
            "Add it in your HF Space → Settings → Variables and Secrets."
        )

    chosen_model = model or os.environ.get("LLM_MODEL", PRIMARY_MODEL)

    llm = LLM(
        model=chosen_model,
        api_key=hf_token,
        temperature=temperature,
        max_tokens=max_tokens,
        timeout=120,  # HF cold-starts can be slow; be patient
        max_retries=2,
    )
    return llm
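
# Typical caller pattern (a minimal sketch; the Agent fields below are
# hypothetical placeholders, not part of this module):
#
#   from crewai import Agent
#   from local_llm import get_llm
#
#   analyst = Agent(
#       role="Leadership analyst",
#       goal="Assess calibration of leadership self-ratings",
#       backstory="An experienced organizational psychologist.",
#       llm=get_llm(),
#   )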


def get_fallback_llm(temperature: float = 0.3, max_tokens: int = 1024) -> LLM:
    """
    Return the fallback LLM (no gated access required).

    Use this if the primary model raises an access / 403 error.
    """
    return get_llm(
        model=FALLBACK_MODEL,
        temperature=temperature,
        max_tokens=max_tokens,
    )
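
# How application code might chain the two (a sketch only, mirroring the
# smoke test below; the probe message is arbitrary):
#
#   try:
#       llm = get_llm()
#       llm.call([{"role": "user", "content": "ping"}])  # fails fast on gated / 403 access
#   except Exception:
#       llm = get_fallback_llm()
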
# ---------------------------------------------------------------------------
# Quick smoke-test (run: python local_llm.py)
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import sys

    print(f"Testing primary model: {PRIMARY_MODEL}")
    try:
        llm = get_llm()
        resp = llm.call([{"role": "user", "content": "Reply with exactly: OK"}])
        print(f"Response: {resp}")
        print("✅ Primary model OK")
    except Exception as e:
        print(f"⚠️ Primary model failed: {e}")
        print(f"Trying fallback model: {FALLBACK_MODEL}")
        try:
            llm = get_fallback_llm()
            resp = llm.call([{"role": "user", "content": "Reply with exactly: OK"}])
            print(f"Response: {resp}")
            print("✅ Fallback model OK")
        except Exception as e2:
            print(f"❌ Fallback also failed: {e2}")
            sys.exit(1)