"""
local_llm.py
------------
LLM configuration for deployment on Hugging Face Spaces (free CPU tier).
Strategy: Instead of loading model weights locally (too slow on CPU),
we call the HuggingFace Serverless Inference API. The model runs on
HF's GPU infrastructure; your Space just makes API calls.
Required HF Space Secret:
HF_TOKEN β†’ your HuggingFace token (read access is enough)
Set it in: Space Settings β†’ Variables and Secrets
"""
import os
from crewai import LLM
# ---------------------------------------------------------------------------
# Model options (ranked by capability vs. availability tradeoff)
# ---------------------------------------------------------------------------
# PRIMARY: Llama 3.1 8B β€” best reasoning for agentic tasks, gated model
# (user must accept license at huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)
# FALLBACK: Phi-3 Mini β€” no gating, slightly weaker but very reliable
# ---------------------------------------------------------------------------
PRIMARY_MODEL = "huggingface/meta-llama/Llama-3.1-8B-Instruct"
FALLBACK_MODEL = "huggingface/microsoft/Phi-3-mini-4k-instruct"
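
# The active model can also be overridden at deploy time through the LLM_MODEL
# environment variable (read inside get_llm). Illustrative sketch only; the
# model below is an arbitrary example of the litellm "huggingface/<org>/<repo>"
# format, not one this project has tested:
#
#   export LLM_MODEL="huggingface/Qwen/Qwen2.5-7B-Instruct"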


def get_llm(
    model: str | None = None,
    temperature: float = 0.3,
    max_tokens: int = 1024,
) -> LLM:
    """
    Return a CrewAI LLM instance backed by the HF Serverless Inference API.

    Parameters
    ----------
    model : Override the default model string (full litellm format).
    temperature : Sampling temperature (lower = more deterministic).
    max_tokens : Max tokens in the model response.

    Raises
    ------
    EnvironmentError if HF_TOKEN is not set.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise EnvironmentError(
            "HF_TOKEN environment variable is not set.\n"
            "Add it in your HF Space → Settings → Variables and Secrets."
        )

    chosen_model = model or os.environ.get("LLM_MODEL", PRIMARY_MODEL)

    llm = LLM(
        model=chosen_model,
        api_key=hf_token,
        temperature=temperature,
        max_tokens=max_tokens,
        timeout=120,  # HF cold-starts can be slow; be patient
        max_retries=2,
    )
    return llm
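
# Typical caller pattern (a minimal sketch; the Agent fields below are
# hypothetical placeholders, not part of this module):
#
#   from crewai import Agent
#   from local_llm import get_llm
#
#   analyst = Agent(
#       role="Leadership analyst",
#       goal="Assess calibration of leadership self-ratings",
#       backstory="An experienced organizational psychologist.",
#       llm=get_llm(),
#   )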


def get_fallback_llm(temperature: float = 0.3, max_tokens: int = 1024) -> LLM:
    """
    Return the fallback LLM (no gated access required).

    Use this if the primary model raises an access / 403 error.
    """
    return get_llm(
        model=FALLBACK_MODEL,
        temperature=temperature,
        max_tokens=max_tokens,
    )
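
# How application code might chain the two (a sketch only, mirroring the
# smoke test below; the probe message is arbitrary):
#
#   try:
#       llm = get_llm()
#       llm.call([{"role": "user", "content": "ping"}])  # fails fast on gated / 403 access
#   except Exception:
#       llm = get_fallback_llm()
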
# ---------------------------------------------------------------------------
# Quick smoke-test (run: python local_llm.py)
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import sys

    print(f"Testing primary model: {PRIMARY_MODEL}")
    try:
        llm = get_llm()
        resp = llm.call([{"role": "user", "content": "Reply with exactly: OK"}])
        print(f"Response: {resp}")
        print("✅ Primary model OK")
    except Exception as e:
        print(f"⚠️ Primary model failed: {e}")
        print(f"Trying fallback model: {FALLBACK_MODEL}")
        try:
            llm = get_fallback_llm()
            resp = llm.call([{"role": "user", "content": "Reply with exactly: OK"}])
            print(f"Response: {resp}")
            print("✅ Fallback model OK")
        except Exception as e2:
            print(f"❌ Fallback also failed: {e2}")
            sys.exit(1)