| """ | |
| local_llm.py | |
| ------------ | |
| LLM configuration for deployment on Hugging Face Spaces (free CPU tier). | |
| Strategy: Instead of loading model weights locally (too slow on CPU), | |
| we call the HuggingFace Serverless Inference API. The model runs on | |
| HF's GPU infrastructure; your Space just makes API calls. | |
| Required HF Space Secret: | |
| HF_TOKEN β your HuggingFace token (read access is enough) | |
| Set it in: Space Settings β Variables and Secrets | |
| """ | |

import os

from crewai import LLM

# ---------------------------------------------------------------------------
# Model options (ranked by capability vs. availability tradeoff)
# ---------------------------------------------------------------------------
# PRIMARY:  Llama 3.1 8B -- best reasoning for agentic tasks, but gated
#           (user must accept the license at
#           huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)
# FALLBACK: Phi-3 Mini -- no gating, slightly weaker but very reliable
# ---------------------------------------------------------------------------
PRIMARY_MODEL = "huggingface/meta-llama/Llama-3.1-8B-Instruct"
FALLBACK_MODEL = "huggingface/microsoft/Phi-3-mini-4k-instruct"


def get_llm(
    model: str | None = None,
    temperature: float = 0.3,
    max_tokens: int = 1024,
) -> LLM:
    """
    Return a CrewAI LLM instance backed by the HF Serverless Inference API.

    Parameters
    ----------
    model : Override the default model string (full litellm format).
    temperature : Sampling temperature (lower = more deterministic).
    max_tokens : Max tokens in the model response.

    Raises
    ------
    EnvironmentError if HF_TOKEN is not set.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise EnvironmentError(
            "HF_TOKEN environment variable is not set.\n"
            "Add it in your HF Space -> Settings -> Variables and Secrets."
        )

    # Precedence: explicit argument > LLM_MODEL env var > module default.
    chosen_model = model or os.environ.get("LLM_MODEL", PRIMARY_MODEL)

    llm = LLM(
        model=chosen_model,
        api_key=hf_token,
        temperature=temperature,
        max_tokens=max_tokens,
        timeout=120,  # HF cold starts can be slow; be patient
        max_retries=2,
    )
    return llm


def get_fallback_llm(temperature: float = 0.3, max_tokens: int = 1024) -> LLM:
    """
    Return the fallback LLM (no gated access required).

    Use this if the primary model raises an access / 403 error.
    """
    return get_llm(
        model=FALLBACK_MODEL,
        temperature=temperature,
        max_tokens=max_tokens,
    )
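

# Convenience wrapper -- a sketch, not part of the original module, and the
# name get_llm_with_fallback is made up here: probe the primary model with a
# tiny call and fall back automatically if it errors (e.g. a gated-repo 403).
# Costs one extra API round-trip at startup.
def get_llm_with_fallback(temperature: float = 0.3, max_tokens: int = 1024) -> LLM:
    llm = get_llm(temperature=temperature, max_tokens=max_tokens)
    try:
        # Any failure here (403 on a gated repo, 404, timeout) triggers fallback.
        llm.call("Reply with exactly: OK")
        return llm
    except Exception:
        return get_fallback_llm(temperature=temperature, max_tokens=max_tokens)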


# ---------------------------------------------------------------------------
# Quick smoke-test (run: python local_llm.py)
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import sys

    print(f"Testing primary model: {PRIMARY_MODEL}")
    try:
        llm = get_llm()
        resp = llm.call([{"role": "user", "content": "Reply with exactly: OK"}])
        print(f"Response: {resp}")
        print("✅ Primary model OK")
    except Exception as e:
        print(f"⚠️ Primary model failed: {e}")
        print(f"Trying fallback model: {FALLBACK_MODEL}")
        try:
            llm = get_fallback_llm()
            resp = llm.call([{"role": "user", "content": "Reply with exactly: OK"}])
            print(f"Response: {resp}")
            print("✅ Fallback model OK")
        except Exception as e2:
            print(f"❌ Fallback also failed: {e2}")
            sys.exit(1)