# llm_utils.py
"""Helpers for querying an LLM via the hosted Hugging Face Inference API
or via a locally loaded Transformers pipeline."""

import os

import requests

# 1) HUGGING FACE INFERENCE API APPROACH
# ---------------------------------------
# This approach sends your prompt to the hosted Inference API on Hugging Face.
# You need:
#   - A model endpoint, e.g. 'tiiuae/falcon-7b-instruct'
#   - A valid HUGGINGFACEHUB_API_TOKEN with access to that model.
#
# Pros: no heavy model to download locally
# Cons: subject to model availability, rate limits, and does not run fully offline

HF_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "YOUR_API_TOKEN_HERE")
MODEL_ID = "tiiuae/falcon-7b-instruct"
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"

# Seconds to wait for the hosted API before giving up; without this a stalled
# connection would block the caller indefinitely.
REQUEST_TIMEOUT = 30


def get_llm_opinion_inference_api(prompt):
    """
    Query the Hugging Face Inference API for a text-generation model.

    Parameters
    ----------
    prompt : str
        The text prompt to send to the model.

    Returns
    -------
    str
        The generated text on success, or a human-readable "Error: ..."
        message string on any failure. This function never raises for
        network/HTTP/format problems, keeping a uniform error-as-string
        contract for callers.
    """
    headers = {
        "Authorization": f"Bearer {HF_API_TOKEN}",
        "Content-Type": "application/json",
    }
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 200,
            "temperature": 0.5,
            "top_p": 0.9,
            "do_sample": True,
        },
    }

    # BUG FIX: the original call had no timeout (could hang forever) and let
    # connection errors propagate while reporting HTTP errors as strings.
    # Both failure modes are now reported the same way.
    try:
        response = requests.post(
            API_URL, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
        )
    except requests.RequestException as exc:
        return f"Error: request to Hugging Face Inference API failed\n{exc}"

    if response.status_code != 200:
        return f"Error: Hugging Face Inference API returned status {response.status_code}\n{response.text}"

    # The Inference API normally returns a list of generation dicts, but may
    # instead return a dict with an "error" key (e.g. model still loading),
    # or a non-JSON body on gateway failures.
    try:
        data = response.json()
    except ValueError:
        return f"Error: could not decode API response as JSON\n{response.text}"

    if isinstance(data, dict) and "error" in data:
        return f"Error: {data['error']}"

    # BUG FIX: guard the data[0]["generated_text"] access so an unexpected
    # response shape yields a readable error instead of an IndexError/KeyError.
    if (
        isinstance(data, list)
        and data
        and isinstance(data[0], dict)
        and "generated_text" in data[0]
    ):
        return data[0]["generated_text"]
    return f"Error: unexpected response format from Inference API\n{data!r}"


# 2) LOCAL PIPELINE APPROACH
# --------------------------
# This approach loads a model locally via the Transformers library.
# This can be done on a Hugging Face Space if:
#   - The model size fits the hardware resources (RAM/GPU)
#   - The Space is configured to install transformers, etc.
# Pros: no external calls, faster for repeated queries
# Cons: potentially large downloads, memory usage


def create_local_pipeline(model_id="tiiuae/falcon-7b-instruct"):
    """
    Download (if not cached) and load *model_id* as a local text-generation
    pipeline.

    The transformers import is deferred into this function so the module can
    be imported — and the Inference-API path used — even when the heavy
    optional transformers dependency is not installed.

    Parameters
    ----------
    model_id : str
        Hugging Face model identifier to load.

    Returns
    -------
    transformers.Pipeline
        A ready-to-call "text-generation" pipeline.
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",  # or "cpu" if no GPU is available
    )
    return pipeline("text-generation", model=model, tokenizer=tokenizer)


# Example pipeline initialization (done once):
# generator = create_local_pipeline()


def get_llm_opinion_local(prompt, generator):
    """
    Generate text for *prompt* using an already-initialized local pipeline.

    Parameters
    ----------
    prompt : str
        The text prompt.
    generator : callable
        A transformers text-generation pipeline (or compatible callable).

    Returns
    -------
    str
        The generated text. NOTE(review): for transformers pipelines this
        typically includes the prompt itself — confirm for the chosen model.
    """
    outputs = generator(prompt, max_length=256, do_sample=True, temperature=0.5)
    return outputs[0]["generated_text"]


# 3) WRAPPING LOGIC
# -----------------
# Unify the two approaches behind one function: prefer local inference when a
# pipeline is supplied, otherwise fall back to the hosted Inference API.


def get_llm_opinion(prompt, generator=None):
    """
    High-level entry point for getting the LLM's opinion.

    Parameters
    ----------
    prompt : str
        The text prompt.
    generator : callable, optional
        A local text-generation pipeline. If provided it is used directly;
        otherwise the Hugging Face Inference API is queried.

    Returns
    -------
    str
        The generated (or error) text.
    """
    if generator is not None:
        # CONSISTENCY FIX: delegate to get_llm_opinion_local instead of
        # duplicating its sampling parameters inline, so the two code paths
        # cannot drift apart.
        return get_llm_opinion_local(prompt, generator)
    return get_llm_opinion_inference_api(prompt)