# VoIPAnalyzer / llm_utils.py
# (Hugging Face Hub upload header: fabioantonini, commit 91967b1, verified)
# llm_utils.py
import os
import requests
# 1) HUGGING FACE INFERENCE API APPROACH
# ---------------------------------------
# This approach sends your prompt to the hosted Inference API on Hugging Face.
# You need:
# - A model endpoint, e.g. 'tiiuae/falcon-7b-instruct'
# - A valid HUGGINGFACEHUB_API_TOKEN with access to that model.
#
# Pros: no heavy model to download locally
# Cons: subject to model availability, rate limits, and does not run fully offline
HF_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "YOUR_API_TOKEN_HERE")
MODEL_ID = "tiiuae/falcon-7b-instruct"
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
def get_llm_opinion_inference_api(prompt):
    """
    Query the Hugging Face Inference API for a text-generation model.

    Parameters
    ----------
    prompt : str
        The text prompt sent to the hosted model.

    Returns
    -------
    str
        The generated text on success, or a human-readable
        "Error: ..." string on failure. This function never raises;
        all failure modes are reported as strings so callers can
        display them directly.
    """
    headers = {
        "Authorization": f"Bearer {HF_API_TOKEN}",
        "Content-Type": "application/json"
    }
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 200,
            "temperature": 0.5,
            "top_p": 0.9,
            "do_sample": True
        }
    }
    try:
        # Without a timeout the request can hang the caller indefinitely
        # if the Inference API stalls; 60 s covers cold model loads.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    except requests.RequestException as exc:
        return f"Error: request to Hugging Face Inference API failed: {exc}"
    if response.status_code != 200:
        return f"Error: Hugging Face Inference API returned status {response.status_code}\n{response.text}"
    # The Inference API returns either a list of generations or an error dict;
    # gateway errors may also return non-JSON bodies, so guard the parse.
    try:
        data = response.json()
    except ValueError:
        return f"Error: Hugging Face Inference API returned a non-JSON body:\n{response.text}"
    if isinstance(data, dict) and "error" in data:
        return f"Error: {data['error']}"
    # Typically data[0]["generated_text"] holds the generated string.
    try:
        return data[0]["generated_text"]
    except (KeyError, IndexError, TypeError):
        return f"Error: unexpected response format from Inference API: {data!r}"
# 2) LOCAL PIPELINE APPROACH
# --------------------------
# This approach loads a model locally via the Transformers library.
# This can be done on a Hugging Face Space if:
# - The model size fits the hardware resources (RAM/GPU)
# - The Space is configured to install transformers, etc.
# Pros: no external calls, faster for repeated queries
# Cons: potentially large downloads, memory usage
#
# The helpers below implement this approach; adapt the model id and
# generation parameters as needed:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
def create_local_pipeline(model_id="tiiuae/falcon-7b-instruct"):
    """Build a local text-generation pipeline for *model_id*.

    Loads the causal-LM weights and tokenizer via transformers
    (downloads are cached locally) and wraps them in a ready-to-call
    ``pipeline`` object.
    """
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",  # places layers on GPU when available, else CPU
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    return pipeline("text-generation", model=model, tokenizer=tokenizer)
# Example pipeline initialization (done once):
# generator = create_local_pipeline()
def get_llm_opinion_local(prompt, generator):
    """Run *prompt* through a local transformers pipeline.

    *generator* is a text-generation pipeline (or any compatible
    callable); the first generation's text is returned as a string.
    """
    results = generator(
        prompt,
        max_length=256,
        do_sample=True,
        temperature=0.5,
    )
    return results[0]["generated_text"]
# 3) WRAPPING LOGIC
# -----------------
# You can unify the approaches in a single function. For instance, if you want
# to prefer local inference if a pipeline is initialized, otherwise fallback to
# the Inference API:
def get_llm_opinion(prompt, generator=None):
    """
    High-level entry point for getting the LLM's opinion on *prompt*.

    When a local pipeline *generator* is supplied it is used directly;
    otherwise the call falls back to the hosted Hugging Face
    Inference API.
    """
    if generator is None:
        # No local pipeline available: defer to the hosted API.
        return get_llm_opinion_inference_api(prompt)
    # Local pipeline path — same sampling settings as get_llm_opinion_local.
    results = generator(prompt, max_length=256, do_sample=True, temperature=0.5)
    return results[0]["generated_text"]