# llm_utils.py

import os
import requests

# 1) HUGGING FACE INFERENCE API APPROACH
# ---------------------------------------
# This approach sends your prompt to the hosted Inference API on Hugging Face.
# You need:
#   - A model endpoint, e.g. 'tiiuae/falcon-7b-instruct'
#   - A valid HUGGINGFACEHUB_API_TOKEN with access to that model.
#
# Pros: no heavy model to download locally
# Cons: subject to model availability, rate limits, and does not run fully offline

# Token for the hosted Inference API, read from the environment; falls back
# to a placeholder literal when HUGGINGFACEHUB_API_TOKEN is unset.
HF_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "YOUR_API_TOKEN_HERE")
# Hosted model used by the Inference API approach below.
MODEL_ID = "tiiuae/falcon-7b-instruct"
# Full endpoint URL for that model on the Hugging Face Inference API.
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"

def get_llm_opinion_inference_api(prompt):
    """Query the Hugging Face Inference API for a text-generation model.

    Args:
        prompt: The text prompt sent to the hosted model.

    Returns:
        The generated text as a string, or a human-readable
        "Error: ..." string on any failure (network/timeout error,
        non-200 status, API-reported error, or unexpected payload shape).
    """
    headers = {
        "Authorization": f"Bearer {HF_API_TOKEN}",
        "Content-Type": "application/json"
    }
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 200,
            "temperature": 0.5,
            "top_p": 0.9,
            "do_sample": True
        }
    }

    # A timeout is essential: without one, requests.post can block forever
    # if the API is unresponsive. Network failures are folded into the
    # function's existing "Error: ..." string contract.
    try:
        response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    except requests.RequestException as exc:
        return f"Error: request to Hugging Face Inference API failed: {exc}"

    if response.status_code != 200:
        return f"Error: Hugging Face Inference API returned status {response.status_code}\n{response.text}"

    # The Inference API returns an array of generated text(s)
    data = response.json()
    if isinstance(data, dict) and "error" in data:
        return f"Error: {data['error']}"

    # Typically, data[0]["generated_text"] holds the string; guard against
    # unexpected payload shapes rather than raising IndexError/KeyError.
    try:
        return data[0]["generated_text"]
    except (IndexError, KeyError, TypeError):
        return f"Error: unexpected response payload: {data!r}"

# 2) LOCAL PIPELINE APPROACH
# --------------------------
# This approach loads a model locally via the Transformers library.
# This can be done on a Hugging Face Space if:
#   - The model size fits the hardware resources (RAM/GPU)
#   - The Space is configured to install transformers, etc.
# Pros: no external calls, faster for repeated queries
# Cons: potentially large downloads, memory usage
#
# If you want to use this approach, uncomment and adapt as needed:


from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

def create_local_pipeline(model_id="tiiuae/falcon-7b-instruct"):
    """Build a local text-generation pipeline for the given model id.

    Downloads (on first use) and loads the tokenizer and model weights,
    then wraps both in a Transformers text-generation pipeline.
    """
    # device_map="auto" places the model on available hardware
    # (use "cpu" instead if no GPU is available).
    tok = AutoTokenizer.from_pretrained(model_id)
    lm = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto"
    )
    return pipeline("text-generation", model=lm, tokenizer=tok)

# Example pipeline initialization (done once):
# generator = create_local_pipeline()

def get_llm_opinion_local(prompt, generator):
    """Generate text for ``prompt`` via a local text-generation pipeline.

    Args:
        prompt: Input text for the model.
        generator: A callable pipeline (e.g. from ``create_local_pipeline``).

    Returns:
        The generated text of the first candidate as a string.
    """
    results = generator(
        prompt,
        max_length=256,
        do_sample=True,
        temperature=0.5,
    )
    first_candidate = results[0]
    return first_candidate["generated_text"]


# 3) WRAPPING LOGIC
# -----------------
# You can unify the approaches in a single function. For instance, if you want
# to prefer local inference if a pipeline is initialized, otherwise fallback to
# the Inference API:

def get_llm_opinion(prompt, generator=None):
    """High-level entry point for getting the LLM's opinion on ``prompt``.

    Args:
        prompt: The text prompt for the model.
        generator: Optional local text-generation pipeline. When provided,
            inference runs locally; otherwise the call falls back to the
            hosted Hugging Face Inference API.

    Returns:
        The generated text as a string (the Inference API path may return
        an "Error: ..." string on failure).
    """
    if generator is not None:
        # Delegate to the local helper so the sampling parameters live in
        # exactly one place instead of being duplicated here.
        return get_llm_opinion_local(prompt, generator)
    # No local pipeline available: use the hosted Inference API.
    return get_llm_opinion_inference_api(prompt)