fabioantonini committed on
Commit
91967b1
·
verified ·
1 Parent(s): 53c4d05

Upload llm_utils.py

Browse files
Files changed (1) hide show
  1. llm_utils.py +101 -0
llm_utils.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # llm_utils.py
2
+
3
+ import os
4
+ import requests
5
+
6
+ # 1) HUGGING FACE INFERENCE API APPROACH
7
+ # ---------------------------------------
8
+ # This approach sends your prompt to the hosted Inference API on Hugging Face.
9
+ # You need:
10
+ # - A model endpoint, e.g. 'tiiuae/falcon-7b-instruct'
11
+ # - A valid HUGGINGFACEHUB_API_TOKEN with access to that model.
12
+ #
13
+ # Pros: no heavy model to download locally
14
+ # Cons: subject to model availability, rate limits, and does not run fully offline
15
+
16
+ HF_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "YOUR_API_TOKEN_HERE")
17
+ MODEL_ID = "tiiuae/falcon-7b-instruct"
18
+ API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
19
+
20
+ def get_llm_opinion_inference_api(prompt):
21
+ """
22
+ Queries the Hugging Face Inference API for a text generation model.
23
+ Returns the generated text as a string.
24
+ """
25
+ headers = {
26
+ "Authorization": f"Bearer {HF_API_TOKEN}",
27
+ "Content-Type": "application/json"
28
+ }
29
+ payload = {
30
+ "inputs": prompt,
31
+ "parameters": {
32
+ "max_new_tokens": 200,
33
+ "temperature": 0.5,
34
+ "top_p": 0.9,
35
+ "do_sample": True
36
+ }
37
+ }
38
+
39
+ response = requests.post(API_URL, headers=headers, json=payload)
40
+ if response.status_code != 200:
41
+ return f"Error: Hugging Face Inference API returned status {response.status_code}\n{response.text}"
42
+
43
+ # The Inference API returns an array of generated text(s)
44
+ data = response.json()
45
+ if isinstance(data, dict) and "error" in data:
46
+ return f"Error: {data['error']}"
47
+
48
+ # Typically, data[0]["generated_text"] holds the string
49
+ return data[0]["generated_text"]
50
+
51
+ # 2) LOCAL PIPELINE APPROACH
52
+ # --------------------------
53
+ # This approach loads a model locally via the Transformers library.
54
+ # This can be done on a Hugging Face Space if:
55
+ # - The model size fits the hardware resources (RAM/GPU)
56
+ # - The Space is configured to install transformers, etc.
57
+ # Pros: no external calls, faster for repeated queries
58
+ # Cons: potentially large downloads, memory usage
59
+ #
60
+ # If you want to use this approach, uncomment and adapt as needed:
61
+
62
+
63
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
64
+
65
+ def create_local_pipeline(model_id="tiiuae/falcon-7b-instruct"):
66
+ # Download and load the model locally
67
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
68
+ model = AutoModelForCausalLM.from_pretrained(
69
+ model_id,
70
+ device_map="auto" # or "cpu" if no GPU is available
71
+ )
72
+ return pipeline("text-generation", model=model, tokenizer=tokenizer)
73
+
74
+ # Example pipeline initialization (done once):
75
+ # generator = create_local_pipeline()
76
+
77
+ def get_llm_opinion_local(prompt, generator):
78
+ # Generate text from the local pipeline
79
+ outputs = generator(prompt, max_length=256, do_sample=True, temperature=0.5)
80
+ return outputs[0]["generated_text"]
81
+
82
+
83
+ # 3) WRAPPING LOGIC
84
+ # -----------------
85
+ # You can unify the approaches in a single function. For instance, if you want
86
+ # to prefer local inference if a pipeline is initialized, otherwise fallback to
87
+ # the Inference API:
88
+
89
+ def get_llm_opinion(prompt, generator=None):
90
+ """
91
+ High-level function to get the LLM's opinion.
92
+ If a local pipeline 'generator' is provided, use that.
93
+ Otherwise, fallback to the Hugging Face Inference API.
94
+ """
95
+ if generator is not None:
96
+ # local pipeline approach
97
+ outputs = generator(prompt, max_length=256, do_sample=True, temperature=0.5)
98
+ return outputs[0]["generated_text"]
99
+ else:
100
+ # inference API approach
101
+ return get_llm_opinion_inference_api(prompt)