Spaces:
Sleeping
Sleeping
Update helper.py to use the correct Hugging Face inference URL
Browse files
helper.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from sentence_transformers import SentenceTransformer
|
| 2 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 3 |
from pypdf import PdfReader
|
| 4 |
import requests
|
|
@@ -35,9 +35,10 @@ def generate_hypothetical_answer(query):
|
|
| 35 |
import os
|
| 36 |
import time
|
| 37 |
|
| 38 |
-
# Hugging Face API endpoint
|
| 39 |
-
api_url = "https://
|
| 40 |
|
|
|
|
| 41 |
# Get API token from environment variable
|
| 42 |
api_token = os.getenv("HUGGINGFACE_API_TOKEN")
|
| 43 |
if not api_token:
|
|
@@ -58,20 +59,21 @@ def generate_hypothetical_answer(query):
|
|
| 58 |
Hypothetical answer:
|
| 59 |
"""
|
| 60 |
|
| 61 |
-
# Prepare the request payload
|
| 62 |
payload = {
|
| 63 |
"inputs": prompt,
|
| 64 |
"parameters": {
|
| 65 |
"max_new_tokens": 256,
|
| 66 |
"temperature": 0.7,
|
| 67 |
"top_p": 0.95,
|
| 68 |
-
"do_sample": True
|
|
|
|
| 69 |
}
|
| 70 |
}
|
| 71 |
|
| 72 |
try:
|
| 73 |
# Make the API request to Hugging Face
|
| 74 |
-
print("Sending request to Hugging Face API for hypothetical answer...")
|
| 75 |
print(f"API URL: {api_url}")
|
| 76 |
print(f"Headers: {headers}")
|
| 77 |
print(f"Payload: {json.dumps(payload, indent=2)}")
|
|
@@ -164,8 +166,8 @@ def query_llm_with_context(query, context, top_n=3):
|
|
| 164 |
Query: {query}
|
| 165 |
"""
|
| 166 |
|
| 167 |
-
# Hugging Face API endpoint
|
| 168 |
-
api_url = "https://
|
| 169 |
|
| 170 |
# Get API token from environment variable
|
| 171 |
api_token = os.getenv("HUGGINGFACE_API_TOKEN")
|
|
@@ -179,20 +181,21 @@ def query_llm_with_context(query, context, top_n=3):
|
|
| 179 |
"Content-Type": "application/json"
|
| 180 |
}
|
| 181 |
|
| 182 |
-
# Prepare the request payload
|
| 183 |
payload = {
|
| 184 |
"inputs": prompt,
|
| 185 |
"parameters": {
|
| 186 |
"max_new_tokens": 512,
|
| 187 |
"temperature": 0.7,
|
| 188 |
"top_p": 0.95,
|
| 189 |
-
"do_sample": True
|
|
|
|
| 190 |
}
|
| 191 |
}
|
| 192 |
|
| 193 |
try:
|
| 194 |
# Make the API request to Hugging Face
|
| 195 |
-
print("Sending request to Hugging Face API...")
|
| 196 |
print(f"API URL: {api_url}")
|
| 197 |
print(f"Headers: {headers}")
|
| 198 |
print(f"Payload: {json.dumps(payload, indent=2)}")
|
|
@@ -239,6 +242,10 @@ def query_llm_with_context(query, context, top_n=3):
|
|
| 239 |
print(f"HTTP error occurred: {e}")
|
| 240 |
print(f"Response status code: {e.response.status_code}")
|
| 241 |
print(f"Response headers: {e.response.headers}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
if e.response.status_code == 401:
|
| 244 |
return "Authentication error. Please check your Hugging Face API token."
|
|
|
|
| 1 |
+
from sentence_transformers import SentenceTransformer
|
| 2 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 3 |
from pypdf import PdfReader
|
| 4 |
import requests
|
|
|
|
| 35 |
import os
|
| 36 |
import time
|
| 37 |
|
| 38 |
+
# Hugging Face API endpoint with vLLM
|
| 39 |
+
api_url = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-2-7b-chat-hf/v1/chat/completions"
|
| 40 |
|
| 41 |
+
|
| 42 |
# Get API token from environment variable
|
| 43 |
api_token = os.getenv("HUGGINGFACE_API_TOKEN")
|
| 44 |
if not api_token:
|
|
|
|
| 59 |
Hypothetical answer:
|
| 60 |
"""
|
| 61 |
|
| 62 |
+
# Prepare the request payload for vLLM
|
| 63 |
payload = {
|
| 64 |
"inputs": prompt,
|
| 65 |
"parameters": {
|
| 66 |
"max_new_tokens": 256,
|
| 67 |
"temperature": 0.7,
|
| 68 |
"top_p": 0.95,
|
| 69 |
+
"do_sample": True,
|
| 70 |
+
"use_vllm": True # Enable vLLM for faster inference
|
| 71 |
}
|
| 72 |
}
|
| 73 |
|
| 74 |
try:
|
| 75 |
# Make the API request to Hugging Face
|
| 76 |
+
print("Sending request to Hugging Face API with vLLM for hypothetical answer...")
|
| 77 |
print(f"API URL: {api_url}")
|
| 78 |
print(f"Headers: {headers}")
|
| 79 |
print(f"Payload: {json.dumps(payload, indent=2)}")
|
|
|
|
| 166 |
Query: {query}
|
| 167 |
"""
|
| 168 |
|
| 169 |
+
# Hugging Face API endpoint with vLLM
|
| 170 |
+
api_url = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-2-7b-chat-hf/v1/chat/completions"
|
| 171 |
|
| 172 |
# Get API token from environment variable
|
| 173 |
api_token = os.getenv("HUGGINGFACE_API_TOKEN")
|
|
|
|
| 181 |
"Content-Type": "application/json"
|
| 182 |
}
|
| 183 |
|
| 184 |
+
# Prepare the request payload for vLLM
|
| 185 |
payload = {
|
| 186 |
"inputs": prompt,
|
| 187 |
"parameters": {
|
| 188 |
"max_new_tokens": 512,
|
| 189 |
"temperature": 0.7,
|
| 190 |
"top_p": 0.95,
|
| 191 |
+
"do_sample": True,
|
| 192 |
+
"use_vllm": True # Enable vLLM for faster inference
|
| 193 |
}
|
| 194 |
}
|
| 195 |
|
| 196 |
try:
|
| 197 |
# Make the API request to Hugging Face
|
| 198 |
+
print("Sending request to Hugging Face API with vLLM...")
|
| 199 |
print(f"API URL: {api_url}")
|
| 200 |
print(f"Headers: {headers}")
|
| 201 |
print(f"Payload: {json.dumps(payload, indent=2)}")
|
|
|
|
| 242 |
print(f"HTTP error occurred: {e}")
|
| 243 |
print(f"Response status code: {e.response.status_code}")
|
| 244 |
print(f"Response headers: {e.response.headers}")
|
| 245 |
+
try:
|
| 246 |
+
print(f"Response content: {e.response.text}")
|
| 247 |
+
except:
|
| 248 |
+
print("Could not print response content")
|
| 249 |
|
| 250 |
if e.response.status_code == 401:
|
| 251 |
return "Authentication error. Please check your Hugging Face API token."
|