File size: 15,262 Bytes
625e9e8 58a026c 625e9e8 58a026c 625e9e8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 |
import os
import time
from typing import Optional
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
load_dotenv() # Loads .env file automatically
class HuggingFaceInferenceAPI:
    """
    Manages interactions with Hugging Face Inference API using the official InferenceClient.

    Every request goes through the client's OpenAI-compatible
    ``chat.completions.create`` call. Prompts are passed as structured
    ``system`` / ``user`` messages so the serving side applies the model's own
    chat template; no model-specific control tokens are embedded in the text.

    Attributes:
        api_token: Token used to authenticate (argument or ``HF_TOKEN`` env var).
        client: The ``InferenceClient`` instance used for all requests.
        model: Model id used for answer generation and query rewriting.
    """

    # Model used only for moderation.
    # Qwen 2.5 is currently the most reliable for free-tier serverless inference
    MODERATOR_MODEL = "Qwen/Qwen2.5-7B-Instruct"

    # Instructions sent as the system message of every moderation request.
    _MODERATION_INSTRUCTIONS = (
        "You are a content moderator. "
        "Your job is to classify if a user query is SAFE or UNSAFE.\n"
        "- SAFE: General questions, product inquiries, electronics, store help, "
        "or friendly chat.\n"
        "- UNSAFE: Hate speech, violence, illegal acts, or sexual content.\n"
        "Respond with ONLY the word 'SAFE' or 'UNSAFE'."
    )

    def __init__(self, api_token: Optional[str] = None):
        """
        Create the inference client.

        Args:
            api_token: Hugging Face token. Falls back to the ``HF_TOKEN``
                environment variable when omitted or empty.

        Raises:
            ValueError: If no token is passed and ``HF_TOKEN`` is not set.
        """
        self.api_token = api_token or os.environ.get("HF_TOKEN")
        if not self.api_token:
            raise ValueError(
                "HF_TOKEN not found. Please set the HF_TOKEN environment variable or pass it as an argument."
            )
        # Using the newer provider system for Llama 3.2
        self.client = InferenceClient(
            provider="auto",
            api_key=self.api_token,
        )
        self.model = "meta-llama/Llama-3.2-3B-Instruct"

    def _generate_text(
        self,
        prompt: str,
        max_tokens: int = 200,
        system_prompt: Optional[str] = None,
    ) -> str:
        """
        Run one non-streaming chat completion against ``self.model``.

        Args:
            prompt: The user message content.
            max_tokens: Maximum tokens to generate.
            system_prompt: Optional system message placed before the user message.

        Returns:
            The generated text, or an empty string if the response has no content.

        Raises:
            Exception: Whatever the client raises is propagated unchanged so that
                callers can apply their own fallback. Returning an error string
                here would be indistinguishable from real model output.
        """
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        # Use the OpenAI-compatible syntax for better reliability
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            max_tokens=max_tokens,
            stream=False,
        )
        # The content field may be None; normalise it to a string.
        return response.choices[0].message.content or ""

    def moderate_query(self, query: str) -> bool:
        """
        Classify a query as safe or unsafe using ``MODERATOR_MODEL``.

        Args:
            query: The user's query.

        Returns:
            False if the model's reply contains 'UNSAFE' (case-insensitive),
            True otherwise. Also True if the request fails.
        """
        # The query is sent as its own user message rather than interpolated
        # into a hand-written template, so special tokens typed by the user
        # cannot terminate or rewrite the moderation instructions.
        messages = [
            {"role": "system", "content": self._MODERATION_INSTRUCTIONS},
            {"role": "user", "content": query},
        ]
        try:
            print(f"Sending moderation request to {self.MODERATOR_MODEL}...")
            response = self.client.chat.completions.create(
                model=self.MODERATOR_MODEL,
                messages=messages,
                max_tokens=5,
            )
            result = (response.choices[0].message.content or "").strip().upper()
            print(f"Moderation result: {result}")
            return "UNSAFE" not in result
        except Exception as e:
            # Improved error logging to see exactly what's happening
            print(f"Moderation API Error: {repr(e)}")
            # If the API fails, we assume safe to keep the UX smooth
            return True

    def generate_response(self, query: str, system_prompt: str) -> str:
        """
        Generate an answer to ``query`` with ``self.model``.

        Args:
            query: The user's query.
            system_prompt: The system prompt with context and instructions.

        Returns:
            The generated response with surrounding whitespace removed, or a
            fixed apology message if the request fails.
        """
        try:
            response = self._generate_text(
                query, max_tokens=500, system_prompt=system_prompt
            )
            return response.strip()
        except Exception as e:
            print(f"Error during response generation: {e}")
            return "I'm sorry, but I encountered an error while trying to generate a response."

    def rewrite_query(self, query: str, system_prompt: str) -> str:
        """
        Rewrite ``query`` with ``self.model`` according to ``system_prompt``.

        Args:
            query: The user's query.
            system_prompt: The system prompt with instructions.

        Returns:
            The rewritten query with one layer of surrounding double and/or
            single quotes removed. Falls back to the original ``query`` if the
            request fails or the model returns nothing.
        """
        try:
            response = self._generate_text(
                f"User query: '{query}'",
                max_tokens=200,
                system_prompt=system_prompt,
            )
        except Exception as e:
            print(f"Error during query rewrite: {e}")
            return query  # Fallback to original query on error

        rewritten = response.strip()
        # Remove potential quotes around the rewritten query
        for quote in ('"', "'"):
            if (
                len(rewritten) >= 2
                and rewritten.startswith(quote)
                and rewritten.endswith(quote)
            ):
                rewritten = rewritten[1:-1]
        # An empty rewrite is useless downstream; keep the original query instead.
        return rewritten or query
# Module-wide client instance; stays None until get_api_client() is first called.
_api_client = None


def get_api_client() -> HuggingFaceInferenceAPI:
    """Return the shared HuggingFaceInferenceAPI instance, creating it on first use."""
    global _api_client
    if _api_client is not None:
        return _api_client
    _api_client = HuggingFaceInferenceAPI()
    return _api_client
def moderate_query(query: str) -> bool:
    """
    Run the moderation check on a query through the shared API client.

    Args:
        query: The user's query.

    Returns:
        The result of the client's ``moderate_query``: True if the query is
        considered safe, False otherwise.
    """
    print("Moderating query...")
    return get_api_client().moderate_query(query)
def generate_response(query: str, retrieved_docs: list, history: list) -> str:
    """
    Generate an answer grounded in the retrieved documents.

    Builds a system prompt from the fixed rule list, the retrieved documents
    and the formatted chat history, then delegates to the shared API client's
    ``generate_response``.

    Args:
        query: The user's query.
        retrieved_docs: A list of document contents (strings). ``None`` is
            treated as an empty list.
        history: The chat history as a list of ``{"role", "content"}`` dicts.
            Only "user" and "assistant" entries are included; ``None`` is
            treated as an empty history.

    Returns:
        The generated response.
    """
    # NOTE: every numbered rule must end with a real newline. A trailing
    # backslash joins the next line onto it, which previously fused rules
    # 7-10 into one run-on sentence.
    system_prompt = """You are a specialized product inquiry assistant. \
Your primary and ONLY role is to answer user questions based on \
the 'Retrieved Documents' provided below.
Follow these rules strictly:
1. Base your entire response on the information found within the 'Retrieved Documents'. \
Do not use any external knowledge.
2. If there are no documents or \
the documents do not contain the information needed to answer the query, \
you MUST respond with: \"I'm sorry, but I cannot answer your question with the information I have.\"
3. If the documents contain relevant information, use it to construct a clear and concise answer.
The documents may include metadata such as price, product name, brand, and category.
The documents may also include product descriptions and features.
The documents may include customer reviews which can be used to answer questions \
about product quality and user satisfaction.
4. Some documents may not be fully relevant; \
carefully select and synthesize information only from the relevant parts.
5. Do not fabricate or assume any information not present in the documents.
6. Analyze the chat history provided under 'Chat History' for conversational context, \
but do not use it as a source for answers.
7. Respond in a friendly and helpful tone, with concise answers and directly related to the query.
8. Make sure to ask the user relevant follow-up questions.
9. Always format prices with a dollar sign and two decimal places.
10. Do not use the term 'Retrieved Documents' in your response. It is only for your reference.
Retrieved Documents:
```
{context}
```
Chat History:
{chat_history}
"""
    context = "\n\n---\n\n".join(retrieved_docs or [])

    # Format chat history for the prompt: one "Speaker: text" line per turn.
    speaker_labels = {"user": "User", "assistant": "Assistant"}
    formatted_history = "".join(
        f"{speaker_labels[msg['role']]}: {msg.get('content', '')}\n"
        for msg in history or []
        if msg.get("role") in speaker_labels
    )

    prompt = system_prompt.format(context=context, chat_history=formatted_history)
    client = get_api_client()
    return client.generate_response(query, prompt)
def rewrite_query(query: str, history: list) -> str:
    """
    Rewrite a conversational query into a self-contained query using the chat history.

    The formatted history is inserted into a few-shot system prompt and the
    rewrite itself is delegated to the shared API client's ``rewrite_query``.

    Args:
        query: The user's potentially vague query.
        history: The chat history as a list of ``{"role", "content"}`` dicts.
            Only "user" and "assistant" entries are used; ``None`` is treated
            as an empty history.

    Returns:
        A self-contained query. When there is no usable chat history the
        original ``query`` is returned unchanged without calling the API,
        since there is no context to resolve references against.
    """
    # Format chat history for the prompt: one "Speaker: text" line per turn.
    speaker_labels = {"user": "User", "assistant": "Assistant"}
    formatted_history = "".join(
        f"{speaker_labels[msg['role']]}: {msg.get('content', '')}\n"
        for msg in history or []
        if msg.get("role") in speaker_labels
    )
    if not formatted_history:
        # Nothing to disambiguate against; the prompt's own rules would ask
        # the model to return the query unchanged anyway.
        return query

    system_prompt = """You are an expert at query rewriting. Your task is to rewrite a given 'user query' \
into a self-contained, specific query that can be understood without the context of the 'chat history'.
Follow these rules strictly:
1. Analyze the 'chat history' to understand the context of the conversation.
2. Identify any pronouns (e.g., 'it', 'its', 'they', 'that') or vague references in the 'user query'.
3. Replace these pronouns and vague references with the specific entities or topics they refer to from the chat history.
4. If the 'user query' is already self-contained and specific, return it unchanged.
5. CRITICAL: If the 'user query' is about a completely new topic not covered in the chat history, \
you MUST return it unchanged. Do NOT try to connect it to the previous conversation.
6. The rewritten query should be a single, clear question or statement.
7. Output ONLY the rewritten query, with no extra text, labels, or explanations.
Here are some examples of how to behave:
---
Example 1: Rewriting a contextual query
Chat History:
User: Do you have the TechPro Ultrabook in stock?
Assistant: Yes, the TechPro Ultrabook (TP-UB100) is available.
User query: 'Tell me about its warranty.'
Rewritten query: 'What is the warranty for the TechPro Ultrabook (TP-UB100)?'
---
Example 2: Handling a topic change
Chat History:
User: Do you have the TechPro Ultrabook in stock?
Assistant: Yes, the TechPro Ultrabook (TP-UB100) is available.
User query: 'Okay, do you have any monitors?'
Rewritten query: 'Okay, do you have any monitors?'
---
Example 3: Handling a self-contained query
Chat History:
User: What's the price of the BlueWave Gaming Laptop?
Assistant: The BlueWave Gaming Laptop (BW-GL200) is $1299.99.
User query: 'What is the price of the GameSphere X console?'
Rewritten query: 'What is the price of the GameSphere X console?'
---
Chat History:
{chat_history}
"""
    prompt = system_prompt.format(chat_history=formatted_history)
    client = get_api_client()
    return client.rewrite_query(query, prompt)
|