File size: 15,262 Bytes
625e9e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58a026c
 
 
 
 
 
 
625e9e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58a026c
 
 
 
 
 
 
625e9e8
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
import os
import time
from typing import Optional

from dotenv import load_dotenv
from huggingface_hub import InferenceClient

load_dotenv()  # Loads .env file automatically

class HuggingFaceInferenceAPI:
    """
    Manages interactions with Hugging Face Inference API using the official InferenceClient.

    Generation requests use ``self.model``; moderation uses ``MODERATOR_MODEL``.
    Every request is sent through the OpenAI-compatible chat-completions call
    as role-tagged messages (system / user), so no model-specific control
    tokens (``[INST]``, ``<|im_start|>``, ...) are hand-written into the text.
    """

    # Model used by moderate_query. Original author's note: picked as a stable,
    # high-availability option for free-tier serverless inference.
    MODERATOR_MODEL = "Qwen/Qwen2.5-7B-Instruct"

    # Instructions given to the moderation model as the system message.
    MODERATION_SYSTEM_PROMPT = (
        "You are a content moderator. Your job is to classify if a user query is SAFE or UNSAFE.\n"
        "- SAFE: General questions, product inquiries, electronics, store help, or friendly chat.\n"
        "- UNSAFE: Hate speech, violence, illegal acts, or sexual content.\n"
        "Respond with ONLY the word 'SAFE' or 'UNSAFE'."
    )

    def __init__(self, api_token: Optional[str] = None):
        """
        Create the inference client.

        Args:
            api_token: Hugging Face token. When omitted, the ``HF_TOKEN``
                environment variable is used.

        Raises:
            ValueError: If no token is passed and ``HF_TOKEN`` is not set.
        """
        self.api_token = api_token or os.environ.get("HF_TOKEN")
        if not self.api_token:
            raise ValueError(
                "HF_TOKEN not found. Please set the HF_TOKEN environment variable or pass it as an argument."
            )
        # Using the newer provider system for Llama 3.2
        self.client = InferenceClient(
            provider="auto",
            api_key=self.api_token,
        )
        self.model = "meta-llama/Llama-3.2-3B-Instruct"

    def _chat(self, messages: list, max_tokens: int, model: Optional[str] = None) -> str:
        """
        Send role-tagged chat messages and return the reply text.

        Unlike ``_generate_text`` this helper lets exceptions propagate, so
        each public method can apply its own fallback.

        Args:
            messages: List of ``{"role": ..., "content": ...}`` dicts.
            max_tokens: Maximum tokens to generate.
            model: Model id to query; defaults to ``self.model``.

        Returns:
            The content of the first choice, or ``""`` if the content is empty/None.
        """
        response = self.client.chat.completions.create(
            model=model or self.model,
            messages=messages,
            max_tokens=max_tokens,
            stream=False,
        )
        # Normalise a missing content to an empty string so callers can .strip() safely.
        return response.choices[0].message.content or ""

    def _generate_text(self, prompt: str, max_tokens: int = 200) -> str:
        """
        Generate text for a single user prompt (one attempt, no retries).

        Args:
            prompt: The input prompt, sent as a single user message.
            max_tokens: Maximum tokens to generate.

        Returns:
            The generated text. On failure the error is printed and a string
            starting with ``"Error generating response:"`` is returned instead
            of raising (kept for backward compatibility).
        """
        try:
            return self._chat([{"role": "user", "content": prompt}], max_tokens)
        except Exception as e:
            print(f"Error: {e}")
            return f"Error generating response: {e}"

    def moderate_query(self, query: str) -> bool:
        """
        Classify a query as safe or unsafe using ``MODERATOR_MODEL``.

        Args:
            query: The user's query.

        Returns:
            False if the model's reply contains ``UNSAFE``; True otherwise.
            Also True when the moderation request itself fails (fail-open, so
            an API outage does not block users).
        """
        messages = [
            {"role": "system", "content": self.MODERATION_SYSTEM_PROMPT},
            {"role": "user", "content": query},
        ]

        try:
            print(f"Sending moderation request to {self.MODERATOR_MODEL}...")
            result = self._chat(messages, max_tokens=5, model=self.MODERATOR_MODEL).strip().upper()
            print(f"Moderation result: {result}")

            # "SAFE" is a substring of "UNSAFE", so test for the unsafe verdict explicitly.
            return "UNSAFE" not in result

        except Exception as e:
            # Improved error logging to see exactly what's happening
            print(f"Moderation API Error: {repr(e)}")
            # If the API fails, we assume safe to keep the UX smooth
            return True

    def generate_response(self, query: str, system_prompt: str) -> str:
        """
        Generates a response with ``self.model`` via Hugging Face Inference API.

        Args:
            query: The user's query (sent as the user message).
            system_prompt: The system prompt with context and instructions
                (sent as the system message).

        Returns:
            The generated response, stripped of surrounding whitespace, or a
            fixed apology message if the request fails.
        """
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ]
        try:
            return self._chat(messages, max_tokens=500).strip()
        except Exception as e:
            # Log the detail server-side; never surface raw exception text to the user.
            print(f"Error during response generation: {e}")
            return "I'm sorry, but I encountered an error while trying to generate a response."

    def rewrite_query(self, query: str, system_prompt: str) -> str:
        """
        Rewrites a query with ``self.model`` via Hugging Face Inference API.

        Args:
            query: The user's query.
            system_prompt: The system prompt with instructions.

        Returns:
            The rewritten query with one layer of surrounding quotes removed.
            Falls back to the original ``query`` if the request fails or the
            model returns nothing usable.
        """
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"User query: '{query}'"},
        ]
        try:
            rewritten = self._chat(messages, max_tokens=200).strip()
        except Exception as e:
            print(f"Error during query rewrite: {e}")
            return query  # Fallback to original query on error

        # Remove potential quotes around the rewritten query
        for quote in ('"', "'"):
            if len(rewritten) >= 2 and rewritten[0] == quote and rewritten[-1] == quote:
                rewritten = rewritten[1:-1].strip()

        # An empty rewrite would make retrieval meaningless; keep the user's wording.
        return rewritten or query


# Module-wide client instance; stays None until get_api_client() is first called.
_api_client = None

def get_api_client() -> HuggingFaceInferenceAPI:
    """Return the shared HuggingFaceInferenceAPI instance, constructing it on first use."""
    global _api_client
    if _api_client is not None:
        return _api_client
    # First call: build the client once and keep it for every later caller.
    _api_client = HuggingFaceInferenceAPI()
    return _api_client


def moderate_query(query: str) -> bool:
    """
    Run the moderation check for a query through the shared API client.

    Args:
        query: The user's query.

    Returns:
        Whatever the client's ``moderate_query`` reports: True if the query
        is considered safe, False otherwise.
    """
    print("Moderating query...")
    # Delegate straight to the lazily created shared client.
    return get_api_client().moderate_query(query)

def generate_response(query: str, retrieved_docs: list, history: list) -> str:
    """
    Generates a response via the shared Hugging Face Inference API client,
    instructing the model to answer only from the retrieved documents.

    Args:
        query: The user's query.
        retrieved_docs: A list of document contents (strings). ``None`` or an
            empty list yields an empty document section.
        history: The chat history from Gradio. Accepts message dicts with
            ``"role"``/``"content"`` keys, legacy ``(user, bot)`` pairs, or
            ``None``/empty for no history.

    Returns:
        The generated response.
    """
    # NOTE: rules 7-9 must not end in a backslash; a trailing backslash is a line
    # continuation inside the literal and would splice rules 7-10 onto one line.
    system_prompt = """You are a specialized product inquiry assistant. \
        Your primary and ONLY role is to answer user questions based on \
            the 'Retrieved Documents' provided below.

            Follow these rules strictly:
            1.  Base your entire response on the information found within the 'Retrieved Documents'. \
                Do not use any external knowledge.
            2.  If there are no documents or \
                the documents do not contain the information needed to answer the query, \
                you MUST respond with: \"I'm sorry, but I cannot answer your question with the information I have.\"
            3.  If the documents contain relevant information, use it to construct a clear and concise answer.
                The documents may include metadata such as price, product name, brand, and category.
                The documents may also include product descriptions and features.
                The documents may include customer reviews which can be used to answer questions \
                    about product quality and user satisfaction.
            4.  Some documents may not be fully relevant; \
                carefully select and synthesize information only from the relevant parts.
            5.  Do not fabricate or assume any information not present in the documents.
            6.  Analyze the chat history provided under 'Chat History' for conversational context, \
                but do not use it as a source for answers.
            7.  Respond in a friendly and helpful tone, with concise answers and directly related to the query.
            8.  Make sure to ask the user relevant follow-up questions.
            9.  Always format prices with a dollar sign and two decimal places.
            10. Do not use the term 'Retrieved Documents' in your response. It is only for your reference.
            

            Retrieved Documents:
            ```
            {context}
            ```

            Chat History:
            {chat_history}
            """

    context = "\n\n---\n\n".join(retrieved_docs or [])

    # Format chat history for the prompt. Collect lines and join once instead of
    # repeated string concatenation.
    history_lines = []
    for msg in history or []:
        if isinstance(msg, dict):
            role = msg.get("role")
            if role == "user":
                history_lines.append(f"User: {msg.get('content', '')}\n")
            elif role == "assistant":
                history_lines.append(f"Assistant: {msg.get('content', '')}\n")
        elif isinstance(msg, (list, tuple)) and len(msg) == 2:
            # Legacy Gradio format: one (user_message, bot_message) pair per turn;
            # either side may be None.
            user_msg, bot_msg = msg
            if user_msg is not None:
                history_lines.append(f"User: {user_msg}\n")
            if bot_msg is not None:
                history_lines.append(f"Assistant: {bot_msg}\n")
    formatted_history = "".join(history_lines)

    # Only the template is passed through str.format, so braces inside the
    # documents or the history are inserted verbatim.
    prompt = system_prompt.format(context=context, chat_history=formatted_history)

    client = get_api_client()
    return client.generate_response(query, prompt)



def rewrite_query(query: str, history: list) -> str:
    """
    Rewrites a conversational query into a self-contained query using the chat history
    via Hugging Face Inference API.

    Args:
        query: The user's potentially vague query.
        history: The chat history from Gradio. Accepts message dicts with
            ``"role"``/``"content"`` keys, legacy ``(user, bot)`` pairs, or
            ``None``/empty for no history.

    Returns:
        A self-contained query, as returned by the shared client's
        ``rewrite_query``.
    """
    system_prompt = """You are an expert at query rewriting. Your task is to rewrite a given 'user query' \
        into a self-contained, specific query that can be understood without the context of the 'chat history'.
        
        Follow these rules strictly:
        1.  Analyze the 'chat history' to understand the context of the conversation.
        2.  Identify any pronouns (e.g., 'it', 'its', 'they', 'that') or vague references in the 'user query'.
        3.  Replace these pronouns and vague references with the specific entities or topics they refer to from the chat history.
        4.  If the 'user query' is already self-contained and specific, return it unchanged.
        5.  CRITICAL: If the 'user query' is about a completely new topic not covered in the chat history, \
            you MUST return it unchanged. Do NOT try to connect it to the previous conversation.
        6.  The rewritten query should be a single, clear question or statement.
        7.  Output ONLY the rewritten query, with no extra text, labels, or explanations.

        Here are some examples of how to behave:

        ---
        Example 1: Rewriting a contextual query
        Chat History:
        User: Do you have the TechPro Ultrabook in stock?
        Assistant: Yes, the TechPro Ultrabook (TP-UB100) is available.
        User query: 'Tell me about its warranty.'
        Rewritten query: 'What is the warranty for the TechPro Ultrabook (TP-UB100)?'
        ---
        Example 2: Handling a topic change
        Chat History:
        User: Do you have the TechPro Ultrabook in stock?
        Assistant: Yes, the TechPro Ultrabook (TP-UB100) is available.
        User query: 'Okay, do you have any monitors?'
        Rewritten query: 'Okay, do you have any monitors?'
        ---
        Example 3: Handling a self-contained query
        Chat History:
        User: What's the price of the BlueWave Gaming Laptop?
        Assistant: The BlueWave Gaming Laptop (BW-GL200) is $1299.99.
        User query: 'What is the price of the GameSphere X console?'
        Rewritten query: 'What is the price of the GameSphere X console?'
        ---

        Chat History:
        {chat_history}
        """

    # Format chat history for the prompt. Collect lines and join once instead of
    # repeated string concatenation.
    history_lines = []
    for msg in history or []:
        if isinstance(msg, dict):
            role = msg.get("role")
            if role == "user":
                history_lines.append(f"User: {msg.get('content', '')}\n")
            elif role == "assistant":
                history_lines.append(f"Assistant: {msg.get('content', '')}\n")
        elif isinstance(msg, (list, tuple)) and len(msg) == 2:
            # Legacy Gradio format: one (user_message, bot_message) pair per turn;
            # either side may be None.
            user_msg, bot_msg = msg
            if user_msg is not None:
                history_lines.append(f"User: {user_msg}\n")
            if bot_msg is not None:
                history_lines.append(f"Assistant: {bot_msg}\n")
    formatted_history = "".join(history_lines)

    # Only the template is passed through str.format, so braces inside the
    # history text are inserted verbatim.
    prompt = system_prompt.format(chat_history=formatted_history)

    client = get_api_client()
    return client.rewrite_query(query, prompt)