# --- Imports, Logging & Reproducibility ---
import os

# The OpenMP runtime may read OMP_NUM_THREADS as soon as numpy/torch are
# imported, so the thread cap has to be in the environment before those
# imports run; setting it afterwards can silently have no effect.
os.environ['OMP_NUM_THREADS'] = '4'

import random
import logging
import numpy as np
import torch
import nest_asyncio
import pandas as pd
import gradio as gr
from typing import List

# Llama-Index & Transformers
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    Settings,
    QueryBundle
)
from llama_index.core.schema import Document, NodeWithScore
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.retrievers import BaseRetriever
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from huggingface_hub import login
import qdrant_client
from llama_index.core.query_engine import RetrieverQueryEngine

# Configure logging
logging.basicConfig(
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.INFO
)
logger = logging.getLogger(__name__)

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
nest_asyncio.apply()

# --- Hugging Face Spaces Configuration ---
# HF_TOKEN, QDRANT_HOST, and QDRANT_API_KEY should be set as Space Secrets
HF_TOKEN = os.getenv("HF_TOKEN")
QDRANT_HOST = os.getenv("QDRANT_HOST")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

# Fail fast at start-up: nothing below can work without all three secrets.
if not QDRANT_HOST or not QDRANT_API_KEY or not HF_TOKEN:
    raise EnvironmentError("Please set QDRANT_HOST, QDRANT_API_KEY, and HF_TOKEN as Space Secrets.")

login(token=HF_TOKEN)

# --- Qdrant Connection and Collection Setup ---
qdrant = qdrant_client.QdrantClient(
    url=QDRANT_HOST,
    api_key=QDRANT_API_KEY,
    prefer_grpc=False
)
COLLECTION_NAME = "C2C_RAG"

# --- RAG Components Setup ---
device = "cuda" if torch.cuda.is_available() else "cpu"
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-large-en-v1.5",
    device=device
)

# The documents are expected to be indexed into the Qdrant collection ahead of
# time; this script only loads that existing collection. BM25 needs the raw
# nodes, which are not available here, so `bm25_nodes` is an empty placeholder.
bm25_nodes = []

# Qdrant-backed vector store (read-only for this case)
vector_store = QdrantVectorStore(
    client=qdrant,
    collection_name=COLLECTION_NAME,
    prefer_grpc=False
)

# Load index from the existing vector store
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=embed_model,
)

# --- Define Hybrid Retriever & Reranker ---
# No LlamaIndex-managed LLM: generation is done by the transformers pipeline.
Settings.llm = None


class HybridRetriever(BaseRetriever):
    """Combine a dense retriever with an optional BM25 retriever.

    Both retrievers are asked for ``similarity_top_k`` hits. The two ranked
    lists are merged with reciprocal-rank fusion so that BM25 hits can
    actually make it into the final top-k instead of always being truncated
    behind the dense hits. When ``bm25`` is ``None`` the result is simply the
    dense ranking, de-duplicated and truncated.

    Args:
        dense: Retriever exposing ``retrieve()`` and ``similarity_top_k``.
        bm25: Optional second retriever with the same interface, or ``None``.
        similarity_top_k: Number of hits requested from each retriever and
            returned after fusion.
    """

    # Conventional damping constant for reciprocal-rank fusion.
    _RRF_K = 60

    def __init__(self, dense, bm25, similarity_top_k=10):
        super().__init__()
        self.dense = dense
        self.bm25 = bm25
        self.similarity_top_k = similarity_top_k

    def _run(self, retriever, query_bundle, top_k, label, level):
        """Query one retriever; log and return ``[]`` on failure (best effort)."""
        try:
            retriever.similarity_top_k = top_k
            return retriever.retrieve(query_bundle)
        except Exception as exc:  # deliberate: one failing retriever must not kill the chat
            logger.log(level, "%s retrieval error: %s", label, exc)
            return []

    def _fuse(self, ranked_lists, top_k):
        """Merge ranked hit lists by reciprocal-rank fusion, unique per node id."""
        fused_score = {}
        first_hit = {}
        for hits in ranked_lists:
            for rank, hit in enumerate(hits, start=1):
                node_id = hit.node.node_id
                fused_score[node_id] = fused_score.get(node_id, 0.0) + 1.0 / (self._RRF_K + rank)
                # Keep the first occurrence (dense before BM25) as the representative hit.
                first_hit.setdefault(node_id, hit)
        # sorted() is stable, so ties keep first-seen order.
        ordered_ids = sorted(first_hit, key=fused_score.__getitem__, reverse=True)
        return [first_hit[node_id] for node_id in ordered_ids[:top_k]]

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Return up to ``similarity_top_k`` unique hits for ``query_bundle``."""
        # The value may arrive from a UI slider as a float; slicing needs an int.
        top_k = int(self.similarity_top_k)
        dense_hits = self._run(self.dense, query_bundle, top_k, "Dense", logging.ERROR)
        bm25_hits = []
        if self.bm25 is not None:
            bm25_hits = self._run(self.bm25, query_bundle, top_k, "BM25", logging.WARNING)
        return self._fuse([dense_hits, bm25_hits], top_k)


# Instantiate retrievers
dense_retriever = index.as_retriever(similarity_top_k=10)
bm25_retriever = None
logger.warning("BM25 retriever is disabled as the original data is not available in the Space.")

hybrid_retriever = HybridRetriever(dense=dense_retriever, bm25=bm25_retriever)

reranker = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-2-v2",
    top_n=4
)

# The query engine is built without an explicit `llm` argument.
# RetrieverQueryEngine is already imported at the top of the file.
query_engine = RetrieverQueryEngine(
    retriever=hybrid_retriever,
    node_postprocessors=[reranker],
)

# --- Load & Quantize LLaMA Model ---
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
llm = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quant_config,
    device_map="auto"
)
generator = pipeline(
    task="text-generation",
    model=llm,
    tokenizer=tokenizer,
    device_map="auto"
)

# --- Chatbot Logic & Gradio Interface (Improved) ---
SYSTEM_PROMPT = (
    "You are a friendly and helpful Level 0 IT Support Assistant. "
    "If the user's question lacks details or clarity, ask a concise follow-up question "
    "to gather the information you need before providing a solution. "
    "Once clarified, then:\n"
    "Your purpose is to provide simple, step-by-step solutions for common, entry-level technical issues. "
    "Examples of Level 0 issues include: forgotten passwords, basic printer problems, network connectivity checks, or simple software reinstallation. "
    "Do not answer questions about booking tickets, Level 1 or Level 2 support, or advanced technical configurations. "
    "If a user's question is beyond your scope (e.g., requires access to internal systems, involves advanced troubleshooting, or is not a basic IT issue), politely state that it's a higher-level issue and advise them to contact the dedicated IT support team directly. "
    "Always maintain a conversational tone and end with a polite closing."
)

# Llama 3 chat markers. The official prompt format puts a blank line ("\n\n")
# between each role header and the message content.
HDR = {
    "sys": "<|start_header_id|>system<|end_header_id|>\n\n",
    "usr": "<|start_header_id|>user<|end_header_id|>\n\n",
    "ast": "<|start_header_id|>assistant<|end_header_id|>\n\n",
    "eot": "<|eot_id|>"
}

# Process-wide fallback history, used only when chat() is called without an
# explicit `history`. The Gradio UI passes per-session history instead so that
# different visitors never see each other's turns.
chat_history = []

GREETINGS = {"hello", "hi", "hey", "good morning", "good afternoon", "good evening"}
OUT_OF_SCOPE_KEYWORDS = ["book tickets", "level 1", "level 2", "advanced configuration", "request a laptop", "purchase software"]

# How many previous (user, assistant) turns are replayed into the prompt.
HISTORY_TURNS = 3


def is_out_of_scope(query):
    """Return True if ``query`` contains any out-of-scope keyword (case-insensitive)."""
    lowered = query.lower()
    return any(keyword in lowered for keyword in OUT_OF_SCOPE_KEYWORDS)


def format_history(history):
    """Render ``(user, assistant)`` pairs as consecutive Llama 3 chat turns."""
    return "".join(
        f"{HDR['usr']}{u}{HDR['eot']}{HDR['ast']}{a}{HDR['eot']}"
        for u, a in history
    )


def _canned_reply(query):
    """Return a fixed reply for greetings, too-short or out-of-scope queries, else None."""
    # Ignore trailing punctuation so "Hello!" is still treated as a greeting.
    normalized = query.strip().lower().rstrip("!.?, ")
    if normalized in GREETINGS:
        return "Hello there! How can I help with your IT support question today?"
    if len(query.strip().split()) < 3:
        return "Could you provide more detail about what you're experiencing? Any error messages or steps you've tried will help me assist you."
    if is_out_of_scope(query):
        return "I apologize, but that seems to be a question for our dedicated IT support team. I can only assist with Level 0 issues like password resets or basic connectivity problems. Please contact them directly for help."
    return None


def _build_prompt(query, context_str, past_turns):
    """Assemble the Llama 3 prompt: system, recent history, then one user turn."""
    # No literal <|begin_of_text|>: the pipeline's tokenizer adds the BOS token
    # itself, so spelling it out here would most likely yield a doubled BOS.
    # Context and question share a single user turn so roles keep alternating.
    return (
        f"{HDR['sys']}{SYSTEM_PROMPT}{HDR['eot']}"
        f"{format_history(past_turns[-HISTORY_TURNS:])}"
        f"{HDR['usr']}Context:\n{context_str}\n\nQuestion: {query}{HDR['eot']}"
        f"{HDR['ast']}"
    )


def chat(query, k, temperature, top_p, history=None):
    """Answer one user message with retrieval-augmented generation.

    Args:
        query: The user's message.
        k: Number of context hits to retrieve (cast to ``int``).
        temperature: Sampling temperature passed to the generator.
        top_p: Nucleus-sampling threshold passed to the generator.
        history: Optional list of previous ``(user, assistant)`` pairs for
            this conversation. When given, it is read but NOT modified and the
            caller is responsible for recording the new turn. When omitted,
            the module-level ``chat_history`` is used and updated, as before.

    Returns:
        The assistant's reply as a string.
    """
    use_global = history is None
    past_turns = chat_history if use_global else list(history)

    reply = _canned_reply(query)
    if reply is None:
        # Slider values may arrive as floats; the retriever slices with this.
        query_engine.retriever.similarity_top_k = int(k)
        response = query_engine.query(query)
        # Only the retrieved + reranked nodes are used from the response.
        context_nodes = response.source_nodes
        context_str = "\n---\n".join(node.text for node in context_nodes) if context_nodes else ""

        prompt = _build_prompt(query, context_str, past_turns)
        gen_args = {
            "do_sample": True,
            "max_new_tokens": 356,  # Now using the 356 token limit
            "temperature": temperature,
            "top_p": top_p,
            "pad_token_id": tokenizer.eos_token_id,
            # Return only the newly generated text so the answer does not have
            # to be cut out of the echoed prompt by string splitting.
            "return_full_text": False,
        }
        output = generator(prompt, **gen_args)
        reply = output[0]["generated_text"].strip()

    if use_global:
        chat_history.append((query, reply))
    return reply


# Gradio Interface
with gr.Blocks(theme=gr.themes.Soft(), title="💬 Level 0 IT Support Chatbot") as demo:
    gr.Markdown("### 🤖 Level 0 IT Support Chatbot (RAG + Qdrant + LLaMA3)")
    chatbot = gr.Chatbot(label="Chat", height=500)
    # Per-session conversation history as [message, reply] pairs.
    state = gr.State([])
    inp = gr.Textbox(placeholder="Ask your IT support question...", label="Your Message", lines=2)
    with gr.Row():
        send_btn = gr.Button("Send", variant="primary")
        clear_btn = gr.Button("Clear", variant="secondary")
    with gr.Accordion("Advanced Settings", open=False):
        k_slider = gr.Slider(minimum=1, maximum=20, value=10, step=1, label="Context Hits (k)")
        temp_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")
        top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="Top-p")

    def respond(message, history, k_val, temp_val, top_p_val):
        """Handle one submitted message: get a reply and extend the session history."""
        history = list(history or [])
        # Pass the session's own history so conversations stay isolated per user.
        reply = chat(message, k_val, temp_val, top_p_val, history=history)
        history.append([message, reply])
        # Clear the textbox; update both the visible chat and the stored state.
        return "", history, history

    inputs = [inp, state, k_slider, temp_slider, top_p_slider]
    inp.submit(respond, inputs, [inp, chatbot, state])
    send_btn.click(respond, inputs, [inp, chatbot, state])
    # Reset the textbox, chat, stored history and all sliders to their defaults.
    clear_btn.click(
        lambda: ("", [], [], 10, 0.7, 0.9),
        None,
        [inp, chatbot, state, k_slider, temp_slider, top_p_slider],
        queue=False
    )

if __name__ == "__main__":
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)