import tiktoken
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

HUMAN_TEMPLATE = """
CONTEXT:
{context}

QUERY:
{query}

Use the provided context to answer the provided user query. Only use the
provided context to answer the query. If you do not know the answer, or it is
not contained in the provided context, respond with "I don't know".
"""


# Read PDF data
def read_pdf_data(pdf_file):
    # Create PDF reader object
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Extract text from each page
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text


def tiktoken_len(text):
    # Count tokens with the same tokenizer the chat model uses
    tokens = tiktoken.encoding_for_model("gpt-4").encode(text)
    return len(tokens)


# Split data into chunks
def split_data(text):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=100,
        chunk_overlap=0,
        length_function=tiktoken_len,
    )
    chunks = text_splitter.split_text(text)
    return chunks


# Create embeddings instance
def create_embeddings():
    embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
    return embedding_model


# Create a vector database using Qdrant
def create_vector_store(embedding_model, chunks):
    embedding_dim = 1536  # text-embedding-3-small produces 1536-dimensional vectors

    client = QdrantClient(":memory:")
    client.create_collection(
        collection_name="lcel_doc_v2",
        vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
    )

    vector_store = QdrantVectorStore(
        client=client,
        collection_name="lcel_doc_v2",
        embedding=embedding_model,
    )
    _ = vector_store.add_texts(texts=chunks)
    return vector_store


# Create RAG chain
def create_rag(vector_store):
    # Initialize OpenAI chat model with a valid model name
    openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")

    # Create chat prompt template
    chat_prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a helpful assistant that answers questions based on the provided context."),
        ("human", HUMAN_TEMPLATE),
    ])

    # Set up retriever with configurable k
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})

    # Create RAG pipeline
    simple_rag = (
        {"context": retriever, "query": RunnablePassthrough()}
        | chat_prompt
        | openai_chat_model
        | StrOutputParser()
    )
    return simple_rag


# Invoke RAG
def invoke_rag(vector_store, query):
    rag_chain = create_rag(vector_store)
    response = rag_chain.invoke(query)
    return response
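
# Optional refinement (sketch, not wired into create_rag above): the retriever
# returns a list of Document objects, which the prompt template stringifies
# wholesale, metadata included. Piping the retriever through a small formatter
# keeps only the page contents in the context window.
def format_docs(docs):
    """Join retrieved Document page contents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)

# Hypothetical usage inside the LCEL chain in create_rag:
#   {"context": retriever | format_docs, "query": RunnablePassthrough()}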
def get_ticket_category(query):
    # Define the system prompt for categorization
    CATEGORY_PROMPT = """You are a ticket categorization system. Categorize the following query into exactly one of these categories:

- HR Support: For queries about employment, benefits, leaves, workplace policies, etc.
- IT Support: For queries about software, hardware, network, system access, etc.
- Transportation Support: For queries about company transport, parking, vehicle maintenance, etc.
- Other: For queries that do not fit into the above categories.

Respond with ONLY the category name, nothing else.

Query: {query}
"""

    # Create the chat model for categorization
    client = ChatOpenAI(model="gpt-3.5-turbo")

    # Create the prompt template
    prompt = ChatPromptTemplate.from_messages([
        ("system", CATEGORY_PROMPT),
    ])

    # Create the chain
    chain = prompt | client | StrOutputParser()

    # Get the category
    category = chain.invoke({"query": query})

    # Clean and validate the response
    category = category.strip()
    valid_categories = ["HR Support", "IT Support", "Transportation Support"]
    if category not in valid_categories:
        return "Other"  # Default category if classification fails
    return category
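
# Example usage (illustrative sketch): wires the helpers above end to end.
# Assumes OPENAI_API_KEY is set in the environment; "sample.pdf" and the
# query below are hypothetical placeholders — substitute your own.
if __name__ == "__main__":
    with open("sample.pdf", "rb") as pdf_file:
        text = read_pdf_data(pdf_file)

    chunks = split_data(text)
    embedding_model = create_embeddings()
    vector_store = create_vector_store(embedding_model, chunks)

    query = "How do I request a parking pass?"
    print(f"Category: {get_ticket_category(query)}")
    print(f"Answer: {invoke_rag(vector_store, query)}")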