File size: 4,355 Bytes
bb116be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import tiktoken
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate


# Human-message template for the RAG prompt. "{context}" and "{query}" are the
# two placeholders substituted when the prompt is formatted.
# NOTE(review): the instruction sentence contains typos ("provide context",
# "response with"). This is runtime prompt text, so change it deliberately
# rather than as a drive-by cleanup.
HUMAN_TEMPLATE = """
#CONTEXT:
{context}

QUERY:
{query}

Use the provide context to answer the provided user query. Only use the provided context to answer the query. If you do not know the answer, or it's not contained in the provided context response with "I don't know"
"""

#Read PDF data
def read_pdf_data(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: The PDF source, handed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        The ``extract_text()`` result of each page, joined in page order
        with no separator between pages.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    return "".join(page.extract_text() for page in reader.pages)

def tiktoken_len(text):
    """Return how many tokens *text* encodes to with tiktoken's "gpt-4" encoding."""
    encoder = tiktoken.encoding_for_model("gpt-4")
    return len(encoder.encode(text))

#Split data into chunks
def split_data(text, chunk_size=100, chunk_overlap=0):
    """Split *text* into chunks sized by token count.

    Chunk length is measured with ``tiktoken_len``, so ``chunk_size`` and
    ``chunk_overlap`` are expressed in tokens rather than characters.

    Args:
        text: The full text to split.
        chunk_size: Maximum length of a chunk as reported by
            ``tiktoken_len``. Defaults to 100.
        chunk_overlap: Length shared between consecutive chunks, in the
            same unit. Defaults to 0 (no overlap).

    Returns:
        The list of chunks produced by
        ``RecursiveCharacterTextSplitter.split_text``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=tiktoken_len,
    )
    return splitter.split_text(text)

#Create embeddings instance

def create_embeddings():
    """Return an ``OpenAIEmbeddings`` client for "text-embedding-3-small"."""
    return OpenAIEmbeddings(model="text-embedding-3-small")


# Create a vector database using Qdrant
def create_vector_store(embedding_model, chunks, embedding_dim=1536,
                        collection_name="lcel_doc_v2"):
    """Create a Qdrant collection, index *chunks* in it, and return the store.

    A fresh ``QdrantClient(":memory:")`` is created on every call, so each
    returned store is backed by its own client.

    Args:
        embedding_model: Embedding object passed to ``QdrantVectorStore`` and
            used to embed the chunks.
        chunks: Texts to add to the collection.
        embedding_dim: Vector size the collection is created with. It has to
            match the length of the vectors *embedding_model* produces.
            Defaults to 1536 (presumably the output size of the
            "text-embedding-3-small" model built by ``create_embeddings``;
            confirm when switching models).
        collection_name: Name of the Qdrant collection to create and use.

    Returns:
        The populated ``QdrantVectorStore``.
    """
    client = QdrantClient(":memory:")
    # One name is used for both creating and opening the collection so the
    # two calls cannot drift apart.
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
    )
    vector_store = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embedding_model,
    )
    vector_store.add_texts(texts=chunks)
    return vector_store

# create RAG
def create_rag(vector_store, k=3):
    """Assemble the retrieval-augmented generation chain.

    The chain retrieves from *vector_store*, fills the chat prompt (a fixed
    system message plus ``HUMAN_TEMPLATE``), calls the "gpt-3.5-turbo" chat
    model and parses the reply to a string.

    Args:
        vector_store: Store whose ``as_retriever`` supplies the context.
        k: Value passed as ``search_kwargs["k"]`` to the retriever, i.e. how
            many results to request per query. Defaults to 3.

    Returns:
        A runnable chain; invoke it with the user query.
    """
    openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")

    chat_prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a helpful assistant that answers questions based on the provided context."),
        ("human", HUMAN_TEMPLATE),
    ])

    retriever = vector_store.as_retriever(search_kwargs={"k": k})

    # The invoked input feeds both branches: the retriever produces
    # "context", while RunnablePassthrough forwards it unchanged as "query".
    return (
        {"context": retriever, "query": RunnablePassthrough()}
        | chat_prompt
        | openai_chat_model
        | StrOutputParser()
    )

# Invoke RAG
def invoke_rag(vector_store, query):
    """Answer *query* with a RAG chain built over *vector_store*.

    A new chain is built with ``create_rag`` on every call.

    Returns:
        Whatever the chain's ``invoke`` returns for *query*.
    """
    return create_rag(vector_store).invoke(query)


def get_ticket_category(query):
    """Classify a support query into one of the known ticket categories.

    The query is sent to the "gpt-3.5-turbo" chat model with a categorization
    prompt. The reply is then normalized (surrounding whitespace, quotes,
    backticks, asterisks and periods removed; case ignored) before being
    matched, so near-miss replies such as ``"it support"`` or
    ``"IT Support."`` still resolve to the intended category.

    Args:
        query: The user's ticket text.

    Returns:
        ``"HR Support"``, ``"IT Support"`` or ``"Transportation Support"``
        when the reply matches one of them, otherwise ``"Other"``.
    """
    # Define the system prompt for categorization
    CATEGORY_PROMPT = """You are a ticket categorization system. Categorize the following query into exactly one of these categories:
    - HR Support: For queries about employment, benefits, leaves, workplace policies, etc.
    - IT Support: For queries about software, hardware, network, system access, etc.
    - Transportation Support: For queries about company transport, parking, vehicle maintenance, etc.
    - Other: For queries that do not fit into the above categories.
    Respond with ONLY the category name, nothing else.
    
    Query: {query}
    """

    client = ChatOpenAI(model="gpt-3.5-turbo")
    prompt = ChatPromptTemplate.from_messages([
        ("system", CATEGORY_PROMPT)
    ])
    chain = prompt | client | StrOutputParser()

    raw_reply = chain.invoke({"query": query})

    # Map a case-folded name back to its canonical spelling so callers always
    # receive one of the exact category strings.
    valid_categories = ("HR Support", "IT Support", "Transportation Support")
    canonical = {name.casefold(): name for name in valid_categories}

    # Models sometimes wrap the answer in quotes/markdown or add a period;
    # strip that decoration before matching.
    cleaned = raw_reply.strip().strip("\"'`*.").strip().casefold()

    # Anything unrecognized (including an explicit "Other") is "Other".
    return canonical.get(cleaned, "Other")