dhanvanth183 committed on
Commit
7f02a59
·
verified ·
1 Parent(s): ec4bd3d

Upload 3 files

Browse files

First version with a working UI. The results still need some improvement.

Files changed (3) hide show
  1. app.py +78 -0
  2. indexing.py +62 -0
  3. utils.py +65 -0
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
    MessagesPlaceholder
)
import streamlit as st
from utils import find_match, query_refiner, get_conversation_string

from dotenv import load_dotenv
import os

load_dotenv()

st.subheader("Aido-We assist Universities for recruiting International students")

# Seed the chat history: one greeting from the bot, no user requests yet.
# Invariant maintained below: len(responses) == len(requests) + 1.
if 'responses' not in st.session_state:
    st.session_state['responses'] = ["How can I assist you?"]

if 'requests' not in st.session_state:
    st.session_state['requests'] = []

llm = ChatOpenAI(model_name="gpt-4o-mini", api_key=os.getenv('OPENAI_API_KEY'))

# Keep only the last 3 exchanges in the model's conversational memory.
if 'buffer_memory' not in st.session_state:
    st.session_state.buffer_memory = ConversationBufferWindowMemory(k=3, return_messages=True)

system_msg_template = SystemMessagePromptTemplate.from_template(template="""Answer the question as truthfully as possible using the provided context,
and if the answer is not contained within the text below, say 'I don't know'""")

human_msg_template = HumanMessagePromptTemplate.from_template(template="{input}")

prompt_template = ChatPromptTemplate.from_messages(
    [system_msg_template, MessagesPlaceholder(variable_name="history"), human_msg_template])

conversation = ConversationChain(memory=st.session_state.buffer_memory, prompt=prompt_template, llm=llm, verbose=True)

# Layout: chat history above, input box below.
response_container = st.container()
input_container = st.container()

with input_container:
    # Multi-line text area instead of a single-line text input.
    query = st.text_area(
        "Query: ",
        key="input",
        height=100,  # initial height
        max_chars=None,  # no character limit
        help="Type your question here.",
        placeholder="● What are some concerns students from Algeria have about studying in the USA?"
    )

    # Only process the query once the user explicitly submits it.
    submitted = st.button("Submit")

    if submitted and query:
        with st.spinner("typing..."):
            # Rewrite the raw query against the running conversation,
            # then pull matching context from the vector store.
            conversation_string = get_conversation_string()
            refined_query = query_refiner(conversation_string, query)
            st.subheader("Refined Query:")
            st.write(refined_query)
            context = find_match(refined_query)
            response = conversation.predict(input=f"Context:\n {context} \n\n Query:\n{query}")
            st.session_state.requests.append(query)
            st.session_state.responses.append(response)

with response_container:
    # Render the history with Streamlit's native chat widgets
    # (replacement for the old streamlit_chat package).
    for i, answer in enumerate(st.session_state['responses']):
        with st.chat_message("assistant"):
            st.write(answer)
        if i < len(st.session_state['requests']):
            with st.chat_message("user"):
                st.write(st.session_state["requests"][i])
indexing.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
#from langchain_community.vectorstores import Pinecone
from dotenv import load_dotenv
import os
from langchain_pinecone import PineconeVectorStore

load_dotenv()

# Folder containing the raw documents to index.
directory = "D:/Projects/Aido/data"


def load_docs(directory):
    """Load every document found under *directory* via DirectoryLoader."""
    loader = DirectoryLoader(directory)
    documents = loader.load()
    return documents


documents = load_docs(directory)
print(f"Number of documents in the dataset: {len(documents)}")


def split_docs(documents, chunk_size=400, chunk_overlap=150):
    """Split *documents* into overlapping character chunks for embedding."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)
    return docs


docs = split_docs(documents)
print(f"There are total of {len(docs)} chunks derived from {len(documents)} document" )

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))  # next to api key in console

index_name = "aido"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # dimensionality of text-embedding-ada-002
        metric="cosine",
        # BUG FIX: the v3+ Pinecone client requires a deployment spec;
        # ServerlessSpec was imported but never passed, so create_index()
        # failed with a missing-argument error.
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
pinecone_index = pc.Index(index_name)


# Embed the chunks and upsert them into the Pinecone index.
index = PineconeVectorStore.from_documents(
    docs,
    embeddings,
    index_name=index_name
)


def get_similiar_docs(query, k=3, score=False):
    """Return the *k* chunks most similar to *query*.

    When *score* is true, (document, score) tuples are returned instead of
    bare documents.  NOTE: the misspelled name ("similiar") is kept for
    backward compatibility with existing callers.
    """
    if score:
        similar_docs = index.similarity_search_with_score(query, k=k)
    else:
        similar_docs = index.similarity_search(query, k=k)
    return similar_docs


# Smoke test of the freshly built index.
# NOTE(review): "Albino" looks like a garbled country name (app.py's
# placeholder uses "Algeria") — confirm the intended query text.
query = "What do students from Albino doubtful on their return on investment when considering studying in the USA?"
similar_docs = get_similiar_docs(query)
print(similar_docs)
utils.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
import streamlit as st
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

# OpenAI client used by query_refiner().
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Embedding model used to vectorize queries in find_match(); must match
# the model used at indexing time.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", api_key=os.getenv('OPENAI_API_KEY'))

# Initialize Pinecone
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

# Create the index on first run, then connect to it.
index_name = "aido-hybrid"
if index_name not in pc.list_indexes().names():
    print("Creating a new Pinecone index...")
    pc.create_index(
        name=index_name,
        dimension=1536,  # dimensionality of text-embedding-ada-002
        metric="cosine",
        # BUG FIX: the v3+ Pinecone client requires a deployment spec;
        # without it create_index() fails with a missing-argument error.
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Connect to the existing Pinecone index
index = pc.Index(index_name)
def find_match(input):
    """Return the stored text of the best Pinecone matches for *input*.

    The query is embedded with the same model used at indexing time and
    matched against the module-level Pinecone index.  Returns the 'text'
    metadata of up to two top matches joined by a newline; an empty string
    when the index returns no matches.
    """
    # Get embeddings for the input query.
    input_em = embeddings.embed_query(input)

    # Query Pinecone for the 5 nearest chunks with their metadata.
    result = index.query(vector=input_em, top_k=5, include_metadata=True)

    # BUG FIX: the original indexed matches[0] and matches[1] unconditionally,
    # raising IndexError whenever fewer than two matches came back.
    top_matches = result['matches'][:2]
    return "\n".join(m['metadata']['text'] for m in top_matches)
def query_refiner(conversation, query):
    """Rewrite *query* into a standalone question using the conversation log.

    Sends the conversation history plus the raw query to gpt-3.5-turbo
    (ChatCompletion API — the old Completion API is deprecated) and returns
    the model's refined question as a string.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that refines user queries based on conversation context.",
    }
    user_message = {
        "role": "user",
        "content": f"Given the following user query and conversation log, formulate a question that would be the most relevant to provide the user with an answer from a knowledge base.\n\nCONVERSATION LOG: \n{conversation}\n\nQuery: {query}\n\nRefined Query:",
    }
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message],
        temperature=0.7,
        max_tokens=256,
    )
    return completion.choices[0].message.content
59
+
60
+ def get_conversation_string():
61
+ conversation_string = ""
62
+ for i in range(len(st.session_state['responses']) - 1):
63
+ conversation_string += "Human: " + st.session_state['requests'][i] + "\n"
64
+ conversation_string += "Bot: " + st.session_state['responses'][i + 1] + "\n"
65
+ return conversation_string