Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files
First version with working UI. Needs some improvements in the results.
- app.py +78 -0
- indexing.py +62 -0
- utils.py +65 -0
app.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Streamlit chat UI for Aido: RAG-assisted Q&A over a Pinecone knowledge base.

Flow per submitted query:
  1. Refine the raw query using the recent conversation (query_refiner).
  2. Retrieve matching context from Pinecone (find_match).
  3. Answer with a windowed-memory ConversationChain grounded on that context.
"""

import os

import streamlit as st
from dotenv import load_dotenv
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
    MessagesPlaceholder,
)
from langchain_openai import ChatOpenAI

from utils import find_match, query_refiner, get_conversation_string

load_dotenv()

st.subheader("Aido-We assist Universities for recruiting International students")

# Session state: responses[0] is the greeting, so requests[i] pairs with responses[i+1].
if 'responses' not in st.session_state:
    st.session_state['responses'] = ["How can I assist you?"]

if 'requests' not in st.session_state:
    st.session_state['requests'] = []

llm = ChatOpenAI(model_name="gpt-4o-mini", api_key=os.getenv('OPENAI_API_KEY'))

# Keep only the last 3 exchanges in memory to bound prompt size across reruns.
if 'buffer_memory' not in st.session_state:
    st.session_state.buffer_memory = ConversationBufferWindowMemory(k=3, return_messages=True)

system_msg_template = SystemMessagePromptTemplate.from_template(template="""Answer the question as truthfully as possible using the provided context,
and if the answer is not contained within the text below, say 'I don't know'""")

human_msg_template = HumanMessagePromptTemplate.from_template(template="{input}")

prompt_template = ChatPromptTemplate.from_messages(
    [system_msg_template, MessagesPlaceholder(variable_name="history"), human_msg_template])

conversation = ConversationChain(memory=st.session_state.buffer_memory, prompt=prompt_template, llm=llm, verbose=True)

# container for chat history
response_container = st.container()
# container for text box
textcontainer = st.container()

with textcontainer:
    # Multi-line text area instead of a single-line input so long questions stay visible.
    query = st.text_area(
        "Query: ",
        key="input",
        height=100,  # Initial height
        max_chars=None,  # No character limit
        help="Type your question here.",
        placeholder="● What are some concerns students from Algeria have about studying in the USA?"
    )

    # Process the query only on an explicit submit, not on every Streamlit rerun.
    submit_button = st.button("Submit")

    # Fix: ignore whitespace-only submissions (previously any non-empty string triggered a call).
    if submit_button and query.strip():
        with st.spinner("typing..."):
            conversation_string = get_conversation_string()
            refined_query = query_refiner(conversation_string, query)
            st.subheader("Refined Query:")
            st.write(refined_query)
            context = find_match(refined_query)
            response = conversation.predict(input=f"Context:\n {context} \n\n Query:\n{query}")
            st.session_state.requests.append(query)
            st.session_state.responses.append(response)

with response_container:
    if st.session_state['responses']:
        for i in range(len(st.session_state['responses'])):
            # Using Streamlit's native chat message functionality instead of streamlit_chat
            with st.chat_message("assistant"):
                st.write(st.session_state['responses'][i])
            if i < len(st.session_state['requests']):
                with st.chat_message("user"):
                    st.write(st.session_state["requests"][i])
|
indexing.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""One-off indexing script: load documents, chunk them, embed with OpenAI,
and upsert them into the Pinecone index "aido".

Run directly; it finishes with a smoke-test similarity query and prints the hits.
"""

import os

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

load_dotenv()

# Source directory for raw documents; generalized so the hard-coded Windows
# path is only a fallback — override with the DATA_DIR environment variable.
directory = os.getenv("DATA_DIR", "D:/Projects/Aido/data")


def load_docs(directory):
    """Load every document found under *directory*."""
    loader = DirectoryLoader(directory)
    return loader.load()


documents = load_docs(directory)
print(f"Number of documents in the dataset: {len(documents)}")


def split_docs(documents, chunk_size=400, chunk_overlap=150):
    """Split *documents* into overlapping chunks suitable for embedding."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(documents)


docs = split_docs(documents)
print(f"There are total of {len(docs)} chunks derived from {len(documents)} document")

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))  # next to api key in console

index_name = "aido"
if index_name not in pc.list_indexes().names():
    # Fix: pinecone v3+ requires a spec argument; ServerlessSpec was imported
    # but never passed, so create_index raised before any index existed.
    pc.create_index(
        name=index_name,
        dimension=1536,  # dimensionality of text-embedding-ada-002
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
pinecone_index = pc.Index(index_name)

# Embed all chunks and upsert them into the index.
index = PineconeVectorStore.from_documents(
    docs,
    embeddings,
    index_name=index_name
)


def get_similiar_docs(query, k=3, score=False):
    """Return the top-*k* chunks similar to *query*; include scores when *score* is True.

    NOTE(review): name keeps the original "similiar" spelling — callers may rely on it.
    """
    if score:
        return index.similarity_search_with_score(query, k=k)
    return index.similarity_search(query, k=k)


# Smoke test: query the freshly built index and show what comes back.
query = "What do students from Albino doubtful on their return on investment when considering studying in the USA?"
similar_docs = get_similiar_docs(query)
print(similar_docs)
|
utils.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Helpers for the Streamlit app: Pinecone retrieval, query refinement,
and conversation-history serialization."""

import os

import streamlit as st
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

load_dotenv()

# Initialize OpenAI client
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Initialize embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", api_key=os.getenv('OPENAI_API_KEY'))

# Initialize Pinecone
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

# Check if index exists and connect to it
index_name = "aido-hybrid"
if index_name not in pc.list_indexes().names():
    print("Creating a new Pinecone index...")
    # Fix: pinecone v3+ create_index requires a spec; without it the call raises.
    pc.create_index(
        name=index_name,
        dimension=1536,  # dimensionality of text-embedding-ada-002
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Connect to the existing Pinecone index
index = pc.Index(index_name)


def find_match(input):
    """Return the text of the best-matching chunks for *input*, newline-joined.

    Fix: previously indexed matches[0] and matches[1] unconditionally and
    raised IndexError whenever Pinecone returned fewer than two hits.
    """
    # Get embeddings for the input query
    input_em = embeddings.embed_query(input)

    # Query Pinecone
    result = index.query(vector=input_em, top_k=5, include_metadata=True)

    # Use up to the top 2 matches, tolerating empty results or missing metadata.
    texts = [m['metadata'].get('text', '') for m in result['matches'][:2]]
    return "\n".join(texts)


def query_refiner(conversation, query):
    """Rewrite *query* into a standalone question using the conversation log."""
    # Using the new ChatCompletion API instead of the deprecated Completion API
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system",
             "content": "You are a helpful assistant that refines user queries based on conversation context."},
            {"role": "user",
             "content": f"Given the following user query and conversation log, formulate a question that would be the most relevant to provide the user with an answer from a knowledge base.\n\nCONVERSATION LOG: \n{conversation}\n\nQuery: {query}\n\nRefined Query:"}
        ],
        temperature=0.7,
        max_tokens=256
    )
    return response.choices[0].message.content


def get_conversation_string():
    """Serialize the chat history as alternating Human/Bot lines.

    responses[0] is the greeting, so request i pairs with response i+1.
    Fix: zip() tolerates a requests list shorter than expected instead of
    raising IndexError as the index-based loop did.
    """
    parts = []
    for req, resp in zip(st.session_state['requests'], st.session_state['responses'][1:]):
        parts.append("Human: " + req + "\n")
        parts.append("Bot: " + resp + "\n")
    return "".join(parts)
|