mbhoge committed on
Commit
b2d921d
·
1 Parent(s): 605b47c

Learning Path Index Files

Browse files

First Commit of the Learning Path Index Context Based Search Project

.env_template ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copy this template to `.env` and fill in your keys; that copy won't become part of the git history as long as it is listed in
2
+ # the .gitignore file, and it should stay like that
3
+ OPENAI_API_KEY=<yourOpenAIAPI key>
4
+ PINECONE_API_KEY=<yourPineCone key>
.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Ignore all files with .env extension in any directory
2
+ **/*.env
3
+
4
+ # Ignore all .env files in the root directory and its subdirectories
5
+ .env
6
+
7
+ __*/**
8
+ faiss_learning_path_index/
Learning_Pathway_Index.csv ADDED
The diff for this file is too large to render. See raw diff
 
ProjectArch.drawio ADDED
The diff for this file is too large to render. See raw diff
 
faiss_index.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from langchain.text_splitter import CharacterTextSplitter
4
+ from langchain.embeddings.openai import OpenAIEmbeddings
5
+ from langchain.document_loaders import TextLoader
6
+ from langchain.vectorstores import FAISS
7
+ from langchain.chains import RetrievalQA
8
+ from langchain.llms import OpenAI
9
+
10
def faiss_index():
    """Build a FAISS index from the learning-path CSV and run one sample query.

    Reads ``final_project/Learning_Pathway_Index.csv`` relative to the current
    working directory, splits it into chunks, embeds the chunks with
    ``OpenAIEmbeddings``, saves the index to ``faiss_learning_path_index``,
    reloads it from disk, and prints the answer to a hard-coded question.
    """
    # os.path.join picks the separator of the host OS; the previous
    # hard-coded "\\" pieces only resolved on Windows.
    data_path = os.path.join(
        os.getcwd(), "final_project", "Learning_Pathway_Index.csv"
    )
    documents = TextLoader(data_path).load()
    text_splitter = CharacterTextSplitter(
        chunk_size=1000, chunk_overlap=30, separator="\n"
    )
    docs = text_splitter.split_documents(documents=documents)

    embeddings = OpenAIEmbeddings()
    # One name for the index folder so save and load cannot drift apart.
    index_folder = "faiss_learning_path_index"
    vectorstore = FAISS.from_documents(docs, embeddings)
    vectorstore.save_local(index_folder)

    # Reload from disk so the query runs against the persisted index.
    new_vectorstore = FAISS.load_local(index_folder, embeddings)
    qa = RetrievalQA.from_chain_type(
        llm=OpenAI(),
        chain_type="stuff",
        retriever=new_vectorstore.as_retriever(),
    )
    res = qa.run("Give me Machine Learning Course with 10 or 20 min duration.")
    print(res)
28
+
29
+
30
+ if __name__ == "__main__":
31
+ faiss_index()
interface.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ # Define your Streamlit app and return the input variable
4
def app():
    """Render the search page and hand back the user's query text.

    Draws the page title, a tagline, and one text input, then returns
    whatever ``st.text_input`` yields for that input.
    """
    st.title("KaggleX Learning Path Index Search")
    st.write("Embark your Learning Path Journey with right search !!")
    return st.text_input("Enter your course query here")
20
+
21
+ # Run your Streamlit app
22
+ # if __name__ == "__main__":
23
+ # var = app()
24
+ # print(var)
main.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from datetime import datetime
4
+ import time
5
+ from langchain.llms import OpenAI
6
+ from langchain.document_loaders import TextLoader
7
+ from langchain.text_splitter import CharacterTextSplitter
8
+ from langchain.embeddings.openai import OpenAIEmbeddings
9
+ from langchain.chains import RetrievalQA
10
+ from langchain.prompts import PromptTemplate
11
+ from langchain.llms import OpenAI
12
+ from langchain.vectorstores import FAISS
13
+ from langchain.prompts import PromptTemplate
14
+
15
+ from interface import app
16
+ import streamlit as st
17
+ # Define GenerateLearningPathIndexEmbeddings class:
18
+ # - Load .csv file
19
+ # - Chunk text
20
+ # - Chunk size = 1000 characters
21
+ # - Chunk overlap = 30 characters
22
+ # - Create FAISS vector store from chunked text and OpenAI embeddings
23
+ # - Get FAISS vector store
24
+ # This class is used to generate the FAISS vector store from the .csv file.
25
class GenerateLearningPathIndexEmbeddings:
    """Create, or load from disk, the FAISS vector store for the CSV data.

    Constructing an instance does all the work: it loads ``.env``, reads
    ``OPENAI_API_KEY`` from the environment, chunks the CSV file, creates the
    OpenAI embeddings client, and then either builds and saves a new FAISS
    index or reuses the one already present on disk.
    """

    # Folder (relative to the working directory) the FAISS index is saved to
    # and loaded from. Kept in one place so both operations always agree.
    FAISS_VECTORSTORE_FOLDERNAME = "faiss_learning_path_index"

    def __init__(self, csv_filename):
        """Run the full pipeline for ``csv_filename``.

        Args:
            csv_filename: Path of the CSV file, joined onto the current
                working directory.
        """
        load_dotenv()  # Load .env file
        self.openai_api_key = os.getenv("OPENAI_API_KEY")
        self.data_path = os.path.join(os.getcwd(), csv_filename)
        self.our_custom_data = None    # chunked documents from the CSV
        self.openai_embeddings = None  # OpenAIEmbeddings client
        self.faiss_vectorstore = None  # loaded FAISS vector store

        self.load_csv_data()
        self.get_openai_embeddings()
        self.create_faiss_vectorstore_with_csv_data_and_openai_embeddings()

    def load_csv_data(self):
        """Load the CSV as text and split it into newline-separated chunks.

        Chunks are 1000 characters with a 30 character overlap; the result is
        stored in ``self.our_custom_data``.
        """
        print(' -- Started loading .csv file for chunking purposes.')
        document = TextLoader(self.data_path).load()
        text_splitter = CharacterTextSplitter(
            chunk_size=1000, chunk_overlap=30, separator="\n"
        )
        self.our_custom_data = text_splitter.split_documents(document)
        print(f' -- Finished spitting (i.e. chunking) text (i.e. documents) from the .csv file (i.e. {self.data_path}).')

    def get_openai_embeddings(self):
        """Create the embeddings client and store it on the instance."""
        self.openai_embeddings = OpenAIEmbeddings(
            openai_api_key=self.openai_api_key, request_timeout=60
        )

    def create_faiss_vectorstore_with_csv_data_and_openai_embeddings(self):
        """Build and save the FAISS index if it is missing, then load it.

        An existing index folder is treated as a cache and reused as-is. In
        both cases the store is loaded back from disk into
        ``self.faiss_vectorstore``.
        """
        folder = self.FAISS_VECTORSTORE_FOLDERNAME
        if not os.path.exists(folder):
            print(' -- Creating a new FAISS vector store from chunked text and OpenAI embeddings.')
            vectorstore = FAISS.from_documents(
                self.our_custom_data, self.openai_embeddings
            )
            vectorstore.save_local(folder)
            print(f' -- Saved the newly created FAISS vector store at "{folder}".')
        else:
            print(f' -- WARNING: Found existing FAISS vector store at "{folder}", loading from cache.')
            print(f' -- NOTE: Delete the FAISS vector store at "{folder}", if you wish to regenerate it from scratch for the next run.')
        # Load from the same folder that was just written or found.
        self.faiss_vectorstore = FAISS.load_local(folder, self.openai_embeddings)

    def get_faiss_vector_store(self):
        """Return the FAISS vector store loaded during construction."""
        return self.faiss_vectorstore
66
+
67
+
68
+ # https://discuss.streamlit.io/t/how-to-check-if-code-is-run-inside-streamlit-and-not-e-g-ipython/23439/7
69
def running_inside_streamlit():
    """
    Report whether this code is executing under a Streamlit script run.

    Returns
    -------
    bool
        True when Streamlit reports a script-run context, False when it
        reports none or when the Streamlit runtime module cannot be found.
    """
    try:
        from streamlit.runtime.scriptrunner import get_script_run_ctx
        return bool(get_script_run_ctx())
    except ModuleNotFoundError:
        return False
87
+
88
+
89
+ # Define GenAI class:
90
+ # - Create prompt template
91
+ # - Create GenAI project
92
+ # - Get response for query
93
+ # This class is used to get the response for a query from the GenAI project.
94
+ # The GenAI project is created from the FAISS vector store.
95
class GenAILearningPathIndex:
    """Answer learning-path queries with a RetrievalQA chain over a FAISS store.

    The prompt asks the model for the top 4 matches as a four-column table
    (Learning Pathway, duration, link, Module).
    """

    # Written with explicit newlines so the text sent to the model does not
    # depend on source indentation. The header row now carries its closing
    # "|" so it matches the separator and body rows of the example table.
    _PROMPT_TEMPLATE = (
        "\n"
        "Use the following template to answer the question at the end,\n"
        "from the Learning Path Index csv file,\n"
        "display top 4 results in a tabular format and it\n"
        "should look like this:\n"
        "| Learning Pathway | duration | link | Module |\n"
        "| --- | --- | --- | --- |\n"
        "| ... | ... | ... | ... |\n"
        "it must contain a link for each line of the result in a table,\n"
        "consider the duration and Module information mentioned in the question,\n"
        "If you don't know the answer, don't make an entry in the table,\n"
        "{context}\n"
        "Question: {question}\n"
    )

    def __init__(self, faiss_vectorstore):
        """Prepare the prompt and the LLM client.

        Args:
            faiss_vectorstore: Vector store whose ``as_retriever()`` supplies
                the context documents for each query.
        """
        load_dotenv()  # Load .env file
        self.openai_api_key = os.getenv("OPENAI_API_KEY")
        self.faiss_vectorstore = faiss_vectorstore

        prompt = PromptTemplate(
            template=self._PROMPT_TEMPLATE,
            input_variables=["context", "question"],
        )
        # The chain_type_kwargs are passed to the chain when it is created.
        self.chain_type_kwargs = {"prompt": prompt}
        self.llm = OpenAI(temperature=1.0, openai_api_key=self.openai_api_key)

    def get_response_for(self, query: str):
        """Run ``query`` through a freshly built RetrievalQA chain.

        Args:
            query: The user's question.

        Returns:
            Whatever ``RetrievalQA.run`` returns for the query.
        """
        qa = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.faiss_vectorstore.as_retriever(),
            chain_type_kwargs=self.chain_type_kwargs,
        )
        return qa.run(query)
131
+
132
def get_formatted_time(current_time=None):
    """Format a Unix timestamp as a UTC ``YYYY-MM-DD HH:MM:SS`` string.

    Args:
        current_time: Seconds since the epoch. When omitted or ``None`` the
            clock is read at call time. The previous ``time.time()`` default
            was evaluated once, when the function was defined, so every
            argument-less call returned the same stale timestamp.

    Returns:
        The formatted UTC time string.
    """
    # Local import: the file only imports ``datetime`` from the module.
    from datetime import timezone

    if current_time is None:
        current_time = time.time()
    # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp.
    moment = datetime.fromtimestamp(current_time, tz=timezone.utc)
    return moment.strftime('%Y-%m-%d %H:%M:%S')
134
+
135
+ # Load the model
136
# cache_resource keeps the returned object itself and shares it across
# reruns; cache_data is meant for serializable data and would try to
# serialize the vector store on every cache write.
@st.cache_resource
def load_model():
    """Build or load the FAISS vector store and report how long it took.

    Returns:
        The vector store from
        ``GenerateLearningPathIndexEmbeddings.get_faiss_vector_store()``.
    """
    start_time = time.time()
    print(f"\nStarted loading custom embeddings (created from .csv file) at {get_formatted_time(start_time)}")
    embeddings_builder = GenerateLearningPathIndexEmbeddings("Learning_Pathway_Index.csv")
    faiss_vectorstore = embeddings_builder.get_faiss_vector_store()
    end_time = time.time()
    print(f"Finished loading custom embeddings (created from .csv file) at {get_formatted_time(end_time)}")
    print(f"Custom embeddings (created from .csv file) took about {end_time - start_time} seconds to load.")
    return faiss_vectorstore
146
+
147
+ # Query the model
148
def query_gpt_model(query: str):
    """Answer ``query`` against the module-level ``faiss_vectorstore``.

    Prints the start time, finish time, and elapsed seconds around the call,
    then returns the answer produced by ``GenAILearningPathIndex``.
    """
    started_at = time.time()
    print(f"\nQuery processing start time: {get_formatted_time(started_at)}")
    # faiss_vectorstore is the global assigned in the __main__ section.
    answer = GenAILearningPathIndex(faiss_vectorstore).get_response_for(query)
    finished_at = time.time()
    print(f"\nQuery processing finish time: {get_formatted_time(finished_at)}")
    print(f"\nAnswer (took about {finished_at - started_at} seconds)")
    return answer
157
+
158
+
159
if __name__=='__main__':
    # Kept at module level on purpose: query_gpt_model() reads this global.
    faiss_vectorstore = load_model()

    if running_inside_streamlit():
        print("\nStreamlit environment detected. \nTo run a CLI interactive version just run `python main.py` in the CLI.\n")
        query_from_stream_list = app()
        if query_from_stream_list:
            answer = query_gpt_model(query_from_stream_list)
            st.write(answer)
    else:
        print("\nCommand-line interactive environment detected.\n")
        while True:
            try:
                # Strip once so " exit " and whitespace-only input behave
                # the same as their trimmed forms.
                query = input("\nEnter a query: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Ctrl-D / Ctrl-C ends the session instead of a traceback.
                print()
                break
            if query == "exit":
                break
            if not query:
                continue

            answer = query_gpt_model(query)

            print("\n\n> Question:")
            print(query)
            print(answer)
openai_faiss_exmpl.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Example: embed one CSV column with OpenAI and search it with FAISS."""
import os

import faiss
import numpy as np
import openai
import pandas as pd

# NOTE(review): "text_column" is the placeholder column name from the
# original example — replace it with a real column of the CSV before running.
TEXT_COLUMN = "text_column"
EMBEDDING_MODEL = "text-embedding-ada-002"
# Number of texts sent per embeddings request.
EMBEDDING_BATCH_SIZE = 100

# Set the key once instead of on every loop iteration.
openai.api_key = os.getenv("OPENAI_API_KEY")


def embed_texts(texts):
    """Return a float32 array with one embedding row per input text.

    Uses the embeddings endpoint; a completion response carries generated
    text and has no ``embedding`` field.
    """
    rows = []
    for start in range(0, len(texts), EMBEDDING_BATCH_SIZE):
        batch = texts[start:start + EMBEDDING_BATCH_SIZE]
        response = openai.Embedding.create(model=EMBEDDING_MODEL, input=batch)
        # Sort by the reported index so rows line up with the inputs.
        ordered = sorted(response["data"], key=lambda item: item["index"])
        rows.extend(item["embedding"] for item in ordered)
    return np.array(rows, dtype="float32")


def vectorize_user_question(question):
    """Embed one question as a (1, dim) float32 array for ``index.search``."""
    return embed_texts([question])


# Load your custom CSV data (os.path.join keeps the path portable)
data = pd.read_csv(os.path.join(os.getcwd(), "Learning_Pathway_Index.csv"))

# Vectorize the text column; astype(str) turns missing cells into "nan"
vectors = embed_texts(data[TEXT_COLUMN].astype(str).tolist())
if vectors.size == 0:
    raise SystemExit("No rows to index in Learning_Pathway_Index.csv")

# Initialize and populate the FAISS index. The dimension is taken from the
# embeddings themselves rather than hard-coded, so it always matches the model.
vector_dimension = vectors.shape[1]
index = faiss.IndexFlatL2(vector_dimension)
index.add(vectors)

# Accept a user question and vectorize it
user_question = input("Ask a question: ")
user_vector = vectorize_user_question(user_question)

# Search for similar items in the FAISS index
k = 5  # Number of similar items to retrieve
distances, indices = index.search(user_vector, k)

# FAISS pads with -1 when fewer than k vectors exist; drop those so iloc
# does not wrap around to the last row.
hits = [int(i) for i in indices[0] if i >= 0]

# Retrieve and display the similar items
similar_items = data.iloc[hits]
print(similar_items)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ langchain==0.0.216
2
+ streamlit==1.27.2
3
+ tqdm==4.65.0
4
+ # Pre-requisites: [sudo] apt install libopenblas-base libomp-dev
5
+ # See https://github.com/onfido/faiss_prebuilt
6
+ faiss-cpu==1.7.4
7
+ faiss-gpu==1.7.2