Tomas Larsson committed on
Commit
cb76759
·
1 Parent(s): b8685e9
Files changed (5) hide show
  1. app.py +100 -22
  2. embeddings.npy +3 -0
  3. fire.jpg +0 -0
  4. start2.py +173 -0
  5. vectorstore2.pkl +3 -0
app.py CHANGED
@@ -3,7 +3,7 @@ import streamlit as st
3
  st.session_state.em = "0"
4
 
5
  import os
6
-
7
 
8
 
9
  import requests
@@ -17,19 +17,17 @@ import numpy as np
17
  st.set_page_config(layout="wide")
18
 
19
 
 
 
20
 
 
 
21
 
22
 
23
-
24
- started = 'started' in st.session_state
25
-
26
-
27
 
28
 
29
- if started:
30
- retriever = st.session_state.retriever
31
- rag_chain = st.session_state.rag_chain
32
-
33
 
34
  os.environ["OPENAI_API_KEY"] = os.getenv('openkey')
35
 
@@ -92,7 +90,7 @@ def strip_repeated_dots_and_blanks(text):
92
 
93
 
94
  # Title of the page
95
- st.title('Question and Answer App')
96
 
97
  # Text input for the question
98
  question = st.text_input("Type your question here:")
@@ -100,8 +98,33 @@ question = st.text_input("Type your question here:")
100
  # A button to submit the question
101
  submit_button = st.button('Submit')
102
 
 
 
 
 
 
 
103
  # Placeholder for displaying the answer
104
- answer_placeholder = st.empty()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  # Logic to display an answer when the submit button is pressed
107
  if submit_button:
@@ -109,13 +132,66 @@ if submit_button:
109
  # Process the question here (a placeholder answer is used in this example)
110
  try:
111
  if started:
112
- Awnser = rag_chain.invoke(question)
113
- contexts = retriever.get_relevant_documents(question)
114
- answer_placeholder.text(Awnser) # Display the answer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  else:
116
- answer_placeholder.text("Waiting for system to wake up "+ st.session_state.ln + " " + st.session_state.em )
117
  except Exception as e:
118
- answer_placeholder.text(e) # Display the answer
119
 
120
  else:
121
  answer_placeholder.warning("Please type a question.")
@@ -123,10 +199,12 @@ if submit_button:
123
 
124
 
125
 
126
- if 'retriever' not in st.session_state:
127
- st.session_state.em = "mm"
128
 
129
- if 'retriever' not in st.session_state:
130
- st.session_state.em = "1"
131
- exec(open('start.py').read())
132
- st.session_state.em = "2"
 
 
 
3
  st.session_state.em = "0"
4
 
5
  import os
6
+ import json
7
 
8
 
9
  import requests
 
17
  st.set_page_config(layout="wide")
18
 
19
 
20
+ # Path to the image
21
+ image_path = 'fire.jpg'
22
 
23
+ # Display the image with st.image
24
+ st.image(image_path, caption='', use_column_width=True)
25
 
26
 
27
+ started = 'docs' in st.session_state
28
+ exec(open('start2.py').read())
 
 
29
 
30
 
 
 
 
 
31
 
32
  os.environ["OPENAI_API_KEY"] = os.getenv('openkey')
33
 
 
90
 
91
 
92
  # Title of the page
93
+ st.title('Peerstreet Question and Answer App')
94
 
95
  # Text input for the question
96
  question = st.text_input("Type your question here:")
 
98
  # A button to submit the question
99
  submit_button = st.button('Submit')
100
 
101
+
102
+ # Create tabs
103
+ Awnser_tab, Content_tab, Info_tab = st.tabs(["Awnser", "Content used to create answer", "Infrmation about this app"])
104
+
105
+
106
+
107
  # Placeholder for displaying the answer
108
+ with Awnser_tab:
109
+ answer_placeholder = st.empty()
110
+ with Content_tab:
111
+ content_placeholder = st.empty()
112
+ with Info_tab:
113
+ st.markdown("""## Use at your own risk, accuracy of responses are not guaranteed.
114
+
115
+ This app base its anwsers on 110 documents filed by the court. This does not include any scanned documents at this point
116
+ as it takes more work to retrieve the text from them. It does include most orders filed by the court up to Feb 29th.
117
+
118
+
119
+ This is a simple RAG (retrieval augmented generation) system and does not consider order of events when
120
+ retrieving onformation and generating responses. It can also easily missinterpret information, but information used to generate the
121
+ response is presented in the content tab with link to the full document so that you can read the details in its proper context.
122
+
123
+
124
+
125
+ """
126
+ )
127
+
128
 
129
  # Logic to display an answer when the submit button is pressed
130
  if submit_button:
 
132
  # Process the question here (a placeholder answer is used in this example)
133
  try:
134
  if started:
135
+ #Awnser = rag_chain.invoke(question)
136
+ #contexts = retriever.get_relevant_documents(question)
137
+ answer, selected_items,selected_sources,selected_chunks,highest_simularities = ask(question)
138
+ answer_placeholder.markdown(answer) # Display the answer
139
+
140
+ # Prepare the data to be saved
141
+
142
+
143
+ data_to_save = {
144
+ "query": question,
145
+ "answer": answer,
146
+ "selected_items": selected_items,
147
+ "selected_sources": selected_sources,
148
+ "selected_chunks": selected_chunks,
149
+ "highest_similarities": [f"{sim:.2f}" for sim in highest_simularities]
150
+ }
151
+
152
+ # The file to which the data will be appended
153
+ file_path = 'results.json'
154
+
155
+ try:
156
+ # Read the existing content of the file
157
+ with open(file_path, 'r') as file:
158
+ existing_data = json.load(file)
159
+ except (FileNotFoundError, json.JSONDecodeError):
160
+ # If the file doesn't exist or is empty, start with an empty list
161
+ existing_data = []
162
+
163
+ # Append the new data
164
+ existing_data.append(data_to_save)
165
+
166
+ # Write the updated data back to the file
167
+ with open(file_path, 'w') as file:
168
+ json.dump(existing_data, file, indent=4)
169
+
170
+
171
+
172
+
173
+ url = 'https://cases.stretto.com/public/x247/12208/PLEADINGS/'
174
+
175
+
176
+
177
+
178
+
179
+ string = ""
180
+ for k in range(len(selected_items)):
181
+ temp = " [" + selected_sources[k] + "](" + url + selected_sources[k] + ")" + " text block: " + selected_chunks[k] + " Relevance: " +f"{highest_simularities[k]:.2f}"
182
+
183
+
184
+ string = string + "### Paragraph used. \n" + selected_items[k] + "\n\n source:" + temp + "\n"
185
+
186
+
187
+ content_placeholder.markdown(string)
188
+
189
+
190
+
191
  else:
192
+ answer_placeholder.markdown("Waiting for system to wake up "+ st.session_state.ln + " " + st.session_state.em )
193
  except Exception as e:
194
+ answer_placeholder.markdown(e) # Display the answer
195
 
196
  else:
197
  answer_placeholder.warning("Please type a question.")
 
199
 
200
 
201
 
202
+ #if 'retriever' not in st.session_state:
203
+ # st.session_state.em = "mm"
204
 
205
+ #if 'retriever' not in st.session_state:
206
+ # st.session_state.em = "1"
207
+ # exec(open('start.py').read())
208
+ # st.session_state.em = "2"
209
+
210
+
embeddings.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e59b70d53bc2779e24f76bcc2377fd60b9d3cdabf20b26cd8cfc176ec316292
3
+ size 66072704
fire.jpg ADDED
start2.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from openai import OpenAI
3
+ import numpy as np
4
+
5
+ from langchain_openai import ChatOpenAI
6
+ from langchain.prompts import ChatPromptTemplate
7
+ from langchain.schema.runnable import RunnablePassthrough
8
+ from langchain.schema.output_parser import StrOutputParser
9
+ from scipy.spatial.distance import cosine
10
+
11
+ if not started:
12
+ print("------------starting------------")
13
+
14
+
15
+
16
+
17
+ import pickle
18
+
19
+
20
+ # Path to the pickle file where you want to save your data
21
+ pickle_file_path = 'vectorstore2.pkl'
22
+
23
+
24
+ with open(pickle_file_path, 'rb') as file:
25
+ st.session_state.docs = pickle.load(file)
26
+
27
+
28
+ st.session_state.embeddings = np.load('embeddings.npy')
29
+
30
+
31
+
32
+
33
+
34
+
35
+
36
def strip_repeated_dots_and_blanks(text):
    """Collapse repeated dots, repeated spaces, and space-padded blank lines.

    Runs of two or more '.' become a single '.', runs of two or more spaces
    become a single space, and a separator line holding only one space
    ('\\n \\n') becomes a true blank line ('\\n\\n').
    """
    # Imported locally: start2.py does not import `re` at module level, so
    # without this the function only works if the host script imported it.
    import re

    # Replace multiple dots with a single dot.
    text = re.sub(r'\.{2,}', '.', text)
    # Replace multiple spaces with a single space.  After this pass a
    # space-padded blank line can only contain a single space.
    text = re.sub(r' {2,}', ' ', text)
    # Turn a space-only line separator into a real blank line.
    text = re.sub(r'\n \n', '\n\n', text)

    return text
44
+
45
+
46
# Function to get embeddings from OpenAI API
def get_embeddings(texts):
    """Return an embedding vector for each text in *texts*.

    Makes one OpenAI API call per text and returns a list of embedding
    vectors (lists of floats) in the same order as *texts*.
    """
    client = OpenAI()
    embeddings = []
    for text in texts:
        response = client.embeddings.create(
            input=text,
            model="text-embedding-3-small",
        )
        # Each request embeds a single input, so data[0] is the only entry.
        # append() instead of `embeddings = embeddings + [...]`, which
        # rebuilt the whole list on every iteration.
        embeddings.append(response.data[0].embedding)

    return embeddings
58
+
59
+
60
+
61
def cosine_similarity(vec_a, vec_b):
    """Cosine similarity of two vectors: 1.0 for identical direction, 0.0 for orthogonal."""
    # scipy's `cosine` returns the cosine *distance*; similarity is its complement.
    distance = cosine(vec_a, vec_b)
    return 1 - distance
64
+
65
+
66
+
67
+
68
def askq(query):
    """Answer *query* using embedding retrieval over the preloaded documents.

    Reads ``st.session_state.embeddings`` / ``st.session_state.docs``
    (loaded by this script's startup section) and returns a 5-tuple:
    (answer, selected_items, selected_sources, selected_chunks,
    highest_simularities).
    """
    embeddings = st.session_state.embeddings
    docs = st.session_state.docs

    question = strip_repeated_dots_and_blanks(query)
    query_embedding = get_embeddings([query])[0]

    # Calculate similarity of each text to the query
    similarities = [cosine_similarity(embedding, query_embedding) for embedding in embeddings]
    similarities_array = np.array(similarities)

    # Top 5 indices, then sort them so the most similar comes first.
    highest_indices = np.argpartition(similarities_array, -5)[-5:]
    highest_indices = highest_indices[np.argsort(similarities_array[highest_indices])[::-1]]

    # Filter indices by their corresponding similarity values
    filtered_indices_and_values = [(index, similarities_array[index])
                                   for index in highest_indices
                                   if similarities_array[index] > 0.5]

    highest_indices = [item[0] for item in filtered_indices_and_values]
    highest_simularities = [item[1] for item in filtered_indices_and_values]

    selected_items = [docs[i].page_content for i in highest_indices]
    selected_sources = [docs[i].metadata['source'] for i in highest_indices]
    # BUG FIX: the original immediately overwrote this list with
    # `[docs[i].metadata['chunk'] for i in similarities_array]`, iterating the
    # float similarity values as document indices. Index by the filtered
    # document indices instead.
    selected_chunks = [docs[i].metadata['chunk'] for i in highest_indices]

    content = ' '.join(selected_items)

    # Combine content and question into a single prompt
    prompt = f"""Anwser question provided given content. If an awnser can't be found in the provided ontent respond that you could not find
    the awnser to the question, applogice and say that you will instead tell a lawyer joke follow that with the joke.
    Content: {content}\n\nQuestion: {question}\nAnswer:"""

    # Define LLM
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1)
    Awnser = llm.invoke(prompt).content

    return Awnser, selected_items, selected_sources, selected_chunks, highest_simularities
119
+
120
+
121
+ import numpy as np
122
+ import streamlit as st
123
+
124
+ # Assuming `strip_repeated_dots_and_blanks`, `get_embeddings`, and `cosine_similarity` are defined elsewhere correctly
125
+ # Assuming `ChatOpenAI` is a correctly defined or imported class for handling OpenAI chat
126
+
127
def ask(query):
    """Retrieve the most relevant document chunks for *query* and answer it.

    Reads ``st.session_state.embeddings`` / ``st.session_state.docs`` and
    returns (answer, selected_items, selected_sources, selected_chunks,
    highest_simularities); the last four lists are aligned per chunk.
    """
    doc_embeddings = st.session_state.embeddings
    doc_store = st.session_state.docs

    question = strip_repeated_dots_and_blanks(query)
    query_embedding = get_embeddings([query])[0]

    # Score every stored embedding against the query embedding.
    sim_scores = np.array(
        [cosine_similarity(vec, query_embedding) for vec in doc_embeddings]
    )

    # Take the five best matches and order them most-similar first.
    top_five = np.argpartition(sim_scores, -5)[-5:]
    top_five = top_five[np.argsort(sim_scores[top_five])[::-1]]

    # Drop weak matches (similarity at or below 0.4), keeping order.
    highest_indices = []
    highest_simularities = []
    for idx in top_five:
        score = sim_scores[idx]
        if score > 0.4:
            highest_indices.append(idx)
            highest_simularities.append(score)

    # Pull the surviving chunks and their provenance.
    selected_items = [doc_store[i].page_content for i in highest_indices]
    selected_sources = [doc_store[i].metadata['source'] for i in highest_indices]
    selected_chunks = [doc_store[i].metadata['chunk'] for i in highest_indices]

    content = ' '.join(selected_items)

    # Build the LLM prompt from the retrieved content and the question.
    prompt = f"""Answer the question provided given the content. If an answer can't be found in the provided content,
    respond that you could not find the answer to the question, apologize and instead provide a suggestion for where to search for more information related to teh question.

    -------------------
    Content: {content}\n\nQuestion: {question}\nAnswer:
    -------------------

    """

    # Initialize the LLM (assuming correct implementation or import)
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1)
    answer = llm.invoke(prompt).content

    return answer, selected_items, selected_sources, selected_chunks, highest_simularities
vectorstore2.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0eeb601bfdd128945a52712a20a89f9bfd89c85ea1d25215d552f68ca094b012
3
+ size 5582531