Spaces:
Sleeping
Sleeping
File size: 7,356 Bytes
cb76759 1acb91b cb76759 1acb91b cb76759 1acb91b cb76759 1acb91b cb76759 1acb91b 52cc340 b019728 cb76759 1acb91b f5db197 cb76759 f5db197 cb76759 52cc340 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
import re

import numpy as np
import streamlit as st
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain_openai import ChatOpenAI
from openai import OpenAI
from scipy.spatial.distance import cosine
def find_first_with_docket(items):
    """Return the first item containing "docket" (case-insensitive).

    Args:
        items: iterable of strings to scan in order.

    Returns:
        The first matching item, or None when no item matches.
        (The original returned 0 here, contradicting its own comment;
        both are falsy, so truthiness-based callers are unaffected.)
    """
    for item in items:
        if "docket" in item.lower():
            return item
    # No item mentioned "docket".
    return None
def escape_markdownold(text):
    """Escape Markdown control characters by prefixing each with a backslash.

    Legacy variant kept alongside escape_markdown(); it covers the set
    \\ * | _ # { } [ ] ( ) + - . !  (no backtick, >, or $).
    """
    specials = "*|_#{}[]()+-.!\\"
    # Per-character pass: identical to substituting each matching character
    # with a backslash-prefixed copy, as the old regex version did.
    return "".join("\\" + ch if ch in specials else ch for ch in text)
def escape_markdown(text):
    """Backslash-escape every Markdown special character in *text*."""
    # One translate() pass is equivalent to the sequential replace() loop:
    # that loop escaped the backslash first, so backslashes it inserted for
    # later characters were never re-escaped — i.e. each ORIGINAL special
    # character simply becomes backslash + itself.
    specials = "\\`*_{}[]()#+-.!|>$"
    table = str.maketrans({ch: "\\" + ch for ch in specials})
    return text.translate(table)
# One-time startup: load the pickled document store and the precomputed
# embedding matrix into Streamlit session state.
# NOTE(review): `started` and `st` are not defined above this point in the
# visible code — presumably set earlier in the full file or by Streamlit
# session handling; confirm before moving this block.
if not started:
    print("------------starting------------")
    import pickle
    # Path to the pickle file where you want to save your data
    pickle_file_path = 'vectorstore.pkl'
    # NOTE(review): pickle.load runs arbitrary code from the file — safe only
    # because vectorstore.pkl is produced by this app, never user-supplied.
    with open(pickle_file_path, 'rb') as file:
        st.session_state.docs = pickle.load(file)
    # Embedding rows are assumed to align index-for-index with docs — TODO confirm.
    st.session_state.embeddings = np.load('embeddings.npy')
def strip_repeated_dots_and_blanks(text):
    """Collapse runs of dots and spaces and tidy near-blank lines."""
    # Applied in order: dots, then spaces, then the lone-space blank line —
    # the space collapse can produce the "\n \n" shape the last rule fixes.
    replacements = (
        (r"\.{2,}", "."),    # runs of dots -> single dot
        (r" {2,}", " "),     # runs of spaces -> single space
        (r"\n \n", "\n\n"),  # single space between newlines -> blank line
    )
    for pattern, repl in replacements:
        text = re.sub(pattern, repl, text)
    return text
# Function to get embeddings from OpenAI API
def get_embeddings(texts):
    """Embed each text via the OpenAI embeddings API.

    Args:
        texts: iterable of strings, one API call per string.

    Returns:
        A list of embedding vectors (list[float]), in input order.
    """
    client = OpenAI()
    embeddings = []
    for text in texts:
        response = client.embeddings.create(
            input=text,
            model="text-embedding-3-small"
        )
        # One input per request, so the single result is at data[0].
        # append() instead of the original quadratic `embeddings = embeddings + [...]`.
        embeddings.append(response.data[0].embedding)
    return embeddings
def cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity between two vectors.

    scipy's `cosine` is a *distance* (1 - similarity), so subtracting it
    from 1 recovers the similarity.
    """
    distance = cosine(vec_a, vec_b)
    return 1 - distance
def askq(query):
    """Answer *query* with retrieval-augmented generation over session docs.

    Embeds the query, ranks the precomputed session-state embeddings by
    cosine similarity, keeps the top 5 hits scoring above 0.5, and asks the
    chat model to answer from the concatenated hit text (falling back to a
    lawyer joke per the prompt).

    Returns:
        (answer, selected_items, selected_sources, selected_chunks,
         highest_simularities)
    """
    embeddings = st.session_state.embeddings
    docs = st.session_state.docs
    # Normalized question text used in the prompt.
    question = strip_repeated_dots_and_blanks(query)
    query_embedding = get_embeddings([query])[0]
    # Similarity of every stored chunk to the query.
    similarities_array = np.array(
        [cosine_similarity(embedding, query_embedding) for embedding in embeddings]
    )
    # Guard: the original argpartition(-5) crashed with fewer than 5 docs.
    top_k = min(5, similarities_array.size)
    highest_indices = np.argpartition(similarities_array, -top_k)[-top_k:]
    # Order the candidates best-first.
    highest_indices = highest_indices[np.argsort(similarities_array[highest_indices])[::-1]]
    # Keep only candidates above the relevance threshold.
    filtered = [(i, similarities_array[i]) for i in highest_indices
                if similarities_array[i] > 0.5]
    highest_indices = [i for i, _ in filtered]
    highest_simularities = [s for _, s in filtered]
    selected_items = [docs[i].page_content for i in highest_indices]
    selected_sources = [docs[i].metadata['source'] for i in highest_indices]
    # BUG FIX: the original immediately rebuilt selected_chunks by indexing
    # docs with the similarity VALUES (floats) instead of the indices,
    # clobbering the correct list and crashing; index by document index.
    selected_chunks = [docs[i].metadata['chunk'] for i in highest_indices]
    content = ' '.join(selected_items)
    # Prompt typos fixed ("Anwser" -> "Answer", "applogice" -> "apologize").
    prompt = f"""Answer the question or request provided given content. If an answer can't be found in the provided content respond that you could not find
the answer to the question, apologize and say that you will instead tell a lawyer joke follow that with the joke.
Content: {content}\n\nQuestion: {question}\nAnswer:"""
    # Define LLM
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)
    answer = llm.invoke(prompt).content
    return answer, selected_items, selected_sources, selected_chunks, highest_simularities
import numpy as np
import streamlit as st
# Assuming `strip_repeated_dots_and_blanks`, `get_embeddings`, and `cosine_similarity` are defined elsewhere correctly
# Assuming `ChatOpenAI` is a correctly defined or imported class for handling OpenAI chat
def ask(query):
    """Retrieval-augmented Q&A over the documents cached in session state.

    Embeds *query*, scores every stored embedding by cosine similarity,
    keeps the top five matches scoring above 0.4, and asks the chat model
    to answer using only the matched text.

    Returns:
        (answer, selected_items, selected_sources, titles, dates,
         selected_chunks, highest_simularities)
    """
    stored_embeddings = st.session_state.embeddings
    docs = st.session_state.docs
    question = strip_repeated_dots_and_blanks(query)
    query_vec = get_embeddings([query])[0]
    # Score every stored chunk against the query embedding.
    score_arr = np.array(
        [cosine_similarity(vec, query_vec) for vec in stored_embeddings]
    )
    # Top five candidate indices, reordered best-first.
    top5 = np.argpartition(score_arr, -5)[-5:]
    top5 = top5[np.argsort(score_arr[top5])[::-1]]
    # Keep only candidates above the relevance threshold.
    highest_indices = []
    highest_simularities = []
    for idx in top5:
        if score_arr[idx] > 0.4:
            highest_indices.append(idx)
            highest_simularities.append(score_arr[idx])
    # Collect the matched documents and their metadata.
    selected_items = [docs[i].page_content for i in highest_indices]
    selected_sources = [docs[i].metadata['source'] for i in highest_indices]
    selected_chunks = [docs[i].metadata['chunk'] for i in highest_indices]
    titles = [docs[i].metadata['title'] for i in highest_indices]
    dates = [docs[i].metadata['date'] for i in highest_indices]
    content = ' '.join(selected_items)
    prompt = f"""Answer the question or request provided given the content. If an answer can't be found in the provided content,
respond that you could not find the answer to the question, apologize and instead provide a suggestion for where to search for more information related to the question.
\
-------------------
Content: {content}\n\nQuestion: {question}\nAnswer:
-------------------
"""
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1)
    answer = llm.invoke(prompt).content
    return answer, selected_items, selected_sources, titles, dates, selected_chunks, highest_simularities
|