# psdocuments / start2.py
# Tomas Larsson
# rebuild
# b019728
import re

import numpy as np
import streamlit as st
from openai import OpenAI
from scipy.spatial.distance import cosine
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain_openai import ChatOpenAI
def find_first_with_docket(items):
    """Return the first item whose text contains "docket" (case-insensitive).

    Args:
        items: iterable of strings to scan in order.

    Returns:
        The first matching item, or None when no item matches.
    """
    for item in items:
        # Case-insensitive substring match.
        if "docket" in item.lower():
            return item
    # Nothing matched. The original returned 0 here, contradicting its own
    # comment ("Return None if no item contains 'docket'"); None is the
    # correct not-found sentinel and is equally falsy for boolean callers.
    return None
def escape_markdownold(text):
    """Legacy markdown escaper: prefix each special character with a backslash.

    Kept for reference; `escape_markdown` is the newer variant.
    """
    # Regex character class of markdown metacharacters. Note the '|'
    # separators are literal inside a character class, so '|' itself is
    # escaped as well.
    special_chars = r"\*|_|#|\{|\}|\[|\]|\(|\)|\#|\+|\-|\.|\!|\\"
    pattern = f"([{special_chars}])"
    # Prefix every matched character with a backslash.
    return re.sub(pattern, r"\\\1", text)
def escape_markdown(text):
    """Return *text* with every markdown metacharacter backslash-escaped."""
    specials = ["\\", "`", "*", "_", "{", "}", "[", "]", "(", ")", "#", "+", "-", ".", "!", "|", ">", "$"]
    # Single-pass translation table: each special character maps to itself
    # prefixed by a backslash (equivalent to the sequential replace loop,
    # since only the original characters are ever rewritten).
    table = str.maketrans({ch: "\\" + ch for ch in specials})
    return text.translate(table)
# One-time startup: load the pickled document chunks and their precomputed
# embedding matrix into Streamlit session state.
# NOTE(review): `started` is not defined anywhere in this file, and `st`
# (streamlit) is only imported further down at module level — this guard
# presumably relies on state established elsewhere before this runs; confirm.
if not started:
    print("------------starting------------")
    import pickle
    # Path to the pickle file where you want to save your data
    pickle_file_path = 'vectorstore.pkl'
    # HACK: pickle.load on a file is only safe for trusted, locally-produced
    # data — never point this at an untrusted source.
    with open(pickle_file_path, 'rb') as file:
        st.session_state.docs = pickle.load(file)
    # Embeddings were saved separately as a NumPy array, row-aligned with docs.
    st.session_state.embeddings = np.load('embeddings.npy')
def strip_repeated_dots_and_blanks(text):
    """Collapse dot runs, space runs, and 'newline space newline' artifacts."""
    # Runs of two or more dots become a single dot.
    collapsed = re.sub(r'\.{2,}', '.', text)
    # Runs of two or more spaces become a single space.
    collapsed = re.sub(r' {2,}', ' ', collapsed)
    # A lone space sandwiched between newlines is dropped (runs after the
    # space-collapsing step, so longer runs are also handled).
    return re.sub('\n \n', '\n\n', collapsed)
# Function to get embeddings from OpenAI API
def get_embeddings(texts):
    """Embed each text via the OpenAI embeddings API.

    Args:
        texts: iterable of strings to embed (one API request per string).

    Returns:
        A list of embedding vectors (list of floats each), in input order.
    """
    client = OpenAI()
    embeddings = []
    for text in texts:
        response = client.embeddings.create(
            input=text,
            model="text-embedding-3-small",
        )
        # One input per request, so the first (only) datum holds our vector.
        # (Original rebuilt the list with `embeddings + [...]` each pass —
        # quadratic; append is the idiomatic O(1) accumulation.)
        embeddings.append(response.data[0].embedding)
    return embeddings
def cosine_similarity(vec_a, vec_b):
    """Cosine similarity of two vectors.

    SciPy's `cosine` returns the cosine *distance*, so similarity is its
    complement: 1 - distance.
    """
    distance = cosine(vec_a, vec_b)
    return 1 - distance
def askq(query):
    """Answer *query* by RAG over the docs/embeddings in Streamlit session state.

    Embeds the query, ranks every precomputed document embedding by cosine
    similarity, keeps the top 5 scoring above 0.5, and asks the chat model to
    answer from the concatenated matching chunks.

    Returns:
        (answer, selected_items, selected_sources, selected_chunks,
         highest_simularities) — the model answer plus the text, metadata and
        similarity scores of the chunks that were used as context.
    """
    embeddings = st.session_state.embeddings
    docs = st.session_state.docs
    question = strip_repeated_dots_and_blanks(query)
    query_embedding = get_embeddings([query])[0]
    # Similarity of every stored chunk to the query.
    similarities_array = np.array(
        [cosine_similarity(embedding, query_embedding) for embedding in embeddings]
    )
    # Top-k candidates (k capped so corpora smaller than 5 don't crash
    # argpartition), then ordered best-first.
    top_k = min(5, len(similarities_array))
    highest_indices = np.argpartition(similarities_array, -top_k)[-top_k:]
    highest_indices = highest_indices[np.argsort(similarities_array[highest_indices])[::-1]]
    # Keep only matches above the 0.5 relevance threshold.
    filtered = [(i, similarities_array[i]) for i in highest_indices if similarities_array[i] > 0.5]
    highest_indices = [i for i, _ in filtered]
    highest_simularities = [s for _, s in filtered]
    selected_items = [docs[i].page_content for i in highest_indices]
    selected_sources = [docs[i].metadata['source'] for i in highest_indices]
    # BUG FIX: the original immediately overwrote this list with a lookup
    # indexed by the float similarity values themselves
    # (`for i in similarities_array`), which is wrong; only the index-based
    # lookup is kept.
    selected_chunks = [docs[i].metadata['chunk'] for i in highest_indices]
    content = ' '.join(selected_items)
    # Prompt typos from the original ("Anwser", "ontent", "applogice") fixed
    # so the model receives clean instructions.
    prompt = f"""Answer the question or request provided given content. If an answer can't be found in the provided content respond that you could not find
the answer to the question, apologize and say that you will instead tell a lawyer joke follow that with the joke.
Content: {content}\n\nQuestion: {question}\nAnswer:"""
    # Define LLM
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)
    answer = llm.invoke(prompt).content
    return answer, selected_items, selected_sources, selected_chunks, highest_simularities
import numpy as np
import streamlit as st
# Assuming `strip_repeated_dots_and_blanks`, `get_embeddings`, and `cosine_similarity` are defined elsewhere correctly
# Assuming `ChatOpenAI` is a correctly defined or imported class for handling OpenAI chat
def ask(query):
    """Answer *query* against the embedded document store in session state.

    Scores every stored embedding by cosine similarity to the query, keeps the
    top 5 scoring above 0.4, and asks the chat model to answer using the
    concatenated matching chunks as context.

    Returns:
        (answer, selected_items, selected_sources, titles, dates,
         selected_chunks, highest_simularities)
    """
    embeddings = st.session_state.embeddings
    docs = st.session_state.docs
    question = strip_repeated_dots_and_blanks(query)
    query_embedding = get_embeddings([query])[0]
    # Score every stored chunk against the query embedding.
    similarities_array = np.array(
        [cosine_similarity(vec, query_embedding) for vec in embeddings]
    )
    # Unsorted top-5 candidates ...
    top5 = np.argpartition(similarities_array, -5)[-5:]
    # ... ordered best-first by similarity.
    top5 = top5[np.argsort(similarities_array[top5])[::-1]]
    # Drop anything at or below the 0.4 relevance floor.
    kept = [(idx, similarities_array[idx]) for idx in top5 if similarities_array[idx] > 0.4]
    highest_indices = [idx for idx, _ in kept]
    highest_simularities = [score for _, score in kept]
    # Pull text and metadata for the surviving chunks.
    selected_items = [docs[idx].page_content for idx in highest_indices]
    selected_sources = [docs[idx].metadata['source'] for idx in highest_indices]
    selected_chunks = [docs[idx].metadata['chunk'] for idx in highest_indices]
    titles = [docs[idx].metadata['title'] for idx in highest_indices]
    dates = [docs[idx].metadata['date'] for idx in highest_indices]
    content = ' '.join(selected_items)
    # Build the grounded prompt for the model.
    prompt = f"""Answer the question or request provided given the content. If an answer can't be found in the provided content,
respond that you could not find the answer to the question, apologize and instead provide a suggestion for where to search for more information related to the question.
\
-------------------
Content: {content}\n\nQuestion: {question}\nAnswer:
-------------------
"""
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1)
    answer = llm.invoke(prompt).content
    return answer, selected_items, selected_sources, titles, dates, selected_chunks, highest_simularities