# psdocuments / start2.py
# Tomas Larsson
# rebuild
# b019728
import re

import numpy as np
import streamlit as st
from openai import OpenAI
from scipy.spatial.distance import cosine
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain_openai import ChatOpenAI
def find_first_with_docket(items):
    """Return the first item whose text contains "docket" (case-insensitive).

    Args:
        items: iterable of strings to scan in order.

    Returns:
        The first matching item, or None when no item matches.
    """
    for item in items:
        # Case-insensitive substring match.
        if "docket" in item.lower():
            return item
    # Nothing matched. The original returned 0 here, contradicting its own
    # comment ("Return None if no item contains 'docket'"); None is the
    # correct not-found sentinel and is equally falsy for boolean callers.
    return None
def escape_markdownold(text):
    """Legacy markdown escaper: prefix each special character with a backslash.

    Kept for reference; `escape_markdown` is the newer variant.
    """
    # Regex character class of markdown metacharacters. Note the '|'
    # separators are literal inside a character class, so '|' itself is
    # escaped as well.
    special_chars = r"\*|_|#|\{|\}|\[|\]|\(|\)|\#|\+|\-|\.|\!|\\"
    pattern = f"([{special_chars}])"
    # Prefix every matched character with a backslash.
    return re.sub(pattern, r"\\\1", text)
def escape_markdown(text):
    """Return *text* with every markdown metacharacter backslash-escaped."""
    specials = ["\\", "`", "*", "_", "{", "}", "[", "]", "(", ")", "#", "+", "-", ".", "!", "|", ">", "$"]
    # Single-pass translation table: each special character maps to itself
    # prefixed by a backslash (equivalent to the sequential replace loop,
    # since only the original characters are ever rewritten).
    table = str.maketrans({ch: "\\" + ch for ch in specials})
    return text.translate(table)
# One-time startup: load the pickled document chunks and their precomputed
# embedding matrix into Streamlit session state.
# NOTE(review): `started` is not defined anywhere in this file, and `st`
# (streamlit) is only imported further down at module level — this guard
# presumably relies on state established elsewhere before this runs; confirm.
if not started:
    print("------------starting------------")
    import pickle
    # Path to the pickle file where you want to save your data
    pickle_file_path = 'vectorstore.pkl'
    # HACK: pickle.load on a file is only safe for trusted, locally-produced
    # data — never point this at an untrusted source.
    with open(pickle_file_path, 'rb') as file:
        st.session_state.docs = pickle.load(file)
    # Embeddings were saved separately as a NumPy array, row-aligned with docs.
    st.session_state.embeddings = np.load('embeddings.npy')
def strip_repeated_dots_and_blanks(text):
    """Collapse dot runs, space runs, and 'newline space newline' artifacts."""
    # Runs of two or more dots become a single dot.
    collapsed = re.sub(r'\.{2,}', '.', text)
    # Runs of two or more spaces become a single space.
    collapsed = re.sub(r' {2,}', ' ', collapsed)
    # A lone space sandwiched between newlines is dropped (runs after the
    # space-collapsing step, so longer runs are also handled).
    return re.sub('\n \n', '\n\n', collapsed)
# Function to get embeddings from OpenAI API
def get_embeddings(texts):
    """Embed each text via the OpenAI embeddings API.

    Args:
        texts: iterable of strings to embed (one API request per string).

    Returns:
        A list of embedding vectors (list of floats each), in input order.
    """
    client = OpenAI()
    embeddings = []
    for text in texts:
        response = client.embeddings.create(
            input=text,
            model="text-embedding-3-small",
        )
        # One input per request, so the first (only) datum holds our vector.
        # (Original rebuilt the list with `embeddings + [...]` each pass —
        # quadratic; append is the idiomatic O(1) accumulation.)
        embeddings.append(response.data[0].embedding)
    return embeddings
def cosine_similarity(vec_a, vec_b):
    """Cosine similarity of two vectors.

    SciPy's `cosine` returns the cosine *distance*, so similarity is its
    complement: 1 - distance.
    """
    distance = cosine(vec_a, vec_b)
    return 1 - distance
def askq(query):
    """Answer *query* by RAG over the docs/embeddings in Streamlit session state.

    Embeds the query, ranks every precomputed document embedding by cosine
    similarity, keeps the top 5 scoring above 0.5, and asks the chat model to
    answer from the concatenated matching chunks.

    Returns:
        (answer, selected_items, selected_sources, selected_chunks,
         highest_simularities) — the model answer plus the text, metadata and
        similarity scores of the chunks that were used as context.
    """
    embeddings = st.session_state.embeddings
    docs = st.session_state.docs
    question = strip_repeated_dots_and_blanks(query)
    query_embedding = get_embeddings([query])[0]
    # Similarity of every stored chunk to the query.
    similarities_array = np.array(
        [cosine_similarity(embedding, query_embedding) for embedding in embeddings]
    )
    # Top-k candidates (k capped so corpora smaller than 5 don't crash
    # argpartition), then ordered best-first.
    top_k = min(5, len(similarities_array))
    highest_indices = np.argpartition(similarities_array, -top_k)[-top_k:]
    highest_indices = highest_indices[np.argsort(similarities_array[highest_indices])[::-1]]
    # Keep only matches above the 0.5 relevance threshold.
    filtered = [(i, similarities_array[i]) for i in highest_indices if similarities_array[i] > 0.5]
    highest_indices = [i for i, _ in filtered]
    highest_simularities = [s for _, s in filtered]
    selected_items = [docs[i].page_content for i in highest_indices]
    selected_sources = [docs[i].metadata['source'] for i in highest_indices]
    # BUG FIX: the original immediately overwrote this list with a lookup
    # indexed by the float similarity values themselves
    # (`for i in similarities_array`), which is wrong; only the index-based
    # lookup is kept.
    selected_chunks = [docs[i].metadata['chunk'] for i in highest_indices]
    content = ' '.join(selected_items)
    # Prompt typos from the original ("Anwser", "ontent", "applogice") fixed
    # so the model receives clean instructions.
    prompt = f"""Answer the question or request provided given content. If an answer can't be found in the provided content respond that you could not find
the answer to the question, apologize and say that you will instead tell a lawyer joke follow that with the joke.
Content: {content}\n\nQuestion: {question}\nAnswer:"""
    # Define LLM
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)
    answer = llm.invoke(prompt).content
    return answer, selected_items, selected_sources, selected_chunks, highest_simularities
import numpy as np
import streamlit as st
# Assuming `strip_repeated_dots_and_blanks`, `get_embeddings`, and `cosine_similarity` are defined elsewhere correctly
# Assuming `ChatOpenAI` is a correctly defined or imported class for handling OpenAI chat
def ask(query):
    """Answer *query* against the embedded document store in session state.

    Scores every stored embedding by cosine similarity to the query, keeps the
    top 5 scoring above 0.4, and asks the chat model to answer using the
    concatenated matching chunks as context.

    Returns:
        (answer, selected_items, selected_sources, titles, dates,
         selected_chunks, highest_simularities)
    """
    embeddings = st.session_state.embeddings
    docs = st.session_state.docs
    question = strip_repeated_dots_and_blanks(query)
    query_embedding = get_embeddings([query])[0]
    # Score every stored chunk against the query embedding.
    similarities_array = np.array(
        [cosine_similarity(vec, query_embedding) for vec in embeddings]
    )
    # Unsorted top-5 candidates ...
    top5 = np.argpartition(similarities_array, -5)[-5:]
    # ... ordered best-first by similarity.
    top5 = top5[np.argsort(similarities_array[top5])[::-1]]
    # Drop anything at or below the 0.4 relevance floor.
    kept = [(idx, similarities_array[idx]) for idx in top5 if similarities_array[idx] > 0.4]
    highest_indices = [idx for idx, _ in kept]
    highest_simularities = [score for _, score in kept]
    # Pull text and metadata for the surviving chunks.
    selected_items = [docs[idx].page_content for idx in highest_indices]
    selected_sources = [docs[idx].metadata['source'] for idx in highest_indices]
    selected_chunks = [docs[idx].metadata['chunk'] for idx in highest_indices]
    titles = [docs[idx].metadata['title'] for idx in highest_indices]
    dates = [docs[idx].metadata['date'] for idx in highest_indices]
    content = ' '.join(selected_items)
    # Build the grounded prompt for the model.
    prompt = f"""Answer the question or request provided given the content. If an answer can't be found in the provided content,
respond that you could not find the answer to the question, apologize and instead provide a suggestion for where to search for more information related to the question.
\
-------------------
Content: {content}\n\nQuestion: {question}\nAnswer:
-------------------
"""
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1)
    answer = llm.invoke(prompt).content
    return answer, selected_items, selected_sources, titles, dates, selected_chunks, highest_simularities