Spaces:
Sleeping
Sleeping
File size: 7,356 Bytes
cb76759 1acb91b cb76759 1acb91b cb76759 1acb91b cb76759 1acb91b cb76759 1acb91b 52cc340 b019728 cb76759 1acb91b f5db197 cb76759 f5db197 cb76759 52cc340 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
import re

import numpy as np
import streamlit as st
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain_openai import ChatOpenAI
from openai import OpenAI
from scipy.spatial.distance import cosine
def find_first_with_docket(items):
    """Return the first item containing "docket" (case-insensitive).

    Args:
        items: iterable of strings to scan in order.

    Returns:
        The first matching item, or None when no item matches.
        (The original returned 0 here, contradicting its own comment;
        both are falsy, so truthiness-based callers are unaffected.)
    """
    for item in items:
        if "docket" in item.lower():
            return item
    # No item mentioned "docket".
    return None
def escape_markdownold(text):
    """Escape Markdown control characters by prefixing each with a backslash.

    Legacy variant kept alongside escape_markdown(); it covers the set
    \\ * | _ # { } [ ] ( ) + - . !  (no backtick, >, or $).
    """
    specials = "*|_#{}[]()+-.!\\"
    # Per-character pass: identical to substituting each matching character
    # with a backslash-prefixed copy, as the old regex version did.
    return "".join("\\" + ch if ch in specials else ch for ch in text)
def escape_markdown(text):
    """Backslash-escape every Markdown special character in *text*."""
    # One translate() pass is equivalent to the sequential replace() loop:
    # that loop escaped the backslash first, so backslashes it inserted for
    # later characters were never re-escaped — i.e. each ORIGINAL special
    # character simply becomes backslash + itself.
    specials = "\\`*_{}[]()#+-.!|>$"
    table = str.maketrans({ch: "\\" + ch for ch in specials})
    return text.translate(table)
# One-time startup: load the pickled document store and the precomputed
# embedding matrix into Streamlit session state.
# NOTE(review): `started` and `st` are not defined above this point in the
# visible code — presumably set earlier in the full file or by Streamlit
# session handling; confirm before moving this block.
if not started:
    print("------------starting------------")
    import pickle
    # Path to the pickle file where you want to save your data
    pickle_file_path = 'vectorstore.pkl'
    # NOTE(review): pickle.load runs arbitrary code from the file — safe only
    # because vectorstore.pkl is produced by this app, never user-supplied.
    with open(pickle_file_path, 'rb') as file:
        st.session_state.docs = pickle.load(file)
    # Embedding rows are assumed to align index-for-index with docs — TODO confirm.
    st.session_state.embeddings = np.load('embeddings.npy')
def strip_repeated_dots_and_blanks(text):
    """Collapse runs of dots and spaces and tidy near-blank lines."""
    # Applied in order: dots, then spaces, then the lone-space blank line —
    # the space collapse can produce the "\n \n" shape the last rule fixes.
    replacements = (
        (r"\.{2,}", "."),    # runs of dots -> single dot
        (r" {2,}", " "),     # runs of spaces -> single space
        (r"\n \n", "\n\n"),  # single space between newlines -> blank line
    )
    for pattern, repl in replacements:
        text = re.sub(pattern, repl, text)
    return text
# Function to get embeddings from OpenAI API
def get_embeddings(texts):
    """Embed each text via the OpenAI embeddings API.

    Args:
        texts: iterable of strings, one API call per string.

    Returns:
        A list of embedding vectors (list[float]), in input order.
    """
    client = OpenAI()
    embeddings = []
    for text in texts:
        response = client.embeddings.create(
            input=text,
            model="text-embedding-3-small"
        )
        # One input per request, so the single result is at data[0].
        # append() instead of the original quadratic `embeddings = embeddings + [...]`.
        embeddings.append(response.data[0].embedding)
    return embeddings
def cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity between two vectors.

    scipy's `cosine` is a *distance* (1 - similarity), so subtracting it
    from 1 recovers the similarity.
    """
    distance = cosine(vec_a, vec_b)
    return 1 - distance
def askq(query):
    """Answer *query* with retrieval-augmented generation over session docs.

    Embeds the query, ranks the precomputed session-state embeddings by
    cosine similarity, keeps the top 5 hits scoring above 0.5, and asks the
    chat model to answer from the concatenated hit text (falling back to a
    lawyer joke per the prompt).

    Returns:
        (answer, selected_items, selected_sources, selected_chunks,
         highest_simularities)
    """
    embeddings = st.session_state.embeddings
    docs = st.session_state.docs
    # Normalized question text used in the prompt.
    question = strip_repeated_dots_and_blanks(query)
    query_embedding = get_embeddings([query])[0]
    # Similarity of every stored chunk to the query.
    similarities_array = np.array(
        [cosine_similarity(embedding, query_embedding) for embedding in embeddings]
    )
    # Guard: the original argpartition(-5) crashed with fewer than 5 docs.
    top_k = min(5, similarities_array.size)
    highest_indices = np.argpartition(similarities_array, -top_k)[-top_k:]
    # Order the candidates best-first.
    highest_indices = highest_indices[np.argsort(similarities_array[highest_indices])[::-1]]
    # Keep only candidates above the relevance threshold.
    filtered = [(i, similarities_array[i]) for i in highest_indices
                if similarities_array[i] > 0.5]
    highest_indices = [i for i, _ in filtered]
    highest_simularities = [s for _, s in filtered]
    selected_items = [docs[i].page_content for i in highest_indices]
    selected_sources = [docs[i].metadata['source'] for i in highest_indices]
    # BUG FIX: the original immediately rebuilt selected_chunks by indexing
    # docs with the similarity VALUES (floats) instead of the indices,
    # clobbering the correct list and crashing; index by document index.
    selected_chunks = [docs[i].metadata['chunk'] for i in highest_indices]
    content = ' '.join(selected_items)
    # Prompt typos fixed ("Anwser" -> "Answer", "applogice" -> "apologize").
    prompt = f"""Answer the question or request provided given content. If an answer can't be found in the provided content respond that you could not find
the answer to the question, apologize and say that you will instead tell a lawyer joke follow that with the joke.
Content: {content}\n\nQuestion: {question}\nAnswer:"""
    # Define LLM
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)
    answer = llm.invoke(prompt).content
    return answer, selected_items, selected_sources, selected_chunks, highest_simularities
import numpy as np
import streamlit as st
# Assuming `strip_repeated_dots_and_blanks`, `get_embeddings`, and `cosine_similarity` are defined elsewhere correctly
# Assuming `ChatOpenAI` is a correctly defined or imported class for handling OpenAI chat
def ask(query):
    """Retrieval-augmented Q&A over the documents cached in session state.

    Embeds *query*, scores every stored embedding by cosine similarity,
    keeps the top five matches scoring above 0.4, and asks the chat model
    to answer using only the matched text.

    Returns:
        (answer, selected_items, selected_sources, titles, dates,
         selected_chunks, highest_simularities)
    """
    stored_embeddings = st.session_state.embeddings
    docs = st.session_state.docs
    question = strip_repeated_dots_and_blanks(query)
    query_vec = get_embeddings([query])[0]
    # Score every stored chunk against the query embedding.
    score_arr = np.array(
        [cosine_similarity(vec, query_vec) for vec in stored_embeddings]
    )
    # Top five candidate indices, reordered best-first.
    top5 = np.argpartition(score_arr, -5)[-5:]
    top5 = top5[np.argsort(score_arr[top5])[::-1]]
    # Keep only candidates above the relevance threshold.
    highest_indices = []
    highest_simularities = []
    for idx in top5:
        if score_arr[idx] > 0.4:
            highest_indices.append(idx)
            highest_simularities.append(score_arr[idx])
    # Collect the matched documents and their metadata.
    selected_items = [docs[i].page_content for i in highest_indices]
    selected_sources = [docs[i].metadata['source'] for i in highest_indices]
    selected_chunks = [docs[i].metadata['chunk'] for i in highest_indices]
    titles = [docs[i].metadata['title'] for i in highest_indices]
    dates = [docs[i].metadata['date'] for i in highest_indices]
    content = ' '.join(selected_items)
    prompt = f"""Answer the question or request provided given the content. If an answer can't be found in the provided content,
respond that you could not find the answer to the question, apologize and instead provide a suggestion for where to search for more information related to the question.
\
-------------------
Content: {content}\n\nQuestion: {question}\nAnswer:
-------------------
"""
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1)
    answer = llm.invoke(prompt).content
    return answer, selected_items, selected_sources, titles, dates, selected_chunks, highest_simularities
|