# Retriever module: embeds a query with Azure OpenAI embeddings, searches a
# Pinecone serverless index, and reconstructs readable passages from a locally
# pickled DataFrame of document chunks.
from pinecone import Pinecone
from langchain_openai import AzureOpenAIEmbeddings
import uuid
import pandas as pd
import streamlit as st
import os

# Initialize Pinecone client (API key comes from Streamlit secrets).
pc = Pinecone(api_key=st.secrets["retrival_key"])
index = pc.Index("openai-serverless")


def read_file(file):
    """
    Reads the content of a Markdown asset file and returns it as a string.

    :param file: Base name of the file (without extension) under ``assets/``.
    :return: The file content as a string, or None if the file was missing
             or could not be read (an error message is printed in that case).
    """
    fp = f"assets/{file}.md"
    try:
        # Use a distinct handle name so the `file` parameter is not shadowed.
        with open(fp, 'r', encoding='utf-8') as fh:
            return fh.read()
    except FileNotFoundError:
        print(f"The file at {fp} was not found.")
    except IOError:
        print(f"An error occurred while reading the file at {fp}.")
    return None


# Azure OpenAI configuration (credentials via Streamlit secrets).
os.environ["AZURE_OPENAI_API_KEY"] = st.secrets["azure_api_key"]
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://davidfearn-gpt4.openai.azure.com/"
os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] = "text-embedding-ada-002"
os.environ["AZURE_OPENAI_API_VERSION"] = "2024-08-01-preview"

# Embedding model used to vectorize queries before searching Pinecone.
embeddings_model = AzureOpenAIEmbeddings(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)

# Pre-chunked source documents; expected columns include
# ['Title', 'Chunk', 'ChunkText', 'TokenCount', 'PageNumber', 'ChunkID'].
df_chunks = pd.read_pickle('Chunks_Complete.pkl')


def process_search_results(search_results):
    """
    Processes search results to extract and organize metadata and other details.

    :param search_results: List of search result matches from Pinecone.
    :return: A list of dictionaries containing relevant metadata and scores.
    """
    return [
        {
            "id": result['id'],
            "score": result['score'],
            # Metadata fields default to '' when absent from the match.
            "Title": result['metadata'].get('Title', ''),
            "ChunkText": result['metadata'].get('ChunkText', ''),
            "PageNumber": result['metadata'].get('PageNumber', ''),
            "Chunk": result['metadata'].get('Chunk', ''),
        }
        for result in search_results
    ]


def reconstruct_text_from_chunks(df_chunks):
    """
    Reconstructs a single string of text from the chunks in the DataFrame.

    :param df_chunks: DataFrame with columns
        ['Title', 'Chunk', 'ChunkText', 'TokenCount', 'PageNumber', 'ChunkID'].
    :return: A string combining all chunk texts in 'Chunk' order.
    """
    return " ".join(df_chunks.sort_values(by=['Chunk'])['ChunkText'].tolist())


def lookup_related_chunks(df_chunks, chunk_id, pagesReturned):
    """
    Returns all chunks sharing the title of the specified chunk whose page
    number falls within +/- pagesReturned of that chunk's page, clipped to the
    pages that actually exist for the document.

    :param df_chunks: DataFrame with columns
        ['Title', 'Chunk', 'ChunkText', 'TokenCount', 'PageNumber', 'ChunkID'].
    :param chunk_id: The unique ID of the chunk to look up.
    :param pagesReturned: Number of pages either side of the target page to
        include (must be a non-negative integer).
    :return: DataFrame with all chunks matching the title and page range.
    :raises ValueError: If chunk_id is not present in df_chunks.
    """
    target_chunk = df_chunks[df_chunks['ChunkID'] == chunk_id]
    if target_chunk.empty:
        raise ValueError("Chunk ID not found")

    title = target_chunk.iloc[0]['Title']
    page_number = target_chunk.iloc[0]['PageNumber']

    # Determine the valid page range for this document.
    same_title = df_chunks[df_chunks['Title'] == title]
    min_page = same_title['PageNumber'].min()
    max_page = same_title['PageNumber'].max()

    # Build the full inclusive window around the target page. The previous
    # implementation listed only the two endpoint pages (page +/- pagesReturned)
    # plus the center, silently skipping intermediate pages for pagesReturned > 1.
    page_range = [
        page_number + offset
        for offset in range(-pagesReturned, pagesReturned + 1)
        if min_page <= page_number + offset <= max_page
    ]

    return df_chunks[
        (df_chunks['Title'] == title) & (df_chunks['PageNumber'].isin(page_range))
    ]


def search_and_reconstruct(query, k, pagesReturned):
    """
    Combines search, lookup of related chunks, and text reconstruction.

    :param query: The query string to search for.
    :param k: Number of top search results to retrieve from Pinecone.
    :param pagesReturned: Page window passed through to lookup_related_chunks.
    :return: A list of dictionaries with document title, score, page number,
             and reconstructed text.
    """
    search_results = search_knowledgebase(query, k)
    processed_results = process_search_results(search_results)

    reconstructed_results = []
    for result in processed_results:
        chunk_id = result['id']
        related_chunks = lookup_related_chunks(df_chunks, chunk_id, pagesReturned)
        reconstructed_text = reconstruct_text_from_chunks(related_chunks)
        reconstructed_results.append({
            "Title": result['Title'],
            "Score": result['score'],
            "PageNumber": result['PageNumber'],
            "ReconstructedText": reconstructed_text,
        })
    return reconstructed_results


def search_knowledgebase(query, k):
    """
    Embeds a query string and searches the vector database for similar entries.

    :param query: The string to embed and search for.
    :param k: Number of top results to retrieve.
    :return: List of search results with metadata and scores, or an empty
             list if the embedding or search step fails.
    """
    # NOTE(review): the namespace is fixed here rather than parameterized.
    namespace = "gskRegIntel"
    try:
        # Generate embedding for the query.
        query_embedding = embeddings_model.embed_query(query)
        # Perform search in Pinecone.
        results = index.query(
            vector=query_embedding,
            top_k=k,
            namespace=namespace,
            include_metadata=True,
        )
        return results.matches
    except Exception as e:
        # Best-effort: report the failure and return no matches.
        print(f"Error during search: {e}")
        return []