Spaces:
Sleeping
Sleeping
# Retriever function
from pinecone import Pinecone
from langchain_openai import AzureOpenAIEmbeddings
import uuid
import pandas as pd
import streamlit as st
import os

# Initialize Pinecone client with the API key held in Streamlit secrets,
# then open the serverless index that stores the document embeddings.
pc = Pinecone(api_key=st.secrets["retrival_key"])
index = pc.Index("openai-serverless")
def read_file(file):
    """
    Read a Markdown asset file and return its content as a string.

    :param file: Base name (without extension) of a file in the assets/ directory.
    :return: The file content as a string, or None if the file is missing
             or cannot be read.
    """
    fp = f"assets/{file}.md"
    try:
        # Distinct handle name so the `file` parameter is not shadowed
        # (the original rebound `file` to the open handle inside the with-block).
        with open(fp, 'r', encoding='utf-8') as fh:
            return fh.read()
    except FileNotFoundError:
        print(f"The file at {fp} was not found.")
    except IOError:
        print(f"An error occurred while reading the file at {fp}.")
    return None  # explicit: callers must handle a missing/unreadable file
# Azure OpenAI configuration — API key comes from Streamlit secrets;
# endpoint, deployment, and API version are fixed for this app.
os.environ["AZURE_OPENAI_API_KEY"] = st.secrets["azure_api_key"]
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://davidfearn-gpt4.openai.azure.com/"
os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] = "text-embedding-ada-002"
os.environ["AZURE_OPENAI_API_VERSION"] = "2024-08-01-preview"

# Model configuration: embedding model used to vectorise queries before
# they are searched against the Pinecone index.
embeddings_model = AzureOpenAIEmbeddings(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)

# Pre-chunked document corpus; expected columns include
# ['Title', 'Chunk', 'ChunkText', 'TokenCount', 'PageNumber', 'ChunkID']
# (see lookup_related_chunks below).
df_chunks = pd.read_pickle('Chunks_Complete.pkl')
def process_search_results(search_results):
    """
    Processes search results to extract and organize metadata and other details.

    :param search_results: List of search result matches from Pinecone.
    :return: A list of dictionaries containing relevant metadata and scores.
    """
    def _flatten(match):
        # Promote the interesting metadata fields to the top level of the dict.
        meta = match['metadata']
        return {
            "id": match['id'],
            "score": match['score'],
            "Title": meta.get('Title', ''),
            "ChunkText": meta.get('ChunkText', ''),
            "PageNumber": meta.get('PageNumber', ''),
            "Chunk": meta.get('Chunk', ''),
        }

    return [_flatten(match) for match in search_results]
def reconstruct_text_from_chunks(df_chunks):
    """
    Reconstructs a single string of text from the chunks in the DataFrame.

    :param df_chunks: DataFrame with columns ['Title', 'Chunk', 'ChunkText', 'TokenCount', 'PageNumber', 'ChunkID']
    :return: A string combining all chunk texts in order.
    """
    # Order by chunk index so the text reads in its original sequence.
    ordered = df_chunks.sort_values(by=['Chunk'])
    return " ".join(ordered['ChunkText'])
def lookup_related_chunks(df_chunks, chunk_id, pagesReturned):
    """
    Return all chunks that share the title of the given chunk and whose page
    number lies within `pagesReturned` pages of that chunk's page, clamped to
    the pages that actually exist for the document.

    :param df_chunks: DataFrame with columns ['Title', 'Chunk', 'ChunkText', 'TokenCount', 'PageNumber', 'ChunkID']
    :param chunk_id: The unique ID of the chunk to look up.
    :param pagesReturned: Number of context pages to include on each side of
                          the target page.
    :return: DataFrame with all chunks matching the title and page range.
    :raises ValueError: If chunk_id is not present in df_chunks.
    """
    target_chunk = df_chunks[df_chunks['ChunkID'] == chunk_id]
    if target_chunk.empty:
        raise ValueError("Chunk ID not found")

    title = target_chunk.iloc[0]['Title']
    page_number = target_chunk.iloc[0]['PageNumber']

    # Determine the valid page range for this document.
    same_title = df_chunks['Title'] == title
    min_page = df_chunks[same_title]['PageNumber'].min()
    max_page = df_chunks[same_title]['PageNumber'].max()

    # Bug fix: include every page in the window. The original built
    # [p - n, p, p + n], which skips the intermediate pages whenever
    # pagesReturned > 1 (identical behavior for pagesReturned == 1).
    page_range = [
        page
        for page in range(page_number - pagesReturned, page_number + pagesReturned + 1)
        if min_page <= page <= max_page
    ]
    return df_chunks[same_title & df_chunks['PageNumber'].isin(page_range)]
def search_and_reconstruct(query, k, pagesReturned):
    """
    Search the knowledgebase and rebuild surrounding page context for each hit.

    :param query: The query string to search for.
    :param k: Number of top search results to retrieve.
    :param pagesReturned: Number of neighbouring pages to include around each hit.
    :return: A list of dicts with Title, Score, PageNumber, and ReconstructedText.
    """
    matches = process_search_results(search_knowledgebase(query, k))
    output = []
    for match in matches:
        # Pull the pages surrounding this hit and stitch their text together.
        neighbours = lookup_related_chunks(df_chunks, match['id'], pagesReturned)
        output.append({
            "Title": match['Title'],
            "Score": match['score'],
            "PageNumber": match['PageNumber'],
            "ReconstructedText": reconstruct_text_from_chunks(neighbours),
        })
    return output
def search_knowledgebase(query, k, namespace="gskRegIntel"):
    """
    Embeds a query string and searches the vector database for similar entries.

    :param query: The string to embed and search for.
    :param k: Number of top results to retrieve.
    :param namespace: Pinecone namespace to search within (default "gskRegIntel";
                      previously hard-coded inside the function body, which also
                      left the docstring below a statement and thus dead).
    :return: List of search results with metadata and scores; empty list on error.
    """
    try:
        # Generate embedding for the query
        query_embedding = embeddings_model.embed_query(query)
        # Perform search in Pinecone
        results = index.query(vector=query_embedding, top_k=k, namespace=namespace, include_metadata=True)
        return results.matches
    except Exception as e:
        # Best-effort: report the failure and return an empty result set
        # rather than crashing the Streamlit app.
        print(f"Error during search: {e}")
        return []