# Retriever function
from pinecone import Pinecone
from langchain_openai import AzureOpenAIEmbeddings
import uuid
import pandas as pd
import streamlit as st
import os
# Initialize Pinecone client
# The API key is read from Streamlit secrets (key name "retrival_key" — sic,
# matches the deployed secret). "openai-serverless" is the index queried by
# search_knowledgebase below.
pc = Pinecone(api_key=st.secrets["retrival_key"])
index = pc.Index("openai-serverless")
def read_file(file):
    """
    Reads the content of a markdown asset file and returns it as a string.

    :param file: Base name of the file (without extension) inside the assets/ directory.
    :return: The file content as a string, or None if the file was missing or unreadable.
    """
    fp = f"assets/{file}.md"
    try:
        # Use a distinct handle name; the original shadowed the `file` parameter here.
        with open(fp, 'r', encoding='utf-8') as fh:
            return fh.read()
    except FileNotFoundError:
        print(f"The file at {fp} was not found.")
    except IOError:
        print(f"An error occurred while reading the file at {fp}.")
# Azure OpenAI configuration
# NOTE(review): endpoint, deployment name, and API version are hard-coded;
# only the API key comes from Streamlit secrets.
os.environ["AZURE_OPENAI_API_KEY"] = st.secrets["azure_api_key"]
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://davidfearn-gpt4.openai.azure.com/"
os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] = "text-embedding-ada-002"
os.environ["AZURE_OPENAI_API_VERSION"] = "2024-08-01-preview"
# Model configuration
# Embedding model used to vectorise queries before searching Pinecone.
embeddings_model = AzureOpenAIEmbeddings(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)
# Pre-chunked document corpus loaded at import time; expected columns include
# ['Title', 'Chunk', 'ChunkText', 'TokenCount', 'PageNumber', 'ChunkID'].
df_chunks = pd.read_pickle('Chunks_Complete.pkl')
def process_search_results(search_results):
    """
    Flattens Pinecone search matches into plain dictionaries.

    :param search_results: List of search result matches from Pinecone.
    :return: A list of dictionaries containing relevant metadata and scores.
    """
    def _flatten(match):
        # Missing metadata fields default to empty strings.
        meta = match['metadata']
        return {
            "id": match['id'],
            "score": match['score'],
            "Title": meta.get('Title', ''),
            "ChunkText": meta.get('ChunkText', ''),
            "PageNumber": meta.get('PageNumber', ''),
            "Chunk": meta.get('Chunk', ''),
        }

    return [_flatten(match) for match in search_results]
def reconstruct_text_from_chunks(df_chunks):
    """
    Rebuilds a single text string from chunk rows, in chunk order.

    :param df_chunks: DataFrame with at least ['Chunk', 'ChunkText'] columns.
    :return: A space-separated string combining all chunk texts in 'Chunk' order.
    """
    ordered = df_chunks.sort_values(by=['Chunk'])
    return " ".join(ordered['ChunkText'])
def lookup_related_chunks(df_chunks, chunk_id, pagesReturned):
    """
    Returns all chunks sharing the title of the specified chunk whose page
    number falls within +/- pagesReturned of that chunk's page, clamped to
    the pages that actually exist for that title.

    :param df_chunks: DataFrame with columns ['Title', 'Chunk', 'ChunkText', 'TokenCount', 'PageNumber', 'ChunkID']
    :param chunk_id: The unique ID of the chunk to look up.
    :param pagesReturned: Number of neighbouring pages to include on each side.
    :return: DataFrame with all chunks matching the title and page range of the specified chunk ID.
    :raises ValueError: If chunk_id is not present in df_chunks.
    """
    target_chunk = df_chunks[df_chunks['ChunkID'] == chunk_id]
    if target_chunk.empty:
        raise ValueError("Chunk ID not found")

    title = target_chunk.iloc[0]['Title']
    page_number = target_chunk.iloc[0]['PageNumber']

    # Clamp the window to pages that exist for this document.
    same_title = df_chunks[df_chunks['Title'] == title]
    min_page = same_title['PageNumber'].min()
    max_page = same_title['PageNumber'].max()

    # Bug fix: include EVERY page in the window. The original only listed the
    # endpoints [p - n, p, p + n], silently skipping intermediate pages
    # whenever pagesReturned > 1.
    page_range = [
        page
        for page in range(page_number - pagesReturned, page_number + pagesReturned + 1)
        if min_page <= page <= max_page
    ]
    return same_title[same_title['PageNumber'].isin(page_range)]
def search_and_reconstruct(query, k, pagesReturned):
    """
    Combines search, lookup of related chunks, and text reconstruction.

    :param query: The query string to search for.
    :param k: Number of top search results to retrieve.
    :param pagesReturned: Number of neighbouring pages to pull in around each hit.
    :return: A list of dictionaries with document title, score, page number,
             and the reconstructed text around each matching chunk.
    """
    matches = process_search_results(search_knowledgebase(query, k))

    reconstructed = []
    for match in matches:
        # Expand each hit to its surrounding pages, then stitch the text back together.
        neighbours = lookup_related_chunks(df_chunks, match['id'], pagesReturned)
        reconstructed.append({
            "Title": match['Title'],
            "Score": match['score'],
            "PageNumber": match['PageNumber'],
            "ReconstructedText": reconstruct_text_from_chunks(neighbours),
        })
    return reconstructed
def search_knowledgebase(query, k, namespace="gskRegIntel"):
    """
    Embeds a query string and searches the vector database for similar entries.

    :param query: The string to embed and search for.
    :param k: Number of top results to retrieve.
    :param namespace: Pinecone namespace to search within (defaults to "gskRegIntel",
                      previously hard-coded).
    :return: List of search results with metadata and scores; empty list on error.
    """
    # Fix: the original placed the docstring AFTER the namespace assignment,
    # so it was a no-op string expression rather than a docstring. The
    # namespace is now a backward-compatible keyword parameter instead.
    try:
        # Generate embedding for the query
        query_embedding = embeddings_model.embed_query(query)
        # Perform search in Pinecone
        results = index.query(vector=query_embedding, top_k=k, namespace=namespace, include_metadata=True)
        return results.matches
    except Exception as e:
        # Best-effort: report and return no matches rather than crashing the UI.
        print(f"Error during search: {e}")
        return []