# GSKRAGDemoReplic / retriver.py
# (last updated by davidfearne, commit 48db34e, verified)
# Retriever function
from pinecone import Pinecone
from langchain_openai import AzureOpenAIEmbeddings
import uuid
import pandas as pd
import streamlit as st
import os
# Initialize Pinecone client
# NOTE(review): reads the "retrival_key" entry from Streamlit secrets — the
# key name looks like a typo of "retrieval_key"; confirm before renaming.
pc = Pinecone(api_key=st.secrets["retrival_key"])
# Handle to the pre-existing serverless index that stores the document chunks.
index = pc.Index("openai-serverless")
def read_file(file):
    """
    Reads a markdown asset file and returns its content as a string.

    :param file: Base name of the file (without extension); the path read
        is ``assets/{file}.md``.
    :return: The content of the file as a string, or ``None`` if the file
        is missing or unreadable (an error message is printed in that case).
    """
    fp = f"assets/{file}.md"
    try:
        # Use a distinct handle name so the `file` parameter isn't shadowed.
        with open(fp, 'r', encoding='utf-8') as fh:
            return fh.read()
    except FileNotFoundError:
        print(f"The file at {fp} was not found.")
        return None
    except IOError:
        print(f"An error occurred while reading the file at {fp}.")
        return None
# Azure OpenAI configuration
# The API key comes from Streamlit secrets; endpoint, embedding deployment,
# and API version are fixed for this demo. Exported as environment variables
# so the AzureOpenAIEmbeddings client below can pick them up.
os.environ["AZURE_OPENAI_API_KEY"] = st.secrets["azure_api_key"]
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://davidfearn-gpt4.openai.azure.com/"
os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] = "text-embedding-ada-002"
os.environ["AZURE_OPENAI_API_VERSION"] = "2024-08-01-preview"
# Model configuration
# Embedding client used to vectorize queries before searching Pinecone.
embeddings_model = AzureOpenAIEmbeddings(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)
# Pre-chunked corpus used to rebuild surrounding context around search hits.
# Expected columns: ['Title', 'Chunk', 'ChunkText', 'TokenCount',
# 'PageNumber', 'ChunkID'] — per the function docstrings in this file.
df_chunks = pd.read_pickle('Chunks_Complete.pkl')
def process_search_results(search_results):
    """
    Processes search results to extract and organize metadata and other details.

    :param search_results: List of search result matches from Pinecone, each
        exposing ``id``, ``score`` and a ``metadata`` mapping.
    :return: A list of dictionaries containing relevant metadata and scores;
        missing metadata fields default to empty strings.
    """
    # Flatten each Pinecone match into a plain dict (comprehension instead
    # of a manual append loop).
    return [
        {
            "id": result['id'],
            "score": result['score'],
            "Title": result['metadata'].get('Title', ''),
            "ChunkText": result['metadata'].get('ChunkText', ''),
            "PageNumber": result['metadata'].get('PageNumber', ''),
            "Chunk": result['metadata'].get('Chunk', ''),
        }
        for result in search_results
    ]
def reconstruct_text_from_chunks(df_chunks):
    """
    Builds one continuous text string from the chunk rows of a DataFrame.

    :param df_chunks: DataFrame with columns ['Title', 'Chunk', 'ChunkText',
        'TokenCount', 'PageNumber', 'ChunkID']
    :return: The chunk texts joined with single spaces, in ascending
        'Chunk' order.
    """
    # Order rows by chunk index first, then join their texts.
    ordered = df_chunks.sort_values(by=['Chunk'])
    texts = ordered['ChunkText'].tolist()
    return " ".join(texts)
def lookup_related_chunks(df_chunks, chunk_id, pagesReturned):
    """
    Returns all chunks sharing the title of the specified chunk whose page
    number lies within ``pagesReturned`` pages of that chunk's page, clamped
    to the pages that actually exist for the document.

    :param df_chunks: DataFrame with columns ['Title', 'Chunk', 'ChunkText',
        'TokenCount', 'PageNumber', 'ChunkID']
    :param chunk_id: The unique ID of the chunk to look up.
    :param pagesReturned: Number of neighbouring pages to include on each
        side of the target page.
    :return: DataFrame with all chunks matching the title and page range of
        the specified chunk ID.
    :raises ValueError: If ``chunk_id`` is not present in ``df_chunks``.
    """
    target_chunk = df_chunks[df_chunks['ChunkID'] == chunk_id]
    if target_chunk.empty:
        raise ValueError("Chunk ID not found")

    title = target_chunk.iloc[0]['Title']
    page_number = target_chunk.iloc[0]['PageNumber']

    # Restrict once to the same document, then derive its valid page bounds.
    same_title = df_chunks[df_chunks['Title'] == title]
    min_page = same_title['PageNumber'].min()
    max_page = same_title['PageNumber'].max()

    # BUG FIX: the original built the candidate list from only the two
    # endpoint pages (page - pagesReturned, page, page + pagesReturned),
    # silently skipping every page in between whenever pagesReturned > 1.
    # Include the whole inclusive window instead.
    page_range = [
        page
        for page in range(page_number - pagesReturned, page_number + pagesReturned + 1)
        if min_page <= page <= max_page
    ]
    return same_title[same_title['PageNumber'].isin(page_range)]
def search_and_reconstruct(query, k, pagesReturned):
    """
    Runs a knowledge-base search and rebuilds the surrounding context for
    each hit.

    :param query: The query string to search for.
    :param k: Number of top search results to retrieve.
    :param pagesReturned: Number of neighbouring pages to include on each
        side of every hit.
    :return: A list of dictionaries with document title, score, page number,
        and reconstructed text.
    """
    # Search Pinecone, then flatten the raw matches into plain dicts.
    hits = process_search_results(search_knowledgebase(query, k))

    reconstructed = []
    for hit in hits:
        # Pull in the hit's neighbouring chunks and stitch them back into
        # one continuous passage.
        neighbours = lookup_related_chunks(df_chunks, hit['id'], pagesReturned)
        passage = reconstruct_text_from_chunks(neighbours)
        reconstructed.append({
            "Title": hit['Title'],
            "Score": hit['score'],
            "PageNumber": hit['PageNumber'],
            "ReconstructedText": passage,
        })
    return reconstructed
def search_knowledgebase(query, k, namespace="gskRegIntel"):
    """
    Embeds a query string and searches the vector database for similar entries.

    :param query: The string to embed and search for.
    :param k: Number of top results to retrieve.
    :param namespace: Pinecone namespace to search within (defaults to the
        demo's "gskRegIntel" namespace).
    :return: List of search result matches with metadata and scores, or an
        empty list if the search fails (the error is printed).
    """
    # NOTE: in the original, the docstring sat AFTER the `namespace=`
    # assignment, so Python never treated it as a docstring; it is now first,
    # and the hard-coded namespace is a backward-compatible keyword argument.
    try:
        # Generate embedding for the query
        query_embedding = embeddings_model.embed_query(query)
        # Perform search in Pinecone
        results = index.query(vector=query_embedding, top_k=k, namespace=namespace, include_metadata=True)
        return results.matches
    except Exception as e:
        # Best-effort: log and degrade to an empty result set so the calling
        # UI does not crash on transient service errors.
        print(f"Error during search: {e}")
        return []