# GSKRAGDemoReplic / retriver.py
# (last updated by davidfearne, commit 48db34e, verified)
# Retriever function
from pinecone import Pinecone
from langchain_openai import AzureOpenAIEmbeddings
import uuid
import pandas as pd
import streamlit as st
import os
# Initialize Pinecone client
# NOTE(review): reads the "retrival_key" entry from Streamlit secrets — the
# key name looks like a typo of "retrieval_key"; confirm before renaming.
pc = Pinecone(api_key=st.secrets["retrival_key"])
# Handle to the pre-existing serverless index that stores the document chunks.
index = pc.Index("openai-serverless")
def read_file(file):
    """
    Reads a markdown asset file and returns its content as a string.

    :param file: Base name of the file (without extension); the path read
        is ``assets/{file}.md``.
    :return: The content of the file as a string, or ``None`` if the file
        is missing or unreadable (an error message is printed in that case).
    """
    fp = f"assets/{file}.md"
    try:
        # Use a distinct handle name so the `file` parameter isn't shadowed.
        with open(fp, 'r', encoding='utf-8') as fh:
            return fh.read()
    except FileNotFoundError:
        print(f"The file at {fp} was not found.")
        return None
    except IOError:
        print(f"An error occurred while reading the file at {fp}.")
        return None
# Azure OpenAI configuration
# The API key comes from Streamlit secrets; endpoint, embedding deployment,
# and API version are fixed for this demo. Exported as environment variables
# so the AzureOpenAIEmbeddings client below can pick them up.
os.environ["AZURE_OPENAI_API_KEY"] = st.secrets["azure_api_key"]
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://davidfearn-gpt4.openai.azure.com/"
os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] = "text-embedding-ada-002"
os.environ["AZURE_OPENAI_API_VERSION"] = "2024-08-01-preview"
# Model configuration
# Embedding client used to vectorize queries before searching Pinecone.
embeddings_model = AzureOpenAIEmbeddings(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)
# Pre-chunked corpus used to rebuild surrounding context around search hits.
# Expected columns: ['Title', 'Chunk', 'ChunkText', 'TokenCount',
# 'PageNumber', 'ChunkID'] — per the function docstrings in this file.
df_chunks = pd.read_pickle('Chunks_Complete.pkl')
def process_search_results(search_results):
    """
    Processes search results to extract and organize metadata and other details.

    :param search_results: List of search result matches from Pinecone, each
        exposing ``id``, ``score`` and a ``metadata`` mapping.
    :return: A list of dictionaries containing relevant metadata and scores;
        missing metadata fields default to empty strings.
    """
    # Flatten each Pinecone match into a plain dict (comprehension instead
    # of a manual append loop).
    return [
        {
            "id": result['id'],
            "score": result['score'],
            "Title": result['metadata'].get('Title', ''),
            "ChunkText": result['metadata'].get('ChunkText', ''),
            "PageNumber": result['metadata'].get('PageNumber', ''),
            "Chunk": result['metadata'].get('Chunk', ''),
        }
        for result in search_results
    ]
def reconstruct_text_from_chunks(df_chunks):
    """
    Builds one continuous text string from the chunk rows of a DataFrame.

    :param df_chunks: DataFrame with columns ['Title', 'Chunk', 'ChunkText',
        'TokenCount', 'PageNumber', 'ChunkID']
    :return: The chunk texts joined with single spaces, in ascending
        'Chunk' order.
    """
    # Order rows by chunk index first, then join their texts.
    ordered = df_chunks.sort_values(by=['Chunk'])
    texts = ordered['ChunkText'].tolist()
    return " ".join(texts)
def lookup_related_chunks(df_chunks, chunk_id, pagesReturned):
    """
    Returns all chunks sharing the title of the specified chunk whose page
    number lies within ``pagesReturned`` pages of that chunk's page, clamped
    to the pages that actually exist for the document.

    :param df_chunks: DataFrame with columns ['Title', 'Chunk', 'ChunkText',
        'TokenCount', 'PageNumber', 'ChunkID']
    :param chunk_id: The unique ID of the chunk to look up.
    :param pagesReturned: Number of neighbouring pages to include on each
        side of the target page.
    :return: DataFrame with all chunks matching the title and page range of
        the specified chunk ID.
    :raises ValueError: If ``chunk_id`` is not present in ``df_chunks``.
    """
    target_chunk = df_chunks[df_chunks['ChunkID'] == chunk_id]
    if target_chunk.empty:
        raise ValueError("Chunk ID not found")

    title = target_chunk.iloc[0]['Title']
    page_number = target_chunk.iloc[0]['PageNumber']

    # Restrict once to the same document, then derive its valid page bounds.
    same_title = df_chunks[df_chunks['Title'] == title]
    min_page = same_title['PageNumber'].min()
    max_page = same_title['PageNumber'].max()

    # BUG FIX: the original built the candidate list from only the two
    # endpoint pages (page - pagesReturned, page, page + pagesReturned),
    # silently skipping every page in between whenever pagesReturned > 1.
    # Include the whole inclusive window instead.
    page_range = [
        page
        for page in range(page_number - pagesReturned, page_number + pagesReturned + 1)
        if min_page <= page <= max_page
    ]
    return same_title[same_title['PageNumber'].isin(page_range)]
def search_and_reconstruct(query, k, pagesReturned):
    """
    Runs a knowledge-base search and rebuilds the surrounding context for
    each hit.

    :param query: The query string to search for.
    :param k: Number of top search results to retrieve.
    :param pagesReturned: Number of neighbouring pages to include on each
        side of every hit.
    :return: A list of dictionaries with document title, score, page number,
        and reconstructed text.
    """
    # Search Pinecone, then flatten the raw matches into plain dicts.
    hits = process_search_results(search_knowledgebase(query, k))

    reconstructed = []
    for hit in hits:
        # Pull in the hit's neighbouring chunks and stitch them back into
        # one continuous passage.
        neighbours = lookup_related_chunks(df_chunks, hit['id'], pagesReturned)
        passage = reconstruct_text_from_chunks(neighbours)
        reconstructed.append({
            "Title": hit['Title'],
            "Score": hit['score'],
            "PageNumber": hit['PageNumber'],
            "ReconstructedText": passage,
        })
    return reconstructed
def search_knowledgebase(query, k, namespace="gskRegIntel"):
    """
    Embeds a query string and searches the vector database for similar entries.

    :param query: The string to embed and search for.
    :param k: Number of top results to retrieve.
    :param namespace: Pinecone namespace to search within (defaults to the
        demo's "gskRegIntel" namespace).
    :return: List of search result matches with metadata and scores, or an
        empty list if the search fails (the error is printed).
    """
    # NOTE: in the original, the docstring sat AFTER the `namespace=`
    # assignment, so Python never treated it as a docstring; it is now first,
    # and the hard-coded namespace is a backward-compatible keyword argument.
    try:
        # Generate embedding for the query
        query_embedding = embeddings_model.embed_query(query)
        # Perform search in Pinecone
        results = index.query(vector=query_embedding, top_k=k, namespace=namespace, include_metadata=True)
        return results.matches
    except Exception as e:
        # Best-effort: log and degrade to an empty result set so the calling
        # UI does not crash on transient service errors.
        print(f"Error during search: {e}")
        return []