# NOTE(review): removed Hugging Face Spaces page header ("Spaces: Sleeping")
# that was captured by the scrape — it is not Python and broke parsing.
from helper import (
    extract_text_from_pdf,
    chunk_text,
    embedding_function,
    embedding_model,
    generate_hypothetical_answer,
    query_llm_with_context,
)
import numpy as np
import faiss
import pickle
import os
import logging

logging.basicConfig(level=logging.INFO)

# Paths for persisting the FAISS index and the pickled document chunks.
index_path = "./faiss_index"
chunks_path = "./document_chunks.pkl"
# Raw string: the original plain literal relied on \G, \A, \I not being
# escape sequences, which emits a SyntaxWarning on modern Python and
# corrupts the path silently if a segment ever starts with t, n, etc.
pdf_path = r'C:\Git Projects\AnnualReport_rag\IBM.pdf'

print('Extracting text from pdf...')
pdf_text = extract_text_from_pdf(pdf_path)

print('Chunking pdf...')
chunks = chunk_text(pdf_text, chunk_size=1000, chunk_overlap=100)

print('Embedding chunks...')
embeddings = embedding_function(chunks)
print(f"Embeddings type: {type(embeddings)}")
print(f"First embedding type: {type(embeddings[0])}")
print(f"First embedding shape or length: {len(embeddings[0]) if hasattr(embeddings[0], '__len__') else 'unknown'}")

# FAISS requires a contiguous float32 matrix; convert if the embedding
# helper returned a plain list (or a list of lists).
if not isinstance(embeddings, np.ndarray):
    print("Converting embeddings to numpy array...")
    embeddings = np.array(embeddings).astype('float32')

# Vector dimension drives the index geometry.
dimension = embeddings.shape[1]
print(f"Embedding dimension: {dimension}")

print('Initializing FAISS index...')
index = faiss.IndexFlatL2(dimension)  # exact L2 (Euclidean) nearest-neighbor search

print('Adding vectors to FAISS index...')
index.add(embeddings)

print('Saving FAISS index...')
faiss.write_index(index, index_path)

# Persist the raw chunks so row i of the index maps back to chunks[i].
print('Saving document chunks...')
with open(chunks_path, 'wb') as f:
    pickle.dump(chunks, f)

print(f"Total vectors in index: {index.ntotal}")
def retrieve_documents(query, n_results=5):
    """Return the top *n_results* chunks most similar to *query*.

    Uses the module-level ``embedding_model``, ``index`` (FAISS IndexFlatL2)
    and ``chunks`` built above.

    Args:
        query: Free-text query string.
        n_results: Number of chunks to retrieve (default 5).

    Returns:
        Tuple ``(documents, similarity_scores)`` where ``documents`` is a
        list of chunk strings and ``similarity_scores`` are floats in
        [0, 1], 1 being the closest match in this result set.
    """
    # Embed the query with the same model used for the corpus.
    query_embedding = embedding_model.encode([query], convert_to_numpy=True).astype('float32')
    distances, indices = index.search(query_embedding, n_results)
    documents = [chunks[i] for i in indices[0]]
    # L2 distance: lower is better. Normalize to [0, 1] where 1 = most similar.
    max_distance = float(np.max(distances))
    if max_distance == 0.0:
        # All hits are exact matches — avoid division by zero (original bug).
        similarity_scores = [1.0] * len(documents)
    else:
        similarity_scores = [1 - (dist / max_distance) for dist in distances[0]]
    return documents, similarity_scores
# Test the retrieval end-to-end.
query = "how has the profitability of the company been in last five years"
print('Retrieving documents...')
general_docs, general_scores = retrieve_documents(query, n_results=15)
print(f"Number of docs returned for general query: {len(general_docs)}")

# Print the results
# for i, (doc, score) in enumerate(zip(general_docs, general_scores)):
#     print(f"\nResult {i+1} (Score: {score:.4f}):")
#     print(f"{doc[:200]}...")

# HyDE-style retrieval: append a hypothetical answer to the query so the
# embedding lands closer to answer-shaped chunks.
new_query = query + generate_hypothetical_answer(query)
# Bug fix: retrieve_documents returns (documents, scores); the original
# passed the whole tuple as context. Pass only the document list.
# NOTE(review): assumes query_llm_with_context expects a list of chunk
# strings — confirm against helper.py.
combined_context, _combined_scores = retrieve_documents(new_query, n_results=15)
answer = query_llm_with_context(query, combined_context, top_n=3)
# Bug fix: original string lacked the f-prefix, printing the literal
# '{answer}' instead of the LLM response.
print(f'final_response:{answer}')