# FinancialChatbot / data_loader.py
# Author: PranavRatnalikar — Hugging Face Space file (commit 4a6432c, verified)
# Extracts text from financial PDFs and persists a FAISS embedding index.
import os
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
DATASET_DIR = "." # Root directory (Hugging Face doesn't allow separate dataset folders)
FAISS_INDEX_PATH = "financial_faiss_index"  # Path passed to FAISS save_local() when persisting the index
def get_pdf_text(pdf_files):
    """Extract and concatenate the text of every page in the given PDFs.

    Args:
        pdf_files: Iterable of file paths (or file-like objects) that
            pdfplumber can open.

    Returns:
        All extracted page text joined with newlines and stripped of
        surrounding whitespace. The newline separator prevents the last
        word of one page from fusing with the first word of the next
        (the original code concatenated pages with no separator).
    """
    pages = []
    for pdf in pdf_files:
        with pdfplumber.open(pdf) as reader:
            for page in reader.pages:
                # extract_text() returns None for pages without a text
                # layer (e.g. scanned images) — substitute "".
                pages.append(page.extract_text() or "")
    # join() instead of repeated += avoids quadratic string building.
    return "\n".join(pages).strip()
def preprocess_and_store_embeddings(api_key):
    """Extract text from financial PDFs, embed it, and save a FAISS index.

    Scans DATASET_DIR for PDF files, extracts their text, splits it into
    overlapping chunks, embeds the chunks with Google's embedding model,
    and writes the resulting FAISS index to FAISS_INDEX_PATH.

    Args:
        api_key: Google API key for the Generative AI embedding model.

    Returns:
        True when an index was built and saved; False when no PDF text
        was found.
    """
    # Collect text from every PDF in the root directory. The
    # case-insensitive extension check means ".PDF" files are no longer
    # silently skipped; list + join avoids quadratic += accumulation.
    document_texts = []
    for file in os.listdir(DATASET_DIR):
        if file.lower().endswith(".pdf"):
            file_path = os.path.join(DATASET_DIR, file)
            document_texts.append(get_pdf_text([file_path]))
    financial_text = "\n\n".join(document_texts)

    if not financial_text:
        print("No financial documents found. Please upload PDFs.")
        return False

    # Large chunks with generous overlap keep related financial
    # statements together across chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    text_chunks = text_splitter.split_text(financial_text)

    # Generate embeddings and build the vector store.
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)

    # Save FAISS index in root folder.
    vector_store.save_local(FAISS_INDEX_PATH)
    print("✅ FAISS index saved successfully!")
    return True
if __name__ == "__main__":
    # Script entry point: build the index only when a Google API key
    # is present in the environment.
    google_key = os.getenv("GOOGLE_API_KEY")
    if not google_key:
        print("❌ Google API Key not found. Please provide a valid key.")
    else:
        preprocess_and_store_embeddings(google_key)