# FinancialChatbot / data_loader.py
# Author: PranavRatnalikar — Hugging Face Space file (commit 4a6432c, verified)
# Extracts text from financial PDFs and persists a FAISS embedding index.
import os
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
DATASET_DIR = "." # Root directory (Hugging Face doesn't allow separate dataset folders)
FAISS_INDEX_PATH = "financial_faiss_index"  # Path passed to FAISS save_local() when persisting the index
def get_pdf_text(pdf_files):
    """Extract and concatenate the text of every page in the given PDFs.

    Args:
        pdf_files: Iterable of file paths (or file-like objects) that
            pdfplumber can open.

    Returns:
        All extracted page text joined with newlines and stripped of
        surrounding whitespace. The newline separator prevents the last
        word of one page from fusing with the first word of the next
        (the original code concatenated pages with no separator).
    """
    pages = []
    for pdf in pdf_files:
        with pdfplumber.open(pdf) as reader:
            for page in reader.pages:
                # extract_text() returns None for pages without a text
                # layer (e.g. scanned images) — substitute "".
                pages.append(page.extract_text() or "")
    # join() instead of repeated += avoids quadratic string building.
    return "\n".join(pages).strip()
def preprocess_and_store_embeddings(api_key):
    """Extract text from financial PDFs, embed it, and save a FAISS index.

    Scans DATASET_DIR for PDF files, extracts their text, splits it into
    overlapping chunks, embeds the chunks with Google's embedding model,
    and writes the resulting FAISS index to FAISS_INDEX_PATH.

    Args:
        api_key: Google API key for the Generative AI embedding model.

    Returns:
        True when an index was built and saved; False when no PDF text
        was found.
    """
    # Collect text from every PDF in the root directory. The
    # case-insensitive extension check means ".PDF" files are no longer
    # silently skipped; list + join avoids quadratic += accumulation.
    document_texts = []
    for file in os.listdir(DATASET_DIR):
        if file.lower().endswith(".pdf"):
            file_path = os.path.join(DATASET_DIR, file)
            document_texts.append(get_pdf_text([file_path]))
    financial_text = "\n\n".join(document_texts)

    if not financial_text:
        print("No financial documents found. Please upload PDFs.")
        return False

    # Large chunks with generous overlap keep related financial
    # statements together across chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    text_chunks = text_splitter.split_text(financial_text)

    # Generate embeddings and build the vector store.
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)

    # Save FAISS index in root folder.
    vector_store.save_local(FAISS_INDEX_PATH)
    print("✅ FAISS index saved successfully!")
    return True
if __name__ == "__main__":
    # Script entry point: build the index only when a Google API key
    # is present in the environment.
    google_key = os.getenv("GOOGLE_API_KEY")
    if not google_key:
        print("❌ Google API Key not found. Please provide a valid key.")
    else:
        preprocess_and_store_embeddings(google_key)