import os

import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS  # NOTE(review): deprecated path; newer installs use langchain_community.vectorstores
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Root directory (Hugging Face doesn't allow separate dataset folders)
DATASET_DIR = "."
FAISS_INDEX_PATH = "financial_faiss_index"


def get_pdf_text(pdf_files):
    """Extract and concatenate the text of every page in the given PDFs.

    Args:
        pdf_files: Iterable of PDF file paths (or file-like objects
            accepted by ``pdfplumber.open``).

    Returns:
        The concatenated page text, stripped of leading/trailing
        whitespace. Pages with no extractable text layer contribute
        an empty string instead of raising a TypeError.
    """
    pages = []
    for pdf in pdf_files:
        with pdfplumber.open(pdf) as reader:
            for page in reader.pages:
                # extract_text() returns None for image-only pages — coerce to ""
                pages.append(page.extract_text() or "")
    # Single join instead of quadratic `text +=` accumulation.
    return "".join(pages).strip()


def preprocess_and_store_embeddings(api_key):
    """Embed all PDFs in DATASET_DIR and persist a FAISS index.

    Args:
        api_key: Google API key for the Generative AI embedding model.

    Returns:
        True when an index was built and saved; False when no PDFs were
        found or none of them yielded any text.
    """
    pdf_paths = [
        os.path.join(DATASET_DIR, file)
        for file in os.listdir(DATASET_DIR)
        if file.endswith(".pdf")
    ]
    # Join with a separator (rather than appending one per file) so that
    # PDFs with no extractable text leave financial_text empty and the
    # guard below fires instead of embedding whitespace-only chunks.
    financial_text = "\n\n".join(get_pdf_text([path]) for path in pdf_paths)

    if not financial_text:
        print("No financial documents found. Please upload PDFs.")
        return False

    # Large chunks with generous overlap keep related financial context together.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    text_chunks = text_splitter.split_text(financial_text)

    embeddings = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001", google_api_key=api_key
    )
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)

    # Save FAISS index in root folder so the serving app can reload it.
    vector_store.save_local(FAISS_INDEX_PATH)
    print("✅ FAISS index saved successfully!")
    return True


if __name__ == "__main__":
    api_key = os.getenv("GOOGLE_API_KEY")
    if api_key:
        preprocess_and_store_embeddings(api_key)
    else:
        print("❌ Google API Key not found. Please provide a valid key.")