# NOTE: the lines below ("Spaces: Sleeping") are Hugging Face Space status-banner
# residue captured when this file was scraped from the web page; kept as a
# comment so the file remains valid Python.
import os
from pathlib import Path

import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# Directory scanned for source PDFs.
DATASET_DIR = "."  # Root directory (Hugging Face doesn't allow separate dataset folders)
# Path (relative to the working directory) where the FAISS index is persisted.
FAISS_INDEX_PATH = "financial_faiss_index"
def get_pdf_text(pdf_files):
    """Extract and concatenate the text of every page in *pdf_files*.

    Args:
        pdf_files: Iterable of PDF paths (str or path-like).

    Returns:
        str: All page text concatenated in page order, stripped of
        leading/trailing whitespace. Pages with no extractable text
        (e.g. image-only scans) contribute nothing.
    """
    pages = []
    for pdf in pdf_files:
        with pdfplumber.open(pdf) as reader:
            for page in reader.pages:
                # extract_text() returns None for pages without a text layer.
                pages.append(page.extract_text() or "")
    # Build once with join() instead of repeated `text +=`, which is
    # quadratic for large documents.
    return "".join(pages).strip()
def preprocess_and_store_embeddings(api_key):
    """Build and persist a FAISS index over all PDFs in DATASET_DIR.

    Extracts text from every ``*.pdf`` file (matched case-insensitively,
    so ``.PDF`` also works) in the dataset directory, splits the combined
    text into overlapping chunks, embeds the chunks with Google's
    ``embedding-001`` model, and saves the resulting FAISS index to
    FAISS_INDEX_PATH.

    Args:
        api_key: Google API key used by the embedding model.

    Returns:
        bool: True if an index was built and saved; False if no PDF text
        was found (a message is printed in that case).
    """
    # Collect one text string per document. Sorting the directory listing
    # makes the chunk order (and therefore the index) deterministic —
    # os.listdir()'s order is arbitrary.
    documents = []
    for entry in sorted(Path(DATASET_DIR).iterdir()):
        if entry.suffix.lower() == ".pdf":
            text = get_pdf_text([entry])
            if text:  # skip documents with no extractable text
                documents.append(text)

    # join() instead of repeated `+=` avoids quadratic string building and,
    # unlike the separator-appending loop, yields "" (falsy) when every
    # document was empty.
    financial_text = "\n\n".join(documents)
    if not financial_text:
        print("No financial documents found. Please upload PDFs.")
        return False

    # Large chunks with generous overlap preserve context across boundaries.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    text_chunks = text_splitter.split_text(financial_text)

    # Embed the chunks and build the vector store.
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)

    # Persist next to the script so the serving app can reload it at startup.
    vector_store.save_local(FAISS_INDEX_PATH)
    print("✅ FAISS index saved successfully!")
    return True
if __name__ == "__main__":
    # Entry point: read the Google API key from the environment and, if
    # present, build the FAISS index over the PDFs in the root directory.
    key = os.getenv("GOOGLE_API_KEY")
    if not key:
        print("❌ Google API Key not found. Please provide a valid key.")
    else:
        preprocess_and_store_embeddings(key)