# NOTE: Hugging Face Spaces file-viewer residue removed from this copy
# (page header: "Spaces / Sleeping / File size: 1,888 Bytes").
import os
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# Paths used by the preprocessing pipeline.
DATASET_DIR = "." # Root directory (Hugging Face doesn't allow separate dataset folders)
# Directory name under which the FAISS index is persisted and later reloaded.
FAISS_INDEX_PATH = "financial_faiss_index"
def get_pdf_text(pdf_files):
    """Extract and concatenate the text of every page in *pdf_files*.

    Args:
        pdf_files: iterable of paths (or file-like objects) that
            ``pdfplumber.open`` accepts.

    Returns:
        All page text joined with newlines, stripped of surrounding
        whitespace. Pages with no extractable text are skipped.
    """
    pages = []
    for pdf in pdf_files:
        with pdfplumber.open(pdf) as reader:
            for page in reader.pages:
                # extract_text() may return None (e.g. image-only pages).
                page_text = page.extract_text()
                if page_text:
                    pages.append(page_text)
    # Join with "\n" so the last word of one page doesn't fuse with the
    # first word of the next (the original used bare +=, no separator);
    # a single join also avoids quadratic string concatenation.
    return "\n".join(pages).strip()
def preprocess_and_store_embeddings(api_key):
    """Extract text from financial PDFs, embed it, and save a FAISS index.

    Scans DATASET_DIR for PDF files, concatenates their text, splits it
    into overlapping chunks, embeds the chunks with Google Generative AI,
    and persists the resulting FAISS index at FAISS_INDEX_PATH.

    Args:
        api_key: Google Generative AI API key for the embedding model.

    Returns:
        True if an index was built and saved, False if no usable PDF
        text was found.
    """
    financial_text = ""
    # Sort for deterministic chunk order across runs, and lower-case the
    # suffix check so files named ".PDF" aren't silently skipped.
    for file in sorted(os.listdir(DATASET_DIR)):
        if file.lower().endswith(".pdf"):
            file_path = os.path.join(DATASET_DIR, file)
            financial_text += get_pdf_text([file_path]) + "\n\n"
    # strip() so whitespace-only accumulations (PDFs with no extractable
    # text leave bare "\n\n", which is truthy) don't reach FAISS with an
    # empty chunk list.
    if not financial_text.strip():
        print("No financial documents found. Please upload PDFs.")
        return False
    # Split text into overlapping chunks sized for the embedding model.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    text_chunks = text_splitter.split_text(financial_text)
    # Generate embeddings and build the vector store.
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    # Save FAISS index in the root folder.
    vector_store.save_local(FAISS_INDEX_PATH)
    print("✅ FAISS index saved successfully!")
    return True
if __name__ == "__main__":
    # Entry point: requires GOOGLE_API_KEY to be set in the environment.
    key = os.getenv("GOOGLE_API_KEY")
    if not key:
        print("❌ Google API Key not found. Please provide a valid key.")
    else:
        preprocess_and_store_embeddings(key)