File size: 1,888 Bytes
4a6432c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Directory that is scanned (non-recursively) for input PDF files.
DATASET_DIR = "."  # Root directory (Hugging Face doesn't allow separate dataset folders)
# Location handed to the vector store's save_local() when persisting the index.
FAISS_INDEX_PATH = "financial_faiss_index"

def get_pdf_text(pdf_files):
    """Extract the text of every page of the given PDFs as one string.

    Args:
        pdf_files: Iterable of PDF sources, each passed unchanged to
            ``pdfplumber.open``.

    Returns:
        The text of all pages, in iteration order, joined with newlines and
        stripped of leading/trailing whitespace. An empty string is returned
        when ``pdf_files`` is empty or no page yields any text.
    """
    page_texts = []
    for pdf in pdf_files:
        with pdfplumber.open(pdf) as reader:
            for page in reader.pages:
                # extract_text() may return None; skip such pages entirely.
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next, which would corrupt the text before chunking.
    return "\n".join(page_texts).strip()

def preprocess_and_store_embeddings(api_key):
    """Build a FAISS index from the PDFs in ``DATASET_DIR`` and save it.

    Text is extracted from every ``.pdf`` file (extension matched
    case-insensitively) directly inside ``DATASET_DIR``, split into
    overlapping chunks, embedded with Google Generative AI embeddings and
    stored at ``FAISS_INDEX_PATH``.

    Args:
        api_key: Google API key forwarded to ``GoogleGenerativeAIEmbeddings``.

    Returns:
        ``True`` once the index has been saved; ``False`` when no PDF in the
        directory produced any text (nothing is written in that case).
    """
    # Sort the listing so the document order, and therefore the chunk order,
    # is reproducible; os.listdir() returns entries in arbitrary order.
    pdf_paths = [
        os.path.join(DATASET_DIR, name)
        for name in sorted(os.listdir(DATASET_DIR))
        if name.lower().endswith(".pdf")
    ]

    # Keep only documents that actually produced text. Joining afterwards
    # means a folder full of text-less PDFs results in an empty string,
    # instead of a string of separators that would slip past the check below
    # and lead to an attempt to embed and index nothing.
    documents = [get_pdf_text([path]) for path in pdf_paths]
    financial_text = "\n\n".join(doc for doc in documents if doc)

    if not financial_text:
        print("No financial documents found. Please upload PDFs.")
        return False

    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    text_chunks = text_splitter.split_text(financial_text)

    # Generate embeddings
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)

    # Save FAISS index in root folder
    vector_store.save_local(FAISS_INDEX_PATH)
    print("✅ FAISS index saved successfully!")

    return True

if __name__ == "__main__":
    # Script entry point: the API key is read from the environment, and the
    # index is only built when a non-empty key is available.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        print("❌ Google API Key not found. Please provide a valid key.")
    else:
        preprocess_and_store_embeddings(api_key)