PranavRatnalikar committed on
Commit
4a6432c
·
verified ·
1 Parent(s): af04e16

Update data_loader.py

Browse files
Files changed (1) hide show
  1. data_loader.py +52 -49
data_loader.py CHANGED
@@ -1,49 +1,52 @@
1
- import os
2
- import zipfile
3
- import pdfplumber
4
- from langchain.text_splitter import RecursiveCharacterTextSplitter
5
- from langchain_google_genai import GoogleGenerativeAIEmbeddings
6
- from langchain.vectorstores import FAISS
7
-
8
- DATASET_ZIP = "dataset.zip"
9
- FAISS_INDEX_FILE = "faiss_index"
10
-
11
- def extract_dataset():
12
- """Extract dataset.zip contents if not already extracted."""
13
- if os.path.exists(DATASET_ZIP):
14
- with zipfile.ZipFile(DATASET_ZIP, 'r') as zip_ref:
15
- zip_ref.extractall("./")
16
- print("✅ Dataset extracted!")
17
- else:
18
- print("⚠️ No dataset.zip found, ensure financial data is available.")
19
-
20
- def extract_text_from_pdfs():
21
- """Extract text from all PDFs in root directory."""
22
- text_data = ""
23
- for file in os.listdir("./"):
24
- if file.endswith(".pdf"):
25
- with pdfplumber.open(file) as pdf:
26
- for page in pdf.pages:
27
- text_data += page.extract_text() or ""
28
- return text_data
29
-
30
- def create_vector_store(api_key):
31
- """Create FAISS vector database from extracted text."""
32
- extract_dataset()
33
- text_data = extract_text_from_pdfs()
34
-
35
- if not text_data:
36
- print("⚠️ No valid text extracted.")
37
- return
38
-
39
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
40
- text_chunks = text_splitter.split_text(text_data)
41
-
42
- embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
43
- vector_db = FAISS.from_texts(text_chunks, embedding=embeddings)
44
- vector_db.save_local(FAISS_INDEX_FILE)
45
- print("✅ FAISS index created and saved!")
46
-
47
- if __name__ == "__main__":
48
- api_key = input("Enter Google API Key: ")
49
- create_vector_store(api_key)
 
 
 
 
1
+ import os
2
+ import pdfplumber
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain.vectorstores import FAISS
5
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
6
+
7
# Root directory scanned for PDFs (Hugging Face doesn't allow separate dataset folders).
DATASET_DIR = "."
# On-disk location where the serialized FAISS vector index is written/read.
FAISS_INDEX_PATH = "financial_faiss_index"
9
+
10
def get_pdf_text(pdf_files):
    """Extract and concatenate the text of every page in *pdf_files*.

    Pages with no extractable text (``extract_text()`` returns ``None``)
    contribute nothing. The combined text is returned with leading and
    trailing whitespace stripped.
    """
    pages = []
    for path in pdf_files:
        with pdfplumber.open(path) as doc:
            # extract_text() may return None for image-only pages.
            pages.extend(page.extract_text() or "" for page in doc.pages)
    return "".join(pages).strip()
18
+
19
def preprocess_and_store_embeddings(api_key):
    """Extract text from all PDFs in DATASET_DIR, embed it, and save a FAISS index.

    Args:
        api_key: Google API key used for the Gemini embedding model.

    Returns:
        True when an index was built and saved, False when no usable text
        was found in any PDF.
    """
    # Collect every PDF in the root directory up front.
    pdf_paths = [
        os.path.join(DATASET_DIR, name)
        for name in os.listdir(DATASET_DIR)
        if name.endswith(".pdf")
    ]

    # Join per-file texts with a blank line so documents stay separated
    # before chunking. join() avoids quadratic += accumulation and does not
    # leave a trailing separator.
    financial_text = "\n\n".join(get_pdf_text([path]) for path in pdf_paths)

    # get_pdf_text() returns "" for PDFs with no extractable text; strip the
    # separators too so whitespace-only input doesn't slip past this guard.
    if not financial_text.strip():
        print("No financial documents found. Please upload PDFs.")
        return False

    # Large chunks with generous overlap keep related passages together.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    text_chunks = text_splitter.split_text(financial_text)

    # Embed each chunk with Gemini and build the in-memory FAISS store.
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)

    # Persist in the root folder so the query side can load it without rebuilding.
    vector_store.save_local(FAISS_INDEX_PATH)
    print("✅ FAISS index saved successfully!")

    return True
46
+
47
if __name__ == "__main__":
    # Read the key from the environment; never prompt or hard-code it.
    key = os.getenv("GOOGLE_API_KEY")
    if not key:
        print("❌ Google API Key not found. Please provide a valid key.")
    else:
        preprocess_and_store_embeddings(key)