abhivsh committed on
Commit
b0e9414
·
verified ·
1 Parent(s): 687ed30

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -9
app.py CHANGED
@@ -70,25 +70,29 @@ def gen_splits(folder_name):
70
  new_file_paths = [os.path.join(os.getcwd(), folder_name, file) for file in file_paths]
71
 
72
  splits = []
 
 
73
  for file_path in new_file_paths:
74
  if not file_path.lower().endswith(".pdf"):
75
  continue
76
 
77
- # Open document using fitz
78
  doc = fitz.open(file_path)
79
  file_name = os.path.basename(file_path)
80
 
81
  for page_num in range(len(doc)):
82
  page = doc.load_page(page_num)
83
- text = page.get_text("text") # "text" maintains logical flow; "blocks" is better for tables
84
-
85
- # Creating a LangChain Document object for each page
86
- # This replaces the need for RecursiveCharacterTextSplitter
 
 
 
87
  page_doc = Document(
88
  page_content=text,
89
  metadata={
90
  "source": file_name,
91
- "page": page_num + 1, # 1-indexed for user readability
92
  "total_pages": len(doc),
93
  "format": "PDF",
94
  "extraction_method": "PyMuPDF"
@@ -97,18 +101,33 @@ def gen_splits(folder_name):
97
  splits.append(page_doc)
98
 
99
  doc.close()
100
-
 
101
  return splits
102
 
103
  splits = gen_splits(DESTINATION_FOLDER)
104
  embedding_func = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')
105
 
106
  def vectordb_from_splits(splits):
107
- vectordb = Chroma.from_documents(documents=splits, persist_directory=PERSIST_DIR, embedding=embedding_func)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  return vectordb
109
 
110
- vectordb = vectordb_from_splits(splits)
111
 
 
112
 
113
 
114
  # RAG Chain
 
70
  new_file_paths = [os.path.join(os.getcwd(), folder_name, file) for file in file_paths]
71
 
72
  splits = []
73
+ empty_pages = 0
74
+
75
  for file_path in new_file_paths:
76
  if not file_path.lower().endswith(".pdf"):
77
  continue
78
 
 
79
  doc = fitz.open(file_path)
80
  file_name = os.path.basename(file_path)
81
 
82
  for page_num in range(len(doc)):
83
  page = doc.load_page(page_num)
84
+ text = page.get_text("text").strip() # strip whitespace
85
+
86
+ # ── Skip empty/image-only pages ────────────────────────────────
87
+ if not text or len(text) < 20: # ← 20 chars minimum threshold
88
+ empty_pages += 1
89
+ continue
90
+
91
  page_doc = Document(
92
  page_content=text,
93
  metadata={
94
  "source": file_name,
95
+ "page": page_num + 1,
96
  "total_pages": len(doc),
97
  "format": "PDF",
98
  "extraction_method": "PyMuPDF"
 
101
  splits.append(page_doc)
102
 
103
  doc.close()
104
+
105
+ print(f"✓ Loaded {len(splits)} pages | Skipped {empty_pages} empty/image-only pages")
106
  return splits
107
 
108
  splits = gen_splits(DESTINATION_FOLDER)
109
  embedding_func = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')
110
 
111
  def vectordb_from_splits(splits):
112
+ # ── Reuse existing ChromaDB if persist dir already populated ──────────────
113
+ if os.path.exists(PERSIST_DIR) and os.listdir(PERSIST_DIR):
114
+ print("✓ Loading existing ChromaDB from disk — skipping re-embedding.")
115
+ return Chroma(persist_directory=PERSIST_DIR, embedding_function=embedding_func)
116
+
117
+ if not splits:
118
+ raise ValueError("No text content extracted. Check if PDFs are scanned images.")
119
+
120
+ print(f"Building ChromaDB from {len(splits)} chunks...")
121
+ vectordb = Chroma.from_documents(
122
+ documents=splits,
123
+ persist_directory=PERSIST_DIR,
124
+ embedding=embedding_func
125
+ )
126
+ print(f"✓ ChromaDB built successfully.")
127
  return vectordb
128
 
 
129
 
130
+ vectordb = vectordb_from_splits(splits)
131
 
132
 
133
  # RAG Chain