Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -70,25 +70,29 @@ def gen_splits(folder_name):
|
|
| 70 |
new_file_paths = [os.path.join(os.getcwd(), folder_name, file) for file in file_paths]
|
| 71 |
|
| 72 |
splits = []
|
|
|
|
|
|
|
| 73 |
for file_path in new_file_paths:
|
| 74 |
if not file_path.lower().endswith(".pdf"):
|
| 75 |
continue
|
| 76 |
|
| 77 |
-
# Open document using fitz
|
| 78 |
doc = fitz.open(file_path)
|
| 79 |
file_name = os.path.basename(file_path)
|
| 80 |
|
| 81 |
for page_num in range(len(doc)):
|
| 82 |
page = doc.load_page(page_num)
|
| 83 |
-
text = page.get_text("text")
|
| 84 |
-
|
| 85 |
-
#
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
| 87 |
page_doc = Document(
|
| 88 |
page_content=text,
|
| 89 |
metadata={
|
| 90 |
"source": file_name,
|
| 91 |
-
"page": page_num + 1,
|
| 92 |
"total_pages": len(doc),
|
| 93 |
"format": "PDF",
|
| 94 |
"extraction_method": "PyMuPDF"
|
|
@@ -97,18 +101,33 @@ def gen_splits(folder_name):
|
|
| 97 |
splits.append(page_doc)
|
| 98 |
|
| 99 |
doc.close()
|
| 100 |
-
|
|
|
|
| 101 |
return splits
|
| 102 |
|
| 103 |
splits = gen_splits(DESTINATION_FOLDER)
|
| 104 |
embedding_func = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')
|
| 105 |
|
| 106 |
def vectordb_from_splits(splits):
|
| 107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
return vectordb
|
| 109 |
|
| 110 |
-
vectordb = vectordb_from_splits(splits)
|
| 111 |
|
|
|
|
| 112 |
|
| 113 |
|
| 114 |
# RAG Chain
|
|
|
|
| 70 |
new_file_paths = [os.path.join(os.getcwd(), folder_name, file) for file in file_paths]
|
| 71 |
|
| 72 |
splits = []
|
| 73 |
+
empty_pages = 0
|
| 74 |
+
|
| 75 |
for file_path in new_file_paths:
|
| 76 |
if not file_path.lower().endswith(".pdf"):
|
| 77 |
continue
|
| 78 |
|
|
|
|
| 79 |
doc = fitz.open(file_path)
|
| 80 |
file_name = os.path.basename(file_path)
|
| 81 |
|
| 82 |
for page_num in range(len(doc)):
|
| 83 |
page = doc.load_page(page_num)
|
| 84 |
+
text = page.get_text("text").strip() # ← strip whitespace
|
| 85 |
+
|
| 86 |
+
# ── Skip empty/image-only pages ────────────────────────────────
|
| 87 |
+
if not text or len(text) < 20: # ← 20 chars minimum threshold
|
| 88 |
+
empty_pages += 1
|
| 89 |
+
continue
|
| 90 |
+
|
| 91 |
page_doc = Document(
|
| 92 |
page_content=text,
|
| 93 |
metadata={
|
| 94 |
"source": file_name,
|
| 95 |
+
"page": page_num + 1,
|
| 96 |
"total_pages": len(doc),
|
| 97 |
"format": "PDF",
|
| 98 |
"extraction_method": "PyMuPDF"
|
|
|
|
| 101 |
splits.append(page_doc)
|
| 102 |
|
| 103 |
doc.close()
|
| 104 |
+
|
| 105 |
+
print(f"✓ Loaded {len(splits)} pages | Skipped {empty_pages} empty/image-only pages")
|
| 106 |
return splits
|
| 107 |
|
| 108 |
splits = gen_splits(DESTINATION_FOLDER)
|
| 109 |
embedding_func = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')
|
| 110 |
|
| 111 |
def vectordb_from_splits(splits):
|
| 112 |
+
# ── Reuse existing ChromaDB if persist dir already populated ──────────────
|
| 113 |
+
if os.path.exists(PERSIST_DIR) and os.listdir(PERSIST_DIR):
|
| 114 |
+
print("✓ Loading existing ChromaDB from disk — skipping re-embedding.")
|
| 115 |
+
return Chroma(persist_directory=PERSIST_DIR, embedding_function=embedding_func)
|
| 116 |
+
|
| 117 |
+
if not splits:
|
| 118 |
+
raise ValueError("No text content extracted. Check if PDFs are scanned images.")
|
| 119 |
+
|
| 120 |
+
print(f"Building ChromaDB from {len(splits)} chunks...")
|
| 121 |
+
vectordb = Chroma.from_documents(
|
| 122 |
+
documents=splits,
|
| 123 |
+
persist_directory=PERSIST_DIR,
|
| 124 |
+
embedding=embedding_func
|
| 125 |
+
)
|
| 126 |
+
print(f"✓ ChromaDB built successfully.")
|
| 127 |
return vectordb
|
| 128 |
|
|
|
|
| 129 |
|
| 130 |
+
vectordb = vectordb_from_splits(splits)
|
| 131 |
|
| 132 |
|
| 133 |
# RAG Chain
|