Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -57,17 +57,25 @@ check_poppler_installed()
|
|
| 57 |
|
| 58 |
def load_docs(document_path):
|
| 59 |
try:
|
| 60 |
-
#
|
| 61 |
-
loader = PyMuPDFLoader(document_path)
|
| 62 |
-
documents = loader.load()
|
| 63 |
|
| 64 |
-
#
|
| 65 |
-
|
|
|
|
| 66 |
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 72 |
chunk_size=1000,
|
| 73 |
chunk_overlap=100,
|
|
@@ -78,7 +86,7 @@ def load_docs(document_path):
|
|
| 78 |
# Debug: Show filtered chunks
|
| 79 |
st.write(f"🔍 Total Chunks After Splitting: {len(split_docs)}")
|
| 80 |
for i, doc in enumerate(split_docs[:5]): # Show first 5 chunks
|
| 81 |
-
st.write(f"Chunk {i + 1}: {doc.page_content[:
|
| 82 |
|
| 83 |
return split_docs
|
| 84 |
except Exception as e:
|
|
@@ -86,6 +94,31 @@ def load_docs(document_path):
|
|
| 86 |
st.stop()
|
| 87 |
|
| 88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
def already_indexed(vectordb, file_name):
|
| 90 |
indexed_sources = set(
|
| 91 |
x["source"] for x in vectordb.get(include=["metadatas"])["metadatas"]
|
|
@@ -236,15 +269,17 @@ if __name__ == "__main__":
|
|
| 236 |
else:
|
| 237 |
st.write("✅ File already downloaded.")
|
| 238 |
|
| 239 |
-
# Generate PDF preview
|
| 240 |
-
st.
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
|
|
|
|
|
|
| 248 |
|
| 249 |
# Load the document into the system
|
| 250 |
st.write("🔄 Loading document into the system...")
|
|
@@ -258,10 +293,6 @@ if __name__ == "__main__":
|
|
| 258 |
st.error(f"Failed to load the document: {e}")
|
| 259 |
st.stop()
|
| 260 |
|
| 261 |
-
# Display the PDF preview if available
|
| 262 |
-
if st.session_state.pdf_preview:
|
| 263 |
-
st.image(st.session_state.pdf_preview, caption="First Page Preview", use_container_width=True)
|
| 264 |
-
|
| 265 |
# Display previous chat messages
|
| 266 |
if st.session_state.messages:
|
| 267 |
for message in st.session_state.messages:
|
|
|
|
| 57 |
|
| 58 |
def load_docs(document_path):
|
| 59 |
try:
|
| 60 |
+
import fitz # PyMuPDF for text extraction
|
|
|
|
|
|
|
| 61 |
|
| 62 |
+
# Step 1: Extract plain text from PDF
|
| 63 |
+
doc = fitz.open(document_path)
|
| 64 |
+
extracted_text = []
|
| 65 |
|
| 66 |
+
for page_num, page in enumerate(doc):
|
| 67 |
+
page_text = page.get_text("text") # Extract text
|
| 68 |
+
clean_page_text = clean_extracted_text(page_text)
|
| 69 |
+
if clean_page_text: # Keep only non-empty cleaned text
|
| 70 |
+
extracted_text.append(clean_page_text)
|
| 71 |
|
| 72 |
+
doc.close()
|
| 73 |
+
|
| 74 |
+
# Step 2: Combine cleaned text
|
| 75 |
+
full_text = "\n".join(extracted_text)
|
| 76 |
+
st.write(f"📄 Total Cleaned Text Length: {len(full_text)} characters")
|
| 77 |
+
|
| 78 |
+
# Step 3: Chunk the cleaned text
|
| 79 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 80 |
chunk_size=1000,
|
| 81 |
chunk_overlap=100,
|
|
|
|
| 86 |
# Debug: Show filtered chunks
|
| 87 |
st.write(f"🔍 Total Chunks After Splitting: {len(split_docs)}")
|
| 88 |
for i, doc in enumerate(split_docs[:5]): # Show first 5 chunks
|
| 89 |
+
st.write(f"Chunk {i + 1}: {doc.page_content[:300]}...")
|
| 90 |
|
| 91 |
return split_docs
|
| 92 |
except Exception as e:
|
|
|
|
| 94 |
st.stop()
|
| 95 |
|
| 96 |
|
| 97 |
+
def clean_extracted_text(text):
    """Strip metadata, headers, and other boilerplate from extracted PDF text.

    A line survives only if it is at least 30 characters long, is not a bare
    (possibly parenthesised) page number, does not start with a known
    patent-metadata prefix, and mentions neither "Examiner" nor "Attorney".
    Surviving lines are stripped of surrounding whitespace and re-joined
    with newlines.
    """
    # Patterns compiled once, outside the per-line loop.
    metadata_prefix = re.compile(
        r"^(U\.S\.|United States|Sheet|Figure|References|Patent No|Date of Patent)"
    )
    page_number = re.compile(r"^\(?\d+\)?$")  # lone numbers, e.g. "12" or "(12)"

    kept = []
    for raw_line in text.split("\n"):
        line = raw_line.strip()

        # A line is noise if ANY filter fires; the union of conditions is
        # identical to the original chained checks, only reordered so the
        # cheapest test (length) runs first.
        is_noise = (
            len(line) < 30  # very short lines carry no real content
            or metadata_prefix.match(line)
            or page_number.match(line)
            or "Examiner" in line
            or "Attorney" in line
        )
        if not is_noise:
            kept.append(line)

    return "\n".join(kept)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
def already_indexed(vectordb, file_name):
|
| 123 |
indexed_sources = set(
|
| 124 |
x["source"] for x in vectordb.get(include=["metadatas"])["metadatas"]
|
|
|
|
| 269 |
else:
|
| 270 |
st.write("✅ File already downloaded.")
|
| 271 |
|
| 272 |
+
# Generate PDF preview only if not already displayed
|
| 273 |
+
if not st.session_state.get("pdf_preview_displayed", False):
|
| 274 |
+
st.write("🖼️ Generating PDF preview...")
|
| 275 |
+
preview_image_path = preview_pdf(pdf_path)
|
| 276 |
+
if preview_image_path:
|
| 277 |
+
st.session_state.pdf_preview = preview_image_path
|
| 278 |
+
st.image(preview_image_path, caption="First Page Preview", use_container_width=True)
|
| 279 |
+
st.session_state["pdf_preview_displayed"] = True
|
| 280 |
+
else:
|
| 281 |
+
st.warning("Failed to generate PDF preview.")
|
| 282 |
+
st.session_state.pdf_preview = None
|
| 283 |
|
| 284 |
# Load the document into the system
|
| 285 |
st.write("🔄 Loading document into the system...")
|
|
|
|
| 293 |
st.error(f"Failed to load the document: {e}")
|
| 294 |
st.stop()
|
| 295 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
# Display previous chat messages
|
| 297 |
if st.session_state.messages:
|
| 298 |
for message in st.session_state.messages:
|