Spaces:

rameshmoorthy
/

DAR_standardiser

Sleeping

rameshmoorthy committed on Jun 20, 2025

Commit

8e2a4c2

verified ·

1 Parent(s): 4a4313c

Create dar_processor.py

Files changed (1) hide show

dar_processor.py ADDED Viewed

+import pdfplumber
+# Note: The Gemini call logic is now centralized in gemini_utils.py
+# This file is now only for PDF-specific processing.
+def preprocess_pdf_text(pdf_path: str) -> str:
+    """
+    Extracts all text from all pages of the PDF using pdfplumber,
+    attempting to preserve layout for better LLM understanding.
+    """
+    processed_text_parts = []
+    try:
+        with pdfplumber.open(pdf_path) as pdf:
+            for i, page in enumerate(pdf.pages):
+                page_text = page.extract_text(x_tolerance=2, y_tolerance=2, layout=True)
+                if page_text:
+                    processed_text_parts.append(f"\n--- PAGE {i + 1} ---\n{page_text}")
+                else:
+                    processed_text_parts.append(f"\n--- PAGE {i + 1} ---\n[No text extracted from this page]")
+        full_text = "".join(processed_text_parts)
+        return full_text
+    except Exception as e:
+        error_msg = f"Error processing PDF with pdfplumber: {type(e).__name__} - {e}"
+        print(error_msg)
+        return error_msg
+# The main app flow still calls get_structured_data_with_gemini; it is imported
+# here from gemini_utils so that all API-call logic stays in one module.
+from gemini_utils import get_structured_data_with_gemini