rameshmoorthy committed on
Commit
8e2a4c2
·
verified ·
1 Parent(s): 4a4313c

Create dar_processor.py

Browse files
Files changed (1) hide show
  1. dar_processor.py +29 -0
dar_processor.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ # Note: The Gemini call logic is now centralized in gemini_utils.py
3
+ # This file is now only for PDF-specific processing.
4
+
5
def preprocess_pdf_text(pdf_path: str) -> str:
    """
    Read the PDF at ``pdf_path`` with pdfplumber and return its text as one string.

    Every page is extracted with ``layout=True`` (x/y tolerance of 2) and is
    introduced by a ``--- PAGE n ---`` banner, numbered from 1. A page that
    yields no text contributes a placeholder line instead of being skipped.

    If anything raises while opening or reading the PDF, the error description
    is printed and returned in place of the extracted text (no exception
    propagates to the caller).
    """
    placeholder = "[No text extracted from this page]"
    try:
        with pdfplumber.open(pdf_path) as document:
            sections = [
                # Falsy extraction results (None or empty string) fall back to the placeholder.
                f"\n--- PAGE {page_number} ---\n"
                f"{page.extract_text(x_tolerance=2, y_tolerance=2, layout=True) or placeholder}"
                for page_number, page in enumerate(document.pages, start=1)
            ]

        return "".join(sections)
    except Exception as exc:
        failure = f"Error processing PDF with pdfplumber: {type(exc).__name__} - {exc}"
        print(failure)
        return failure
26
+
27
+ # We are keeping the get_structured_data_with_gemini call in the main app flow
28
+ # but importing it from gemini_utils to keep API calls together.
29
+ from gemini_utils import get_structured_data_with_gemini