Spaces:
Running
Running
File size: 755 Bytes
f1bab1c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
# pdf_extractor.py
import fitz # PyMuPDF
def extract_text_pdf_raw(pdf_path):
    """Return the plain text of every page of a PDF as one string.

    Args:
        pdf_path: Location of the PDF, passed straight to ``fitz.open``.

    Returns:
        The ``page.get_text()`` output of each page, in page order,
        joined with a single newline between pages.
    """
    # Open the document in a ``with`` block so its file handle is released
    # even if text extraction raises; ``join`` fully consumes the pages
    # before the block exits.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)
def extract_label_value_pairs(pdf_path):
    """Collect ``label: value`` and ``label - value`` pairs from a PDF.

    The raw text from ``extract_text_pdf_raw`` is scanned line by line.
    A line is split once, on its first ``:`` if it contains one, otherwise
    on its first ``-``. Lines containing neither separator are ignored.

    Args:
        pdf_path: Location of the PDF, forwarded to ``extract_text_pdf_raw``.

    Returns:
        A dict mapping each lower-cased, whitespace-stripped label to its
        whitespace-stripped value. Pairs with an empty label or an empty
        value are skipped; when a label occurs more than once, the last
        occurrence wins.
    """
    raw_text = extract_text_pdf_raw(pdf_path)
    label_value_map = {}
    for line in raw_text.split('\n'):
        line = line.strip()
        # A colon takes precedence: a line that contains ':' is never
        # split on '-', even when its value part turns out to be empty.
        if ':' in line:
            separator = ':'
        elif '-' in line:
            separator = '-'
        else:
            continue
        label, _, value = line.partition(separator)
        label = label.strip().lower()
        value = value.strip()
        # Apply the same filter to both separators: bullet-style lines
        # ("- item") and dangling labels ("name -") carry no usable pair.
        if label and value:
            label_value_map[label] = value
    return label_value_map