File size: 755 Bytes
f1bab1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# pdf_extractor.py
import fitz  # PyMuPDF

def extract_text_pdf_raw(pdf_path):
    """Return the text of every page in the PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output, in page
        order, separated by ``"\\n"``.
    """
    # Open the document as a context manager so the underlying file handle
    # is released even if text extraction raises on some page.
    with fitz.open(pdf_path) as doc:
        # The join consumes the generator while the document is still open.
        return "\n".join(page.get_text() for page in doc)

def extract_label_value_pairs(pdf_path):
    """Build a label -> value mapping from "label: value" style lines of a PDF.

    The raw text returned by ``extract_text_pdf_raw`` is scanned line by line.
    A line is split on its first ``:`` if it has one, otherwise on its first
    ``-``; lines containing neither separator are ignored.

    Args:
        pdf_path: Path of the PDF file, forwarded to ``extract_text_pdf_raw``.

    Returns:
        A dict whose keys are the stripped, lower-cased labels and whose
        values are the stripped text after the separator. Lines whose value
        is empty are skipped. When a label occurs more than once, the last
        occurrence wins.
    """
    raw_text = extract_text_pdf_raw(pdf_path)
    label_value_map = {}

    for raw_line in raw_text.split('\n'):
        line = raw_line.strip()

        # A colon takes precedence over a dash: a line that contains a colon
        # is never re-split on '-', even when its value turns out empty.
        if ':' in line:
            separator = ':'
        elif '-' in line:
            separator = '-'
        else:
            continue

        # partition splits on the first occurrence only, so any further
        # separators stay inside the value.
        label, _, value = line.partition(separator)
        value = value.strip()

        # Skip empty values for both separators so that a line such as
        # "Name -" cannot store "" or overwrite an earlier real value.
        if value:
            label_value_map[label.strip().lower()] = value

    return label_value_map