Spaces:
Sleeping
Sleeping
Commit
·
bcf1f17
1
Parent(s):
93fb443
Update
Browse files- processor.py +24 -10
processor.py
CHANGED
|
@@ -8,16 +8,6 @@ def load_images(uploaded_file):
|
|
| 8 |
return convert_from_path(uploaded_file)
|
| 9 |
else:
|
| 10 |
return [Image.open(uploaded_file)]
|
| 11 |
-
|
| 12 |
-
from processor import (
|
| 13 |
-
load_images,
|
| 14 |
-
analyze_layout,
|
| 15 |
-
extract_text_from_blocks,
|
| 16 |
-
rule_based_kv_extraction
|
| 17 |
-
)
|
| 18 |
-
|
| 19 |
-
def convert_pdf_to_images(pdf_path):
|
| 20 |
-
return convert_from_path(pdf_path)
|
| 21 |
|
| 22 |
def analyze_layout(image):
|
| 23 |
model = lp.EfficientDetLayoutModel(
|
|
@@ -26,3 +16,27 @@ def analyze_layout(image):
|
|
| 26 |
label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}
|
| 27 |
)
|
| 28 |
return model.detect(image)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
return convert_from_path(uploaded_file)
|
| 9 |
else:
|
| 10 |
return [Image.open(uploaded_file)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
def analyze_layout(image):
|
| 13 |
model = lp.EfficientDetLayoutModel(
|
|
|
|
| 16 |
label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}
|
| 17 |
)
|
| 18 |
return model.detect(image)
|
| 19 |
+
|
| 20 |
+
def extract_text_from_blocks(image, layout):
    """Run OCR on every block of ``layout`` and collect the results.

    Each element of ``layout`` is cropped out of ``image`` through its own
    ``crop_image`` method, and the crop is handed to
    ``pytesseract.image_to_string``.

    Args:
        image: The page image passed to each block's ``crop_image``.
        layout: Iterable of layout blocks exposing ``crop_image``, ``type``
            and ``coordinates``.
            NOTE(review): presumably the result of ``analyze_layout`` --
            confirm against the caller.

    Returns:
        A list with one dict per block, in iteration order, holding the keys
        ``"type"``, ``"text"`` (OCR output with surrounding whitespace
        stripped) and ``"coordinates"``.
    """

    def _ocr_region(region):
        # Crop first, then OCR, then read the block's metadata.
        cropped = region.crop_image(image)
        recognized = pytesseract.image_to_string(cropped)
        return {
            "type": region.type,
            "text": recognized.strip(),
            "coordinates": region.coordinates,
        }

    return [_ocr_region(region) for region in layout]
|
| 31 |
+
|
| 32 |
+
def rule_based_kv_extraction(blocks):
    """Map OCR blocks onto invoice fields using keyword rules.

    Each block's ``"text"`` is lower-cased and checked against the keywords
    ``"invoice"``, ``"total"`` and ``"customer"``, in that order. The first
    keyword found decides which field receives the block's original
    (non-lower-cased) text; the remaining keywords are not checked for that
    block. A later block matching the same field overwrites the earlier
    value.

    Args:
        blocks: Iterable of dicts that each carry a ``"text"`` string.

    Returns:
        A dict containing any of the keys ``"Invoice Number"``,
        ``"Total Amount"`` and ``"Customer Name"`` that were matched.
    """
    # Rule order matters: it reproduces the if/elif priority.
    keyword_rules = (
        ("invoice", "Invoice Number"),
        ("total", "Total Amount"),
        ("customer", "Customer Name"),
    )

    extracted = {}
    for entry in blocks:
        raw_text = entry["text"]
        lowered = raw_text.lower()
        for keyword, field_name in keyword_rules:
            if keyword in lowered:
                extracted[field_name] = raw_text
                break  # only the first matching rule applies to a block
    return extracted
|