Spaces:
Sleeping
Sleeping
Commit
·
d813a84
1
Parent(s):
4a27fa8
Update
Browse files- processor.py +8 -38
processor.py
CHANGED
|
@@ -2,44 +2,14 @@ import layoutparser as lp
|
|
| 2 |
import pytesseract
|
| 3 |
from pdf2image import convert_from_path
|
| 4 |
from PIL import Image
|
| 5 |
-
import json
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
|
| 10 |
-
label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
|
| 11 |
-
)
|
| 12 |
-
|
| 13 |
-
def load_images(uploaded_file):
|
| 14 |
-
if uploaded_file.name.endswith(".pdf"):
|
| 15 |
-
return convert_from_path(uploaded_file)
|
| 16 |
-
else:
|
| 17 |
-
return [Image.open(uploaded_file)]
|
| 18 |
|
| 19 |
def analyze_layout(image):
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
cropped = block.crop_image(image)
|
| 27 |
-
text = pytesseract.image_to_string(cropped)
|
| 28 |
-
blocks.append({
|
| 29 |
-
"type": block.type,
|
| 30 |
-
"text": text.strip(),
|
| 31 |
-
"coords": block.coordinates
|
| 32 |
-
})
|
| 33 |
-
return blocks
|
| 34 |
-
|
| 35 |
-
def rule_based_kv_extraction(blocks):
|
| 36 |
-
data = {}
|
| 37 |
-
for b in blocks:
|
| 38 |
-
t = b["text"].lower()
|
| 39 |
-
if "invoice" in t and "number" in t:
|
| 40 |
-
data["Invoice Number"] = b["text"]
|
| 41 |
-
elif "total" in t:
|
| 42 |
-
data["Total Amount"] = b["text"]
|
| 43 |
-
elif "customer" in t:
|
| 44 |
-
data["Customer Name"] = b["text"]
|
| 45 |
-
return data
|
|
|
|
| 2 |
import pytesseract
|
| 3 |
from pdf2image import convert_from_path
|
| 4 |
from PIL import Image
|
|
|
|
| 5 |
|
| 6 |
+
def convert_pdf_to_images(pdf_path):
|
| 7 |
+
return convert_from_path(pdf_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def analyze_layout(image):
|
| 10 |
+
model = lp.EfficientDetLayoutModel(
|
| 11 |
+
"lp://efficientdet/PubLayNet",
|
| 12 |
+
extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.6],
|
| 13 |
+
label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}
|
| 14 |
+
)
|
| 15 |
+
return model.detect(image)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|