Spaces:
Sleeping
Sleeping
| import layoutparser as lp | |
| import pytesseract | |
| from pdf2image import convert_from_path | |
| from PIL import Image | |
def load_images(uploaded_file):
    """Load an uploaded document as a list of images.

    A file whose name ends in ``.pdf`` (in any letter case) is rasterised
    with ``convert_from_path``; anything else is opened as a single image
    with ``Image.open`` and wrapped in a one-element list.

    Args:
        uploaded_file: Object exposing a ``name`` attribute that holds the
            file name. It is also handed directly to ``convert_from_path``
            or ``Image.open``.

    Returns:
        A list of images: whatever ``convert_from_path`` returns for a PDF,
        otherwise a single-element list.
    """
    # Compare case-insensitively so "SCAN.PDF" is not mistaken for an image.
    if uploaded_file.name.lower().endswith(".pdf"):
        # NOTE(review): convert_from_path expects a filesystem path; this
        # only works when uploaded_file is itself path-like (e.g. a str
        # subclass carrying .name) -- confirm against the caller.
        return convert_from_path(uploaded_file)
    return [Image.open(uploaded_file)]
# Lazily created layout detector, shared by every analyze_layout() call.
_LAYOUT_MODEL = None


def _get_layout_model():
    """Return the shared EfficientDet PubLayNet detector, building it once."""
    global _LAYOUT_MODEL
    if _LAYOUT_MODEL is None:
        _LAYOUT_MODEL = lp.EfficientDetLayoutModel(
            "lp://efficientdet/PubLayNet",
            # EfficientDet takes its options as a dict; the list form
            # ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", ...] is Detectron2 syntax.
            extra_config={"output_confidence_threshold": 0.6},
            # NOTE(review): layoutparser's EfficientDet PubLayNet catalog may
            # be 1-based ({1: "Text", ..., 5: "Figure"}) -- confirm that this
            # 0-based map matches the class ids the model emits.
            label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
        )
    return _LAYOUT_MODEL


def analyze_layout(image):
    """Detect layout regions on a page image.

    The detector is constructed on first use and reused afterwards, so the
    model is not rebuilt for every page. Detections are filtered with a
    confidence threshold of 0.6.

    Args:
        image: Page image passed unchanged to the model's ``detect`` method.

    Returns:
        The result of ``model.detect(image)``.
    """
    return _get_layout_model().detect(image)
def extract_text_from_blocks(image, layout):
    """Run OCR on every detected layout block of a page.

    Args:
        image: Page image; converted to a NumPy array before cropping.
        layout: Iterable of layout blocks exposing ``crop_image``, ``type``
            and ``coordinates``.

    Returns:
        A list with one dict per block, in layout order, with keys
        ``"type"``, ``"text"`` (OCR output with surrounding whitespace
        stripped) and ``"coordinates"``.
    """
    # Local import keeps this block self-contained.
    import numpy as np

    # crop_image is handed an array rather than the raw image object so that
    # PIL images can be cropped; asarray is a no-op for ndarray input.
    pixels = np.asarray(image)

    blocks = []
    for block in layout:
        segment_image = block.crop_image(pixels)
        text = pytesseract.image_to_string(segment_image)
        blocks.append({
            "type": block.type,
            "text": text.strip(),
            "coordinates": block.coordinates,
        })
    return blocks
def rule_based_kv_extraction(blocks):
    """Pick invoice fields out of OCR blocks using keyword matching.

    Each block's text is lower-cased and checked against the keywords
    ``invoice``, ``total`` and ``customer``, in that order. Only the first
    keyword found in a block is used. The block's original text is stored
    under the matching field name, so a later block replaces an earlier
    one for the same field.

    Args:
        blocks: Iterable of dicts, each with a ``"text"`` entry.

    Returns:
        A dict with any of the keys ``"Invoice Number"``, ``"Total Amount"``
        and ``"Customer Name"`` mapped to the matching block's text.
    """
    # Order matters: an earlier keyword takes precedence within one block.
    keyword_fields = (
        ("invoice", "Invoice Number"),
        ("total", "Total Amount"),
        ("customer", "Customer Name"),
    )

    extracted = {}
    for entry in blocks:
        original_text = entry["text"]
        lowered = original_text.lower()
        field = next(
            (name for keyword, name in keyword_fields if keyword in lowered),
            None,
        )
        if field is not None:
            extracted[field] = original_text
    return extracted