Spaces:

resolverkatla
/

Midterm_Task_5

Sleeping

Midterm_Task_5 / layout_extractor.py

Update

4a27fa8 9 months ago

1.29 kB

	import layoutparser as lp
	import pytesseract
	from pdf2image import convert_from_path
	from PIL import Image

	def convert_pdf_to_images(pdf_path):
	    """Render the PDF at ``pdf_path`` into images using pdf2image.

	    Args:
	        pdf_path: Location of the PDF file; handed unchanged to
	            ``convert_from_path``.

	    Returns:
	        The result of ``convert_from_path`` called with its default
	        settings (per the pdf2image documentation, a list of PIL images,
	        one per page).
	    """
	    page_images = convert_from_path(pdf_path)
	    return page_images

	# Use EfficientDet instead of Detectron2 for better compatibility.
	# The model is created lazily and then reused: building it loads the
	# weights, which is far too expensive to repeat for every page.
	_LAYOUT_MODEL = None

	def _get_layout_model():
	    """Return the shared EfficientDet PubLayNet model, building it on first use."""
	    global _LAYOUT_MODEL
	    if _LAYOUT_MODEL is None:
	        _LAYOUT_MODEL = lp.EfficientDetLayoutModel(
	            "lp://efficientdet/PubLayNet",
	            # EfficientDet expects a dict of options here; the Detectron2-style
	            # ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.6] list is not understood.
	            extra_config={"output_confidence_threshold": 0.6},
	            # layoutparser's EfficientDet PubLayNet weights use 1-based class
	            # ids (unlike the 0-based Detectron2 label map).
	            label_map={1: "Text", 2: "Title", 3: "List", 4: "Table", 5: "Figure"},
	        )
	    return _LAYOUT_MODEL

	def analyze_layout(image):
	    """Detect layout regions in a page image.

	    Runs the EfficientDet PubLayNet model, keeping detections with a
	    confidence of at least 0.6 and labelling them as Text, Title, List,
	    Table or Figure.

	    Args:
	        image: The page image, passed unchanged to the model's ``detect``.

	    Returns:
	        The layout object produced by ``model.detect(image)``.
	    """
	    return _get_layout_model().detect(image)

	def _crop_block(image, block):
	    """Cut the region covered by ``block`` out of ``image``."""
	    if isinstance(image, Image.Image):
	        # Layout blocks crop by array slicing, which PIL images (what
	        # pdf2image produces) do not support, so use PIL's own crop.
	        # Assumes block.coordinates is (x1, y1, x2, y2), as for the
	        # rectangular blocks returned by detection models.
	        left, top, right, bottom = block.coordinates
	        return image.crop((int(left), int(top), int(right), int(bottom)))
	    return block.crop_image(image)

	def extract_text_from_blocks(image, layout):
	    """OCR every detected block of a page image.

	    Args:
	        image: The page image, either a PIL image or an array that the
	            blocks' ``crop_image`` can slice.
	        layout: Iterable of layout blocks exposing ``type``,
	            ``coordinates`` and ``crop_image``.

	    Returns:
	        A list with one dict per block, in layout order, holding the block
	        ``"type"``, the stripped OCR ``"text"`` and its ``"coordinates"``.
	        An empty layout yields an empty list.
	    """
	    return [
	        {
	            "type": block.type,
	            "text": pytesseract.image_to_string(_crop_block(image, block)).strip(),
	            "coordinates": block.coordinates,
	        }
	        for block in layout
	    ]

	def extract_key_values(blocks):
	    """Pick invoice fields out of OCR blocks by keyword.

	    Each block's text is matched case-insensitively against the keywords
	    "invoice", "total" and "customer", in that order; only the first
	    matching keyword counts for a block. The block's full original text
	    is stored under the corresponding field, and a later matching block
	    overwrites an earlier one.

	    Args:
	        blocks: Iterable of dicts that each have a ``"text"`` entry.

	    Returns:
	        A dict with any of the keys ``"Invoice Number"``,
	        ``"Total Amount"`` and ``"Customer Name"``.
	    """
	    keyword_fields = (
	        ("invoice", "Invoice Number"),
	        ("total", "Total Amount"),
	        ("customer", "Customer Name"),
	    )
	    fields = {}
	    for entry in blocks:
	        raw_text = entry["text"]
	        lowered = raw_text.lower()
	        # First keyword found decides the field, mirroring an if/elif chain.
	        field_name = next(
	            (name for keyword, name in keyword_fields if keyword in lowered),
	            None,
	        )
	        if field_name is not None:
	            fields[field_name] = raw_text
	    return fields