Spaces:
Sleeping
Sleeping
File size: 2,122 Bytes
554ef3f 6f8fe5a 554ef3f 6f8fe5a 554ef3f a652976 6f8fe5a a652976 6f8fe5a c998c72 554ef3f c998c72 554ef3f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
import cv2
import pytesseract
import re
from PIL import Image
from ultralytics import YOLO
# Weights file of the trained YOLO invoice-field detector.
MODEL_PATH = "yolov8m_invoiceOCR.pt"

# Field labels, listed in YOLO class order: the order matters, so do not
# reorder or insert entries.
class_names = [
    "Discount_Percentage",
    "Due_Date",
    "Email_Client",
    "Name_Client",
    "Products",
    "Remise",
    "Subtotal",
    "Tax",
    "Tax_Precentage",
    "Tel_Client",
    "billing address",
    "header",
    "invoice date",
    "invoice number",
    "shipping address",
    "total",
]

# The YOLOv8 detector is instantiated once, when this module is imported.
model = YOLO(MODEL_PATH)
def initialize_data_dict():
    """Build an empty result record with one entry per name in ``class_names``.

    The ``"Products"`` entry starts as an empty list; every other entry
    starts as an empty string.
    """
    blank_record = {}
    for field_name in class_names:
        if field_name == "Products":
            blank_record[field_name] = []
        else:
            blank_record[field_name] = ""
    return blank_record
# One product row as it appears in OCR text: integer quantity, free-text
# description, unit price and line amount. Both prices need exactly two
# decimals and may contain comma separators.
_PRODUCT_LINE_RE = re.compile(
    r"(\d+)\s+(.*)\s+([\d,]+\.\d{2})\s+([\d,]+\.\d{2})"
)


def parse_products(raw_text: str) -> list:
    """Turn the OCR text of a products table into a list of row dicts.

    Each non-blank line yields one dict with the keys ``qty``,
    ``description``, ``unit_price`` and ``amount``.

    * A line matching ``<qty> <description> <unit price> <amount>`` is
      split into those fields; the captured values are kept as strings
      exactly as they were read (e.g. ``"1,200.50"``).
    * Any other non-blank line is kept as a description-only row whose
      ``qty``, ``unit_price`` and ``amount`` are the integer ``0``.

    Blank lines are skipped. Returns an empty list for empty input.
    """
    structured = []
    for raw_line in raw_text.split('\n'):
        # Strip before matching: the pattern is anchored at the start of
        # the string, so an indented row would otherwise never match and
        # would be misfiled as a description-only row.
        line = raw_line.strip()
        if not line:
            continue

        match = _PRODUCT_LINE_RE.match(line)
        if match:
            qty, desc, unit_price, amount = match.groups()
            structured.append({
                "qty": qty,
                "description": desc.strip(),
                "unit_price": unit_price,
                "amount": amount,
            })
        else:
            structured.append({
                "qty": 0,
                "description": line,
                "unit_price": 0,
                "amount": 0,
            })
    return structured
def extract_invoice_data_from_image(image_path: str) -> dict:
    """Detect invoice fields in an image and OCR each detected region.

    The image is run through the module-level YOLO ``model``; every
    detected box is cropped from the image and passed to Tesseract
    (``--psm 6``). Text from ``"Products"`` boxes is parsed with
    ``parse_products`` and appended to the ``"Products"`` list. For every
    other label the OCR text is stored as a string; when the same label is
    detected more than once, the non-empty text from the detection with
    the highest confidence is kept.

    Args:
        image_path: Path of the invoice image on disk.

    Returns:
        The dict produced by ``initialize_data_dict()`` filled in with the
        extracted values; labels with no usable detection keep their
        initial empty value.

    Raises:
        ValueError: If OpenCV cannot read or decode ``image_path``.
    """
    image_bgr = cv2.imread(image_path)
    if image_bgr is None:
        # cv2.imread reports a missing or undecodable file by returning
        # None; fail here with a clear message instead of inside cvtColor.
        raise ValueError(f"Could not read image: {image_path!r}")

    # OpenCV loads BGR; PIL expects RGB.
    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
    pil_img = Image.fromarray(image_rgb)

    results = model(image_path)[0]
    data = initialize_data_dict()
    # Confidence of the detection currently stored for each scalar label.
    best_conf = {}

    for box in results.boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        label = class_names[int(box.cls[0])]

        cropped_img = pil_img.crop((x1, y1, x2, y2))
        extracted_text = pytesseract.image_to_string(cropped_img, config='--psm 6').strip()
        if not extracted_text:
            continue

        if label == "Products":
            data["Products"].extend(parse_products(extracted_text))
            continue

        # Do not let a weaker duplicate detection overwrite a stronger one.
        confidence = float(box.conf[0])
        if label not in best_conf or confidence > best_conf[label]:
            data[label] = extracted_text
            best_conf[label] = confidence

    return data
|