import re

import cv2
import pytesseract
from PIL import Image
from ultralytics import YOLO

# Path to your trained YOLO model
MODEL_PATH = "yolov8m_invoiceOCR.pt"

# YOLO class names (order matters): a detection's class id is used directly
# as an index into this list.
class_names = [
    "Discount_Percentage",
    "Due_Date",
    "Email_Client",
    "Name_Client",
    "Products",
    "Remise",
    "Subtotal",
    "Tax",
    "Tax_Precentage",
    "Tel_Client",
    "billing address",
    "header",
    "invoice date",
    "invoice number",
    "shipping address",
    "total",
]

# Load YOLOv8 model once at import time so every call reuses it.
model = YOLO(MODEL_PATH)

# One product row: "<qty> <description> <unit price> <amount>", where both
# prices carry exactly two decimals and may contain thousands separators.
_PRODUCT_LINE_RE = re.compile(
    r"(\d+)\s+(.*)\s+([\d,]+\.\d{2})\s+([\d,]+\.\d{2})"
)


def initialize_data_dict():
    """Return an empty result dict with one entry per name in ``class_names``.

    The "Products" entry starts as an empty list (it accumulates parsed
    rows); every other entry starts as an empty string.
    """
    return {label: [] if label == "Products" else "" for label in class_names}


def parse_products(raw_text):
    """Parse OCR text of a products region into a list of row dicts.

    Each non-blank line yields one dict with the keys "qty", "description",
    "unit_price" and "amount".

    * Lines matching the product-row pattern keep the matched substrings,
      so all four values are strings.
    * Any other non-blank line is kept as a description-only row whose
      "qty", "unit_price" and "amount" are the integer 0.

    Blank lines are dropped.
    """
    structured = []
    for line in raw_text.split('\n'):
        match = _PRODUCT_LINE_RE.match(line)
        if match:
            qty, desc, unit_price, amount = match.groups()
            structured.append({
                "qty": qty,
                "description": desc.strip(),
                "unit_price": unit_price,
                "amount": amount,
            })
        elif line.strip():
            # Unparseable line: keep its text instead of discarding it.
            structured.append({
                "qty": 0,
                "description": line.strip(),
                "unit_price": 0,
                "amount": 0,
            })
    return structured


def extract_invoice_data_from_image(image_path: str):
    """Detect invoice fields in an image and OCR each detected region.

    The module-level YOLO model is run on ``image_path``; every detected box
    is cropped from the image and passed to Tesseract. Text from "Products"
    boxes is parsed with ``parse_products`` and appended to the "Products"
    list; text from any other box is stored under that box's label.

    Args:
        image_path: Path of the invoice image to process.

    Returns:
        A dict keyed by the names in ``class_names``. "Products" holds a
        list of row dicts; every other key holds the OCR text of its region,
        or "" when nothing was detected or recognised.

    Raises:
        ValueError: If the image cannot be read from ``image_path``.
    """
    image_bgr = cv2.imread(image_path)
    if image_bgr is None:
        # cv2.imread signals failure by returning None rather than raising;
        # fail here with the offending path instead of inside cvtColor.
        raise ValueError(f"Could not read image: {image_path!r}")

    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
    pil_img = Image.fromarray(image_rgb)

    results = model(image_path)[0]
    data = initialize_data_dict()

    for box in results.boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        if x2 <= x1 or y2 <= y1:
            # Truncating to int can collapse a tiny box to an empty region,
            # which has no pixels to OCR.
            continue

        cls_id = int(box.cls[0])
        label = class_names[cls_id]

        cropped_img = pil_img.crop((x1, y1, x2, y2))
        extracted_text = pytesseract.image_to_string(
            cropped_img, config='--psm 6'
        ).strip()

        if not extracted_text:
            continue

        if label == "Products":
            data["Products"].extend(parse_products(extracted_text))
        else:
            # When several boxes share a label, the one processed last wins.
            data[label] = extracted_text

    return data