from collections import Counter

import cv2
import numpy as np
import pandas as pd
import pytesseract
from PIL import Image, ImageFile

# Let Pillow decode image files whose data stream is cut short instead of
# raising while loading them.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Ruling lines are detected with a kernel 1/15th of the image width/height.
_LINE_SCALE = 15
# Contours narrower or shorter than this (in pixels) are treated as noise.
_MIN_CELL_SIZE = 20
# Cells whose top edges differ by at most this many pixels share a row.
_ROW_TOLERANCE = 15


def load_local_image(path: str) -> np.ndarray:
    """Read an image file from disk and return it as a BGR array for OpenCV.

    Args:
        path: Location of the image file.

    Returns:
        The image converted to 3-channel RGB by Pillow and then reordered to
        BGR with ``cv2.cvtColor``.
    """
    # The context manager closes the underlying file as soon as the pixel
    # data has been copied into the array.
    with Image.open(path) as img:
        rgb = np.array(img.convert("RGB"))
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)


def sort_contours(cnts, method="top-to-bottom"):
    """Order contours by the position of their bounding rectangles.

    Args:
        cnts: Sequence of contours accepted by ``cv2.boundingRect``.
        method: ``"top-to-bottom"`` or ``"bottom-to-top"`` sort on the y
            coordinate; any other value sorts on x. ``"right-to-left"`` and
            ``"bottom-to-top"`` sort in descending order.

    Returns:
        A ``(contours, bounding_boxes)`` pair of tuples in sorted order. Both
        tuples are empty when ``cnts`` is empty.
    """
    if len(cnts) == 0:
        # zip(*...) of an empty sequence cannot be unpacked into two names.
        return (), ()

    axis = 1 if method in ("top-to-bottom", "bottom-to-top") else 0
    reverse = method in ("right-to-left", "bottom-to-top")

    bounding_boxes = [cv2.boundingRect(c) for c in cnts]
    ordered = sorted(
        zip(cnts, bounding_boxes),
        key=lambda pair: pair[1][axis],
        reverse=reverse,
    )
    cnts, bounding_boxes = zip(*ordered)
    return cnts, bounding_boxes


def _grid_line_mask(image: np.ndarray) -> np.ndarray:
    """Return a binary mask of the long horizontal and vertical lines in *image*."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Invert before thresholding so that dark ink becomes the foreground.
    _, binary = cv2.threshold(
        ~gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
    )
    height, width = binary.shape[:2]

    # Erode then dilate with a long thin kernel: only strokes at least as long
    # as the kernel survive. The length is clamped to 1 because a zero-sized
    # structuring element is invalid for images smaller than _LINE_SCALE px.
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (max(1, width // _LINE_SCALE), 1)
    )
    horizontal = cv2.dilate(
        cv2.erode(binary, horizontal_kernel), horizontal_kernel
    )

    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, max(1, height // _LINE_SCALE))
    )
    vertical = cv2.dilate(cv2.erode(binary, vertical_kernel), vertical_kernel)

    # cv2.add saturates at 255, whereas uint8 "+" would wrap around.
    return cv2.add(horizontal, vertical)


def _group_cells_into_rows(cells, tolerance=_ROW_TOLERANCE):
    """Group ``(x, y, w, h)`` boxes into rows, each row sorted left to right."""
    ordered = sorted(cells, key=lambda box: (box[1], box[0]))

    rows = []
    current_row = []
    last_y = None
    for cell in ordered:
        y = cell[1]
        # A jump in y larger than the tolerance starts a new row.
        if last_y is not None and abs(y - last_y) > tolerance:
            rows.append(sorted(current_row, key=lambda box: box[0]))
            current_row = []
        current_row.append(cell)
        last_y = y
    if current_row:
        rows.append(sorted(current_row, key=lambda box: box[0]))
    return rows


def extract_cells_from_grid(table_img: np.ndarray) -> pd.DataFrame:
    """OCR every cell of a ruled table image into a DataFrame.

    Cells are located as contours of the table's line mask, grouped into rows
    by the y coordinate of their bounding boxes, and read with Tesseract in
    single-line mode (``--psm 7``). Each row is padded with empty strings or
    truncated so that it has the most common column count.

    Args:
        table_img: BGR image cropped to a single table.

    Returns:
        A DataFrame with one row per detected table row, or an empty
        DataFrame when no cell is found.
    """
    mask = _grid_line_mask(table_img)
    contours, hierarchy = cv2.findContours(
        mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )
    if hierarchy is None:  # no contours at all
        return pd.DataFrame()

    cells = []
    # Each hierarchy entry is [next, previous, first_child, parent].
    for contour, (_, _, first_child, parent) in zip(contours, hierarchy[0]):
        # A top-level contour that encloses other contours is the outer frame
        # of the grid, not a cell. Keeping it would add a table-sized "cell"
        # to the first row and shift every real cell by one column.
        if parent == -1 and first_child != -1:
            continue

        x, y, w, h = cv2.boundingRect(contour)
        if w < _MIN_CELL_SIZE or h < _MIN_CELL_SIZE:
            continue  # line fragments, dash artifacts, other noise

        # Skip regions in which fewer than 5% of the pixels are non-zero,
        # i.e. boxes that are almost entirely pure black.
        roi_gray = cv2.cvtColor(
            table_img[y:y + h, x:x + w], cv2.COLOR_BGR2GRAY
        )
        nonzero_ratio = cv2.countNonZero(roi_gray) / (w * h + 1e-5)
        if nonzero_ratio < 0.05:
            continue

        cells.append((x, y, w, h))

    if not cells:
        return pd.DataFrame()

    rows = _group_cells_into_rows(cells)
    # The modal row length is taken as the table's column count.
    most_common_cols = Counter(len(row) for row in rows).most_common(1)[0][0]

    table_data = []
    for row in rows:
        # Cells beyond the column count would be discarded anyway, so they
        # are dropped before the (expensive) OCR call.
        row_data = [
            pytesseract.image_to_string(
                table_img[y:y + h, x:x + w], config="--psm 7"
            ).strip()
            for x, y, w, h in row[:most_common_cols]
        ]
        row_data += [""] * (most_common_cols - len(row_data))
        table_data.append(row_data)

    return pd.DataFrame(table_data)


def detect_table_boxes(image: np.ndarray) -> list[tuple[int, int, int, int]]:
    """Locate ruled regions of a page image that look like tables.

    Args:
        image: BGR page image.

    Returns:
        ``(x, y, w, h)`` bounding boxes of the outermost line structures that
        are wider than 100 px and taller than 50 px, ordered top to bottom
        and then left to right.
    """
    mask = _grid_line_mask(image)
    contours, _ = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    boxes = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w > 100 and h > 50:
            boxes.append((x, y, w, h))

    # Sort explicitly so that callers number tables in reading order rather
    # than in whatever order findContours happened to return them.
    boxes.sort(key=lambda box: (box[1], box[0]))
    return boxes


def extract_non_table_text(image: np.ndarray, table_boxes: list[tuple]) -> str:
    """OCR the page with every table region blanked out.

    Args:
        image: BGR page image.
        table_boxes: ``(x, y, w, h)`` regions to exclude from OCR.

    Returns:
        The raw Tesseract output (``--oem 3 --psm 6``) for the rest of the
        page.
    """
    table_mask = np.zeros(image.shape[:2], dtype=np.uint8)
    for x, y, w, h in table_boxes:
        cv2.rectangle(table_mask, (x, y), (x + w, y + h), 255, -1)

    # Pixels inside a table box are zeroed; everything else is kept.
    keep_mask = cv2.bitwise_not(table_mask)
    non_table_img = cv2.bitwise_and(image, image, mask=keep_mask)
    gray = cv2.cvtColor(non_table_img, cv2.COLOR_BGR2GRAY)

    custom_config = r'--oem 3 --psm 6'
    return pytesseract.image_to_string(gray, config=custom_config)


def dataframe_to_markdown(df: pd.DataFrame) -> str:
    """Render *df* as a Markdown table without the index column."""
    return df.to_markdown(index=False)


def extract_image(filepath: str) -> str:
    """Extract the text and tables of an image file as a Markdown document.

    Args:
        filepath: Path of the image to process.

    Returns:
        A string holding an optional ``### Non-Table Text:`` section followed
        by one ``### Table N (Markdown):`` section per table that yielded at
        least one cell. Surrounding whitespace is stripped.
    """
    image = load_local_image(filepath)
    table_boxes = detect_table_boxes(image)

    tables = []
    for i, (x, y, w, h) in enumerate(table_boxes):
        cropped = image[y:y + h, x:x + w]
        try:
            df = extract_cells_from_grid(cropped)
        except Exception as e:
            # Best effort: one unreadable table must not abort the whole page.
            print(f"[Warning] Skipping table {i} due to error: {e}")
            continue
        if df.empty:
            # Nothing was recognised; do not emit a heading with no table.
            continue
        tables.append((df, (x, y, w, h)))

    non_table_text = extract_non_table_text(image, table_boxes).strip()

    sections = []
    if non_table_text:
        sections.append(f"### Non-Table Text:\n{non_table_text}\n\n")
    for i, (df, _) in enumerate(tables):
        sections.append(
            f"### Table {i+1} (Markdown):\n{dataframe_to_markdown(df)}\n\n"
        )
    return "".join(sections).strip()