# Spaces: Rahul-Samedavar / ShastraDocs2 (Sleeping)
# File size: 6,346 Bytes
import cv2
import pytesseract
import numpy as np
import pandas as pd
from PIL import Image, ImageFile

# Pillow setting: decode image files whose data is truncated instead of
# raising an error. This is a process-wide flag on PIL's ImageFile module,
# so it affects every image loaded through PIL after this module is imported.
ImageFile.LOAD_TRUNCATED_IMAGES = True

def load_local_image(path: str) -> np.ndarray:
    """Read an image file from disk and return it as an OpenCV-style BGR array.

    Args:
        path: Filesystem path of the image to load.

    Returns:
        The image converted to 3-channel RGB by Pillow and then reordered to
        BGR, as a ``numpy`` array.

    Raises:
        OSError: Propagated from ``PIL.Image.open`` when the file cannot be
            opened or is not recognised as an image.
    """
    # Pillow opens files lazily; the context manager guarantees the underlying
    # file handle is released even for multi-frame formats or when the
    # conversion raises.
    with Image.open(path) as img:
        rgb = np.array(img.convert("RGB"))
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

def sort_contours(cnts, method="top-to-bottom"):
    """Sort contours by the position of their bounding boxes.

    Args:
        cnts: Iterable of contours accepted by ``cv2.boundingRect``.
        method: One of ``"left-to-right"``, ``"right-to-left"``,
            ``"top-to-bottom"`` or ``"bottom-to-top"``. Any other value is
            treated like ``"left-to-right"``.

    Returns:
        A ``(contours, bounding_boxes)`` pair of tuples in sorted order.
        Both tuples are empty when ``cnts`` contains no contours.
    """
    # Materialise once so one-shot iterables work and emptiness can be tested.
    contours = list(cnts)
    if not contours:
        # zip(*[]) yields nothing to unpack, so handle the empty case up front.
        return (), ()

    # Bounding boxes are (x, y, w, h): index 1 sorts on y, index 0 on x.
    axis = 1 if method in ("top-to-bottom", "bottom-to-top") else 0
    descending = method in ("right-to-left", "bottom-to-top")

    bounding_boxes = [cv2.boundingRect(c) for c in contours]
    ordered = sorted(
        zip(contours, bounding_boxes),
        key=lambda pair: pair[1][axis],
        reverse=descending,
    )
    sorted_contours, sorted_boxes = zip(*ordered)
    return sorted_contours, sorted_boxes

from collections import Counter

def _grid_line_mask(table_img: np.ndarray) -> np.ndarray:
    """Return a binary mask holding only long horizontal and vertical strokes."""
    gray = cv2.cvtColor(table_img, cv2.COLOR_BGR2GRAY)
    # Invert so dark ink becomes foreground. Because THRESH_OTSU is set, the
    # threshold is chosen automatically and the literal 128 is not used.
    _, binary = cv2.threshold(~gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    img_height, img_width = binary.shape[:2]
    # Kernel length is 1/15 of the image dimension; clamp to 1 because OpenCV
    # rejects a zero-sized structuring element (images smaller than 15 px).
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (max(1, img_width // 15), 1)
    )
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, max(1, img_height // 15))
    )

    # Erode-then-dilate keeps only strokes at least as long as the kernel,
    # which removes text while preserving the table rulings.
    horizontal = cv2.dilate(cv2.erode(binary, horizontal_kernel), horizontal_kernel)
    vertical = cv2.dilate(cv2.erode(binary, vertical_kernel), vertical_kernel)
    return cv2.add(horizontal, vertical)


def _find_cell_boxes(
    table_img: np.ndarray, mask: np.ndarray
) -> list[tuple[int, int, int, int]]:
    """Return ``(x, y, w, h)`` boxes of candidate cells found in the line mask."""
    contours, hierarchy = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    cells = []
    for idx, contour in enumerate(contours):
        # Each hierarchy row is [next, previous, first_child, parent].
        _, _, first_child, parent = hierarchy[0][idx]
        if parent == -1 and first_child != -1:
            # A top-level blob that encloses other contours is the ruling
            # grid itself. Its bounding box spans the whole table, so keeping
            # it would add a bogus cell to the first row.
            continue

        x, y, w, h = cv2.boundingRect(contour)
        if w < 20 or h < 20:
            continue  # line fragments, dash artifacts and similar noise

        # Drop regions that are almost entirely pure black (< 5% non-zero
        # pixels). A blank white cell is fully non-zero and is therefore
        # kept, which preserves the column position of empty cells.
        roi = table_img[y:y + h, x:x + w]
        nonzero_ratio = cv2.countNonZero(cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)) / (w * h + 1e-5)
        if nonzero_ratio < 0.05:
            continue

        cells.append((x, y, w, h))
    return cells


def _group_cells_into_rows(
    cells: list[tuple[int, int, int, int]], row_tolerance: int
) -> list[list[tuple[int, int, int, int]]]:
    """Group boxes into rows by y proximity; each row is sorted left to right."""
    grouped = []
    current_row = []
    last_y = None
    for cell in sorted(cells, key=lambda b: (b[1], b[0])):
        y = cell[1]
        # A jump larger than the tolerance from the previous cell's y starts
        # a new row.
        if last_y is not None and abs(y - last_y) > row_tolerance:
            grouped.append(sorted(current_row, key=lambda b: b[0]))
            current_row = []
        current_row.append(cell)
        last_y = y
    if current_row:
        grouped.append(sorted(current_row, key=lambda b: b[0]))
    return grouped


def _fit_row_length(values: list[str], width: int) -> list[str]:
    """Pad with empty strings or truncate so the row has exactly ``width`` items."""
    return values[:width] + [""] * (width - len(values))


def extract_cells_from_grid(table_img: np.ndarray) -> pd.DataFrame:
    """OCR a ruled table image cell by cell into a DataFrame.

    The table rulings are isolated with morphological opening, the regions
    they enclose are taken as cells, cells are grouped into rows by their
    y coordinate, and each cell is OCR'd with Tesseract in ``--psm 7`` mode.
    Every row is padded or truncated to the most common row length.

    Args:
        table_img: BGR image cropped to a single table.

    Returns:
        A DataFrame with one row per detected table row and default integer
        column labels, or an empty DataFrame when no cells are found.
    """
    cells = _find_cell_boxes(table_img, _grid_line_mask(table_img))
    if not cells:
        return pd.DataFrame()

    cell_rows = _group_cells_into_rows(cells, row_tolerance=15)

    # The modal column count defines the table width.
    most_common_cols = Counter(len(r) for r in cell_rows).most_common(1)[0][0]

    table_data = []
    for row in cell_rows:
        row_text = [
            pytesseract.image_to_string(
                table_img[y:y + h, x:x + w], config="--psm 7"
            ).strip()
            for x, y, w, h in row
        ]
        table_data.append(_fit_row_length(row_text, most_common_cols))

    return pd.DataFrame(table_data)


def detect_table_boxes(image: np.ndarray) -> list[tuple[int, int, int, int]]:
    """Locate table-like regions by finding long horizontal and vertical lines.

    Args:
        image: BGR image of the page.

    Returns:
        ``(x, y, w, h)`` bounding boxes of the outermost connected line
        structures that are wider than 100 px and taller than 50 px.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Invert so dark ink becomes foreground. Because THRESH_OTSU is set, the
    # threshold is chosen automatically and the literal 128 is not used.
    _, binary = cv2.threshold(~gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    img_height, img_width = binary.shape[:2]
    # Kernel length is 1/15 of the image dimension; clamp to 1 because OpenCV
    # rejects a zero-sized structuring element (images smaller than 15 px).
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (max(1, img_width // 15), 1)
    )
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, max(1, img_height // 15))
    )

    # Erode-then-dilate keeps only strokes at least as long as the kernel.
    horizontal = cv2.dilate(cv2.erode(binary, horizontal_kernel), horizontal_kernel)
    vertical = cv2.dilate(cv2.erode(binary, vertical_kernel), vertical_kernel)

    # Saturating add: plain ``+`` on uint8 arrays wraps around at line
    # crossings (255 + 255 -> 254) instead of staying at 255.
    mask = cv2.add(horizontal, vertical)
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Bounding boxes are (x, y, w, h); keep only table-sized ones.
    return [
        box
        for box in (cv2.boundingRect(contour) for contour in contours)
        if box[2] > 100 and box[3] > 50
    ]

def extract_non_table_text(image: np.ndarray, table_boxes: list[tuple]) -> str:
    """OCR everything in ``image`` that lies outside the given table boxes.

    Args:
        image: BGR image of the page.
        table_boxes: ``(x, y, w, h)`` rectangles to exclude from OCR.

    Returns:
        The text Tesseract reads from the page once the table rectangles
        have been zeroed out.
    """
    # Begin with a "keep everything" mask and punch a zero-filled hole for
    # each table rectangle.
    keep_mask = np.full(image.shape[:2], 255, dtype=np.uint8)
    for left, top, width, height in table_boxes:
        cv2.rectangle(keep_mask, (left, top), (left + width, top + height), 0, -1)

    # Pixels where the mask is zero come out as zero (black) in the result.
    text_only = cv2.bitwise_and(image, image, mask=keep_mask)
    grayscale = cv2.cvtColor(text_only, cv2.COLOR_BGR2GRAY)

    return pytesseract.image_to_string(grayscale, config=r'--oem 3 --psm 6')

def dataframe_to_markdown(df: pd.DataFrame) -> str:
    """Render ``df`` as a Markdown table, leaving out the index column."""
    markdown_table = df.to_markdown(index=False)
    return markdown_table

def extract_image(filepath: str) -> str:
    """Extract text and tables from an image file as a Markdown-style string.

    Table regions are detected and OCR'd cell by cell; the rest of the page
    is OCR'd as plain text. The result has an optional
    ``### Non-Table Text:`` section followed by one
    ``### Table N (Markdown):`` section per extracted table.

    Args:
        filepath: Path of the image file to process.

    Returns:
        The combined report with surrounding whitespace stripped; an empty
        string when nothing was recognised.
    """
    image = load_local_image(filepath)

    tables = []
    extracted_boxes = []
    for i, (x, y, w, h) in enumerate(detect_table_boxes(image)):
        try:
            df = extract_cells_from_grid(image[y:y + h, x:x + w])
        except Exception as e:
            # Best effort: one bad region must not abort the whole page.
            print(f"[Warning] Skipping table {i} due to error: {e}")
            continue
        if df.empty:
            # No cells were found, so there is nothing to render as a table.
            continue
        tables.append(df)
        extracted_boxes.append((x, y, w, h))

    # Mask out only the regions that actually produced a table. Regions that
    # failed or yielded no cells stay visible so their text still reaches the
    # plain-text OCR instead of vanishing from the output.
    non_table_text = extract_non_table_text(image, extracted_boxes).strip()

    sections = []
    if non_table_text:
        sections.append(f"### Non-Table Text:\n{non_table_text}")
    sections.extend(
        f"### Table {number} (Markdown):\n{dataframe_to_markdown(df)}"
        for number, df in enumerate(tables, start=1)
    )
    return "\n\n".join(sections).strip()