import cv2
import pytesseract
import numpy as np
import pandas as pd
from PIL import Image, ImageFile
from typing import List, Dict, Any

# Let Pillow load images whose files are cut short (e.g. interrupted
# downloads or damaged scans) instead of raising an IOError mid-decode.
ImageFile.LOAD_TRUNCATED_IMAGES = True

def load_local_image(path: str) -> np.ndarray:
    """Read an image from disk and return it as an OpenCV BGR array.

    Args:
        path: Filesystem path to the image file.

    Returns:
        ``np.ndarray`` in BGR channel order, ready for cv2 routines.
    """
    with Image.open(path) as pil_img:
        rgb = pil_img.convert("RGB")
    # cv2 expects BGR channel order, PIL produces RGB — swap channels.
    return cv2.cvtColor(np.array(rgb), cv2.COLOR_RGB2BGR)

def sort_contours(cnts, method="top-to-bottom"):
    """Sort contours by the position of their bounding boxes.

    Args:
        cnts: Sequence of OpenCV contours.
        method: One of "left-to-right", "right-to-left",
            "top-to-bottom" (default), "bottom-to-top".

    Returns:
        Tuple ``(sorted_cnts, sorted_bounding_boxes)`` where each bounding
        box is an ``(x, y, w, h)`` tuple. Both are empty when ``cnts`` is
        empty.
    """
    # Guard the empty case: zip(*[]) below would raise ValueError.
    if len(cnts) == 0:
        return [], []
    # Sort key: y coordinate (index 1) for vertical methods, x for horizontal.
    axis = 1 if method in ("top-to-bottom", "bottom-to-top") else 0
    reverse = method in ("right-to-left", "bottom-to-top")
    bounding_boxes = [cv2.boundingRect(c) for c in cnts]
    cnts, bounding_boxes = zip(*sorted(zip(cnts, bounding_boxes),
                                       key=lambda pair: pair[1][axis],
                                       reverse=reverse))
    return cnts, bounding_boxes

def extract_cells_from_grid(table_img: np.ndarray) -> pd.DataFrame:
    """Extract a ruled table's cells from an image and OCR each cell.

    Args:
        table_img: BGR image of the table (as from ``load_local_image``).

    Returns:
        ``pd.DataFrame`` with one row per detected table row; short rows
        are right-padded with '' so all rows have equal length. An empty
        DataFrame when no cell-sized contours are found.
    """
    gray = cv2.cvtColor(table_img, cv2.COLOR_BGR2GRAY)
    # Invert before Otsu so dark ruling lines become white foreground.
    _, binary = cv2.threshold(~gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    mask = _grid_line_mask(binary)
    cells = _ocr_cell_boxes(table_img, mask)
    grouped_rows = _group_cells_into_rows(cells)

    # Within each row, order cells left-to-right and keep just the text.
    table_data = [
        [cell['text'] for cell in sorted(row, key=lambda c: c['x'])]
        for row in grouped_rows
    ]
    if not table_data:
        return pd.DataFrame()
    # Pad ragged rows with '' so the DataFrame is rectangular.
    max_cols = max(len(row) for row in table_data)
    for row in table_data:
        row.extend([''] * (max_cols - len(row)))
    return pd.DataFrame(table_data)


def _grid_line_mask(binary: np.ndarray) -> np.ndarray:
    """Morphologically isolate horizontal + vertical ruling lines in *binary*."""
    height, width = binary.shape[:2]
    # max(1, ...) guards against a zero-size kernel on images under 15 px,
    # which would make cv2.getStructuringElement raise.
    h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, width // 15), 1))
    v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(1, height // 15)))
    # Erode-then-dilate (opening) keeps only long runs along each axis.
    horizontal = cv2.dilate(cv2.erode(binary.copy(), h_kernel), h_kernel)
    vertical = cv2.dilate(cv2.erode(binary.copy(), v_kernel), v_kernel)
    return cv2.add(horizontal, vertical)


def _ocr_cell_boxes(table_img: np.ndarray, mask: np.ndarray) -> List[Dict[str, Any]]:
    """Find cell-sized contours in *mask*, OCR each crop, sort by position."""
    contours, _ = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    cells = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w > 30 and h > 20:  # skip noise and thin line fragments
            crop = table_img[y:y+h, x:x+w]
            try:
                # --psm 7: treat the crop as a single line of text.
                text = pytesseract.image_to_string(crop, config='--psm 7').strip()
            except Exception:
                # OCR failure on one cell must not abort the whole table;
                # record the cell with empty text instead.
                text = ''
            cells.append({'x': x, 'y': y, 'w': w, 'h': h, 'text': text})
    # Reading order: top-to-bottom, then left-to-right.
    cells.sort(key=lambda cell: (cell['y'], cell['x']))
    return cells


def _group_cells_into_rows(cells: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
    """Group position-sorted cells into rows using a 20 px y-distance threshold."""
    rows: List[List[Dict[str, Any]]] = []
    current_row: List[Dict[str, Any]] = []
    current_y = None  # y anchor of the current row; None until the first cell
    for cell in cells:
        if current_y is None or abs(cell['y'] - current_y) > 20:
            # Cell starts a new row; flush the previous one if any.
            if current_row:
                rows.append(current_row)
            current_row = [cell]
            current_y = cell['y']
        else:
            current_row.append(cell)
    if current_row:
        rows.append(current_row)
    return rows

def extract_image_content(image_path: str) -> str:
    """OCR an image file; render an embedded grid table when one is detected.

    Args:
        image_path: Path to a local image file.

    Returns:
        The extracted text. When a grid table is detected, the rendered
        table precedes the raw OCR text. On any failure a bracketed
        "[Error ...]" string is returned instead of raising.
    """
    try:
        img = load_local_image(image_path)

        # Full-page OCR first; used both as output and as a table heuristic.
        text = pytesseract.image_to_string(img)

        # Heuristic: pipe characters, tab characters, or more than three
        # lines of output suggest tabular content. (The original code used
        # doubled escapes '\\t'/'\\n', which matched a literal backslash
        # instead of tab/newline — fixed here.)
        if '|' in text or '\t' in text or len(text.split('\n')) > 3:
            try:
                table_df = extract_cells_from_grid(img)
                if not table_df.empty:
                    table_text = "\n".join(" | ".join(row) for row in table_df.values)
                    return f"[Table detected]\n{table_text}\n\n[OCR Text]\n{text}"
            except Exception:
                # Table extraction is best-effort; fall back to plain OCR.
                pass

        return text.strip() if text.strip() else "[No text detected in image]"

    except Exception as e:
        return f"[Error processing image: {str(e)}]"