Spaces:
Sleeping
Sleeping
| import cv2 | |
| import pytesseract | |
| import numpy as np | |
| import pandas as pd | |
| from PIL import Image, ImageFile | |
| from typing import List, Dict, Any | |
| ImageFile.LOAD_TRUNCATED_IMAGES = True | |
def load_local_image(path: str) -> np.ndarray:
    """Load an image from a local path as a BGR array.

    Args:
        path: Filesystem path of the image to open.

    Returns:
        The image converted to RGB by PIL and then reordered to BGR with
        ``cv2.cvtColor``.
    """
    # Use a context manager so the underlying file handle is released
    # deterministically instead of whenever the Image object is collected.
    with Image.open(path) as img:
        # convert() returns an independent, fully loaded copy, so it is
        # safe to use after the source image has been closed.
        rgb = img.convert("RGB")
    return cv2.cvtColor(np.array(rgb), cv2.COLOR_RGB2BGR)
def sort_contours(cnts, method="top-to-bottom"):
    """Sort contours by the position of their bounding boxes.

    Args:
        cnts: Sequence of contours accepted by ``cv2.boundingRect``.
        method: ``"left-to-right"``, ``"right-to-left"``, ``"top-to-bottom"``
            or ``"bottom-to-top"``. Any other value sorts left to right.

    Returns:
        A ``(contours, bounding_boxes)`` pair of tuples in sorted order, or
        ``((), ())`` when ``cnts`` is empty.
    """
    # zip(*...) over an empty sequence yields nothing to unpack, so the
    # empty case has to be answered up front.
    if len(cnts) == 0:
        return (), ()

    # Bounding boxes are (x, y, w, h): index 1 sorts on y, index 0 on x.
    axis = 1 if method in ("top-to-bottom", "bottom-to-top") else 0
    reverse = method in ("right-to-left", "bottom-to-top")

    boxes = [cv2.boundingRect(c) for c in cnts]
    ordered = sorted(
        zip(cnts, boxes),
        key=lambda pair: pair[1][axis],
        reverse=reverse,
    )
    sorted_cnts, sorted_boxes = zip(*ordered)
    return sorted_cnts, sorted_boxes
def _group_cells_into_rows(cells, row_tolerance=20):
    """Group cells (already sorted by y, then x) into rows of similar y."""
    grouped = []
    current_row = []
    current_y = None  # y of the first cell of the row being built
    for cell in cells:
        # The first cell always opens a row anchored at its own y; later
        # cells open a new row once they drift past the tolerance.
        if current_y is None or abs(cell['y'] - current_y) > row_tolerance:
            if current_row:
                grouped.append(current_row)
            current_row = [cell]
            current_y = cell['y']
        else:
            current_row.append(cell)
    if current_row:
        grouped.append(current_row)
    return grouped


def _rows_to_dataframe(grouped_rows):
    """Turn grouped cells into a DataFrame of texts, padding short rows."""
    if not grouped_rows:
        return pd.DataFrame()
    table_data = [
        [cell['text'] for cell in sorted(row, key=lambda c: c['x'])]
        for row in grouped_rows
    ]
    max_cols = max(len(row) for row in table_data)
    for row in table_data:
        row.extend([''] * (max_cols - len(row)))
    return pd.DataFrame(table_data)


def extract_cells_from_grid(table_img: np.ndarray) -> pd.DataFrame:
    """Extract table structure from an image using OpenCV and Tesseract.

    Ruling lines are isolated with morphological opening, the contours of
    the resulting line mask are treated as candidate cells, and each
    candidate is OCR'd as a single text line.

    Args:
        table_img: Image array that ``cv2.cvtColor`` can convert with
            ``COLOR_BGR2GRAY``.

    Returns:
        A DataFrame with one row per detected table row and one column per
        cell position (short rows padded with ``''``), or an empty
        DataFrame when no cell passes the size filter.
    """
    gray = cv2.cvtColor(table_img, cv2.COLOR_BGR2GRAY)
    # Invert so dark ruling lines become foreground before thresholding.
    _, binary = cv2.threshold(~gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    height, width = binary.shape[:2]

    # Detect horizontal lines. The kernel length is clamped to 1 so images
    # narrower than 15 px do not request a zero-sized structuring element.
    horizontal_size = max(1, width // 15)
    horizontal_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1))
    horizontal = cv2.erode(binary, horizontal_structure)
    horizontal = cv2.dilate(horizontal, horizontal_structure)

    # Detect vertical lines (same clamping for very short images).
    vertical_size = max(1, height // 15)
    vertical_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size))
    vertical = cv2.erode(binary, vertical_structure)
    vertical = cv2.dilate(vertical, vertical_structure)

    # Combine both line masks into one grid mask.
    mask = cv2.add(horizontal, vertical)
    # NOTE(review): RETR_TREE presumably also yields the outer outline of
    # the grid, which would be OCR'd as one oversized "cell" - confirm
    # whether that is intended.
    contours, _ = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    cells = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w <= 30 or h <= 20:  # Filter small contours
            continue
        cell_img = table_img[y:y + h, x:x + w]
        try:
            text = pytesseract.image_to_string(cell_img, config='--psm 7').strip()
        except Exception:
            # Best effort: keep the cell's position even when OCR fails,
            # but let KeyboardInterrupt/SystemExit propagate.
            text = ''
        cells.append({'x': x, 'y': y, 'w': w, 'h': h, 'text': text})

    # Sort cells by position so rows can be grouped in reading order.
    cells.sort(key=lambda cell: (cell['y'], cell['x']))

    return _rows_to_dataframe(_group_cells_into_rows(cells))
def extract_image_content(image_path: str) -> str:
    """Extract text content from an image using OCR.

    Runs plain OCR first. When the text looks tabular (contains ``|`` or a
    tab, or spans more than three lines), grid-based table extraction is
    attempted as well and its result is placed before the raw OCR text.

    Args:
        image_path: Local path of the image to read.

    Returns:
        The table-plus-OCR text, the stripped OCR text,
        ``"[No text detected in image]"`` when OCR yields nothing, or an
        ``"[Error processing image: ...]"`` message if processing fails.
        Exceptions are reported in the returned string, not raised.
    """
    try:
        # Load image
        img = load_local_image(image_path)

        # Basic OCR
        text = pytesseract.image_to_string(img)

        # Heuristic table check. Real tab/newline characters are used here;
        # escaped backslash sequences would never occur in OCR output.
        looks_tabular = (
            '|' in text
            or '\t' in text
            or len(text.split('\n')) > 3
        )
        if looks_tabular:
            try:
                table_df = extract_cells_from_grid(img)
            except Exception:
                # Table extraction is best effort; fall back to plain OCR.
                table_df = None
            if table_df is not None and not table_df.empty:
                table_text = "\n".join(
                    " | ".join(map(str, row)) for row in table_df.values
                )
                return f"[Table detected]\n{table_text}\n\n[OCR Text]\n{text}"

        stripped = text.strip()
        return stripped if stripped else "[No text detected in image]"
    except Exception as e:
        return f"[Error processing image: {str(e)}]"