# NOTE(review): the following three lines were non-Python page-scrape residue
# ("quantumbit's picture" / "Upload 39 files" / "e8051be verified") that made
# the file unparseable; preserved here as a comment for provenance.
import cv2
import pytesseract
import numpy as np
import pandas as pd
from PIL import Image, ImageFile
from typing import List, Dict, Any
# Let PIL decode images whose data stream is cut short instead of raising
# (OCR inputs are often partially-downloaded or mildly corrupt files).
ImageFile.LOAD_TRUNCATED_IMAGES = True
def load_local_image(path: str) -> np.ndarray:
    """Read an image from disk and return it as a BGR numpy array.

    PIL does the format decoding and forces a 3-channel RGB view; the
    array is then converted to OpenCV's BGR channel order so it can be
    passed straight to cv2 functions.

    Args:
        path: Local filesystem path to the image.

    Returns:
        np.ndarray of shape (H, W, 3) in BGR order.
    """
    rgb_array = np.array(Image.open(path).convert("RGB"))
    return cv2.cvtColor(rgb_array, cv2.COLOR_RGB2BGR)
def sort_contours(cnts, method="top-to-bottom"):
    """Sort contours by bounding-box position.

    Args:
        cnts: Sequence of OpenCV contours.
        method: One of "left-to-right", "right-to-left",
            "top-to-bottom", "bottom-to-top".

    Returns:
        Tuple (sorted_contours, bounding_boxes) where each bounding box
        is the (x, y, w, h) tuple from cv2.boundingRect. Empty input
        yields ([], []).
    """
    # Fix: the original did zip(*sorted(zip(...))) unconditionally, which
    # raises "not enough values to unpack" when cnts is empty.
    if not cnts:
        return [], []
    # Sort on y for vertical orderings (box index 1), x for horizontal (0).
    axis = 1 if method in ("top-to-bottom", "bottom-to-top") else 0
    reverse = method in ("right-to-left", "bottom-to-top")
    bounding_boxes = [cv2.boundingRect(c) for c in cnts]
    cnts, bounding_boxes = zip(*sorted(zip(cnts, bounding_boxes),
                                       key=lambda pair: pair[1][axis],
                                       reverse=reverse))
    return cnts, bounding_boxes
def extract_cells_from_grid(table_img: np.ndarray) -> pd.DataFrame:
    """Extract a table from an image of a ruled grid.

    Detects horizontal and vertical ruling lines with morphological
    erode/dilate, takes cell bounding boxes from contours of the combined
    line mask, OCRs each cell crop, groups cells into rows by vertical
    position, and returns the text as a rectangular DataFrame.

    Args:
        table_img: BGR image (H, W, 3) containing a grid-style table.

    Returns:
        DataFrame of per-cell text (rows padded with '' to equal width),
        or an empty DataFrame if no plausible cells were found.
    """
    gray = cv2.cvtColor(table_img, cv2.COLOR_BGR2GRAY)
    # Invert so ink/lines are white; Otsu chooses the actual threshold
    # (the 128 argument is ignored when THRESH_OTSU is set).
    _, binary = cv2.threshold(~gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # Keep only long horizontal runs: erode then dilate with a wide kernel.
    # max(1, ...) guards against a zero-width kernel on very small images.
    h_size = max(1, binary.shape[1] // 15)
    h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (h_size, 1))
    horizontal = cv2.dilate(cv2.erode(binary, h_kernel), h_kernel)

    # Same for long vertical runs with a tall kernel.
    v_size = max(1, binary.shape[0] // 15)
    v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, v_size))
    vertical = cv2.dilate(cv2.erode(binary, v_kernel), v_kernel)

    mask = cv2.add(horizontal, vertical)
    contours, _ = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    cells = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w > 30 and h > 20:  # skip line fragments / noise
            cell_img = table_img[y:y + h, x:x + w]
            try:
                # --psm 7: treat the crop as a single text line.
                text = pytesseract.image_to_string(cell_img, config='--psm 7').strip()
            except Exception:
                # One failing cell shouldn't abort the whole table; the
                # original used a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit.
                text = ''
            cells.append({'x': x, 'y': y, 'w': w, 'h': h, 'text': text})

    # Reading order: top-to-bottom, then left-to-right.
    cells.sort(key=lambda cell: (cell['y'], cell['x']))

    # Group cells whose y is within 20px of the current row anchor.
    # (Renamed from `rows`, which shadowed the int `rows` used above.)
    grouped_rows = []
    current_row = []
    current_y = 0
    for cell in cells:
        if abs(cell['y'] - current_y) > 20:  # new-row threshold
            if current_row:
                grouped_rows.append(current_row)
            current_row = [cell]
            current_y = cell['y']
        else:
            current_row.append(cell)
    if current_row:
        grouped_rows.append(current_row)

    table_data = [
        [cell['text'] for cell in sorted(row, key=lambda c: c['x'])]
        for row in grouped_rows
    ]
    if not table_data:
        return pd.DataFrame()
    # Pad ragged rows so the DataFrame is rectangular.
    max_cols = max(len(row) for row in table_data)
    for row in table_data:
        row.extend([''] * (max_cols - len(row)))
    return pd.DataFrame(table_data)
def extract_image_content(image_path: str) -> str:
    """Extract text content from an image file using OCR.

    Runs plain OCR first; if the result looks tabular (contains pipes,
    tabs, or more than three lines), additionally attempts grid-based
    table extraction and prepends the reconstructed table.

    Args:
        image_path: Path to a local image file.

    Returns:
        The OCR text, a "[Table detected]..." report, or a bracketed
        placeholder/error message. Never raises.
    """
    try:
        img = load_local_image(image_path)
        text = pytesseract.image_to_string(img)
        # Table heuristic. Fix: the original compared against the
        # two-character literals '\\t' and '\\n' (double-escaped), so the
        # tab and multi-line checks could never match real OCR output and
        # the table text was joined with literal backslash-n sequences.
        if '|' in text or '\t' in text or len(text.split('\n')) > 3:
            try:
                table_df = extract_cells_from_grid(img)
                if not table_df.empty:
                    table_text = "\n".join(
                        " | ".join(row) for row in table_df.values
                    )
                    return f"[Table detected]\n{table_text}\n\n[OCR Text]\n{text}"
            except Exception:
                # Table extraction is best-effort; fall through to plain OCR.
                pass
        return text.strip() if text.strip() else "[No text detected in image]"
    except Exception as e:
        return f"[Error processing image: {str(e)}]"