# Source: Rahul-Samedavar — "made oneshotter faster" (commit 8882944)
import cv2
import pytesseract
import numpy as np
import pandas as pd
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
def load_local_image(path: str) -> np.ndarray:
    """Read an image from disk and return it as an OpenCV BGR array.

    Pillow does the decoding (it tolerates truncated files thanks to
    ``LOAD_TRUNCATED_IMAGES`` set above), then the channels are swapped
    from RGB to BGR so downstream cv2 calls see their native order.
    """
    pil_image = Image.open(path).convert("RGB")
    rgb_array = np.array(pil_image)
    return cv2.cvtColor(rgb_array, cv2.COLOR_RGB2BGR)
def sort_contours(cnts, method="top-to-bottom"):
    """Sort contours by bounding-box position.

    Parameters
    ----------
    cnts : sequence of contours (as returned by ``cv2.findContours``).
    method : str
        One of "left-to-right", "right-to-left", "top-to-bottom",
        "bottom-to-top". Unknown values fall back to left-to-right.

    Returns
    -------
    tuple
        ``(cnts, boundingBoxes)`` sorted together, where each box is a
        ``cv2.boundingRect`` tuple ``(x, y, w, h)``.
    """
    # BUGFIX: zip(*...) on an empty sequence raises ValueError; return
    # empty results instead so callers can handle "no contours" cleanly.
    if len(cnts) == 0:
        return (), ()
    # Sort on y (index 1) for vertical orderings, on x (index 0) otherwise.
    axis = 1 if method in ("top-to-bottom", "bottom-to-top") else 0
    reverse = method in ("right-to-left", "bottom-to-top")
    boundingBoxes = [cv2.boundingRect(c) for c in cnts]
    cnts, boundingBoxes = zip(*sorted(zip(cnts, boundingBoxes),
                                      key=lambda pair: pair[1][axis],
                                      reverse=reverse))
    return cnts, boundingBoxes
from collections import Counter
def _grid_line_mask(binary: np.ndarray, scale: int = 15) -> np.ndarray:
    """Return a mask keeping only long horizontal/vertical strokes (grid lines).

    Erode kills any stroke shorter than the kernel, dilate restores the
    survivors; max(1, ...) guards tiny crops where ``dim // scale`` would
    be 0 and make ``getStructuringElement`` raise.
    """
    img_rows, img_cols = binary.shape[:2]
    h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT,
                                         (max(1, img_cols // scale), 1))
    v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT,
                                         (1, max(1, img_rows // scale)))
    horizontal = cv2.dilate(cv2.erode(binary, h_kernel), h_kernel)
    vertical = cv2.dilate(cv2.erode(binary, v_kernel), v_kernel)
    # cv2.add saturates at 255 (raw uint8 "+" would wrap modulo 256).
    return cv2.add(horizontal, vertical)


def _collect_cell_boxes(table_img: np.ndarray, mask: np.ndarray) -> list:
    """Bounding boxes of plausible cells, filtering line fragments and blanks."""
    contours, _ = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    boxes = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w < 20 or h < 20:
            continue  # noise: line fragments, dash artifacts, etc.
        roi_gray = cv2.cvtColor(table_img[y:y + h, x:x + w], cv2.COLOR_BGR2GRAY)
        # Fraction of non-black pixels; near zero means an essentially
        # black crop with no content worth OCRing. (+1e-5 avoids /0.)
        filled_ratio = cv2.countNonZero(roi_gray) / (w * h + 1e-5)
        if filled_ratio < 0.05:
            continue
        boxes.append((x, y, w, h))
    return boxes


def _group_into_rows(cells: list, tolerance: int = 15) -> list:
    """Group (x, y, w, h) boxes into rows by y proximity, each row sorted by x.

    ``cells`` must already be sorted by (y, x); a new row starts whenever
    the y gap to the previous cell exceeds ``tolerance`` pixels.
    """
    grouped = []
    current_row = []
    prev_y = None
    for cell in cells:
        y = cell[1]
        if prev_y is None or abs(y - prev_y) <= tolerance:
            current_row.append(cell)
        else:
            grouped.append(sorted(current_row, key=lambda b: b[0]))
            current_row = [cell]
        prev_y = y
    if current_row:
        grouped.append(sorted(current_row, key=lambda b: b[0]))
    return grouped


def extract_cells_from_grid(table_img: np.ndarray) -> pd.DataFrame:
    """OCR a grid-style table image into a DataFrame.

    Pipeline: Otsu-binarize → isolate grid lines → find cell contours →
    group cells into rows → OCR each cell with Tesseract → pad/truncate
    rows to the modal column count so the frame is rectangular.

    Parameters
    ----------
    table_img : BGR image of a single table (as cropped by the caller).

    Returns
    -------
    pd.DataFrame
        One row per detected table row; empty frame if no cells found.
    """
    gray = cv2.cvtColor(table_img, cv2.COLOR_BGR2GRAY)
    # Invert so ink becomes white, then Otsu picks the threshold.
    _, binary = cv2.threshold(~gray, 128, 255,
                              cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    mask = _grid_line_mask(binary)
    cells = _collect_cell_boxes(table_img, mask)
    if not cells:
        return pd.DataFrame()

    # Top-to-bottom, then left-to-right, before row grouping.
    cells.sort(key=lambda b: (b[1], b[0]))
    row_groups = _group_into_rows(cells)
    if not row_groups:
        return pd.DataFrame()

    # Most common number of cells per row defines the table width.
    most_common_cols = Counter(len(r) for r in row_groups).most_common(1)[0][0]

    table_data = []
    for row in row_groups:  # rows are already x-sorted by _group_into_rows
        row_data = []
        for x, y, w, h in row:
            cell_img = table_img[y:y + h, x:x + w]
            # --psm 7: treat each cell crop as a single text line.
            cell_text = pytesseract.image_to_string(
                cell_img, config="--psm 7").strip()
            row_data.append(cell_text)
        # Normalize row length to the majority column count.
        if len(row_data) < most_common_cols:
            row_data += [""] * (most_common_cols - len(row_data))
        else:
            row_data = row_data[:most_common_cols]
        table_data.append(row_data)
    return pd.DataFrame(table_data)
def detect_table_boxes(image: np.ndarray) -> list[tuple[int, int, int, int]]:
    """Find bounding boxes of table-like regions in a full page image.

    Parameters
    ----------
    image : BGR page image.

    Returns
    -------
    list of (x, y, w, h)
        Outer bounding boxes of detected grid structures, keeping only
        regions large enough to plausibly be tables (w > 100, h > 50).
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Invert so ink is white, then Otsu-threshold to a clean binary image.
    _, binary = cv2.threshold(~gray, 128, 255,
                              cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # Keep only long horizontal strokes: erode removes anything shorter
    # than the kernel, dilate restores the survivors. max(1, ...) guards
    # images narrower/shorter than 15 px, where a 0-size kernel raises.
    horizontalsize = max(1, binary.shape[1] // 15)
    horizontalStructure = cv2.getStructuringElement(cv2.MORPH_RECT,
                                                    (horizontalsize, 1))
    horizontal = cv2.dilate(cv2.erode(binary, horizontalStructure),
                            horizontalStructure)

    verticalsize = max(1, binary.shape[0] // 15)
    verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT,
                                                  (1, verticalsize))
    vertical = cv2.dilate(cv2.erode(binary, verticalStructure),
                          verticalStructure)

    # BUGFIX: raw uint8 "+" wraps modulo 256 (255 + 255 -> 254, and mid
    # values can cancel to 0); cv2.add saturates at 255, matching the
    # mask construction used in extract_cells_from_grid.
    mask = cv2.add(horizontal, vertical)

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_SIMPLE)
    boxes = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w > 100 and h > 50:  # discard small grid fragments / noise
            boxes.append((x, y, w, h))
    return boxes
def extract_non_table_text(image: np.ndarray, table_boxes: list[tuple]) -> str:
    """OCR everything on *image* that lies outside the given table boxes.

    Table regions are painted out with a mask so Tesseract only sees the
    free-standing text; returns the raw OCR string.
    """
    table_mask = np.zeros(image.shape[:2], dtype=np.uint8)
    # Paint every table region solid (filled rectangle) on the mask...
    for box in table_boxes:
        x, y, w, h = box
        cv2.rectangle(table_mask, (x, y), (x + w, y + h), 255, -1)
    # ...then keep only the pixels NOT covered by any table.
    keep_mask = cv2.bitwise_not(table_mask)
    outside_img = cv2.bitwise_and(image, image, mask=keep_mask)
    gray = cv2.cvtColor(outside_img, cv2.COLOR_BGR2GRAY)
    # --oem 3: default engine; --psm 6: assume a uniform block of text.
    return pytesseract.image_to_string(gray, config=r'--oem 3 --psm 6')
def dataframe_to_markdown(df: pd.DataFrame) -> str:
    """Render *df* as a markdown table without the index column."""
    markdown = df.to_markdown(index=False)
    return markdown
def extract_image(filepath: str) -> str:
    """End-to-end OCR of an image file: tables as markdown plus loose text.

    Parameters
    ----------
    filepath : path to a local image file.

    Returns
    -------
    str
        A "### Non-Table Text:" section (if any text was found outside
        tables) followed by one "### Table N (Markdown):" section per
        successfully parsed, non-empty table.
    """
    image = load_local_image(filepath)
    table_boxes = detect_table_boxes(image)

    tables = []
    for i, (x, y, w, h) in enumerate(table_boxes):
        cropped = image[y:y + h, x:x + w]
        try:
            df = extract_cells_from_grid(cropped)
        except Exception as e:
            # Best effort: one malformed table must not abort the page.
            print(f"[Warning] Skipping table {i} due to error: {e}")
            continue
        # BUGFIX: skip empty frames so we don't emit empty markdown sections.
        if not df.empty:
            tables.append(df)

    non_table_text = extract_non_table_text(image, table_boxes)

    # Assemble with a parts list + join instead of quadratic "+=".
    parts = []
    if non_table_text.strip():
        parts.append(f"### Non-Table Text:\n{non_table_text.strip()}\n\n")
    for i, df in enumerate(tables):
        parts.append(f"### Table {i+1} (Markdown):\n{dataframe_to_markdown(df)}\n\n")
    return "".join(parts).strip()