Spaces:

redhairedshanks1
/

Extract-Text-and-Table

Paused

App Files Files Community

Extract-Text-and-Table / services /extract_table.py

redhairedshanks1

Create extract_table.py

bb7d43f verified 5 months ago

raw

history blame contribute delete

3.58 kB

	import os
	import fitz # PyMuPDF (not used in this script, but often for PDF handling)
	import pdfplumber # For extracting tables from PDFs
	import pandas as pd # For handling tabular data (CSV, Excel)
	from docx.api import Document # For reading DOCX documents
	import logging

	# Configure the root logger at import time so INFO-and-above messages are
	# visible when this module is used standalone. basicConfig() is a no-op if
	# the root logger already has handlers configured.
	logging.basicConfig(level=logging.INFO)
	# Module-level logger named after this module; used for all diagnostics below.
	logger = logging.getLogger(__name__)

	def extract_tables_from_file(file, start_page=None, end_page=None, filename=None):
	    """Extract every table from a document and return them as one string.

	    The parser is selected from the extension of ``filename``; when
	    ``filename`` is not given, the extension of the resolved path is used
	    instead. Supported formats: PDF, DOCX, CSV and XLS/XLSX.

	    Extraction is best-effort: parser failures are logged and whatever was
	    collected before the failure is still returned.

	    Args:
	        file: A filesystem path (``str`` / ``os.PathLike``), an object that
	            exposes its path as ``.name`` (e.g. an open or temporary file),
	            or a readable file-like object without a usable ``.name``, which
	            is handed to the parser directly.
	        start_page (int, optional): First 1-based PDF page to scan.
	            Defaults to the first page. Ignored for non-PDF input.
	        end_page (int, optional): Last 1-based PDF page to scan (inclusive).
	            Defaults to the last page. Ignored for non-PDF input.
	        filename (str, optional): Name used to determine the file extension.

	    Returns:
	        str: All extracted tables joined by blank lines; an empty string if
	        nothing was extracted or the file type is unsupported.
	    """
	    source = _resolve_source(file)

	    # Prefer the caller-supplied name; otherwise fall back to the path so a
	    # bare path or named file object still selects the right parser.
	    name_for_ext = filename or (source if isinstance(source, str) else "")
	    ext = os.path.splitext(name_for_ext)[-1].lower()

	    if ext == ".pdf":
	        tables = _pdf_tables(source, start_page, end_page)
	    elif ext == ".docx":
	        tables = _docx_tables(source)
	    elif ext == ".csv":
	        tables = _csv_tables(source)
	    elif ext in (".xls", ".xlsx"):
	        tables = _excel_tables(source)
	    else:
	        logger.warning("Unsupported file type: %s", ext)
	        tables = []

	    # Join all extracted tables into a single string separated by blank lines.
	    return "\n\n".join(tables)


	def _resolve_source(file):
	    """Return what the parsers should open: a path string if available, else ``file``."""
	    if isinstance(file, (str, os.PathLike)):
	        return os.fspath(file)
	    name = getattr(file, "name", None)
	    if isinstance(name, (str, os.PathLike)):
	        return os.fspath(name)
	    # No usable path: let the parser read from the stream itself.
	    return file


	def _pdf_tables(source, start_page, end_page):
	    """Return one formatted string per table found on the requested PDF pages."""
	    found = []
	    try:
	        with pdfplumber.open(source) as pdf:
	            total_pages = len(pdf.pages)
	            # Clamp the requested range to the pages that actually exist.
	            first = max(start_page or 1, 1)
	            last = min(end_page or total_pages, total_pages)

	            for page_num, page in enumerate(pdf.pages, start=1):
	                if page_num < first:
	                    continue
	                if page_num > last:
	                    break  # nothing past the requested range is needed
	                try:
	                    for table in page.extract_tables():
	                        # Pipe-separated columns; empty cells (None) become "".
	                        rows = [
	                            " | ".join(cell or "" for cell in row)
	                            for row in table
	                            if row
	                        ]
	                        found.append(f"Page {page_num} Table:\n" + "\n".join(rows))
	                except Exception as e:
	                    # A bad page must not discard tables from the other pages.
	                    logger.warning(
	                        "PDF table extraction failed on page %s: %s", page_num, e
	                    )
	    except Exception as e:
	        logger.error("Failed to read PDF file: %s", e)
	    return found


	def _docx_tables(source):
	    """Return one formatted string per table in a DOCX document."""
	    found = []
	    try:
	        doc = Document(source)
	        for table in doc.tables:
	            rows = [
	                " | ".join(cell.text.strip() for cell in row.cells)
	                for row in table.rows
	            ]
	            found.append("\n".join(rows))
	    except Exception as e:
	        logger.error("DOCX table extraction failed: %s", e)
	    return found


	def _csv_tables(source):
	    """Return the CSV content rendered as a single index-free text table."""
	    try:
	        return [pd.read_csv(source).to_string(index=False)]
	    except Exception as e:
	        logger.warning("CSV parsing error: %s", e)
	        return []


	def _excel_tables(source):
	    """Return one formatted string per worksheet of an Excel workbook."""
	    found = []
	    try:
	        # Context manager guarantees the workbook handle is closed.
	        with pd.ExcelFile(source) as workbook:
	            for sheet in workbook.sheet_names:
	                sheet_df = workbook.parse(sheet)
	                found.append(f"Sheet: {sheet}\n{sheet_df.to_string(index=False)}")
	    except Exception as e:
	        logger.warning("Excel parsing error: %s", e)
	    return found