Spaces:

krishnavadithya
/

expiryprocess

Sleeping

App Files Files Community

expiryprocess / src /excel_to_pdf.py

krishnavadithya

Upload folder using huggingface_hub

aacdfd5 verified 9 months ago

raw

history blame contribute delete

9.14 kB



	import os
	import math
	from openpyxl import load_workbook
	from reportlab.lib import colors
	from reportlab.lib.pagesizes import letter, A4, A3, landscape, portrait
	from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak
	from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
	from reportlab.lib.enums import TA_LEFT, TA_CENTER
	from reportlab.lib.units import inch
	import pyexcel as p


	def convert_xls_to_xlsx(xls_path, xlsx_path=None):
	"""Convert the old .xls file to .xlsx format"""
	if xlsx_path is None:
	xlsx_path = os.path.splitext(xls_path)[0] + '.xlsx'
	p.save_book_as(file_name=xls_path, dest_file_name=xlsx_path)
	return xlsx_path


	def determine_page_format(num_columns, max_column_width=None):
	"""
	Determine the optimal page size and orientation based on table dimensions.

	Args:
	num_columns (int): Number of columns in the table.
	max_column_width (float, optional): Maximum column width if available.

	Returns:
	tuple: (pagesize, orientation function)
	"""
	# Define thresholds for decision making
	if num_columns <= 5:
	# Few columns, likely to fit on portrait A4
	return A4, portrait
	elif num_columns <= 8:
	# Medium number of columns, use landscape A4
	return A4, landscape
	elif num_columns <= 12:
	# Many columns, use portrait A3
	return A3, portrait
	else:
	# Lots of columns, use landscape A3
	return A3, landscape


	def is_effectively_empty(value):
	"""
	Return True if the cell value is considered empty.

	Empty means:
	- The value is None.
	- The value is a float and math.isnan(value) is True.
	- The value is a string that is empty (after stripping whitespace).
	"""
	if value is None:
	return True
	if isinstance(value, float) and math.isnan(value):
	return True
	if isinstance(value, str) and not value.strip():
	return True
	return False


	def excel_to_pdf(excel_path, pdf_path=None, sheet_name=None, max_rows_per_table=50):
	"""
	Convert Excel file to PDF with adaptive page size based on content,
	removing columns that contain only NaN (or empty) values.

	Args:
	excel_path (str): Path to the Excel file.
	pdf_path (str, optional): Path for the output PDF file.
	sheet_name (str, optional): Name of the sheet to convert.
	max_rows_per_table (int): Maximum rows per table before splitting.

	Returns:
	str: Path to the created PDF file.
	"""
	if excel_path.endswith('.xls'):
	excel_path = convert_xls_to_xlsx(excel_path)

	if pdf_path is None:
	pdf_path = os.path.splitext(excel_path)[0] + '.pdf'

	# Load Excel file
	wb = load_workbook(excel_path)
	sheets = [sheet_name] if sheet_name else wb.sheetnames

	# Create paragraph styles for cell content
	styles = getSampleStyleSheet()
	header_style = ParagraphStyle(
	name='HeaderStyle',
	parent=styles['Normal'],
	fontName='Helvetica-Bold',
	fontSize=9,
	alignment=TA_CENTER,
	textColor=colors.white,
	leading=12
	)
	cell_style = ParagraphStyle(
	name='CellStyle',
	parent=styles['Normal'],
	fontName='Helvetica',
	fontSize=8,
	alignment=TA_LEFT,
	leading=10 # Line spacing
	)

	elements = []

	# Determine the effective maximum number of columns among all sheets (after filtering out empty ones)
	global_effective_max_columns = 0
	for sh in sheets:
	sheet = wb[sh]
	effective_cols = 0
	for col in range(1, sheet.max_column + 1):
	# Check if any cell in the column is non-empty
	for row in range(1, sheet.max_row + 1):
	if not is_effectively_empty(sheet.cell(row=row, column=col).value):
	effective_cols += 1
	break
	global_effective_max_columns = max(global_effective_max_columns, effective_cols)

	# Determine optimal page format based on effective column count
	pagesize, orientation_func = determine_page_format(global_effective_max_columns)

	# Create the document with determined format
	doc = SimpleDocTemplate(
	pdf_path,
	pagesize=orientation_func(pagesize),
	leftMargin=10,
	rightMargin=10,
	topMargin=15,
	bottomMargin=15
	)

	# Process each sheet
	for sheet_idx, current_sheet in enumerate(sheets):
	sheet = wb[current_sheet]

	# Determine which columns to keep (those with at least one non-empty cell)
	columns_to_keep = []
	for col in range(1, sheet.max_column + 1):
	for row in range(1, sheet.max_row + 1):
	if not is_effectively_empty(sheet.cell(row=row, column=col).value):
	columns_to_keep.append(col)
	break

	# If no columns have valid data, skip this sheet.
	if not columns_to_keep:
	continue

	# Calculate appropriate column widths (only for kept columns)
	max_col_width = 130 # Maximum column width in points
	min_col_width = 40 # Minimum column width in points
	if pagesize == A3:
	max_col_width = 150 # Allow wider columns on A3

	col_widths = []
	for col in columns_to_keep:
	max_length = 0
	# Sample first 100 rows for efficiency
	for row in range(1, min(100, sheet.max_row) + 1):
	cell = sheet.cell(row=row, column=col)
	if cell.value:
	content_length = len(str(cell.value))
	# Cap the length for width calculation at 30 characters
	max_length = max(max_length, min(content_length, 30))
	# Adjust multiplier based on page format (narrower columns for A4, wider for A3)
	multiplier = 5.5 if pagesize == A4 else 6.0
	width = min(max(min_col_width, max_length * multiplier), max_col_width)
	col_widths.append(width)

	# Build the header row from the kept columns
	header_row = []
	# Using row 1 as header (or adjust if your header is in another row)
	for col in columns_to_keep:
	cell_value = sheet.cell(row=1, column=col).value
	header_row.append(Paragraph(str(cell_value or ""), header_style))

	# Process data rows in chunks to avoid huge tables that might get chopped
	row_count = sheet.max_row
	# Start after header row
	start_row = 2
	while start_row <= row_count:
	end_row = min(start_row + max_rows_per_table - 1, row_count)

	# Create data for this chunk, starting with the header row
	chunk_data = [header_row]
	for row_idx in range(start_row, end_row + 1):
	data_row = []
	for col in columns_to_keep:
	cell = sheet.cell(row=row_idx, column=col)
	cell_value = cell.value or ""
	data_row.append(Paragraph(str(cell_value), cell_style))
	chunk_data.append(data_row)

	# Create table for this chunk
	table = Table(chunk_data, colWidths=col_widths, repeatRows=1)

	# Style the table
	table_style = TableStyle([
	# Header styling
	('BACKGROUND', (0, 0), (-1, 0), colors.darkblue),
	('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
	('ALIGN', (0, 0), (-1, 0), 'CENTER'),

	# Grid
	('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
	('VALIGN', (0, 0), (-1, -1), 'TOP'),

	# Row background colors
	('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.lightgrey]),

	# Cell padding
	('LEFTPADDING', (0, 0), (-1, -1), 3),
	('RIGHTPADDING', (0, 0), (-1, -1), 3),
	('TOPPADDING', (0, 0), (-1, -1), 3),
	('BOTTOMPADDING', (0, 0), (-1, -1), 3)
	])

	table.setStyle(table_style)
	table.hAlign = 'LEFT'
	table.spaceBefore = 5
	table.spaceAfter = 15

	elements.append(table)

	# Uncomment below if you wish to add a continuation note when splitting tables
	# if end_row < row_count:
	# continuation = Paragraph(f"Table continues... (Rows {start_row}-{end_row} of {row_count})", styles['Italic'])
	# elements.append(continuation)
	# elements.append(Spacer(1, 0.2 * inch))

	start_row = end_row + 1

	# Add page break between sheets (except for the last sheet)
	if sheet_idx < len(sheets) - 1:
	elements.append(PageBreak())

	# Build PDF
	doc.build(elements)

	return pdf_path