Spaces:

blazeofchi
/

xlsx-extract

Build error

App Files Files Community

xlsx-extract / main.py

blazeofchi

Upload folder using huggingface_hub

dfffdd6 verified about 1 year ago

raw

history blame contribute delete

8.52 kB

	import gradio as gr
	from openpyxl import load_workbook
	import zipfile
	import os
	import tempfile
	import xml.etree.ElementTree as ET


	import tempfile
	import zipfile
	import os
	import xml.etree.ElementTree as ET


	def extract_images_from_excel(file, sheet_names=None):
	"""
	Extracts images from an Excel file, referenced in drawing.xml files.
	"""

	output_path = "output"

	with tempfile.TemporaryDirectory() as temp_dir:
	with zipfile.ZipFile(file, "r") as zip_ref:
	zip_ref.extractall(temp_dir)

	xl_folder = os.path.join(temp_dir, "xl")
	worksheets_folder = os.path.join(xl_folder, "worksheets")
	rels_folder = os.path.join(worksheets_folder, "_rels")
	drawings_folder = os.path.join(xl_folder, "drawings")
	media_folder = os.path.join(xl_folder, "media")

	if not os.path.exists(rels_folder) or not os.path.exists(media_folder):
	print("No images or relationships found in the Excel file.")
	return

	sheet_name_to_rel = {}
	if sheet_names:
	workbook_file = os.path.join(xl_folder, "workbook.xml")

	root = ET.fromstring(open(workbook_file).read())
	namespace = {"ns": root.tag.split("}")[0].strip("{")}

	sheets = root.findall(".//ns:sheet", namespace)

	for sheet in sheets:
	sheet_id = sheet.attrib["sheetId"]
	name = sheet.attrib["name"]
	if name in sheet_names:
	sheet_name_to_rel[name] = f"sheet{sheet_id}.xml.rels"

	extracted_images = []

	for rel_file in os.listdir(rels_folder):
	if sheet_names:
	if not any(
	sheet_name_to_rel[sheet] in rel_file for sheet in sheet_names
	):
	continue
	namespace = {
	"rel": "http://schemas.openxmlformats.org/package/2006/relationships"
	}
	rel_file_path = os.path.join(rels_folder, rel_file)
	rel_tree = ET.parse(rel_file_path)

	for relationship in rel_tree.findall("rel:Relationship", namespace):
	if relationship.attrib["Type"].endswith("/drawing"):
	drawing_file = os.path.join(
	drawings_folder, os.path.basename(relationship.attrib["Target"])
	)

	if os.path.exists(drawing_file):
	drawing_tree = ET.parse(drawing_file)

	namespace = {
	"xdr": "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
	"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
	"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
	}

	images_path = drawing_tree.findall(".//xdr:cNvPr", namespace)

	print("Found images elems in drawing.xml file", images_path)
	for i in images_path:
	image_name = i.attrib.get("name", None)

	image_path = os.path.normpath(
	os.path.join(media_folder, os.path.basename(image_name))
	)

	if os.path.exists(image_path):
	output_image_path = os.path.join(
	output_path, os.path.basename(image_name)
	)
	os.makedirs(output_path, exist_ok=True)

	with open(image_path, "rb") as img_file:
	with open(output_image_path, "wb") as out_file:
	out_file.write(img_file.read())
	extracted_images.append(output_image_path)

	print(f"Extracted {len(extracted_images)} images to {temp_dir}")
	return extracted_images


	def get_markdown_format(cell):
	"""
	Convert Excel cell formatting (bold, italic, strikethrough) to Markdown format.
	"""
	if cell is None or cell.value is None:
	return ""

	value = str(cell.value)

	if cell.font.bold:
	value = f"{value}"
	if cell.font.italic:
	value = f"{value}"
	if cell.font.strike:
	value = f"~~{value}~~"

	value = value.replace("\n", " ").replace("\|", "\\\|")

	return value


	def extract_tables_from_sheet(sheet):
	"""
	Extract tables from an Excel sheet.
	Detects tables based on non-empty rows and retains formatting.
	"""
	tables = []
	current_table = []

	for row in sheet.iter_rows():
	if all(cell.value is None for cell in row):
	if current_table:
	tables.append(current_table)
	current_table = []
	else:
	current_table.append(row)
	if current_table:
	tables.append(current_table)

	return tables


	def convert_table_to_markdown(table, images=[]):
	"""
	Convert a table to Markdown format, preserving Excel formatting.
	"""

	try:
	non_empty_columns = [
	col_idx
	for col_idx in range(len(table[0]))
	if any(row[col_idx].value is not None for row in table)
	]
	except IndexError:
	non_empty_columns = [
	col_idx
	for col_idx in range(len(table[0]))
	if any(row[col_idx] is not None for row in table)
	]

	filtered_table = [
	[row[col_idx] if col_idx < len(row) else None for col_idx in non_empty_columns]
	for row in table
	]

	md_str = ""
	header = [get_markdown_format(cell) for cell in filtered_table[0]]
	md_str += f"\| {' \| '.join(header)} \|\n"
	md_str += f"\|{'\|'.join(['---' for _ in header])}\|\n"

	for data_row in filtered_table[1:]:
	row_str = []
	for cell in data_row:
	row_str.append(get_markdown_format(cell))
	md_str += f"\| {' \| '.join(row_str)} \|\n"

	return md_str


	def parse_excel(file, selected_sheets):
	"""
	Parse the selected sheets from the uploaded Excel file and return Markdown content.
	"""

	if not selected_sheets or len(selected_sheets) == 0:
	return "No sheets selected.", []

	workbook = load_workbook(file.name, read_only=True)
	sheets = workbook.sheetnames

	markdown_output = ""

	images = extract_images_from_excel(file.name, selected_sheets)

	for sheet_name in selected_sheets:
	if sheet_name not in sheets:
	markdown_output += f"Sheet '{sheet_name}' not found.\n\n"
	continue

	sheet = workbook[sheet_name]
	tables = extract_tables_from_sheet(sheet)
	markdown_output += f"# {sheet_name}\n\n"

	for table in tables:
	markdown_output += convert_table_to_markdown(table)
	markdown_output += "\n\n"
	if markdown_output == "":
	markdown_output = "No tables found in selected sheets."
	return markdown_output, images


	def get_sheets(file):
	"""
	Return the list of sheet names from the uploaded Excel file.
	"""
	if not file:
	return gr.update(choices=[])

	workbook = load_workbook(file.name, read_only=True)

	return gr.update(choices=workbook.sheetnames)


	# Gradio interface
	with gr.Blocks() as demo:
	gr.Markdown("## Excel Tables and Images from XLSX")

	with gr.Row():
	with gr.Column(scale=4):
	with gr.Tab("Text/Tables"):
	markdown_output = gr.Markdown(show_copy_button=True)

	with gr.Tab("Images"):
	images_output = gr.Gallery(
	label="Images in Sheets",
	show_label=False,
	elem_id="gallery",
	object_fit="contain",
	height="auto",
	)
	with gr.Column(scale=1):
	sheet_selector = gr.CheckboxGroup(
	label="Select Sheets to Parse", interactive=True
	)

	with gr.Row():
	file_input = gr.File(label="Upload Excel File", file_types=[".xlsx"])

	extrract_btn = gr.Button(
	"Extract Sheets",
	)

	extrract_btn.click(fn=get_sheets, inputs=[file_input], outputs=sheet_selector)

	sheet_selector.change(
	fn=parse_excel,
	inputs=[file_input, sheet_selector],
	outputs=[markdown_output, images_output],
	)


	if __name__ == "__main__":
	demo.launch()