xlsx-extract / main.py
blazeofchi's picture
Upload folder using huggingface_hub
dfffdd6 verified
import gradio as gr
from openpyxl import load_workbook
import zipfile
import os
import tempfile
import xml.etree.ElementTree as ET
import tempfile
import zipfile
import os
import xml.etree.ElementTree as ET
def extract_images_from_excel(file, sheet_names=None):
"""
Extracts images from an Excel file, referenced in drawing.xml files.
"""
output_path = "output"
with tempfile.TemporaryDirectory() as temp_dir:
with zipfile.ZipFile(file, "r") as zip_ref:
zip_ref.extractall(temp_dir)
xl_folder = os.path.join(temp_dir, "xl")
worksheets_folder = os.path.join(xl_folder, "worksheets")
rels_folder = os.path.join(worksheets_folder, "_rels")
drawings_folder = os.path.join(xl_folder, "drawings")
media_folder = os.path.join(xl_folder, "media")
if not os.path.exists(rels_folder) or not os.path.exists(media_folder):
print("No images or relationships found in the Excel file.")
return
sheet_name_to_rel = {}
if sheet_names:
workbook_file = os.path.join(xl_folder, "workbook.xml")
root = ET.fromstring(open(workbook_file).read())
namespace = {"ns": root.tag.split("}")[0].strip("{")}
sheets = root.findall(".//ns:sheet", namespace)
for sheet in sheets:
sheet_id = sheet.attrib["sheetId"]
name = sheet.attrib["name"]
if name in sheet_names:
sheet_name_to_rel[name] = f"sheet{sheet_id}.xml.rels"
extracted_images = []
for rel_file in os.listdir(rels_folder):
if sheet_names:
if not any(
sheet_name_to_rel[sheet] in rel_file for sheet in sheet_names
):
continue
namespace = {
"rel": "http://schemas.openxmlformats.org/package/2006/relationships"
}
rel_file_path = os.path.join(rels_folder, rel_file)
rel_tree = ET.parse(rel_file_path)
for relationship in rel_tree.findall("rel:Relationship", namespace):
if relationship.attrib["Type"].endswith("/drawing"):
drawing_file = os.path.join(
drawings_folder, os.path.basename(relationship.attrib["Target"])
)
if os.path.exists(drawing_file):
drawing_tree = ET.parse(drawing_file)
namespace = {
"xdr": "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
}
images_path = drawing_tree.findall(".//xdr:cNvPr", namespace)
print("Found images elems in drawing.xml file", images_path)
for i in images_path:
image_name = i.attrib.get("name", None)
image_path = os.path.normpath(
os.path.join(media_folder, os.path.basename(image_name))
)
if os.path.exists(image_path):
output_image_path = os.path.join(
output_path, os.path.basename(image_name)
)
os.makedirs(output_path, exist_ok=True)
with open(image_path, "rb") as img_file:
with open(output_image_path, "wb") as out_file:
out_file.write(img_file.read())
extracted_images.append(output_image_path)
print(f"Extracted {len(extracted_images)} images to {temp_dir}")
return extracted_images
def get_markdown_format(cell):
"""
Convert Excel cell formatting (bold, italic, strikethrough) to Markdown format.
"""
if cell is None or cell.value is None:
return ""
value = str(cell.value)
if cell.font.bold:
value = f"**{value}**"
if cell.font.italic:
value = f"*{value}*"
if cell.font.strike:
value = f"~~{value}~~"
value = value.replace("\n", " ").replace("|", "\\|")
return value
def extract_tables_from_sheet(sheet):
"""
Extract tables from an Excel sheet.
Detects tables based on non-empty rows and retains formatting.
"""
tables = []
current_table = []
for row in sheet.iter_rows():
if all(cell.value is None for cell in row):
if current_table:
tables.append(current_table)
current_table = []
else:
current_table.append(row)
if current_table:
tables.append(current_table)
return tables
def convert_table_to_markdown(table, images=[]):
"""
Convert a table to Markdown format, preserving Excel formatting.
"""
try:
non_empty_columns = [
col_idx
for col_idx in range(len(table[0]))
if any(row[col_idx].value is not None for row in table)
]
except IndexError:
non_empty_columns = [
col_idx
for col_idx in range(len(table[0]))
if any(row[col_idx] is not None for row in table)
]
filtered_table = [
[row[col_idx] if col_idx < len(row) else None for col_idx in non_empty_columns]
for row in table
]
md_str = ""
header = [get_markdown_format(cell) for cell in filtered_table[0]]
md_str += f"| {' | '.join(header)} |\n"
md_str += f"|{'|'.join(['---' for _ in header])}|\n"
for data_row in filtered_table[1:]:
row_str = []
for cell in data_row:
row_str.append(get_markdown_format(cell))
md_str += f"| {' | '.join(row_str)} |\n"
return md_str
def parse_excel(file, selected_sheets):
"""
Parse the selected sheets from the uploaded Excel file and return Markdown content.
"""
if not selected_sheets or len(selected_sheets) == 0:
return "No sheets selected.", []
workbook = load_workbook(file.name, read_only=True)
sheets = workbook.sheetnames
markdown_output = ""
images = extract_images_from_excel(file.name, selected_sheets)
for sheet_name in selected_sheets:
if sheet_name not in sheets:
markdown_output += f"Sheet '{sheet_name}' not found.\n\n"
continue
sheet = workbook[sheet_name]
tables = extract_tables_from_sheet(sheet)
markdown_output += f"# {sheet_name}\n\n"
for table in tables:
markdown_output += convert_table_to_markdown(table)
markdown_output += "\n\n"
if markdown_output == "":
markdown_output = "No tables found in selected sheets."
return markdown_output, images
def get_sheets(file):
"""
Return the list of sheet names from the uploaded Excel file.
"""
if not file:
return gr.update(choices=[])
workbook = load_workbook(file.name, read_only=True)
return gr.update(choices=workbook.sheetnames)
# Gradio interface
with gr.Blocks() as demo:
gr.Markdown("## Excel Tables and Images from XLSX")
with gr.Row():
with gr.Column(scale=4):
with gr.Tab("Text/Tables"):
markdown_output = gr.Markdown(show_copy_button=True)
with gr.Tab("Images"):
images_output = gr.Gallery(
label="Images in Sheets",
show_label=False,
elem_id="gallery",
object_fit="contain",
height="auto",
)
with gr.Column(scale=1):
sheet_selector = gr.CheckboxGroup(
label="Select Sheets to Parse", interactive=True
)
with gr.Row():
file_input = gr.File(label="Upload Excel File", file_types=[".xlsx"])
extrract_btn = gr.Button(
"Extract Sheets",
)
extrract_btn.click(fn=get_sheets, inputs=[file_input], outputs=sheet_selector)
sheet_selector.change(
fn=parse_excel,
inputs=[file_input, sheet_selector],
outputs=[markdown_output, images_output],
)
if __name__ == "__main__":
demo.launch()