import gradio as gr from openpyxl import load_workbook import zipfile import os import tempfile import xml.etree.ElementTree as ET import tempfile import zipfile import os import xml.etree.ElementTree as ET def extract_images_from_excel(file, sheet_names=None): """ Extracts images from an Excel file, referenced in drawing.xml files. """ output_path = "output" with tempfile.TemporaryDirectory() as temp_dir: with zipfile.ZipFile(file, "r") as zip_ref: zip_ref.extractall(temp_dir) xl_folder = os.path.join(temp_dir, "xl") worksheets_folder = os.path.join(xl_folder, "worksheets") rels_folder = os.path.join(worksheets_folder, "_rels") drawings_folder = os.path.join(xl_folder, "drawings") media_folder = os.path.join(xl_folder, "media") if not os.path.exists(rels_folder) or not os.path.exists(media_folder): print("No images or relationships found in the Excel file.") return sheet_name_to_rel = {} if sheet_names: workbook_file = os.path.join(xl_folder, "workbook.xml") root = ET.fromstring(open(workbook_file).read()) namespace = {"ns": root.tag.split("}")[0].strip("{")} sheets = root.findall(".//ns:sheet", namespace) for sheet in sheets: sheet_id = sheet.attrib["sheetId"] name = sheet.attrib["name"] if name in sheet_names: sheet_name_to_rel[name] = f"sheet{sheet_id}.xml.rels" extracted_images = [] for rel_file in os.listdir(rels_folder): if sheet_names: if not any( sheet_name_to_rel[sheet] in rel_file for sheet in sheet_names ): continue namespace = { "rel": "http://schemas.openxmlformats.org/package/2006/relationships" } rel_file_path = os.path.join(rels_folder, rel_file) rel_tree = ET.parse(rel_file_path) for relationship in rel_tree.findall("rel:Relationship", namespace): if relationship.attrib["Type"].endswith("/drawing"): drawing_file = os.path.join( drawings_folder, os.path.basename(relationship.attrib["Target"]) ) if os.path.exists(drawing_file): drawing_tree = ET.parse(drawing_file) namespace = { "xdr": "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing", "a": "http://schemas.openxmlformats.org/drawingml/2006/main", "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", } images_path = drawing_tree.findall(".//xdr:cNvPr", namespace) print("Found images elems in drawing.xml file", images_path) for i in images_path: image_name = i.attrib.get("name", None) image_path = os.path.normpath( os.path.join(media_folder, os.path.basename(image_name)) ) if os.path.exists(image_path): output_image_path = os.path.join( output_path, os.path.basename(image_name) ) os.makedirs(output_path, exist_ok=True) with open(image_path, "rb") as img_file: with open(output_image_path, "wb") as out_file: out_file.write(img_file.read()) extracted_images.append(output_image_path) print(f"Extracted {len(extracted_images)} images to {temp_dir}") return extracted_images def get_markdown_format(cell): """ Convert Excel cell formatting (bold, italic, strikethrough) to Markdown format. """ if cell is None or cell.value is None: return "" value = str(cell.value) if cell.font.bold: value = f"**{value}**" if cell.font.italic: value = f"*{value}*" if cell.font.strike: value = f"~~{value}~~" value = value.replace("\n", " ").replace("|", "\\|") return value def extract_tables_from_sheet(sheet): """ Extract tables from an Excel sheet. Detects tables based on non-empty rows and retains formatting. """ tables = [] current_table = [] for row in sheet.iter_rows(): if all(cell.value is None for cell in row): if current_table: tables.append(current_table) current_table = [] else: current_table.append(row) if current_table: tables.append(current_table) return tables def convert_table_to_markdown(table, images=[]): """ Convert a table to Markdown format, preserving Excel formatting. """ try: non_empty_columns = [ col_idx for col_idx in range(len(table[0])) if any(row[col_idx].value is not None for row in table) ] except IndexError: non_empty_columns = [ col_idx for col_idx in range(len(table[0])) if any(row[col_idx] is not None for row in table) ] filtered_table = [ [row[col_idx] if col_idx < len(row) else None for col_idx in non_empty_columns] for row in table ] md_str = "" header = [get_markdown_format(cell) for cell in filtered_table[0]] md_str += f"| {' | '.join(header)} |\n" md_str += f"|{'|'.join(['---' for _ in header])}|\n" for data_row in filtered_table[1:]: row_str = [] for cell in data_row: row_str.append(get_markdown_format(cell)) md_str += f"| {' | '.join(row_str)} |\n" return md_str def parse_excel(file, selected_sheets): """ Parse the selected sheets from the uploaded Excel file and return Markdown content. """ if not selected_sheets or len(selected_sheets) == 0: return "No sheets selected.", [] workbook = load_workbook(file.name, read_only=True) sheets = workbook.sheetnames markdown_output = "" images = extract_images_from_excel(file.name, selected_sheets) for sheet_name in selected_sheets: if sheet_name not in sheets: markdown_output += f"Sheet '{sheet_name}' not found.\n\n" continue sheet = workbook[sheet_name] tables = extract_tables_from_sheet(sheet) markdown_output += f"# {sheet_name}\n\n" for table in tables: markdown_output += convert_table_to_markdown(table) markdown_output += "\n\n" if markdown_output == "": markdown_output = "No tables found in selected sheets." return markdown_output, images def get_sheets(file): """ Return the list of sheet names from the uploaded Excel file. """ if not file: return gr.update(choices=[]) workbook = load_workbook(file.name, read_only=True) return gr.update(choices=workbook.sheetnames) # Gradio interface with gr.Blocks() as demo: gr.Markdown("## Excel Tables and Images from XLSX") with gr.Row(): with gr.Column(scale=4): with gr.Tab("Text/Tables"): markdown_output = gr.Markdown(show_copy_button=True) with gr.Tab("Images"): images_output = gr.Gallery( label="Images in Sheets", show_label=False, elem_id="gallery", object_fit="contain", height="auto", ) with gr.Column(scale=1): sheet_selector = gr.CheckboxGroup( label="Select Sheets to Parse", interactive=True ) with gr.Row(): file_input = gr.File(label="Upload Excel File", file_types=[".xlsx"]) extrract_btn = gr.Button( "Extract Sheets", ) extrract_btn.click(fn=get_sheets, inputs=[file_input], outputs=sheet_selector) sheet_selector.change( fn=parse_excel, inputs=[file_input, sheet_selector], outputs=[markdown_output, images_output], ) if __name__ == "__main__": demo.launch()