Spaces:
Build error
Build error
| import gradio as gr | |
| from openpyxl import load_workbook | |
| import zipfile | |
| import os | |
| import tempfile | |
| import xml.etree.ElementTree as ET | |
| import tempfile | |
| import zipfile | |
| import os | |
| import xml.etree.ElementTree as ET | |
| def extract_images_from_excel(file, sheet_names=None): | |
| """ | |
| Extracts images from an Excel file, referenced in drawing.xml files. | |
| """ | |
| output_path = "output" | |
| with tempfile.TemporaryDirectory() as temp_dir: | |
| with zipfile.ZipFile(file, "r") as zip_ref: | |
| zip_ref.extractall(temp_dir) | |
| xl_folder = os.path.join(temp_dir, "xl") | |
| worksheets_folder = os.path.join(xl_folder, "worksheets") | |
| rels_folder = os.path.join(worksheets_folder, "_rels") | |
| drawings_folder = os.path.join(xl_folder, "drawings") | |
| media_folder = os.path.join(xl_folder, "media") | |
| if not os.path.exists(rels_folder) or not os.path.exists(media_folder): | |
| print("No images or relationships found in the Excel file.") | |
| return | |
| sheet_name_to_rel = {} | |
| if sheet_names: | |
| workbook_file = os.path.join(xl_folder, "workbook.xml") | |
| root = ET.fromstring(open(workbook_file).read()) | |
| namespace = {"ns": root.tag.split("}")[0].strip("{")} | |
| sheets = root.findall(".//ns:sheet", namespace) | |
| for sheet in sheets: | |
| sheet_id = sheet.attrib["sheetId"] | |
| name = sheet.attrib["name"] | |
| if name in sheet_names: | |
| sheet_name_to_rel[name] = f"sheet{sheet_id}.xml.rels" | |
| extracted_images = [] | |
| for rel_file in os.listdir(rels_folder): | |
| if sheet_names: | |
| if not any( | |
| sheet_name_to_rel[sheet] in rel_file for sheet in sheet_names | |
| ): | |
| continue | |
| namespace = { | |
| "rel": "http://schemas.openxmlformats.org/package/2006/relationships" | |
| } | |
| rel_file_path = os.path.join(rels_folder, rel_file) | |
| rel_tree = ET.parse(rel_file_path) | |
| for relationship in rel_tree.findall("rel:Relationship", namespace): | |
| if relationship.attrib["Type"].endswith("/drawing"): | |
| drawing_file = os.path.join( | |
| drawings_folder, os.path.basename(relationship.attrib["Target"]) | |
| ) | |
| if os.path.exists(drawing_file): | |
| drawing_tree = ET.parse(drawing_file) | |
| namespace = { | |
| "xdr": "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing", | |
| "a": "http://schemas.openxmlformats.org/drawingml/2006/main", | |
| "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", | |
| } | |
| images_path = drawing_tree.findall(".//xdr:cNvPr", namespace) | |
| print("Found images elems in drawing.xml file", images_path) | |
| for i in images_path: | |
| image_name = i.attrib.get("name", None) | |
| image_path = os.path.normpath( | |
| os.path.join(media_folder, os.path.basename(image_name)) | |
| ) | |
| if os.path.exists(image_path): | |
| output_image_path = os.path.join( | |
| output_path, os.path.basename(image_name) | |
| ) | |
| os.makedirs(output_path, exist_ok=True) | |
| with open(image_path, "rb") as img_file: | |
| with open(output_image_path, "wb") as out_file: | |
| out_file.write(img_file.read()) | |
| extracted_images.append(output_image_path) | |
| print(f"Extracted {len(extracted_images)} images to {temp_dir}") | |
| return extracted_images | |
| def get_markdown_format(cell): | |
| """ | |
| Convert Excel cell formatting (bold, italic, strikethrough) to Markdown format. | |
| """ | |
| if cell is None or cell.value is None: | |
| return "" | |
| value = str(cell.value) | |
| if cell.font.bold: | |
| value = f"**{value}**" | |
| if cell.font.italic: | |
| value = f"*{value}*" | |
| if cell.font.strike: | |
| value = f"~~{value}~~" | |
| value = value.replace("\n", " ").replace("|", "\\|") | |
| return value | |
| def extract_tables_from_sheet(sheet): | |
| """ | |
| Extract tables from an Excel sheet. | |
| Detects tables based on non-empty rows and retains formatting. | |
| """ | |
| tables = [] | |
| current_table = [] | |
| for row in sheet.iter_rows(): | |
| if all(cell.value is None for cell in row): | |
| if current_table: | |
| tables.append(current_table) | |
| current_table = [] | |
| else: | |
| current_table.append(row) | |
| if current_table: | |
| tables.append(current_table) | |
| return tables | |
| def convert_table_to_markdown(table, images=[]): | |
| """ | |
| Convert a table to Markdown format, preserving Excel formatting. | |
| """ | |
| try: | |
| non_empty_columns = [ | |
| col_idx | |
| for col_idx in range(len(table[0])) | |
| if any(row[col_idx].value is not None for row in table) | |
| ] | |
| except IndexError: | |
| non_empty_columns = [ | |
| col_idx | |
| for col_idx in range(len(table[0])) | |
| if any(row[col_idx] is not None for row in table) | |
| ] | |
| filtered_table = [ | |
| [row[col_idx] if col_idx < len(row) else None for col_idx in non_empty_columns] | |
| for row in table | |
| ] | |
| md_str = "" | |
| header = [get_markdown_format(cell) for cell in filtered_table[0]] | |
| md_str += f"| {' | '.join(header)} |\n" | |
| md_str += f"|{'|'.join(['---' for _ in header])}|\n" | |
| for data_row in filtered_table[1:]: | |
| row_str = [] | |
| for cell in data_row: | |
| row_str.append(get_markdown_format(cell)) | |
| md_str += f"| {' | '.join(row_str)} |\n" | |
| return md_str | |
| def parse_excel(file, selected_sheets): | |
| """ | |
| Parse the selected sheets from the uploaded Excel file and return Markdown content. | |
| """ | |
| if not selected_sheets or len(selected_sheets) == 0: | |
| return "No sheets selected.", [] | |
| workbook = load_workbook(file.name, read_only=True) | |
| sheets = workbook.sheetnames | |
| markdown_output = "" | |
| images = extract_images_from_excel(file.name, selected_sheets) | |
| for sheet_name in selected_sheets: | |
| if sheet_name not in sheets: | |
| markdown_output += f"Sheet '{sheet_name}' not found.\n\n" | |
| continue | |
| sheet = workbook[sheet_name] | |
| tables = extract_tables_from_sheet(sheet) | |
| markdown_output += f"# {sheet_name}\n\n" | |
| for table in tables: | |
| markdown_output += convert_table_to_markdown(table) | |
| markdown_output += "\n\n" | |
| if markdown_output == "": | |
| markdown_output = "No tables found in selected sheets." | |
| return markdown_output, images | |
| def get_sheets(file): | |
| """ | |
| Return the list of sheet names from the uploaded Excel file. | |
| """ | |
| if not file: | |
| return gr.update(choices=[]) | |
| workbook = load_workbook(file.name, read_only=True) | |
| return gr.update(choices=workbook.sheetnames) | |
| # Gradio interface | |
| with gr.Blocks() as demo: | |
| gr.Markdown("## Excel Tables and Images from XLSX") | |
| with gr.Row(): | |
| with gr.Column(scale=4): | |
| with gr.Tab("Text/Tables"): | |
| markdown_output = gr.Markdown(show_copy_button=True) | |
| with gr.Tab("Images"): | |
| images_output = gr.Gallery( | |
| label="Images in Sheets", | |
| show_label=False, | |
| elem_id="gallery", | |
| object_fit="contain", | |
| height="auto", | |
| ) | |
| with gr.Column(scale=1): | |
| sheet_selector = gr.CheckboxGroup( | |
| label="Select Sheets to Parse", interactive=True | |
| ) | |
| with gr.Row(): | |
| file_input = gr.File(label="Upload Excel File", file_types=[".xlsx"]) | |
| extrract_btn = gr.Button( | |
| "Extract Sheets", | |
| ) | |
| extrract_btn.click(fn=get_sheets, inputs=[file_input], outputs=sheet_selector) | |
| sheet_selector.change( | |
| fn=parse_excel, | |
| inputs=[file_input, sheet_selector], | |
| outputs=[markdown_output, images_output], | |
| ) | |
| if __name__ == "__main__": | |
| demo.launch() | |