Spaces:
Runtime error
Runtime error
| # Do imports like python3 so our package works for 2 and 3 | |
| from __future__ import absolute_import | |
| from lxml import html | |
| from openpyxl import Workbook | |
| from openpyxl.utils import get_column_letter | |
| from premailer import Premailer | |
| from tablepyxl.style import Table | |
| def string_to_int(s): | |
| if s.isdigit(): | |
| return int(s) | |
| return 0 | |
| def get_Tables(doc): | |
| tree = html.fromstring(doc) | |
| comments = tree.xpath('//comment()') | |
| for comment in comments: | |
| comment.drop_tag() | |
| return [Table(table) for table in tree.xpath('//table')] | |
| def write_rows(worksheet, elem, row, column=1): | |
| """ | |
| Writes every tr child element of elem to a row in the worksheet | |
| returns the next row after all rows are written | |
| """ | |
| from openpyxl.cell.cell import MergedCell | |
| initial_column = column | |
| for table_row in elem.rows: | |
| for table_cell in table_row.cells: | |
| cell = worksheet.cell(row=row, column=column) | |
| while isinstance(cell, MergedCell): | |
| column += 1 | |
| cell = worksheet.cell(row=row, column=column) | |
| colspan = string_to_int(table_cell.element.get("colspan", "1")) | |
| rowspan = string_to_int(table_cell.element.get("rowspan", "1")) | |
| if rowspan > 1 or colspan > 1: | |
| worksheet.merge_cells(start_row=row, start_column=column, | |
| end_row=row + rowspan - 1, end_column=column + colspan - 1) | |
| cell.value = table_cell.value | |
| table_cell.format(cell) | |
| min_width = table_cell.get_dimension('min-width') | |
| max_width = table_cell.get_dimension('max-width') | |
| if colspan == 1: | |
| # Initially, when iterating for the first time through the loop, the width of all the cells is None. | |
| # As we start filling in contents, the initial width of the cell (which can be retrieved by: | |
| # worksheet.column_dimensions[get_column_letter(column)].width) is equal to the width of the previous | |
| # cell in the same column (i.e. width of A2 = width of A1) | |
| width = max(worksheet.column_dimensions[get_column_letter(column)].width or 0, len(table_cell.value) + 2) | |
| if max_width and width > max_width: | |
| width = max_width | |
| elif min_width and width < min_width: | |
| width = min_width | |
| worksheet.column_dimensions[get_column_letter(column)].width = width | |
| column += colspan | |
| row += 1 | |
| column = initial_column | |
| return row | |
| def table_to_sheet(table, wb): | |
| """ | |
| Takes a table and workbook and writes the table to a new sheet. | |
| The sheet title will be the same as the table attribute name. | |
| """ | |
| ws = wb.create_sheet(title=table.element.get('name')) | |
| insert_table(table, ws, 1, 1) | |
| def document_to_workbook(doc, wb=None, base_url=None): | |
| """ | |
| Takes a string representation of an html document and writes one sheet for | |
| every table in the document. | |
| The workbook is returned | |
| """ | |
| if not wb: | |
| wb = Workbook() | |
| wb.remove(wb.active) | |
| inline_styles_doc = Premailer(doc, base_url=base_url, remove_classes=False).transform() | |
| tables = get_Tables(inline_styles_doc) | |
| for table in tables: | |
| table_to_sheet(table, wb) | |
| return wb | |
| def document_to_xl(doc, filename, base_url=None): | |
| """ | |
| Takes a string representation of an html document and writes one sheet for | |
| every table in the document. The workbook is written out to a file called filename | |
| """ | |
| wb = document_to_workbook(doc, base_url=base_url) | |
| wb.save(filename) | |
| def insert_table(table, worksheet, column, row): | |
| if table.head: | |
| row = write_rows(worksheet, table.head, row, column) | |
| if table.body: | |
| row = write_rows(worksheet, table.body, row, column) | |
| def insert_table_at_cell(table, cell): | |
| """ | |
| Inserts a table at the location of an openpyxl Cell object. | |
| """ | |
| ws = cell.parent | |
| column, row = cell.column, cell.row | |
| insert_table(table, ws, column, row) |