Spaces:
Runtime error
Runtime error
| from collections import defaultdict | |
| import json | |
| import zipfile | |
| from lxml import etree | |
# Font names to ignore: a font attribute whose value appears in this set
# is treated as a common default rather than as meaningful formatting.
common_fonts = {
    "Arial",
    "Calibri",
    "Times New Roman",
    # Extend this set with any other common fonts.
}
# Element names to ignore. Entries are local names, i.e. the tag with its
# "{namespace}" prefix stripped.
ignored_elements = {
    "proofErr", "bookmarkStart", "bookmarkEnd", "lastRenderedPageBreak",
    "webHidden", "numPr", "pBdr",
    "ind", "spacing", "jc", "tabs",
    "sectPr", "pgMar",
    # Extend this set with any other elements to ignore.
}
# Attribute names to ignore. Entries are local names, i.e. the attribute
# name with its "{namespace}" prefix stripped. Each name is listed once;
# repeated members in a set literal are silently collapsed and only make
# the list harder to maintain.
ignored_attributes = {
    'rsidR',
    'rsidRPr',
    'rsidRDefault',
    'rsidP',
    'rsidDel',
    'rsidTr',
    'paraId',
    'textId',
    # Add any other attributes to ignore here
}
# Metadata element names (local names, namespace stripped) that
# extract_metadata leaves out of its result.
# NOTE(review): these look like extended-property names from
# docProps/app.xml, while extract_metadata reads docProps/core.xml and
# compares tags case-sensitively - confirm these entries can ever match.
ignored_metadata_elements = {
    "application", "docSecurity", "scaleCrop", "linksUpToDate",
    "charactersWithSpaces", "hiddenSlides", "mmClips", "notes",
    "words", "characters", "pages", "lines", "paragraphs",
    "company", "template",
    # Extend this set with any other metadata elements to ignore.
}
def remove_ignored_elements(tree):
    """Prune ignored elements and attributes from an lxml tree, in place.

    Every descendant of ``tree`` is visited once:

    * an element whose local name is in ``ignored_elements`` is detached
      from its parent;
    * a WordprocessingML ``rPr`` element is detached unless one of its
      direct children has a tag ending in ``highlight``;
    * any other element stays, but loses every attribute whose local name
      is in ``ignored_attributes`` or starts with ``rsid``.

    Note: this module defines ``remove_ignored_elements`` again further
    down; that later definition is the one bound once the module has loaded.

    Returns the same ``tree`` object that was passed in.
    """
    run_props_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'

    for node in tree.xpath(".//*"):
        local_name = node.tag.split('}')[-1]

        if local_name in ignored_elements:
            node.getparent().remove(node)
            continue

        if node.tag == run_props_tag:
            # Run properties are only kept when they carry a highlight.
            has_highlight = any(kid.tag.endswith('highlight') for kid in node)
            if not has_highlight:
                node.getparent().remove(node)
            continue

        # Collect first, delete afterwards, so the attribute mapping is not
        # modified while it is being iterated.
        unwanted = [
            name for name in node.attrib
            if name.split('}')[-1] in ignored_attributes
            or name.split('}')[-1].startswith('rsid')
        ]
        for name in unwanted:
            del node.attrib[name]

    return tree
def etree_to_dict(t):
    """Convert an lxml element into a nested ``{tag: content}`` dictionary.

    Tags and attribute names are reduced to their local name (the
    ``{namespace}`` prefix is dropped).

    ``content`` is one of:

    * a ``dict`` when the element has children and/or attributes. Child
      results are grouped by tag (a single child is stored directly,
      repeated tags become a list), the surviving attributes are merged
      in, and non-empty stripped text is stored under ``'#text'``;
    * the stripped text when the element has only text.

    Attributes are dropped when they are font attributes (``ascii``,
    ``hAnsi``, ``cs``, ``eastAsia``) naming a font in ``common_fonts``, or
    when their name is in ``ignored_attributes`` or starts with ``rsid``.

    Returns ``None`` for nodes that contribute nothing: comments and
    processing instructions, elements listed in ``ignored_elements``, and
    elements with no attributes, no children and no text.
    """
    # Comments, processing instructions and entities expose a callable
    # ``tag`` instead of a string; calling .split() on it would raise
    # AttributeError, and they carry no document content anyway.
    if not isinstance(t.tag, str):
        return None

    tag = t.tag.split('}')[-1]  # Remove namespace URI
    if tag in ignored_elements:
        return None

    children = list(t)
    if not t.attrib and not children and not t.text:
        return None

    content = None
    if children:
        grouped = defaultdict(list)
        for child in children:
            converted = etree_to_dict(child)
            if converted:
                for key, value in converted.items():
                    grouped[key].append(value)
        # A tag seen once maps to its value; a repeated tag maps to a list.
        content = {
            key: values[0] if len(values) == 1 else values
            for key, values in grouped.items()
        }
    elif t.attrib:
        content = {}

    if t.attrib:
        for name, value in t.attrib.items():
            name = name.split('}')[-1]  # Remove namespace URI
            if name in ('ascii', 'hAnsi', 'cs', 'eastAsia'):
                if value in common_fonts:
                    continue
            elif name in ignored_attributes or name.startswith('rsid'):
                continue
            content[name] = value

    if t.text:
        # lxml already hands back decoded ``str`` values, so no encode/decode
        # round trip is needed here.
        # NOTE(review): strip() also trims text of elements marked
        # xml:space="preserve" - confirm that losing those spaces is intended.
        text = t.text.strip()
        if children or t.attrib:
            if text:
                content['#text'] = text
        else:
            content = text

    return {tag: content}
# This definition replaces the earlier ``remove_ignored_elements`` in this
# module and additionally normalises element text.
def remove_ignored_elements(tree):
    """Prune ignored elements/attributes from an lxml tree and trim text.

    First pass, over every descendant of ``tree``:

    * an element whose local name is in ``ignored_elements`` is detached
      from its parent;
    * a WordprocessingML ``rPr`` element is detached unless one of its
      direct children has a tag ending in ``highlight``;
    * any other element stays, but loses every attribute whose local name
      is in ``ignored_attributes`` or starts with ``rsid``.

    Second pass: the text of every remaining element is replaced by its
    whitespace-stripped form. Tail text (text that follows an element's
    closing tag) is left untouched.

    The tree is modified in place and the same object is returned.
    """
    run_props_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'

    for elem in tree.xpath(".//*"):
        tag_without_ns = elem.tag.split('}')[-1]
        if tag_without_ns in ignored_elements:
            elem.getparent().remove(elem)
        elif elem.tag == run_props_tag:
            # Iterate the element directly (getchildren() is deprecated) and
            # skip comment/PI children, whose ``tag`` is not a string.
            keeps_highlight = any(
                isinstance(child.tag, str) and child.tag.endswith('highlight')
                for child in elem
            )
            if not keeps_highlight:
                elem.getparent().remove(elem)
        else:
            # Remove ignored attributes
            for attr in list(elem.attrib):
                attr_without_ns = attr.split('}')[-1]
                if attr_without_ns in ignored_attributes or attr_without_ns.startswith('rsid'):
                    del elem.attrib[attr]

    # ".//text()" yields both text and tail strings. For a tail string,
    # getparent() is the element the tail follows, so writing it to
    # ``.text`` would overwrite that element's real text (and blank it out
    # entirely in pretty-printed XML). Only genuine text nodes are handled.
    for text_node in tree.xpath(".//text()"):
        if text_node.is_tail:
            continue
        owner = text_node.getparent()
        if owner is not None:
            # lxml returns decoded ``str`` values; stripping is all that is
            # needed, no encode/decode round trip.
            owner.text = text_node.strip()

    return tree
def extract_metadata(docx):
    """Extract metadata from the archive's ``docProps/core.xml`` part.

    Args:
        docx: An open ``zipfile.ZipFile`` for the document.

    Returns:
        A dict mapping each top-level element's local name (namespace
        stripped) to its text, skipping names listed in
        ``ignored_metadata_elements``. An empty dict is returned when the
        archive has no ``docProps/core.xml`` member.
    """
    metadata = {}

    try:
        core_part = docx.open('docProps/core.xml')
    except KeyError:
        # ZipFile.open raises KeyError for a missing member. Metadata is
        # auxiliary, so its absence should not abort the whole conversion.
        return metadata

    # Read the bytes and release the archive member before parsing.
    with core_part:
        xml_content = core_part.read()

    core_tree = etree.XML(xml_content)
    for child in core_tree:
        # Comments and processing instructions have a non-string ``tag``.
        if not isinstance(child.tag, str):
            continue
        tag = child.tag.split('}')[-1]  # Get tag without namespace
        if tag not in ignored_metadata_elements:
            metadata[tag] = child.text
    return metadata
def process_docx(file_path):
    """Convert the .docx file at ``file_path`` into a JSON string.

    The archive is opened with ``zipfile``. Metadata comes from
    ``extract_metadata``. The ``word/document.xml`` part is parsed with
    lxml, cleaned by ``remove_ignored_elements`` and converted by
    ``etree_to_dict``. The metadata is stored under the ``'metadata'`` key
    of the resulting dictionary, which is then serialised with
    ``ensure_ascii=False`` and a two-space indent.
    """
    with zipfile.ZipFile(file_path) as archive:
        properties = extract_metadata(archive)
        with archive.open('word/document.xml') as body_part:
            raw_xml = body_part.read()

    # NOTE(review): etree.XML uses lxml's default parser; if file_path can
    # come from an untrusted source, confirm entity handling is acceptable.
    body_root = remove_ignored_elements(etree.XML(raw_xml))

    as_dict = etree_to_dict(body_root)
    as_dict['metadata'] = properties  # Add metadata to the document dictionary
    return json.dumps(as_dict, ensure_ascii=False, indent=2)