| | from collections import defaultdict |
| | import json |
| | import zipfile |
| | from lxml import etree |
| |
|
| | |
# Font names treated as unremarkable defaults: etree_to_dict drops a font
# attribute (ascii / hAnsi / cs / eastAsia) when its value is one of these.
common_fonts = {
    'Times New Roman',
    'Arial',
    'Calibri',
}
| |
|
| | |
# Namespace-stripped element names that are discarded: remove_ignored_elements
# deletes matching nodes from the tree and etree_to_dict returns None for them.
ignored_elements = {
    'proofErr',
    'bookmarkStart',
    'bookmarkEnd',
    'lastRenderedPageBreak',
    'webHidden',
    'numPr',
    'pBdr',
    'ind',
    'spacing',
    'jc',
    'tabs',
    'sectPr',
    'pgMar',
}
| |
|
| | |
# Namespace-stripped attribute names that are dropped from every element.
# The code that consults this set also drops any attribute whose local name
# starts with 'rsid', so the rsid* entries here are explicit rather than
# strictly required. Each name is listed once.
ignored_attributes = {
    'rsidR',
    'rsidRPr',
    'rsidRDefault',
    'rsidP',
    'rsidDel',
    'rsidTr',
    'paraId',
    'textId',
}
| |
|
| | |
# Namespace-stripped element names that extract_metadata leaves out of the
# metadata dict it builds.
# NOTE(review): extract_metadata only reads docProps/core.xml and compares
# names case-sensitively. These look like extended-property (docProps/app.xml)
# names, which are presumably written with a leading capital -- confirm that
# this filter can actually match anything.
ignored_metadata_elements = {
    'application',
    'docSecurity',
    'scaleCrop',
    'linksUpToDate',
    'charactersWithSpaces',
    'hiddenSlides',
    'mmClips',
    'notes',
    'words',
    'characters',
    'pages',
    'lines',
    'paragraphs',
    'company',
    'template',
}
| |
|
def remove_ignored_elements(tree):
    """Remove all ignored elements from the XML tree, except highlights.

    NOTE: a second function with this same name is defined further down in
    this file; once the module has been imported, that later definition is
    the one bound to the name and this one is shadowed.

    Args:
        tree: lxml element whose descendants are pruned in place.

    Returns:
        The same ``tree`` object that was passed in.
    """
    run_props_tag = (
        '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'
    )

    # xpath() hands back a plain list, so removing nodes inside the loop does
    # not disturb the iteration. ".//*" never yields the root itself, so every
    # element visited has a parent.
    for elem in tree.xpath(".//*"):
        local_name = elem.tag.split('}')[-1]

        if local_name in ignored_elements:
            elem.getparent().remove(elem)
        elif elem.tag == run_props_tag:
            # Run properties survive only when they carry a highlight child.
            has_highlight = any(
                isinstance(child.tag, str) and child.tag.endswith('highlight')
                for child in elem
            )
            if not has_highlight:
                elem.getparent().remove(elem)
        else:
            # Element is kept: strip revision-tracking / id attributes.
            for attr in list(elem.attrib):
                attr_local = attr.split('}')[-1]
                if attr_local in ignored_attributes or attr_local.startswith('rsid'):
                    del elem.attrib[attr]

    return tree
| |
|
def etree_to_dict(t):
    """Convert an lxml element into a nested dict, skipping ignored content.

    Tag and attribute names are used without their namespace prefix.

    Args:
        t: The element to convert.

    Returns:
        ``None`` when the node is not an element (comment, processing
        instruction), when its tag is in ``ignored_elements``, or when it has
        no attributes, no children and no text. Otherwise a one-key dict
        ``{tag: value}`` where ``value`` is:

        * the stripped text, for a leaf element without attributes;
        * a dict otherwise, holding converted children (a child tag that
          occurs more than once maps to a list), the attributes that pass
          filtering, and the stripped text under ``'#text'`` when non-empty.
    """
    # Comments and processing instructions expose a callable as .tag; they
    # carry nothing to convert and would break the string handling below.
    if not isinstance(t.tag, str):
        return None

    tag = t.tag.split('}')[-1]
    if tag in ignored_elements:
        return None

    d = {tag: {} if t.attrib else None}

    children = list(t)
    if children:
        # Group converted children by tag; filter(None, ...) drops children
        # that converted to None.
        dd = defaultdict(list)
        for dc in filter(None, map(etree_to_dict, children)):
            for k, v in dc.items():
                dd[k].append(v)
        d = {tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}}

    if t.attrib:
        filtered_attribs = {}
        for k, v in t.attrib.items():
            k = k.split('}')[-1]
            if k in ('ascii', 'hAnsi', 'cs', 'eastAsia'):
                # Font attributes: only keep fonts outside the common set.
                if v not in common_fonts:
                    filtered_attribs[k] = v
            elif k not in ignored_attributes and not k.startswith('rsid'):
                filtered_attribs[k] = v
        d[tag].update(filtered_attribs)

    if t.text:
        text = t.text.strip()
        if children or t.attrib:
            # d[tag] is a dict here; only record text that is not blank.
            if text:
                d[tag]['#text'] = text
        else:
            d[tag] = text

    if not t.attrib and not children and not t.text:
        return None

    return d
| |
|
| | |
def remove_ignored_elements(tree):
    """Remove all ignored elements from the XML tree, except highlights.

    This is the second definition of this name in the file and therefore the
    one in effect after import. In addition to pruning elements and
    attributes it strips surrounding whitespace from every text node.

    Args:
        tree: lxml element whose descendants are pruned in place.

    Returns:
        The same ``tree`` object that was passed in.
    """
    run_props_tag = (
        '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'
    )

    # xpath() hands back a plain list, so removing nodes inside the loop does
    # not disturb the iteration. ".//*" never yields the root itself, so every
    # element visited has a parent.
    for elem in tree.xpath(".//*"):
        local_name = elem.tag.split('}')[-1]

        if local_name in ignored_elements:
            elem.getparent().remove(elem)
        elif elem.tag == run_props_tag:
            # Run properties survive only when they carry a highlight child.
            has_highlight = any(
                isinstance(child.tag, str) and child.tag.endswith('highlight')
                for child in elem
            )
            if not has_highlight:
                elem.getparent().remove(elem)
        else:
            # Element is kept: strip revision-tracking / id attributes.
            for attr in list(elem.attrib):
                attr_local = attr.split('}')[-1]
                if attr_local in ignored_attributes or attr_local.startswith('rsid'):
                    del elem.attrib[attr]

    # ".//text()" yields both element text and tail text. For a tail string,
    # getparent() is the element the tail follows, so the stripped value must
    # go back to .tail -- writing it to .text would overwrite that element's
    # real content with the whitespace that follows it.
    for node in tree.xpath(".//text()"):
        owner = node.getparent()
        if owner is None:
            continue
        stripped = node.strip()
        if node.is_tail:
            owner.tail = stripped
        else:
            owner.text = stripped

    return tree
| |
|
def extract_metadata(docx):
    """Extract metadata from the document properties, ignoring specified elements.

    Args:
        docx: An open ``zipfile.ZipFile`` for the .docx package.

    Returns:
        Dict mapping each namespace-stripped child tag of the root of
        ``docProps/core.xml`` to that child's text (``None`` when the child
        has no text). Tags listed in ``ignored_metadata_elements`` are left
        out. An empty dict is returned when the archive has no
        ``docProps/core.xml`` member.
    """
    try:
        with docx.open('docProps/core.xml') as core_xml:
            xml_content = core_xml.read()
    except KeyError:
        # ZipFile.open raises KeyError for a missing member; a package without
        # core properties simply has no metadata to report.
        return {}

    core_tree = etree.XML(xml_content)

    metadata = {}
    for child in core_tree:
        # Comments / processing instructions have a non-string tag; skip them.
        if not isinstance(child.tag, str):
            continue
        tag = child.tag.split('}')[-1]
        if tag not in ignored_metadata_elements:
            metadata[tag] = child.text
    return metadata
| |
|
def process_docx(file_path):
    """Convert a .docx file into a JSON string of its cleaned-up content.

    Reads ``word/document.xml`` from the archive, prunes it with
    ``remove_ignored_elements``, converts it with ``etree_to_dict`` and adds
    the result of ``extract_metadata`` under the top-level ``'metadata'`` key.

    Args:
        file_path: Path (or file-like object) accepted by ``zipfile.ZipFile``.

    Returns:
        The document as a JSON string (2-space indent, non-ASCII characters
        kept as-is).
    """
    with zipfile.ZipFile(file_path) as docx:
        metadata = extract_metadata(docx)
        with docx.open('word/document.xml') as document_xml:
            xml_content = document_xml.read()

    document_tree = etree.XML(xml_content)

    # Drop ignored elements/attributes before conversion.
    document_tree = remove_ignored_elements(document_tree)

    document_dict = etree_to_dict(document_tree)
    if document_dict is None:
        # etree_to_dict returns None for an ignored or completely empty root;
        # fall back to an empty dict so the metadata can still be attached.
        document_dict = {}
    document_dict['metadata'] = metadata

    return json.dumps(document_dict, ensure_ascii=False, indent=2)
| |
|