"""Print the paragraphs and word counts of DOCX files to standard output."""

import os
import sys
import xml.etree.ElementTree as ET
import zipfile

# WordprocessingML namespace used by word/document.xml.
_W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
_NAMESPACES = {"w": _W_NS}
_RUN_TAG = f"{{{_W_NS}}}r"
_TEXT_TAG = f"{{{_W_NS}}}t"

# Run-level elements that stand for whitespace rather than carrying text.
# Without them, words separated only by a tab or a break are fused together
# and the word count comes out too low.
_RUN_WHITESPACE = {
    f"{{{_W_NS}}}tab": "\t",
    f"{{{_W_NS}}}br": "\n",
    f"{{{_W_NS}}}cr": "\n",
}

# Files processed when no paths are given on the command line.
_DEFAULT_BASE_PATH = r"D:\OTROS\MCP_CLIENTE_LATEX_V3\temp_projects\Tatiana\INVESTIGACION PARA PTI"
_DEFAULT_FILES = [
    "Realidad Problemática_ Impacto ComunicUNS 2025.docx",
    "Antecedentes de Investigación_ Impacto ComunicUNS 2025.docx",
    "METODOLOGIA - REVISION.docx",
    "Metodología.docx",
]


def _collect_text(element, parts):
    """Append the text found under *element* to *parts*, in document order.

    Every non-empty ``w:t`` descendant contributes its text. ``w:tab``,
    ``w:br`` and ``w:cr`` contribute whitespace only when they are direct
    children of a run (``w:r``); the same ``w:tab`` tag is also used for
    tab-stop definitions in paragraph properties, which are not content.

    Returns True if at least one non-empty ``w:t`` was found.
    """
    is_run = element.tag == _RUN_TAG
    found_text = False
    for child in element:
        if child.tag == _TEXT_TAG:
            if child.text:
                parts.append(child.text)
                found_text = True
        elif is_run and child.tag in _RUN_WHITESPACE:
            parts.append(_RUN_WHITESPACE[child.tag])
        elif _collect_text(child, parts):
            found_text = True
    return found_text


def extract_text_from_docx(docx_path):
    """Extract the text of each non-empty paragraph from a DOCX file.

    Args:
        docx_path: Path to the ``.docx`` file (anything ``zipfile.ZipFile``
            accepts).

    Returns:
        A list with one string per paragraph that contains at least one
        non-empty text node, in document order. Tabs and breaks inside runs
        appear as ``"\\t"`` and ``"\\n"``. A paragraph nested inside another
        paragraph is listed on its own and its text is also part of the
        enclosing paragraph's string. On any failure (missing file, invalid
        archive, missing ``word/document.xml``, malformed XML, ...) an error
        line is printed and an empty list is returned.
    """
    try:
        with zipfile.ZipFile(docx_path, 'r') as archive:
            with archive.open('word/document.xml') as xml_file:
                root = ET.parse(xml_file).getroot()

        paragraphs = []
        for paragraph in root.findall('.//w:p', _NAMESPACES):
            parts = []
            # Paragraphs without any actual text are skipped, even if they
            # contain tabs or breaks.
            if _collect_text(paragraph, parts):
                paragraphs.append(''.join(parts))
        return paragraphs
    except Exception as exc:
        # Deliberately broad: this is a per-file best-effort boundary, so one
        # unreadable document must not abort the processing of the others.
        print(f"Error processing {docx_path}: {exc}")
        return []


def main(argv=None):
    """Print every paragraph plus paragraph and word totals for each file.

    Args:
        argv: Optional list of DOCX paths to process. When None, the
            command-line arguments (``sys.argv[1:]``) are used. If no paths
            are supplied, the built-in default file list is processed.
    """
    if argv is None:
        argv = sys.argv[1:]

    if argv:
        # Each entry is (label shown in the header, path to open).
        targets = [(path, path) for path in argv]
    else:
        targets = [
            (name, os.path.join(_DEFAULT_BASE_PATH, name))
            for name in _DEFAULT_FILES
        ]

    for label, file_path in targets:
        print(f"\n{'='*80}")
        print(f"FILE: {label}")
        print(f"{'='*80}")

        paragraphs = extract_text_from_docx(file_path)
        for i, para in enumerate(paragraphs, 1):
            print(f"Para {i}: {para}")

        # Words are whitespace-separated tokens across all paragraphs.
        word_count = sum(len(p.split()) for p in paragraphs)
        print(f"\nTotal paragraphs: {len(paragraphs)}")
        print(f"Total word count: {word_count}")


if __name__ == "__main__":
    main()