import zipfile
import xml.etree.ElementTree as ET
import os
import sys

# WordprocessingML namespace in ElementTree's "{uri}local" notation.
_W = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
_W_P = _W + 'p'
_W_R = _W + 'r'
_W_T = _W + 't'
# Run children that stand for whitespace instead of carrying text.
_W_RUN_WHITESPACE = {_W + 'tab': '\t', _W + 'br': '\n', _W + 'cr': '\n'}


def _collect_paragraph_text(element, parts):
    """Append the text found under *element* to *parts*, in document order.

    Nested ``w:p`` elements are not descended into, so each paragraph only
    contributes its own text. Returns True when at least one non-empty
    ``w:t`` was found (whitespace produced by tabs/breaks does not count).
    """
    found_text = False
    # w:tab is also used for tab-stop definitions under w:pPr/w:tabs, so the
    # whitespace mapping is only applied to direct children of a run.
    inside_run = element.tag == _W_R
    for child in element:
        tag = child.tag
        if tag == _W_P:
            # Nested paragraph (e.g. text-box content): the caller visits it
            # on its own, so skipping it here avoids emitting its text twice.
            continue
        if tag == _W_T:
            if child.text:
                parts.append(child.text)
                found_text = True
        elif inside_run and tag in _W_RUN_WHITESPACE:
            parts.append(_W_RUN_WHITESPACE[tag])
        elif _collect_paragraph_text(child, parts):
            found_text = True
    return found_text


def extract_text_from_docx(docx_path):
    """Extract the paragraph texts from a DOCX file.

    Reads ``word/document.xml`` from the archive and returns one string per
    ``w:p`` element that contains at least one non-empty ``w:t``. Tabs and
    line breaks inside runs are kept as ``'\\t'`` and ``'\\n'`` so that words
    separated only by them are not fused together.

    Args:
        docx_path: Path (or binary file-like object) of the DOCX archive.

    Returns:
        List of paragraph strings in document order; an empty list if the
        file cannot be read or parsed (the error is printed, not raised).
    """
    try:
        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
            with zip_ref.open('word/document.xml') as xml_file:
                root = ET.parse(xml_file).getroot()

        paragraphs = []
        for paragraph in root.findall('.//' + _W_P):
            parts = []
            # Paragraphs without any w:t text (empty lines, bare page
            # breaks) are skipped.
            if _collect_paragraph_text(paragraph, parts):
                paragraphs.append(''.join(parts))
        return paragraphs
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller carry
        # on with the next file instead of aborting the whole run.
        print(f"Error processing {docx_path}: {e}")
        return []

def main(paths=None):
    """Print each document's paragraphs followed by paragraph and word totals.

    Args:
        paths: Optional iterable of DOCX paths to report on. When None, the
            paths given on the command line (``sys.argv[1:]``) are used; if
            there are none, the built-in default document list is used.
    """
    if paths is None:
        paths = sys.argv[1:]
    paths = list(paths)

    if paths:
        # Explicit paths are shown exactly as the caller supplied them.
        targets = [(str(path), path) for path in paths]
    else:
        base_path = r"D:\OTROS\MCP_CLIENTE_LATEX_V3\temp_projects\Tatiana\INVESTIGACION PARA PTI"
        files = [
            "Realidad Problemática_ Impacto ComunicUNS 2025.docx",
            "Antecedentes de Investigación_ Impacto ComunicUNS 2025.docx",
            "METODOLOGIA - REVISION.docx",
            "Metodología.docx",
        ]
        # Default documents are labelled by file name only.
        targets = [(name, os.path.join(base_path, name)) for name in files]

    separator = '=' * 80
    for label, file_path in targets:
        print(f"\n{separator}")
        print(f"FILE: {label}")
        print(separator)

        paragraphs = extract_text_from_docx(file_path)
        for i, para in enumerate(paragraphs, 1):
            print(f"Para {i}: {para}")

        # Words are whitespace-separated tokens, summed over all paragraphs.
        word_count = sum(len(p.split()) for p in paragraphs)
        print(f"\nTotal paragraphs: {len(paragraphs)}")
        print(f"Total word count: {word_count}")

# Run the report only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()