Spaces:
Sleeping
Sleeping
| import zipfile | |
| import xml.etree.ElementTree as ET | |
| import os | |
| import sys | |
def extract_text_from_docx(docx_path):
    """Extract the text of every non-empty paragraph from a DOCX file.

    The DOCX archive is opened as a zip file and ``word/document.xml`` is
    parsed. For each ``w:p`` element the text of its ``w:t`` descendants is
    concatenated in document order. Tabs (``w:tab``) and line breaks
    (``w:br``, ``w:cr``) are emitted as ``"\\t"`` and ``"\\n"`` so that words
    separated only by those elements are not fused together.

    Args:
        docx_path: Path (or binary file-like object) of the DOCX archive.

    Returns:
        A list with one string per paragraph that contains at least one
        non-empty ``w:t`` node. An empty list is returned, after printing an
        error message, if the archive cannot be read or parsed.

    Note:
        ``.//w:p`` matches paragraphs at any depth, so a paragraph nested
        inside another paragraph is reported both as part of the outer
        paragraph and on its own.
    """
    w_ns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    ns = {'w': w_ns}
    text_tag = f'{{{w_ns}}}t'
    # Run-level elements that represent whitespace rather than literal text.
    whitespace_tags = {
        f'{{{w_ns}}}tab': '\t',
        f'{{{w_ns}}}br': '\n',
        f'{{{w_ns}}}cr': '\n',
    }

    try:
        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
            with zip_ref.open('word/document.xml') as xml_file:
                root = ET.parse(xml_file).getroot()

        paragraphs = []
        for p in root.findall('.//w:p', ns):
            text_parts = []
            has_text = False
            # iter() walks descendants in document order, which keeps text
            # and whitespace elements correctly interleaved.
            for node in p.iter():
                if node.tag == text_tag:
                    if node.text:
                        text_parts.append(node.text)
                        has_text = True
                elif node.tag in whitespace_tags:
                    text_parts.append(whitespace_tags[node.tag])
            # Paragraphs holding only tabs/breaks are still skipped, exactly
            # as paragraphs without any text were skipped before.
            if has_text:
                paragraphs.append(''.join(text_parts))
        return paragraphs
    except Exception as e:
        # Deliberately best-effort: a missing, corrupt or malformed file is
        # reported and yields no paragraphs instead of aborting the caller.
        print(f"Error processing {docx_path}: {e}")
        return []
def main(argv=None):
    """Print the paragraphs and word statistics of one or more DOCX files.

    Args:
        argv: Optional list of DOCX paths to process. When ``None``, the
            command-line arguments (``sys.argv[1:]``) are used. If no paths
            are supplied either way, the built-in default list of documents
            under ``base_path`` is processed.

    For every file a banner, each extracted paragraph (numbered from 1), the
    paragraph count and the total whitespace-separated word count are printed.
    """
    base_path = r"D:\OTROS\MCP_CLIENTE_LATEX_V3\temp_projects\Tatiana\INVESTIGACION PARA PTI"
    default_files = [
        "Realidad Problemática_ Impacto ComunicUNS 2025.docx",
        "Antecedentes de Investigación_ Impacto ComunicUNS 2025.docx",
        "METODOLOGIA - REVISION.docx",
        "Metodología.docx",
    ]

    args = sys.argv[1:] if argv is None else list(argv)
    if args:
        # Explicit paths are used verbatim and shown as given.
        targets = [(path, path) for path in args]
    else:
        # No paths supplied: fall back to the original hard-coded documents.
        targets = [(name, os.path.join(base_path, name)) for name in default_files]

    banner = '=' * 80
    for label, file_path in targets:
        print(f"\n{banner}")
        print(f"FILE: {label}")
        print(banner)

        paragraphs = extract_text_from_docx(file_path)
        for i, para in enumerate(paragraphs, 1):
            print(f"Para {i}: {para}")

        # Word count is the number of whitespace-separated tokens per file.
        word_count = sum(len(p.split()) for p in paragraphs)
        print(f"\nTotal paragraphs: {len(paragraphs)}")
        print(f"Total word count: {word_count}")
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()