# Spaces: Sleeping
# File size: 1,993 Bytes
# 0ea56ba
import zipfile
import xml.etree.ElementTree as ET
import os
import sys
def extract_text_from_docx(docx_path):
    """Extract the text of every non-empty paragraph in a DOCX file.

    The archive member ``word/document.xml`` is parsed and each ``w:p``
    element is flattened into one string. Text comes from ``w:t`` elements;
    ``w:tab``, ``w:br`` and ``w:cr`` elements that are direct children of a
    run are rendered as ``"\\t"``, ``"\\n"`` and ``"\\n"`` so that words
    separated only by a tab or a line break are not glued together.

    A paragraph is included only if it contains at least one non-empty
    ``w:t`` element. Paragraphs nested inside another paragraph are visited
    both on their own and as part of the enclosing paragraph, because every
    ``w:p`` descendant of the root is processed.

    Args:
        docx_path: Anything ``zipfile.ZipFile`` can open for reading.

    Returns:
        A list of paragraph strings in document order. On any failure an
        error line is printed and an empty list is returned.
    """
    w_ns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    ns = {'w': w_ns}
    run_tag = f'{{{w_ns}}}r'
    text_tag = f'{{{w_ns}}}t'
    # Run-level elements that stand for whitespace instead of carrying text.
    whitespace_tags = {
        f'{{{w_ns}}}tab': '\t',
        f'{{{w_ns}}}br': '\n',
        f'{{{w_ns}}}cr': '\n',
    }

    def _collect(element, parts):
        """Append text found under *element* to *parts* in document order.

        Returns True when at least one non-empty ``w:t`` was seen.
        """
        found_text = False
        for child in element:
            if child.tag == text_tag:
                if child.text:
                    parts.append(child.text)
                    found_text = True
            elif element.tag == run_tag and child.tag in whitespace_tags:
                # Only direct children of a run count: ``w:tab`` also occurs
                # under ``w:pPr/w:tabs`` as a tab-stop definition, which is
                # formatting, not content.
                parts.append(whitespace_tags[child.tag])
            if _collect(child, parts):
                found_text = True
        return found_text

    # Deliberately broad: this is a best-effort reader, so any failure
    # (missing file, bad archive, missing member, malformed XML) is reported
    # and turned into an empty result instead of aborting the caller.
    try:
        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
            with zip_ref.open('word/document.xml') as xml_file:
                root = ET.parse(xml_file).getroot()

        paragraphs = []
        for p in root.findall('.//w:p', ns):
            text_parts = []
            if _collect(p, text_parts):
                paragraphs.append(''.join(text_parts))
        return paragraphs
    except Exception as e:
        print(f"Error processing {docx_path}: {e}")
        return []
def main(base_path=None, files=None):
    """Print the paragraphs and summary counts of a set of DOCX files.

    For each file a banner is printed, followed by every extracted
    paragraph (numbered from 1), the paragraph count and the total number
    of whitespace-separated words.

    Args:
        base_path: Directory that contains the documents. When omitted, the
            original hard-coded project directory is used.
        files: Iterable of file names relative to ``base_path``. When
            omitted, the original four documents are used.
    """
    if base_path is None:
        base_path = r"D:\OTROS\MCP_CLIENTE_LATEX_V3\temp_projects\Tatiana\INVESTIGACION PARA PTI"
    if files is None:
        files = [
            "Realidad Problemática_ Impacto ComunicUNS 2025.docx",
            "Antecedentes de Investigación_ Impacto ComunicUNS 2025.docx",
            "METODOLOGIA - REVISION.docx",
            "Metodología.docx",
        ]

    for file in files:
        file_path = os.path.join(base_path, file)
        print(f"\n{'='*80}")
        print(f"FILE: {file}")
        print(f"{'='*80}")

        # A file that cannot be read yields an empty list, so it is still
        # reported below with zero paragraphs and zero words.
        paragraphs = extract_text_from_docx(file_path)
        for i, para in enumerate(paragraphs, 1):
            print(f"Para {i}: {para}")

        # Words are counted per file as whitespace-separated tokens.
        word_count = sum(len(p.split()) for p in paragraphs)
        print(f"\nTotal paragraphs: {len(paragraphs)}")
        print(f"Total word count: {word_count}")
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|