# MCP_CLIENTE_LATEX_V3 / extract_docx_text.py
# C2MV's picture
# 🚀 Deploy LaTeX MCP Server v2.0 — 7 tools (Modules A-F)
# 0ea56ba verified
import zipfile
import xml.etree.ElementTree as ET
import os
import sys
def extract_text_from_docx(docx_path):
    """Extract the text of every non-empty paragraph in a DOCX file.

    Reads ``word/document.xml`` from the archive and, for each ``w:p``
    element, concatenates the text of its ``w:t`` descendants in document
    order. Tabs (``w:tab``) and breaks (``w:br`` / ``w:cr``) that sit
    directly inside a run are emitted as a tab or newline character so that
    words separated only by them are not glued together.

    Args:
        docx_path: Path to the ``.docx`` file to read.

    Returns:
        A list with one string per paragraph. Paragraphs that contain no
        ``w:t`` text are skipped. If the file cannot be opened or parsed, an
        error line is printed and an empty list is returned.
    """
    w_uri = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    ns = {"w": w_uri}
    qualified = "{" + w_uri + "}"
    run_tag = qualified + "r"
    text_tag = qualified + "t"
    # Run-level elements that stand for whitespace in the rendered text.
    separators = {
        qualified + "tab": "\t",
        qualified + "br": "\n",
        qualified + "cr": "\n",
    }

    # Best-effort read: any failure to open or parse the archive is reported
    # and turned into an empty result instead of propagating to the caller.
    try:
        with zipfile.ZipFile(docx_path, "r") as zip_ref:
            with zip_ref.open("word/document.xml") as xml_file:
                root = ET.parse(xml_file).getroot()
    except Exception as e:
        print(f"Error processing {docx_path}: {e}")
        return []

    paragraphs = []
    for p in root.findall(".//w:p", ns):
        # w:tab is also the tag of tab-stop definitions under w:pPr/w:tabs,
        # so a separator only counts when it is a direct child of a run.
        run_children = {child for run in p.iter(run_tag) for child in run}

        text_parts = []
        has_text = False
        for node in p.iter():
            if node.tag == text_tag:
                if node.text:
                    text_parts.append(node.text)
                    has_text = True
            elif node.tag in separators and node in run_children:
                text_parts.append(separators[node.tag])

        # Keep the original rule: a paragraph is reported only when it holds
        # at least one non-empty w:t node (separator-only paragraphs are not).
        if has_text:
            paragraphs.append("".join(text_parts))
    return paragraphs
def main(base_path=None, files=None):
    """Print the paragraphs and summary counts of a set of DOCX files.

    For each file a banner is printed, followed by every extracted paragraph
    (numbered from 1), the number of paragraphs, and the total number of
    whitespace-separated words.

    Args:
        base_path: Directory containing the documents. When ``None``, the
            project directory this script was written for is used.
        files: Iterable of file names inside ``base_path``. When ``None``,
            the script's built-in list of documents is used. An empty
            iterable is honoured and produces no output.
    """
    if base_path is None:
        base_path = r"D:\OTROS\MCP_CLIENTE_LATEX_V3\temp_projects\Tatiana\INVESTIGACION PARA PTI"
    if files is None:
        files = [
            "Realidad Problemática_ Impacto ComunicUNS 2025.docx",
            "Antecedentes de Investigación_ Impacto ComunicUNS 2025.docx",
            "METODOLOGIA - REVISION.docx",
            "Metodología.docx",
        ]

    for file in files:
        file_path = os.path.join(base_path, file)
        print(f"\n{'='*80}")
        print(f"FILE: {file}")
        print(f"{'='*80}")

        paragraphs = extract_text_from_docx(file_path)
        for i, para in enumerate(paragraphs, 1):
            print(f"Para {i}: {para}")

        # Words are counted per paragraph by splitting on whitespace.
        word_count = sum(len(p.split()) for p in paragraphs)
        print(f"\nTotal paragraphs: {len(paragraphs)}")
        print(f"Total word count: {word_count}")
# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()