Spaces:

C2MV
/

MCP_CLIENTE_LATEX_V3

Sleeping

MCP_CLIENTE_LATEX_V3 / extract_docx_text.py

🚀 Deploy LaTeX MCP Server v2.0 — 7 tools (Modules A-F)

0ea56ba verified about 2 months ago

1.99 kB

	import zipfile
	import xml.etree.ElementTree as ET
	import os
	import sys

	def extract_text_from_docx(docx_path):
	    """Extract the text of each non-empty paragraph of a DOCX file.

	    Reads ``word/document.xml`` from the archive at ``docx_path`` and returns
	    one string per ``w:p`` element that contains at least one non-empty
	    ``w:t`` text node. Tab elements (``w:tab``) are rendered as a tab
	    character and break elements (``w:br``, ``w:cr``) as a newline, so words
	    separated only by such elements are not fused together. Text that sits in
	    a paragraph nested inside another paragraph is reported under the nested
	    paragraph only, not repeated as part of the enclosing one.

	    Args:
	        docx_path: Path to the ``.docx`` file.

	    Returns:
	        A list of paragraph strings in document order, or an empty list when
	        the file cannot be opened or parsed (the error is printed).
	    """
	    try:
	        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
	            # Get document.xml; the tree is fully parsed before the archive closes.
	            with zip_ref.open('word/document.xml') as xml_file:
	                root = ET.parse(xml_file).getroot()

	        # Define namespaces
	        ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
	        # ElementTree stores qualified tags as "{namespace-uri}local-name".
	        w_prefix = '{' + ns['w'] + '}'

	        # Extract paragraphs
	        paragraphs = []
	        for p in root.findall('.//w:p', ns):
	            text = _paragraph_text(p, w_prefix)
	            if text is not None:
	                paragraphs.append(text)
	        return paragraphs
	    except Exception as e:
	        # Deliberately broad: this is a best-effort reader, so any failure
	        # (missing file, bad archive, missing part, malformed XML) is
	        # reported and turned into an empty result instead of aborting.
	        print(f"Error processing {docx_path}: {e}")
	        return []


	def _paragraph_text(paragraph, w_prefix):
	    """Return the text of one ``w:p`` element, or ``None`` if it has no text.

	    ``w_prefix`` is the ``{namespace-uri}`` prefix of WordprocessingML tags.
	    """
	    tag_p = w_prefix + 'p'
	    tag_ppr = w_prefix + 'pPr'
	    tag_t = w_prefix + 't'
	    whitespace = {
	        w_prefix + 'tab': '\t',
	        w_prefix + 'br': '\n',
	        w_prefix + 'cr': '\n',
	    }

	    parts = []
	    has_text = False
	    # Explicit stack, children pushed in reverse, gives a document-order walk.
	    pending = list(paragraph)[::-1]
	    while pending:
	        node = pending.pop()
	        tag = node.tag
	        # Skip nested paragraphs (they are visited on their own by the caller)
	        # and paragraph properties, whose w:tab children describe tab stops
	        # rather than tab characters in the text.
	        if tag == tag_p or tag == tag_ppr:
	            continue
	        if tag == tag_t:
	            if node.text:
	                parts.append(node.text)
	                has_text = True
	        elif tag in whitespace:
	            parts.append(whitespace[tag])
	        else:
	            pending.extend(list(node)[::-1])

	    # A paragraph made only of tabs/breaks counts as empty, matching the
	    # rule that only paragraphs with real text are returned.
	    return ''.join(parts) if has_text else None

	def main(base_path=None, files=None):
	    """Print every paragraph and summary counts for a set of DOCX files.

	    For each file name, prints a banner, each extracted paragraph numbered
	    from 1, the paragraph count, and a whitespace-split word count.

	    Args:
	        base_path: Directory containing the documents. Defaults to the
	            original hard-coded project folder.
	        files: Iterable of file names inside ``base_path``. Defaults to the
	            original hard-coded list of four documents.
	    """
	    # Defaults reproduce the previous behaviour so a bare main() is unchanged.
	    if base_path is None:
	        base_path = r"D:\OTROS\MCP_CLIENTE_LATEX_V3\temp_projects\Tatiana\INVESTIGACION PARA PTI"
	    if files is None:
	        files = [
	            "Realidad Problemática_ Impacto ComunicUNS 2025.docx",
	            "Antecedentes de Investigación_ Impacto ComunicUNS 2025.docx",
	            "METODOLOGIA - REVISION.docx",
	            "Metodología.docx",
	        ]

	    separator = '=' * 80  # loop-invariant banner line

	    for file in files:
	        file_path = os.path.join(base_path, file)
	        print(f"\n{separator}")
	        print(f"FILE: {file}")
	        print(f"{separator}")

	        paragraphs = extract_text_from_docx(file_path)
	        for i, para in enumerate(paragraphs, 1):
	            print(f"Para {i}: {para}")

	        # Calculate word count
	        word_count = sum(len(p.split()) for p in paragraphs)
	        print(f"\nTotal paragraphs: {len(paragraphs)}")
	        print(f"Total word count: {word_count}")

	# Run the extraction report only when this file is executed as a script.
	if __name__ == "__main__":
	main()