Spaces:

ndmhung6
/

pums-tools

Sleeping

pums-tools / tools /DocReader /text_extractor.py

feat: add doc reader, global dark mode and fix marian translator tokenizer

0e1c5e3 13 days ago

1.75 kB

	import os
	import docx

	def read_txt_file(file_path: str) -> str:
	    """Read a text file, picking an encoding that keeps Vietnamese text intact.

	    The encoding is chosen in this order:

	    1. A byte-order mark, if the file starts with one (UTF-8 BOM ->
	       ``utf-8-sig``, UTF-16 BOM -> ``utf-16``). The BOM itself is not
	       part of the returned text.
	    2. Plain ``utf-8``.
	    3. ``cp1258`` (the Windows Vietnamese code page).
	    4. ``latin-1`` as a last resort; it maps every byte to a character,
	       so it cannot fail.

	    Args:
	        file_path: Path of the text file to read.

	    Returns:
	        The decoded content of the file, read in text mode (so line
	        endings are normalised the way ``open`` normally does).

	    Raises:
	        OSError: If the file cannot be opened or read.
	    """
	    import codecs

	    # Peek at the first bytes only; the real read happens in text mode below.
	    with open(file_path, 'rb') as f:
	        head = f.read(3)

	    if head.startswith(codecs.BOM_UTF8):
	        candidates = ['utf-8-sig']
	    elif head.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
	        candidates = ['utf-16']
	    else:
	        candidates = []

	    # BOM-less UTF-16 is deliberately not guessed: 'utf-16-le'/'utf-16-be'
	    # accept almost any even-length byte string and would turn legacy
	    # 8-bit text into garbage. cp1258 comes before latin-1 because latin-1
	    # never fails and would otherwise shadow it.
	    candidates += ['utf-8', 'cp1258']

	    for enc in candidates:
	        try:
	            with open(file_path, 'r', encoding=enc) as f:
	                return f.read()
	        except UnicodeError:
	            # UnicodeError (not just UnicodeDecodeError): some codecs, such
	            # as 'utf-16' on a stream without BOM, raise the base class.
	            continue

	    # Every byte value is valid latin-1, so this always succeeds.
	    with open(file_path, 'r', encoding='latin-1') as f:
	        return f.read()

	def read_docx_file(file_path: str) -> str:
	    """Extract the text of a DOCX file: paragraphs first, then table rows.

	    Each paragraph becomes one output line. Each table row with at least
	    one non-empty cell becomes one output line whose cell texts are joined
	    with ``" | "``.

	    Args:
	        file_path: Path of the .docx file to read.

	    Returns:
	        All extracted lines joined with newline characters.
	    """
	    doc = docx.Document(file_path)

	    # Paragraph text comes first; table rows are appended afterwards, so
	    # the output does not interleave tables with the surrounding text.
	    lines = [p.text for p in doc.paragraphs]

	    for table in doc.tables:
	        for row in table.rows:
	            row_text = []
	            for cell in row.cells:
	                cell_text = cell.text.strip()
	                # Skip blank cells and text already seen in this row.
	                # NOTE(review): presumably this guards against merged
	                # cells being reported once per spanned column - confirm.
	                if cell_text and cell_text not in row_text:
	                    row_text.append(cell_text)
	            if row_text:
	                # Plain pipe separator: "\|" is not a valid escape
	                # sequence and would emit a literal backslash.
	                lines.append(" | ".join(row_text))

	    return "\n".join(lines)

	def extract_text_from_file(file_path: str) -> str:
	    """Return the text content of a supported document file.

	    The reader is selected from the lower-cased file extension: ``.txt``
	    is handled by ``read_txt_file`` and ``.docx`` by ``read_docx_file``.

	    Args:
	        file_path: Path of the file to extract text from.

	    Returns:
	        The text produced by the reader matching the extension.

	    Raises:
	        FileNotFoundError: If ``file_path`` is empty or does not exist.
	        ValueError: If the extension is neither .txt nor .docx.
	    """
	    # Guard clause: an empty path short-circuits before touching the disk.
	    if not (file_path and os.path.exists(file_path)):
	        raise FileNotFoundError(f"Không tìm thấy file: {file_path}")

	    _, suffix = os.path.splitext(file_path)
	    ext = suffix.lower()

	    if ext == ".docx":
	        return read_docx_file(file_path)
	    if ext == ".txt":
	        return read_txt_file(file_path)

	    raise ValueError(f"Định dạng file {ext} không được hỗ trợ. Chỉ hỗ trợ .txt và .docx.")