pums-tools / tools /DocReader /text_extractor.py
ndmhung6's picture
feat: add doc reader, global dark mode and fix marian translator tokenizer
0e1c5e3
Raw
History Blame Contribute Delete
1.75 kB
import os
import docx
def read_txt_file(file_path: str) -> str:
    """Read a text file, detecting its encoding with Vietnamese-friendly fallbacks.

    The file is read once as bytes and decoded with the first candidate
    that succeeds:

    1. ``utf-8-sig`` -- UTF-8 with or without a BOM (a BOM is stripped).
    2. UTF-16 -- only when the data starts with a UTF-16 BOM, or when it
       contains NUL bytes (their position selects the byte order).
    3. ``cp1258`` -- the Windows Vietnamese code page.
    4. ``latin-1`` -- maps every byte, so decoding cannot fail.

    Line endings are normalised to a single line feed, matching what
    text-mode ``open`` does by default.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file content.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, "rb") as f:
        raw = f.read()

    # utf-8-sig goes first: it decodes plain UTF-8 too, and unlike "utf-8"
    # it does not leave a leading U+FEFF in the result.
    candidates = ["utf-8-sig"]

    # BOM-less UTF-16 decoding accepts almost any even-length byte string,
    # so it is only attempted when there is positive evidence for it.
    if raw.startswith((b"\xff\xfe", b"\xfe\xff")):
        candidates.append("utf-16")  # the codec consumes the BOM itself
    elif b"\x00" in raw:
        # Single-byte text encodings do not contain NUL bytes. In UTF-16,
        # ASCII characters put the NUL in the high byte: odd offsets for
        # little-endian, even offsets for big-endian.
        odd_nuls = raw[1::2].count(b"\x00")
        even_nuls = raw[0::2].count(b"\x00")
        candidates.append("utf-16-le" if odd_nuls >= even_nuls else "utf-16-be")

    # cp1258 must precede latin-1: latin-1 never raises, so anything listed
    # after it would be unreachable.
    candidates.append("cp1258")

    text = None
    for enc in candidates:
        try:
            text = raw.decode(enc)
        except UnicodeDecodeError:
            continue
        break

    if text is None:
        # Last resort: latin-1 maps all 256 byte values.
        text = raw.decode("latin-1")

    # Reproduce the universal-newlines translation of text-mode open().
    return text.replace("\r\n", "\n").replace("\r", "\n")
def read_docx_file(file_path: str) -> str:
    """Read paragraphs and tables from a DOCX file.

    Body paragraphs come first, one per line. Every table row that has at
    least one non-empty cell follows as a single line, with the stripped
    cell texts joined by " | ". Tables are therefore emitted after all
    paragraphs, not at their position in the document.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The extracted text, lines joined with a line feed.
    """
    doc = docx.Document(file_path)

    # Extract paragraphs
    lines = [p.text for p in doc.paragraphs]

    # Extract tables
    for table in doc.tables:
        for row in table.rows:
            row_text = []
            seen_elements = []
            for cell in row.cells:
                # python-docx reports a merged cell once per grid column it
                # spans, each time backed by the same underlying element.
                # De-duplicate on that element rather than on the text, so
                # distinct cells that share a value (e.g. "Yes", "Yes") are
                # all kept. `_tc` is the cell's backing element; fall back
                # to the cell object itself if it is ever absent.
                element = getattr(cell, "_tc", cell)
                if any(element is prior for prior in seen_elements):
                    continue
                seen_elements.append(element)

                cell_text = cell.text.strip()
                if cell_text:
                    row_text.append(cell_text)
            if row_text:
                lines.append(" | ".join(row_text))

    return "\n".join(lines)
def extract_text_from_file(file_path: str) -> str:
    """Extract the text of a file, choosing the reader from its extension.

    The extension is compared case-insensitively. ``.txt`` files are handed
    to ``read_txt_file`` and ``.docx`` files to ``read_docx_file``.

    Args:
        file_path: Path of the file to extract text from.

    Returns:
        The text returned by the matching reader.

    Raises:
        FileNotFoundError: If ``file_path`` is empty or does not exist.
        ValueError: If the extension is neither ``.txt`` nor ``.docx``.
    """
    # Guard: an empty/None path short-circuits before touching the filesystem.
    if not (file_path and os.path.exists(file_path)):
        raise FileNotFoundError(f"Không tìm thấy file: {file_path}")

    _, raw_ext = os.path.splitext(file_path)
    ext = raw_ext.lower()

    # Reject unsupported formats up front so the happy path stays flat.
    if ext not in (".txt", ".docx"):
        raise ValueError(f"Định dạng file {ext} không được hỗ trợ. Chỉ hỗ trợ .txt và .docx.")

    reader = read_txt_file if ext == ".txt" else read_docx_file
    return reader(file_path)