Spaces:
Sleeping
Sleeping
| import os | |
| import docx | |
def read_txt_file(file_path: str) -> str:
    """Read a text file, trying several encodings so Vietnamese text decodes.

    Encodings are attempted in this order:

    1. ``utf-8-sig`` - plain UTF-8, with a leading BOM stripped if present.
    2. ``utf-16``    - only succeeds when the file starts with a UTF-16 BOM.
    3. ``cp1258``    - the Windows Vietnamese code page.
    4. ``latin-1``   - last resort; maps every byte, so it cannot fail.

    BOM-less UTF-16 is deliberately not guessed: ``utf-16-le``/``utf-16-be``
    accept almost any even-length byte string and would turn legacy 8-bit
    files into mojibake before ``cp1258`` gets a chance.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file content (newlines normalised by text-mode reading).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    strict_encodings = ('utf-8-sig', 'utf-16', 'cp1258')
    for enc in strict_encodings:
        try:
            with open(file_path, 'r', encoding=enc) as f:
                return f.read()
        except UnicodeError:
            # Catch UnicodeError, not just UnicodeDecodeError: the utf-16
            # stream decoder raises a plain UnicodeError when the BOM is
            # missing, which would otherwise abort the whole fallback chain.
            continue

    # latin-1 assigns a character to every byte value, so this always works
    # and loses no data (unlike decoding with errors='ignore').
    with open(file_path, 'r', encoding='latin-1') as f:
        return f.read()
def read_docx_file(file_path: str) -> str:
    """Read paragraphs and tables from a DOCX file.

    All document paragraphs come first, followed by one line per non-empty
    table row. Within a row, the stripped texts of the non-empty cells are
    joined with ``" | "``.

    Args:
        file_path: Path of the ``.docx`` file to read.

    Returns:
        The extracted text, with entries separated by newlines.
    """
    doc = docx.Document(file_path)

    # Paragraph text first; table rows are appended afterwards.
    lines = [p.text for p in doc.paragraphs]

    for table in doc.tables:
        for row in table.rows:
            row_text = []
            previous_tc = None
            for cell in row.cells:
                # A horizontally merged cell is reported once per grid column
                # it spans. Skip the repeats by comparing the underlying cell
                # element, not the text, so that two distinct cells holding
                # the same value (e.g. "0 | 0") are both kept.
                tc = getattr(cell, "_tc", cell)
                if tc is previous_tc:
                    continue
                previous_tc = tc

                cell_text = cell.text.strip()
                if cell_text:
                    row_text.append(cell_text)
            if row_text:
                lines.append(" | ".join(row_text))

    return "\n".join(lines)
def extract_text_from_file(file_path: str) -> str:
    """Extract the text content of a file, choosing a reader by extension.

    Args:
        file_path: Path of the file to read; ``.txt`` and ``.docx`` are
            supported (the extension is matched case-insensitively).

    Returns:
        The text extracted by the matching reader.

    Raises:
        FileNotFoundError: If ``file_path`` is empty or does not exist.
        ValueError: If the file extension is not supported.
    """
    path_exists = bool(file_path) and os.path.exists(file_path)
    if not path_exists:
        raise FileNotFoundError(f"Không tìm thấy file: {file_path}")

    _, suffix = os.path.splitext(file_path)
    ext = suffix.lower()

    if ext == ".docx":
        return read_docx_file(file_path)
    if ext == ".txt":
        return read_txt_file(file_path)

    raise ValueError(f"Định dạng file {ext} không được hỗ trợ. Chỉ hỗ trợ .txt và .docx.")