Spaces:
Sleeping
Sleeping
File size: 2,140 Bytes
01d5a5d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
from pathlib import Path
# NOTE: BaseFileProcessor is imported by absolute package path; the sibling imports below are relative
from lpm_kernel.file_data.processors.processor import BaseFileProcessor
from ...core.file_type import FileType
from ...core.decorators import processor_register
from ...core.exceptions import FileProcessingError
from ...document import Document, ProcessStatus
print("Loading Text processor...") # Debug trace: written to stdout when this module is imported
@processor_register
class TEXTProcessor(BaseFileProcessor):
    """Extract the raw text of ``FileType.TEXT`` files by probing encodings.

    The file is opened in text mode with each entry of
    ``SUPPORTED_ENCODINGS`` in turn; the first encoding that decodes the
    whole file without error wins and its text is stored on the document.
    """

    SUPPORTED_TYPES = {FileType.TEXT}

    # Candidate encodings, tried in order until one decodes the whole file.
    SUPPORTED_ENCODINGS = [
        # 'utf-8-sig' goes first: it reads BOM-less UTF-8 exactly like
        # 'utf-8' but also strips a leading BOM, whereas plain 'utf-8'
        # would accept the BOM and leave '\ufeff' at the start of the text.
        'utf-8-sig',  # UTF-8 with or without BOM
        'utf-8',  # Unicode encoding, most common
        'utf-16',  # Unicode 16-bit encoding
        'gbk',  # Chinese encoding
        'gb2312',  # Subset of Chinese encoding
        'gb18030',  # Superset of Chinese encoding
        'big5',  # Traditional Chinese encoding
        # NOTE(review): iso-8859-1 maps every byte value to a character, so
        # it never fails to decode; the entries listed after it are
        # effectively unreachable. Order kept as-is to avoid changing which
        # codec wins for ambiguous legacy files -- confirm intended priority.
        'iso-8859-1',  # Western European encoding
        'ascii',  # ASCII encoding
        'cp936',  # Microsoft Chinese encoding
        'shift-jis',  # Japanese encoding
        'euc-jp',  # Japanese encoding
        'euc-kr',  # Korean encoding
    ]

    @classmethod
    def _process_file(cls, file_path: Path, doc: Document) -> Document:
        """Read ``file_path`` as text and store the content on ``doc``.

        Args:
            file_path: Path of the text file to read.
            doc: Document whose ``raw_content`` and ``extract_status`` are
                updated in place.

        Returns:
            The same ``doc`` object, with ``raw_content`` set to the decoded
            text and ``extract_status`` set to ``ProcessStatus.SUCCESS``.

        Raises:
            FileProcessingError: If a non-decoding error occurs (for example
                the file cannot be opened), or if no encoding in
                ``SUPPORTED_ENCODINGS`` can decode the file. In both cases
                ``doc.extract_status`` is set to ``ProcessStatus.FAILED``
                first, and the original exception is chained as the cause.
        """
        last_exception = None

        # Try different encoding formats
        for encoding in cls.SUPPORTED_ENCODINGS:
            try:
                with open(file_path, "r", encoding=encoding) as file:
                    text = file.read()
                doc.raw_content = text
                doc.extract_status = ProcessStatus.SUCCESS
                return doc
            except UnicodeError as e:
                # Catch UnicodeError rather than only UnicodeDecodeError:
                # the text-mode 'utf-16' decoder raises a plain UnicodeError
                # ("UTF-16 stream does not start with BOM") for BOM-less
                # input, which must fall through to the next candidate
                # instead of aborting the whole probe.
                last_exception = e
            except Exception as e:
                # Anything else (missing file, permissions, ...) is not an
                # encoding problem, so retrying with another codec is useless.
                doc.extract_status = ProcessStatus.FAILED
                raise FileProcessingError(f"Failed to process text file: {str(e)}") from e

        # If all encodings failed
        doc.extract_status = ProcessStatus.FAILED
        raise FileProcessingError(
            f"Failed to process text file with all supported encodings: {str(last_exception)}"
        ) from last_exception
|