# Scraped file-viewer header (not part of the module): Spaces: Sleeping | File size: 3,420 Bytes | 5b14aa2
"""Text file processor."""
import os
from typing import Dict, Any
from .base import BaseProcessor
from ..result import ConversionResult
from ..exceptions import ConversionError, FileNotFoundError
class TXTProcessor(BaseProcessor):
    """Processor for plain text files.

    Accepts files whose extension is ``.txt`` or ``.text``, decodes them
    with a short list of fallback encodings, tidies up whitespace and
    returns the text together with basic metadata.
    """

    # Lower-case extensions (with leading dot) accepted by can_process().
    _TEXT_EXTENSIONS = ('.txt', '.text')

    # Encodings tried in order by process(). 'latin-1' maps every possible
    # byte to a character and therefore never raises UnicodeDecodeError, so
    # it must come last: anything listed after it would be unreachable.
    # 'cp1252' can still fail (a few bytes such as 0x81 are undefined),
    # which is what lets the final 'latin-1' fallback be reached.
    _FALLBACK_ENCODINGS = ('utf-8', 'cp1252', 'latin-1')

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if the path is an existing regular file whose extension
            (compared case-insensitively) is ``.txt`` or ``.text``
        """
        # isfile() rather than exists(): a directory that happens to be
        # named "something.txt" cannot be read as text.
        if not os.path.isfile(file_path):
            return False

        # str() so that path-like objects can be lower-cased too.
        _, ext = os.path.splitext(str(file_path).lower())
        return ext in self._TEXT_EXTENSIONS

    def process(self, file_path: str) -> ConversionResult:
        """Process the text file and return a conversion result.

        The file is decoded with the first encoding in
        ``_FALLBACK_ENCODINGS`` that succeeds, a leading byte-order mark is
        dropped, and the text is passed through ``_clean_content``. The
        metadata returned by ``self.get_metadata`` (not defined in this
        class; presumably inherited from ``BaseProcessor``) is extended with
        ``encoding``, ``line_count`` and ``word_count``.

        Args:
            file_path: Path to the text file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            FileNotFoundError: If the file doesn't exist
            ConversionError: If processing fails
        """
        # NOTE: FileNotFoundError is the name imported from ..exceptions at
        # the top of this module, which shadows the builtin of the same name.
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            for encoding in self._FALLBACK_ENCODINGS:
                try:
                    with open(file_path, 'r', encoding=encoding) as f:
                        content = f.read()
                except UnicodeDecodeError:
                    continue
                break
            else:
                # Defensive: only reachable if the encoding list no longer
                # ends with an encoding that accepts every byte.
                raise ConversionError(
                    f"Could not decode file {file_path} with any supported encoding"
                )

            # A UTF-8 byte-order mark decodes to U+FEFF, which rstrip() does
            # not treat as whitespace; drop it so it does not leak into the
            # content or the word count.
            if content.startswith('\ufeff'):
                content = content[1:]

            # Clean up the content
            content = self._clean_content(content)

            metadata = self.get_metadata(file_path)
            metadata.update({
                "encoding": encoding,
                # ''.split('\n') is [''], so count newlines instead to
                # report 0 lines for empty content.
                "line_count": (content.count('\n') + 1) if content else 0,
                "word_count": len(content.split())
            })
            return ConversionResult(content, metadata)
        except (FileNotFoundError, ConversionError):
            # Already the exception types callers expect; pass through.
            raise
        except Exception as e:
            # Wrap anything else (I/O errors, etc.), keeping the cause.
            raise ConversionError(
                f"Failed to process text file {file_path}: {str(e)}"
            ) from e

    def _clean_content(self, content: str) -> str:
        """Clean up the text content.

        Trailing whitespace is removed from every line, and blank lines at
        the very beginning and end of the text are dropped. Blank lines in
        the middle and leading indentation are preserved.

        Args:
            content: Raw text content

        Returns:
            Cleaned text content
        """
        # Remove trailing whitespace from each line.
        trimmed = '\n'.join(line.rstrip() for line in content.split('\n'))

        # After rstrip() a blank line is exactly '', so leading and trailing
        # blank lines show up as runs of '\n' at the ends of the joined text.
        return trimmed.strip('\n')