Spaces:

cryogenic22
/

Synaptyx

Sleeping

File size: 1,634 Bytes

c67b1f8

# core/document_processor.py
from typing import Dict, List
import PyPDF2
import docx
from pathlib import Path

class DocumentProcessor:
    """Handles document ingestion and preprocessing.

    All methods are static; the class is a namespace for turning a file on
    disk into a ``{"content": ..., "metadata": ...}`` dictionary.
    """

    # Extensions that can be ingested: lowercase, including the leading dot.
    # Incoming suffixes are lowercased before being compared against this list.
    supported_formats = ['.txt', '.pdf', '.docx']

    @staticmethod
    def process_document(file_path: str) -> Dict:
        """Read a document and return its text together with file metadata.

        Args:
            file_path: Location of the document on disk.

        Returns:
            A dict with two keys: ``"content"`` (the extracted text) and
            ``"metadata"`` (see ``_extract_metadata``).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file extension is not in ``supported_formats``
                (compared case-insensitively).
        """
        path = Path(file_path)

        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Compare case-insensitively so 'REPORT.PDF' is treated like
        # 'report.pdf' instead of being rejected.
        if path.suffix.lower() not in DocumentProcessor.supported_formats:
            raise ValueError(f"Unsupported file format: {path.suffix}")

        return {
            "content": DocumentProcessor._extract_content(path),
            "metadata": DocumentProcessor._extract_metadata(path),
        }

    @staticmethod
    def _extract_content(path: Path) -> str:
        """Return the text of ``path``, choosing a reader by file extension.

        ``.txt`` files are decoded as UTF-8; ``.pdf`` pages and ``.docx``
        paragraphs are each joined with a single space.

        Raises:
            ValueError: If the extension is not one of the handled formats.
        """
        suffix = path.suffix.lower()

        if suffix == '.txt':
            return path.read_text(encoding='utf-8')

        if suffix == '.pdf':
            with open(path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # Defensive: substitute '' for a falsy extraction result so
                # that str.join never receives a non-string item.
                return ' '.join(
                    page.extract_text() or '' for page in reader.pages
                )

        if suffix == '.docx':
            doc = docx.Document(path)
            return ' '.join(paragraph.text for paragraph in doc.paragraphs)

        # Fail loudly rather than falling off the end and returning None,
        # which would contradict the declared ``str`` return type.
        raise ValueError(f"Unsupported file format: {path.suffix}")

    @staticmethod
    def _extract_metadata(path: Path) -> Dict:
        """Return filesystem metadata for ``path``.

        Returns:
            A dict with ``"filename"`` (final path component),
            ``"file_size"`` (``st_size``), ``"file_type"`` (the suffix exactly
            as it appears in the path) and ``"last_modified"`` (``st_mtime``).
        """
        # Stat once so size and mtime come from the same snapshot.
        stat_result = path.stat()
        return {
            "filename": path.name,
            "file_size": stat_result.st_size,
            "file_type": path.suffix,
            "last_modified": stat_result.st_mtime,
        }