File size: 1,634 Bytes
c67b1f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# core/document_processor.py
from typing import Dict, List
import PyPDF2
import docx
from pathlib import Path

class DocumentProcessor:
    """Handles document ingestion and preprocessing.

    All methods are static; the class only groups the helpers and the list
    of file extensions they can handle.
    """

    # Lower-case extensions (including the leading dot) that can be ingested.
    supported_formats = ['.txt', '.pdf', '.docx']

    @staticmethod
    def process_document(file_path: str) -> Dict:
        """Read a document from disk and return its text plus file metadata.

        Args:
            file_path: Location of the document to ingest.

        Returns:
            A dict with two keys: ``"content"`` (the extracted text) and
            ``"metadata"`` (see ``_extract_metadata``).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file extension is not in ``supported_formats``
                (the comparison ignores letter case).
        """
        path = Path(file_path)

        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Compare case-insensitively so "REPORT.PDF" is treated like
        # "report.pdf" instead of being rejected as unsupported.
        if path.suffix.lower() not in DocumentProcessor.supported_formats:
            raise ValueError(f"Unsupported file format: {path.suffix}")

        return {
            "content": DocumentProcessor._extract_content(path),
            "metadata": DocumentProcessor._extract_metadata(path),
        }

    @staticmethod
    def _extract_content(path: Path) -> str:
        """Return the text of ``path``, choosing a reader by file extension.

        ``.txt`` files are decoded as UTF-8; ``.pdf`` pages and ``.docx``
        paragraphs are each joined with a single space.

        Raises:
            ValueError: If the extension is not one of the handled formats.
        """
        suffix = path.suffix.lower()

        if suffix == '.txt':
            return path.read_text(encoding='utf-8')

        if suffix == '.pdf':
            with open(path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # "or ''" keeps the join from failing with TypeError should
                # a page yield None instead of a string.
                return ' '.join(
                    page.extract_text() or '' for page in reader.pages
                )

        if suffix == '.docx':
            doc = docx.Document(path)
            return ' '.join(paragraph.text for paragraph in doc.paragraphs)

        # Fail loudly rather than silently returning None, which would
        # violate the declared ``str`` return type.
        raise ValueError(f"Unsupported file format: {path.suffix}")

    @staticmethod
    def _extract_metadata(path: Path) -> Dict:
        """Return basic filesystem metadata for ``path``.

        Returns:
            A dict with ``"filename"`` (final path component),
            ``"file_size"`` (``st_size``), ``"file_type"`` (the suffix exactly
            as it appears in the name) and ``"last_modified"`` (``st_mtime``).
        """
        # Stat once so size and mtime come from the same snapshot.
        stat_result = path.stat()
        return {
            "filename": path.name,
            "file_size": stat_result.st_size,
            "file_type": path.suffix,
            "last_modified": stat_result.st_mtime,
        }