Spaces:
Sleeping
Sleeping
File size: 1,634 Bytes
c67b1f8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# core/document_processor.py
from typing import Dict, List
import PyPDF2
import docx
from pathlib import Path
class DocumentProcessor:
    """Handles document ingestion and preprocessing.

    All methods are static; the class only groups the helpers and the
    ``supported_formats`` list of accepted (lowercase) file extensions.
    """

    supported_formats = ['.txt', '.pdf', '.docx']

    @staticmethod
    def process_document(file_path: str) -> Dict:
        """Read a document from disk and return its text and file metadata.

        Args:
            file_path: Path to a ``.txt``, ``.pdf`` or ``.docx`` file. The
                extension is matched case-insensitively.

        Returns:
            A dict with two keys: ``"content"`` (the extracted text) and
            ``"metadata"`` (see ``_extract_metadata``).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file extension is not in ``supported_formats``.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        # Compare the lowercased suffix so 'REPORT.PDF' is accepted like
        # 'report.pdf'; the error message still shows the suffix as given.
        if path.suffix.lower() not in DocumentProcessor.supported_formats:
            raise ValueError(f"Unsupported file format: {path.suffix}")
        return {
            "content": DocumentProcessor._extract_content(path),
            "metadata": DocumentProcessor._extract_metadata(path),
        }

    @staticmethod
    def _extract_content(path: Path) -> str:
        """Extract the text of ``path`` according to its file extension.

        Text files are decoded as UTF-8. PDF page texts and DOCX paragraph
        texts are each joined with a single space.

        Raises:
            ValueError: If the extension is not one of the handled formats.
        """
        suffix = path.suffix.lower()
        if suffix == '.txt':
            return path.read_text(encoding='utf-8')
        if suffix == '.pdf':
            with open(path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # Substitute '' when a page yields no text (extract_text may
                # return None for such pages) so the join cannot fail.
                return ' '.join(
                    page.extract_text() or '' for page in reader.pages
                )
        if suffix == '.docx':
            doc = docx.Document(str(path))
            return ' '.join(paragraph.text for paragraph in doc.paragraphs)
        # Fail loudly instead of implicitly returning None to the caller.
        raise ValueError(f"Unsupported file format: {path.suffix}")

    @staticmethod
    def _extract_metadata(path: Path) -> Dict:
        """Return basic filesystem metadata for ``path``.

        Returns:
            A dict with ``"filename"`` (final path component),
            ``"file_size"`` (``st_size``), ``"file_type"`` (the suffix as
            written on disk) and ``"last_modified"`` (``st_mtime``).
        """
        # Single stat() call: one syscall and a consistent size/mtime pair.
        stat_result = path.stat()
        return {
            "filename": path.name,
            "file_size": stat_result.st_size,
            "file_type": path.suffix,
            "last_modified": stat_result.st_mtime,
        }
|