# Synaptyx / core /document_processor.py
# cryogenic22's picture
# Create core/document_processor.py
# c67b1f8 verified
# core/document_processor.py
from typing import Dict, List
import PyPDF2
import docx
from pathlib import Path
class DocumentProcessor:
    """Handles document ingestion and preprocessing.

    All functionality is exposed through static methods; the class is used
    as a namespace and holds no per-instance state.
    """

    # File extensions (lower-case, including the leading dot) that
    # ``process_document`` accepts. Matching is case-insensitive.
    supported_formats = ['.txt', '.pdf', '.docx']

    @staticmethod
    def process_document(file_path: str) -> Dict:
        """Read a document and return its text together with file metadata.

        Args:
            file_path: Path to a ``.txt``, ``.pdf`` or ``.docx`` file.

        Returns:
            A dict with two keys: ``"content"`` (the extracted text) and
            ``"metadata"`` (see ``_extract_metadata``).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file extension is not in
                ``supported_formats`` (compared case-insensitively).
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        # Compare case-insensitively so that e.g. "REPORT.PDF" is accepted.
        if path.suffix.lower() not in DocumentProcessor.supported_formats:
            raise ValueError(f"Unsupported file format: {path.suffix}")

        return {
            "content": DocumentProcessor._extract_content(path),
            "metadata": DocumentProcessor._extract_metadata(path),
        }

    @staticmethod
    def _extract_content(path: Path) -> str:
        """Extract the textual content of ``path`` based on its extension.

        ``.txt`` files are decoded as UTF-8. For ``.pdf`` files the text of
        every page is joined with single spaces; for ``.docx`` files the
        text of every paragraph is joined with single spaces.

        Raises:
            ValueError: If the extension is not one of the handled formats
                (instead of silently returning ``None``).
        """
        suffix = path.suffix.lower()

        if suffix == '.txt':
            return path.read_text(encoding='utf-8')

        if suffix == '.pdf':
            with open(path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # ``or ''`` guards against a page that yields no text
                # (None/empty), which would otherwise break ``str.join``.
                return ' '.join(
                    page.extract_text() or '' for page in reader.pages
                )

        if suffix == '.docx':
            # Pass a plain string path, matching python-docx's documented
            # "path string or file-like object" argument.
            doc = docx.Document(str(path))
            return ' '.join(paragraph.text for paragraph in doc.paragraphs)

        raise ValueError(f"Unsupported file format: {path.suffix}")

    @staticmethod
    def _extract_metadata(path: Path) -> Dict:
        """Return basic filesystem metadata for ``path``.

        Returns:
            A dict with ``"filename"`` (final path component),
            ``"file_size"`` (``st_size`` from ``stat``), ``"file_type"``
            (the suffix exactly as it appears in the path) and
            ``"last_modified"`` (``st_mtime`` from ``stat``).
        """
        # Single stat() call so size and mtime come from the same snapshot.
        stat_result = path.stat()
        return {
            "filename": path.name,
            "file_size": stat_result.st_size,
            "file_type": path.suffix,
            "last_modified": stat_result.st_mtime,
        }