Spaces:

cryogenic22
/

Synaptyx

Sleeping

Synaptyx / core /document_processor.py

Create core/document_processor.py

c67b1f8 verified about 1 year ago

1.63 kB

	# core/document_processor.py
	from typing import Dict, List
	import PyPDF2
	import docx
	from pathlib import Path

	class DocumentProcessor:
	    """Handles document ingestion and preprocessing.

	    Every method is a static method, so the class is used as a namespace:
	    call ``DocumentProcessor.process_document(path)`` directly.
	    """

	    # File extensions (lower-case, with the leading dot) that can be ingested.
	    supported_formats = ['.txt', '.pdf', '.docx']

	    @staticmethod
	    def process_document(file_path: str) -> Dict:
	        """Read a document and return its text together with file metadata.

	        Args:
	            file_path: Location of the document on disk.

	        Returns:
	            A dict with two keys: ``"content"`` (the extracted text) and
	            ``"metadata"`` (the dict built by ``_extract_metadata``).

	        Raises:
	            FileNotFoundError: If ``file_path`` does not exist.
	            ValueError: If the file extension is not in ``supported_formats``.
	        """
	        path = Path(file_path)

	        if not path.exists():
	            raise FileNotFoundError(f"File not found: {file_path}")

	        # Compare case-insensitively so that e.g. "REPORT.PDF" is accepted.
	        if path.suffix.lower() not in DocumentProcessor.supported_formats:
	            raise ValueError(f"Unsupported file format: {path.suffix}")

	        return {
	            "content": DocumentProcessor._extract_content(path),
	            "metadata": DocumentProcessor._extract_metadata(path),
	        }

	    @staticmethod
	    def _extract_content(path: Path) -> str:
	        """Extract the text of ``path`` according to its file extension.

	        ``.txt`` files are decoded as UTF-8. For ``.pdf`` files the text of
	        each page is joined with a single space, and for ``.docx`` files the
	        text of each paragraph is joined with a single space.

	        Raises:
	            ValueError: If the extension is none of ``.txt``, ``.pdf``,
	                ``.docx`` (instead of silently returning ``None``).
	        """
	        suffix = path.suffix.lower()

	        if suffix == '.txt':
	            return path.read_text(encoding='utf-8')

	        if suffix == '.pdf':
	            with open(path, 'rb') as file:
	                reader = PyPDF2.PdfReader(file)
	                # A page without extractable text may produce None or '';
	                # coerce to '' so that join() never sees a non-string.
	                return ' '.join(
	                    page.extract_text() or '' for page in reader.pages
	                )

	        if suffix == '.docx':
	            # Pass a plain string path, the documented argument form.
	            doc = docx.Document(str(path))
	            return ' '.join(paragraph.text for paragraph in doc.paragraphs)

	        raise ValueError(f"Unsupported file format: {path.suffix}")

	    @staticmethod
	    def _extract_metadata(path: Path) -> Dict:
	        """Collect basic file-system metadata for ``path``.

	        Returns:
	            A dict with ``"filename"`` (final path component),
	            ``"file_size"`` (``st_size``), ``"file_type"`` (the suffix exactly
	            as it appears in the path) and ``"last_modified"`` (``st_mtime``).
	        """
	        # Stat once so size and mtime come from the same snapshot.
	        stat_result = path.stat()
	        return {
	            "filename": path.name,
	            "file_size": stat_result.st_size,
	            "file_type": path.suffix,
	            "last_modified": stat_result.st_mtime,
	        }