"""Convert raw PDF and plain-text file bytes into LangChain ``Document`` objects."""

from __future__ import annotations

import io
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import List

from langchain_core.documents import Document

logger = logging.getLogger(__name__)


@dataclass
class IngestedFile:
    """Summary of one ingested file together with the documents parsed from it."""

    # Name of the file exactly as passed to ``DocumentProcessor.ingest``.
    filename: str
    # Lower-cased extension without the leading dot (e.g. "pdf", "txt").
    file_type: str
    # Number of documents produced. For PDFs this counts only the pages that
    # yielded text, so it can be smaller than the PDF's real page count.
    page_count: int
    # Total length of ``page_content`` summed over all documents.
    char_count: int
    # The parsed documents, in the order the parser produced them.
    documents: List[Document] = field(default_factory=list)


class DocumentProcessor:
    """Parse uploaded file bytes into ``Document`` objects."""

    # Extensions (lower-case, no dot) accepted by ``ingest``.
    SUPPORTED_TYPES = {"pdf", "txt"}

    def ingest(self, file_bytes: bytes, filename: str) -> IngestedFile:
        """Parse *file_bytes* according to the extension of *filename*.

        Args:
            file_bytes: Raw content of the file.
            filename: File name; only its suffix is used to choose the parser,
                and the full name is stored in each document's metadata.

        Returns:
            An ``IngestedFile`` holding the parsed documents and their counts.

        Raises:
            ValueError: If the extension is not in ``SUPPORTED_TYPES`` or if
                no text could be extracted from the file.
        """
        ext = Path(filename).suffix.lstrip(".").lower()
        if ext not in self.SUPPORTED_TYPES:
            # Sort so the message is stable; a set's string form can change
            # order between interpreter runs.
            raise ValueError(
                f"Unsupported file type '.{ext}'. "
                f"Supported: {sorted(self.SUPPORTED_TYPES)}"
            )

        # Every supported type other than "pdf" goes through the text parser.
        if ext == "pdf":
            docs = self._parse_pdf(file_bytes, filename)
        else:
            docs = self._parse_txt(file_bytes, filename)

        if not docs:
            raise ValueError(f"No text could be extracted from '{filename}'.")

        total_chars = sum(len(d.page_content) for d in docs)
        return IngestedFile(
            filename=filename,
            file_type=ext,
            page_count=len(docs),
            char_count=total_chars,
            documents=docs,
        )

    @staticmethod
    def _parse_pdf(file_bytes: bytes, filename: str) -> List[Document]:
        """Return one ``Document`` per PDF page that contains text.

        Pages whose extracted text is empty after stripping are skipped. The
        ``page`` metadata value is the 1-based position of the page in the PDF.
        """
        # Imported here so pypdf is only needed when a PDF is actually parsed.
        from pypdf import PdfReader

        reader = PdfReader(io.BytesIO(file_bytes))
        docs: List[Document] = []
        for page_num, page in enumerate(reader.pages, start=1):
            # extract_text() may return None; treat that as an empty page.
            text = (page.extract_text() or "").strip()
            if not text:
                continue
            docs.append(
                Document(
                    page_content=text,
                    metadata={"source": filename, "page": page_num, "file_type": "pdf"},
                )
            )
        return docs

    @staticmethod
    def _parse_txt(file_bytes: bytes, filename: str) -> List[Document]:
        """Decode a text file into a single ``Document``.

        The bytes are decoded as UTF-8 (a leading byte-order mark is dropped),
        falling back to latin-1 when they are not valid UTF-8. Returns an empty
        list when the decoded text is empty after stripping whitespace. The
        single document carries ``page`` 0 in its metadata.
        """
        try:
            # "utf-8-sig" behaves like "utf-8" but removes a leading BOM, which
            # str.strip() would otherwise leave in the content.
            text = file_bytes.decode("utf-8-sig")
        except UnicodeDecodeError:
            # latin-1 maps every byte value to a character, so this fallback
            # cannot fail and no further error handling is required.
            text = file_bytes.decode("latin-1")

        text = text.strip()
        if not text:
            return []
        return [
            Document(
                page_content=text,
                metadata={"source": filename, "page": 0, "file_type": "txt"},
            )
        ]