"""Tests for ingestion base layer: PaperRecord and DataLoader.""" import pytest from src.ingestion.base_loader import DataLoader, PaperRecord class TestPaperRecord: def test_paper_id_format(self): paper = PaperRecord( source_id="P18-1001", source="hf_acl_ocl", title="Test", abstract="Abstract", authors=["Alice"], year=2018, ) assert paper.paper_id() == "hf_acl_ocl::P18-1001" def test_paper_id_different_sources(self): hf = PaperRecord(source_id="P18-1001", source="hf_acl_ocl", title="T", abstract="A", authors=[], year=2018) acl = PaperRecord(source_id="2024.acl-long.1", source="acl_anthology", title="T", abstract="A", authors=[], year=2024) assert hf.paper_id() != acl.paper_id() assert "hf_acl_ocl::" in hf.paper_id() assert "acl_anthology::" in acl.paper_id() def test_has_full_text_true(self): paper = PaperRecord( source_id="x", source="test", title="T", abstract="A", authors=[], year=2020, full_text="Some full text content", ) assert paper.has_full_text() is True def test_has_full_text_false_none(self): paper = PaperRecord( source_id="x", source="test", title="T", abstract="A", authors=[], year=2020, full_text=None, ) assert paper.has_full_text() is False def test_has_full_text_false_empty(self): paper = PaperRecord( source_id="x", source="test", title="T", abstract="A", authors=[], year=2020, full_text=" ", ) assert paper.has_full_text() is False def test_defaults(self): paper = PaperRecord( source_id="x", source="test", title="T", abstract="A", authors=[], year=2020, ) assert paper.venue is None assert paper.volume is None assert paper.full_text is None assert paper.url is None assert paper.metadata == {} def test_metadata_storage(self): paper = PaperRecord( source_id="x", source="test", title="T", abstract="A", authors=[], year=2020, metadata={"numcitedby": 42}, ) assert paper.metadata["numcitedby"] == 42 class TestDataLoaderABC: def test_cannot_instantiate_directly(self): with pytest.raises(TypeError): DataLoader() def test_concrete_subclass_works(self): class DummyLoader(DataLoader): @property def source_name(self) -> str: return "dummy" def load(self, **kwargs): return [] def validate_source(self) -> bool: return True loader = DummyLoader() assert loader.source_name == "dummy" assert loader.validate_source() is True assert loader.load() == []