"""Tests for SQLite storage layer.""" import pytest from src.ingestion.base_loader import PaperRecord from src.storage.sqlite_db import SQLiteDB class TestSQLiteSchema: def test_create_schema(self, tmp_db): # Schema should be created without error assert tmp_db.get_paper_count() == 0 assert tmp_db.get_chunk_count() == 0 def test_create_schema_idempotent(self, tmp_db): # Calling create_schema again should not error tmp_db.create_schema() assert tmp_db.get_paper_count() == 0 class TestPaperIngestion: def test_insert_papers(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) assert tmp_db.get_paper_count() == 3 def test_insert_duplicate_papers_ignored(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) tmp_db.insert_papers(sample_papers) # same papers again assert tmp_db.get_paper_count() == 3 def test_get_paper_by_id(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) paper = tmp_db.get_paper_by_id("hf_acl_ocl::P18-1001") assert paper is not None assert paper["title"] == "Attention Is All You Need (Not Really)" assert paper["year"] == 2018 assert paper["venue"] == "acl" assert len(paper["authors"]) == 2 assert "Alice Smith" in paper["authors"] def test_get_paper_by_id_not_found(self, tmp_db): assert tmp_db.get_paper_by_id("nonexistent") is None def test_paper_sources_stored(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) p1 = tmp_db.get_paper_by_id("hf_acl_ocl::P18-1001") p2 = tmp_db.get_paper_by_id("acl_anthology::2022.acl-long.100") assert p1["source"] == "hf_acl_ocl" assert p2["source"] == "acl_anthology" class TestChunkOperations: def test_insert_and_count_chunks(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) chunks = [ { "paper_id": "hf_acl_ocl::P18-1001", "chunk_text": "We propose a novel transformer architecture.", "chunk_type": "abstract", "chunk_index": 0, "token_count": 8, }, { "paper_id": "hf_acl_ocl::P18-1001", "chunk_text": "The model uses self-attention.", "chunk_type": "method", "chunk_index": 1, "token_count": 6, }, ] tmp_db.insert_chunks(chunks) assert tmp_db.get_chunk_count() == 2 def test_get_all_chunks_joined(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) chunks = [ { "paper_id": "hf_acl_ocl::P18-1001", "chunk_text": "Transformer abstract text.", "chunk_type": "abstract", "chunk_index": 0, "token_count": 4, }, ] tmp_db.insert_chunks(chunks) results = tmp_db.get_all_chunks() assert len(results) == 1 assert results[0]["title"] == "Attention Is All You Need (Not Really)" assert results[0]["year"] == 2018 assert results[0]["venue"] == "acl" def test_get_chunk_texts_and_ids(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) chunks = [ {"paper_id": "hf_acl_ocl::P18-1001", "chunk_text": "Text A", "chunk_type": "abstract", "chunk_index": 0, "token_count": 2}, {"paper_id": "hf_acl_ocl::D19-1234", "chunk_text": "Text B", "chunk_type": "abstract", "chunk_index": 0, "token_count": 2}, ] tmp_db.insert_chunks(chunks) texts, ids = tmp_db.get_chunk_texts_and_ids() assert len(texts) == 2 assert len(ids) == 2 assert "Text A" in texts assert "Text B" in texts class TestBrowsePapers: def test_browse_all(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) results = tmp_db.browse_papers() assert len(results) == 3 def test_browse_by_venue(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) results = tmp_db.browse_papers(venue="acl") assert len(results) == 2 assert all(r["venue"] == "acl" for r in results) def test_browse_by_year(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) results = tmp_db.browse_papers(year=2018) assert len(results) == 1 assert results[0]["year"] == 2018 def test_browse_limit_offset(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) results = tmp_db.browse_papers(limit=1, offset=0) assert len(results) == 1 class TestEnrichmentOperations: def test_insert_and_query_methods(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) pid = "hf_acl_ocl::P18-1001" tmp_db.insert_methods(pid, [ {"method_name": "transformer", "method_type": "model"}, {"method_name": "self-attention", "method_type": "technique"}, ]) paper = tmp_db.get_paper_by_id(pid) assert len(paper["methods"]) == 2 def test_insert_and_query_datasets(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) pid = "hf_acl_ocl::P18-1001" tmp_db.insert_datasets(pid, [ {"dataset_name": "WMT14", "task_type": "translation"}, ]) paper = tmp_db.get_paper_by_id(pid) assert len(paper["datasets"]) == 1 assert paper["datasets"][0]["dataset_name"] == "WMT14" def test_insert_tasks(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) pid = "hf_acl_ocl::P18-1001" tmp_db.insert_tasks(pid, ["machine translation", "language modeling"]) # Tasks are not returned by get_paper_by_id, just verify no error assert tmp_db.get_paper_count() == 3 def test_insert_and_query_topics(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) pid = "hf_acl_ocl::P18-1001" tmp_db.insert_topics(pid, ["multimodal", "fairness"]) # Verify no error and count stats = tmp_db.get_enrichment_stats() assert stats["total_topics"] == 2 def test_enrichment_stats(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) pid = "hf_acl_ocl::P18-1001" tmp_db.insert_methods(pid, [{"method_name": "BERT", "method_type": "model"}]) tmp_db.insert_datasets(pid, [{"dataset_name": "SQuAD", "task_type": "QA"}]) tmp_db.insert_tasks(pid, ["question answering"]) tmp_db.insert_topics(pid, ["multimodal"]) stats = tmp_db.get_enrichment_stats() assert stats["total_papers"] == 3 assert stats["total_methods"] == 1 assert stats["total_datasets"] == 1 assert stats["total_tasks"] == 1 assert stats["total_topics"] == 1 assert stats["papers_with_methods"] == 1 class TestAnalytics: def test_papers_per_venue_per_year(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) results = tmp_db.papers_per_venue_per_year() assert len(results) > 0 # Should have venue and year keys assert "venue" in results[0] assert "year" in results[0] assert "paper_count" in results[0] def test_top_methods_by_year(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) tmp_db.insert_methods("hf_acl_ocl::P18-1001", [{"method_name": "BERT", "method_type": "model"}]) tmp_db.insert_methods("hf_acl_ocl::D19-1234", [{"method_name": "BERT", "method_type": "model"}]) results = tmp_db.top_methods_by_year(top_n=5) assert len(results) > 0 assert results[0]["method_name"] == "BERT" def test_method_trend(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) tmp_db.insert_methods("hf_acl_ocl::P18-1001", [{"method_name": "BERT", "method_type": "model"}]) results = tmp_db.method_trend("BERT") assert len(results) == 1 assert results[0]["year"] == 2018 assert results[0]["paper_count"] == 1 def test_method_trend_empty(self, tmp_db): results = tmp_db.method_trend("nonexistent") assert results == [] def test_top_datasets_by_year(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) tmp_db.insert_datasets("hf_acl_ocl::P18-1001", [{"dataset_name": "SQuAD", "task_type": "QA"}]) tmp_db.insert_datasets("hf_acl_ocl::D19-1234", [{"dataset_name": "SQuAD", "task_type": "QA"}]) results = tmp_db.top_datasets_by_year(top_n=5) assert len(results) > 0 assert results[0]["dataset_name"] == "SQuAD" def test_dataset_trend(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) tmp_db.insert_datasets("hf_acl_ocl::P18-1001", [{"dataset_name": "GLUE", "task_type": "NLI"}]) results = tmp_db.dataset_trend("GLUE") assert len(results) == 1 assert results[0]["year"] == 2018 class TestCooccurrenceAnalytics: def _setup_enrichment(self, db, sample_papers): """Helper to populate enrichment tables for testing.""" db.insert_papers(sample_papers) # Paper 1: BERT + SQuAD + QA db.insert_methods("hf_acl_ocl::P18-1001", [ {"method_name": "BERT", "method_type": "model"}, {"method_name": "Transformer", "method_type": "model"}, ]) db.insert_datasets("hf_acl_ocl::P18-1001", [ {"dataset_name": "SQuAD", "task_type": "QA"}, ]) db.insert_tasks("hf_acl_ocl::P18-1001", ["machine translation"]) db.insert_topics("hf_acl_ocl::P18-1001", ["multimodal", "low-resource"]) # Paper 2: BERT + LoRA + GLUE db.insert_methods("hf_acl_ocl::D19-1234", [ {"method_name": "BERT", "method_type": "model"}, {"method_name": "LoRA", "method_type": "technique"}, ]) db.insert_datasets("hf_acl_ocl::D19-1234", [ {"dataset_name": "GLUE", "task_type": "NLI"}, {"dataset_name": "SQuAD", "task_type": "QA"}, ]) db.insert_tasks("hf_acl_ocl::D19-1234", ["text classification"]) db.insert_topics("hf_acl_ocl::D19-1234", ["low-resource", "efficiency"]) # Paper 3: contrastive learning + no datasets db.insert_methods("acl_anthology::2022.acl-long.100", [ {"method_name": "contrastive learning", "method_type": "technique"}, ]) db.insert_tasks("acl_anthology::2022.acl-long.100", ["text classification"]) db.insert_topics("acl_anthology::2022.acl-long.100", ["multimodal"]) def test_method_dataset_cooccurrence(self, tmp_db, sample_papers): self._setup_enrichment(tmp_db, sample_papers) results = tmp_db.method_dataset_cooccurrence(top_n=10) assert len(results) > 0 # BERT+SQuAD should appear (co-occurs in 2 papers) top = results[0] assert "method_name" in top assert "dataset_name" in top assert "co_count" in top assert top["co_count"] >= 1 def test_method_task_cooccurrence(self, tmp_db, sample_papers): self._setup_enrichment(tmp_db, sample_papers) results = tmp_db.method_task_cooccurrence(top_n=10) assert len(results) > 0 assert "method_name" in results[0] assert "task_name" in results[0] assert "co_count" in results[0] def test_top_authors(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) results = tmp_db.top_authors(top_n=5) assert len(results) > 0 assert "name" in results[0] assert "paper_count" in results[0] def test_author_collaboration_pairs(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) results = tmp_db.author_collaboration_pairs(top_n=5) # Paper 1 has 2 authors, paper 3 has 2 authors → at least 2 pairs assert len(results) >= 2 assert "author_a" in results[0] assert "author_b" in results[0] assert "shared_papers" in results[0] def test_top_tasks_by_year(self, tmp_db, sample_papers): self._setup_enrichment(tmp_db, sample_papers) results = tmp_db.top_tasks_by_year(top_n=5) assert len(results) > 0 assert "task_name" in results[0] assert "year" in results[0] assert "count" in results[0] def test_task_trend(self, tmp_db, sample_papers): self._setup_enrichment(tmp_db, sample_papers) results = tmp_db.task_trend("text classification") assert len(results) > 0 assert "year" in results[0] assert "paper_count" in results[0] def test_task_trend_empty(self, tmp_db): results = tmp_db.task_trend("nonexistent_task") assert results == [] def test_venue_method_profile(self, tmp_db, sample_papers): self._setup_enrichment(tmp_db, sample_papers) results = tmp_db.venue_method_profile("acl", top_n=5) assert len(results) > 0 assert "method_name" in results[0] assert "paper_count" in results[0] def test_venue_method_profile_empty(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) results = tmp_db.venue_method_profile("nonexistent", top_n=5) assert results == [] def test_top_topics_by_year(self, tmp_db, sample_papers): self._setup_enrichment(tmp_db, sample_papers) results = tmp_db.top_topics_by_year(top_n=5) assert len(results) > 0 assert "topic_name" in results[0] assert "year" in results[0] assert "count" in results[0] def test_topic_trend(self, tmp_db, sample_papers): self._setup_enrichment(tmp_db, sample_papers) results = tmp_db.topic_trend("multimodal") assert len(results) > 0 assert "year" in results[0] assert "paper_count" in results[0] def test_topic_trend_empty(self, tmp_db): results = tmp_db.topic_trend("nonexistent_topic") assert results == [] def test_year_over_year_growth(self, tmp_db, sample_papers): tmp_db.insert_papers(sample_papers) results = tmp_db.year_over_year_growth() assert len(results) > 0 assert "year" in results[0] assert "paper_count" in results[0] # First year has no previous → growth_pct is None assert results[0]["growth_pct"] is None # Subsequent years should have growth_pct if len(results) > 1: assert "growth_pct" in results[1]