| |
|
|
| from pathlib import Path |
|
|
| from pluto.doc_summary import ( |
| DocSummary, |
| apply_doc_summary_context, |
| generate_doc_summary, |
| save_doc_summaries, |
| ) |
|
|
|
|
def test_generate_doc_summary_returns_valid_summary_with_mocked_llm(monkeypatch, tmp_path):
    """Check that a stubbed LLM JSON reply is turned into a populated DocSummary."""
    corpus_dir = tmp_path / "corpus"
    corpus_dir.mkdir()
    source_file = corpus_dir / "paper.md"
    source_file.write_text("# Paper\n\nThis is about retrieval.", encoding="utf-8")

    def canned_llm_reply(**kwargs):
        # Stand-in for the real LLM call: ignores its keyword arguments and
        # always hands back the same JSON document.
        return """
        {
            "title": "Retrieval Paper",
            "domain": "information retrieval",
            "key_claims": ["Chunk context improves retrieval"],
            "structure": ["intro", "methodology", "results"],
            "open_questions": ["How robust is it?"]
        }
        """

    monkeypatch.setattr("pluto.doc_summary._call_summary_llm", canned_llm_reply)

    produced = generate_doc_summary("paper", corpus_dir)

    assert isinstance(produced, DocSummary)
    assert produced.doc_id == "paper"
    assert produced.title == "Retrieval Paper"
    assert produced.domain == "information retrieval"
    assert produced.key_claims == ["Chunk context improves retrieval"]
|
|
|
|
def test_generate_doc_summary_falls_back_when_llm_fails(monkeypatch, tmp_path):
    """Check the summary returned when the patched LLM call raises.

    The test expects the doc id to be reused as the title and the claim and
    open-question lists to be empty.
    """
    corpus_dir = tmp_path / "corpus"
    corpus_dir.mkdir()
    source_file = corpus_dir / "paper.md"
    source_file.write_text("# Paper\n\nBody.", encoding="utf-8")

    def unavailable_llm(**kwargs):
        # Simulate the model backend being down.
        raise RuntimeError("model unavailable")

    monkeypatch.setattr("pluto.doc_summary._call_summary_llm", unavailable_llm)

    fallback = generate_doc_summary("paper", corpus_dir)

    assert fallback.doc_id == "paper"
    assert fallback.title == "paper"
    assert fallback.key_claims == []
    assert fallback.open_questions == []
|
|
|
|
def test_context_prefix_is_prepended_to_chunk_text(tmp_path):
    """Check that a saved summary is rendered as a context prefix on the chunk."""
    corpus_dir = tmp_path / "corpus"
    corpus_dir.mkdir()

    stored_summary = DocSummary(
        doc_id="paper",
        title="Retrieval Paper",
        domain="AI",
        key_claims=["Claim A", "Claim B"],
        structure=[],
        open_questions=[],
        created_at="2026-01-01T00:00:00+00:00",
    )
    save_doc_summaries(corpus_dir, {"paper": stored_summary})

    chunk_text = "Original chunk"
    expected_prefix = (
        "[Document context: Retrieval Paper | Domain: AI | Key claims: Claim A; Claim B]"
    )

    contextualized = apply_doc_summary_context(chunk_text, "paper", corpus_dir)

    # The prefix must lead and the untouched chunk text must close the result.
    assert contextualized.startswith(expected_prefix)
    assert contextualized.endswith(chunk_text)
|
|