"""Tests for the chunking and embed-and-store helpers in ``ingestion.embedder``."""

from unittest.mock import patch

from ingestion.embedder import (
    chunk_text,
    embed_and_store_filing,
    embed_and_store_transcript,
)


def test_chunk_long_text_produces_multiple_chunks():
    """A 600-word input with ``chunk_size=100`` is split into more than one chunk."""
    text = "word " * 600
    chunks = chunk_text(text, chunk_size=100, overlap=10)
    assert len(chunks) > 1


def test_chunk_short_text_returns_single_chunk():
    """A short input comes back unchanged as the only chunk."""
    assert chunk_text("Short text.") == ["Short text."]


def test_chunk_overlap_repeats_words():
    """The last word of the first chunk also appears as a word in the second."""
    text = " ".join(str(i) for i in range(200))
    chunks = chunk_text(text, chunk_size=100, overlap=20)
    last_word = chunks[0].split()[-1]
    # Compare against whole tokens, not the raw string: a substring check
    # could be satisfied by a longer number that merely contains the digits
    # (e.g. "99" inside "199") even if nothing was actually repeated.
    assert last_word in chunks[1].split()


@patch("ingestion.embedder.vector_store")
def test_embed_filing_calls_add_chunks_twice(mock_vs):
    """With both sections non-empty, ``add_chunks`` is called twice."""
    embed_and_store_filing(
        ticker="AAPL",
        company_name="Apple Inc.",
        mda_text="Revenue grew 5%.",
        risk_text="Competition remains a risk.",
        filing_date="2024-11-01",
        period="Q42024",
        form_type="10-Q",
    )
    assert mock_vs.add_chunks.call_count == 2


@patch("ingestion.embedder.vector_store")
def test_embed_filing_skips_empty_sections(mock_vs):
    """With an empty ``risk_text``, ``add_chunks`` is called only once."""
    embed_and_store_filing(
        ticker="AAPL",
        company_name="Apple Inc.",
        mda_text="Some MD&A text.",
        risk_text="",
        filing_date="2024-11-01",
        period="Q42024",
        form_type="10-Q",
    )
    assert mock_vs.add_chunks.call_count == 1


@patch("ingestion.embedder.vector_store")
def test_embed_filing_chunk_context_in_metadata(mock_vs):
    """Filing metadata carries a ``chunk_context`` naming the company and period."""
    embed_and_store_filing(
        ticker="AAPL",
        company_name="Apple Inc.",
        mda_text="Revenue grew 5%.",
        risk_text="",
        filing_date="2024-11-01",
        period="Q42024",
        form_type="10-Q",
    )
    # A recorded call unpacks as (positional_args, keyword_args); the test
    # reads the metadata list from the third positional argument.
    positional, _ = mock_vs.add_chunks.call_args_list[0]
    metadatas = positional[2]
    assert "chunk_context" in metadatas[0]
    assert "Apple Inc." in metadatas[0]["chunk_context"]
    assert "Q42024" in metadatas[0]["chunk_context"]


@patch("ingestion.embedder.vector_store")
def test_embed_transcript(mock_vs):
    """A transcript is stored once under "transcripts" with period context."""
    embed_and_store_transcript(
        ticker="AAPL",
        company_name="Apple Inc.",
        transcript_text="Tim Cook: Record quarter.",
        transcript_date="2024-11-01",
        period="Q42024",
    )
    mock_vs.add_chunks.assert_called_once()
    positional, _ = mock_vs.add_chunks.call_args
    # First positional argument is checked against the literal "transcripts";
    # third positional argument is read as the metadata list.
    assert positional[0] == "transcripts"
    metadatas = positional[2]
    assert "chunk_context" in metadatas[0]
    assert "Q42024" in metadatas[0]["chunk_context"]