# primer-app/tests/test_embedder.py
# Viney's picture
# deploy: Primer app for HF Spaces — clean orphan history
# 35676b4
from unittest.mock import patch
from ingestion.embedder import chunk_text, embed_and_store_filing, embed_and_store_transcript
def test_chunk_long_text_produces_multiple_chunks():
    """Text of 600 repeated words must be split into more than one chunk."""
    long_text = "word " * 600
    assert len(chunk_text(long_text, chunk_size=100, overlap=10)) > 1
def test_chunk_short_text_returns_single_chunk():
    """A short input comes back unchanged as a one-element list."""
    short_text = "Short text."
    result = chunk_text(short_text)
    assert result == [short_text]
def test_chunk_overlap_repeats_words():
    """The last word of the first chunk must reappear as a word in the second.

    The input is the numbers 0..199 joined by single spaces, so every word is
    unique. Membership is checked against the second chunk's word list rather
    than its raw string: a substring test would let e.g. "99" match inside
    "199" and pass even if no overlap had been produced.

    NOTE(review): assumes chunk_text splits on whitespace-delimited words and
    never cuts a word in half -- confirm against ingestion.embedder.
    """
    text = " ".join(str(i) for i in range(200))
    chunks = chunk_text(text, chunk_size=100, overlap=20)
    # Fail with a readable assertion instead of an IndexError on chunks[1].
    assert len(chunks) > 1
    last_word = chunks[0].split()[-1]
    assert last_word in chunks[1].split()
@patch("ingestion.embedder.vector_store")
def test_embed_filing_calls_add_chunks_twice(mock_vs):
    """With MD&A and risk text both non-empty, add_chunks is called twice."""
    filing = {
        "ticker": "AAPL",
        "company_name": "Apple Inc.",
        "mda_text": "Revenue grew 5%.",
        "risk_text": "Competition remains a risk.",
        "filing_date": "2024-11-01",
        "period": "Q42024",
        "form_type": "10-Q",
    }
    embed_and_store_filing(**filing)

    add_chunks = mock_vs.add_chunks
    assert add_chunks.call_count == 2
@patch("ingestion.embedder.vector_store")
def test_embed_filing_skips_empty_sections(mock_vs):
    """With an empty risk section, add_chunks is called only once."""
    filing = {
        "ticker": "AAPL",
        "company_name": "Apple Inc.",
        "mda_text": "Some MD&A text.",
        "risk_text": "",
        "filing_date": "2024-11-01",
        "period": "Q42024",
        "form_type": "10-Q",
    }
    embed_and_store_filing(**filing)

    add_chunks = mock_vs.add_chunks
    assert add_chunks.call_count == 1
@patch("ingestion.embedder.vector_store")
def test_embed_filing_chunk_context_in_metadata(mock_vs):
    """Stored filing metadata carries a chunk_context naming company and period.

    Inspects the first recorded add_chunks call and checks that the first
    metadata entry has a "chunk_context" value containing both the company
    name and the period passed in.
    """
    embed_and_store_filing(
        ticker="AAPL",
        company_name="Apple Inc.",
        mda_text="Revenue grew 5%.",
        risk_text="",
        filing_date="2024-11-01",
        period="Q42024",
        form_type="10-Q",
    )
    # Index [0] of a recorded call is its positional-argument tuple; the
    # metadata list is read from the third positional argument.
    first_call_args = mock_vs.add_chunks.call_args_list[0][0]
    metadatas = first_call_args[2]
    assert "chunk_context" in metadatas[0]
    chunk_context = metadatas[0]["chunk_context"]
    assert "Apple Inc." in chunk_context
    assert "Q42024" in chunk_context
@patch("ingestion.embedder.vector_store")
def test_embed_transcript(mock_vs):
    """A transcript is stored with one add_chunks call on "transcripts".

    Also checks that the first metadata entry has a "chunk_context" value
    containing the period passed in.
    """
    transcript = {
        "ticker": "AAPL",
        "company_name": "Apple Inc.",
        "transcript_text": "Tim Cook: Record quarter.",
        "transcript_date": "2024-11-01",
        "period": "Q42024",
    }
    embed_and_store_transcript(**transcript)

    add_chunks = mock_vs.add_chunks
    add_chunks.assert_called_once()

    # Index [0] of the recorded call is its positional-argument tuple.
    positional = add_chunks.call_args[0]
    assert positional[0] == "transcripts"
    first_metadata = positional[2][0]
    assert "chunk_context" in first_metadata
    assert "Q42024" in first_metadata["chunk_context"]