shopstack / tests /test_combo_parser_v2.py
pranaysuyash's picture
Sync ShopStack HEAD 6f8adfc
d999bba verified
Raw
History Blame Contribute Delete
6.3 kB
"""Tests for the combo parser v2 (description-based component extraction).

The combo parser v2 parses the product ``description`` field for
component lists rather than relying on hardcoded name-based heuristics.
This is the long-term fix recommended by the 2026-06-14 audit.

The Swiggy snapshot at ``data/swiggy_fresh_vegetables_cards_6jun26.json``
has the Sambar Veg Combo with description:

    "Drumstick, Brinjal, Raw Banana and Pumpkin Fresh veggies combo
    for Vishu festive cooking"

The OLD heuristic (name-based) would return the hardcoded
``["drumstick", "brinjal", "raw_banana", "pumpkin"]`` — which happens
to be correct for this one case but is brittle (any new combo like
"Paneer Tikka Combo" or "Diwali Special Combo" would have no
hardcoded mapping and would fail).

The NEW parser extracts components from the description, filtering
out generic marketing phrases, and resolves each candidate to its
canonical name. This works for any combo with a well-formed
description.
"""
from __future__ import annotations
import pytest
from shopstack.domain.unit_price import canonicalize_name
class TestComboParserV2:
    """Tests for description-based component extraction of combo products.

    Every test calls ``canonicalize_name`` and inspects only the third
    element of the returned triple (the component names); the first two
    elements are ignored here.
    """

    def test_sambar_veg_from_actual_swiggy_description(self):
        """Sambar Veg Combo description from the Swiggy snapshot.

        Expected components: Drumstick, Brinjal, Raw Banana, Pumpkin
        (the actual items listed in the description).
        """
        _, _, components = canonicalize_name(
            "Sambar Veg Combo",
            description=(
                "Drumstick, Brinjal, Raw Banana and Pumpkin "
                "Fresh veggies combo for Vishu festive cooking"
            ),
        )
        assert "drumstick" in components
        assert "brinjal" in components
        assert "raw_banana" in components
        assert "pumpkin" in components
        # Should NOT contain the old wrong components
        assert "radish" not in components
        assert "cluster_beans" not in components
        assert "ladys_finger" not in components

    def test_description_components_override_hardcoded_fallback(self):
        """If the description is provided, use it; don't fall back to
        the hardcoded name-based heuristic."""
        _, _, components = canonicalize_name(
            "Sambar Veg Combo",
            description="Okra, Pumpkin, Carrot Fresh combo",
        )
        # The description's components should be used, not the
        # hardcoded "drumstick, brinjal, raw_banana, pumpkin"
        assert "okra" in components or "ladys_finger" in components
        assert "pumpkin" in components
        assert "carrot" in components
        # Components that exist ONLY in the hardcoded fallback must be
        # absent; otherwise a parser that merges both sources would pass.
        # ("pumpkin" is in both lists, so it cannot discriminate.)
        assert "drumstick" not in components
        assert "brinjal" not in components
        assert "raw_banana" not in components

    def test_filters_marketing_phrases(self):
        """Generic marketing phrases like 'fresh', 'combo', 'festive',
        'cooking' should be filtered out, not treated as components."""
        _, _, components = canonicalize_name(
            "Diwali Special Combo",
            description=(
                "Drumstick, Brinjal and Raw Banana "
                "Fresh combo for festive cooking"
            ),
        )
        assert "drumstick" in components
        assert "brinjal" in components
        assert "raw_banana" in components
        # The marketing phrases should NOT be components
        assert "fresh" not in components
        assert "combo" not in components
        assert "festive" not in components
        assert "cooking" not in components
        assert "fresh combo" not in components
        assert "combo for" not in components

    def test_handles_and_separator(self):
        """The parser splits on 'and' as well as commas."""
        _, _, components = canonicalize_name(
            "Mixed Veg Combo",
            description="Tomato and Onion and Carrot Fresh pack",
        )
        assert "tomato" in components
        assert "onion" in components
        assert "carrot" in components

    def test_handles_ampersand_separator(self):
        """The parser splits on '&' as well as commas."""
        _, _, components = canonicalize_name(
            "Bhindi Combo",
            description="Ladies Finger & Okra Fresh pack",
        )
        # Both should resolve to the same canonical (ladys_finger)
        assert "ladys_finger" in components

    def test_empty_description_falls_back_to_name_heuristic(self):
        """If no description is provided, fall back to the hardcoded
        name-based heuristic for known combos (sambar veg, herbs mix)."""
        _, _, components = canonicalize_name("Sambar Veg Combo")
        # The hardcoded fallback should return the previously-correct
        # components (which we also fixed to match the actual data)
        assert "drumstick" in components
        assert "brinjal" in components
        assert "raw_banana" in components
        assert "pumpkin" in components

    def test_unresolved_candidates_are_dropped(self):
        """Candidates that don't match any canonical should be dropped,
        not appended as raw text (which would create non-canonical
        components that never match inventory)."""
        _, _, components = canonicalize_name(
            "Mystery Combo",
            description="Asparagus, Quinoa, Dragon Fruit Combo pack",
        )
        # Asparagus, Quinoa, Dragon Fruit are not in our alias maps
        # — they should NOT appear in components
        assert "asparagus" not in components
        assert "quinoa" not in components
        assert "dragon fruit" not in components
        # Also rule out the underscore spelling used by canonical names
        # elsewhere in this file (e.g. "raw_banana").
        assert "dragon_fruit" not in components

    def test_deduplicates_repeated_candidates(self):
        """If a component is mentioned more than once (here in the
        product name and twice in the description), it should appear
        only once."""
        _, _, components = canonicalize_name(
            "Tomato Combo",
            description="Tomato, Tomato, Onion Fresh pack",
        )
        assert components.count("tomato") == 1
        assert "onion" in components

    def test_handles_missing_description_gracefully(self):
        """Empty/None description should not crash."""
        _, _, components_a = canonicalize_name(
            "Sambar Veg Combo", description=""
        )
        _, _, components_b = canonicalize_name(
            "Sambar Veg Combo", description=None
        )
        # Both should fall back to the hardcoded heuristic
        assert "drumstick" in components_a
        assert "drumstick" in components_b