"""Tests for the combo parser v2 (description-based component extraction). The combo parser v2 parses the product ``description`` field for component lists rather than relying on hardcoded name-based heuristics. This is the long-term fix recommended by the 2026-06-14 audit. The Swiggy snapshot at ``data/swiggy_fresh_vegetables_cards_6jun26.json`` has the Sambar Veg Combo with description: "Drumstick, Brinjal, Raw Banana and Pumpkin Fresh veggies combo for Vishu festive cooking" The OLD heuristic (name-based) would return the hardcoded ``["drumstick", "brinjal", "raw_banana", "pumpkin"]`` — which happens to be correct for this one case but is brittle (any new combo like "Paneer Tikka Combo" or "Diwali Special Combo" would have no hardcoded mapping and would fail). The NEW parser extracts components from the description, filtering out generic marketing phrases, and resolves each candidate to its canonical name. This works for any combo with a well-formed description. """ from __future__ import annotations import pytest from shopstack.domain.unit_price import canonicalize_name class TestComboParserV2: """The description-based combo parser should extract components that match the actual data, not the hardcoded fallback.""" def test_sambar_veg_from_actual_swiggy_description(self): """Sambar Veg Combo description from the Swiggy snapshot. Expected components: Drumstick, Brinjal, Raw Banana, Pumpkin (the actual items listed in the description). """ _, _, components = canonicalize_name( "Sambar Veg Combo", description="Drumstick, Brinjal, Raw Banana and Pumpkin Fresh veggies combo for Vishu festive cooking", ) assert "drumstick" in components assert "brinjal" in components assert "raw_banana" in components assert "pumpkin" in components # Should NOT contain the old wrong components assert "radish" not in components assert "cluster_beans" not in components assert "ladys_finger" not in components def test_description_components_override_hardcoded_fallback(self): """If the description is provided, use it; don't fall back to the hardcoded name-based heuristic.""" _, _, components = canonicalize_name( "Sambar Veg Combo", description="Okra, Pumpkin, Carrot Fresh combo", ) # The description's components should be used, not the # hardcoded "drumstick, brinjal, raw_banana, pumpkin" assert "okra" in components or "ladys_finger" in components assert "pumpkin" in components assert "carrot" in components def test_filters_marketing_phrases(self): """Generic marketing phrases like 'fresh', 'combo', 'festive', 'cooking' should be filtered out, not treated as components.""" _, _, components = canonicalize_name( "Diwali Special Combo", description="Drumstick, Brinjal and Raw Banana Fresh combo for festive cooking", ) assert "drumstick" in components assert "brinjal" in components assert "raw_banana" in components # The marketing phrases should NOT be components assert "fresh" not in components assert "combo" not in components assert "festive" not in components assert "cooking" not in components assert "fresh combo" not in components assert "combo for" not in components def test_handles_and_separator(self): """The parser splits on 'and' as well as commas.""" _, _, components = canonicalize_name( "Mixed Veg Combo", description="Tomato and Onion and Carrot Fresh pack", ) assert "tomato" in components assert "onion" in components assert "carrot" in components def test_handles_ampersand_separator(self): """The parser splits on '&' as well as commas.""" _, _, components = canonicalize_name( "Bhindi Combo", description="Ladies Finger & Okra Fresh pack", ) # Both should resolve to the same canonical (ladys_finger) assert "ladys_finger" in components def test_empty_description_falls_back_to_name_heuristic(self): """If no description is provided, fall back to the hardcoded name-based heuristic for known combos (sambar veg, herbs mix).""" _, _, components = canonicalize_name("Sambar Veg Combo") # The hardcoded fallback should return the previously-correct # components (which we also fixed to match the actual data) assert "drumstick" in components assert "brinjal" in components assert "raw_banana" in components assert "pumpkin" in components def test_unresolved_candidates_are_dropped(self): """Candidates that don't match any canonical should be dropped, not appended as raw text (which would create non-canonical components that never match inventory).""" _, _, components = canonicalize_name( "Mystery Combo", description="Asparagus, Quinoa, Dragon Fruit Combo pack", ) # Asparagus, Quinoa, Dragon Fruit are not in our alias maps # — they should NOT appear in components assert "asparagus" not in components assert "quinoa" not in components assert "dragon fruit" not in components def test_deduplicates_repeated_candidates(self): """If a component is mentioned twice (e.g. in title and description), it should appear only once.""" _, _, components = canonicalize_name( "Tomato Combo", description="Tomato, Tomato, Onion Fresh pack", ) assert components.count("tomato") == 1 assert "onion" in components def test_handles_missing_description_gracefully(self): """Empty/None description should not crash.""" _, _, components_a = canonicalize_name("Sambar Veg Combo", description="") _, _, components_b = canonicalize_name("Sambar Veg Combo", description=None) # Both should fall back to the hardcoded heuristic assert "drumstick" in components_a assert "drumstick" in components_b