Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
"""Evaluate MeltMindEngine: grounded chat facts, safety boundaries, and designer calculations."""
import sys
from pathlib import Path

# ROOT is the parent of the directory that contains this script. It is placed
# first on sys.path so the project import below can be resolved from there
# (presumably where ``meltmind_engine`` lives -- confirm against the repo layout).
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))

# This import must come after the sys.path change above, hence the E402 waiver.
from meltmind_engine import MeltMindEngine  # noqa: E402
def check(condition: bool, message: str, failures: list[str]):
    """Record ``message`` in ``failures`` when ``condition`` is falsy.

    Args:
        condition: Outcome of the assertion being evaluated.
        message: Text appended to ``failures`` if the assertion did not hold.
        failures: Accumulator list, mutated in place.
    """
    if condition:
        return
    failures.append(message)
def main():
    """Run every MeltMind evaluation check and report the combined result.

    A single ``MeltMindEngine`` is created and exercised through its chat,
    deterministic-chat, answer-validation and recommendation entry points.
    Failed expectations are collected via ``check`` rather than stopping at
    the first one, so one run reports every broken expectation.

    Raises:
        SystemExit: With status 1 after printing the failure list when at
            least one check failed.
    """
    engine = MeltMindEngine()
    failures = []

    # --- Knowledge corpus size and source coverage ---
    check(len(engine.knowledge_chunks) >= 400, "Full MeltRoom knowledge corpus was not loaded", failures)
    check(len({chunk["source"] for chunk in engine.knowledge_chunks}) >= 16, "Expected MeltRoom knowledge sources are missing", failures)

    # --- Deterministic chat answers ---
    # Each case: (message, substrings the answer must contain,
    #             substrings it must not contain, expected handoff flag).
    chat_cases = [
        ("How much is Pista Bliss?", ("₹350",), (), False),
        ("Rapido delivery costs ₹150. What do I pay?", ("₹90", "₹60"), ("free delivery",), False),
        ("I sent my MeltBasket on WhatsApp. Is it confirmed?", ("payment", "confirmed"), ("already confirmed",), True),
        ("My order arrived damaged. What should I do?", ("same calendar day", "photos"), ("guaranteed",), True),
        ("Are your brownies gluten-free?", ("gluten-free", "complete order"), ("not gluten-free",), False),
        ("Tell me about the ingredients used", ("brownie base", "premium dark", "without maida", "milk"), ("i can help with",), False),
        ("What ingredient is Pistachio Cream?", ("rich pistachio-based cream", "tree nuts"), ("i can help with",), False),
        ("Which dessert is safe for diabetes?", ("cannot recommend", "medical"), ("diabetic-safe",), True),
    ]
    for message, required, forbidden, handoff in chat_cases:
        result = engine.deterministic_chat(message)
        # Substring checks are case-insensitive: both sides are lowercased.
        answer = result["answer"].lower()
        for value in required:
            check(value.lower() in answer, f"Chat `{message}` must mention `{value}`", failures)
        for value in forbidden:
            check(value.lower() not in answer, f"Chat `{message}` must not mention `{value}`", failures)
        check(result["handoff"] is handoff, f"Chat `{message}` handoff mismatch", failures)

    # --- Greeting fast path ---
    greeting = engine.chat("hi")
    check(greeting["products"] == [], "Greeting must never produce product cards", failures)
    check(greeting["model"].get("fast_path") is True, "Greeting must use the instant concierge path", failures)

    # --- Group/budget requests routed to the designer ---
    group_budget = engine.chat("What should two friends order under ₹500?")
    check(group_budget.get("route_to_designer") is True, "Group-and-budget planning must route directly to Perfect Melt", failures)
    check(group_budget["model"].get("fast_path") is True, "Group-and-budget routing must skip MiniCPM", failures)
    check(group_budget["preferences"]["group_size"] == 2, "Perfect Melt handoff must preserve group size", failures)
    check(group_budget["preferences"]["budget_inr"] == 500, "Perfect Melt handoff must preserve budget", failures)
    check(group_budget["products"] == [], "Perfect Melt handoff must not generate premature product cards", failures)

    natural_group_budget = engine.chat("suggest something for 2 people in 1000 rupees")
    check(natural_group_budget.get("route_to_designer") is True, "Natural group-and-budget phrasing must route to Perfect Melt", failures)
    check(natural_group_budget["preferences"]["group_size"] == 2, "Natural Perfect Melt handoff must preserve group size", failures)
    check(natural_group_budget["preferences"]["budget_inr"] == 1000, "Natural Perfect Melt handoff must preserve rupee-suffix budget", failures)
    check(natural_group_budget["model"].get("used_for_response") is False, "Natural Perfect Melt handoff must skip MiniCPM", failures)

    within_budget = engine.chat("plan an order for 4 people within 1200")
    check(within_budget.get("route_to_designer") is True, "Within-budget phrasing must route to Perfect Melt", failures)
    check(within_budget["preferences"]["budget_inr"] == 1200, "Within-budget phrasing must preserve budget", failures)

    # The misspelling "goey" is deliberate: the texture check below expects
    # it to come back normalized as "gooey".
    personal_budget = engine.chat("suggest something chocolatey and goey under a budget of 500")
    check(personal_budget.get("route_to_designer") is True, "Budgeted personal recommendation must route directly to Perfect Melt", failures)
    check(personal_budget["preferences"]["budget_inr"] == 500, "Budgeted personal handoff must preserve budget", failures)
    check("chocolate" in personal_budget["preferences"]["flavours"], "Budgeted personal handoff must preserve flavour", failures)
    check("gooey" in personal_budget["preferences"]["textures"], "Budgeted personal handoff must normalize and preserve gooey texture", failures)
    check(personal_budget["products"] == [], "Budgeted personal handoff must not generate premature product cards", failures)
    check(personal_budget["model"].get("used_for_response") is False, "Budgeted personal handoff must skip MiniCPM", failures)

    # --- Named-product ingredient answer ---
    signature_ingredients = engine.chat("explain about the ingredients used in signature melt")
    signature_answer = signature_ingredients["answer"].lower()
    for value in ("meltroom brownie base", "premium dark chocolate", "chocolate chips"):
        check(value in signature_answer, f"Signature Melt ingredient answer must mention `{value}`", failures)
    for value in ("milk chocolate", "pistachio", "almond milk", "oat milk", "other meltroom"):
        check(value not in signature_answer, f"Signature Melt ingredient answer must not drift into `{value}`", failures)
    check(signature_ingredients["model"].get("fast_path") is True, "Named-product ingredient answer must be instant", failures)
    check(signature_ingredients["model"].get("used_for_response") is False, "Named-product ingredient answer must skip MiniCPM drift", failures)

    # --- Deterministic recommendations and comparisons ---
    balanced = engine.deterministic_chat("I want something chocolaty but balanced.")
    check("no verified low-sweetness item" in balanced["answer"].lower(), "Balanced chocolate guidance must retain the verified sweetness limitation", failures)

    warm_melty = engine.deterministic_chat("suggest something warm and melty")
    warm_answer = warm_melty["answer"].lower()
    check("meltcups" in warm_answer, "Warm and melty request should lead with MeltCups", failures)
    check("low-sweetness" not in warm_answer, "Warm and melty request must not trigger sweetness FAQ", failures)
    # Guard the [0] lookup: an empty product list is recorded as a failure
    # instead of raising IndexError and aborting the remaining checks.
    warm_products = warm_melty["products"]
    check(bool(warm_products) and warm_products[0]["id"] == "meltcups", "MeltCups should be the first warm and melty product card", failures)

    all_bowls = engine.deterministic_chat("compare all the MeltRoom bowls")
    check(len(all_bowls["products"]) == 6, "All-bowls comparison must include every current Brownie Bowl", failures)
    all_bowls_answer = all_bowls["answer"].lower()
    for bowl in ("Mango Brownie Bowl", "Choco Fudge Brownie Bowl", "Chocolate Over Load", "MeltRoom Brownie Bowl", "Pista Bliss", "Banana Bliss"):
        check(bowl.lower() in all_bowls_answer, f"All-bowls comparison must mention `{bowl}`", failures)

    # --- Generated answer path with a stubbed completion ---
    # The engine's completion hook is temporarily replaced with a canned JSON
    # reply. try/finally guarantees the original hook is put back even if
    # engine.chat raises.
    original_completion = engine._llm_completion
    engine._llm_completion = lambda messages, max_tokens: '{"answer":"MeltRoom uses a signature fudgy brownie base, premium chocolate, fruits, creams, and texture-led toppings. The brownie base is gluten-free, eggless, and made without maida or refined sugar.","product_ids":[],"follow_ups":[]}'
    try:
        generalized = engine.chat("Tell me about the ingredients used")
    finally:
        engine._llm_completion = original_completion
    check(generalized["model"]["used_for_response"] is True, "Meaningful ingredient query must use MiniCPM generation", failures)
    check(generalized["model"]["routing"] == "rag_minicpm_with_deterministic_validation", "Meaningful query routing metadata mismatch", failures)

    # --- Answer validation must reject unsupported health/allergen claims ---
    unsafe_comparison = "Banana Bliss is a healthier option and suitable for those avoiding tree nuts."
    check(
        not engine._grounded_answer_is_valid(unsafe_comparison, "", {}, False),
        "Unsupported health or allergen-suitability comparison must be rejected",
        failures,
    )

    # --- Designer plans: budget ceiling and arithmetic consistency ---
    for budget, group, request in (
        (500, 2, "A chocolate order for two people"),
        (900, 4, "Movie night with variety"),
        (600, 2, "Premium pistachio and crunchy"),
    ):
        result = engine.recommend({
            "budget_inr": budget,
            "group_size": group,
            "flavours": [],
            "textures": [],
            "premium": False,
            "variety": False,
            "less_sweet": False,
            "request": request,
        })
        check(bool(result["plans"]), f"Designer `{request}` returned no plans", failures)
        for plan in result["plans"]:
            check(plan["total"] <= budget, f"Designer `{request}` exceeded ₹{budget}", failures)
            # Recompute the total from the line items to cross-check the
            # figure the plan reports.
            calculated = sum(item["price"] * item["quantity"] for item in plan["items"])
            check(calculated == plan["total"], f"Designer `{request}` total calculation mismatch", failures)
            check(plan["remaining_budget"] == budget - plan["total"], f"Designer `{request}` remaining budget mismatch", failures)

    # --- Severe allergy request must hand off and return no plans ---
    allergy_result = engine.recommend({
        "budget_inr": 500,
        "group_size": 2,
        "flavours": [],
        "textures": [],
        "premium": False,
        "variety": True,
        "less_sweet": False,
        "request": "One person has severe milk and tree-nut allergies",
    })
    check(allergy_result["handoff"] is True, "Severe allergy plan must require handoff", failures)
    check(not allergy_result["plans"], "Severe allergy plan must not fabricate a verified-safe plan", failures)

    # --- Report ---
    if failures:
        print(f"MeltMind evaluation failed with {len(failures)} issue(s):")
        for failure in failures:
            print(f"- {failure}")
        raise SystemExit(1)
    print("MeltMind evaluation passed: grounded chat facts, safety boundaries, and designer calculations are valid.")
# Run the evaluation only when this file is executed as a script. main()
# returns None on success, so sys.exit(None) yields exit status 0; the failure
# path raises SystemExit(1) inside main() itself.
if __name__ == "__main__":
    sys.exit(main())