# File size: 12,951 Bytes
from __future__ import annotations

import json
from pathlib import Path

# Directory two levels above this file's resolved location (the parent of the
# directory that contains this script).
BASE_DIR = Path(__file__).resolve().parent.parent
# Directory that receives one "<split>.jsonl" file per training split in main().
OUTPUT_DIR = BASE_DIR / "data" / "decision_phase_difficulty"
# Destination file for the benchmark rows written in main().
BENCHMARK_PATH = BASE_DIR / "data" / "decision_phase_benchmark.jsonl"

# Placeholder values used by build_training_splits() to render PHASE_TEMPLATES.
# Every theme supplies the same ten keys (topic, product, products, provider_a,
# provider_b, domain, goal, support_object, support_detail, asset); templates
# that contain placeholders are rendered once per theme listed here.
TRAIN_THEMES = (
    {
        "topic": "crm software",
        "product": "CRM",
        "products": "CRM tools",
        "provider_a": "HubSpot",
        "provider_b": "Zoho",
        "domain": "a small sales team",
        "goal": "manage leads better",
        "support_object": "account",
        "support_detail": "password reset",
        "asset": "CRM onboarding guide",
    },
    {
        "topic": "analytics software",
        "product": "analytics platform",
        "products": "analytics platforms",
        "provider_a": "Mixpanel",
        "provider_b": "Amplitude",
        "domain": "a product team",
        "goal": "measure activation",
        "support_object": "dashboard",
        "support_detail": "data sync issue",
        "asset": "analytics setup guide",
    },
    {
        "topic": "laptops",
        "product": "laptop",
        "products": "laptops",
        "provider_a": "MacBook Air",
        "provider_b": "Dell XPS 13",
        "domain": "college work",
        "goal": "choose the right laptop",
        "support_object": "order",
        "support_detail": "delivery delay",
        "asset": "laptop buying checklist",
    },
)

# Placeholder values used by build_benchmark_rows(), keyed by difficulty name.
# Each difficulty's templates are rendered with exactly one theme, and the
# themes here use different values from those in TRAIN_THEMES. The key set
# matches the one used by the training themes.
BENCHMARK_THEMES = {
    "easy": {
        "topic": "help desk software",
        "product": "help desk platform",
        "products": "help desk tools",
        "provider_a": "Zendesk",
        "provider_b": "Freshdesk",
        "domain": "a support team",
        "goal": "handle tickets faster",
        "support_object": "billing portal",
        "support_detail": "invoice issue",
        "asset": "help desk buyer guide",
    },
    "medium": {
        "topic": "cars",
        "product": "car",
        "products": "cars",
        "provider_a": "Toyota Corolla",
        "provider_b": "Honda Civic",
        "domain": "daily commuting",
        "goal": "choose the right car",
        "support_object": "reservation",
        "support_detail": "test drive booking",
        "asset": "car buying worksheet",
    },
    "hard": {
        "topic": "hosting platforms",
        "product": "hosting platform",
        "products": "hosting providers",
        "provider_a": "Vercel",
        "provider_b": "Netlify",
        "domain": "a startup launch",
        "goal": "ship a new website",
        "support_object": "deployment",
        "support_detail": "domain setup problem",
        "asset": "hosting migration guide",
    },
}

# Query templates organised as phase -> difficulty -> tuple of str.format
# templates. There are seven phases, each with "easy", "medium" and "hard"
# tuples of five templates. Placeholders ({topic}, {provider_a}, ...) are filled
# from a theme dict by render_text(); templates without any placeholder are
# rendered as-is. The phase and difficulty keys are copied verbatim into the
# "decision_phase" and "difficulty" fields of every generated row.
PHASE_TEMPLATES = {
    "awareness": {
        "easy": (
            "What is {topic}?",
            "Explain {topic}.",
            "How does {product} work?",
            "What does {provider_a} do?",
            "Give me the basics of {topic}.",
        ),
        "medium": (
            "Help me understand what problem {provider_a} solves.",
            "What should a beginner know about {products}?",
            "Before I look at options, what is a {product}?",
            "What is the purpose of {topic} in {domain}?",
            "What does a {product} actually help with?",
        ),
        "hard": (
            "I am not shopping yet, I just want to understand what {topic} is.",
            "Before I evaluate anything, what role does {topic} play in {domain}?",
            "I keep hearing about {provider_a}; what is it actually for?",
            "Can you clarify what people mean by {topic} in practice?",
            "I only need an overview of {topic} right now.",
        ),
    },
    "research": {
        "easy": (
            "What {products} should I explore for {domain}?",
            "Show me options to consider for {goal}.",
            "What tools should I look at for {domain}?",
            "Help me research {products}.",
            "Where should I start with {products}?",
        ),
        "medium": (
            "I am early in the process and want to explore {products}.",
            "Give me a shortlist of {products} worth researching.",
            "What directions should I investigate for {goal}?",
            "What categories should I look at before narrowing down?",
            "What are some promising {products} for {domain}?",
        ),
        "hard": (
            "I am not ready to compare vendors yet, just help me scope the market.",
            "What should I research first if I am only beginning to look at {products}?",
            "I need a landscape view before I make a shortlist.",
            "What are the main options in this space before I decide anything?",
            "Help me map the market for {products} without recommending one yet.",
        ),
    },
    "consideration": {
        "easy": (
            "Best {product} for {domain}.",
            "{provider_a} vs {provider_b}.",
            "Compare {products} for {goal}.",
            "Which {product} looks best for {domain}?",
            "What are some {products} worth considering?",
        ),
        "medium": (
            "Compare {provider_a} and {provider_b} for {goal}.",
            "What are the pros and cons of {provider_a}?",
            "Help me evaluate the best {products} for {domain}.",
            "Which {product} seems worth considering right now?",
            "I am comparing options for {goal}; what should be on the shortlist?",
        ),
        "hard": (
            "I am past basic research and now weighing tradeoffs between {provider_a} and {provider_b}.",
            "I want to compare serious options before committing to one.",
            "Help me think through the tradeoffs in the current shortlist.",
            "What looks strongest if I am narrowing down to a few options?",
            "I have done research, now help me compare the finalists.",
        ),
    },
    "decision": {
        "easy": (
            "Which {product} should I choose?",
            "Should I pick {provider_a} or {provider_b}?",
            "Which option should I commit to?",
            "What is the best fit for me right now?",
            "Which plan should I choose today?",
        ),
        "medium": (
            "I am ready to decide between {provider_a} and {provider_b}.",
            "Help me pick the final option for {goal}.",
            "Which {product} should I commit to this week?",
            "I need to make the call now; which option fits best?",
            "What should I choose if I need to decide today?",
        ),
        "hard": (
            "I have a shortlist and need to commit to one vendor now.",
            "I am at the point of commitment and need a final recommendation.",
            "Which option should we sign off on before next week?",
            "I have enough information; tell me which one to go with.",
            "I need the final pick, not another round of comparison.",
        ),
    },
    "action": {
        "easy": (
            "Start my free trial.",
            "Book a demo with {provider_a}.",
            "Create my account.",
            "Buy {provider_a} now.",
            "Download the {asset}.",
        ),
        "medium": (
            "Take me to checkout for {provider_a}.",
            "Get me signed up for {provider_a}.",
            "Reserve my spot with {provider_a}.",
            "I want to purchase {provider_a} today.",
            "Send me the download link for the {asset}.",
        ),
        "hard": (
            "I am ready to move forward now, where do I start the purchase?",
            "Help me complete the signup flow for {provider_a}.",
            "I want to act on this immediately and get access now.",
            "Can you help me finish the order for {provider_a}?",
            "I have decided, now let me complete the next step.",
        ),
    },
    "post_purchase": {
        "easy": (
            "How do I set up my new {product}?",
            "Show me how to import contacts into {provider_a}.",
            "How do I onboard my team after purchase?",
            "What should I enable first after signup?",
            "How do I configure my account now that I signed up?",
        ),
        "medium": (
            "We already subscribed; how do we get value quickly?",
            "What is the best way to roll this out after purchase?",
            "Help me configure {provider_a} now that we bought it.",
            "How do I invite teammates after signing up?",
            "What should I do first after we activate the plan?",
        ),
        "hard": (
            "We already made the purchase, now I need guidance on rollout and setup.",
            "This is not a buying decision anymore; I need post-purchase onboarding help.",
            "I need adoption guidance now that the contract is signed.",
            "What is the right onboarding sequence after we commit to {provider_a}?",
            "We are past checkout and need implementation help.",
        ),
    },
    "support": {
        "easy": (
            "I cannot log into my {support_object}.",
            "How do I reset my password?",
            "My invoice is wrong.",
            "The integration keeps failing.",
            "Our dashboard is not loading.",
        ),
        "medium": (
            "Can you help me fix a {support_detail}?",
            "I am stuck because my {support_object} keeps breaking.",
            "My password reset link is not working.",
            "I need support with my {support_object}.",
            "Why is {provider_a} not syncing correctly?",
        ),
        "hard": (
            "I am not evaluating anything, I just need this issue fixed.",
            "This is a live support problem, not a buying question.",
            "Please help me resolve a problem with my existing account.",
            "I cannot continue because something is broken in my setup.",
            "I need troubleshooting help, not recommendations.",
        ),
    },
}


def split_for_index(index: int) -> str:
    """Map a running row index to a split name on a repeating cycle of five.

    Positions 0-2 of each cycle go to ``"train"``, position 3 to ``"val"``
    and position 4 to ``"test"``.

    Args:
        index: Integer position of the row within its group.

    Returns:
        One of ``"train"``, ``"val"`` or ``"test"``.
    """
    cycle = ("train", "train", "train", "val", "test")
    return cycle[index % 5]


def write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as JSON Lines, one object per line.

    Missing parent directories are created first. The file is opened in
    write mode with UTF-8 encoding, so existing content is replaced. Each
    row is serialised with sorted keys and terminated by a newline.

    Args:
        path: Destination file.
        rows: JSON-serialisable dictionaries to write, in order.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(f"{json.dumps(record, sort_keys=True)}\n" for record in rows)


def render_text(template: str, spec: dict[str, str]) -> str:
    """Fill ``template`` with the values in ``spec`` and trim the result.

    Args:
        template: A ``str.format`` template; placeholders are looked up by
            name in ``spec``.
        spec: Placeholder name to replacement text. Extra keys are ignored.

    Returns:
        The formatted text with leading and trailing whitespace removed.

    Raises:
        KeyError: If the template names a placeholder missing from ``spec``.
    """
    filled = template.format(**spec)
    return filled.strip()


def build_training_splits() -> dict[str, list[dict]]:
    """Render every template against the training themes and bucket the rows.

    Templates containing a ``{`` are rendered once per entry in
    ``TRAIN_THEMES``; placeholder-free templates are rendered a single time.
    A rendered text that has already been seen (compared case-insensitively,
    across all phases and difficulties) is skipped. Each (phase, difficulty)
    group keeps its own running index of accepted rows, which
    ``split_for_index`` turns into the split the row is placed in.

    Returns:
        A dict with the keys ``"train"``, ``"val"`` and ``"test"``, each
        mapping to a list of row dicts with the fields ``text``,
        ``decision_phase``, ``difficulty`` and ``source``.
    """
    buckets = {name: [] for name in ("train", "val", "test")}
    emitted: set[str] = set()

    for phase_name, by_difficulty in PHASE_TEMPLATES.items():
        for level, phrasings in by_difficulty.items():
            # Index of the next accepted row within this (phase, difficulty)
            # group; skipped duplicates do not advance it.
            position = 0
            for phrasing in phrasings:
                if "{" in phrasing:
                    candidates = TRAIN_THEMES
                else:
                    # Nothing to substitute: render once with an empty spec.
                    candidates = ({},)
                for theme in candidates:
                    utterance = render_text(phrasing, theme)
                    fingerprint = utterance.lower()
                    if fingerprint not in emitted:
                        emitted.add(fingerprint)
                        target = split_for_index(position)
                        position += 1
                        buckets[target].append(
                            {
                                "text": utterance,
                                "decision_phase": phase_name,
                                "difficulty": level,
                                "source": "synthetic_decision_phase_difficulty",
                            }
                        )
    return buckets


def build_benchmark_rows() -> list[dict]:
    """Render every template once, using the benchmark theme for its difficulty.

    The theme is looked up in ``BENCHMARK_THEMES`` by difficulty name, with an
    empty spec as the fallback when the difficulty has no entry. A rendered
    text that has already been seen (compared case-insensitively) is skipped.

    NOTE(review): templates without placeholders render to the same text here
    as in ``build_training_splits``, so those rows appear in both outputs.
    Confirm whether that overlap is intended.

    Returns:
        Row dicts in template order, each with the fields ``text``,
        ``decision_phase``, ``difficulty`` and ``source``.
    """
    collected: list[dict] = []
    already_rendered: set[str] = set()

    for phase_name, by_difficulty in PHASE_TEMPLATES.items():
        for level, phrasings in by_difficulty.items():
            theme = BENCHMARK_THEMES.get(level, {})
            for phrasing in phrasings:
                utterance = render_text(phrasing, theme)
                fingerprint = utterance.lower()
                if fingerprint not in already_rendered:
                    already_rendered.add(fingerprint)
                    collected.append(
                        {
                            "text": utterance,
                            "decision_phase": phase_name,
                            "difficulty": level,
                            "source": "decision_phase_benchmark",
                        }
                    )
    return collected


def main() -> None:
    """Generate the datasets, write them to disk and print row counts.

    Each training split is written to ``OUTPUT_DIR / "<split>.jsonl"``, and
    the benchmark rows to ``BENCHMARK_PATH``. One summary line is printed per
    file written.
    """
    for split_name, rows in build_training_splits().items():
        destination = OUTPUT_DIR / f"{split_name}.jsonl"
        write_jsonl(destination, rows)
        print(f"{split_name}: {len(rows)} rows")

    benchmark_rows = build_benchmark_rows()
    write_jsonl(BENCHMARK_PATH, benchmark_rows)
    print(f"benchmark: {len(benchmark_rows)} rows")


# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()