# agentic-intent-classifier / training / build_decision_phase_difficulty_dataset.py
# manikumargouni's picture
# Upload folder using huggingface_hub
# 0584798 verified
from __future__ import annotations
import json
from pathlib import Path
# Base directory: the parent of the directory that contains this script
# (resolved to an absolute path first).
BASE_DIR = Path(__file__).resolve().parent.parent
# Directory that receives the train/val/test JSONL files.
OUTPUT_DIR = BASE_DIR.joinpath("data", "decision_phase_difficulty")
# Single JSONL file that receives the benchmark rows.
BENCHMARK_PATH = BASE_DIR.joinpath("data", "decision_phase_benchmark.jsonl")
# Themes used to fill template placeholders when building the train/val/test
# splits (see build_training_splits). Every theme supplies the same ten keys,
# which cover all placeholder names that appear in PHASE_TEMPLATES.
TRAIN_THEMES = (
{
"topic": "crm software",
"product": "CRM",
"products": "CRM tools",
"provider_a": "HubSpot",
"provider_b": "Zoho",
"domain": "a small sales team",
"goal": "manage leads better",
"support_object": "account",
"support_detail": "password reset",
"asset": "CRM onboarding guide",
},
{
"topic": "analytics software",
"product": "analytics platform",
"products": "analytics platforms",
"provider_a": "Mixpanel",
"provider_b": "Amplitude",
"domain": "a product team",
"goal": "measure activation",
"support_object": "dashboard",
"support_detail": "data sync issue",
"asset": "analytics setup guide",
},
{
"topic": "laptops",
"product": "laptop",
"products": "laptops",
"provider_a": "MacBook Air",
"provider_b": "Dell XPS 13",
"domain": "college work",
"goal": "choose the right laptop",
"support_object": "order",
"support_detail": "delivery delay",
"asset": "laptop buying checklist",
},
)
# One theme per difficulty tier, looked up by difficulty name in
# build_benchmark_rows. The themes here (help desk software, cars, hosting
# platforms) are different from the ones listed in TRAIN_THEMES.
BENCHMARK_THEMES = {
"easy": {
"topic": "help desk software",
"product": "help desk platform",
"products": "help desk tools",
"provider_a": "Zendesk",
"provider_b": "Freshdesk",
"domain": "a support team",
"goal": "handle tickets faster",
"support_object": "billing portal",
"support_detail": "invoice issue",
"asset": "help desk buyer guide",
},
"medium": {
"topic": "cars",
"product": "car",
"products": "cars",
"provider_a": "Toyota Corolla",
"provider_b": "Honda Civic",
"domain": "daily commuting",
"goal": "choose the right car",
"support_object": "reservation",
"support_detail": "test drive booking",
"asset": "car buying worksheet",
},
"hard": {
"topic": "hosting platforms",
"product": "hosting platform",
"products": "hosting providers",
"provider_a": "Vercel",
"provider_b": "Netlify",
"domain": "a startup launch",
"goal": "ship a new website",
"support_object": "deployment",
"support_detail": "domain setup problem",
"asset": "hosting migration guide",
},
}
# Prompt templates organised as phase -> difficulty -> tuple of templates.
# Text in braces (e.g. {topic}) is a str.format placeholder that render_text
# fills from a theme dict; templates without braces need no theme values.
# The outer key becomes the "decision_phase" label and the inner key the
# "difficulty" label of every row generated from a template.
PHASE_TEMPLATES = {
"awareness": {
"easy": (
"What is {topic}?",
"Explain {topic}.",
"How does {product} work?",
"What does {provider_a} do?",
"Give me the basics of {topic}.",
),
"medium": (
"Help me understand what problem {provider_a} solves.",
"What should a beginner know about {products}?",
"Before I look at options, what is a {product}?",
"What is the purpose of {topic} in {domain}?",
"What does a {product} actually help with?",
),
"hard": (
"I am not shopping yet, I just want to understand what {topic} is.",
"Before I evaluate anything, what role does {topic} play in {domain}?",
"I keep hearing about {provider_a}; what is it actually for?",
"Can you clarify what people mean by {topic} in practice?",
"I only need an overview of {topic} right now.",
),
},
"research": {
"easy": (
"What {products} should I explore for {domain}?",
"Show me options to consider for {goal}.",
"What tools should I look at for {domain}?",
"Help me research {products}.",
"Where should I start with {products}?",
),
"medium": (
"I am early in the process and want to explore {products}.",
"Give me a shortlist of {products} worth researching.",
"What directions should I investigate for {goal}?",
"What categories should I look at before narrowing down?",
"What are some promising {products} for {domain}?",
),
"hard": (
"I am not ready to compare vendors yet, just help me scope the market.",
"What should I research first if I am only beginning to look at {products}?",
"I need a landscape view before I make a shortlist.",
"What are the main options in this space before I decide anything?",
"Help me map the market for {products} without recommending one yet.",
),
},
"consideration": {
"easy": (
"Best {product} for {domain}.",
"{provider_a} vs {provider_b}.",
"Compare {products} for {goal}.",
"Which {product} looks best for {domain}?",
"What are some {products} worth considering?",
),
"medium": (
"Compare {provider_a} and {provider_b} for {goal}.",
"What are the pros and cons of {provider_a}?",
"Help me evaluate the best {products} for {domain}.",
"Which {product} seems worth considering right now?",
"I am comparing options for {goal}; what should be on the shortlist?",
),
"hard": (
"I am past basic research and now weighing tradeoffs between {provider_a} and {provider_b}.",
"I want to compare serious options before committing to one.",
"Help me think through the tradeoffs in the current shortlist.",
"What looks strongest if I am narrowing down to a few options?",
"I have done research, now help me compare the finalists.",
),
},
"decision": {
"easy": (
"Which {product} should I choose?",
"Should I pick {provider_a} or {provider_b}?",
"Which option should I commit to?",
"What is the best fit for me right now?",
"Which plan should I choose today?",
),
"medium": (
"I am ready to decide between {provider_a} and {provider_b}.",
"Help me pick the final option for {goal}.",
"Which {product} should I commit to this week?",
"I need to make the call now; which option fits best?",
"What should I choose if I need to decide today?",
),
"hard": (
"I have a shortlist and need to commit to one vendor now.",
"I am at the point of commitment and need a final recommendation.",
"Which option should we sign off on before next week?",
"I have enough information; tell me which one to go with.",
"I need the final pick, not another round of comparison.",
),
},
"action": {
"easy": (
"Start my free trial.",
"Book a demo with {provider_a}.",
"Create my account.",
"Buy {provider_a} now.",
"Download the {asset}.",
),
"medium": (
"Take me to checkout for {provider_a}.",
"Get me signed up for {provider_a}.",
"Reserve my spot with {provider_a}.",
"I want to purchase {provider_a} today.",
"Send me the download link for the {asset}.",
),
"hard": (
"I am ready to move forward now, where do I start the purchase?",
"Help me complete the signup flow for {provider_a}.",
"I want to act on this immediately and get access now.",
"Can you help me finish the order for {provider_a}?",
"I have decided, now let me complete the next step.",
),
},
"post_purchase": {
"easy": (
"How do I set up my new {product}?",
"Show me how to import contacts into {provider_a}.",
"How do I onboard my team after purchase?",
"What should I enable first after signup?",
"How do I configure my account now that I signed up?",
),
"medium": (
"We already subscribed; how do we get value quickly?",
"What is the best way to roll this out after purchase?",
"Help me configure {provider_a} now that we bought it.",
"How do I invite teammates after signing up?",
"What should I do first after we activate the plan?",
),
"hard": (
"We already made the purchase, now I need guidance on rollout and setup.",
"This is not a buying decision anymore; I need post-purchase onboarding help.",
"I need adoption guidance now that the contract is signed.",
"What is the right onboarding sequence after we commit to {provider_a}?",
"We are past checkout and need implementation help.",
),
},
"support": {
"easy": (
"I cannot log into my {support_object}.",
"How do I reset my password?",
"My invoice is wrong.",
"The integration keeps failing.",
"Our dashboard is not loading.",
),
"medium": (
"Can you help me fix a {support_detail}?",
"I am stuck because my {support_object} keeps breaking.",
"My password reset link is not working.",
"I need support with my {support_object}.",
"Why is {provider_a} not syncing correctly?",
),
"hard": (
"I am not evaluating anything, I just need this issue fixed.",
"This is a live support problem, not a buying question.",
"Please help me resolve a problem with my existing account.",
"I cannot continue because something is broken in my setup.",
"I need troubleshooting help, not recommendations.",
),
},
}
def split_for_index(index: int) -> str:
    """Map a running row index onto a split name using a 5-slot cycle.

    Slots 0-2 of every group of five go to ``"train"``, slot 3 to ``"val"``
    and slot 4 to ``"test"``.

    Args:
        index: Position of the row within its sequence.

    Returns:
        One of ``"train"``, ``"val"`` or ``"test"``.
    """
    slot = index % 5
    if slot == 3:
        return "val"
    return "train" if slot < 3 else "test"
def write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as JSON Lines, one object per line.

    Missing parent directories are created first. Any existing file is
    overwritten. Keys inside each JSON object are emitted in sorted order.

    Args:
        path: Destination file.
        rows: Dictionaries to serialise, in output order.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(json.dumps(record, sort_keys=True) + "\n" for record in rows)
def render_text(template: str, spec: dict[str, str]) -> str:
    """Fill a template's ``{placeholder}`` fields and trim the result.

    Args:
        template: A ``str.format`` style template.
        spec: Placeholder names mapped to replacement text; extra keys are
            ignored by ``str.format``.

    Returns:
        The formatted text with leading and trailing whitespace removed.

    Raises:
        KeyError: If the template names a placeholder that ``spec`` lacks.
    """
    filled = template.format(**spec)
    return filled.strip()
def build_training_splits() -> dict[str, list[dict]]:
    """Render every template against the training themes and split the rows.

    Templates that contain a placeholder are rendered once per entry in
    ``TRAIN_THEMES``; templates without one are rendered a single time.
    Texts are de-duplicated case-insensitively across the whole run, and the
    first occurrence wins. Within each (phase, difficulty) pair the surviving
    rows are dealt out to splits by ``split_for_index`` in the order they
    are produced.

    Returns:
        A dict with keys ``"train"``, ``"val"`` and ``"test"``, each holding
        a list of row dicts with ``text``, ``decision_phase``, ``difficulty``
        and ``source`` fields.
    """
    buckets: dict[str, list[dict]] = {name: [] for name in ("train", "val", "test")}
    emitted: set[str] = set()
    for phase, by_difficulty in PHASE_TEMPLATES.items():
        for difficulty, templates in by_difficulty.items():
            # Each (phase, difficulty) pair is visited exactly once, so a
            # local counter is enough to drive the split assignment.
            accepted = 0
            for template in templates:
                # Without a placeholder every theme would yield the same text,
                # so render such templates once with an empty spec.
                specs = ({},) if "{" not in template else TRAIN_THEMES
                for spec in specs:
                    rendered = render_text(template, spec)
                    fingerprint = rendered.lower()
                    if fingerprint not in emitted:
                        emitted.add(fingerprint)
                        record = {
                            "text": rendered,
                            "decision_phase": phase,
                            "difficulty": difficulty,
                            "source": "synthetic_decision_phase_difficulty",
                        }
                        buckets[split_for_index(accepted)].append(record)
                        accepted += 1
    return buckets
def build_benchmark_rows() -> list[dict]:
    """Render every template against the benchmark theme of its difficulty.

    The theme is looked up in ``BENCHMARK_THEMES`` by difficulty name, with
    an empty dict as the fallback. Texts are de-duplicated case-insensitively
    and the first occurrence wins.

    NOTE(review): templates without placeholders render to the same text for
    any theme, so those rows match the text ``build_training_splits``
    produces for the same template.

    Returns:
        A list of row dicts with ``text``, ``decision_phase``, ``difficulty``
        and ``source`` fields, in template order.
    """
    benchmark: list[dict] = []
    emitted: set[str] = set()
    for phase, by_difficulty in PHASE_TEMPLATES.items():
        for difficulty in by_difficulty:
            theme = BENCHMARK_THEMES.get(difficulty, {})
            for template in by_difficulty[difficulty]:
                rendered = render_text(template, theme)
                fingerprint = rendered.lower()
                if fingerprint not in emitted:
                    emitted.add(fingerprint)
                    benchmark.append(
                        {
                            "text": rendered,
                            "decision_phase": phase,
                            "difficulty": difficulty,
                            "source": "decision_phase_benchmark",
                        }
                    )
    return benchmark
def main() -> None:
    """Generate the split files and the benchmark file, printing row counts.

    Each split is written to ``OUTPUT_DIR/<split>.jsonl`` and the benchmark
    rows to ``BENCHMARK_PATH``.
    """
    for split_name, rows in build_training_splits().items():
        destination = OUTPUT_DIR / f"{split_name}.jsonl"
        write_jsonl(destination, rows)
        print(f"{split_name}: {len(rows)} rows")

    benchmark_rows = build_benchmark_rows()
    write_jsonl(BENCHMARK_PATH, benchmark_rows)
    print(f"benchmark: {len(benchmark_rows)} rows")


# Run the generator only when executed as a script, not on import.
if __name__ == "__main__":
    main()