# data/generate_ood_test.py
# Generates a genuinely out-of-distribution (OOD) test set for SupportMind.
#
# WHY THIS EXISTS:
#   The standard val/test split in preprocess.py is drawn from the same
#   template distribution as training data. Evaluating on it produces inflated
#   accuracy (~100%) because the model has seen structurally identical sentences.
#
#   This script generates hand-crafted tickets that deliberately avoid all
#   training templates — different vocabulary, informal phrasing, multi-sentence
#   context, typos, ambiguous cases, and real-world edge cases.
#   Evaluating on this set yields honest, lower out-of-distribution accuracy.
#
# Usage:
#   python data/generate_ood_test.py
# Output:
#   data/processed/ood_test.csv (labeled OOD test set)
#
# SupportMind — Asmitha

import csv
import os
import random
from collections import Counter

# Seed used for every shuffle of the OOD set.
OOD_SEED = 99

# Kept for backward compatibility: importing this module has always seeded the
# global RNG. The dataset shuffle itself no longer depends on global RNG state.
random.seed(OOD_SEED)

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
PROC_DIR = os.path.join(BASE_DIR, 'processed')

# ── Hand-crafted OOD tickets per category ─────────────────────────────────────
# Rules for every sentence written here:
#   1. Must NOT use any phrase from preprocess.py templates
#   2. Should reflect how real B2B users actually write tickets
#   3. Mix of: informal, multi-sentence, typos, jargon, abbreviations
#   4. Some tickets are deliberately ambiguous (hard cases)
OOD_TICKETS = {
    'billing': [
        # Informal, real-world phrasing
        "hey our last receipt had a weird extra line item under 'platform fee' - never seen that before, what is it",
        "we got charged twice last tuesday, both transactions visible in our bank statement",
        "finance team says the amount debited doesn't match the quote we accepted in march",
        "i need VAT removed from our invoices asap, we're registered as VAT-exempt in ireland",
        "our accountant is asking for a credit note for the overpayment in february",
        "the system won't let me download our Q4 invoices, getting a blank page",
        "we switched from annual to monthly plan two weeks ago but still got charged the annual rate",
        "can someone explain why our bill went up 23% this cycle with no plan changes",
        "the autopay failed last night and now the account shows past due, but card is valid",
        "we need invoices re-issued with our new entity name after the company rebrand",
        "just noticed we've been on the wrong tier for 3 months paying for enterprise when we're on growth",
        "our CFO needs a statement of account going back 18 months for the audit",
    ],
    'technical_support': [
        # Technical but conversational, real errors
        "reports tab just shows a spinner forever, been like this since yesterday morning for all our users",
        "getting a 'session expired' popup every 20 minutes even though we haven't changed any timeout settings",
        "the CSV download for our user list is coming back completely empty, no headers even",
        "two factor auth codes stopped arriving via email, tried multiple accounts same problem",
        "our zapier integration triggers fired but nothing updated on your end, checked the zap logs they show 200 ok",
        "whenever someone tries to @ mention a teammate in a comment it crashes the whole tab",
        "the search bar stopped returning results for anything older than 30 days overnight",
        "push notifications on android are delayed by like 4-5 hours, ios users are fine",
        "we have a data discrepancy: dashboard shows 4,201 active users but the API returns 3,988",
        "after last night's deployment our SSO stopped working, redirects to a blank page",
        "rate limiting hitting us at 180 req/min when our tier should allow 500",
        "bulk import keeps failing at exactly row 847 regardless of what data we put in the file",
    ],
    'account_management': [
        # Org admin requests in real language
        "our CTO just left the company and he was the only owner on the account, how do we take control",
        "is there a way to see a log of who accessed what data over the last 90 days",
        "we need to split our account into two separate orgs for our EU and US business units",
        "can we set it so only admin-level users can export data, regular agents shouldn't have this",
        "our company was acquired and we need to migrate everything to the parent company's account",
        "we have 3 employees who left - need their sessions terminated and access revoked right now",
        "how do I stop certain team members from seeing the billing section specifically",
        "we need guest access for external consultants that expires after 30 days automatically",
        "our SAML cert just rotated, how do we update it without locking everyone out",
        "can you add a second domain to our account, we have a subsidiary that uses a different email domain",
    ],
    'feature_request': [
        # Product feedback, wishlist items
        "would really love a way to pin certain tickets to the top of the queue for my whole team",
        "any plans to add a kanban view option, the list view doesn't work well for our workflow",
        "we need the ability to create custom SLA policies per customer tier not just one global setting",
        "it would be incredibly useful if the API returned the agent assignment history not just current",
        "please add a way to bulk-archive tickets older than X days, doing it one by one is painful",
        "is there a public roadmap we can subscribe to for updates",
        "we'd love conditional logic in your forms so irrelevant fields hide based on category",
        "can you add keyboard shortcuts to your UI, our agents live in this tool all day",
        "we need the reports to be schedulable so they auto-send to stakeholders every monday",
        "any chance of a read-only link sharing option for specific reports without requiring a login",
    ],
    'compliance_legal': [
        # Legal/security/compliance, formal tone
        "our legal team requires a subprocessor list before we can sign the DPA",
        "can you confirm whether your infrastructure is hosted entirely within the EU for GDPR purposes",
        "we've received a court order requesting all communications from user account X, how do we proceed",
        "need to know your breach notification timeline and who gets notified under your incident response plan",
        "our ISO 27001 auditor is asking for your risk register or equivalent documentation",
        "is there a way to ensure certain data never leaves a specific geographic region",
        "we need to perform a right-to-erasure under GDPR for a former customer, what's the process",
        "our security team flagged that API tokens don't expire - is there a way to enforce rotation",
        "can you provide a copy of your most recent vulnerability assessment report",
        "we need a BAA signed before we can process any data through your platform",
    ],
    'onboarding': [
        # New customer confusion, getting-started language
        "just got access this morning, where do i even start - the dashboard is overwhelming",
        "we were told to set up a webhook during onboarding but the setup page doesn't match the docs",
        "our success manager said we'd get a kickoff call scheduled but we haven't heard anything after signing",
        "migrating from zendesk, is there an automated way to move our ticket history over",
        "how long does it usually take before we can go live? we have a deadline in 3 weeks",
        "the setup checklist says 'configure routing rules' but I can't find where that setting lives",
        "we have 80 agents who need accounts, is there a bulk invite feature or do we do it one by one",
        "our team uses desk phones for support too, does your system integrate with any VOIP providers",
        "the trial data we imported doesn't show up in production after upgrading, where did it go",
        "we bought the enterprise tier but some features listed on the pricing page aren't showing up yet",
    ],
    'general_inquiry': [
        # Pre-sales, casual questions, no clear urgency
        "quick question - if we go over our monthly ticket limit do you auto-charge or just cap it",
        "do you have a status page where we can check if there are ongoing incidents",
        "we're evaluating you alongside helpscout and intercom, what makes you different",
        "our team is fully remote across 6 timezones, does your support cover that or only business hours",
        "is there a sandbox/staging environment we can use to test things without affecting production",
        "roughly how much data storage does a team of 50 agents typically use per year",
        "we're a startup of 8 people, is the starter plan enough or will we hit limits fast",
        "what happens to our data if we decide to cancel, can we export everything",
        "do you have native integrations with linear or notion or do we need zapier for that",
        "are there any uptime SLA guarantees in the contract and what's the compensation if missed",
    ],
    'churn_risk': [
        # Frustration, implicit/explicit cancellation signals
        "honestly at this point we've had 6 open tickets for over a month with no resolution",
        "i've been a customer for 3 years and the service has gotten noticeably worse since your redesign",
        "our team is actively demoing competitors right now because of the reliability issues",
        "sent 4 follow up emails to our account manager with no response, extremely unprofessional",
        "the product has gone downhill since the acquisition and the pricing just went up again",
        "we're going to have a hard conversation internally about renewing if this isn't fixed this week",
        "i need our full data export immediately - zip file of everything - we've made a decision",
        "your SLA says 4 hour response but we've been waiting 3 days on a critical issue",
        "leadership is losing patience and so am i, this is the last time i'm asking nicely",
        "please confirm our cancellation date is end of month as I requested last thursday",
        "we switched half our workflow to a competitor last week as a backup, tell me why we shouldn't move fully",
        "the contract is up for renewal in 6 weeks and right now i would not recommend renewing",
    ],
}

# ── Ambiguous edge cases (deliberately hard, labeled with best-fit category) ──
AMBIGUOUS_TICKETS = [
    # billing vs account_management
    ("we need to add a new entity to our account that should be billed separately", "billing"),
    ("our subsidiary needs its own contract and invoice, how does that work", "billing"),
    # technical_support vs feature_request
    ("the analytics charts don't show data older than 6 months, is this a limit or a bug", "technical_support"),
    ("can we get more granular filters on the reports page, the current ones aren't enough", "feature_request"),
    # onboarding vs technical_support
    ("we just started and the API key we got doesn't work in production, returns 401", "technical_support"),
    ("new account setup, the webhook we configured isn't firing at all during testing", "onboarding"),
    # churn_risk vs general_inquiry
    ("what would it cost to move down to the basic tier, weighing our options", "general_inquiry"),
    ("thinking about whether the enterprise features are worth what we're paying", "churn_risk"),
    # compliance_legal vs account_management
    ("we need to know exactly which of your staff can see our customer data", "compliance_legal"),
    ("can we restrict which of our admins have access to audit logs", "account_management"),
]


def build_ood_dataset(category_map=None):
    """Assemble and shuffle the full OOD test dataset.

    Args:
        category_map: Optional mapping from category name to label. When
            omitted, ``CATEGORY_MAP`` is imported from ``preprocess`` so the
            labels match the ones used for the training data.

    Returns:
        A list of dicts with the keys ``text``, ``label``, ``category`` and
        ``ood_type`` (``'hand_crafted'`` or ``'ambiguous'``), in a shuffled
        order that is identical on every call.

    Raises:
        KeyError: If a ticket category is missing from the label mapping.
    """
    if category_map is None:
        # Imported lazily so the module can be loaded (and this function
        # called with an explicit mapping) without preprocess on the path.
        from preprocess import CATEGORY_MAP  # reuse the label mapping
        category_map = CATEGORY_MAP

    tickets = []

    # Main hand-crafted tickets
    for category, texts in OOD_TICKETS.items():
        label = category_map[category]
        tickets.extend(
            {'text': text.strip(), 'label': label,
             'category': category, 'ood_type': 'hand_crafted'}
            for text in texts
        )

    # Ambiguous tickets
    tickets.extend(
        {'text': text.strip(), 'label': category_map[category],
         'category': category, 'ood_type': 'ambiguous'}
        for text, category in AMBIGUOUS_TICKETS
    )

    # A dedicated generator keeps the order reproducible even if another
    # module (e.g. the preprocess import above) touched the global RNG, and
    # makes repeated calls return the same ordering.
    random.Random(OOD_SEED).shuffle(tickets)
    return tickets


def save_ood_csv(tickets, filepath):
    """Write the tickets to ``filepath`` as a UTF-8 CSV with a header row.

    Args:
        tickets: Iterable of dicts with the keys ``text``, ``label``,
            ``category`` and ``ood_type``.
        filepath: Destination path; missing parent directories are created.
    """
    parent = os.path.dirname(filepath)
    # dirname is '' for a bare filename, and os.makedirs('') raises.
    if parent:
        os.makedirs(parent, exist_ok=True)

    fieldnames = ['text', 'label', 'category', 'ood_type']
    with open(filepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(tickets)
    print(f" Saved {len(tickets)} OOD test samples -> {filepath}")


def main():
    """Build the OOD test set, print summary statistics and save the CSV."""
    print("=" * 60)
    print("SupportMind — OOD Test Set Generator")
    print("=" * 60)
    print("\nThis dataset is deliberately out-of-distribution:")
    print(" - Hand-crafted sentences (no template reuse from training)")
    print(" - Informal phrasing, typos, multi-sentence context")
    print(" - Includes ambiguous edge-case tickets")
    print()

    tickets = build_ood_dataset()

    # Stats
    cat_counts = Counter(t['category'] for t in tickets)
    type_counts = Counter(t['ood_type'] for t in tickets)

    print("Category distribution:")
    for cat, count in sorted(cat_counts.items()):
        print(f" {cat:25s} {count:3d} samples")

    print("\nOOD types:")
    for ood_type, count in type_counts.items():
        print(f" {ood_type:20s} {count:3d} samples")

    out_path = os.path.join(PROC_DIR, 'ood_test.csv')
    save_ood_csv(tickets, out_path)

    print(f"\nTotal: {len(tickets)} OOD test samples")
    print("\nNext step: run python src/evaluate_ood.py to get honest metrics")
    print("=" * 60)


if __name__ == '__main__':
    main()