File size: 16,918 Bytes
c4fe0a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
"""Download public datasets to data/raw/.

Downloads via HTTP API to avoid heavy dependency issues with the
`datasets` library. Falls back gracefully if network is unavailable.

Datasets used:
  1. Tobi-Bueck/customer-support-tickets (CC BY-NC 4.0)
     - Real multilingual enterprise support tickets
     - Fields: subject, body, answer, type, queue, priority, language, tag_1..tag_8
  2. bitext/Bitext-customer-support-llm-chatbot-training-dataset (Apache 2.0)
     - Customer-agent dialogue pairs with intent/category labels
     - Fields: instruction, response, category, intent, flags
"""
import json
import time
import urllib.error
import urllib.parse
import urllib.request
from collections import Counter
from pathlib import Path

RAW_DIR = Path("data/raw")

HF_API_BASE = "https://datasets-server.huggingface.co/rows"


def _fetch_hf_rows(dataset: str, config: str, split: str,
                   offset: int = 0, length: int = 100,
                   retries: int = 2) -> list[dict]:
    """Fetch rows from the HuggingFace datasets-server API.

    Args:
        dataset: HF dataset repo id, e.g. "org/name".
        config: Dataset configuration name.
        split: Split name ("train", "test", ...).
        offset: Index of the first row to fetch.
        length: Number of rows to request.
        retries: Extra attempts after a transient HTTP error.

    Returns:
        List of row dicts (the "row" payload of each API item, or the
        item itself when no "row" key is present).

    Raises:
        urllib.error.HTTPError: non-transient error, or retries exhausted.
        urllib.error.URLError / TimeoutError: network-level failures.
    """
    # urlencode escapes the dataset id and other values properly instead
    # of interpolating them raw into the query string.
    query = urllib.parse.urlencode({
        "dataset": dataset,
        "config": config,
        "split": split,
        "offset": offset,
        "length": length,
    })
    url = f"{HF_API_BASE}?{query}"
    last_error = None
    for attempt in range(retries + 1):
        try:
            req = urllib.request.Request(url, headers={
                "User-Agent": "Mozilla/5.0 (forward-deployed-ai-sim)",
            })
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = json.loads(resp.read().decode("utf-8"))
            return [item.get("row", item) for item in data.get("rows", [])]
        except urllib.error.HTTPError as e:
            last_error = e
            # 422/429/5xx are frequently transient on this endpoint.
            if e.code in (422, 429, 500, 502, 503) and attempt < retries:
                wait = 5 * (attempt + 1)  # 5s, 10s backoff
                print(f"    HTTP {e.code}, retrying in {wait}s (attempt {attempt+1}/{retries})...")
                time.sleep(wait)
                continue
            raise
    raise last_error  # unreachable but satisfies type checker


# ---------------------------------------------------------------------------
# Dataset 1: Support tickets
# ---------------------------------------------------------------------------

def ingest_support_tickets(max_rows: int = 200) -> Path:
    """Download support ticket dataset from HuggingFace.

    Source: Tobi-Bueck/customer-support-tickets
    Saves JSONL to data/raw/support_tickets.jsonl

    Args:
        max_rows: Maximum number of rows to request from the API.

    Returns:
        Path to the JSONL file (real data, or a synthetic fallback
        file when the download fails).
    """
    RAW_DIR.mkdir(parents=True, exist_ok=True)
    output_path = RAW_DIR / "support_tickets.jsonl"

    # Skip if real data already exists; synthetic fallback rows carry a
    # "_synthetic" marker on every line, so they do not trigger the skip.
    if output_path.exists():
        with open(output_path, encoding="utf-8") as f:
            first_line = f.readline()
        if first_line and "_synthetic" not in first_line:
            # Count lines inside a context manager so the handle is
            # closed promptly (previously leaked an open file object).
            with open(output_path, encoding="utf-8") as f:
                line_count = sum(1 for _ in f)
            print(f"  βœ“ Already have {line_count} REAL tickets at {output_path} (skipping)")
            return output_path

    print(f"Downloading support tickets (max {max_rows} rows)...")
    try:
        rows = _fetch_hf_rows(
            dataset="Tobi-Bueck/customer-support-tickets",
            config="default",
            split="train",
            offset=0,
            length=max_rows,
        )

        if not rows:
            raise ValueError("API returned 0 rows")

        count = 0
        with open(output_path, "w", encoding="utf-8") as f:
            for row in rows:
                # Keep all fields from the real dataset
                f.write(json.dumps(row, ensure_ascii=False) + "\n")
                count += 1

        print(f"  βœ“ Saved {count} REAL tickets to {output_path}")
        _print_ticket_stats(rows)
        return output_path

    except (urllib.error.URLError, urllib.error.HTTPError,
            TimeoutError, ValueError) as e:
        print(f"  βœ— Download failed: {e}")
        print("  β†’ Creating synthetic fallback data...")
        return _create_synthetic_tickets(output_path, max_rows=min(max_rows, 30))


def _print_ticket_stats(rows: list[dict]) -> None:
    """Print summary stats for downloaded tickets."""
    languages = {}
    queues = {}
    for row in rows:
        lang = row.get("language", "unknown")
        languages[lang] = languages.get(lang, 0) + 1
        queue = row.get("queue", "unknown")
        queues[queue] = queues.get(queue, 0) + 1
    print(f"  Languages: {dict(sorted(languages.items(), key=lambda x: -x[1]))}")
    print(f"  Queues:    {dict(sorted(queues.items(), key=lambda x: -x[1]))}")


# ---------------------------------------------------------------------------
# Dataset 2: Bitext customer support dialogues
# ---------------------------------------------------------------------------

def ingest_bitext_dialogues(max_rows: int = 100) -> Path:
    """Download Bitext customer support dialogue dataset.

    Source: bitext/Bitext-customer-support-llm-chatbot-training-dataset
    Saves JSONL to data/raw/bitext_dialogues.jsonl

    The dataset is very repetitive within each intent (~100 paraphrases),
    so we sample sparsely across offsets and keep at most one row per
    intent to maximize category diversity.

    Args:
        max_rows: Upper bound on rows saved across all offsets.

    Returns:
        Path to the JSONL file. On download failure, falls back to a
        synthetic conversation file at a different path
        (samsum_conversations.jsonl).
    """
    RAW_DIR.mkdir(parents=True, exist_ok=True)
    output_path = RAW_DIR / "bitext_dialogues.jsonl"

    # Skip if real data already exists (synthetic rows are marked).
    if output_path.exists():
        with open(output_path, encoding="utf-8") as f:
            first_line = f.readline()
        if first_line and "_synthetic" not in first_line:
            # Count lines with a context manager so the handle is
            # closed promptly (previously leaked an open file object).
            with open(output_path, encoding="utf-8") as f:
                line_count = sum(1 for _ in f)
            print(f"  βœ“ Already have {line_count} REAL dialogues at {output_path} (skipping)")
            return output_path

    # Sample at staggered offsets to get diverse categories/intents
    # Dataset has ~27k rows, categories include ORDER, ACCOUNT, PAYMENT,
    # DELIVERY, REFUND, FEEDBACK, CONTACT, INVOICE, CANCELLATION, etc.
    sample_offsets = [0, 1000, 2000, 3000, 4000, 5000, 7000, 9000,
                      11000, 13000, 15000, 17000, 19000, 21000, 24000]
    rows_per_offset = max(1, max_rows // len(sample_offsets))

    print(f"Downloading Bitext dialogues (max {max_rows} rows, sparse sampling)...")
    try:
        all_rows = []
        seen_intents = set()

        for offset in sample_offsets:
            if len(all_rows) >= max_rows:
                break
            batch = _fetch_hf_rows(
                dataset="bitext/Bitext-customer-support-llm-chatbot-training-dataset",
                config="default",
                split="train",
                offset=offset,
                length=rows_per_offset + 5,  # fetch a few extra to deduplicate
            )
            for row in batch:
                intent = row.get("intent", "")
                # Take at most one example per intent to maximize diversity
                if intent not in seen_intents and len(all_rows) < max_rows:
                    all_rows.append(row)
                    seen_intents.add(intent)

        if not all_rows:
            raise ValueError("API returned 0 rows")

        count = 0
        with open(output_path, "w", encoding="utf-8") as f:
            for row in all_rows:
                f.write(json.dumps(row, ensure_ascii=False) + "\n")
                count += 1

        print(f"  βœ“ Saved {count} REAL dialogues to {output_path}")
        print(f"  Unique intents: {len(seen_intents)}")
        categories = {}
        for row in all_rows:
            cat = row.get("category", "unknown")
            categories[cat] = categories.get(cat, 0) + 1
        print(f"  Categories: {dict(sorted(categories.items(), key=lambda x: -x[1]))}")
        return output_path

    except (urllib.error.URLError, urllib.error.HTTPError,
            TimeoutError, ValueError) as e:
        print(f"  βœ— Download failed: {e}")
        print("  β†’ Creating synthetic fallback conversations...")
        return _create_synthetic_conversations(
            RAW_DIR / "samsum_conversations.jsonl",
            max_rows=min(max_rows, 20),
        )


# ---------------------------------------------------------------------------
# Synthetic fallback (used only when API is unreachable)
# ---------------------------------------------------------------------------

def _create_synthetic_tickets(path: Path, max_rows: int = 30) -> Path:
    """Create synthetic support tickets as fallback.

    LABELED: all fields are synthetic. Used only when real data download fails.
    """
    import random
    random.seed(42)

    categories = [
        ("billing", "I was charged twice for my subscription this month. Please fix this immediately."),
        ("billing", "My invoice shows an incorrect amount. I should be on the $29/month plan but was charged $49."),
        ("billing", "I cancelled my service last month but I'm still being billed. This is unacceptable."),
        ("network", "My internet has been down for 3 days. I work from home and this is critical."),
        ("network", "The connection keeps dropping every 30 minutes. I've already restarted the router multiple times."),
        ("network", "Extremely slow speeds. I'm paying for 100Mbps but only getting 5Mbps."),
        ("account", "I can't log into my account. Password reset isn't working either."),
        ("account", "Please update my address and phone number on file."),
        ("account", "I want to upgrade my plan to premium. Can you help with that?"),
        ("service", "The technician didn't show up for my scheduled appointment today."),
        ("service", "I've been on hold for 45 minutes trying to reach support. This is terrible customer service."),
        ("service", "Your mobile app crashes every time I try to check my usage."),
        ("product", "The equipment you sent is defective. The power light keeps blinking red."),
        ("product", "I need a replacement remote control. Mine stopped working."),
        ("security", "I received a suspicious email claiming to be from your company asking for my password."),
        ("security", "Someone made unauthorized changes to my account. I think my account was compromised."),
        ("billing", "Why was I charged an early termination fee? I completed my contract period."),
        ("network", "No service in my area since the storm last week. When will it be restored?"),
        ("service", "Your automated system keeps disconnecting my calls before I can speak to anyone."),
        ("product", "The new modem you sent doesn't support my existing setup. I need a compatible one."),
        ("billing", "I was promised a promotional rate of $19.99 but my bill shows $39.99."),
        ("network", "WiFi doesn't reach my home office. Signal is very weak upstairs."),
        ("account", "I'm moving to a new address next month. How do I transfer my service?"),
        ("service", "The online chat support gave me wrong information and now my service is disrupted."),
        ("product", "Battery on the provided router dies after 2 hours. Need replacement."),
        ("billing", "I've been paying for premium channels I never ordered. Want a refund for the past 3 months."),
        ("network", "Complete outage in the downtown area. Multiple neighbors affected too."),
        ("security", "I noticed unknown devices connected to my account. Please secure it immediately."),
        ("service", "Scheduled maintenance was supposed to be overnight but it extended into business hours."),
        ("account", "I want to cancel my service effective end of this month. Please confirm."),
    ]

    priorities = ["low", "medium", "medium", "high", "critical"]
    answers = [
        "We apologize for the inconvenience. Our team is looking into this issue.",
        "Thank you for reaching out. We've escalated this to our technical team.",
        "We understand your frustration. A credit has been applied to your account.",
        "Our technician will visit your location within 24-48 hours.",
        "We've updated your account as requested. Changes will take effect immediately.",
    ]

    count = 0
    with open(path, "w", encoding="utf-8") as f:
        for i, (category, text) in enumerate(categories[:max_rows]):
            row = {
                "subject": f"Issue with {category} - Ticket #{i+1000}",
                "body": text,
                "answer": random.choice(answers),
                "priority": random.choice(priorities),
                "queue": category,
                "type": "complaint" if "unacceptable" in text.lower() or "terrible" in text.lower() else "inquiry",
                "language": "en",
                "_synthetic": True,  # Explicit label
            }
            f.write(json.dumps(row, ensure_ascii=False) + "\n")
            count += 1

    print(f"  Created {count} SYNTHETIC tickets at {path}")
    return path


def _create_synthetic_conversations(path: Path, max_rows: int = 20) -> Path:
    """Create synthetic conversations as fallback.

    LABELED: all fields are synthetic. Used only when real data download fails.
    """
    conversations = [
        {
            "dialogue": "Customer: Hi, my internet is not working.\nAgent: I'm sorry to hear that. Let me check your connection status.\nCustomer: It's been down since yesterday morning.\nAgent: I can see there's an outage in your area. Our team is working on it.\nCustomer: When will it be fixed?\nAgent: We expect it to be resolved within 24 hours.",
            "summary": "Customer reports internet outage since yesterday. Agent confirms area outage and estimates 24-hour resolution.",
        },
        {
            "dialogue": "Customer: I want to dispute a charge on my bill.\nAgent: I'd be happy to help. Which charge are you referring to?\nCustomer: There's a $15 fee labeled 'service adjustment' that I don't recognize.\nAgent: Let me look into that. It appears this was an error. I'll remove it.\nCustomer: Thank you. How long until I see the credit?\nAgent: The credit will appear on your next billing cycle.",
            "summary": "Customer disputes unknown $15 service adjustment fee. Agent identifies it as an error and applies credit for next billing cycle.",
        },
        {
            "dialogue": "Customer: I'm extremely frustrated. This is the third time I'm calling about the same issue.\nAgent: I sincerely apologize. Let me review your case history.\nCustomer: Every time I call, I get a different answer. Nobody seems to know what's going on.\nAgent: I understand your frustration. I'm going to escalate this to our senior team.\nCustomer: I want this resolved today or I'm switching providers.\nAgent: I've marked this as urgent. A supervisor will call you within 2 hours.",
            "summary": "Frustrated repeat caller threatens to switch providers. Agent escalates to supervisor with 2-hour callback commitment.",
        },
        {
            "dialogue": "Customer: Can I upgrade my plan without extending my contract?\nAgent: Yes, you can upgrade anytime. Would you like to see the available options?\nCustomer: What's the price difference for the premium tier?\nAgent: The premium tier is $20 more per month and includes additional features.\nCustomer: OK, let me think about it.\nAgent: No problem. I'll send you a comparison email.",
            "summary": "Customer inquires about plan upgrade pricing. Agent explains $20/month premium tier difference and will send comparison email.",
        },
        {
            "dialogue": "Customer: Someone accessed my account without permission.\nAgent: This is very concerning. Let me secure your account immediately.\nCustomer: I noticed charges I didn't make.\nAgent: I've temporarily locked your account. We'll need to verify your identity.\nCustomer: What charges were made?\nAgent: There are three unauthorized transactions totaling $127. We'll investigate and reverse them.",
            "summary": "Customer reports unauthorized account access with $127 in fraudulent charges. Agent locks account and initiates investigation to reverse transactions.",
        },
    ]

    count = 0
    with open(path, "w", encoding="utf-8") as f:
        for conv in conversations[:max_rows]:
            conv["_synthetic"] = True
            f.write(json.dumps(conv, ensure_ascii=False) + "\n")
            count += 1

    print(f"  Created {count} SYNTHETIC conversations at {path}")
    return path


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    # Hoist the divider so the banner width is defined in one place.
    banner = "=" * 60
    print(banner)
    print("Data Ingestion β€” downloading public datasets")
    print(banner)
    print()
    ingest_support_tickets(max_rows=200)
    print()
    ingest_bitext_dialogues(max_rows=100)
    print()
    print(banner)
    print("Done. Raw data saved to data/raw/")
    print("Next step: python scripts/build_cases.py")
    print(banner)