s23deepak
/

grandgemma-eval

Model card Files Files and versions

xet

Community

s23deepak committed on May 12

Commit

750e870

verified ·

1 Parent(s): 633ce01

Add assemble_corpus.py with combined text+audio multimodal support for Unsloth

Browse files

Files changed (1) hide show

assemble_corpus.py +339 -175

assemble_corpus.py CHANGED Viewed

@@ -1,33 +1,48 @@
 #!/usr/bin/env python3
 """
-assemble_corpus.py — Compile scam detection datasets into one unified corpus.
 ═══════════════════════════════════════════════════════════════════════════
-PHASE 1 (Text SFT) — this script handles:
-  1. BothBosu/scam-dialogue           — phone call transcripts (EN)
-  2. BothBosu/multi-agent-scam-conversation — phone call transcripts (EN)
-  3. ealvaradob/phishing-dataset      — email/SMS phishing texts (EN)
-PHASE 2 (Audio fine-tuning) — NOT handled here, use separately:
-  4. JimmyMa99/TeleAntiFraud          — labeled .mp3 files of fraud calls (ZH)
-     The audio_path field points to actual call recordings.
-     The instruction/label fields are just prompts, NOT content.
-═══════════════════════════════════════════════════════════════════════════
-Output schema:
-  ┌────────────────┬─────────┬──────────────────────────────────────────────────┐
-  │ Column         │ Type    │ Purpose                                          │
-  ├────────────────┼─────────┼──────────────────────────────────────────────────┤
-  │ text           │ string  │ The message/transcript content                   │
-  │ category       │ string  │ "scam" or "not_scam" — normalized label          │
-  │ source_id      │ string  │ Dataset identifier                               │
-  │ source_license │ string  │ License for compliance                           │
-  │ pii_redacted   │ boolean │ Whether PII regex was applied                    │
-  └────────────────┴─────────┴──────────────────────────────────────────────────┘
 USAGE:
-  python assemble_corpus.py
-  python assemble_corpus.py --output_dir ./scam_corpus --push_to_hub s23deepak/scambench
 REQUIREMENTS:
   pip install datasets huggingface_hub scikit-learn
@@ -36,6 +51,7 @@ import argparse
 import json
 import re
 import hashlib
 from pathlib import Path
 from collections import Counter
@@ -43,20 +59,40 @@ from datasets import load_dataset, Dataset, DatasetDict
 from sklearn.model_selection import train_test_split
 # ═══════════════════════════════════════════════════════════════════════
 # PII REDACTION
 # ═══════════════════════════════════════════════════════════════════════
 PII_PATTERNS = [
-    (r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '[PHONE]'),           # US phone
-    (r'\b\d{10,11}\b', '[PHONE]'),                             # Generic phone
     (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '[EMAIL]'),
-    (r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]'),                      # SSN
-    (r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b', '[CARD]'),  # Credit card
-    (r'\b(?:\d{1,3}\.){3}\d{1,3}\b', '[IP]'),                 # IP address
 ]
 def redact_pii(text: str) -> tuple[str, bool]:
-    """Apply PII regex patterns. Returns (redacted_text, was_redacted)."""
     redacted = False
     for pattern, replacement in PII_PATTERNS:
         new_text = re.sub(pattern, replacement, text)
@@ -67,67 +103,205 @@ def redact_pii(text: str) -> tuple[str, bool]:
 # ═══════════════════════════════════════════════════════════════════════
-# SOURCE LOADERS (Phase 1 — text only)
 # ═══════════════════════════════════════════════════════════════════════
 def load_scam_dialogue() -> list[dict]:
-    """BothBosu/scam-dialogue — 1,280 labeled phone call transcripts."""
-    print("  [1/3] Loading BothBosu/scam-dialogue …")
     ds = load_dataset("BothBosu/scam-dialogue", split="train")
     rows = []
     for r in ds:
-        text, pii = redact_pii(r["dialogue"])
-        rows.append({
-            "text": text,
-            "category": "scam" if r["label"] == 1 else "not_scam",
-            "source_id": "BothBosu/scam-dialogue",
-            "source_license": "unknown",
-            "pii_redacted": pii,
-        })
     print(f"       → {len(rows)} rows")
     return rows
 def load_multi_agent_scam() -> list[dict]:
-    """BothBosu/multi-agent-scam-conversation — multi-personality phone scams."""
-    print("  [2/3] Loading BothBosu/multi-agent-scam-conversation …")
     ds = load_dataset("BothBosu/multi-agent-scam-conversation", split="train")
     rows = []
     for r in ds:
-        text, pii = redact_pii(r["dialogue"])
-        rows.append({
-            "text": text,
-            "category": "scam" if r["labels"] == 1 else "not_scam",
-            "source_id": "BothBosu/multi-agent-scam-conversation",
-            "source_license": "unknown",
-            "pii_redacted": pii,
-        })
     print(f"       → {len(rows)} rows")
     return rows
-def load_phishing_dataset() -> list[dict]:
-    """ealvaradob/phishing-dataset — 20K email/SMS phishing texts."""
-    print("  [3/3] Loading ealvaradob/phishing-dataset (texts.json) …")
-    from huggingface_hub import hf_hub_download
-    path = hf_hub_download("ealvaradob/phishing-dataset", "texts.json", repo_type="dataset")
-    with open(path) as f:
-        data = json.load(f)
     rows = []
-    for r in data:
-        text = r.get("text", "")
-        if not text or len(text.strip()) < 20:
-            continue  # Skip empty/trivial rows
-        text, pii = redact_pii(text)
-        rows.append({
-            "text": text,
-            "category": "scam" if r["label"] == 1 else "not_scam",
-            "source_id": "ealvaradob/phishing-dataset",
-            "source_license": "apache-2.0",
-            "pii_redacted": pii,
-        })
-    print(f"       → {len(rows)} rows")
     return rows
@@ -135,155 +309,145 @@ def load_phishing_dataset() -> list[dict]:
 # ASSEMBLY
 # ═══════════════════════════════════════════════════════════════════════
-def deduplicate(rows: list[dict]) -> list[dict]:
-    """Remove exact-text duplicates."""
-    seen = set()
-    unique = []
-    for r in rows:
-        h = hashlib.md5(r["text"].encode()).hexdigest()
-        if h not in seen:
-            seen.add(h)
-            unique.append(r)
-    removed = len(rows) - len(unique)
-    if removed:
-        print(f"  Deduplication: removed {removed} exact duplicates")
-    return unique
-def print_stats(rows: list[dict], name: str = "Corpus"):
     """Print corpus statistics."""
-    cats = Counter(r["category"] for r in rows)
-    sources = Counter(r["source_id"] for r in rows)
-    pii_count = sum(1 for r in rows if r["pii_redacted"])
     print(f"\n{'='*60}")
-    print(f"{name} Statistics")
     print(f"{'='*60}")
-    print(f"  Total rows: {len(rows)}")
-    print(f"  Categories: {dict(cats)}")
-    if cats.get("not_scam", 0) > 0:
-        print(f"  Balance: {cats.get('scam',0)}/{cats.get('not_scam',0)} "
-              f"= {cats.get('scam',0)/cats['not_scam']:.2f} scam:legit ratio")
-    print(f"  PII redacted: {pii_count} rows ({100*pii_count/max(len(rows),1):.1f}%)")
-    print(f"  Sources:")
-    for src, count in sources.most_common():
-        print(f"    {src}: {count}")
     print(f"{'='*60}\n")
 def main():
-    parser = argparse.ArgumentParser(description="Assemble ScamBench corpus (Phase 1 — text)")
     parser.add_argument("--output_dir", default="./scam_corpus")
     parser.add_argument("--push_to_hub", default=None,
                         help="HF dataset repo, e.g. s23deepak/scambench")
     parser.add_argument("--held_out_ratio", type=float, default=0.10)
     parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--max_phishing", type=int, default=5000,
-                        help="Cap phishing rows to prevent dominating corpus")
     args = parser.parse_args()
     print("=" * 60)
-    print("ASSEMBLING SCAMBENCH CORPUS (Phase 1 — Text)")
     print("=" * 60)
-    print()
-    print("NOTE: JimmyMa99/TeleAntiFraud is EXCLUDED from this pipeline.")
-    print("      It contains .mp3 audio files, not text transcripts.")
-    print("      Use it in Phase 2 (audio multimodal fine-tuning).")
-    print()
-    # ── Load all text sources ─────────────────────────────────────────
     all_rows = []
     all_rows.extend(load_scam_dialogue())
     all_rows.extend(load_multi_agent_scam())
-    phishing_rows = load_phishing_dataset()
-    if len(phishing_rows) > args.max_phishing:
-        import random
-        random.seed(args.seed)
-        phishing_rows = random.sample(phishing_rows, args.max_phishing)
-        print(f"       (capped to {args.max_phishing} to prevent dominating corpus)")
-    all_rows.extend(phishing_rows)
-    # ── Deduplicate ───────────────────────────────────────────────────
-    all_rows = deduplicate(all_rows)
-    # ── Stats ─────────────────────────────────────────────────────────
-    print_stats(all_rows, "Full Corpus (before split)")
     # ── Stratified split ──────────────────────────────────────────────
-    labels = [r["category"] for r in all_rows]
     train_rows, held_out_rows = train_test_split(
         all_rows, test_size=args.held_out_ratio,
         stratify=labels, random_state=args.seed
     )
-    print(f"Stratified split (seed={args.seed}): "
-          f"train={len(train_rows)} | held_out={len(held_out_rows)}")
     print_stats(train_rows, "Train Split")
-    print_stats(held_out_rows, "Held-Out Split (NEVER use for training)")
-    # ── Save as Parquet + JSONL ───────────────────────────────────────
     out_dir = Path(args.output_dir)
     out_dir.mkdir(parents=True, exist_ok=True)
-    train_ds = Dataset.from_list(train_rows)
-    held_out_ds = Dataset.from_list(held_out_rows)
-    corpus = DatasetDict({"train": train_ds, "held_out": held_out_ds})
-    # Save Parquet (HF-native)
-    corpus.save_to_disk(str(out_dir / "parquet"))
-    print(f"✓ Saved Parquet → {out_dir / 'parquet'}/")
-    # Save JSONL (portable fallback)
     for split_name, split_rows in [("train", train_rows), ("held_out", held_out_rows)]:
         jsonl_path = out_dir / f"{split_name}.jsonl"
         with open(jsonl_path, "w") as f:
             for r in split_rows:
                 f.write(json.dumps(r, ensure_ascii=False) + "\n")
-    print(f"✓ Saved JSONL → {out_dir}/train.jsonl, held_out.jsonl")
-    # PII audit trail
-    audit = {
-        "total_rows": len(all_rows),
-        "pii_redacted_count": sum(1 for r in all_rows if r["pii_redacted"]),
-        "patterns_applied": [p[1] for p in PII_PATTERNS],
-        "seed": args.seed,
-        "held_out_ratio": args.held_out_ratio,
-        "sources": dict(Counter(r["source_id"] for r in all_rows)),
-    }
-    (out_dir / "pii-audit.json").write_text(json.dumps(audit, indent=2))
-    print(f"✓ PII audit → {out_dir}/pii-audit.json")
-    # ── Push to Hub ───────────────────────────────────────────────────
     if args.push_to_hub:
         print(f"\nPushing to https://huggingface.co/datasets/{args.push_to_hub} …")
         corpus.push_to_hub(args.push_to_hub, private=False)
         print(f"✓ Pushed!")
-    # ── Print training format example ─────────────────────────────────
-    print(f"\n{'='*60}")
-    print("TRAINING FORMAT EXAMPLE")
-    print("(Use this in your training script's format_example function)")
-    print(f"{'='*60}")
-    sample = train_rows[0]
-    label = "SCAM" if sample["category"] == "scam" else "NOT_SCAM"
-    formatted = (
-        "Classify the following message as SCAM or NOT_SCAM. "
-        "Consider urgency, payment requests, impersonation, and remote-access patterns.\n\n"
-        f"Message: {sample['text'][:300]}...\n\n"
-        f"Classification: {label}"
     )
-    print(formatted)
-    print(f"\n{'='*60}")
-    print("PHASE 2 REMINDER")
-    print(f"{'='*60}")
-    print("""
-For audio fine-tuning (Phase 2), use JimmyMa99/TeleAntiFraud separately:
-  - audio_path: path to .mp3 files of actual phone calls
-  - label: "fraud" or "normal"
-  - Feed audio directly to Gemma 4's audio encoder
-  - No text transcription needed — model processes raw audio
 """)

 #!/usr/bin/env python3
 """
+assemble_corpus.py — Build a unified multimodal scam detection corpus.
+Supports TWO modes:
+  --mode text     → Phase 1: text-only SFT (works on 8GB VRAM)
+  --mode combined → Phase 1+2: text AND audio in one dataset (needs 16GB+ VRAM)
 ═══════════════════════════════════════════════════════════════════════════
+TEXT SOURCES:
+  1. BothBosu/scam-dialogue              — phone transcripts (EN, 1280 rows)
+  2. BothBosu/multi-agent-scam-conversation — phone transcripts (EN)
+  3. BothBosu/single-agent-scam-conversations — phone transcripts (EN)
+  4. ealvaradob/phishing-dataset         — email/SMS (EN, 20K rows)
+  5. shakeleoatmeal/phone-scam-detection-synthetic — phone calls (EN, 1800)
+  6. FredZhang7/all-scam-spam            — SMS/email multilingual (42K)
+AUDIO SOURCE:
+  7. JimmyMa99/TeleAntiFraud             — .mp3 phone call recordings (ZH, 11.9GB)
+═══════════════════════════════════════════════════════════════════════════
+OUTPUT FORMAT (compatible with Unsloth multimodal SFT):
+  Text example:
+    {"messages": [
+        {"role": "user", "content": [{"type": "text", "text": "Classify...\\n\\nMessage: ..."}]},
+        {"role": "assistant", "content": [{"type": "text", "text": "SCAM"}]}
+    ]}
+  Audio example:
+    {"messages": [
+        {"role": "user", "content": [
+            {"type": "audio", "audio_url": "audio/NEG-imitate-12/tts_test3037.mp3"},
+            {"type": "text", "text": "Listen to this phone call and classify it as SCAM or NOT_SCAM."}
+        ]},
+        {"role": "assistant", "content": [{"type": "text", "text": "SCAM"}]}
+    ]}
+═══════════════════════════════════════════════════════════════════════════
 USAGE:
+  # Text only (Phase 1)
+  python assemble_corpus.py --mode text --push_to_hub s23deepak/scambench
+  # Combined text + audio (Phase 1+2)
+  python assemble_corpus.py --mode combined --audio_dir ./audio --push_to_hub s23deepak/scambench-multimodal
 REQUIREMENTS:
   pip install datasets huggingface_hub scikit-learn
 import json
 import re
 import hashlib
+import random
 from pathlib import Path
 from collections import Counter
 from sklearn.model_selection import train_test_split
+# ═══════════════════════════════════════════════════════════════════════
+# CONFIG
+# ═══════════════════════════════════════════════════════════════════════
+SEED = 42
+random.seed(SEED)
+SYSTEM_PROMPT = (
+    "You are a phone scam detection expert. "
+    "Analyze the content and classify it as SCAM or NOT_SCAM."
+)
+TEXT_PROMPT = (
+    "Classify the following message as SCAM or NOT_SCAM. "
+    "Consider urgency, payment requests, impersonation, and remote-access patterns.\n\n"
+    "Message: {text}\n\n"
+    "Classification:"
+)
+AUDIO_PROMPT = "Listen to this phone call and classify it as SCAM or NOT_SCAM."
 # ═══════════════════════════════════════════════════════════════════════
 # PII REDACTION
 # ═══════════════════════════════════════════════════════════════════════
 PII_PATTERNS = [
+    (r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '[PHONE]'),
+    (r'\b\d{10,11}\b', '[PHONE]'),
     (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '[EMAIL]'),
+    (r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]'),
+    (r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b', '[CARD]'),
+    (r'\b(?:\d{1,3}\.){3}\d{1,3}\b', '[IP]'),
 ]
 def redact_pii(text: str) -> tuple[str, bool]:
     redacted = False
     for pattern, replacement in PII_PATTERNS:
         new_text = re.sub(pattern, replacement, text)
 # ═══════════════════════════════════════════════════════════════════════
+# FORMAT CONVERTERS
+# ═══════════════════════════════════════════════════════════════════════
+def to_text_message(text: str, label: str) -> dict:
+    """Convert text + label to Unsloth multimodal message format."""
+    return {
+        "messages": [
+            {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
+            {"role": "user", "content": [
+                {"type": "text", "text": TEXT_PROMPT.format(text=text)}
+            ]},
+            {"role": "assistant", "content": [
+                {"type": "text", "text": label}
+            ]},
+        ]
+    }
+def to_audio_message(audio_path: str, label: str) -> dict:
+    """Convert audio path + label to Unsloth multimodal message format."""
+    return {
+        "messages": [
+            {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
+            {"role": "user", "content": [
+                {"type": "audio", "audio_url": audio_path},
+                {"type": "text", "text": AUDIO_PROMPT},
+            ]},
+            {"role": "assistant", "content": [
+                {"type": "text", "text": label}
+            ]},
+        ]
+    }
+# ═══════════════════════════════════════════════════════════════════════
+# TEXT SOURCE LOADERS
 # ═══════════════════════════════════════════════════════════════════════
 def load_scam_dialogue() -> list[dict]:
+    """BothBosu/scam-dialogue"""
+    print("  [1/6] BothBosu/scam-dialogue …")
     ds = load_dataset("BothBosu/scam-dialogue", split="train")
     rows = []
     for r in ds:
+        text, _ = redact_pii(r["dialogue"])
+        label = "SCAM" if r["label"] == 1 else "NOT_SCAM"
+        rows.append(to_text_message(text, label))
     print(f"       → {len(rows)} rows")
     return rows
 def load_multi_agent_scam() -> list[dict]:
+    """BothBosu/multi-agent-scam-conversation"""
+    print("  [2/6] BothBosu/multi-agent-scam-conversation …")
     ds = load_dataset("BothBosu/multi-agent-scam-conversation", split="train")
     rows = []
     for r in ds:
+        text, _ = redact_pii(r["dialogue"])
+        label = "SCAM" if r["labels"] == 1 else "NOT_SCAM"
+        rows.append(to_text_message(text, label))
     print(f"       → {len(rows)} rows")
     return rows
+def load_single_agent_scam() -> list[dict]:
+    """BothBosu/single-agent-scam-conversations"""
+    print("  [3/6] BothBosu/single-agent-scam-conversations …")
+    try:
+        ds = load_dataset("BothBosu/single-agent-scam-conversations", split="train")
+        rows = []
+        for r in ds:
+            text, _ = redact_pii(r.get("dialogue", r.get("conversation", "")))
+            label_raw = r.get("labels", r.get("label", 0))
+            label = "SCAM" if label_raw == 1 else "NOT_SCAM"
+            rows.append(to_text_message(text, label))
+        print(f"       → {len(rows)} rows")
+        return rows
+    except Exception as e:
+        print(f"       ⚠ Skipped: {e}")
+        return []
+def load_phone_scam_synthetic() -> list[dict]:
+    """shakeleoatmeal/phone-scam-detection-synthetic"""
+    print("  [4/6] shakeleoatmeal/phone-scam-detection-synthetic …")
+    try:
+        ds = load_dataset("shakeleoatmeal/phone-scam-detection-synthetic", split="train")
+        rows = []
+        for r in ds:
+            # Check column names
+            text = r.get("dialogue", r.get("text", r.get("conversation", "")))
+            if not text:
+                continue
+            text, _ = redact_pii(text)
+            label_raw = r.get("label", r.get("labels", r.get("is_fraud", 0)))
+            if isinstance(label_raw, str):
+                label = "SCAM" if label_raw.lower() in ("fraud", "scam", "1") else "NOT_SCAM"
+            else:
+                label = "SCAM" if label_raw == 1 else "NOT_SCAM"
+            rows.append(to_text_message(text, label))
+        print(f"       → {len(rows)} rows")
+        return rows
+    except Exception as e:
+        print(f"       ⚠ Skipped: {e}")
+        return []
+def load_phishing_dataset(max_rows: int = 5000) -> list[dict]:
+    """ealvaradob/phishing-dataset (texts.json)"""
+    print("  [5/6] ealvaradob/phishing-dataset …")
+    try:
+        from huggingface_hub import hf_hub_download
+        path = hf_hub_download("ealvaradob/phishing-dataset", "texts.json", repo_type="dataset")
+        with open(path) as f:
+            data = json.load(f)
+        rows = []
+        for r in data:
+            text = r.get("text", "")
+            if not text or len(text.strip()) < 20:
+                continue
+            text, _ = redact_pii(text)
+            label = "SCAM" if r["label"] == 1 else "NOT_SCAM"
+            rows.append(to_text_message(text, label))
+        if len(rows) > max_rows:
+            rows = random.sample(rows, max_rows)
+            print(f"       (capped to {max_rows})")
+        print(f"       → {len(rows)} rows")
+        return rows
+    except Exception as e:
+        print(f"       ⚠ Skipped: {e}")
+        return []
+def load_all_scam_spam(max_rows: int = 5000) -> list[dict]:
+    """FredZhang7/all-scam-spam"""
+    print("  [6/6] FredZhang7/all-scam-spam …")
+    try:
+        ds = load_dataset("FredZhang7/all-scam-spam", split="train")
+        rows = []
+        for r in ds:
+            text = r.get("text", "")
+            if not text or len(text.strip()) < 20:
+                continue
+            text, _ = redact_pii(text)
+            label = "SCAM" if r.get("is_spam", 0) == 1 else "NOT_SCAM"
+            rows.append(to_text_message(text, label))
+        if len(rows) > max_rows:
+            rows = random.sample(rows, max_rows)
+            print(f"       (capped to {max_rows})")
+        print(f"       → {len(rows)} rows")
+        return rows
+    except Exception as e:
+        print(f"       ⚠ Skipped: {e}")
+        return []
+# ═══════════════════════════════════════════════════════════════════════
+# AUDIO SOURCE LOADER
+# ═══════════════════════════════════════════════════════════════════════
+def load_teleanti_fraud_audio(audio_dir: str) -> list[dict]:
+    """
+    JimmyMa99/TeleAntiFraud — audio examples.
+    Prerequisites: Download and unzip audio.zip from the dataset repo:
+      huggingface-cli download JimmyMa99/TeleAntiFraud audio.zip --repo-type dataset
+      unzip audio.zip -d ./audio
+    Then pass --audio_dir ./audio
+    """
+    print("  [AUDIO] JimmyMa99/TeleAntiFraud …")
+    ds = load_dataset("JimmyMa99/TeleAntiFraud", split="train")
+    audio_path = Path(audio_dir)
+    if not audio_path.exists():
+        print(f"       ⚠ Audio dir '{audio_dir}' not found!")
+        print(f"       Download with: huggingface-cli download JimmyMa99/TeleAntiFraud audio.zip --repo-type dataset")
+        print(f"       Then: unzip audio.zip -d {audio_dir}")
+        return []
     rows = []
+    missing = 0
+    for r in ds:
+        rel_path = r["audio_path"]  # e.g. "audio/POS-imitate-4/tts_test1139/tts_test1139.mp3"
+        # Try to find the file
+        full_path = audio_path / rel_path
+        if not full_path.exists():
+            # Try without "audio/" prefix
+            full_path = audio_path / rel_path.replace("audio/", "", 1)
+        if not full_path.exists():
+            missing += 1
+            continue
+        label = "SCAM" if r["label"] == "fraud" else "NOT_SCAM"
+        rows.append(to_audio_message(str(full_path), label))
+    if missing:
+        print(f"       ⚠ {missing} audio files not found (check --audio_dir path)")
+    print(f"       → {len(rows)} audio rows")
     return rows
 # ASSEMBLY
 # ═══════════════════════════════════════════════════════════════════════
+def print_stats(rows: list[dict], name: str):
     """Print corpus statistics."""
+    labels = []
+    modalities = Counter()
+    for r in rows:
+        # Extract label from assistant message
+        assistant_content = r["messages"][-1]["content"]
+        if isinstance(assistant_content, list):
+            label = assistant_content[0]["text"]
+        else:
+            label = assistant_content
+        labels.append(label)
+        # Check modality
+        user_content = r["messages"][1]["content"]
+        has_audio = any(c.get("type") == "audio" for c in user_content)
+        modalities["audio" if has_audio else "text"] += 1
+    cats = Counter(labels)
     print(f"\n{'='*60}")
+    print(f"{name}")
     print(f"{'='*60}")
+    print(f"  Total: {len(rows)}")
+    print(f"  Labels: {dict(cats)}")
+    print(f"  Modalities: {dict(modalities)}")
+    if cats.get("NOT_SCAM", 0) > 0:
+        print(f"  Balance: {cats.get('SCAM',0)}:{cats.get('NOT_SCAM',0)} "
+              f"({cats.get('SCAM',0)/cats['NOT_SCAM']:.2f} ratio)")
     print(f"{'='*60}\n")
 def main():
+    parser = argparse.ArgumentParser(description="Assemble ScamBench corpus")
+    parser.add_argument("--mode", choices=["text", "combined"], default="text",
+                        help="'text' = Phase 1 only, 'combined' = text + audio")
     parser.add_argument("--output_dir", default="./scam_corpus")
     parser.add_argument("--push_to_hub", default=None,
                         help="HF dataset repo, e.g. s23deepak/scambench")
+    parser.add_argument("--audio_dir", default="./audio",
+                        help="Path to extracted TeleAntiFraud audio files")
     parser.add_argument("--held_out_ratio", type=float, default=0.10)
+    parser.add_argument("--max_phishing", type=int, default=5000)
+    parser.add_argument("--max_spam", type=int, default=5000)
     parser.add_argument("--seed", type=int, default=42)
     args = parser.parse_args()
+    random.seed(args.seed)
     print("=" * 60)
+    print(f"ASSEMBLING SCAMBENCH CORPUS — mode={args.mode}")
     print("=" * 60)
+    # ── Load text sources ─────────────────────────────────────────────
+    print("\n📝 Loading TEXT sources …")
     all_rows = []
     all_rows.extend(load_scam_dialogue())
     all_rows.extend(load_multi_agent_scam())
+    all_rows.extend(load_single_agent_scam())
+    all_rows.extend(load_phone_scam_synthetic())
+    all_rows.extend(load_phishing_dataset(max_rows=args.max_phishing))
+    all_rows.extend(load_all_scam_spam(max_rows=args.max_spam))
+    # ── Load audio sources (combined mode only) ───────────────────────
+    if args.mode == "combined":
+        print("\n🔊 Loading AUDIO sources …")
+        audio_rows = load_teleanti_fraud_audio(args.audio_dir)
+        all_rows.extend(audio_rows)
+    else:
+        print("\n   (Audio skipped — use --mode combined for multimodal)")
+    # ── Shuffle ───────────────────────────────────────────────────────
+    random.shuffle(all_rows)
+    print_stats(all_rows, "Full Corpus")
     # ── Stratified split ──────────────────────────────────────────────
+    labels = []
+    for r in all_rows:
+        assistant_content = r["messages"][-1]["content"]
+        if isinstance(assistant_content, list):
+            labels.append(assistant_content[0]["text"])
+        else:
+            labels.append(assistant_content)
     train_rows, held_out_rows = train_test_split(
         all_rows, test_size=args.held_out_ratio,
         stratify=labels, random_state=args.seed
     )
+    print(f"Split: train={len(train_rows)} | held_out={len(held_out_rows)}")
     print_stats(train_rows, "Train Split")
+    print_stats(held_out_rows, "Held-Out Split")
+    # ── Save ──────────────────────────────────────────────────────────
     out_dir = Path(args.output_dir)
     out_dir.mkdir(parents=True, exist_ok=True)
     for split_name, split_rows in [("train", train_rows), ("held_out", held_out_rows)]:
         jsonl_path = out_dir / f"{split_name}.jsonl"
         with open(jsonl_path, "w") as f:
             for r in split_rows:
                 f.write(json.dumps(r, ensure_ascii=False) + "\n")
+    print(f"✓ Saved → {out_dir}/train.jsonl, held_out.jsonl")
+    # Also save as HF Dataset
+    train_ds = Dataset.from_list(train_rows)
+    held_out_ds = Dataset.from_list(held_out_rows)
+    corpus = DatasetDict({"train": train_ds, "held_out": held_out_ds})
     if args.push_to_hub:
         print(f"\nPushing to https://huggingface.co/datasets/{args.push_to_hub} …")
         corpus.push_to_hub(args.push_to_hub, private=False)
         print(f"✓ Pushed!")
+    # ── Print usage ───────────────────────────────────────────────────
+    print(f"""
+{'='*60}
+DONE! To train with this corpus:
+{'='*60}
+# In your Unsloth training script:
+from datasets import load_dataset
+dataset = load_dataset("json", data_files="{out_dir}/train.jsonl", split="train")
+# For text-only mode, use SFTTrainer with dataset_text_field=""
+# For combined mode, use UnslothVisionDataCollator:
+from unsloth.trainer import UnslothVisionDataCollator
+trainer = SFTTrainer(
+    model=model,
+    train_dataset=dataset,
+    processing_class=processor.tokenizer,
+    data_collator=UnslothVisionDataCollator(model, processor),
+    args=SFTConfig(
+        dataset_text_field="",
+        dataset_kwargs={{"skip_prepare_dataset": True}},
+        max_length=8192,
+        ...
     )
+)
+{'='*60}
 """)