s23deepak committed on
Commit
750e870
·
verified ·
1 Parent(s): 633ce01

Add assemble_corpus.py with combined text+audio multimodal support for Unsloth

Browse files
Files changed (1) hide show
  1. assemble_corpus.py +339 -175
assemble_corpus.py CHANGED
@@ -1,33 +1,48 @@
1
  #!/usr/bin/env python3
2
  """
3
- assemble_corpus.py β€” Compile scam detection datasets into one unified corpus.
 
 
 
 
4
 
5
  ═══════════════════════════════════════════════════════════════════════════
6
- PHASE 1 (Text SFT) β€” this script handles:
7
- 1. BothBosu/scam-dialogue β€” phone call transcripts (EN)
8
- 2. BothBosu/multi-agent-scam-conversation β€” phone call transcripts (EN)
9
- 3. ealvaradob/phishing-dataset β€” email/SMS phishing texts (EN)
10
-
11
- PHASE 2 (Audio fine-tuning) β€” NOT handled here, use separately:
12
- 4. JimmyMa99/TeleAntiFraud β€” labeled .mp3 files of fraud calls (ZH)
13
- The audio_path field points to actual call recordings.
14
- The instruction/label fields are just prompts, NOT content.
15
- ═══════════════════════════════════════════════════════════════════════════
16
 
17
- Output schema:
18
- β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
19
- β”‚ Column β”‚ Type β”‚ Purpose β”‚
20
- β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
21
- β”‚ text β”‚ string β”‚ The message/transcript content β”‚
22
- β”‚ category β”‚ string β”‚ "scam" or "not_scam" β€” normalized label β”‚
23
- β”‚ source_id β”‚ string β”‚ Dataset identifier β”‚
24
- β”‚ source_license β”‚ string β”‚ License for compliance β”‚
25
- β”‚ pii_redacted β”‚ boolean β”‚ Whether PII regex was applied β”‚
26
- β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
 
 
 
 
 
 
 
 
 
 
27
 
 
28
  USAGE:
29
- python assemble_corpus.py
30
- python assemble_corpus.py --output_dir ./scam_corpus --push_to_hub s23deepak/scambench
 
 
 
31
 
32
  REQUIREMENTS:
33
  pip install datasets huggingface_hub scikit-learn
@@ -36,6 +51,7 @@ import argparse
36
  import json
37
  import re
38
  import hashlib
 
39
  from pathlib import Path
40
  from collections import Counter
41
 
@@ -43,20 +59,40 @@ from datasets import load_dataset, Dataset, DatasetDict
43
  from sklearn.model_selection import train_test_split
44
 
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  # ═══════════════════════════════════════════════════════════════════════
47
  # PII REDACTION
48
  # ═══════════════════════════════════════════════════════════════════════
49
  PII_PATTERNS = [
50
- (r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '[PHONE]'), # US phone
51
- (r'\b\d{10,11}\b', '[PHONE]'), # Generic phone
52
  (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '[EMAIL]'),
53
- (r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]'), # SSN
54
- (r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b', '[CARD]'), # Credit card
55
- (r'\b(?:\d{1,3}\.){3}\d{1,3}\b', '[IP]'), # IP address
56
  ]
57
 
58
  def redact_pii(text: str) -> tuple[str, bool]:
59
- """Apply PII regex patterns. Returns (redacted_text, was_redacted)."""
60
  redacted = False
61
  for pattern, replacement in PII_PATTERNS:
62
  new_text = re.sub(pattern, replacement, text)
@@ -67,67 +103,205 @@ def redact_pii(text: str) -> tuple[str, bool]:
67
 
68
 
69
  # ═══════════════════════════════════════════════════════════════════════
70
- # SOURCE LOADERS (Phase 1 β€” text only)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  # ═══════════════════════════════════════════════════════════════════════
72
 
73
  def load_scam_dialogue() -> list[dict]:
74
- """BothBosu/scam-dialogue β€” 1,280 labeled phone call transcripts."""
75
- print(" [1/3] Loading BothBosu/scam-dialogue …")
76
  ds = load_dataset("BothBosu/scam-dialogue", split="train")
77
  rows = []
78
  for r in ds:
79
- text, pii = redact_pii(r["dialogue"])
80
- rows.append({
81
- "text": text,
82
- "category": "scam" if r["label"] == 1 else "not_scam",
83
- "source_id": "BothBosu/scam-dialogue",
84
- "source_license": "unknown",
85
- "pii_redacted": pii,
86
- })
87
  print(f" β†’ {len(rows)} rows")
88
  return rows
89
 
90
 
91
  def load_multi_agent_scam() -> list[dict]:
92
- """BothBosu/multi-agent-scam-conversation β€” multi-personality phone scams."""
93
- print(" [2/3] Loading BothBosu/multi-agent-scam-conversation …")
94
  ds = load_dataset("BothBosu/multi-agent-scam-conversation", split="train")
95
  rows = []
96
  for r in ds:
97
- text, pii = redact_pii(r["dialogue"])
98
- rows.append({
99
- "text": text,
100
- "category": "scam" if r["labels"] == 1 else "not_scam",
101
- "source_id": "BothBosu/multi-agent-scam-conversation",
102
- "source_license": "unknown",
103
- "pii_redacted": pii,
104
- })
105
  print(f" β†’ {len(rows)} rows")
106
  return rows
107
 
108
 
109
- def load_phishing_dataset() -> list[dict]:
110
- """ealvaradob/phishing-dataset β€” 20K email/SMS phishing texts."""
111
- print(" [3/3] Loading ealvaradob/phishing-dataset (texts.json) …")
112
- from huggingface_hub import hf_hub_download
113
- path = hf_hub_download("ealvaradob/phishing-dataset", "texts.json", repo_type="dataset")
114
- with open(path) as f:
115
- data = json.load(f)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
  rows = []
118
- for r in data:
119
- text = r.get("text", "")
120
- if not text or len(text.strip()) < 20:
121
- continue # Skip empty/trivial rows
122
- text, pii = redact_pii(text)
123
- rows.append({
124
- "text": text,
125
- "category": "scam" if r["label"] == 1 else "not_scam",
126
- "source_id": "ealvaradob/phishing-dataset",
127
- "source_license": "apache-2.0",
128
- "pii_redacted": pii,
129
- })
130
- print(f" β†’ {len(rows)} rows")
 
 
 
 
 
131
  return rows
132
 
133
 
@@ -135,155 +309,145 @@ def load_phishing_dataset() -> list[dict]:
135
  # ASSEMBLY
136
  # ═══════════════════════════════════════════════════════════════════════
137
 
138
- def deduplicate(rows: list[dict]) -> list[dict]:
139
- """Remove exact-text duplicates."""
140
- seen = set()
141
- unique = []
142
- for r in rows:
143
- h = hashlib.md5(r["text"].encode()).hexdigest()
144
- if h not in seen:
145
- seen.add(h)
146
- unique.append(r)
147
- removed = len(rows) - len(unique)
148
- if removed:
149
- print(f" Deduplication: removed {removed} exact duplicates")
150
- return unique
151
-
152
-
153
- def print_stats(rows: list[dict], name: str = "Corpus"):
154
  """Print corpus statistics."""
155
- cats = Counter(r["category"] for r in rows)
156
- sources = Counter(r["source_id"] for r in rows)
157
- pii_count = sum(1 for r in rows if r["pii_redacted"])
 
 
 
 
 
 
 
 
 
 
 
 
158
 
 
159
  print(f"\n{'='*60}")
160
- print(f"{name} Statistics")
161
  print(f"{'='*60}")
162
- print(f" Total rows: {len(rows)}")
163
- print(f" Categories: {dict(cats)}")
164
- if cats.get("not_scam", 0) > 0:
165
- print(f" Balance: {cats.get('scam',0)}/{cats.get('not_scam',0)} "
166
- f"= {cats.get('scam',0)/cats['not_scam']:.2f} scam:legit ratio")
167
- print(f" PII redacted: {pii_count} rows ({100*pii_count/max(len(rows),1):.1f}%)")
168
- print(f" Sources:")
169
- for src, count in sources.most_common():
170
- print(f" {src}: {count}")
171
  print(f"{'='*60}\n")
172
 
173
 
174
  def main():
175
- parser = argparse.ArgumentParser(description="Assemble ScamBench corpus (Phase 1 β€” text)")
 
 
176
  parser.add_argument("--output_dir", default="./scam_corpus")
177
  parser.add_argument("--push_to_hub", default=None,
178
  help="HF dataset repo, e.g. s23deepak/scambench")
 
 
179
  parser.add_argument("--held_out_ratio", type=float, default=0.10)
 
 
180
  parser.add_argument("--seed", type=int, default=42)
181
- parser.add_argument("--max_phishing", type=int, default=5000,
182
- help="Cap phishing rows to prevent dominating corpus")
183
  args = parser.parse_args()
184
 
 
 
185
  print("=" * 60)
186
- print("ASSEMBLING SCAMBENCH CORPUS (Phase 1 β€” Text)")
187
  print("=" * 60)
188
- print()
189
- print("NOTE: JimmyMa99/TeleAntiFraud is EXCLUDED from this pipeline.")
190
- print(" It contains .mp3 audio files, not text transcripts.")
191
- print(" Use it in Phase 2 (audio multimodal fine-tuning).")
192
- print()
193
 
194
- # ── Load all text sources ─────────────────────────────────────────
 
195
  all_rows = []
196
-
197
  all_rows.extend(load_scam_dialogue())
198
  all_rows.extend(load_multi_agent_scam())
199
-
200
- phishing_rows = load_phishing_dataset()
201
- if len(phishing_rows) > args.max_phishing:
202
- import random
203
- random.seed(args.seed)
204
- phishing_rows = random.sample(phishing_rows, args.max_phishing)
205
- print(f" (capped to {args.max_phishing} to prevent dominating corpus)")
206
- all_rows.extend(phishing_rows)
207
-
208
- # ── Deduplicate ───────────────────────────────────────────────────
209
- all_rows = deduplicate(all_rows)
210
-
211
- # ── Stats ─────────────────────────────────────────────────────────
212
- print_stats(all_rows, "Full Corpus (before split)")
 
 
213
 
214
  # ── Stratified split ──────────────────────────────────────────────
215
- labels = [r["category"] for r in all_rows]
 
 
 
 
 
 
 
216
  train_rows, held_out_rows = train_test_split(
217
  all_rows, test_size=args.held_out_ratio,
218
  stratify=labels, random_state=args.seed
219
  )
220
- print(f"Stratified split (seed={args.seed}): "
221
- f"train={len(train_rows)} | held_out={len(held_out_rows)}")
222
  print_stats(train_rows, "Train Split")
223
- print_stats(held_out_rows, "Held-Out Split (NEVER use for training)")
224
 
225
- # ── Save as Parquet + JSONL ───────────────────────────────────────
226
  out_dir = Path(args.output_dir)
227
  out_dir.mkdir(parents=True, exist_ok=True)
228
 
229
- train_ds = Dataset.from_list(train_rows)
230
- held_out_ds = Dataset.from_list(held_out_rows)
231
- corpus = DatasetDict({"train": train_ds, "held_out": held_out_ds})
232
-
233
- # Save Parquet (HF-native)
234
- corpus.save_to_disk(str(out_dir / "parquet"))
235
- print(f"βœ“ Saved Parquet β†’ {out_dir / 'parquet'}/")
236
-
237
- # Save JSONL (portable fallback)
238
  for split_name, split_rows in [("train", train_rows), ("held_out", held_out_rows)]:
239
  jsonl_path = out_dir / f"{split_name}.jsonl"
240
  with open(jsonl_path, "w") as f:
241
  for r in split_rows:
242
  f.write(json.dumps(r, ensure_ascii=False) + "\n")
243
- print(f"βœ“ Saved JSONL β†’ {out_dir}/train.jsonl, held_out.jsonl")
244
-
245
- # PII audit trail
246
- audit = {
247
- "total_rows": len(all_rows),
248
- "pii_redacted_count": sum(1 for r in all_rows if r["pii_redacted"]),
249
- "patterns_applied": [p[1] for p in PII_PATTERNS],
250
- "seed": args.seed,
251
- "held_out_ratio": args.held_out_ratio,
252
- "sources": dict(Counter(r["source_id"] for r in all_rows)),
253
- }
254
- (out_dir / "pii-audit.json").write_text(json.dumps(audit, indent=2))
255
- print(f"βœ“ PII audit β†’ {out_dir}/pii-audit.json")
256
 
257
- # ── Push to Hub ───────────────────────────────────────────────────
258
  if args.push_to_hub:
259
  print(f"\nPushing to https://huggingface.co/datasets/{args.push_to_hub} …")
260
  corpus.push_to_hub(args.push_to_hub, private=False)
261
  print(f"βœ“ Pushed!")
262
 
263
- # ── Print training format example ─────────────────────────────────
264
- print(f"\n{'='*60}")
265
- print("TRAINING FORMAT EXAMPLE")
266
- print("(Use this in your training script's format_example function)")
267
- print(f"{'='*60}")
268
- sample = train_rows[0]
269
- label = "SCAM" if sample["category"] == "scam" else "NOT_SCAM"
270
- formatted = (
271
- "Classify the following message as SCAM or NOT_SCAM. "
272
- "Consider urgency, payment requests, impersonation, and remote-access patterns.\n\n"
273
- f"Message: {sample['text'][:300]}...\n\n"
274
- f"Classification: {label}"
 
 
 
 
 
 
 
 
 
 
 
 
275
  )
276
- print(formatted)
277
-
278
- print(f"\n{'='*60}")
279
- print("PHASE 2 REMINDER")
280
- print(f"{'='*60}")
281
- print("""
282
- For audio fine-tuning (Phase 2), use JimmyMa99/TeleAntiFraud separately:
283
- - audio_path: path to .mp3 files of actual phone calls
284
- - label: "fraud" or "normal"
285
- - Feed audio directly to Gemma 4's audio encoder
286
- - No text transcription needed β€” model processes raw audio
287
  """)
288
 
289
 
 
1
  #!/usr/bin/env python3
2
  """
3
+ assemble_corpus.py — Build a unified multimodal scam detection corpus.
4
+
5
+ Supports TWO modes:
6
+ --mode text → Phase 1: text-only SFT (works on 8GB VRAM)
7
+ --mode combined → Phase 1+2: text AND audio in one dataset (needs 16GB+ VRAM)
8
 
9
  ═══════════════════════════════════════════════════════════════════════════
10
+ TEXT SOURCES:
11
+ 1. BothBosu/scam-dialogue — phone transcripts (EN, 1280 rows)
12
+ 2. BothBosu/multi-agent-scam-conversation — phone transcripts (EN)
13
+ 3. BothBosu/single-agent-scam-conversations — phone transcripts (EN)
14
+ 4. ealvaradob/phishing-dataset — email/SMS (EN, 20K rows)
15
+ 5. shakeleoatmeal/phone-scam-detection-synthetic — phone calls (EN, 1800)
16
+ 6. FredZhang7/all-scam-spam — SMS/email multilingual (42K)
 
 
 
17
 
18
+ AUDIO SOURCE:
19
+ 7. JimmyMa99/TeleAntiFraud — .mp3 phone call recordings (ZH, 11.9GB)
20
+
21
+ ═══════════════════════════════════════════════════════════════════════════
22
+ OUTPUT FORMAT (compatible with Unsloth multimodal SFT):
23
+
24
+ Text example:
25
+ {"messages": [
26
+ {"role": "user", "content": [{"type": "text", "text": "Classify...\\n\\nMessage: ..."}]},
27
+ {"role": "assistant", "content": [{"type": "text", "text": "SCAM"}]}
28
+ ]}
29
+
30
+ Audio example:
31
+ {"messages": [
32
+ {"role": "user", "content": [
33
+ {"type": "audio", "audio_url": "audio/NEG-imitate-12/tts_test3037.mp3"},
34
+ {"type": "text", "text": "Is this phone call a scam? Answer: SCAM or NOT_SCAM"}
35
+ ]},
36
+ {"role": "assistant", "content": [{"type": "text", "text": "SCAM"}]}
37
+ ]}
38
 
39
+ ═══════════════════════════════════════════════════════════════════════════
40
  USAGE:
41
+ # Text only (Phase 1)
42
+ python assemble_corpus.py --mode text --push_to_hub s23deepak/scambench
43
+
44
+ # Combined text + audio (Phase 1+2)
45
+ python assemble_corpus.py --mode combined --audio_dir ./audio --push_to_hub s23deepak/scambench-multimodal
46
 
47
  REQUIREMENTS:
48
  pip install datasets huggingface_hub scikit-learn
 
51
  import json
52
  import re
53
  import hashlib
54
+ import random
55
  from pathlib import Path
56
  from collections import Counter
57
 
 
59
  from sklearn.model_selection import train_test_split
60
 
61
 
62
# ═══════════════════════════════════════════════════════════════════════
# CONFIG
# ═══════════════════════════════════════════════════════════════════════
# Default seed; the module-level RNG is seeded at import time so sampling is
# reproducible even before main() re-seeds it from --seed.
SEED = 42
random.seed(SEED)

# System turn prepended to every example (text and audio alike).
SYSTEM_PROMPT = (
    "You are a phone scam detection expert. "
    "Analyze the content and classify it as SCAM or NOT_SCAM."
)

# User-turn template for text examples; ``{text}`` is filled via str.format.
TEXT_PROMPT = (
    "Classify the following message as SCAM or NOT_SCAM. "
    "Consider urgency, payment requests, impersonation, and remote-access patterns.\n\n"
    "Message: {text}\n\n"
    "Classification:"
)

# User-turn instruction that accompanies the audio clip in audio examples.
AUDIO_PROMPT = "Listen to this phone call and classify it as SCAM or NOT_SCAM."
81
+
82
+
83
  # ═══════════════════════════════════════════════════════════════════════
84
  # PII REDACTION
85
  # ═══════════════════════════════════════════════════════════════════════
86
# (regex, placeholder) pairs; redact_pii applies them in this order via re.sub.
PII_PATTERNS = [
    (r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '[PHONE]'),  # US phone
    (r'\b\d{10,11}\b', '[PHONE]'),  # Generic phone
    # TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a literal '|'.
    (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '[EMAIL]'),
    (r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]'),  # SSN
    (r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b', '[CARD]'),  # Credit card
    (r'\b(?:\d{1,3}\.){3}\d{1,3}\b', '[IP]'),  # IP address
]
94
 
95
  def redact_pii(text: str) -> tuple[str, bool]:
 
96
  redacted = False
97
  for pattern, replacement in PII_PATTERNS:
98
  new_text = re.sub(pattern, replacement, text)
 
103
 
104
 
105
  # ═══════════════════════════════════════════════════════════════════════
106
+ # FORMAT CONVERTERS
107
+ # ═══════════════════════════════════════════════════════════════════════
108
+
109
def to_text_message(text: str, label: str) -> dict:
    """Convert text + label to Unsloth multimodal message format.

    Args:
        text: Message or transcript body; substituted into ``TEXT_PROMPT``.
        label: Target answer placed in the assistant turn (callers in this
            file pass "SCAM" or "NOT_SCAM").

    Returns:
        A dict with a single ``"messages"`` key holding system, user and
        assistant turns, each with a list of ``{"type": "text", ...}`` parts.
    """
    return {
        "messages": [
            {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
            {"role": "user", "content": [
                # str.format only parses the template, so braces in `text` are safe.
                {"type": "text", "text": TEXT_PROMPT.format(text=text)}
            ]},
            {"role": "assistant", "content": [
                {"type": "text", "text": label}
            ]},
        ]
    }
122
+
123
+
124
def to_audio_message(audio_path: str, label: str) -> dict:
    """Convert audio path + label to Unsloth multimodal message format.

    Args:
        audio_path: Path stored verbatim under ``audio_url`` in the user turn.
        label: Target answer placed in the assistant turn.

    Returns:
        A dict with a single ``"messages"`` key: a system turn, a user turn
        containing the audio part followed by ``AUDIO_PROMPT``, and the
        assistant turn carrying ``label``.
    """
    return {
        "messages": [
            {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
            {"role": "user", "content": [
                {"type": "audio", "audio_url": audio_path},
                {"type": "text", "text": AUDIO_PROMPT},
            ]},
            {"role": "assistant", "content": [
                {"type": "text", "text": label}
            ]},
        ]
    }
138
+
139
+
140
+ # ═══════════════════════════════════════════════════════════════════════
141
+ # TEXT SOURCE LOADERS
142
  # ═══════════════════════════════════════════════════════════════════════
143
 
144
def load_scam_dialogue() -> list[dict]:
    """Load BothBosu/scam-dialogue as chat-formatted text examples.

    Reads the ``train`` split, runs ``redact_pii`` over each row's
    ``dialogue`` field and maps ``label == 1`` to "SCAM" (anything else to
    "NOT_SCAM").

    Returns:
        One ``to_text_message`` dict per dataset row.
    """
    print(" [1/6] BothBosu/scam-dialogue …")
    ds = load_dataset("BothBosu/scam-dialogue", split="train")
    rows = []
    for r in ds:
        # The second element (whether anything was redacted) is not kept.
        text, _ = redact_pii(r["dialogue"])
        label = "SCAM" if r["label"] == 1 else "NOT_SCAM"
        rows.append(to_text_message(text, label))
    print(f" → {len(rows)} rows")
    return rows
155
 
156
 
157
def load_multi_agent_scam() -> list[dict]:
    """Load BothBosu/multi-agent-scam-conversation as chat-formatted text examples.

    Reads the ``train`` split, runs ``redact_pii`` over each row's
    ``dialogue`` field and maps ``labels == 1`` to "SCAM" (anything else to
    "NOT_SCAM"). Note the label column here is ``labels`` (plural).

    Returns:
        One ``to_text_message`` dict per dataset row.
    """
    print(" [2/6] BothBosu/multi-agent-scam-conversation …")
    ds = load_dataset("BothBosu/multi-agent-scam-conversation", split="train")
    rows = []
    for r in ds:
        # The second element (whether anything was redacted) is not kept.
        text, _ = redact_pii(r["dialogue"])
        label = "SCAM" if r["labels"] == 1 else "NOT_SCAM"
        rows.append(to_text_message(text, label))
    print(f" → {len(rows)} rows")
    return rows
168
 
169
 
170
def load_single_agent_scam() -> list[dict]:
    """Load BothBosu/single-agent-scam-conversations as chat-formatted text examples.

    The text is read from ``dialogue`` (falling back to ``conversation``) and
    the label from ``labels`` (falling back to ``label``, then 0). Rows with
    no text are skipped so no empty training example is emitted.

    Returns:
        A list of ``to_text_message`` dicts, or ``[]`` if loading or
        processing the dataset fails (best-effort source).
    """
    print(" [3/6] BothBosu/single-agent-scam-conversations …")
    try:
        ds = load_dataset("BothBosu/single-agent-scam-conversations", split="train")
        rows = []
        for r in ds:
            raw_text = r.get("dialogue", r.get("conversation", ""))
            if not raw_text:
                # Neither column present or empty: nothing to classify.
                continue
            text, _ = redact_pii(raw_text)
            label_raw = r.get("labels", r.get("label", 0))
            label = "SCAM" if label_raw == 1 else "NOT_SCAM"
            rows.append(to_text_message(text, label))
        print(f" → {len(rows)} rows")
        return rows
    except Exception as e:
        # Deliberately broad: an optional source must not abort the whole run.
        print(f" ⚠ Skipped: {e}")
        return []
186
+
187
+
188
def load_phone_scam_synthetic() -> list[dict]:
    """Load shakeleoatmeal/phone-scam-detection-synthetic as chat-formatted text examples.

    The column names are probed rather than assumed: text comes from
    ``dialogue``, ``text`` or ``conversation``; the label from ``label``,
    ``labels`` or ``is_fraud`` (default 0). String labels count as SCAM when
    they equal "fraud", "scam" or "1" (case-insensitive, surrounding
    whitespace ignored); other labels count as SCAM when equal to 1.

    Returns:
        A list of ``to_text_message`` dicts, or ``[]`` if loading or
        processing the dataset fails (best-effort source).
    """
    print(" [4/6] shakeleoatmeal/phone-scam-detection-synthetic …")
    try:
        ds = load_dataset("shakeleoatmeal/phone-scam-detection-synthetic", split="train")
        rows = []
        for r in ds:
            # Check column names
            text = r.get("dialogue", r.get("text", r.get("conversation", "")))
            if not text:
                continue
            text, _ = redact_pii(text)
            label_raw = r.get("label", r.get("labels", r.get("is_fraud", 0)))
            if isinstance(label_raw, str):
                is_scam = label_raw.strip().lower() in ("fraud", "scam", "1")
            else:
                is_scam = label_raw == 1
            rows.append(to_text_message(text, "SCAM" if is_scam else "NOT_SCAM"))
        print(f" → {len(rows)} rows")
        return rows
    except Exception as e:
        # Deliberately broad: an optional source must not abort the whole run.
        print(f" ⚠ Skipped: {e}")
        return []
211
+
212
+
213
def load_phishing_dataset(max_rows: int = 5000) -> list[dict]:
    """Load ealvaradob/phishing-dataset (texts.json) as chat-formatted text examples.

    Downloads ``texts.json`` from the dataset repo, drops entries whose text
    is missing or shorter than 20 characters after stripping, redacts PII and
    maps ``label == 1`` to "SCAM".

    Args:
        max_rows: Upper bound on returned rows; when exceeded, a random sample
            of this size is drawn with the module-level ``random`` generator.

    Returns:
        A list of ``to_text_message`` dicts, or ``[]`` if download, parsing
        or processing fails (best-effort source).
    """
    print(" [5/6] ealvaradob/phishing-dataset …")
    try:
        from huggingface_hub import hf_hub_download
        path = hf_hub_download("ealvaradob/phishing-dataset", "texts.json", repo_type="dataset")
        # Explicit UTF-8: JSON text must not depend on the platform locale.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        rows = []
        for r in data:
            text = r.get("text", "")
            if not text or len(text.strip()) < 20:
                continue
            text, _ = redact_pii(text)
            label = "SCAM" if r["label"] == 1 else "NOT_SCAM"
            rows.append(to_text_message(text, label))
        if len(rows) > max_rows:
            rows = random.sample(rows, max_rows)
            print(f" (capped to {max_rows})")
        print(f" → {len(rows)} rows")
        return rows
    except Exception as e:
        # Deliberately broad: an optional source must not abort the whole run.
        print(f" ⚠ Skipped: {e}")
        return []
237
+
238
+
239
def load_all_scam_spam(max_rows: int = 5000) -> list[dict]:
    """Load FredZhang7/all-scam-spam as chat-formatted text examples.

    Reads the ``train`` split, drops rows whose ``text`` is missing or
    shorter than 20 characters after stripping, redacts PII and maps
    ``is_spam == 1`` to "SCAM" (a missing ``is_spam`` counts as 0).

    Args:
        max_rows: Upper bound on returned rows; when exceeded, a random sample
            of this size is drawn with the module-level ``random`` generator.

    Returns:
        A list of ``to_text_message`` dicts, or ``[]`` if loading or
        processing the dataset fails (best-effort source).
    """
    print(" [6/6] FredZhang7/all-scam-spam …")
    try:
        ds = load_dataset("FredZhang7/all-scam-spam", split="train")
        rows = []
        for r in ds:
            text = r.get("text", "")
            if not text or len(text.strip()) < 20:
                continue
            text, _ = redact_pii(text)
            label = "SCAM" if r.get("is_spam", 0) == 1 else "NOT_SCAM"
            rows.append(to_text_message(text, label))
        if len(rows) > max_rows:
            rows = random.sample(rows, max_rows)
            print(f" (capped to {max_rows})")
        print(f" → {len(rows)} rows")
        return rows
    except Exception as e:
        # Deliberately broad: an optional source must not abort the whole run.
        print(f" ⚠ Skipped: {e}")
        return []
260
+
261
+
262
+ # ═══════════════════════════════════════════════════════════════════════
263
+ # AUDIO SOURCE LOADER
264
+ # ═══════════════════════════════════════════════════════════════════════
265
+
266
def load_teleanti_fraud_audio(audio_dir: str) -> list[dict]:
    """
    JimmyMa99/TeleAntiFraud — audio examples.

    Prerequisites: Download and unzip audio.zip from the dataset repo:
        huggingface-cli download JimmyMa99/TeleAntiFraud audio.zip --repo-type dataset
        unzip audio.zip -d ./audio

    Then pass --audio_dir ./audio

    Args:
        audio_dir: Directory containing the extracted audio files.

    Returns:
        One ``to_audio_message`` dict per dataset row whose audio file was
        found on disk; ``[]`` when ``audio_dir`` does not exist. A row's
        ``label`` of "fraud" maps to "SCAM", anything else to "NOT_SCAM".
    """
    print(" [AUDIO] JimmyMa99/TeleAntiFraud …")

    # Validate the directory first so a missing path does not cost a dataset load.
    audio_path = Path(audio_dir)
    if not audio_path.exists():
        print(f" ⚠ Audio dir '{audio_dir}' not found!")
        print(f" Download with: huggingface-cli download JimmyMa99/TeleAntiFraud audio.zip --repo-type dataset")
        print(f" Then: unzip audio.zip -d {audio_dir}")
        return []

    ds = load_dataset("JimmyMa99/TeleAntiFraud", split="train")

    rows = []
    missing = 0
    for r in ds:
        rel_path = r["audio_path"]  # e.g. "audio/POS-imitate-4/tts_test1139/tts_test1139.mp3"
        # Try to find the file
        full_path = audio_path / rel_path
        if not full_path.exists():
            # Try without the leading "audio/" component (only a true prefix is
            # stripped, never an "audio/" that appears deeper in the path).
            full_path = audio_path / rel_path.removeprefix("audio/")
            if not full_path.exists():
                missing += 1
                continue

        label = "SCAM" if r["label"] == "fraud" else "NOT_SCAM"
        rows.append(to_audio_message(str(full_path), label))

    if missing:
        print(f" ⚠ {missing} audio files not found (check --audio_dir path)")
    print(f" → {len(rows)} audio rows")
    return rows
306
 
307
 
 
309
  # ASSEMBLY
310
  # ═══════════════════════════════════════════════════════════════════════
311
 
312
def print_stats(rows: list[dict], name: str):
    """Print corpus statistics.

    For each row the label is taken from the last message (the text of its
    first content part, or the content itself when it is not a list) and the
    modality from the user turn: "audio" if any content part has
    ``type == "audio"``, otherwise "text".

    Args:
        rows: Examples shaped like ``{"messages": [...]}``.
        name: Heading printed above the statistics.
    """
    labels = []
    modalities = Counter()
    for r in rows:
        messages = r["messages"]

        # Extract label from assistant message
        assistant_content = messages[-1]["content"]
        if isinstance(assistant_content, list):
            label = assistant_content[0]["text"]
        else:
            label = assistant_content
        labels.append(label)

        # Check modality. Find the user turn by role instead of a fixed index
        # so rows without a leading system message are classified correctly.
        user_content = next(
            (m["content"] for m in messages if m.get("role") == "user"), []
        )
        has_audio = isinstance(user_content, list) and any(
            isinstance(c, dict) and c.get("type") == "audio" for c in user_content
        )
        modalities["audio" if has_audio else "text"] += 1

    cats = Counter(labels)
    print(f"\n{'='*60}")
    print(f"{name}")
    print(f"{'='*60}")
    print(f" Total: {len(rows)}")
    print(f" Labels: {dict(cats)}")
    print(f" Modalities: {dict(modalities)}")
    # Ratio is only defined when there is at least one NOT_SCAM row.
    if cats.get("NOT_SCAM", 0) > 0:
        print(f" Balance: {cats.get('SCAM',0)}:{cats.get('NOT_SCAM',0)} "
              f"({cats.get('SCAM',0)/cats['NOT_SCAM']:.2f} ratio)")
    print(f"{'='*60}\n")
341
 
342
 
343
def main():
    """Assemble the corpus, split it, save it, and optionally push it to the Hub.

    Steps: parse CLI arguments, seed ``random`` from ``--seed``, load all text
    sources (plus the audio source when ``--mode combined``), shuffle, make a
    label-stratified train/held-out split, write one JSONL file per split to
    ``--output_dir``, build a ``DatasetDict`` and push it when
    ``--push_to_hub`` is given, then print training usage hints.
    """
    parser = argparse.ArgumentParser(description="Assemble ScamBench corpus")
    parser.add_argument("--mode", choices=["text", "combined"], default="text",
                        help="'text' = Phase 1 only, 'combined' = text + audio")
    parser.add_argument("--output_dir", default="./scam_corpus")
    parser.add_argument("--push_to_hub", default=None,
                        help="HF dataset repo, e.g. s23deepak/scambench")
    parser.add_argument("--audio_dir", default="./audio",
                        help="Path to extracted TeleAntiFraud audio files")
    parser.add_argument("--held_out_ratio", type=float, default=0.10)
    parser.add_argument("--max_phishing", type=int, default=5000)
    parser.add_argument("--max_spam", type=int, default=5000)
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    # Re-seed so --seed governs the sampling caps and the shuffle below.
    random.seed(args.seed)

    print("=" * 60)
    print(f"ASSEMBLING SCAMBENCH CORPUS — mode={args.mode}")
    print("=" * 60)

    # ── Load text sources ─────────────────────────────────────────────
    print("\n📝 Loading TEXT sources …")
    all_rows = []
    all_rows.extend(load_scam_dialogue())
    all_rows.extend(load_multi_agent_scam())
    all_rows.extend(load_single_agent_scam())
    all_rows.extend(load_phone_scam_synthetic())
    all_rows.extend(load_phishing_dataset(max_rows=args.max_phishing))
    all_rows.extend(load_all_scam_spam(max_rows=args.max_spam))

    # ── Load audio sources (combined mode only) ───────────────────────
    if args.mode == "combined":
        print("\n🔊 Loading AUDIO sources …")
        audio_rows = load_teleanti_fraud_audio(args.audio_dir)
        all_rows.extend(audio_rows)
    else:
        print("\n (Audio skipped — use --mode combined for multimodal)")

    # ── Shuffle ───────────────────────────────────────────────────────
    random.shuffle(all_rows)
    print_stats(all_rows, "Full Corpus")

    # ── Stratified split ──────────────────────────────────────────────
    # The stratification label is the assistant turn's answer text.
    labels = []
    for r in all_rows:
        assistant_content = r["messages"][-1]["content"]
        if isinstance(assistant_content, list):
            labels.append(assistant_content[0]["text"])
        else:
            labels.append(assistant_content)

    train_rows, held_out_rows = train_test_split(
        all_rows, test_size=args.held_out_ratio,
        stratify=labels, random_state=args.seed
    )
    print(f"Split: train={len(train_rows)} | held_out={len(held_out_rows)}")
    print_stats(train_rows, "Train Split")
    print_stats(held_out_rows, "Held-Out Split")

    # ── Save ──────────────────────────────────────────────────────────
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    for split_name, split_rows in [("train", train_rows), ("held_out", held_out_rows)]:
        jsonl_path = out_dir / f"{split_name}.jsonl"
        # ensure_ascii=False emits raw non-ASCII text, so the file encoding
        # must be fixed to UTF-8 rather than left to the platform default.
        with open(jsonl_path, "w", encoding="utf-8") as f:
            for r in split_rows:
                f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"✓ Saved → {out_dir}/train.jsonl, held_out.jsonl")

    # Also save as HF Dataset
    train_ds = Dataset.from_list(train_rows)
    held_out_ds = Dataset.from_list(held_out_rows)
    corpus = DatasetDict({"train": train_ds, "held_out": held_out_ds})

    if args.push_to_hub:
        print(f"\nPushing to https://huggingface.co/datasets/{args.push_to_hub} …")
        corpus.push_to_hub(args.push_to_hub, private=False)
        print(f"✓ Pushed!")

    # ── Print usage ───────────────────────────────────────────────────
    print(f"""
{'='*60}
DONE! To train with this corpus:
{'='*60}

# In your Unsloth training script:
from datasets import load_dataset
dataset = load_dataset("json", data_files="{out_dir}/train.jsonl", split="train")

# For text-only mode, use SFTTrainer with dataset_text_field=""
# For combined mode, use UnslothVisionDataCollator:

from unsloth.trainer import UnslothVisionDataCollator
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    processing_class=processor.tokenizer,
    data_collator=UnslothVisionDataCollator(model, processor),
    args=SFTConfig(
        dataset_text_field="",
        dataset_kwargs={{"skip_prepare_dataset": True}},
        max_length=8192,
        ...
    )
)
{'='*60}
""")
452
 
453