"""
Dataset preparation for PubGuard training.
Downloads publicly available datasets from HuggingFace and assembles
them into the three labelled corpora needed by the training pipeline.
Datasets used (verified available 2026-02)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**Head 1 – Document Type** (scientific_paper | poster | abstract_only | junk)
Positive (scientific_paper):
- armanc/scientific_papers (arxiv) ~300 K full-text articles
cols: article, abstract, section_names
Negative (abstract_only):
- gfissore/arxiv-abstracts-2021 ~2 M abstracts
cols: abstract (filter length < 600 chars)
Negative (junk):
- ag_news (news articles) + synthetic templates (flyers, invoices, etc.)
Negative (poster):
- Synthetic poster-style structured text
**Head 2 – AI-Generated Text Detection**
- liamdugan/raid – multi-model generations, domain="abstracts"
cols: model, domain, generation (model="human" for human text)
- NicolaiSivesind/ChatGPT-Research-Abstracts – real + GPT-3.5 abstracts
cols: real_abstract, generated_abstract
**Head 3 – Toxicity**
- google/civil_comments – 1.8 M comments with toxicity scores (0–1)
cols: text, toxicity
- skg/toxigen-data – 274 K annotated toxic/benign statements
cols: text, toxicity_human (1–5 scale)
"""
import json
import logging
import random
from collections import Counter
from pathlib import Path
from typing import Dict, List, Tuple
logger = logging.getLogger(__name__)
# ── Constants ────────────────────────────────────────────────────
SEED = 42
random.seed(SEED)
# ── Synthetic templates ──────────────────────────────────────────
JUNK_TEMPLATES = [
"π Annual {event} at {place}! Join us on {date}. Free food and drinks. RSVP to {email}.",
"FOR SALE: {item}. Great condition. ${price}. Contact {name} at {phone}.",
"{company} is hiring! We're looking for a {role}. Apply now at {url}.",
"NOTICE: The {dept} office will be closed on {date} for {reason}. Questions? Call {phone}.",
"Don't miss our {event}! {date} from {time}. {place}. Tickets: ${price}.",
"Weekly newsletter from {company}. This week: {topic1}, {topic2}, and more!",
"Invoice #{num} from {company}. Amount due: ${price}. Payment due by {date}.",
"Meeting agenda for {date}. 1) {topic1} 2) {topic2} 3) {topic3}. Location: {place}.",
"URGENT: Your {account} password expires on {date}. Click here to reset: {url}.",
"Congratulations {name}! You've been selected for our exclusive {event}. Limited spots!",
"Thank you for your purchase! Order #{num}. Estimated delivery: {date}.",
"{company} presents the {event}. Keynote by {name}. Register at {url}.",
"Garage sale this weekend! {place}. {date} {time}. Everything must go!",
"Happy Birthday to {name} from all of us at {company}! π",
"POOL PARTY! π Come join us at {place} on {date}. Bring your swimsuit and sunscreen!",
"Menu for this week: Monday: {food1}. Tuesday: {food2}. Wednesday: {food3}.",
"Building maintenance notice: {reason} on {date}. Please plan accordingly.",
"Lost & Found: {item} found near {place}. Contact front desk to claim.",
"Fantasy Football League draft is on {date}! Don't forget to submit your picks.",
"Book club meeting: We're reading '{book}' by {name}. Discussion on {date}.",
"Hey everyone! Movie night at {place} on {date}. We're watching '{movie}'. Bring popcorn!",
"Reminder: Staff meeting {date} at {time}. Attendance mandatory. {dept}.",
"Lost cat! Orange tabby, answers to '{pet_name}'. Last seen near {place}. Call {phone}.",
"HOT DEAL! {item} only ${price}! Limited time offer. Visit {url}.",
"Club registration open! Join the {club} club. Meetings every {day} at {time}. {place}.",
"Fundraiser bake sale! {date} at {place}. All proceeds go to {charity}.",
"Apartment for rent: 2BR/1BA near {place}. ${price}/month. Pet friendly. Call {phone}.",
"Yoga class every {day} at {time}. {place}. All levels welcome. Bring your own mat!",
"IT Alert: System maintenance scheduled for {date}. Expected downtime: {time}. {dept}.",
"Carpool needed! Driving from {place} to {place2} daily. Contact {name} at {email}.",
]
POSTER_TEMPLATES = [
"TITLE: {title}\n\nAUTHORS: {authors}\nAFFILIATION: {affil}\n\nINTRODUCTION\n{intro}\n\nMETHODS\n{methods}\n\nRESULTS\n{results}\n\nCONCLUSIONS\n{conclusions}\n\nACKNOWLEDGMENTS\n{ack}",
"{title}\n{authors} | {affil}\n\nBackground: {intro}\n\nApproach: {methods}\n\nKey Findings:\nβ’ {finding1}\nβ’ {finding2}\nβ’ {finding3}\n\nFuture Work: {future}\n\nContact: {email}",
"POSTER PRESENTATION\n\n{title}\n\n{authors}\n{affil}\n\nObjective: {intro}\n\nDesign: {methods}\n\nOutcome: {results}\n\nConclusion: {conclusions}",
"{title}\n\n{authors} ({affil})\n\nAim: {intro}\nMethod: {methods}\nResult: {results}\nSummary: {conclusions}\n\nCorrespondence: {email}",
"RESEARCH POSTER\nβββββββββββββββββββββ\n{title}\n{authors}\n{affil}\n\nβΈ Background\n{intro}\n\nβΈ Methods\n{methods}\n\nβΈ Results\nβ’ {finding1}\nβ’ {finding2}\n\nβΈ Conclusion\n{conclusions}\n\nFunding: {ack}",
]
def _fill_template(template: str) -> str:
"""Fill a template with random plausible values."""
fillers = {
"{event}": random.choice(["Pool Party", "BBQ Bash", "Career Fair", "Fundraiser Gala", "Open House", "Trivia Night"]),
"{place}": random.choice(["Room 201", "Hilton Downtown", "the Community Center", "Central Park", "Building B Courtyard", "Main Auditorium"]),
"{place2}": random.choice(["Campus North", "Downtown", "Tech Park", "Medical Center"]),
"{date}": random.choice(["March 15", "June 22", "Sept 5", "November 10", "January 30", "Friday the 13th"]),
"{email}": "info@example.com",
"{item}": random.choice(["2019 Honda Civic", "MacBook Pro 16-inch", "Standing Desk", "Mountain Bike", "Vintage Guitar"]),
"{price}": str(random.randint(10, 5000)),
"{name}": random.choice(["Dr. Smith", "Jane Doe", "Prof. Chen", "Maria Garcia", "Bob Wilson"]),
"{phone}": "555-0123",
"{company}": random.choice(["TechCorp", "BioGen Inc.", "Global Solutions", "Acme Labs", "DataFlow Systems"]),
"{role}": random.choice(["Data Scientist", "Lab Technician", "Project Manager", "Software Engineer"]),
"{url}": "https://example.com/apply",
"{dept}": random.choice(["HR", "Finance", "Engineering", "Admissions", "IT Support"]),
"{reason}": random.choice(["maintenance", "holiday", "training day", "renovation", "fire drill"]),
"{time}": random.choice(["2-5 PM", "10 AM - 3 PM", "6-9 PM", "All Day", "Noon"]),
"{topic1}": random.choice(["Q3 Review", "Budget Update", "New Hires", "Project Status"]),
"{topic2}": random.choice(["Safety Training", "Holiday Schedule", "IT Migration", "Team Building"]),
"{topic3}": random.choice(["Parking Changes", "Wellness Program", "Open Q&A"]),
"{account}": random.choice(["university", "corporate", "cloud storage"]),
"{num}": str(random.randint(10000, 99999)),
"{food1}": "Pasta Primavera", "{food2}": "Chicken Tikka", "{food3}": "Fish Tacos",
"{book}": random.choice(["1984", "Sapiens", "The Gene", "Thinking, Fast and Slow"]),
"{movie}": random.choice(["Inception", "The Matrix", "Interstellar"]),
"{pet_name}": random.choice(["Whiskers", "Max", "Luna"]),
"{club}": random.choice(["Chess", "Photography", "Hiking", "Debate"]),
"{day}": random.choice(["Monday", "Wednesday", "Friday"]),
"{charity}": random.choice(["Children's Hospital", "Local Food Bank", "Animal Shelter"]),
"{title}": random.choice([
"Effects of Temperature on Enzyme Kinetics in Thermophilic Bacteria",
"Deep Learning for Medical Image Segmentation: A Systematic Review",
"Novel Biomarkers in Cardiovascular Disease Progression",
"Metagenomic Analysis of Coral Reef Microbiomes Under Thermal Stress",
"CRISPR-Cas9 Editing Efficiency in Human iPSC-Derived Neurons",
]),
"{authors}": random.choice(["A. Smith, B. Jones, C. Lee", "R. Patel, S. Kim, T. Brown", "M. Wang, L. Davis"]),
"{affil}": random.choice(["University of Example, Dept. of Science", "MIT, CSAIL", "Stanford School of Medicine"]),
"{intro}": random.choice([
"Background text about the research problem being investigated.",
"This study addresses the gap in understanding of X in the context of Y.",
"Recent advances in Z have highlighted the need for improved W.",
]),
"{methods}": random.choice([
"We employed a cross-sectional study design with N=200 participants.",
"Samples were collected from 5 sites and processed using standard protocols.",
"We developed a convolutional neural network trained on 50K labeled images.",
]),
"{results}": random.choice([
"Treatment group showed 45% improvement (p<0.01) compared to control.",
"Our model achieved 94.2% accuracy on the held-out test set.",
"We identified 23 significantly enriched pathways (FDR < 0.05).",
]),
"{conclusions}": random.choice([
"Our findings support the hypothesis that X leads to improved Y.",
"These results demonstrate the feasibility of the proposed approach.",
"Further validation with larger cohorts is warranted.",
]),
"{finding1}": "Significant reduction in error rate (p<0.001)",
"{finding2}": "Model outperformed baseline by 15%",
"{finding3}": "Robust to distribution shift across domains",
"{future}": "Extend to longitudinal datasets and multi-site validation.",
"{ack}": random.choice(["Funded by NIH Grant R01-ABC123.", "Supported by NSF Award #1234567."]),
}
result = template
for key, val in fillers.items():
result = result.replace(key, val)
return result
def generate_synthetic_junk(n: int = 5000) -> List[Dict[str, str]]:
    """Generate synthetic junk documents."""
    # Each document is an independently chosen, independently filled template.
    return [
        {"text": _fill_template(random.choice(JUNK_TEMPLATES)), "label": "junk"}
        for _ in range(n)
    ]
def generate_synthetic_posters(n: int = 3000) -> List[Dict[str, str]]:
    """Generate synthetic poster-style documents."""
    # Same pattern as the junk generator, but drawing from poster layouts.
    return [
        {"text": _fill_template(random.choice(POSTER_TEMPLATES)), "label": "poster"}
        for _ in range(n)
    ]
# ── Head 1: doc_type ─────────────────────────────────────────────
def prepare_doc_type_dataset(
    output_dir: Path,
    n_per_class: int = 15000,
) -> Path:
    """
    Assemble and save document-type training data.

    Streams real papers / abstracts / news from HuggingFace (each capped at
    roughly ``n_per_class``) and combines them with synthetic poster and junk
    documents.  Each remote source has a fallback, so a partial corpus is
    still produced when a dataset is unavailable.

    Saves NDJSON lines of ``{"text": ..., "label": ...}`` with label in
    {scientific_paper, abstract_only, junk, poster}.

    Args:
        output_dir: Directory for the output file (created if missing).
        n_per_class: Target number of samples per class.

    Returns:
        Path to the written ``doc_type_train.ndjson``.
    """
    from datasets import load_dataset

    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / "doc_type_train.ndjson"
    all_samples = []
    logger.info("=== Preparing doc_type dataset ===")

    # ── scientific_paper ─────────────────────────────────────────
    logger.info("Loading armanc/scientific_papers (arxiv split)...")
    try:
        ds = load_dataset(
            "armanc/scientific_papers", "arxiv",
            split="train", streaming=True, trust_remote_code=True,
        )
        count = 0
        for row in ds:
            if count >= n_per_class:
                break
            # Combine abstract + article body for full-text signal
            abstract = row.get("abstract", "") or ""
            article = row.get("article", "") or ""
            text = (abstract + " " + article)[:4000]
            if len(text.strip()) > 100:
                all_samples.append({"text": text.strip(), "label": "scientific_paper"})
                count += 1
        logger.info(f" scientific_paper: {count}")
    except Exception as e:
        logger.warning(f"Could not load scientific_papers: {e}")
        # Fallback: a mirror with the same article/abstract columns.
        logger.info("Falling back to ccdv/arxiv-summarization...")
        try:
            ds = load_dataset(
                "ccdv/arxiv-summarization",
                split="train", streaming=True, trust_remote_code=True,
            )
            count = 0
            for row in ds:
                if count >= n_per_class:
                    break
                text = ((row.get("abstract", "") or "") + " " + (row.get("article", "") or ""))[:4000]
                if len(text.strip()) > 100:
                    all_samples.append({"text": text.strip(), "label": "scientific_paper"})
                    count += 1
            logger.info(f" scientific_paper (fallback): {count}")
        except Exception as e2:
            logger.error(f"Fallback also failed: {e2}")

    # ── abstract_only ────────────────────────────────────────────
    logger.info("Loading gfissore/arxiv-abstracts-2021...")
    try:
        ds = load_dataset(
            "gfissore/arxiv-abstracts-2021",
            split="train", streaming=True, trust_remote_code=True,
        )
        count = 0
        for row in ds:
            if count >= n_per_class:
                break
            abstract = row.get("abstract", "")
            # Short texts only: the class models abstract-length documents.
            if abstract and 50 < len(abstract) < 600:
                all_samples.append({"text": abstract.strip(), "label": "abstract_only"})
                count += 1
        logger.info(f" abstract_only: {count}")
    except Exception as e:
        logger.warning(f"Could not load arxiv-abstracts: {e}")
        # Fallback: reuse the abstracts column of scientific_papers.
        logger.info("Generating abstract_only from scientific_papers abstracts...")
        try:
            ds = load_dataset(
                "armanc/scientific_papers", "arxiv",
                split="train", streaming=True, trust_remote_code=True,
            )
            count = 0
            for row in ds:
                if count >= n_per_class:
                    break
                abstract = row.get("abstract", "")
                if abstract and 50 < len(abstract) < 600:
                    all_samples.append({"text": abstract.strip(), "label": "abstract_only"})
                    count += 1
            logger.info(f" abstract_only (fallback): {count}")
        except Exception:
            # Best-effort: both abstract sources failed; class may be absent.
            pass

    # ── junk ─────────────────────────────────────────────────────
    # Half the junk quota comes from real news, half is synthetic below.
    logger.info("Loading ag_news for junk class...")
    try:
        ds = load_dataset(
            "ag_news",
            split="train", streaming=True, trust_remote_code=True,
        )
        count = 0
        for row in ds:
            if count >= n_per_class // 2:
                break
            text = row.get("text", "")
            if len(text) > 30:
                all_samples.append({"text": text.strip(), "label": "junk"})
                count += 1
        logger.info(f" junk (ag_news): {count}")
    except Exception as e:
        logger.warning(f"Could not load ag_news: {e}")
    logger.info("Generating synthetic junk...")
    synth_junk = generate_synthetic_junk(n_per_class // 2)
    all_samples.extend(synth_junk)
    logger.info(f" junk (synthetic): {len(synth_junk)}")

    # ── poster ───────────────────────────────────────────────────
    # No public poster-text corpus is used; this class is fully synthetic.
    logger.info("Generating synthetic poster data...")
    synth_posters = generate_synthetic_posters(n_per_class)
    all_samples.extend(synth_posters)
    logger.info(f" poster (synthetic): {len(synth_posters)}")

    # ── Shuffle and save ─────────────────────────────────────────
    random.shuffle(all_samples)
    # Explicit encoding keeps the write platform-independent (json.dumps
    # escapes non-ASCII by default, but we don't rely on that here).
    with open(output_path, "w", encoding="utf-8") as f:
        for sample in all_samples:
            f.write(json.dumps(sample) + "\n")

    # Report class distribution.
    dist = Counter(s["label"] for s in all_samples)
    logger.info(f"Saved {len(all_samples)} samples to {output_path}")
    for label, count in sorted(dist.items()):
        logger.info(f" {label}: {count}")
    return output_path
# ── Head 2: ai_detect ────────────────────────────────────────────
def prepare_ai_detect_dataset(
    output_dir: Path,
    n_per_class: int = 20000,
) -> Path:
    """
    Assemble AI-generated text detection training data.

    Sources (all verified available):
    - liamdugan/raid: multi-model generations, domain="abstracts";
      model="human" maps to the human class, anything else to ai_generated.
    - NicolaiSivesind/ChatGPT-Research-Abstracts: real + GPT-3.5 abstracts.

    The two classes are balanced by downsampling to the smaller pool
    (capped at ``n_per_class``) before writing NDJSON lines of
    ``{"text": ..., "label": "human" | "ai_generated"}``.

    Args:
        output_dir: Directory for the output file (created if missing).
        n_per_class: Per-class cap after balancing.

    Returns:
        Path to ``ai_detect_train.ndjson`` (empty file when no source
        could be loaded).
    """
    from datasets import load_dataset

    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / "ai_detect_train.ndjson"
    human_samples = []
    ai_samples = []
    logger.info("=== Preparing ai_detect dataset ===")

    # ── RAID (scientific abstracts domain) ───────────────────────
    logger.info("Loading liamdugan/raid (abstracts domain)...")
    try:
        ds = load_dataset(
            "liamdugan/raid",
            split="train", streaming=True, trust_remote_code=True,
        )
        human_count = 0
        ai_count = 0
        for row in ds:
            domain = row.get("domain", "")
            if domain != "abstracts":
                continue
            text = row.get("generation", "") or ""
            if not text or len(text) < 50:
                continue
            model = row.get("model", "")
            if model == "human":
                if human_count < n_per_class:
                    human_samples.append({"text": text[:4000], "label": "human"})
                    human_count += 1
            else:
                if ai_count < n_per_class:
                    ai_samples.append({"text": text[:4000], "label": "ai_generated"})
                    ai_count += 1
            # Stop streaming once both quotas are met.
            if human_count >= n_per_class and ai_count >= n_per_class:
                break
        logger.info(f" RAID: human={human_count}, ai={ai_count}")
    except Exception as e:
        logger.warning(f"Could not load RAID: {e}")

    # ── ChatGPT-Research-Abstracts ───────────────────────────────
    # Paired dataset: each row may contribute to both classes.
    logger.info("Loading NicolaiSivesind/ChatGPT-Research-Abstracts...")
    try:
        ds = load_dataset(
            "NicolaiSivesind/ChatGPT-Research-Abstracts",
            split="train", streaming=True, trust_remote_code=True,
        )
        h_count = 0
        a_count = 0
        for row in ds:
            real = row.get("real_abstract", "")
            generated = row.get("generated_abstract", "")
            if real and len(real) > 50:
                human_samples.append({"text": real[:4000], "label": "human"})
                h_count += 1
            if generated and len(generated) > 50:
                ai_samples.append({"text": generated[:4000], "label": "ai_generated"})
                a_count += 1
        logger.info(f" ChatGPT-Abstracts: human={h_count}, ai={a_count}")
    except Exception as e:
        logger.warning(f"Could not load ChatGPT-Research-Abstracts: {e}")

    # ── Balance and save ─────────────────────────────────────────
    min_count = min(len(human_samples), len(ai_samples), n_per_class)
    if min_count == 0:
        logger.error("No AI detection training data available!")
        # Leave an empty file so downstream steps find a (zero-row) corpus.
        output_path.write_text("", encoding="utf-8")
        return output_path
    # min_count never exceeds either pool size, so sampling is always valid.
    balanced = (
        random.sample(human_samples, min_count)
        + random.sample(ai_samples, min_count)
    )
    random.shuffle(balanced)
    with open(output_path, "w", encoding="utf-8") as f:
        for sample in balanced:
            f.write(json.dumps(sample) + "\n")
    n_h = sum(1 for s in balanced if s["label"] == "human")
    n_a = sum(1 for s in balanced if s["label"] == "ai_generated")
    logger.info(f"Saved {len(balanced)} samples (human={n_h}, ai={n_a}) to {output_path}")
    return output_path
# ── Head 3: toxicity ─────────────────────────────────────────────
def prepare_toxicity_dataset(
    output_dir: Path,
    n_per_class: int = 20000,
) -> Path:
    """
    Assemble toxicity detection training data.

    Sources (all verified available without manual download):
    - google/civil_comments: ~1.8 M comments with toxicity float (0-1);
      thresholds: toxic >= 0.5, clean < 0.1 (0.1-0.5 is discarded as
      ambiguous).
    - skg/toxigen-data: 274 K annotated statements; toxicity_human is a
      1-5 float, used as >= 4.0 toxic, <= 2.0 clean.

    Classes are balanced to the smaller pool (capped at ``n_per_class``)
    and written as NDJSON ``{"text": ..., "label": "toxic" | "clean"}``.

    Args:
        output_dir: Directory for the output file (created if missing).
        n_per_class: Per-class cap after balancing.

    Returns:
        Path to ``toxicity_train.ndjson`` (empty file when no source
        could be loaded).
    """
    from datasets import load_dataset

    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / "toxicity_train.ndjson"
    toxic_samples = []
    clean_samples = []
    logger.info("=== Preparing toxicity dataset ===")

    # ── Civil Comments ───────────────────────────────────────────
    logger.info("Loading google/civil_comments...")
    try:
        ds = load_dataset(
            "google/civil_comments",
            split="train", streaming=True, trust_remote_code=True,
        )
        toxic_count = 0
        clean_count = 0
        for row in ds:
            text = row.get("text", "")
            if not text or len(text) < 20:
                continue
            # `or 0.0` guards against an explicit None in the column.
            toxicity = row.get("toxicity", 0.0) or 0.0
            if toxicity >= 0.5 and toxic_count < n_per_class:
                toxic_samples.append({"text": text[:4000], "label": "toxic"})
                toxic_count += 1
            elif toxicity < 0.1 and clean_count < n_per_class:
                clean_samples.append({"text": text[:4000], "label": "clean"})
                clean_count += 1
            # Stop streaming once both quotas are met.
            if toxic_count >= n_per_class and clean_count >= n_per_class:
                break
        logger.info(f" Civil Comments: toxic={toxic_count}, clean={clean_count}")
    except Exception as e:
        logger.warning(f"Could not load civil_comments: {e}")

    # ── ToxiGen ──────────────────────────────────────────────────
    logger.info("Loading skg/toxigen-data...")
    try:
        ds = load_dataset(
            "skg/toxigen-data",
            split="train", streaming=True, trust_remote_code=True,
        )
        t_count = 0
        c_count = 0
        for row in ds:
            text = row.get("text", "")
            if not text or len(text) < 20:
                continue
            # toxicity_human is 1-5 scale; skip unannotated rows.
            tox_score = row.get("toxicity_human", None)
            if tox_score is None:
                continue
            tox_score = float(tox_score)
            if tox_score >= 4.0:
                toxic_samples.append({"text": text[:4000], "label": "toxic"})
                t_count += 1
            elif tox_score <= 2.0:
                clean_samples.append({"text": text[:4000], "label": "clean"})
                c_count += 1
        logger.info(f" ToxiGen: toxic={t_count}, clean={c_count}")
    except Exception as e:
        logger.warning(f"Could not load ToxiGen: {e}")

    # ── Balance and save ─────────────────────────────────────────
    min_count = min(len(toxic_samples), len(clean_samples), n_per_class)
    if min_count == 0:
        logger.error("No toxicity training data available!")
        # Leave an empty file so downstream steps find a (zero-row) corpus.
        output_path.write_text("", encoding="utf-8")
        return output_path
    # min_count never exceeds either pool size, so sampling is always valid.
    balanced = (
        random.sample(toxic_samples, min_count)
        + random.sample(clean_samples, min_count)
    )
    random.shuffle(balanced)
    with open(output_path, "w", encoding="utf-8") as f:
        for sample in balanced:
            f.write(json.dumps(sample) + "\n")
    n_t = sum(1 for s in balanced if s["label"] == "toxic")
    n_c = sum(1 for s in balanced if s["label"] == "clean")
    logger.info(f"Saved {len(balanced)} samples (toxic={n_t}, clean={n_c}) to {output_path}")
    return output_path
# ── Orchestrator ─────────────────────────────────────────────────
def prepare_all(output_dir: Path, n_per_class: int = 15000):
    """Download and prepare all three datasets."""
    output_dir = Path(output_dir)
    logger.info(f"Preparing all datasets in {output_dir}")
    # Table-driven dispatch: name -> builder, executed in order.
    builders = (
        ("doc_type", prepare_doc_type_dataset),
        ("ai_detect", prepare_ai_detect_dataset),
        ("toxicity", prepare_toxicity_dataset),
    )
    paths = {name: build(output_dir, n_per_class) for name, build in builders}
    logger.info("All datasets prepared!")
    return paths
|