""" Dataset preparation for PubGuard training. Downloads publicly available datasets from HuggingFace and assembles them into the three labelled corpora needed by the training pipeline. Datasets used (verified available 2026-02) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ **Head 1 — Document Type** (scientific_paper | poster | abstract_only | junk) Positive (scientific_paper): - armanc/scientific_papers (arxiv) ~300 K full-text articles cols: article, abstract, section_names Negative (abstract_only): - gfissore/arxiv-abstracts-2021 ~2 M abstracts cols: abstract (filter length < 600 chars) Negative (junk): - ag_news (news articles) + synthetic templates (flyers, invoices, etc.) Negative (poster): - Synthetic poster-style structured text **Head 2 — AI-Generated Text Detection** - liamdugan/raid – multi-model generations, domain="abstracts" cols: model, domain, generation (model="human" for human text) - NicolaiSivesind/ChatGPT-Research-Abstracts – real + GPT-3.5 abstracts cols: real_abstract, generated_abstract **Head 3 — Toxicity** - google/civil_comments – 1.8 M comments with toxicity scores (0–1) cols: text, toxicity - skg/toxigen-data – 274 K annotated toxic/benign statements cols: text, toxicity_human (1–5 scale) """ import json import logging import random from pathlib import Path from typing import Dict, List, Tuple logger = logging.getLogger(__name__) # ── Constants ──────────────────────────────────────────────────── SEED = 42 random.seed(SEED) # ── Synthetic templates ────────────────────────────────────────── JUNK_TEMPLATES = [ "🎉 Annual {event} at {place}! Join us on {date}. Free food and drinks. RSVP to {email}.", "FOR SALE: {item}. Great condition. ${price}. Contact {name} at {phone}.", "{company} is hiring! We're looking for a {role}. Apply now at {url}.", "NOTICE: The {dept} office will be closed on {date} for {reason}. Questions? Call {phone}.", "Don't miss our {event}! {date} from {time}. {place}. Tickets: ${price}.", "Weekly newsletter from {company}. This week: {topic1}, {topic2}, and more!", "Invoice #{num} from {company}. Amount due: ${price}. Payment due by {date}.", "Meeting agenda for {date}. 1) {topic1} 2) {topic2} 3) {topic3}. Location: {place}.", "URGENT: Your {account} password expires on {date}. Click here to reset: {url}.", "Congratulations {name}! You've been selected for our exclusive {event}. Limited spots!", "Thank you for your purchase! Order #{num}. Estimated delivery: {date}.", "{company} presents the {event}. Keynote by {name}. Register at {url}.", "Garage sale this weekend! {place}. {date} {time}. Everything must go!", "Happy Birthday to {name} from all of us at {company}! 🎂", "POOL PARTY! 🏊 Come join us at {place} on {date}. Bring your swimsuit and sunscreen!", "Menu for this week: Monday: {food1}. Tuesday: {food2}. Wednesday: {food3}.", "Building maintenance notice: {reason} on {date}. Please plan accordingly.", "Lost & Found: {item} found near {place}. Contact front desk to claim.", "Fantasy Football League draft is on {date}! Don't forget to submit your picks.", "Book club meeting: We're reading '{book}' by {name}. Discussion on {date}.", "Hey everyone! Movie night at {place} on {date}. We're watching '{movie}'. Bring popcorn!", "Reminder: Staff meeting {date} at {time}. Attendance mandatory. {dept}.", "Lost cat! Orange tabby, answers to '{pet_name}'. Last seen near {place}. Call {phone}.", "HOT DEAL! {item} only ${price}! Limited time offer. Visit {url}.", "Club registration open! Join the {club} club. Meetings every {day} at {time}. {place}.", "Fundraiser bake sale! {date} at {place}. All proceeds go to {charity}.", "Apartment for rent: 2BR/1BA near {place}. ${price}/month. Pet friendly. Call {phone}.", "Yoga class every {day} at {time}. {place}. All levels welcome. Bring your own mat!", "IT Alert: System maintenance scheduled for {date}. Expected downtime: {time}. {dept}.", "Carpool needed! Driving from {place} to {place2} daily. Contact {name} at {email}.", ] POSTER_TEMPLATES = [ "TITLE: {title}\n\nAUTHORS: {authors}\nAFFILIATION: {affil}\n\nINTRODUCTION\n{intro}\n\nMETHODS\n{methods}\n\nRESULTS\n{results}\n\nCONCLUSIONS\n{conclusions}\n\nACKNOWLEDGMENTS\n{ack}", "{title}\n{authors} | {affil}\n\nBackground: {intro}\n\nApproach: {methods}\n\nKey Findings:\n• {finding1}\n• {finding2}\n• {finding3}\n\nFuture Work: {future}\n\nContact: {email}", "POSTER PRESENTATION\n\n{title}\n\n{authors}\n{affil}\n\nObjective: {intro}\n\nDesign: {methods}\n\nOutcome: {results}\n\nConclusion: {conclusions}", "{title}\n\n{authors} ({affil})\n\nAim: {intro}\nMethod: {methods}\nResult: {results}\nSummary: {conclusions}\n\nCorrespondence: {email}", "RESEARCH POSTER\n─────────────────────\n{title}\n{authors}\n{affil}\n\n▸ Background\n{intro}\n\n▸ Methods\n{methods}\n\n▸ Results\n• {finding1}\n• {finding2}\n\n▸ Conclusion\n{conclusions}\n\nFunding: {ack}", ] def _fill_template(template: str) -> str: """Fill a template with random plausible values.""" fillers = { "{event}": random.choice(["Pool Party", "BBQ Bash", "Career Fair", "Fundraiser Gala", "Open House", "Trivia Night"]), "{place}": random.choice(["Room 201", "Hilton Downtown", "the Community Center", "Central Park", "Building B Courtyard", "Main Auditorium"]), "{place2}": random.choice(["Campus North", "Downtown", "Tech Park", "Medical Center"]), "{date}": random.choice(["March 15", "June 22", "Sept 5", "November 10", "January 30", "Friday the 13th"]), "{email}": "info@example.com", "{item}": random.choice(["2019 Honda Civic", "MacBook Pro 16-inch", "Standing Desk", "Mountain Bike", "Vintage Guitar"]), "{price}": str(random.randint(10, 5000)), "{name}": random.choice(["Dr. Smith", "Jane Doe", "Prof. Chen", "Maria Garcia", "Bob Wilson"]), "{phone}": "555-0123", "{company}": random.choice(["TechCorp", "BioGen Inc.", "Global Solutions", "Acme Labs", "DataFlow Systems"]), "{role}": random.choice(["Data Scientist", "Lab Technician", "Project Manager", "Software Engineer"]), "{url}": "https://example.com/apply", "{dept}": random.choice(["HR", "Finance", "Engineering", "Admissions", "IT Support"]), "{reason}": random.choice(["maintenance", "holiday", "training day", "renovation", "fire drill"]), "{time}": random.choice(["2-5 PM", "10 AM - 3 PM", "6-9 PM", "All Day", "Noon"]), "{topic1}": random.choice(["Q3 Review", "Budget Update", "New Hires", "Project Status"]), "{topic2}": random.choice(["Safety Training", "Holiday Schedule", "IT Migration", "Team Building"]), "{topic3}": random.choice(["Parking Changes", "Wellness Program", "Open Q&A"]), "{account}": random.choice(["university", "corporate", "cloud storage"]), "{num}": str(random.randint(10000, 99999)), "{food1}": "Pasta Primavera", "{food2}": "Chicken Tikka", "{food3}": "Fish Tacos", "{book}": random.choice(["1984", "Sapiens", "The Gene", "Thinking, Fast and Slow"]), "{movie}": random.choice(["Inception", "The Matrix", "Interstellar"]), "{pet_name}": random.choice(["Whiskers", "Max", "Luna"]), "{club}": random.choice(["Chess", "Photography", "Hiking", "Debate"]), "{day}": random.choice(["Monday", "Wednesday", "Friday"]), "{charity}": random.choice(["Children's Hospital", "Local Food Bank", "Animal Shelter"]), "{title}": random.choice([ "Effects of Temperature on Enzyme Kinetics in Thermophilic Bacteria", "Deep Learning for Medical Image Segmentation: A Systematic Review", "Novel Biomarkers in Cardiovascular Disease Progression", "Metagenomic Analysis of Coral Reef Microbiomes Under Thermal Stress", "CRISPR-Cas9 Editing Efficiency in Human iPSC-Derived Neurons", ]), "{authors}": random.choice(["A. Smith, B. Jones, C. Lee", "R. Patel, S. Kim, T. Brown", "M. Wang, L. Davis"]), "{affil}": random.choice(["University of Example, Dept. of Science", "MIT, CSAIL", "Stanford School of Medicine"]), "{intro}": random.choice([ "Background text about the research problem being investigated.", "This study addresses the gap in understanding of X in the context of Y.", "Recent advances in Z have highlighted the need for improved W.", ]), "{methods}": random.choice([ "We employed a cross-sectional study design with N=200 participants.", "Samples were collected from 5 sites and processed using standard protocols.", "We developed a convolutional neural network trained on 50K labeled images.", ]), "{results}": random.choice([ "Treatment group showed 45% improvement (p<0.01) compared to control.", "Our model achieved 94.2% accuracy on the held-out test set.", "We identified 23 significantly enriched pathways (FDR < 0.05).", ]), "{conclusions}": random.choice([ "Our findings support the hypothesis that X leads to improved Y.", "These results demonstrate the feasibility of the proposed approach.", "Further validation with larger cohorts is warranted.", ]), "{finding1}": "Significant reduction in error rate (p<0.001)", "{finding2}": "Model outperformed baseline by 15%", "{finding3}": "Robust to distribution shift across domains", "{future}": "Extend to longitudinal datasets and multi-site validation.", "{ack}": random.choice(["Funded by NIH Grant R01-ABC123.", "Supported by NSF Award #1234567."]), } result = template for key, val in fillers.items(): result = result.replace(key, val) return result def generate_synthetic_junk(n: int = 5000) -> List[Dict[str, str]]: """Generate synthetic junk documents.""" samples = [] for _ in range(n): template = random.choice(JUNK_TEMPLATES) text = _fill_template(template) samples.append({"text": text, "label": "junk"}) return samples def generate_synthetic_posters(n: int = 3000) -> List[Dict[str, str]]: """Generate synthetic poster-style documents.""" samples = [] for _ in range(n): template = random.choice(POSTER_TEMPLATES) text = _fill_template(template) samples.append({"text": text, "label": "poster"}) return samples # ── Head 1: doc_type ──────────────────────────────────────────── def prepare_doc_type_dataset( output_dir: Path, n_per_class: int = 15000, ) -> Path: """ Assemble and save document-type training data. Downloads from HuggingFace and combines with synthetic data. Saves as NDJSON: {text, label} """ from datasets import load_dataset output_dir.mkdir(parents=True, exist_ok=True) output_path = output_dir / "doc_type_train.ndjson" all_samples = [] logger.info("=== Preparing doc_type dataset ===") # ── scientific_paper ───────────────────────────────────────── logger.info("Loading armanc/scientific_papers (arxiv split)...") try: ds = load_dataset( "armanc/scientific_papers", "arxiv", split="train", streaming=True, trust_remote_code=True, ) count = 0 for row in ds: if count >= n_per_class: break # Combine abstract + article body for full-text signal abstract = row.get("abstract", "") or "" article = row.get("article", "") or "" text = (abstract + " " + article)[:4000] if len(text.strip()) > 100: all_samples.append({"text": text.strip(), "label": "scientific_paper"}) count += 1 logger.info(f" scientific_paper: {count}") except Exception as e: logger.warning(f"Could not load scientific_papers: {e}") # Fallback logger.info("Falling back to ccdv/arxiv-summarization...") try: ds = load_dataset( "ccdv/arxiv-summarization", split="train", streaming=True, trust_remote_code=True, ) count = 0 for row in ds: if count >= n_per_class: break text = ((row.get("abstract", "") or "") + " " + (row.get("article", "") or ""))[:4000] if len(text.strip()) > 100: all_samples.append({"text": text.strip(), "label": "scientific_paper"}) count += 1 logger.info(f" scientific_paper (fallback): {count}") except Exception as e2: logger.error(f"Fallback also failed: {e2}") # ── abstract_only ──────────────────────────────────────────── logger.info("Loading gfissore/arxiv-abstracts-2021...") try: ds = load_dataset( "gfissore/arxiv-abstracts-2021", split="train", streaming=True, trust_remote_code=True, ) count = 0 for row in ds: if count >= n_per_class: break abstract = row.get("abstract", "") if abstract and 50 < len(abstract) < 600: all_samples.append({"text": abstract.strip(), "label": "abstract_only"}) count += 1 logger.info(f" abstract_only: {count}") except Exception as e: logger.warning(f"Could not load arxiv-abstracts: {e}") # Fallback: extract abstracts from scientific_papers logger.info("Generating abstract_only from scientific_papers abstracts...") try: ds = load_dataset( "armanc/scientific_papers", "arxiv", split="train", streaming=True, trust_remote_code=True, ) count = 0 for row in ds: if count >= n_per_class: break abstract = row.get("abstract", "") if abstract and 50 < len(abstract) < 600: all_samples.append({"text": abstract.strip(), "label": "abstract_only"}) count += 1 logger.info(f" abstract_only (fallback): {count}") except Exception: pass # ── junk (100% real data — ag_news) ──────────────────────────── logger.info("Loading ag_news for junk class (full — no synthetic)...") try: ds = load_dataset( "ag_news", split="train", streaming=True, trust_remote_code=True, ) count = 0 for row in ds: if count >= n_per_class: break text = row.get("text", "") if len(text) > 30: all_samples.append({"text": text.strip(), "label": "junk"}) count += 1 logger.info(f" junk (ag_news): {count}") except Exception as e: logger.warning(f"Could not load ag_news: {e}") # ── poster ──────────────────────────────────────────────────── # NOTE: Real poster text is nearly identical to paper text in # embedding space (both are scientific). PubGuard uses text-only # features, so we need SHORT, STRUCTURED poster-style texts that # the embedding can distinguish from full papers. # # Strategy: synthetic poster templates (structured, short) + # real poster texts TRUNCATED to first 500 chars (title/authors # block, which has distinct formatting from paper introductions). logger.info("Loading poster data (structured templates + real poster headers)...") poster_count = 0 # (a) Synthetic templates — provide distinctive poster structure signal synth_posters = generate_synthetic_posters(min(n_per_class // 2, 7500)) all_samples.extend(synth_posters) poster_count += len(synth_posters) logger.info(f" poster (synthetic templates): {len(synth_posters)}") # (b) Real poster header text (first 500 chars only — title/authors block) real_poster_count = 0 local_poster_data = Path("/home/joneill/pubverse_brett/poster_sentry/poster_texts_for_pubguard.ndjson") if not local_poster_data.exists(): local_poster_data = Path.cwd().parent / "poster_sentry" / "poster_texts_for_pubguard.ndjson" if local_poster_data.exists(): logger.info(f" Adding real poster headers from: {local_poster_data}") with open(local_poster_data) as f: for line in f: if real_poster_count >= n_per_class // 2: break row = json.loads(line) if row.get("label") == "poster": # Truncate to header region (title, authors, affiliations) text = row["text"][:500] if len(text) > 50: all_samples.append({"text": text, "label": "poster"}) real_poster_count += 1 poster_count += real_poster_count logger.info(f" poster (real headers, ≤500 chars): {real_poster_count}") else: # Fill with more synthetic templates if no real data available extra = generate_synthetic_posters(n_per_class // 2) all_samples.extend(extra) poster_count += len(extra) logger.info(f" poster (synthetic fallback): {len(extra)}") logger.info(f" poster total: {poster_count}") # ── Shuffle and save ───────────────────────────────────────── random.shuffle(all_samples) with open(output_path, "w") as f: for sample in all_samples: f.write(json.dumps(sample) + "\n") # Report distribution dist = {} for s in all_samples: dist[s["label"]] = dist.get(s["label"], 0) + 1 logger.info(f"Saved {len(all_samples)} samples to {output_path}") for label, count in sorted(dist.items()): logger.info(f" {label}: {count}") return output_path # ── Head 2: ai_detect ─────────────────────────────────────────── def prepare_ai_detect_dataset( output_dir: Path, n_per_class: int = 20000, ) -> Path: """ Assemble AI-generated text detection training data. Sources (all verified available): - liamdugan/raid: multi-model generations, domain="abstracts" model="human" → human, otherwise → ai_generated - NicolaiSivesind/ChatGPT-Research-Abstracts: real + GPT-3.5 abstracts """ from datasets import load_dataset output_dir.mkdir(parents=True, exist_ok=True) output_path = output_dir / "ai_detect_train.ndjson" human_samples = [] ai_samples = [] logger.info("=== Preparing ai_detect dataset ===") # ── RAID (scientific abstracts domain) ─────────────────────── logger.info("Loading liamdugan/raid (abstracts domain)...") try: ds = load_dataset( "liamdugan/raid", split="train", streaming=True, trust_remote_code=True, ) human_count = 0 ai_count = 0 for row in ds: domain = row.get("domain", "") if domain != "abstracts": continue text = row.get("generation", "") or "" if not text or len(text) < 50: continue model = row.get("model", "") if model == "human": if human_count < n_per_class: human_samples.append({"text": text[:4000], "label": "human"}) human_count += 1 else: if ai_count < n_per_class: ai_samples.append({"text": text[:4000], "label": "ai_generated"}) ai_count += 1 if human_count >= n_per_class and ai_count >= n_per_class: break logger.info(f" RAID: human={human_count}, ai={ai_count}") except Exception as e: logger.warning(f"Could not load RAID: {e}") # ── ChatGPT-Research-Abstracts ─────────────────────────────── logger.info("Loading NicolaiSivesind/ChatGPT-Research-Abstracts...") try: ds = load_dataset( "NicolaiSivesind/ChatGPT-Research-Abstracts", split="train", streaming=True, trust_remote_code=True, ) h_count = 0 a_count = 0 for row in ds: real = row.get("real_abstract", "") generated = row.get("generated_abstract", "") if real and len(real) > 50: human_samples.append({"text": real[:4000], "label": "human"}) h_count += 1 if generated and len(generated) > 50: ai_samples.append({"text": generated[:4000], "label": "ai_generated"}) a_count += 1 logger.info(f" ChatGPT-Abstracts: human={h_count}, ai={a_count}") except Exception as e: logger.warning(f"Could not load ChatGPT-Research-Abstracts: {e}") # ── Balance and save ───────────────────────────────────────── min_count = min(len(human_samples), len(ai_samples), n_per_class) if min_count == 0: logger.error("No AI detection training data available!") # Save empty file with open(output_path, "w") as f: pass return output_path balanced = ( random.sample(human_samples, min(min_count, len(human_samples))) + random.sample(ai_samples, min(min_count, len(ai_samples))) ) random.shuffle(balanced) with open(output_path, "w") as f: for sample in balanced: f.write(json.dumps(sample) + "\n") n_h = sum(1 for s in balanced if s["label"] == "human") n_a = sum(1 for s in balanced if s["label"] == "ai_generated") logger.info(f"Saved {len(balanced)} samples (human={n_h}, ai={n_a}) to {output_path}") return output_path # ── Head 3: toxicity ──────────────────────────────────────────── def prepare_toxicity_dataset( output_dir: Path, n_per_class: int = 20000, ) -> Path: """ Assemble toxicity detection training data. Sources (all verified available without manual download): - google/civil_comments – ~1.8 M comments with toxicity float (0–1) We threshold: toxic >= 0.5, clean < 0.1 - skg/toxigen-data – 274 K annotated statements toxicity_human is a float 1–5; we use >= 4.0 as toxic, <= 2.0 as clean """ from datasets import load_dataset output_dir.mkdir(parents=True, exist_ok=True) output_path = output_dir / "toxicity_train.ndjson" toxic_samples = [] clean_samples = [] logger.info("=== Preparing toxicity dataset ===") # ── Civil Comments ─────────────────────────────────────────── logger.info("Loading google/civil_comments...") try: ds = load_dataset( "google/civil_comments", split="train", streaming=True, trust_remote_code=True, ) toxic_count = 0 clean_count = 0 for row in ds: text = row.get("text", "") if not text or len(text) < 20: continue toxicity = row.get("toxicity", 0.0) if toxicity >= 0.5 and toxic_count < n_per_class: toxic_samples.append({"text": text[:4000], "label": "toxic"}) toxic_count += 1 elif toxicity < 0.1 and clean_count < n_per_class: clean_samples.append({"text": text[:4000], "label": "clean"}) clean_count += 1 if toxic_count >= n_per_class and clean_count >= n_per_class: break logger.info(f" Civil Comments: toxic={toxic_count}, clean={clean_count}") except Exception as e: logger.warning(f"Could not load civil_comments: {e}") # ── ToxiGen ────────────────────────────────────────────────── logger.info("Loading skg/toxigen-data...") try: ds = load_dataset( "skg/toxigen-data", split="train", streaming=True, trust_remote_code=True, ) t_count = 0 c_count = 0 for row in ds: text = row.get("text", "") if not text or len(text) < 20: continue # toxicity_human is 1-5 scale tox_score = row.get("toxicity_human", None) if tox_score is None: continue tox_score = float(tox_score) if tox_score >= 4.0: toxic_samples.append({"text": text[:4000], "label": "toxic"}) t_count += 1 elif tox_score <= 2.0: clean_samples.append({"text": text[:4000], "label": "clean"}) c_count += 1 logger.info(f" ToxiGen: toxic={t_count}, clean={c_count}") except Exception as e: logger.warning(f"Could not load ToxiGen: {e}") # ── Balance and save ───────────────────────────────────────── min_count = min(len(toxic_samples), len(clean_samples), n_per_class) if min_count == 0: logger.error("No toxicity training data available!") with open(output_path, "w") as f: pass return output_path balanced = ( random.sample(toxic_samples, min(min_count, len(toxic_samples))) + random.sample(clean_samples, min(min_count, len(clean_samples))) ) random.shuffle(balanced) with open(output_path, "w") as f: for sample in balanced: f.write(json.dumps(sample) + "\n") n_t = sum(1 for s in balanced if s["label"] == "toxic") n_c = sum(1 for s in balanced if s["label"] == "clean") logger.info(f"Saved {len(balanced)} samples (toxic={n_t}, clean={n_c}) to {output_path}") return output_path # ── Orchestrator ───────────────────────────────────────────────── def prepare_all(output_dir: Path, n_per_class: int = 15000): """Download and prepare all three datasets.""" output_dir = Path(output_dir) logger.info(f"Preparing all datasets in {output_dir}") paths = {} paths["doc_type"] = prepare_doc_type_dataset(output_dir, n_per_class) paths["ai_detect"] = prepare_ai_detect_dataset(output_dir, n_per_class) paths["toxicity"] = prepare_toxicity_dataset(output_dir, n_per_class) logger.info("All datasets prepared!") return paths