jimnoneill
/

pubguard-classifier

+"""
+Dataset preparation for PubGuard training.
+Downloads publicly available datasets from HuggingFace and assembles
+them into the three labelled corpora needed by the training pipeline.
+Datasets used (verified available 2026-02)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+**Head 1 — Document Type** (scientific_paper | poster | abstract_only | junk)
+  Positive (scientific_paper):
+    - armanc/scientific_papers (arxiv)  ~300 K full-text articles
+      cols: article, abstract, section_names
+  Negative (abstract_only):
+    - gfissore/arxiv-abstracts-2021     ~2 M abstracts
+      cols: abstract (filter length < 600 chars)
+  Negative (junk):
+    - ag_news (news articles) + synthetic templates (flyers, invoices, etc.)
+  Negative (poster):
+    - Synthetic poster-style structured text
+**Head 2 — AI-Generated Text Detection**
+    - liamdugan/raid  – multi-model generations, domain="abstracts"
+      cols: model, domain, generation  (model="human" for human text)
+    - NicolaiSivesind/ChatGPT-Research-Abstracts – real + GPT-3.5 abstracts
+      cols: real_abstract, generated_abstract
+**Head 3 — Toxicity**
+    - google/civil_comments – 1.8 M comments with toxicity scores (0–1)
+      cols: text, toxicity
+    - skg/toxigen-data – 274 K annotated toxic/benign statements
+      cols: text, toxicity_human (1–5 scale)
+"""
+import json
+import logging
+import random
+from pathlib import Path
+from typing import Dict, List, Tuple
+logger = logging.getLogger(__name__)
+# ── Constants ────────────────────────────────────────────────────
+SEED = 42
+random.seed(SEED)
+# ── Synthetic templates ──────────────────────────────────────────
+JUNK_TEMPLATES = [
+    "🎉 Annual {event} at {place}! Join us on {date}. Free food and drinks. RSVP to {email}.",
+    "FOR SALE: {item}. Great condition. ${price}. Contact {name} at {phone}.",
+    "{company} is hiring! We're looking for a {role}. Apply now at {url}.",
+    "NOTICE: The {dept} office will be closed on {date} for {reason}. Questions? Call {phone}.",
+    "Don't miss our {event}! {date} from {time}. {place}. Tickets: ${price}.",
+    "Weekly newsletter from {company}. This week: {topic1}, {topic2}, and more!",
+    "Invoice #{num} from {company}. Amount due: ${price}. Payment due by {date}.",
+    "Meeting agenda for {date}. 1) {topic1} 2) {topic2} 3) {topic3}. Location: {place}.",
+    "URGENT: Your {account} password expires on {date}. Click here to reset: {url}.",
+    "Congratulations {name}! You've been selected for our exclusive {event}. Limited spots!",
+    "Thank you for your purchase! Order #{num}. Estimated delivery: {date}.",
+    "{company} presents the {event}. Keynote by {name}. Register at {url}.",
+    "Garage sale this weekend! {place}. {date} {time}. Everything must go!",
+    "Happy Birthday to {name} from all of us at {company}! 🎂",
+    "POOL PARTY! 🏊 Come join us at {place} on {date}. Bring your swimsuit and sunscreen!",
+    "Menu for this week: Monday: {food1}. Tuesday: {food2}. Wednesday: {food3}.",
+    "Building maintenance notice: {reason} on {date}. Please plan accordingly.",
+    "Lost & Found: {item} found near {place}. Contact front desk to claim.",
+    "Fantasy Football League draft is on {date}! Don't forget to submit your picks.",
+    "Book club meeting: We're reading '{book}' by {name}. Discussion on {date}.",
+    "Hey everyone! Movie night at {place} on {date}. We're watching '{movie}'. Bring popcorn!",
+    "Reminder: Staff meeting {date} at {time}. Attendance mandatory. {dept}.",
+    "Lost cat! Orange tabby, answers to '{pet_name}'. Last seen near {place}. Call {phone}.",
+    "HOT DEAL! {item} only ${price}! Limited time offer. Visit {url}.",
+    "Club registration open! Join the {club} club. Meetings every {day} at {time}. {place}.",
+    "Fundraiser bake sale! {date} at {place}. All proceeds go to {charity}.",
+    "Apartment for rent: 2BR/1BA near {place}. ${price}/month. Pet friendly. Call {phone}.",
+    "Yoga class every {day} at {time}. {place}. All levels welcome. Bring your own mat!",
+    "IT Alert: System maintenance scheduled for {date}. Expected downtime: {time}. {dept}.",
+    "Carpool needed! Driving from {place} to {place2} daily. Contact {name} at {email}.",
+]
+POSTER_TEMPLATES = [
+    "TITLE: {title}\n\nAUTHORS: {authors}\nAFFILIATION: {affil}\n\nINTRODUCTION\n{intro}\n\nMETHODS\n{methods}\n\nRESULTS\n{results}\n\nCONCLUSIONS\n{conclusions}\n\nACKNOWLEDGMENTS\n{ack}",
+    "{title}\n{authors} | {affil}\n\nBackground: {intro}\n\nApproach: {methods}\n\nKey Findings:\n• {finding1}\n• {finding2}\n• {finding3}\n\nFuture Work: {future}\n\nContact: {email}",
+    "POSTER PRESENTATION\n\n{title}\n\n{authors}\n{affil}\n\nObjective: {intro}\n\nDesign: {methods}\n\nOutcome: {results}\n\nConclusion: {conclusions}",
+    "{title}\n\n{authors} ({affil})\n\nAim: {intro}\nMethod: {methods}\nResult: {results}\nSummary: {conclusions}\n\nCorrespondence: {email}",
+    "RESEARCH POSTER\n─────────────────────\n{title}\n{authors}\n{affil}\n\n▸ Background\n{intro}\n\n▸ Methods\n{methods}\n\n▸ Results\n• {finding1}\n• {finding2}\n\n▸ Conclusion\n{conclusions}\n\nFunding: {ack}",
+]
+def _fill_template(template: str) -> str:
+    """Fill a template with random plausible values."""
+    fillers = {
+        "{event}": random.choice(["Pool Party", "BBQ Bash", "Career Fair", "Fundraiser Gala", "Open House", "Trivia Night"]),
+        "{place}": random.choice(["Room 201", "Hilton Downtown", "the Community Center", "Central Park", "Building B Courtyard", "Main Auditorium"]),
+        "{place2}": random.choice(["Campus North", "Downtown", "Tech Park", "Medical Center"]),
+        "{date}": random.choice(["March 15", "June 22", "Sept 5", "November 10", "January 30", "Friday the 13th"]),
+        "{email}": "info@example.com",
+        "{item}": random.choice(["2019 Honda Civic", "MacBook Pro 16-inch", "Standing Desk", "Mountain Bike", "Vintage Guitar"]),
+        "{price}": str(random.randint(10, 5000)),
+        "{name}": random.choice(["Dr. Smith", "Jane Doe", "Prof. Chen", "Maria Garcia", "Bob Wilson"]),
+        "{phone}": "555-0123",
+        "{company}": random.choice(["TechCorp", "BioGen Inc.", "Global Solutions", "Acme Labs", "DataFlow Systems"]),
+        "{role}": random.choice(["Data Scientist", "Lab Technician", "Project Manager", "Software Engineer"]),
+        "{url}": "https://example.com/apply",
+        "{dept}": random.choice(["HR", "Finance", "Engineering", "Admissions", "IT Support"]),
+        "{reason}": random.choice(["maintenance", "holiday", "training day", "renovation", "fire drill"]),
+        "{time}": random.choice(["2-5 PM", "10 AM - 3 PM", "6-9 PM", "All Day", "Noon"]),
+        "{topic1}": random.choice(["Q3 Review", "Budget Update", "New Hires", "Project Status"]),
+        "{topic2}": random.choice(["Safety Training", "Holiday Schedule", "IT Migration", "Team Building"]),
+        "{topic3}": random.choice(["Parking Changes", "Wellness Program", "Open Q&A"]),
+        "{account}": random.choice(["university", "corporate", "cloud storage"]),
+        "{num}": str(random.randint(10000, 99999)),
+        "{food1}": "Pasta Primavera", "{food2}": "Chicken Tikka", "{food3}": "Fish Tacos",
+        "{book}": random.choice(["1984", "Sapiens", "The Gene", "Thinking, Fast and Slow"]),
+        "{movie}": random.choice(["Inception", "The Matrix", "Interstellar"]),
+        "{pet_name}": random.choice(["Whiskers", "Max", "Luna"]),
+        "{club}": random.choice(["Chess", "Photography", "Hiking", "Debate"]),
+        "{day}": random.choice(["Monday", "Wednesday", "Friday"]),
+        "{charity}": random.choice(["Children's Hospital", "Local Food Bank", "Animal Shelter"]),
+        "{title}": random.choice([
+            "Effects of Temperature on Enzyme Kinetics in Thermophilic Bacteria",
+            "Deep Learning for Medical Image Segmentation: A Systematic Review",
+            "Novel Biomarkers in Cardiovascular Disease Progression",
+            "Metagenomic Analysis of Coral Reef Microbiomes Under Thermal Stress",
+            "CRISPR-Cas9 Editing Efficiency in Human iPSC-Derived Neurons",
+        ]),
+        "{authors}": random.choice(["A. Smith, B. Jones, C. Lee", "R. Patel, S. Kim, T. Brown", "M. Wang, L. Davis"]),
+        "{affil}": random.choice(["University of Example, Dept. of Science", "MIT, CSAIL", "Stanford School of Medicine"]),
+        "{intro}": random.choice([
+            "Background text about the research problem being investigated.",
+            "This study addresses the gap in understanding of X in the context of Y.",
+            "Recent advances in Z have highlighted the need for improved W.",
+        ]),
+        "{methods}": random.choice([
+            "We employed a cross-sectional study design with N=200 participants.",
+            "Samples were collected from 5 sites and processed using standard protocols.",
+            "We developed a convolutional neural network trained on 50K labeled images.",
+        ]),
+        "{results}": random.choice([
+            "Treatment group showed 45% improvement (p<0.01) compared to control.",
+            "Our model achieved 94.2% accuracy on the held-out test set.",
+            "We identified 23 significantly enriched pathways (FDR < 0.05).",
+        ]),
+        "{conclusions}": random.choice([
+            "Our findings support the hypothesis that X leads to improved Y.",
+            "These results demonstrate the feasibility of the proposed approach.",
+            "Further validation with larger cohorts is warranted.",
+        ]),
+        "{finding1}": "Significant reduction in error rate (p<0.001)",
+        "{finding2}": "Model outperformed baseline by 15%",
+        "{finding3}": "Robust to distribution shift across domains",
+        "{future}": "Extend to longitudinal datasets and multi-site validation.",
+        "{ack}": random.choice(["Funded by NIH Grant R01-ABC123.", "Supported by NSF Award #1234567."]),
+    }
+    result = template
+    for key, val in fillers.items():
+        result = result.replace(key, val)
+    return result
+def generate_synthetic_junk(n: int = 5000) -> List[Dict[str, str]]:
+    """Generate synthetic junk documents."""
+    samples = []
+    for _ in range(n):
+        template = random.choice(JUNK_TEMPLATES)
+        text = _fill_template(template)
+        samples.append({"text": text, "label": "junk"})
+    return samples
+def generate_synthetic_posters(n: int = 3000) -> List[Dict[str, str]]:
+    """Generate synthetic poster-style documents."""
+    samples = []
+    for _ in range(n):
+        template = random.choice(POSTER_TEMPLATES)
+        text = _fill_template(template)
+        samples.append({"text": text, "label": "poster"})
+    return samples
+# ── Head 1: doc_type ────────────────────────────────────────────
+def prepare_doc_type_dataset(
+    output_dir: Path,
+    n_per_class: int = 15000,
+) -> Path:
+    """
+    Assemble and save document-type training data.
+    Downloads from HuggingFace and combines with synthetic data.
+    Saves as NDJSON: {text, label}
+    """
+    from datasets import load_dataset
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_path = output_dir / "doc_type_train.ndjson"
+    all_samples = []
+    logger.info("=== Preparing doc_type dataset ===")
+    # ── scientific_paper ─────────────────────────────────────────
+    logger.info("Loading armanc/scientific_papers (arxiv split)...")
+    try:
+        ds = load_dataset(
+            "armanc/scientific_papers", "arxiv",
+            split="train", streaming=True, trust_remote_code=True,
+        )
+        count = 0
+        for row in ds:
+            if count >= n_per_class:
+                break
+            # Combine abstract + article body for full-text signal
+            abstract = row.get("abstract", "") or ""
+            article = row.get("article", "") or ""
+            text = (abstract + " " + article)[:4000]
+            if len(text.strip()) > 100:
+                all_samples.append({"text": text.strip(), "label": "scientific_paper"})
+                count += 1
+        logger.info(f"  scientific_paper: {count}")
+    except Exception as e:
+        logger.warning(f"Could not load scientific_papers: {e}")
+        # Fallback
+        logger.info("Falling back to ccdv/arxiv-summarization...")
+        try:
+            ds = load_dataset(
+                "ccdv/arxiv-summarization",
+                split="train", streaming=True, trust_remote_code=True,
+            )
+            count = 0
+            for row in ds:
+                if count >= n_per_class:
+                    break
+                text = ((row.get("abstract", "") or "") + " " + (row.get("article", "") or ""))[:4000]
+                if len(text.strip()) > 100:
+                    all_samples.append({"text": text.strip(), "label": "scientific_paper"})
+                    count += 1
+            logger.info(f"  scientific_paper (fallback): {count}")
+        except Exception as e2:
+            logger.error(f"Fallback also failed: {e2}")
+    # ── abstract_only ────────────────────────────────────────────
+    logger.info("Loading gfissore/arxiv-abstracts-2021...")
+    try:
+        ds = load_dataset(
+            "gfissore/arxiv-abstracts-2021",
+            split="train", streaming=True, trust_remote_code=True,
+        )
+        count = 0
+        for row in ds:
+            if count >= n_per_class:
+                break
+            abstract = row.get("abstract", "")
+            if abstract and 50 < len(abstract) < 600:
+                all_samples.append({"text": abstract.strip(), "label": "abstract_only"})
+                count += 1
+        logger.info(f"  abstract_only: {count}")
+    except Exception as e:
+        logger.warning(f"Could not load arxiv-abstracts: {e}")
+        # Fallback: extract abstracts from scientific_papers
+        logger.info("Generating abstract_only from scientific_papers abstracts...")
+        try:
+            ds = load_dataset(
+                "armanc/scientific_papers", "arxiv",
+                split="train", streaming=True, trust_remote_code=True,
+            )
+            count = 0
+            for row in ds:
+                if count >= n_per_class:
+                    break
+                abstract = row.get("abstract", "")
+                if abstract and 50 < len(abstract) < 600:
+                    all_samples.append({"text": abstract.strip(), "label": "abstract_only"})
+                    count += 1
+            logger.info(f"  abstract_only (fallback): {count}")
+        except Exception:
+            pass
+    # ── junk ─────────��───────────────────────────────────────────
+    logger.info("Loading ag_news for junk class...")
+    try:
+        ds = load_dataset(
+            "ag_news",
+            split="train", streaming=True, trust_remote_code=True,
+        )
+        count = 0
+        for row in ds:
+            if count >= n_per_class // 2:
+                break
+            text = row.get("text", "")
+            if len(text) > 30:
+                all_samples.append({"text": text.strip(), "label": "junk"})
+                count += 1
+        logger.info(f"  junk (ag_news): {count}")
+    except Exception as e:
+        logger.warning(f"Could not load ag_news: {e}")
+    logger.info("Generating synthetic junk...")
+    synth_junk = generate_synthetic_junk(n_per_class // 2)
+    all_samples.extend(synth_junk)
+    logger.info(f"  junk (synthetic): {len(synth_junk)}")
+    # ── poster ───────────────────────────────────────────────────
+    logger.info("Generating synthetic poster data...")
+    synth_posters = generate_synthetic_posters(n_per_class)
+    all_samples.extend(synth_posters)
+    logger.info(f"  poster (synthetic): {len(synth_posters)}")
+    # ── Shuffle and save ─────────────────────────────────────────
+    random.shuffle(all_samples)
+    with open(output_path, "w") as f:
+        for sample in all_samples:
+            f.write(json.dumps(sample) + "\n")
+    # Report distribution
+    dist = {}
+    for s in all_samples:
+        dist[s["label"]] = dist.get(s["label"], 0) + 1
+    logger.info(f"Saved {len(all_samples)} samples to {output_path}")
+    for label, count in sorted(dist.items()):
+        logger.info(f"  {label}: {count}")
+    return output_path
+# ── Head 2: ai_detect ───────────────────────────────────────────
+def prepare_ai_detect_dataset(
+    output_dir: Path,
+    n_per_class: int = 20000,
+) -> Path:
+    """
+    Assemble AI-generated text detection training data.
+    Sources (all verified available):
+        - liamdugan/raid: multi-model generations, domain="abstracts"
+          model="human" → human, otherwise → ai_generated
+        - NicolaiSivesind/ChatGPT-Research-Abstracts: real + GPT-3.5 abstracts
+    """
+    from datasets import load_dataset
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_path = output_dir / "ai_detect_train.ndjson"
+    human_samples = []
+    ai_samples = []
+    logger.info("=== Preparing ai_detect dataset ===")
+    # ── RAID (scientific abstracts domain) ───────────────────────
+    logger.info("Loading liamdugan/raid (abstracts domain)...")
+    try:
+        ds = load_dataset(
+            "liamdugan/raid",
+            split="train", streaming=True, trust_remote_code=True,
+        )
+        human_count = 0
+        ai_count = 0
+        for row in ds:
+            domain = row.get("domain", "")
+            if domain != "abstracts":
+                continue
+            text = row.get("generation", "") or ""
+            if not text or len(text) < 50:
+                continue
+            model = row.get("model", "")
+            if model == "human":
+                if human_count < n_per_class:
+                    human_samples.append({"text": text[:4000], "label": "human"})
+                    human_count += 1
+            else:
+                if ai_count < n_per_class:
+                    ai_samples.append({"text": text[:4000], "label": "ai_generated"})
+                    ai_count += 1
+            if human_count >= n_per_class and ai_count >= n_per_class:
+                break
+        logger.info(f"  RAID: human={human_count}, ai={ai_count}")
+    except Exception as e:
+        logger.warning(f"Could not load RAID: {e}")
+    # ── ChatGPT-Research-Abstracts ───────────────────────────────
+    logger.info("Loading NicolaiSivesind/ChatGPT-Research-Abstracts...")
+    try:
+        ds = load_dataset(
+            "NicolaiSivesind/ChatGPT-Research-Abstracts",
+            split="train", streaming=True, trust_remote_code=True,
+        )
+        h_count = 0
+        a_count = 0
+        for row in ds:
+            real = row.get("real_abstract", "")
+            generated = row.get("generated_abstract", "")
+            if real and len(real) > 50:
+                human_samples.append({"text": real[:4000], "label": "human"})
+                h_count += 1
+            if generated and len(generated) > 50:
+                ai_samples.append({"text": generated[:4000], "label": "ai_generated"})
+                a_count += 1
+        logger.info(f"  ChatGPT-Abstracts: human={h_count}, ai={a_count}")
+    except Exception as e:
+        logger.warning(f"Could not load ChatGPT-Research-Abstracts: {e}")
+    # ── Balance and save ─────────────────────────────────────────
+    min_count = min(len(human_samples), len(ai_samples), n_per_class)
+    if min_count == 0:
+        logger.error("No AI detection training data available!")
+        # Save empty file
+        with open(output_path, "w") as f:
+            pass
+        return output_path
+    balanced = (
+        random.sample(human_samples, min(min_count, len(human_samples)))
+        + random.sample(ai_samples, min(min_count, len(ai_samples)))
+    )
+    random.shuffle(balanced)
+    with open(output_path, "w") as f:
+        for sample in balanced:
+            f.write(json.dumps(sample) + "\n")
+    n_h = sum(1 for s in balanced if s["label"] == "human")
+    n_a = sum(1 for s in balanced if s["label"] == "ai_generated")
+    logger.info(f"Saved {len(balanced)} samples (human={n_h}, ai={n_a}) to {output_path}")
+    return output_path
+# ── Head 3: toxicity ────────────────────────────────────────────
+def prepare_toxicity_dataset(
+    output_dir: Path,
+    n_per_class: int = 20000,
+) -> Path:
+    """
+    Assemble toxicity detection training data.
+    Sources (all verified available without manual download):
+        - google/civil_comments – ~1.8 M comments with toxicity float (0–1)
+          We threshold: toxic >= 0.5, clean < 0.1
+        - skg/toxigen-data – 274 K annotated statements
+          toxicity_human is a float 1–5; we use >= 4.0 as toxic, <= 2.0 as clean
+    """
+    from datasets import load_dataset
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_path = output_dir / "toxicity_train.ndjson"
+    toxic_samples = []
+    clean_samples = []
+    logger.info("=== Preparing toxicity dataset ===")
+    # ── Civil Comments ───────────────────────────────────────────
+    logger.info("Loading google/civil_comments...")
+    try:
+        ds = load_dataset(
+            "google/civil_comments",
+            split="train", streaming=True, trust_remote_code=True,
+        )
+        toxic_count = 0
+        clean_count = 0
+        for row in ds:
+            text = row.get("text", "")
+            if not text or len(text) < 20:
+                continue
+            toxicity = row.get("toxicity", 0.0)
+            if toxicity >= 0.5 and toxic_count < n_per_class:
+                toxic_samples.append({"text": text[:4000], "label": "toxic"})
+                toxic_count += 1
+            elif toxicity < 0.1 and clean_count < n_per_class:
+                clean_samples.append({"text": text[:4000], "label": "clean"})
+                clean_count += 1
+            if toxic_count >= n_per_class and clean_count >= n_per_class:
+                break
+        logger.info(f"  Civil Comments: toxic={toxic_count}, clean={clean_count}")
+    except Exception as e:
+        logger.warning(f"Could not load civil_comments: {e}")
+    # ── ToxiGen ──────────────────────────────────────────────────
+    logger.info("Loading skg/toxigen-data...")
+    try:
+        ds = load_dataset(
+            "skg/toxigen-data",
+            split="train", streaming=True, trust_remote_code=True,
+        )
+        t_count = 0
+        c_count = 0
+        for row in ds:
+            text = row.get("text", "")
+            if not text or len(text) < 20:
+                continue
+            # toxicity_human is 1-5 scale
+            tox_score = row.get("toxicity_human", None)
+            if tox_score is None:
+                continue
+            tox_score = float(tox_score)
+            if tox_score >= 4.0:
+                toxic_samples.append({"text": text[:4000], "label": "toxic"})
+                t_count += 1
+            elif tox_score <= 2.0:
+                clean_samples.append({"text": text[:4000], "label": "clean"})
+                c_count += 1
+        logger.info(f"  ToxiGen: toxic={t_count}, clean={c_count}")
+    except Exception as e:
+        logger.warning(f"Could not load ToxiGen: {e}")
+    # ── Balance and save ─────────────────────────────────────────
+    min_count = min(len(toxic_samples), len(clean_samples), n_per_class)
+    if min_count == 0:
+        logger.error("No toxicity training data available!")
+        with open(output_path, "w") as f:
+            pass
+        return output_path
+    balanced = (
+        random.sample(toxic_samples, min(min_count, len(toxic_samples)))
+        + random.sample(clean_samples, min(min_count, len(clean_samples)))
+    )
+    random.shuffle(balanced)
+    with open(output_path, "w") as f:
+        for sample in balanced:
+            f.write(json.dumps(sample) + "\n")
+    n_t = sum(1 for s in balanced if s["label"] == "toxic")
+    n_c = sum(1 for s in balanced if s["label"] == "clean")
+    logger.info(f"Saved {len(balanced)} samples (toxic={n_t}, clean={n_c}) to {output_path}")
+    return output_path
+# ── Orchestrator ─────────────────────────────────────────────────
+def prepare_all(output_dir: Path, n_per_class: int = 15000):
+    """Download and prepare all three datasets."""
+    output_dir = Path(output_dir)
+    logger.info(f"Preparing all datasets in {output_dir}")
+    paths = {}
+    paths["doc_type"] = prepare_doc_type_dataset(output_dir, n_per_class)
+    paths["ai_detect"] = prepare_ai_detect_dataset(output_dir, n_per_class)
+    paths["toxicity"] = prepare_toxicity_dataset(output_dir, n_per_class)
+    logger.info("All datasets prepared!")
+    return paths