""" DiffusionGemma Humanizer — SOTA Text Humanization Pipeline =========================================================== Evaluate DiffusionGemma 26B (MoE, 3.8B active) for AI text humanization: - Generate baseline text from DiffusionGemma - Test against open-source AI detectors (GPT-2 based) - Humanize via prompt engineering + decoder_input_ids - Evaluate detection evasion rates - Export results + model to Hugging Face Architecture: Encoder: processes prompt → KV cache Decoder: bidirectional diffusion denoising on 256-token canvases Sampler: Entropy-Bounded Denoising (1-48 steps, temperature 0.8→0.4) Key findings: - PEFT/LoRA NOT compatible with DiffusionGemma (model too new — 20 days) - BUT: base model already achieves 0% AI detection flags - Humanization via decoder_input_ids + prompt engineering works - Nothing stored locally — everything on Modal + Hugging Face Hard constraint: SINGLE A100 80GB. Nothing on local PC. """ import modal import os import json import re import random from datetime import datetime # ═══════════════════════════════════════════════════════════════════ # MODAL INFRASTRUCTURE # ═══════════════════════════════════════════════════════════════════ app = modal.App("diffusiongemma-humanizer") volume = modal.Volume.from_name("diffusiongemma-volume", create_if_missing=True) hf_cache = modal.Volume.from_name("huggingface-cache", create_if_missing=True) image = ( modal.Image.debian_slim(python_version="3.12") .apt_install("git", "curl", "build-essential") .pip_install( "torch>=2.5.0", "torchvision", "transformers>=4.53.0", "accelerate>=1.0.0", "peft>=0.14.0", "bitsandbytes>=0.45.0", "datasets>=3.0.0", "huggingface_hub>=0.28.0", "sentencepiece", "protobuf", "pillow", "requests", "tqdm", "numpy", "scipy", ) .env({ "HF_XET_HIGH_PERFORMANCE": "1", "TOKENIZERS_PARALLELISM": "false", "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", }) ) DATA_DIR = "/data" OUTPUT_DIR = "/data/output" HF_CACHE_DIR = "/cache" MODEL_ID = 
"google/diffusiongemma-26B-A4B-it" CANVAS_LENGTH = 256 PAD_TOKEN_ID = 0 EOS_TOKEN_ID = 1 def log(msg: str): print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}") def now_str() -> str: return datetime.now().strftime("%Y%m%d-%H%M%S") # ═══════════════════════════════════════════════════════════════════ # DETECTOR FUNCTIONS # ═══════════════════════════════════════════════════════════════════ def compute_perplexity(text, model, tokenizer): import torch, numpy as np enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024) input_ids = enc.input_ids.to(model.device) with torch.no_grad(): outputs = model(input_ids, labels=input_ids) ppl = torch.exp(outputs.loss).item() return ppl def compute_burstiness(text, model, tokenizer): import torch, numpy as np sentences = re.split(r'[.!?]+', text) sentences = [s.strip() for s in sentences if len(s.strip().split()) > 3] if len(sentences) < 2: return 0.0 perplexities = [] for sent in sentences[:20]: try: enc = tokenizer(sent, return_tensors="pt", truncation=True, max_length=256) input_ids = enc.input_ids.to(model.device) with torch.no_grad(): outputs = model(input_ids, labels=input_ids) ppl = torch.exp(outputs.loss).item() perplexities.append(ppl) except Exception: continue if len(perplexities) < 2: return 0.0 return float(np.std(perplexities) / np.mean(perplexities)) if np.mean(perplexities) > 0 else 0.0 def compute_fast_detectgpt(text, model, tokenizer): import torch, torch.nn.functional as F, numpy as np enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=512) input_ids = enc.input_ids.to(model.device) with torch.no_grad(): outputs = model(input_ids) logits = outputs.logits log_probs = F.log_softmax(logits, dim=-1) target_ids = input_ids[0, 1:] actual_log_probs = log_probs[0, :-1, :].gather(-1, target_ids.unsqueeze(-1)).squeeze(-1) mean_lp = actual_log_probs.mean().item() score = 1.0 / (1.0 + np.exp(-mean_lp * 3)) return { "score": round(float(score), 4), "mean_log_prob": 
round(float(mean_lp), 4), "classification": "AI" if score > 0.5 else "Human", } def compute_text_statistics(text): import numpy as np from collections import Counter sentences = re.split(r'[.!?]+', text) sentences = [s.strip() for s in sentences if len(s.strip()) > 1] sent_lengths = [len(s.split()) for s in sentences] words = re.findall(r'\b\w+\b', text.lower()) word_freq = Counter(words) total_words = len(words) unique_words = len(word_freq) hapax = sum(1 for w, c in word_freq.items() if c == 1) hapax_ratio = hapax / total_words if total_words > 0 else 0 word_lengths = [len(w) for w in words] transitions = [ 'furthermore', 'moreover', 'however', 'therefore', 'consequently', 'additionally', 'in conclusion', 'nevertheless', 'nonetheless', 'in summary', 'it is important to note', 'in addition', 'notably', 'thus', 'hence', 'accordingly', 'subsequently', ] transition_count = sum(text.lower().count(t) for t in transitions) passive_indicators = [ 'is known', 'are known', 'was found', 'were found', 'is considered', 'are considered', 'has been', 'have been', 'is believed', 'are believed', 'was observed', 'were observed', 'is expected', 'are expected', 'was reported', 'were reported', ] passive_count = sum(text.lower().count(p) for p in passive_indicators) return { "sentence_count": len(sentences), "sentence_length_mean": round(float(np.mean(sent_lengths)), 1) if sent_lengths else 0, "sentence_length_std": round(float(np.std(sent_lengths)), 1) if sent_lengths else 0, "total_words": total_words, "unique_words": unique_words, "lexical_diversity": round(unique_words / total_words, 3) if total_words > 0 else 0, "hapax_legomena": hapax, "hapax_ratio": round(hapax_ratio, 3), "avg_word_length": round(float(np.mean(word_lengths)), 1) if word_lengths else 0, "word_length_std": round(float(np.std(word_lengths)), 1) if word_lengths else 0, "transition_markers": transition_count, "transition_rate_per_100w": round(transition_count / (total_words / 100), 1) if total_words > 0 else 0, 
"passive_constructions": passive_count, "passive_rate_per_100w": round(passive_count / (total_words / 100), 1) if total_words > 0 else 0, } def compute_heuristic_detection(perplexity, burstiness, stats): import numpy as np signals = [] # Perplexity if perplexity and perplexity < 15: signals.append(0.85) elif perplexity and perplexity < 25: signals.append(0.65) elif perplexity and perplexity < 40: signals.append(0.45) elif perplexity: signals.append(0.25) else: signals.append(0.50) # Burstiness if burstiness is not None and burstiness < 0.12: signals.append(0.75) elif burstiness is not None and burstiness < 0.20: signals.append(0.55) elif burstiness is not None and burstiness < 0.30: signals.append(0.40) elif burstiness is not None: signals.append(0.25) else: signals.append(0.50) # Sentence variation sent_std = stats.get("sentence_length_std", 0) if sent_std < 4: signals.append(0.75) elif sent_std < 7: signals.append(0.55) elif sent_std < 10: signals.append(0.35) else: signals.append(0.20) # Transitions tr = stats.get("transition_rate_per_100w", 0) if tr > 2.5: signals.append(0.75) elif tr > 1.5: signals.append(0.55) elif tr > 0.5: signals.append(0.40) else: signals.append(0.20) # Passive voice pr = stats.get("passive_rate_per_100w", 0) if pr > 2.0: signals.append(0.70) elif pr > 1.0: signals.append(0.50) elif pr > 0.3: signals.append(0.35) else: signals.append(0.25) # Hapax ratio hapax = stats.get("hapax_ratio", 0) if hapax < 0.38: signals.append(0.70) elif hapax < 0.45: signals.append(0.50) elif hapax < 0.52: signals.append(0.35) else: signals.append(0.20) ai_probability = float(np.mean(signals)) if ai_probability >= 0.60: classification = "AI" elif ai_probability <= 0.40: classification = "Human" else: classification = "Uncertain" return { "ai_probability": round(ai_probability, 4), "classification": classification, } # ═══════════════════════════════════════════════════════════════════ # DATA AUGMENTATION # 
# ═══════════════════════════════════════════════════════════════════

# Named text rewrites used to synthesise "humanized" variants of a text.
# Each value maps a string to a string; several draw from the module-level
# `random` generator, so output is only reproducible after `random.seed(...)`.
HUMANIZATION_TRANSFORMS = {
    # Despite the name this never splits anything: it sometimes inserts a
    # discourse marker at a sentence boundary.
    "split_sentences": lambda t: re.sub(
        r'(?<=[a-z])\. (?=[A-Z])',
        lambda m: random.choice(['. ', '. Actually, ', '. Honestly, ']),
        t,
    ),
    # Joins the first one or two sentence boundaries with ", and".
    "merge_sentences": lambda t: re.sub(
        r'\. ([A-Z])',
        lambda m: f', and {m.group(1).lower()}',
        t,
        count=random.randint(1, 2),
    ),
    "add_hedging": lambda t: (
        t.replace(" is ", " tends to be ")
        .replace(" are ", " can be ")
        .replace(" will ", " is likely to ")
        .replace(" must ", " should generally ")
    ),
    "contractions": lambda t: (
        t.replace(" is not ", " isn't ")
        .replace(" does not ", " doesn't ")
        .replace(" will not ", " won't ")
        .replace(" cannot ", " can't ")
        .replace(" it is ", " it's ")
        .replace(" that is ", " that's ")
    ),
    "informal_transitions": lambda t: (
        t.replace("Furthermore", random.choice(["Plus", "Also", "On top of that"]))
        .replace("However", random.choice(["But", "That said", "Though"]))
        .replace("Therefore", random.choice(["So", "That means"]))
        .replace("Additionally", random.choice(["Also", "Plus"]))
    ),
    # Loosens formal passive phrasings while keeping agent and patient in
    # place.  The earlier replacements ("is used by" -> "uses",
    # "was developed by" -> "developed") swapped who did what, and
    # "has been shown to" -> "shows" left ungrammatical text.  A real
    # passive-to-active rewrite needs parsing, which plain string replacement
    # cannot provide.
    "active_voice": lambda t: (
        t.replace("was developed by", "came from")
        .replace("is used by", "gets used by")
        .replace("has been shown to", "turns out to")
    ),
    "sentence_start_variation": lambda t: re.sub(
        r'^(The|This|It|There) ',
        lambda m: random.choice([
            m.group(0),
            "Generally, " + m.group(0).lower(),
            "In many cases, " + m.group(0).lower(),
        ]),
        t,
        flags=re.MULTILINE,
    ),
    # Appends a personal remark on roughly 40% of calls.
    "add_personal_touch": lambda t: t + random.choice([
        " Honestly, that's just my take on it.",
        " At least, that's what I've seen.",
        " That's the gist of it, anyway.",
    ]) if random.random() > 0.6 else t,
}


def apply_humanization_transforms(text, num_ops=None):
    """Apply a random selection of ``HUMANIZATION_TRANSFORMS`` to *text*.

    Args:
        text: The text to rewrite.
        num_ops: How many distinct transforms to apply.  ``None`` picks a
            random count from 2 to 5.  Values are clamped to the range
            ``0..len(HUMANIZATION_TRANSFORMS)``; a negative value used to
            raise ``ValueError`` inside ``random.sample``.

    Returns:
        The rewritten text.  It may equal *text* when no transform matched.
    """
    if num_ops is None:
        num_ops = random.randint(2, 5)
    sample_size = max(0, min(num_ops, len(HUMANIZATION_TRANSFORMS)))
    chosen = random.sample(list(HUMANIZATION_TRANSFORMS.values()), sample_size)

    result = text
    for transform in chosen:
        try:
            result = transform(result)
        except Exception:
            # Deliberate best-effort: a transform that fails on this input is
            # skipped and the remaining ones still run.
            continue
    return result


# ═══════════════════════════════════════════════════════════════════
# MAIN PIPELINE
# ═══════════════════════════════════════════════════════════════════

@app.function(
    image=image,
    gpu="A100-80GB",
    volumes={DATA_DIR: volume, HF_CACHE_DIR: hf_cache},
    secrets=[modal.Secret.from_name("hf-secrets")],
    timeout=21600,
    scaledown_window=600,
)
def run_full_pipeline(hf_token: str = None):
    """Complete DiffusionGemma humanizer pipeline on single A100 80GB.

    Steps: 1) Load + baseline  2) Detector tests  3) Dataset
           4) Training skipped  5) Humanization eval  6) Export to HF

    Args:
        hf_token: Hugging Face token; falls back to the ``HF_TOKEN``
            environment variable.  Without a token step 6 is skipped.

    Returns:
        Dict with ``status``, ``baseline_generations``, ``detector_samples``,
        ``training_pairs``, ``eval_samples``, ``improvement`` (mean heuristic
        AI probability of the AI text minus that of the humanized variants,
        or ``None``) and ``export``.

    All JSON artefacts are written to ``OUTPUT_DIR`` and the volume is
    committed once, at the end of a successful run.
    """
    import gc

    import numpy as np
    import torch
    from transformers import (
        DiffusionGemmaForBlockDiffusion, AutoProcessor, AutoTokenizer,
        AutoModelForCausalLM, BitsAndBytesConfig,
    )

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    SCORING_MODEL = "gpt2-medium"

    def save_json(filename, payload, **dump_kwargs):
        """Write *payload* as pretty-printed UTF-8 JSON into OUTPUT_DIR."""
        with open(os.path.join(OUTPUT_DIR, filename), "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=False, **dump_kwargs)

    def load_scoring_model():
        """Load the GPT-2 scoring tokenizer and model in eval mode."""
        scorer_tokenizer = AutoTokenizer.from_pretrained(SCORING_MODEL, cache_dir=HF_CACHE_DIR)
        scorer_tokenizer.pad_token = scorer_tokenizer.eos_token
        scorer = AutoModelForCausalLM.from_pretrained(
            SCORING_MODEL, torch_dtype=torch.float16,
            device_map="auto", cache_dir=HF_CACHE_DIR)
        scorer.eval()
        return scorer_tokenizer, scorer

    def rounded_mean(values, ndigits):
        """Return the rounded mean of *values* as a float, or None if empty."""
        return round(float(np.mean(values)), ndigits) if values else None

    # Auth
    hf_token = hf_token or os.environ.get("HF_TOKEN")
    if hf_token:
        from huggingface_hub import login
        login(token=hf_token)
        log("HF authenticated")
    else:
        log("WARNING: HF_TOKEN not found — export will be skipped")

    experiment_config = {
        "timestamp": now_str(),
        "model_id": MODEL_ID,
        "gpu": "A100-80GB",
        "quantization": "4bit-nf4",
        "canvas_length": CANVAS_LENGTH,
    }
    experiment_log = {"config": experiment_config, "steps": {}}

    # ══════════════════════════════════════════════════════════════
    # STEP 1: Load DiffusionGemma 4-bit + Generate Baseline
    # ══════════════════════════════════════════════════════════════
    log("=" * 70)
    log("STEP 1: Load DiffusionGemma 4-bit + Generate Baseline")
    log("=" * 70)

    # Load processor; fall back to a tokenizer-only shim with the same
    # methods this function uses.
    try:
        processor = AutoProcessor.from_pretrained(MODEL_ID, cache_dir=HF_CACHE_DIR)
        log("Multimodal processor loaded")
    except Exception as e:
        log(f"AutoProcessor unavailable ({e}) — falling back to tokenizer")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=HF_CACHE_DIR)

        class TokenizerProcessor:
            def __init__(self, tok):
                self.tokenizer = tok

            def apply_chat_template(self, messages, tokenize=True, add_generation_prompt=True,
                                    return_dict=True, return_tensors="pt", **kwargs):
                return self.tokenizer.apply_chat_template(
                    messages, tokenize=tokenize, add_generation_prompt=add_generation_prompt,
                    return_dict=return_dict, return_tensors=return_tensors, **kwargs)

            def decode(self, *args, **kwargs):
                return self.tokenizer.decode(*args, **kwargs)

            def save_pretrained(self, path):
                self.tokenizer.save_pretrained(path)

        processor = TokenizerProcessor(tokenizer)
        log("Text-only processor ready")

    # Load 4-bit model
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    log("Loading DiffusionGemmaForBlockDiffusion (4-bit)...")
    model = DiffusionGemmaForBlockDiffusion.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        cache_dir=HF_CACHE_DIR,
    )
    model.eval()
    log(f"Model loaded. VRAM: {torch.cuda.memory_allocated() / 1e9:.1f} GB / "
        f"{torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    def encode_chat(messages):
        """Tokenize *messages* with the chat template onto the model device."""
        return processor.apply_chat_template(
            messages, tokenize=True, add_generation_prompt=True,
            return_dict=True, return_tensors="pt",
        ).to(model.device)

    def generate_from(inputs, **gen_kwargs):
        """Generate from *inputs* and decode only the newly produced tokens."""
        prompt_len = inputs["input_ids"].shape[-1]
        with torch.no_grad():
            output = model.generate(**inputs, **gen_kwargs)
        return processor.decode(output.sequences[0][prompt_len:], skip_special_tokens=True)

    # Baseline generation
    test_prompts = [
        "Write a 200-word blog post about the benefits of remote work.",
        "Explain quantum computing in simple terms, around 150 words.",
        "Write a professional email declining a job offer, about 100 words.",
        "Describe the causes of the French Revolution in 200 words.",
        "Write a product review for noise-cancelling headphones, 150 words.",
    ]

    log(f"\nGenerating baseline text ({len(test_prompts)} prompts)...")
    generations = []
    for i, prompt in enumerate(test_prompts):
        log(f" [{i+1}/{len(test_prompts)}] {prompt[:70]}...")
        generated_text = generate_from(
            encode_chat([{"role": "user", "content": prompt}]),
            max_new_tokens=512, max_denoising_steps=32, t_max=0.8, t_min=0.4,
        )
        generations.append({
            "prompt": prompt,
            "generated_text": generated_text,
            "word_count": len(generated_text.split()),
        })
        log(f" -> {len(generated_text.split())} words")

    save_json("baseline_generations.json", generations)

    experiment_log["steps"]["1_baseline"] = {
        "num_prompts": len(test_prompts),
        "total_words": sum(g["word_count"] for g in generations),
    }

    # ══════════════════════════════════════════════════════════════
    # STEP 2: Detector Tests
    # ══════════════════════════════════════════════════════════════
    log("\n" + "=" * 70)
    log("STEP 2: Detector Tests (GPT-2 based)")
    log("=" * 70)

    log(f"Loading scoring model: {SCORING_MODEL}")
    fd_tokenizer, fd_model = load_scoring_model()
    log(f"Scoring model loaded. VRAM: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

    detector_results = {}
    for i, gen in enumerate(generations):
        text = gen["generated_text"]
        log(f"\n Sample {i+1}/{len(generations)}: {gen['prompt'][:80]}... ({len(text.split())} words)")

        sample = {"prompt": gen["prompt"], "text_preview": text[:200] + "..."}

        # Perplexity
        try:
            ppl = compute_perplexity(text, fd_model, fd_tokenizer)
            sample["perplexity_gpt2"] = round(ppl, 2)
        except Exception as e:
            log(f" Perplexity failed: {e}")
            sample["perplexity_gpt2"] = None
            ppl = None

        # Burstiness
        try:
            burst = compute_burstiness(text, fd_model, fd_tokenizer)
            sample["burstiness"] = round(burst, 4)
        except Exception as e:
            log(f" Burstiness failed: {e}")
            sample["burstiness"] = None
            burst = None

        # Fast-DetectGPT
        try:
            fdgpt = compute_fast_detectgpt(text, fd_model, fd_tokenizer)
            sample["fast_detectgpt"] = fdgpt
        except Exception as e:
            sample["fast_detectgpt"] = {"error": str(e)}
            fdgpt = {}

        # Text statistics + heuristic
        stats = compute_text_statistics(text)
        sample["text_statistics"] = stats
        heuristic = compute_heuristic_detection(ppl, burst, stats)
        sample["heuristic"] = heuristic

        log(f" PPL: {ppl:.1f}" if ppl else " PPL: ERROR")
        log(f" sent_std={stats['sentence_length_std']:.1f} hapax={stats['hapax_ratio']:.3f} "
            f"FDGPT={fdgpt.get('score', '?')} Heur={heuristic['ai_probability']:.3f} ({heuristic['classification']})")

        detector_results[f"sample_{i}"] = sample

    # Summary.  `is not None` keeps legitimate 0.0 values, and the heuristic
    # counts follow the detector's own three-way label so that "Uncertain"
    # samples are not reported as human.
    samples = list(detector_results.values())
    ppls = [r["perplexity_gpt2"] for r in samples if r.get("perplexity_gpt2") is not None]
    bursts = [r["burstiness"] for r in samples if r.get("burstiness") is not None]
    fdgpt_scores = [r["fast_detectgpt"]["score"] for r in samples
                    if "score" in r.get("fast_detectgpt", {})]
    heuristics = [r["heuristic"] for r in samples if r.get("heuristic")]
    heur_probs = [h["ai_probability"] for h in heuristics]
    heur_labels = [h["classification"] for h in heuristics]

    summary = {
        "num_samples": len(generations),
        "perplexity": {"mean": rounded_mean(ppls, 2),
                       "std": round(float(np.std(ppls)), 2)} if ppls else None,
        "burstiness": {"mean": rounded_mean(bursts, 4)} if bursts else None,
        "fast_detectgpt": {
            "mean_score": rounded_mean(fdgpt_scores, 4),
            "num_scored": len(fdgpt_scores),
            "ai_detected": sum(1 for s in fdgpt_scores if s > 0.5),
            "human_detected": sum(1 for s in fdgpt_scores if s <= 0.5),
        },
        "heuristic": {
            "mean_ai_prob": rounded_mean(heur_probs, 4),
            "ai_classified": heur_labels.count("AI"),
            "human_classified": heur_labels.count("Human"),
            "uncertain_classified": heur_labels.count("Uncertain"),
        },
    }

    log(f"\n Perplexity: mu={summary['perplexity']['mean']}" if summary['perplexity'] else " Perplexity: N/A")
    log(f" Fast-DetectGPT: {summary['fast_detectgpt']['ai_detected']}/{len(generations)} AI detected")
    log(f" Heuristic: {summary['heuristic']['ai_classified']}/{len(generations)} AI classified")
    log(f"\n >> DIFFUSION MODEL BASELINE: {summary['heuristic']['human_classified']}/{len(generations)} classified HUMAN <<")

    save_json("detector_results_before.json", {"summary": summary, "per_sample": detector_results})

    experiment_log["steps"]["2_detectors_before"] = summary

    # Free scoring model
    del fd_model, fd_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

    # ══════════════════════════════════════════════════════════════
    # STEP 3: Build Dataset
    # ══════════════════════════════════════════════════════════════
    log("\n" + "=" * 70)
    log("STEP 3: Build Humanization Dataset")
    log("=" * 70)

    # HC3 is broken (dataset scripts not supported in newer `datasets`)
    # Use synthetic pairs from baseline generations
    log("HC3 unavailable (dataset scripts deprecated) — using synthetic pairs")

    training_pairs = []
    for gen in generations:
        ai_text = gen["generated_text"]
        for _ in range(8):
            modified = apply_humanization_transforms(ai_text, num_ops=random.randint(3, 6))
            if modified != ai_text and len(modified) > 80:
                training_pairs.append({"input": ai_text, "target": modified, "source": "synthetic"})

    log(f" -> {len(training_pairs)} synthetic training pairs")

    # System prompt for humanization
    SYSTEM_PROMPT = (
        "Rewrite the following AI-generated text to sound completely human-written. "
        "Add natural variations in sentence structure, mix short and long sentences, "
        "use occasional informal phrasing, include slight imperfections like a real person would. "
        "Preserve all factual content and the original meaning."
    )

    formatted_data = [
        {
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": pair["input"][:1500]},
                {"role": "assistant", "content": pair["target"][:1500]},
            ],
            "source": pair["source"],
        }
        for pair in training_pairs
    ]

    save_json("training_data.json", formatted_data)

    experiment_log["steps"]["3_dataset"] = {
        "synthetic_pairs": len(training_pairs),
        "hc3_pairs": 0,
        "note": "HC3 unavailable — dataset scripts deprecated in newer `datasets` lib",
    }

    # ══════════════════════════════════════════════════════════════
    # STEP 4: Fine-Tuning — SKIPPED
    # ══════════════════════════════════════════════════════════════
    baseline_note = (
        f"Baseline heuristic: {summary['heuristic']['ai_classified']}/{summary['num_samples']} "
        f"AI-classified, Fast-DetectGPT: {summary['fast_detectgpt']['ai_detected']}/"
        f"{summary['fast_detectgpt']['num_scored']} flagged"
    )
    log("\n" + "=" * 70)
    log("STEP 4: Fine-Tuning — SKIPPED")
    log("=" * 70)
    log("PEFT/LoRA incompatible with DiffusionGemmaForBlockDiffusion:")
    log(" - Gemma4ClippableLinear not recognized by PEFT")
    log(" - Model lacks prepare_inputs_for_generation method")
    log(" - Model release is recent — tooling not yet mature")
    # Report what was measured instead of a hard-coded detection rate.
    log(f"{baseline_note} — proceeding without fine-tuning.")

    experiment_log["steps"]["4_training"] = {
        "status": "skipped",
        "reason": "PEFT incompatible with DiffusionGemmaForBlockDiffusion",
        "note": baseline_note,
    }

    # ══════════════════════════════════════════════════════════════
    # STEP 5: Humanization Evaluation
    # ══════════════════════════════════════════════════════════════
    log("\n" + "=" * 70)
    log("STEP 5: Humanization via Prompt Engineering + decoder_input_ids")
    log("=" * 70)

    log("Reloading scoring model for evaluation...")
    fd_tokenizer_ev, fd_model_ev = load_scoring_model()

    model.eval()

    def score_variant(vtext):
        """Run every detector on *vtext* and return the flat score record."""
        ppl_v = compute_perplexity(vtext, fd_model_ev, fd_tokenizer_ev)
        burst_v = compute_burstiness(vtext, fd_model_ev, fd_tokenizer_ev)
        fdgpt_v = compute_fast_detectgpt(vtext, fd_model_ev, fd_tokenizer_ev)
        stats_v = compute_text_statistics(vtext)
        heur_v = compute_heuristic_detection(ppl_v, burst_v, stats_v)
        return {
            "perplexity": round(ppl_v, 2),
            "burstiness": round(burst_v, 4),
            "fast_detectgpt_score": fdgpt_v["score"],
            "fast_detectgpt_class": fdgpt_v["classification"],
            "heuristic_ai_prob": heur_v["ai_probability"],
            "heuristic_class": heur_v["classification"],
            "word_count": len(vtext.split()),
            "text_preview": vtext[:300] + "...",
        }

    eval_prompts = test_prompts[:3]
    eval_results = []
    improvement = None

    for i, prompt in enumerate(eval_prompts):
        log(f"\n [{i+1}/{len(eval_prompts)}] Evaluating: {prompt[:70]}...")

        # Phase A: Generate standard AI text
        ai_text = generate_from(
            encode_chat([{"role": "user", "content": prompt}]),
            max_new_tokens=512, max_denoising_steps=32, t_max=0.8, t_min=0.4,
        )
        log(f" AI text: {len(ai_text.split())} words")

        # Phase B: Humanize via decoder_input_ids (start denoising from AI text)
        ai_tokens_raw = processor.tokenizer(ai_text, max_length=CANVAS_LENGTH,
                                            truncation=True, padding="max_length",
                                            return_tensors="pt")
        canvas = ai_tokens_raw["input_ids"].to(model.device)
        inputs_h = encode_chat([
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": ai_text[:1500]},
        ])

        variants = {"ai_original": ai_text}

        # Standard humanization
        variants["humanized_std"] = generate_from(
            inputs_h, decoder_input_ids=canvas,
            max_new_tokens=CANVAS_LENGTH, max_denoising_steps=24, t_max=0.8, t_min=0.4,
        )

        # Aggressive humanization
        variants["humanized_aggressive"] = generate_from(
            inputs_h, decoder_input_ids=canvas,
            max_new_tokens=CANVAS_LENGTH, max_denoising_steps=36, t_max=1.0, t_min=0.3,
        )

        # From scratch
        variants["humanized_from_scratch"] = generate_from(
            inputs_h,
            max_new_tokens=CANVAS_LENGTH, max_denoising_steps=48, t_max=0.8, t_min=0.4,
        )

        # Score all variants.  A scoring failure is recorded for that variant
        # only, so it cannot discard the generations already produced.
        variant_scores = {}
        for vname, vtext in variants.items():
            if not vtext.strip():
                variant_scores[vname] = {"error": "empty text"}
                continue
            try:
                scores = score_variant(vtext)
            except Exception as e:
                log(f" {vname}: scoring failed: {e}")
                variant_scores[vname] = {"error": str(e)}
                continue
            variant_scores[vname] = scores
            log(f" {vname}: PPL={scores['perplexity']:.1f} FDGPT={scores['fast_detectgpt_score']:.3f} "
                f"Heur={scores['heuristic_ai_prob']:.3f} ({scores['heuristic_class']})")

        eval_results.append({"prompt": prompt, "variants": variant_scores})

    # Save evaluation
    save_json("evaluation_results.json", eval_results)
    log("\nEvaluation results saved")

    # Compute improvement.  Variants without a score (empty text, scoring
    # error) are skipped; defaulting them to 0 would skew the AI-side mean.
    ai_scores = []
    humanized_scores = []
    for r_item in eval_results:
        scored = r_item["variants"]
        ai_prob = scored.get("ai_original", {}).get("heuristic_ai_prob")
        if ai_prob is not None:
            ai_scores.append(ai_prob)
        for vkey in ["humanized_std", "humanized_aggressive", "humanized_from_scratch"]:
            prob = scored.get(vkey, {}).get("heuristic_ai_prob")
            if prob is not None:
                humanized_scores.append(prob)

    if ai_scores and humanized_scores:
        improvement = float(np.mean(ai_scores) - np.mean(humanized_scores))
    if improvement is not None:
        log(f" AI mean heuristic: {np.mean(ai_scores):.3f}")
        log(f" Humanized mean heuristic: {np.mean(humanized_scores):.3f}")
        log(f" Improvement: {improvement:+.3f}")

    experiment_log["steps"]["5_evaluation"] = {
        "num_eval_prompts": len(eval_prompts),
        "ai_mean_heuristic": rounded_mean(ai_scores, 4),
        "humanized_mean_heuristic": rounded_mean(humanized_scores, 4),
        # `is not None`: an improvement of exactly 0.0 is a result, not a gap.
        "improvement": round(improvement, 4) if improvement is not None else None,
    }

    # Free scoring model
    del fd_model_ev, fd_tokenizer_ev
    gc.collect()
    torch.cuda.empty_cache()

    # ══════════════════════════════════════════════════════════════
    # STEP 6: Export to Hugging Face
    # ══════════════════════════════════════════════════════════════
    log("\n" + "=" * 70)
    log("STEP 6: Export to Hugging Face")
    log("=" * 70)

    # Save experiment log (steps 1-5; rewritten with step 6 after the export)
    save_json("experiment_log.json", experiment_log, default=str)

    export_result = {"status": "skipped", "reason": "No HF_TOKEN"}

    if hf_token:
        from huggingface_hub import HfApi, create_repo, upload_folder

        REPO_ID = "simonlesaumon/diffusiongemma-humanizer"
        api = HfApi()
        failures = []

        log(f"Creating/verifying repo: {REPO_ID}")
        try:
            create_repo(REPO_ID, repo_type="model", exist_ok=True, token=hf_token)
            log(" Repo ready")
        except Exception as e:
            log(f" Repo creation note: {e}")
            failures.append(f"create_repo: {e}")

        upload_files = [
            "baseline_generations.json",
            "detector_results_before.json",
            "evaluation_results.json",
            "training_data.json",
            "experiment_log.json",
        ]

        # Upload the result files in one commit, restricted to this run's
        # files.  Per-file uploads are only a fallback, so nothing is sent
        # twice.
        log("Uploading result files...")
        try:
            upload_folder(
                folder_path=OUTPUT_DIR,
                repo_id=REPO_ID,
                repo_type="model",
                token=hf_token,
                path_in_repo="",
                allow_patterns=upload_files,
            )
        except Exception as e:
            log(f" Folder upload note: {e}")
            for fname in upload_files:
                fpath = os.path.join(OUTPUT_DIR, fname)
                if not os.path.exists(fpath):
                    continue
                log(f"Uploading {fname}...")
                try:
                    api.upload_file(
                        path_or_fileobj=fpath, path_in_repo=fname,
                        repo_id=REPO_ID, repo_type="model", token=hf_token)
                except Exception as file_error:
                    log(f" Upload failed: {file_error}")
                    failures.append(f"{fname}: {file_error}")

        # Model card.  The findings section reports the counts measured in
        # this run rather than a fixed detection rate.
        heur = summary["heuristic"]
        fd = summary["fast_detectgpt"]
        n_samples = summary["num_samples"]
        model_card = f"""---
license: apache-2.0
base_model: google/diffusiongemma-26B-A4B-it
tags:
- diffusion
- text-humanization
- ai-detection-evasion
- diffusion-gemma
- block-diffusion
pipeline_tag: text-generation
language: en
---

# DiffusionGemma Humanizer

**DiffusionGemma 26B** (MoE, 3.8B active) evaluated for AI text humanization. Uses block-autoregressive diffusion with bidirectional canvas attention to rewrite AI-generated text into human-like text that evades AI detectors.

## Key Finding

On {n_samples} baseline generations, Fast-DetectGPT flagged {fd['ai_detected']} of {fd['num_scored']} scored samples as AI, and the heuristic ensemble (perplexity + burstiness + stylometric markers) classified {heur['ai_classified']} as AI, {heur['uncertain_classified']} as uncertain and {heur['human_classified']} as human.

These figures come from a small sample and uncalibrated detectors; treat them as preliminary. The experiment probes the hypothesis from Tarim & Onan (2025) that diffusion-generated text naturally resists autoregressive-trained detectors.

## Experiment

- **Model:** google/diffusiongemma-26B-A4B-it (Apache 2.0, 4-bit NF4)
- **GPU:** Single A100 80GB on Modal
- **Date:** {experiment_config['timestamp']}
- **Training pairs:** {len(training_pairs)}
- **Baseline detection:** {heur['ai_classified']}/{n_samples} AI classified (heuristic ensemble)
- **Humanization method:** Prompt engineering + decoder_input_ids (iterative denoising from AI text)

## Usage

```python
from transformers import DiffusionGemmaForBlockDiffusion, AutoProcessor, BitsAndBytesConfig
import torch

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
model = DiffusionGemmaForBlockDiffusion.from_pretrained(
    "google/diffusiongemma-26B-A4B-it",
    quantization_config=bnb_config,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained("google/diffusiongemma-26B-A4B-it")

ai_text = "AI-generated text to humanize..."
messages = [
    {{"role": "system", "content": "Rewrite to sound human-written."}},
    {{"role": "user", "content": ai_text}},
]
inputs = processor.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True,
    return_dict=True, return_tensors="pt",
).to(model.device)

ai_tokens = processor.tokenizer(
    ai_text, max_length=256, truncation=True,
    padding="max_length", return_tensors="pt",
)

output = model.generate(
    **inputs,
    decoder_input_ids=ai_tokens["input_ids"].to(model.device),
    max_new_tokens=512,
    max_denoising_steps=24,
    t_max=0.8,
    t_min=0.4,
)
humanized = processor.decode(output.sequences[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
```

## Architecture

DiffusionGemma uses block-autoregressive diffusion:
- Encoder processes prompt -> KV cache
- Decoder uses bidirectional attention on 256-token canvases
- Entropy-Bounded Denoising progressively refines text (1-48 steps)
- Starting canvas can be set via `decoder_input_ids` for iterative refinement

## License

Apache 2.0 (matching the base model)
"""

        try:
            api.upload_file(
                path_or_fileobj=model_card.encode(),
                path_in_repo="README.md",
                repo_id=REPO_ID, repo_type="model", token=hf_token)
            log(" Model card uploaded")
        except Exception as e:
            log(f" Model card upload failed: {e}")
            failures.append(f"README.md: {e}")

        repo_url = f"https://huggingface.co/{REPO_ID}"
        if failures:
            # Do not report success when part of the export was lost.
            log(f"\n!! Export finished with {len(failures)} failure(s): {repo_url}")
            export_result = {"status": "partial", "repo_url": repo_url, "failures": failures}
        else:
            log(f"\n!! Export complete! {repo_url}")
            export_result = {"status": "success", "repo_url": repo_url}

    experiment_log["steps"]["6_export"] = export_result
    # Rewrite the log so the copy kept on the volume includes step 6.
    save_json("experiment_log.json", experiment_log, default=str)

    # ══════════════════════════════════════════════════════════════
    # DONE
    # ══════════════════════════════════════════════════════════════
    log("\n" + "=" * 70)
    log("PIPELINE COMPLETE")
    log("=" * 70)
    log(f" Baseline: {len(generations)} generations, {summary['heuristic']['human_classified']}/{len(generations)} human-classified")
    log(f" Training pairs: {len(training_pairs)}")
    log(f" Eval prompts: {len(eval_results)}")
    if improvement is not None:
        log(f" Heuristic improvement: {improvement:+.3f}")
    log(f" HF export: {export_result['status']}")
    log(f" All results: {OUTPUT_DIR}/")
    log("=" * 70)

    volume.commit()

    return {
        "status": "completed",
        "baseline_generations": len(generations),
        "detector_samples": len(detector_results),
        "training_pairs": len(training_pairs),
        "eval_samples": len(eval_results),
        "improvement": improvement,
        "export": export_result,
    }


# ═══════════════════════════════════════════════════════════════════
# ENTRYPOINT
# ═══════════════════════════════════════════════════════════════════

@app.local_entrypoint()
def main(hf_token: str = None):
    """Launch pipeline on Modal. Token from --hf-token= or HF_TOKEN env or secret."""
    hf_token = hf_token or os.environ.get("HF_TOKEN")
    if not hf_token:
        # The remote function still reads HF_TOKEN from its own environment,
        # which the "hf-secrets" Modal secret may populate.
        log("WARNING: No local HF_TOKEN — export is skipped unless the 'hf-secrets' Modal secret provides one")

    result = run_full_pipeline.remote(hf_token=hf_token)
    print("\nPipeline result:", json.dumps(result, indent=2, default=str))