""" CoPA (Contrastive Paraphrase Attack) — training-free evasion prototype. Based on Fang et al., EMNLP 2025: "Your Language Model Can Secretly Write Like Humans: Contrastive Paraphrase Attacks on LLM-Generated Text Detectors." Contrastive decoding formula: P_final = (1+lambda) * P_human_style - lambda * P_machine_style Only the inference-time contrastive decoding is implemented here. No training required — runs on Modal T4 (~$0.60/h). """ from __future__ import annotations import argparse import json import os import time from dataclasses import dataclass, field from typing import Any, TYPE_CHECKING if TYPE_CHECKING: import torch from transformers import AutoModelForCausalLM, AutoTokenizer # --------------------------------------------------------------------------- # Data structures # --------------------------------------------------------------------------- @dataclass class CopaConfig: """CoPA decoding configuration — tuned for token dispersion maximization.""" # Model: Instruct-tuned (best quality) + CoPA dispersion compensates for detection model_name: str = "Qwen/Qwen2.5-1.5B-Instruct" # Contrastive decoding lambda_contrast: float = 0.5 # CoPA original — best quality balance alpha_truncation: float = 1e-5 # adaptive truncation threshold # Generation max_new_tokens: int = 768 # was 256 — prevents cutoff temperature: float = 1.0 # sampling temperature top_p: float = 0.92 # nucleus sampling for diversity repetition_penalty: float = 1.15 # penalize repeated n-grams diversity_bonus_strength: float = 0.5 # penalty for recently used tokens # Prompts: simple style transfer (post-processing strips any artifacts) human_style_prompt: str = ( "Rewrite this to sound like a natural human wrote it, " "with varied sentences and conversational wording:\n\n{input_text}" ) machine_style_prompt: str = ( "Repeat the following text exactly, word for word, " "maintaining the original formal structure:\n\n{input_text}" ) device: str = "cuda" @dataclass class CopaResult: """Single CoPA rewriting result.""" original_text: str rewritten_text: str tokens_generated: int time_seconds: float contrast_strength: float # --------------------------------------------------------------------------- # Model loading # --------------------------------------------------------------------------- def _lazy_import_torch(): """Lazy import torch — only when actually running inference (Modal GPU).""" import torch # noqa: F811 from transformers import AutoModelForCausalLM, AutoTokenizer # noqa: F811 return torch, AutoModelForCausalLM, AutoTokenizer def load_model(config: CopaConfig): """Load model and tokenizer once for both scoring and generation.""" torch, AutoModelForCausalLM, AutoTokenizer = _lazy_import_torch() tokenizer = AutoTokenizer.from_pretrained( config.model_name, trust_remote_code=True ) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token model = AutoModelForCausalLM.from_pretrained( config.model_name, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True, ) model.eval() return model, tokenizer # --------------------------------------------------------------------------- # Output cleaning # --------------------------------------------------------------------------- def _clean_output(text: str, original: str) -> str: """Remove template artifacts, repeated prompts, and truncated sentences. Common artifacts from CoPA/LLM generation: - Repeated few-shot templates ("Text: ...", "Human version:") - Instruction echoes ("### Informal Natural Language Rewritten:") - Trailing mid-word cutoffs """ import re # Cut at common template repetition patterns cut_patterns = [ r"\n\s*Text:\s", # Few-shot template repetition r"\n\s*Human version:", # Few-shot output label r"\n\s*Formal academic", # Machine-style prompt leak r"\n\s*Formal explanation", r"###\s", # Markdown headings (meta-artifacts) r"\n\s*You are an AI", # System prompt leak r"\n\s*Here is a more", # Prompt repetition r"\n\s*Rewrite the", # Instruction echo ] for pattern in cut_patterns: m = re.search(pattern, text) if m: text = text[: m.start()].strip() break # Remove trailing incomplete sentence (no ending punctuation) text = text.rstrip() if text and text[-1] not in '.!?":' "'" ')' ']': # Find last complete sentence last_period = max(text.rfind('.'), text.rfind('!'), text.rfind('?')) if last_period > len(text) * 0.6: # Only if we have enough content text = text[: last_period + 1] return text.strip() # --------------------------------------------------------------------------- # CoPA contrastive decoding # --------------------------------------------------------------------------- def copa_rewrite( text: str, model, tokenizer, config: CopaConfig, ) -> CopaResult: """Rewrite `text` using contrastive decoding. Algorithm (from CoPA paper, Algorithm 1): 1. Build human-style prompt (few-shot) and machine-style prompt (academic). 2. For each token position t: a. Compute P_h = model(x_h + y_ 0: contrastive_logits[gid] /= rep_penalty else: contrastive_logits[gid] *= rep_penalty # --- Adaptive truncation (keep tokens with P_h >= alpha * max(P_h)) --- h_probs = torch.softmax(h_logits, dim=-1) max_prob = h_probs.max() mask = h_probs >= alpha * max_prob contrastive_logits[~mask] = float("-inf") # --- Top-p (nucleus) filtering --- if top_p < 1.0: sorted_logits, sorted_indices = torch.sort(contrastive_logits, descending=True) cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1) sorted_indices_to_remove = cumulative_probs > top_p sorted_indices_to_remove[0] = False # keep at least one token indices_to_remove = sorted_indices[sorted_indices_to_remove] contrastive_logits[indices_to_remove] = float("-inf") # --- Diversity bonus: penalize tokens used in last 20 positions --- if div_strength > 0 and len(generated_ids) >= 3: recent_window = generated_ids[-20:] for gid in set(recent_window): contrastive_logits[gid] -= div_strength * recent_window.count(gid) # --- Sample --- probs = torch.softmax(contrastive_logits, dim=-1) next_token_id = torch.multinomial(probs, num_samples=1).item() generated_ids.append(next_token_id) # --- Append to both contexts --- h_inputs["input_ids"] = torch.cat( [h_inputs["input_ids"], torch.tensor([[next_token_id]], device=model.device)], dim=1 ) h_inputs["attention_mask"] = torch.ones_like(h_inputs["input_ids"]) m_inputs["input_ids"] = torch.cat( [m_inputs["input_ids"], torch.tensor([[next_token_id]], device=model.device)], dim=1 ) m_inputs["attention_mask"] = torch.ones_like(m_inputs["input_ids"]) # --- Stop conditions --- if next_token_id == tokenizer.eos_token_id: break rewritten = tokenizer.decode( generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True, ) # Post-processing: strip repeated template artifacts rewritten = _clean_output(rewritten, text) elapsed = time.time() - start_time return CopaResult( original_text=text, rewritten_text=rewritten, tokens_generated=len(generated_ids), time_seconds=elapsed, contrast_strength=lambda_, ) # --------------------------------------------------------------------------- # Batch processing # --------------------------------------------------------------------------- @dataclass class CopaBatchResult: results: list[CopaResult] = field(default_factory=list) total_time: float = 0.0 total_tokens: int = 0 avg_tokens_per_second: float = 0.0 def run_copa_batch( texts: list[str], model: AutoModelForCausalLM, tokenizer: AutoTokenizer, config: CopaConfig, ) -> CopaBatchResult: """Run CoPA rewriting on a batch of texts.""" batch = CopaBatchResult() start = time.time() for i, text in enumerate(texts): print(f"[CoPA] {i+1}/{len(texts)}: rewriting {len(text.split())} words...") try: result = copa_rewrite(text, model, tokenizer, config) batch.results.append(result) batch.total_tokens += result.tokens_generated except Exception as e: print(f"[CoPA] ERROR on sample {i}: {e}") batch.results.append(CopaResult( original_text=text, rewritten_text=text, # fallback to original tokens_generated=0, time_seconds=0, contrast_strength=config.lambda_contrast, )) batch.total_time = time.time() - start if batch.total_time > 0: batch.avg_tokens_per_second = batch.total_tokens / batch.total_time return batch # --------------------------------------------------------------------------- # Test data generation # --------------------------------------------------------------------------- def generate_test_texts(n: int = 50) -> list[str]: """Generate synthetic AI-like texts for testing. In production, replace with real AI-generated texts from HC3 or similar. """ templates = [ "Artificial intelligence has revolutionized the field of natural language processing in recent years. The development of large language models has enabled unprecedented capabilities in text generation, translation, and summarization tasks.", "Climate change represents one of the most significant challenges facing humanity in the twenty-first century. Rising global temperatures have led to increasingly severe weather events, sea level rise, and disruptions to ecosystems worldwide.", "The history of computer science can be traced back to the early twentieth century, with the foundational work of Alan Turing and others. Their theoretical contributions laid the groundwork for the digital revolution that followed.", "Machine learning algorithms have demonstrated remarkable success across a wide range of applications, from image recognition to natural language understanding. These systems learn patterns from large datasets.", "The Renaissance period marked a profound transformation in European art, science, and philosophy. This cultural movement began in Italy during the fourteenth century and spread throughout the continent.", ] # Repeat/cycle to reach n result = [] for i in range(n): result.append(templates[i % len(templates)]) return result # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def main(): parser = argparse.ArgumentParser(description="CoPA: Contrastive Paraphrase Attack") parser.add_argument("--model", default="Qwen/Qwen2.5-1.5B-Instruct") parser.add_argument("--lambda", type=float, default=0.5, dest="lambda_contrast") parser.add_argument("--alpha", type=float, default=1e-5, dest="alpha_truncation") parser.add_argument("--max-tokens", type=int, default=256) parser.add_argument("--temperature", type=float, default=1.0) parser.add_argument("--num-samples", type=int, default=50) parser.add_argument("--output", default="output/copa_results.json") parser.add_argument("--device", default="cuda") args = parser.parse_args() config = CopaConfig( model_name=args.model, lambda_contrast=args.lambda_contrast, alpha_truncation=args.alpha_truncation, max_new_tokens=args.max_tokens, temperature=args.temperature, device=args.device, ) print(f"[CoPA] Loading model: {config.model_name}") model, tokenizer = load_model(config) print(f"[CoPA] Generating {args.num_samples} test texts...") test_texts = generate_test_texts(args.num_samples) print(f"[CoPA] Running contrastive rewriting...") batch_result = run_copa_batch(test_texts, model, tokenizer, config) # Save results os.makedirs(os.path.dirname(args.output), exist_ok=True) output_data = { "config": { "model": config.model_name, "lambda": config.lambda_contrast, "alpha": config.alpha_truncation, }, "summary": { "num_samples": len(batch_result.results), "total_time_s": batch_result.total_time, "total_tokens": batch_result.total_tokens, "avg_tokens_per_second": batch_result.avg_tokens_per_second, }, "results": [ { "original": r.original_text, "rewritten": r.rewritten_text, "tokens": r.tokens_generated, "time_s": r.time_seconds, } for r in batch_result.results ], } with open(args.output, "w", encoding="utf-8") as f: json.dump(output_data, f, indent=2, ensure_ascii=False) print(f"[CoPA] Done. {len(batch_result.results)} samples in {batch_result.total_time:.1f}s") print(f"[CoPA] Saved to {args.output}") if __name__ == "__main__": main()