| """ |
| CoPA (Contrastive Paraphrase Attack) — training-free evasion prototype. |
| |
| Based on Fang et al., EMNLP 2025: "Your Language Model Can Secretly |
| Write Like Humans: Contrastive Paraphrase Attacks on LLM-Generated |
| Text Detectors." |
| |
| Contrastive decoding formula (combined in log-probability space, then renormalized): |
| P_final = softmax((1+lambda) * log P_human_style - lambda * log P_machine_style) |
| |
| Only the inference-time contrastive decoding is implemented here. |
| No training required — runs on Modal T4 (~$0.60/h). |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import os |
| import time |
| from dataclasses import dataclass, field |
| from typing import Any, TYPE_CHECKING |
|
|
| if TYPE_CHECKING: |
| import torch |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
|
|
| |
| |
| |
|
|
@dataclass
class CopaConfig:
    """Settings for CoPA contrastive decoding.

    Fields are grouped as: model selection, contrastive parameters,
    sampling parameters, prompt templates, and target device.

    Raises:
        ValueError: if ``temperature`` or ``repetition_penalty`` is not
            strictly positive, or ``alpha_truncation`` lies outside [0, 1].
            These values would otherwise only fail (NaN / empty candidate
            set) deep inside the sampling loop.
    """

    # Hugging Face model id; the same model scores both prompts.
    model_name: str = "Qwen/Qwen2.5-1.5B-Instruct"

    # Contrast weight: score = (1 + lambda) * log P_human - lambda * log P_machine.
    lambda_contrast: float = 0.5
    # Plausibility cutoff: tokens whose human-prompt probability is below
    # alpha * max(P_human) are excluded from sampling.
    alpha_truncation: float = 1e-5

    # Upper bound on the number of sampled tokens per rewrite.
    max_new_tokens: int = 768
    # Divisor applied to both prompts' logits before they are contrasted.
    temperature: float = 1.0
    # Nucleus cutoff on the contrastive distribution; values >= 1.0 disable it.
    top_p: float = 0.92
    # Penalty applied to every token id that was already generated.
    repetition_penalty: float = 1.15
    # Subtracted from a token's score once per occurrence among the most
    # recently generated tokens.
    diversity_bonus_strength: float = 0.5

    # Prompt templates; the literal "{input_text}" is replaced by the input.
    human_style_prompt: str = (
        "Rewrite this to sound like a natural human wrote it, "
        "with varied sentences and conversational wording:\n\n{input_text}"
    )
    machine_style_prompt: str = (
        "Repeat the following text exactly, word for word, "
        "maintaining the original formal structure:\n\n{input_text}"
    )

    # Requested device string, e.g. "cuda" or "cpu".
    device: str = "cuda"

    def __post_init__(self) -> None:
        """Reject parameter values that cannot produce a valid distribution."""
        # `not (x > 0)` also rejects NaN, which `x <= 0` would let through.
        if not (self.temperature > 0):
            raise ValueError(
                f"temperature must be > 0, got {self.temperature!r}"
            )
        if not (self.repetition_penalty > 0):
            raise ValueError(
                f"repetition_penalty must be > 0, got {self.repetition_penalty!r}"
            )
        if not (0.0 <= self.alpha_truncation <= 1.0):
            raise ValueError(
                f"alpha_truncation must be in [0, 1], got {self.alpha_truncation!r}"
            )
|
|
|
|
@dataclass
class CopaResult:
    """Outcome of rewriting one input text with CoPA.

    Attributes:
        original_text: The text that was submitted for rewriting.
        rewritten_text: The cleaned-up rewrite.
        tokens_generated: Number of token ids that were sampled.
        time_seconds: Wall-clock duration of the rewrite, in seconds.
        contrast_strength: The lambda contrast weight used for this rewrite.
    """

    original_text: str
    rewritten_text: str
    tokens_generated: int
    time_seconds: float
    contrast_strength: float
|
|
|
|
| |
| |
| |
|
|
def _lazy_import_torch():
    """Import the heavy ML dependencies on first use.

    The imports live inside this function so that importing the module
    itself does not require torch or transformers to be installed.

    Returns:
        A 3-tuple ``(torch, AutoModelForCausalLM, AutoTokenizer)``.
    """
    import torch as torch_module
    from transformers import AutoModelForCausalLM as causal_lm_cls
    from transformers import AutoTokenizer as tokenizer_cls

    return torch_module, causal_lm_cls, tokenizer_cls
|
|
|
|
def load_model(config: CopaConfig):
    """Load the causal LM and tokenizer named by ``config.model_name``.

    Device handling:
        * ``config.device == "cuda"`` (the default) keeps automatic
          placement (``device_map="auto"``) in float16.
        * Any other value (e.g. ``"cpu"``, ``"cuda:1"``) is passed through
          as the device map so an explicit request is honoured.
        * On ``"cpu"`` the weights are loaded in float32, since half
          precision on CPU is slow and not supported by every operator.

    Args:
        config: Decoding configuration; ``model_name`` and ``device`` are read.

    Returns:
        ``(model, tokenizer)`` with the model switched to eval mode.
    """
    torch, AutoModelForCausalLM, AutoTokenizer = _lazy_import_torch()

    # SECURITY NOTE: trust_remote_code=True executes Python code shipped with
    # the checkpoint. Only point model_name at repositories you trust.
    tokenizer = AutoTokenizer.from_pretrained(
        config.model_name, trust_remote_code=True
    )
    # Some causal-LM tokenizers ship without a pad token; reuse EOS.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    on_cpu = config.device == "cpu"
    device_map = "auto" if config.device == "cuda" else config.device

    model = AutoModelForCausalLM.from_pretrained(
        config.model_name,
        torch_dtype=torch.float32 if on_cpu else torch.float16,
        device_map=device_map,
        trust_remote_code=True,
    )
    model.eval()
    return model, tokenizer
|
|
|
|
| |
| |
| |
|
|
| def _clean_output(text: str, original: str) -> str: |
| """Remove template artifacts, repeated prompts, and truncated sentences. |
| |
| Common artifacts from CoPA/LLM generation: |
| - Repeated few-shot templates ("Text: ...", "Human version:") |
| - Instruction echoes ("### Informal Natural Language Rewritten:") |
| - Trailing mid-word cutoffs |
| """ |
| import re |
|
|
| |
| cut_patterns = [ |
| r"\n\s*Text:\s", |
| r"\n\s*Human version:", |
| r"\n\s*Formal academic", |
| r"\n\s*Formal explanation", |
| r"###\s", |
| r"\n\s*You are an AI", |
| r"\n\s*Here is a more", |
| r"\n\s*Rewrite the", |
| ] |
| for pattern in cut_patterns: |
| m = re.search(pattern, text) |
| if m: |
| text = text[: m.start()].strip() |
| break |
|
|
| |
| text = text.rstrip() |
| if text and text[-1] not in '.!?":' "'" ')' ']': |
| |
| last_period = max(text.rfind('.'), text.rfind('!'), text.rfind('?')) |
| if last_period > len(text) * 0.6: |
| text = text[: last_period + 1] |
|
|
| return text.strip() |
|
|
|
|
| |
| |
| |
|
|
def _apply_repetition_penalty(scores, seen_ids, penalty):
    """Lower, in place, the score of every token id that was already generated."""
    import torch

    if penalty == 1.0 or not seen_ids:
        return
    idx = torch.tensor(
        sorted(set(seen_ids)), dtype=torch.long, device=scores.device
    )
    picked = scores[idx]
    # Sign-aware: dividing a positive score and multiplying a negative one
    # both move the score downwards.
    scores[idx] = torch.where(picked > 0, picked / penalty, picked * penalty)


def _restrict_candidates(scores, human_logits, alpha, top_p):
    """Mask unlikely tokens to -inf in place: plausibility cutoff, then top-p."""
    import torch

    # Plausibility constraint: keep only tokens the human-style pass itself
    # considers at least alpha times as likely as its best token.
    human_probs = torch.softmax(human_logits, dim=-1)
    plausible = human_probs >= alpha * human_probs.max()
    scores[~plausible] = float("-inf")

    if top_p < 1.0:
        ranked_scores, ranked_ids = torch.sort(scores, descending=True)
        cumulative = torch.cumsum(torch.softmax(ranked_scores, dim=-1), dim=-1)
        drop = cumulative > top_p
        # Shift right by one so the token that crosses top_p stays in the
        # nucleus; without the shift the kept mass can fall far below top_p.
        drop[1:] = drop[:-1].clone()
        drop[0] = False
        scores[ranked_ids[drop]] = float("-inf")


def _apply_diversity_penalty(scores, seen_ids, strength, window=20):
    """Subtract ``strength`` per occurrence for tokens in the recent window, in place."""
    import torch
    from collections import Counter

    if strength <= 0 or len(seen_ids) < 3:
        return
    counts = Counter(seen_ids[-window:])
    idx = torch.tensor(list(counts.keys()), dtype=torch.long, device=scores.device)
    reps = torch.tensor(
        list(counts.values()), dtype=scores.dtype, device=scores.device
    )
    scores[idx] -= strength * reps


def copa_rewrite(
    text: str,
    model,
    tokenizer,
    config: CopaConfig,
) -> CopaResult:
    """Rewrite ``text`` token by token using contrastive decoding.

    The same model is run on two prompts built from ``config``: a
    human-style prompt and a machine-style prompt. For each new token:

      1. Take the last-position logits of both prompts (in float32) and
         divide them by ``config.temperature``.
      2. Combine them as
         ``(1 + lambda) * log P_human - lambda * log P_machine``.
      3. Apply the repetition penalty to every token already generated.
      4. Mask tokens failing the plausibility cutoff
         (``P_human < alpha * max P_human``), then apply top-p filtering.
      5. Apply the diversity penalty to tokens in the last 20 generated.
      6. Sample the next token from the softmax of the remaining scores.

    Generation stops at ``config.max_new_tokens`` or when the tokenizer's
    EOS token is sampled. Forward passes run without autograd and reuse a
    key/value cache per prompt, so each step only processes the new token.

    Args:
        text: Text to rewrite.
        model: Causal language model; called as ``model(**inputs)`` and
            expected to expose ``.device``, ``.logits`` and
            ``.past_key_values`` in the usual Hugging Face manner.
        tokenizer: Tokenizer matching ``model``.
        config: Decoding configuration.

    Returns:
        A ``CopaResult`` holding the cleaned rewrite, the number of sampled
        tokens (including a final EOS, if one was sampled), the elapsed
        wall-clock time and the contrast weight used.
    """
    torch, _, _ = _lazy_import_torch()
    start_time = time.time()

    # str.replace (not str.format) so braces inside `text` are harmless.
    human_prompt = config.human_style_prompt.replace("{input_text}", text)
    machine_prompt = config.machine_style_prompt.replace("{input_text}", text)

    h_inputs = tokenizer(human_prompt, return_tensors="pt").to(model.device)
    m_inputs = tokenizer(machine_prompt, return_tensors="pt").to(model.device)

    lambda_ = config.lambda_contrast
    temp = config.temperature
    eos_id = tokenizer.eos_token_id
    generated_ids: list[int] = []

    # The first forward pass sees the whole prompt; later passes only see the
    # newly sampled token plus the cached keys/values of everything before.
    h_kwargs, m_kwargs = dict(h_inputs), dict(m_inputs)
    h_mask = torch.ones_like(h_inputs["input_ids"])
    m_mask = torch.ones_like(m_inputs["input_ids"])
    h_cache = m_cache = None

    with torch.no_grad():  # inference only: do not build autograd graphs
        for _ in range(config.max_new_tokens):
            h_out = model(**h_kwargs, past_key_values=h_cache, use_cache=True)
            m_out = model(**m_kwargs, past_key_values=m_cache, use_cache=True)
            h_cache, m_cache = h_out.past_key_values, m_out.past_key_values

            # Upcast: half precision cannot resolve the small plausibility
            # threshold or the long cumulative sum used by top-p.
            h_logits = h_out.logits[0, -1, :].float() / temp
            m_logits = m_out.logits[0, -1, :].float() / temp

            scores = (1 + lambda_) * torch.log_softmax(
                h_logits, dim=-1
            ) - lambda_ * torch.log_softmax(m_logits, dim=-1)

            _apply_repetition_penalty(
                scores, generated_ids, config.repetition_penalty
            )
            _restrict_candidates(
                scores, h_logits, config.alpha_truncation, config.top_p
            )
            _apply_diversity_penalty(
                scores, generated_ids, config.diversity_bonus_strength
            )

            probs = torch.softmax(scores, dim=-1)
            next_token_id = torch.multinomial(probs, num_samples=1).item()
            generated_ids.append(next_token_id)

            if next_token_id == eos_id:
                break

            # Feed the sampled token to both contexts for the next step.
            new_token = torch.tensor([[next_token_id]], device=model.device)
            h_mask = torch.cat([h_mask, h_mask.new_ones((1, 1))], dim=1)
            m_mask = torch.cat([m_mask, m_mask.new_ones((1, 1))], dim=1)
            h_kwargs = {"input_ids": new_token, "attention_mask": h_mask}
            m_kwargs = {"input_ids": new_token, "attention_mask": m_mask}

    rewritten = tokenizer.decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    rewritten = _clean_output(rewritten, text)

    elapsed = time.time() - start_time
    return CopaResult(
        original_text=text,
        rewritten_text=rewritten,
        tokens_generated=len(generated_ids),
        time_seconds=elapsed,
        contrast_strength=lambda_,
    )
|
|
|
|
| |
| |
| |
|
|
@dataclass
class CopaBatchResult:
    """Aggregate outcome of a batch of CoPA rewrites."""

    # One entry per processed input text.
    results: list[CopaResult] = field(default_factory=list)
    # Wall-clock duration of the whole batch, in seconds.
    total_time: float = 0.0
    # Sum of the tokens generated across the batch.
    total_tokens: int = 0
    # Throughput: total_tokens divided by total_time.
    avg_tokens_per_second: float = 0.0
|
|
|
|
def run_copa_batch(
    texts: list[str],
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    config: CopaConfig,
) -> CopaBatchResult:
    """Rewrite every text in ``texts`` and collect timing/throughput totals.

    A sample whose rewrite raises is reported on stdout and recorded with
    its original text unchanged and zero generated tokens, so the batch
    always yields one result per input.

    Args:
        texts: Input texts to rewrite.
        model: Model handed through to ``copa_rewrite``.
        tokenizer: Tokenizer handed through to ``copa_rewrite``.
        config: Decoding configuration handed through to ``copa_rewrite``.

    Returns:
        A ``CopaBatchResult`` with per-sample results and batch totals.
    """
    summary = CopaBatchResult()
    began = time.time()

    for position, sample in enumerate(texts, start=1):
        word_count = len(sample.split())
        print(f"[CoPA] {position}/{len(texts)}: rewriting {word_count} words...")
        try:
            outcome = copa_rewrite(sample, model, tokenizer, config)
            summary.results.append(outcome)
            summary.total_tokens += outcome.tokens_generated
        except Exception as err:
            # Best effort: keep going and fall back to the untouched input.
            print(f"[CoPA] ERROR on sample {position - 1}: {err}")
            fallback = CopaResult(
                original_text=sample,
                rewritten_text=sample,
                tokens_generated=0,
                time_seconds=0,
                contrast_strength=config.lambda_contrast,
            )
            summary.results.append(fallback)

    elapsed = time.time() - began
    summary.total_time = elapsed
    # Guard against a zero-length interval before dividing.
    if elapsed > 0:
        summary.avg_tokens_per_second = summary.total_tokens / elapsed
    return summary
|
|
|
|
| |
| |
| |
|
|
def generate_test_texts(n: int = 50) -> list[str]:
    """Return ``n`` AI-sounding sample paragraphs for smoke testing.

    Five fixed paragraphs are repeated round-robin until ``n`` texts have
    been produced. For real evaluations, substitute genuine AI-generated
    texts (e.g. from HC3 or a similar corpus).
    """
    samples = (
        "Artificial intelligence has revolutionized the field of natural language processing in recent years. "
        "The development of large language models has enabled unprecedented capabilities in text generation, translation, and summarization tasks.",
        "Climate change represents one of the most significant challenges facing humanity in the twenty-first century. "
        "Rising global temperatures have led to increasingly severe weather events, sea level rise, and disruptions to ecosystems worldwide.",
        "The history of computer science can be traced back to the early twentieth century, with the foundational work of Alan Turing and others. "
        "Their theoretical contributions laid the groundwork for the digital revolution that followed.",
        "Machine learning algorithms have demonstrated remarkable success across a wide range of applications, from image recognition to natural language understanding. "
        "These systems learn patterns from large datasets.",
        "The Renaissance period marked a profound transformation in European art, science, and philosophy. "
        "This cultural movement began in Italy during the fourteenth century and spread throughout the continent.",
    )
    pool_size = len(samples)
    return [samples[index % pool_size] for index in range(n)]
|
|
|
|
| |
| |
| |
|
|
def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: load the model, rewrite samples, save JSON.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.

    The results file contains the decoding config, batch summary and one
    record per sample. Its parent directory is created when the output path
    has one; a bare filename is written to the current directory.
    """
    # Take CLI defaults from CopaConfig so the two cannot drift apart.
    defaults = CopaConfig()

    parser = argparse.ArgumentParser(description="CoPA: Contrastive Paraphrase Attack")
    parser.add_argument("--model", default=defaults.model_name)
    parser.add_argument(
        "--lambda", type=float, default=defaults.lambda_contrast, dest="lambda_contrast"
    )
    parser.add_argument(
        "--alpha", type=float, default=defaults.alpha_truncation, dest="alpha_truncation"
    )
    # NOTE: the CLI default (256) is deliberately kept as-is; it is shorter
    # than CopaConfig's own default for max_new_tokens.
    parser.add_argument("--max-tokens", type=int, default=256)
    parser.add_argument("--temperature", type=float, default=defaults.temperature)
    parser.add_argument("--num-samples", type=int, default=50)
    parser.add_argument("--output", default="output/copa_results.json")
    parser.add_argument("--device", default=defaults.device)
    args = parser.parse_args(argv)

    config = CopaConfig(
        model_name=args.model,
        lambda_contrast=args.lambda_contrast,
        alpha_truncation=args.alpha_truncation,
        max_new_tokens=args.max_tokens,
        temperature=args.temperature,
        device=args.device,
    )

    print(f"[CoPA] Loading model: {config.model_name}")
    model, tokenizer = load_model(config)

    print(f"[CoPA] Generating {args.num_samples} test texts...")
    test_texts = generate_test_texts(args.num_samples)

    print("[CoPA] Running contrastive rewriting...")
    batch_result = run_copa_batch(test_texts, model, tokenizer, config)

    # os.path.dirname() is "" for a bare filename and os.makedirs("") raises,
    # so only create the directory when the path actually has one.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    output_data = {
        "config": {
            "model": config.model_name,
            "lambda": config.lambda_contrast,
            "alpha": config.alpha_truncation,
        },
        "summary": {
            "num_samples": len(batch_result.results),
            "total_time_s": batch_result.total_time,
            "total_tokens": batch_result.total_tokens,
            "avg_tokens_per_second": batch_result.avg_tokens_per_second,
        },
        "results": [
            {
                "original": r.original_text,
                "rewritten": r.rewritten_text,
                "tokens": r.tokens_generated,
                "time_s": r.time_seconds,
            }
            for r in batch_result.results
        ],
    }

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"[CoPA] Done. {len(batch_result.results)} samples in {batch_result.total_time:.1f}s")
    print(f"[CoPA] Saved to {args.output}")
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|