evasion-detection-artifacts / src /inference_copa.py
simonlesaumon's picture
Upload src/inference_copa.py with huggingface_hub
16ca538 verified
Raw
History Blame Contribute Delete
15.6 kB
"""
CoPA (Contrastive Paraphrase Attack) — training-free evasion prototype.
Based on Fang et al., EMNLP 2025: "Your Language Model Can Secretly
Write Like Humans: Contrastive Paraphrase Attacks on LLM-Generated
Text Detectors."
Contrastive decoding rule (applied per token, in log-probability space):
score = (1+lambda) * log P_human_style - lambda * log P_machine_style
P_final = softmax(score)
Only the inference-time contrastive decoding is implemented here.
No training required — runs on Modal T4 (~$0.60/h).
"""
from __future__ import annotations
import argparse
import json
import os
import time
from dataclasses import dataclass, field
from typing import Any, TYPE_CHECKING
if TYPE_CHECKING:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# ---------------------------------------------------------------------------
# Data structures
# ---------------------------------------------------------------------------
@dataclass
class CopaConfig:
    """Settings for one CoPA contrastive-decoding run.

    Attributes:
        model_name: Hugging Face model id loaded for both prompt streams.
        lambda_contrast: Weight of the machine-style stream that is
            subtracted from the human-style stream.
        alpha_truncation: Relative probability floor; tokens whose
            human-style probability is below ``alpha * max`` are dropped.
        max_new_tokens: Upper bound on generated tokens per text.
        temperature: Divisor applied to the raw logits of both streams.
        top_p: Nucleus-sampling threshold on the contrastive distribution.
        repetition_penalty: Score penalty applied to every token id that
            was already generated.
        diversity_bonus_strength: Amount subtracted from a token's score
            per occurrence among the most recent generated tokens.
        human_style_prompt: Template for the "human" stream; the literal
            ``{input_text}`` is replaced with the text to rewrite.
        machine_style_prompt: Template for the "machine" stream, same
            placeholder convention.
        device: Requested device name.
            NOTE(review): ``load_model`` in this file uses
            ``device_map="auto"`` and does not read this field.
    """

    # --- Model -----------------------------------------------------------
    model_name: str = "Qwen/Qwen2.5-1.5B-Instruct"

    # --- Contrastive decoding ---------------------------------------------
    lambda_contrast: float = 0.5
    alpha_truncation: float = 1e-5

    # --- Sampling -----------------------------------------------------------
    max_new_tokens: int = 768  # raised from 256 to avoid cut-off outputs
    temperature: float = 1.0
    top_p: float = 0.92
    repetition_penalty: float = 1.15
    diversity_bonus_strength: float = 0.5

    # --- Prompt templates ---------------------------------------------------
    human_style_prompt: str = (
        "Rewrite this to sound like a natural human wrote it, "
        "with varied sentences and conversational wording:\n\n{input_text}"
    )
    machine_style_prompt: str = (
        "Repeat the following text exactly, word for word, "
        "maintaining the original formal structure:\n\n{input_text}"
    )

    # --- Runtime --------------------------------------------------------------
    device: str = "cuda"
@dataclass
class CopaResult:
    """Outcome of rewriting a single text.

    Attributes:
        original_text: The input text as given.
        rewritten_text: The cleaned, decoded rewrite.
        tokens_generated: Number of token ids sampled for this text.
        time_seconds: Wall-clock duration of the rewrite.
        contrast_strength: The ``lambda_contrast`` value that was used.
    """

    original_text: str
    rewritten_text: str
    tokens_generated: int
    time_seconds: float
    contrast_strength: float
# ---------------------------------------------------------------------------
# Model loading
# ---------------------------------------------------------------------------
def _lazy_import_torch():
    """Import the heavy ML dependencies on first use.

    Keeping these imports out of module scope lets the file be imported
    (e.g. for its dataclasses or CLI parser) on machines without torch or
    transformers installed.

    Returns:
        A 3-tuple ``(torch, AutoModelForCausalLM, AutoTokenizer)``.
    """
    import torch  # noqa: F811
    import transformers

    return torch, transformers.AutoModelForCausalLM, transformers.AutoTokenizer
def load_model(config: CopaConfig):
    """Load the tokenizer and causal LM named by ``config.model_name``.

    The same model instance serves both the human-style and machine-style
    prompt streams, so it is loaded once.

    Args:
        config: Run configuration; only ``model_name`` is read here.

    Returns:
        ``(model, tokenizer)`` with the model switched to eval mode.
    """
    torch, model_cls, tokenizer_cls = _lazy_import_torch()
    model_id = config.model_name

    # NOTE(review): trust_remote_code=True lets the model repository supply
    # code that runs locally — only point model_name at repos you trust.
    tokenizer = tokenizer_cls.from_pretrained(model_id, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Reuse EOS as the padding token when the tokenizer defines none.
        tokenizer.pad_token = tokenizer.eos_token

    # NOTE(review): config.device is not consulted; placement is delegated
    # to device_map="auto".
    load_kwargs = {
        "torch_dtype": torch.float16,
        "device_map": "auto",
        "trust_remote_code": True,
    }
    model = model_cls.from_pretrained(model_id, **load_kwargs)
    model.eval()
    return model, tokenizer
# ---------------------------------------------------------------------------
# Output cleaning
# ---------------------------------------------------------------------------
def _clean_output(text: str, original: str) -> str:
"""Remove template artifacts, repeated prompts, and truncated sentences.
Common artifacts from CoPA/LLM generation:
- Repeated few-shot templates ("Text: ...", "Human version:")
- Instruction echoes ("### Informal Natural Language Rewritten:")
- Trailing mid-word cutoffs
"""
import re
# Cut at common template repetition patterns
cut_patterns = [
r"\n\s*Text:\s", # Few-shot template repetition
r"\n\s*Human version:", # Few-shot output label
r"\n\s*Formal academic", # Machine-style prompt leak
r"\n\s*Formal explanation",
r"###\s", # Markdown headings (meta-artifacts)
r"\n\s*You are an AI", # System prompt leak
r"\n\s*Here is a more", # Prompt repetition
r"\n\s*Rewrite the", # Instruction echo
]
for pattern in cut_patterns:
m = re.search(pattern, text)
if m:
text = text[: m.start()].strip()
break
# Remove trailing incomplete sentence (no ending punctuation)
text = text.rstrip()
if text and text[-1] not in '.!?":' "'" ')' ']':
# Find last complete sentence
last_period = max(text.rfind('.'), text.rfind('!'), text.rfind('?'))
if last_period > len(text) * 0.6: # Only if we have enough content
text = text[: last_period + 1]
return text.strip()
# ---------------------------------------------------------------------------
# CoPA contrastive decoding
# ---------------------------------------------------------------------------
def _copa_stop_ids(model, tokenizer) -> set[int]:
    """Collect every token id that should terminate generation.

    Merges ``tokenizer.eos_token_id`` with ``model.generation_config.eos_token_id``
    (which may be a single int or a list), skipping whichever is missing.
    """
    stop_ids: set[int] = set()
    generation_config = getattr(model, "generation_config", None)
    candidates = (
        getattr(tokenizer, "eos_token_id", None),
        getattr(generation_config, "eos_token_id", None),
    )
    for candidate in candidates:
        if candidate is None:
            continue
        if isinstance(candidate, int):
            stop_ids.add(candidate)
        else:
            stop_ids.update(int(token_id) for token_id in candidate)
    return stop_ids


def _copa_sample_token(torch, h_logits, m_logits, generated_ids, config) -> int:
    """Sample one token id from the contrastive distribution.

    ``h_logits`` / ``m_logits`` are the temperature-scaled float32 logits of
    the human-style and machine-style streams for the next position.
    """
    lambda_ = config.lambda_contrast
    rep_penalty = config.repetition_penalty
    top_p = config.top_p
    div_strength = config.diversity_bonus_strength

    # Contrastive combination in log-probability space.
    h_log_probs = torch.log_softmax(h_logits, dim=-1)
    m_log_probs = torch.log_softmax(m_logits, dim=-1)
    scores = (1 + lambda_) * h_log_probs - lambda_ * m_log_probs

    # Repetition penalty on every token id generated so far. Positive scores
    # are divided and negative ones multiplied so both move downwards.
    if rep_penalty != 1.0 and generated_ids:
        seen = torch.tensor(sorted(set(generated_ids)), device=scores.device)
        seen_scores = scores[seen]
        scores[seen] = torch.where(
            seen_scores > 0, seen_scores / rep_penalty, seen_scores * rep_penalty
        )

    # Adaptive truncation: keep tokens with P_h >= alpha * max(P_h).
    h_probs = torch.softmax(h_logits, dim=-1)
    keep = h_probs >= config.alpha_truncation * h_probs.max()
    scores[~keep] = float("-inf")

    # Top-p (nucleus) filtering on the contrastive distribution.
    if top_p < 1.0:
        sorted_scores, sorted_indices = torch.sort(scores, descending=True)
        cumulative = torch.cumsum(torch.softmax(sorted_scores, dim=-1), dim=-1)
        drop = cumulative > top_p
        # Shift right by one so the token that crosses the threshold stays
        # in the nucleus (kept mass >= top_p); the top token is always kept.
        drop[1:] = drop[:-1].clone()
        drop[0] = False
        scores[sorted_indices[drop]] = float("-inf")

    # Diversity penalty: subtract div_strength once per occurrence among the
    # last 20 generated tokens (index_add_ accumulates duplicate indices).
    if div_strength > 0 and len(generated_ids) >= 3:
        recent = torch.tensor(generated_ids[-20:], device=scores.device)
        penalty = torch.full(
            recent.shape, -div_strength, dtype=scores.dtype, device=scores.device
        )
        scores.index_add_(0, recent, penalty)

    probs = torch.softmax(scores, dim=-1)
    return int(torch.multinomial(probs, num_samples=1).item())


def copa_rewrite(
    text: str,
    model,
    tokenizer,
    config: CopaConfig,
) -> CopaResult:
    """Rewrite ``text`` with token-by-token contrastive decoding.

    Two prompts are built from ``config.human_style_prompt`` and
    ``config.machine_style_prompt`` by substituting ``{input_text}``. At
    each step the same model scores the next token under both prompts
    (each followed by the tokens generated so far), and one token is
    sampled from::

        softmax((1 + lambda) * log P_human - lambda * log P_machine)

    after repetition penalty, adaptive truncation, top-p filtering and a
    recent-token diversity penalty. Generation stops after
    ``config.max_new_tokens`` tokens or when a stop token is sampled.

    Args:
        text: The text to rewrite.
        model: Causal LM returned by ``load_model``.
        tokenizer: Matching tokenizer.
        config: Decoding settings.

    Returns:
        A ``CopaResult``; ``tokens_generated`` counts every sampled token,
        including a terminating stop token if one was sampled.
    """
    torch, _, _ = _lazy_import_torch()
    start_time = time.time()

    # str.replace (not str.format) so braces inside `text` are harmless.
    human_prompt = config.human_style_prompt.replace("{input_text}", text)
    machine_prompt = config.machine_style_prompt.replace("{input_text}", text)

    device = model.device
    h_step = tokenizer(human_prompt, return_tensors="pt")["input_ids"].to(device)
    m_step = tokenizer(machine_prompt, return_tensors="pt")["input_ids"].to(device)
    # Single unpadded sequence per stream, so the mask is all ones.
    h_mask = torch.ones_like(h_step)
    m_mask = torch.ones_like(m_step)
    h_cache = None
    m_cache = None

    stop_ids = _copa_stop_ids(model, tokenizer)
    temp = config.temperature
    generated_ids: list[int] = []

    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for _ in range(config.max_new_tokens):
            # With a KV cache, only the first call encodes the full prompt;
            # later calls feed just the newly sampled token.
            h_out = model(
                input_ids=h_step,
                attention_mask=h_mask,
                past_key_values=h_cache,
                use_cache=True,
            )
            m_out = model(
                input_ids=m_step,
                attention_mask=m_mask,
                past_key_values=m_cache,
                use_cache=True,
            )
            h_cache = h_out.past_key_values
            m_cache = m_out.past_key_values

            # Upcast before softmax/cumsum/multinomial: half precision is
            # too coarse for probabilities over a large vocabulary.
            h_logits = h_out.logits[0, -1, :].float() / temp
            m_logits = m_out.logits[0, -1, :].float() / temp

            next_token_id = _copa_sample_token(
                torch, h_logits, m_logits, generated_ids, config
            )
            generated_ids.append(next_token_id)
            if next_token_id in stop_ids:
                break

            # Both streams continue with the same sampled token.
            next_step = torch.tensor([[next_token_id]], device=device)
            h_step = next_step
            m_step = next_step
            extra = torch.ones_like(next_step)
            h_mask = torch.cat([h_mask, extra], dim=1)
            m_mask = torch.cat([m_mask, extra], dim=1)

    rewritten = tokenizer.decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    # Post-processing: strip repeated template artifacts.
    rewritten = _clean_output(rewritten, text)

    elapsed = time.time() - start_time
    return CopaResult(
        original_text=text,
        rewritten_text=rewritten,
        tokens_generated=len(generated_ids),
        time_seconds=elapsed,
        contrast_strength=config.lambda_contrast,
    )
# ---------------------------------------------------------------------------
# Batch processing
# ---------------------------------------------------------------------------
@dataclass
class CopaBatchResult:
    """Aggregate outcome of a batch run.

    Attributes:
        results: One ``CopaResult`` per input text, in input order.
        total_time: Wall-clock seconds for the whole batch.
        total_tokens: Sum of ``tokens_generated`` over successful rewrites.
        avg_tokens_per_second: ``total_tokens / total_time`` (0.0 until set).
    """

    # default_factory gives every instance its own list.
    results: list[CopaResult] = field(default_factory=list)
    total_time: float = 0.0
    total_tokens: int = 0
    avg_tokens_per_second: float = 0.0
def run_copa_batch(
    texts: list[str],
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    config: CopaConfig,
) -> CopaBatchResult:
    """Rewrite each text in ``texts`` sequentially and collect the results.

    A failure on one sample is reported on stdout and recorded as a
    fallback result whose ``rewritten_text`` equals the original and whose
    ``tokens_generated`` is 0, so the batch always yields one result per
    input.

    Args:
        texts: Texts to rewrite.
        model: Causal LM passed through to ``copa_rewrite``.
        tokenizer: Matching tokenizer.
        config: Decoding settings.

    Returns:
        A ``CopaBatchResult`` with per-sample results and throughput totals.
    """
    batch = CopaBatchResult()
    start = time.time()

    for i, text in enumerate(texts):
        print(f"[CoPA] {i+1}/{len(texts)}: rewriting {len(text.split())} words...")
        try:
            result = copa_rewrite(text, model, tokenizer, config)
        except Exception as e:
            # Best-effort batch: keep going and fall back to the original.
            print(f"[CoPA] ERROR on sample {i}: {e}")
            result = CopaResult(
                original_text=text,
                rewritten_text=text,
                tokens_generated=0,
                time_seconds=0,
                contrast_strength=config.lambda_contrast,
            )
        batch.results.append(result)
        # Fallback results contribute 0 tokens, leaving the total unchanged.
        batch.total_tokens += result.tokens_generated

    elapsed = time.time() - start
    batch.total_time = elapsed
    if elapsed > 0:
        batch.avg_tokens_per_second = batch.total_tokens / elapsed
    return batch
# ---------------------------------------------------------------------------
# Test data generation
# ---------------------------------------------------------------------------
def generate_test_texts(n: int = 50) -> list[str]:
    """Return ``n`` placeholder AI-style passages for smoke testing.

    Five fixed passages are cycled in order until ``n`` items have been
    produced. For real evaluations, substitute genuine AI-generated texts
    (e.g. from HC3 or a similar corpus).
    """
    passages = (
        "Artificial intelligence has revolutionized the field of natural language processing in recent years. The development of large language models has enabled unprecedented capabilities in text generation, translation, and summarization tasks.",
        "Climate change represents one of the most significant challenges facing humanity in the twenty-first century. Rising global temperatures have led to increasingly severe weather events, sea level rise, and disruptions to ecosystems worldwide.",
        "The history of computer science can be traced back to the early twentieth century, with the foundational work of Alan Turing and others. Their theoretical contributions laid the groundwork for the digital revolution that followed.",
        "Machine learning algorithms have demonstrated remarkable success across a wide range of applications, from image recognition to natural language understanding. These systems learn patterns from large datasets.",
        "The Renaissance period marked a profound transformation in European art, science, and philosophy. This cultural movement began in Italy during the fourteenth century and spread throughout the continent.",
    )
    pool_size = len(passages)
    # Wrap around the pool so any n is served.
    return [passages[position % pool_size] for position in range(n)]
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main():
    """Command-line entry point: rewrite the built-in test texts and save JSON.

    Parses CLI flags into a ``CopaConfig``, loads the model, runs the batch
    rewrite over ``--num-samples`` generated test texts, and writes the
    config, summary and per-sample results to ``--output``.

    Flag defaults are taken from ``CopaConfig`` so the CLI and the
    dataclass cannot drift apart (``--max-tokens`` previously defaulted to
    256 while the config default was 768).
    """
    defaults = CopaConfig()

    parser = argparse.ArgumentParser(description="CoPA: Contrastive Paraphrase Attack")
    parser.add_argument("--model", default=defaults.model_name)
    parser.add_argument(
        "--lambda", type=float, default=defaults.lambda_contrast, dest="lambda_contrast"
    )
    parser.add_argument(
        "--alpha", type=float, default=defaults.alpha_truncation, dest="alpha_truncation"
    )
    parser.add_argument("--max-tokens", type=int, default=defaults.max_new_tokens)
    parser.add_argument("--temperature", type=float, default=defaults.temperature)
    parser.add_argument("--num-samples", type=int, default=50)
    parser.add_argument("--output", default="output/copa_results.json")
    parser.add_argument("--device", default=defaults.device)
    args = parser.parse_args()

    config = CopaConfig(
        model_name=args.model,
        lambda_contrast=args.lambda_contrast,
        alpha_truncation=args.alpha_truncation,
        max_new_tokens=args.max_tokens,
        temperature=args.temperature,
        device=args.device,
    )

    print(f"[CoPA] Loading model: {config.model_name}")
    model, tokenizer = load_model(config)

    print(f"[CoPA] Generating {args.num_samples} test texts...")
    test_texts = generate_test_texts(args.num_samples)

    print("[CoPA] Running contrastive rewriting...")
    batch_result = run_copa_batch(test_texts, model, tokenizer, config)

    # Save results. A bare filename has an empty dirname, and
    # os.makedirs("") raises, so only create a directory when one is given.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    output_data = {
        "config": {
            "model": config.model_name,
            "lambda": config.lambda_contrast,
            "alpha": config.alpha_truncation,
        },
        "summary": {
            "num_samples": len(batch_result.results),
            "total_time_s": batch_result.total_time,
            "total_tokens": batch_result.total_tokens,
            "avg_tokens_per_second": batch_result.avg_tokens_per_second,
        },
        "results": [
            {
                "original": r.original_text,
                "rewritten": r.rewritten_text,
                "tokens": r.tokens_generated,
                "time_s": r.time_seconds,
            }
            for r in batch_result.results
        ],
    }
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"[CoPA] Done. {len(batch_result.results)} samples in {batch_result.total_time:.1f}s")
    print(f"[CoPA] Saved to {args.output}")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()