Upload src/inference_copa.py with huggingface_hub

16ca538 verified 3 days ago

15.6 kB

	"""
	CoPA (Contrastive Paraphrase Attack) — training-free evasion prototype.

	Based on Fang et al., EMNLP 2025: "Your Language Model Can Secretly
	Write Like Humans: Contrastive Paraphrase Attacks on LLM-Generated
	Text Detectors."

	Contrastive decoding formula (combined in log-probability space, then
	renormalized with a softmax):
	P_final = softmax((1+lambda) * log P_human_style - lambda * log P_machine_style)

	Only the inference-time contrastive decoding is implemented here.
	No training required — runs on Modal T4 (~$0.60/h).
	"""

	from __future__ import annotations

	import argparse
	import json
	import os
	import time
	from dataclasses import dataclass, field
	from typing import Any, TYPE_CHECKING

	if TYPE_CHECKING:
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer


	# ---------------------------------------------------------------------------
	# Data structures
	# ---------------------------------------------------------------------------

	@dataclass
	class CopaConfig:
	    """Tunable settings for one CoPA contrastive-decoding run.

	    Field order is part of the positional constructor signature, so new
	    fields should be appended rather than inserted.
	    """

	    # --- Model ---
	    # Hugging Face model id handed to ``from_pretrained``.
	    model_name: str = "Qwen/Qwen2.5-1.5B-Instruct"

	    # --- Contrastive decoding ---
	    # Weight of the machine-style branch in
	    # (1 + lambda) * log P_human - lambda * log P_machine.
	    lambda_contrast: float = 0.5
	    # Tokens whose human-branch probability falls below
	    # alpha * max(P_human) are excluded before sampling.
	    alpha_truncation: float = 1e-5

	    # --- Generation ---
	    # Upper bound on the number of tokens sampled per input text.
	    max_new_tokens: int = 768
	    # Divides the logits of both branches before they are combined.
	    temperature: float = 1.0
	    # Nucleus cutoff; values >= 1.0 turn nucleus filtering off.
	    top_p: float = 0.92
	    # Penalty applied to every token id already generated; 1.0 turns it off.
	    repetition_penalty: float = 1.15
	    # Per-occurrence penalty for token ids seen in the last 20 outputs;
	    # 0 turns it off.
	    diversity_bonus_strength: float = 0.5

	    # --- Prompts ---
	    # The literal marker "{input_text}" is substituted with the text to rewrite.
	    human_style_prompt: str = (
	        "Rewrite this to sound like a natural human wrote it, with varied "
	        "sentences and conversational wording:\n\n{input_text}"
	    )
	    machine_style_prompt: str = (
	        "Repeat the following text exactly, word for word, maintaining the "
	        "original formal structure:\n\n{input_text}"
	    )

	    # Requested device name (the CLI fills this from ``--device``).
	    device: str = "cuda"


	@dataclass
	class CopaResult:
	    """Outcome of rewriting one input text."""

	    # Input text exactly as it was received.
	    original_text: str
	    # Cleaned model output (the batch runner stores the input here on failure).
	    rewritten_text: str
	    # Number of sampled token ids.
	    tokens_generated: int
	    # Wall-clock duration of the rewrite, in seconds.
	    time_seconds: float
	    # Lambda used for the contrastive combination.
	    contrast_strength: float


	# ---------------------------------------------------------------------------
	# Model loading
	# ---------------------------------------------------------------------------

	def _lazy_import_torch():
	    """Import the heavy ML dependencies on first use and hand them back.

	    Importing inside the function keeps module import cheap on machines
	    that only need the CLI definitions or the data classes.

	    Returns:
	        A ``(torch, AutoModelForCausalLM, AutoTokenizer)`` tuple.
	    """
	    import torch as torch_module  # noqa: F811
	    import transformers  # noqa: F811

	    return (
	        torch_module,
	        transformers.AutoModelForCausalLM,
	        transformers.AutoTokenizer,
	    )


	def load_model(config: CopaConfig):
	    """Load the causal LM and tokenizer named by ``config.model_name``.

	    The same model instance serves both the human-style and the
	    machine-style branch of the contrastive decoder.

	    Device handling honours ``config.device``:
	      * ``"cuda"`` (the default) keeps the automatic placement
	        (``device_map="auto"``) with float16 weights.
	      * ``"cpu"`` places the model on the CPU and loads float32 weights,
	        since half precision is slow or unsupported for many CPU kernels.
	      * any other device string (e.g. ``"cuda:1"``) is passed through as
	        the device map with float16 weights.

	    Args:
	        config: Run configuration; only ``model_name`` and ``device`` are read.

	    Returns:
	        A ``(model, tokenizer)`` tuple with the model switched to eval mode.
	    """
	    torch, AutoModelForCausalLM, AutoTokenizer = _lazy_import_torch()

	    # NOTE(security): trust_remote_code=True allows the model repository to
	    # execute arbitrary Python on this machine. Only load trusted repos.
	    tokenizer = AutoTokenizer.from_pretrained(
	        config.model_name,
	        trust_remote_code=True,
	    )
	    # Some tokenizers ship without a pad token; reuse EOS so padding works.
	    if tokenizer.pad_token is None:
	        tokenizer.pad_token = tokenizer.eos_token

	    requested_device = (config.device or "cuda").strip().lower()
	    on_cpu = requested_device == "cpu"
	    device_map = "auto" if requested_device == "cuda" else requested_device

	    model = AutoModelForCausalLM.from_pretrained(
	        config.model_name,
	        torch_dtype=torch.float32 if on_cpu else torch.float16,
	        device_map=device_map,
	        trust_remote_code=True,
	    )
	    model.eval()
	    return model, tokenizer


	# ---------------------------------------------------------------------------
	# Output cleaning
	# ---------------------------------------------------------------------------

	def _clean_output(text: str, original: str) -> str:
	"""Remove template artifacts, repeated prompts, and truncated sentences.

	Common artifacts from CoPA/LLM generation:
	- Repeated few-shot templates ("Text: ...", "Human version:")
	- Instruction echoes ("### Informal Natural Language Rewritten:")
	- Trailing mid-word cutoffs
	"""
	import re

	# Cut at common template repetition patterns
	cut_patterns = [
	r"\n\s*Text:\s", # Few-shot template repetition
	r"\n\s*Human version:", # Few-shot output label
	r"\n\s*Formal academic", # Machine-style prompt leak
	r"\n\s*Formal explanation",
	r"###\s", # Markdown headings (meta-artifacts)
	r"\n\s*You are an AI", # System prompt leak
	r"\n\s*Here is a more", # Prompt repetition
	r"\n\s*Rewrite the", # Instruction echo
	]
	for pattern in cut_patterns:
	m = re.search(pattern, text)
	if m:
	text = text[: m.start()].strip()
	break

	# Remove trailing incomplete sentence (no ending punctuation)
	text = text.rstrip()
	if text and text[-1] not in '.!?":' "'" ')' ']':
	# Find last complete sentence
	last_period = max(text.rfind('.'), text.rfind('!'), text.rfind('?'))
	if last_period > len(text) * 0.6: # Only if we have enough content
	text = text[: last_period + 1]

	return text.strip()


	# ---------------------------------------------------------------------------
	# CoPA contrastive decoding
	# ---------------------------------------------------------------------------

	def _collect_stop_ids(model, tokenizer) -> set[int]:
	    """Gather every token id that should terminate generation.

	    Combines the tokenizer's EOS id with any EOS ids declared on the model's
	    generation config (which may be a single int or a list), when present.
	    """
	    stop_ids: set[int] = set()
	    if tokenizer.eos_token_id is not None:
	        stop_ids.add(int(tokenizer.eos_token_id))
	    generation_config = getattr(model, "generation_config", None)
	    declared = getattr(generation_config, "eos_token_id", None)
	    if isinstance(declared, int):
	        stop_ids.add(declared)
	    elif declared:
	        stop_ids.update(int(token_id) for token_id in declared)
	    return stop_ids


	def _forward_last_logits(model, input_ids, attention_mask, past_key_values):
	    """Run one cached forward pass; return float32 last-position logits and the cache."""
	    out = model(
	        input_ids=input_ids,
	        attention_mask=attention_mask,
	        past_key_values=past_key_values,
	        use_cache=True,
	    )
	    # Upcast: softmax, cumsum and the tiny alpha threshold lose too much
	    # precision in half precision.
	    return out.logits[0, -1, :].float(), out.past_key_values


	def _sample_contrastive_token(torch, h_logits, m_logits, generated_ids, config) -> int:
	    """Combine both branches' logits, filter/penalize, and sample one token id."""
	    from collections import Counter

	    lambda_ = config.lambda_contrast
	    rep_penalty = config.repetition_penalty
	    div_strength = config.diversity_bonus_strength
	    top_p = config.top_p
	    neg_inf = float("-inf")

	    # --- Contrastive combination in log-probability space ---
	    h_log_probs = torch.log_softmax(h_logits / config.temperature, dim=-1)
	    m_log_probs = torch.log_softmax(m_logits / config.temperature, dim=-1)
	    scores = (1 + lambda_) * h_log_probs - lambda_ * m_log_probs

	    # --- Repetition penalty on every token already generated ---
	    if rep_penalty != 1.0 and generated_ids:
	        seen = torch.tensor(sorted(set(generated_ids)), device=scores.device)
	        seen_scores = scores[seen]
	        # Positive scores shrink, negative scores grow more negative.
	        scores[seen] = torch.where(
	            seen_scores > 0, seen_scores / rep_penalty, seen_scores * rep_penalty
	        )

	    # --- Adaptive truncation: keep tokens with P_h >= alpha * max(P_h) ---
	    h_probs = h_log_probs.exp()
	    plausible = h_probs >= config.alpha_truncation * h_probs.max()
	    scores[~plausible] = neg_inf

	    # --- Top-p (nucleus) filtering ---
	    if top_p < 1.0:
	        sorted_scores, sorted_idx = torch.sort(scores, descending=True)
	        cumulative = torch.cumsum(torch.softmax(sorted_scores, dim=-1), dim=-1)
	        drop = cumulative > top_p
	        # Shift right so the token that crosses the threshold stays in the
	        # nucleus (smallest set whose mass reaches top_p).
	        drop[1:] = drop[:-1].clone()
	        drop[0] = False  # always keep at least one token
	        scores[sorted_idx[drop]] = neg_inf

	    # --- Diversity penalty: tokens used in the last 20 positions ---
	    if div_strength > 0 and len(generated_ids) >= 3:
	        recent_counts = Counter(generated_ids[-20:])
	        recent_ids = torch.tensor(list(recent_counts), device=scores.device)
	        occurrences = torch.tensor(
	            list(recent_counts.values()), dtype=scores.dtype, device=scores.device
	        )
	        scores[recent_ids] -= div_strength * occurrences

	    probs = torch.softmax(scores, dim=-1)
	    return int(torch.multinomial(probs, num_samples=1).item())


	def copa_rewrite(
	    text: str,
	    model,
	    tokenizer,
	    config: CopaConfig,
	) -> CopaResult:
	    """Rewrite `text` using contrastive decoding between two prompts.

	    Algorithm:
	        1. Build the human-style and machine-style prompts by substituting
	           `text` for the ``{input_text}`` marker in the config templates.
	        2. For each token position t:
	           a. Compute human-branch logits from model(x_h + y_<t).
	           b. Compute machine-branch logits from model(x_m + y_<t).
	           c. scores = (1 + lambda) * log P_h - lambda * log P_m, where both
	              distributions are taken after dividing logits by temperature.
	           d. Apply the repetition penalty to already generated tokens.
	           e. Apply adaptive truncation, then top-p nucleus filtering.
	           f. Apply the diversity penalty to recently used tokens.
	           g. Sample the next token from softmax(scores).
	        3. Stop at an end-of-sequence token or after ``max_new_tokens``.

	    Both branches keep their own key/value cache, so each step only feeds
	    the newly sampled token instead of re-encoding the whole context.

	    Args:
	        text: The text to paraphrase.
	        model: A causal LM that is callable with ``input_ids``,
	            ``attention_mask``, ``past_key_values`` and ``use_cache``, and
	            exposes ``.device``.
	        tokenizer: The tokenizer belonging to `model`.
	        config: Decoding settings and prompt templates.

	    Returns:
	        A ``CopaResult`` holding the cleaned rewrite, the number of sampled
	        tokens (including a final EOS token, if one was sampled) and the
	        elapsed wall-clock time.

	    Raises:
	        ValueError: If ``config.temperature`` is not strictly positive.
	    """
	    torch, _, _ = _lazy_import_torch()
	    start_time = time.time()

	    if config.temperature <= 0:
	        raise ValueError(
	            f"temperature must be > 0 for sampling, got {config.temperature}"
	        )

	    # str.replace (not str.format) so braces inside `text` are harmless.
	    human_prompt = config.human_style_prompt.replace("{input_text}", text)
	    machine_prompt = config.machine_style_prompt.replace("{input_text}", text)

	    device = model.device
	    h_encoded = tokenizer(human_prompt, return_tensors="pt").to(device)
	    m_encoded = tokenizer(machine_prompt, return_tensors="pt").to(device)
	    h_ids, h_mask = h_encoded["input_ids"], h_encoded["attention_mask"]
	    m_ids, m_mask = m_encoded["input_ids"], m_encoded["attention_mask"]
	    h_cache = None
	    m_cache = None

	    stop_ids = _collect_stop_ids(model, tokenizer)
	    generated_ids: list[int] = []

	    # Inference only: without no_grad every step would build an autograd graph.
	    with torch.no_grad():
	        for _ in range(config.max_new_tokens):
	            h_logits, h_cache = _forward_last_logits(model, h_ids, h_mask, h_cache)
	            m_logits, m_cache = _forward_last_logits(model, m_ids, m_mask, m_cache)

	            next_token_id = _sample_contrastive_token(
	                torch, h_logits, m_logits, generated_ids, config
	            )
	            generated_ids.append(next_token_id)

	            if next_token_id in stop_ids:
	                break

	            # With the caches holding the earlier context, the next step
	            # only needs the new token; the masks grow by one position.
	            next_input = torch.tensor([[next_token_id]], device=device)
	            h_ids = next_input
	            m_ids = next_input
	            h_mask = torch.cat([h_mask, h_mask.new_ones((1, 1))], dim=1)
	            m_mask = torch.cat([m_mask, m_mask.new_ones((1, 1))], dim=1)

	    rewritten = tokenizer.decode(
	        generated_ids,
	        skip_special_tokens=True,
	        clean_up_tokenization_spaces=True,
	    )

	    # Post-processing: strip repeated template artifacts.
	    rewritten = _clean_output(rewritten, text)

	    elapsed = time.time() - start_time
	    return CopaResult(
	        original_text=text,
	        rewritten_text=rewritten,
	        tokens_generated=len(generated_ids),
	        time_seconds=elapsed,
	        contrast_strength=config.lambda_contrast,
	    )


	# ---------------------------------------------------------------------------
	# Batch processing
	# ---------------------------------------------------------------------------

	@dataclass
	class CopaBatchResult:
	    """Aggregate outcome of rewriting a list of texts."""

	    # One entry per input text, in input order; each instance gets its own list.
	    results: list[CopaResult] = field(default_factory=list)
	    # Wall-clock seconds spent on the whole batch.
	    total_time: float = 0.0
	    # Sum of ``tokens_generated`` over the successful rewrites.
	    total_tokens: int = 0
	    # ``total_tokens / total_time``; stays 0.0 when no time elapsed.
	    avg_tokens_per_second: float = 0.0


	def run_copa_batch(
	    texts: list[str],
	    model: AutoModelForCausalLM,
	    tokenizer: AutoTokenizer,
	    config: CopaConfig,
	) -> CopaBatchResult:
	    """Rewrite every text in `texts` and collect per-item and aggregate stats.

	    A failure on one text does not abort the batch: the error is printed and
	    a placeholder result that echoes the original text (0 tokens, 0 seconds)
	    is stored in its place, so ``results`` always lines up with `texts`.

	    Returns:
	        A ``CopaBatchResult`` with one result per input, the total elapsed
	        time, the total generated tokens and the resulting throughput.
	    """
	    outcome = CopaBatchResult()
	    total = len(texts)
	    began = time.time()

	    for position, source_text in enumerate(texts):
	        word_count = len(source_text.split())
	        print(f"[CoPA] {position + 1}/{total}: rewriting {word_count} words...")
	        try:
	            rewritten = copa_rewrite(source_text, model, tokenizer, config)
	            outcome.results.append(rewritten)
	            outcome.total_tokens += rewritten.tokens_generated
	        except Exception as e:
	            # Best effort: report, keep the original text, move on.
	            print(f"[CoPA] ERROR on sample {position}: {e}")
	            placeholder = CopaResult(
	                original_text=source_text,
	                rewritten_text=source_text,
	                tokens_generated=0,
	                time_seconds=0,
	                contrast_strength=config.lambda_contrast,
	            )
	            outcome.results.append(placeholder)

	    elapsed = time.time() - began
	    outcome.total_time = elapsed
	    if elapsed > 0:
	        outcome.avg_tokens_per_second = outcome.total_tokens / elapsed
	    return outcome


	# ---------------------------------------------------------------------------
	# Test data generation
	# ---------------------------------------------------------------------------

	def generate_test_texts(n: int = 50) -> list[str]:
	    """Return `n` synthetic AI-sounding paragraphs for smoke testing.

	    Five fixed template paragraphs are cycled in order until `n` items have
	    been produced, so the output repeats every five entries. In production,
	    replace with real AI-generated texts from HC3 or similar.

	    Args:
	        n: Number of texts to return. Zero or a negative value yields an
	            empty list.

	    Returns:
	        A new list of `n` strings.
	    """
	    templates = (
	        "Artificial intelligence has revolutionized the field of natural language processing in recent years. The development of large language models has enabled unprecedented capabilities in text generation, translation, and summarization tasks.",
	        "Climate change represents one of the most significant challenges facing humanity in the twenty-first century. Rising global temperatures have led to increasingly severe weather events, sea level rise, and disruptions to ecosystems worldwide.",
	        "The history of computer science can be traced back to the early twentieth century, with the foundational work of Alan Turing and others. Their theoretical contributions laid the groundwork for the digital revolution that followed.",
	        "Machine learning algorithms have demonstrated remarkable success across a wide range of applications, from image recognition to natural language understanding. These systems learn patterns from large datasets.",
	        "The Renaissance period marked a profound transformation in European art, science, and philosophy. This cultural movement began in Italy during the fourteenth century and spread throughout the continent.",
	    )
	    # Cycle through the templates until n texts have been emitted.
	    return [templates[i % len(templates)] for i in range(n)]


	# ---------------------------------------------------------------------------
	# CLI
	# ---------------------------------------------------------------------------

	def main():
	    """Command-line entry point: rewrite synthetic texts and save a JSON report.

	    Parses the CLI flags, loads the model, runs the contrastive rewrite over
	    ``--num-samples`` generated test texts and writes the configuration,
	    summary statistics and per-sample results to ``--output``.

	    Decoding defaults are taken from ``CopaConfig`` so the CLI and the
	    dataclass cannot drift apart (in particular ``--max-tokens`` follows the
	    dataclass default instead of a separately hard-coded value).
	    """
	    defaults = CopaConfig()

	    parser = argparse.ArgumentParser(description="CoPA: Contrastive Paraphrase Attack")
	    parser.add_argument("--model", default=defaults.model_name)
	    parser.add_argument(
	        "--lambda", type=float, default=defaults.lambda_contrast, dest="lambda_contrast"
	    )
	    parser.add_argument(
	        "--alpha", type=float, default=defaults.alpha_truncation, dest="alpha_truncation"
	    )
	    parser.add_argument("--max-tokens", type=int, default=defaults.max_new_tokens)
	    parser.add_argument("--temperature", type=float, default=defaults.temperature)
	    parser.add_argument("--num-samples", type=int, default=50)
	    parser.add_argument("--output", default="output/copa_results.json")
	    parser.add_argument("--device", default=defaults.device)
	    args = parser.parse_args()

	    config = CopaConfig(
	        model_name=args.model,
	        lambda_contrast=args.lambda_contrast,
	        alpha_truncation=args.alpha_truncation,
	        max_new_tokens=args.max_tokens,
	        temperature=args.temperature,
	        device=args.device,
	    )

	    print(f"[CoPA] Loading model: {config.model_name}")
	    model, tokenizer = load_model(config)

	    print(f"[CoPA] Generating {args.num_samples} test texts...")
	    test_texts = generate_test_texts(args.num_samples)

	    print("[CoPA] Running contrastive rewriting...")
	    batch_result = run_copa_batch(test_texts, model, tokenizer, config)

	    # Save results. A bare filename has an empty dirname, and
	    # os.makedirs("") raises, so only create the directory when there is one.
	    output_dir = os.path.dirname(args.output)
	    if output_dir:
	        os.makedirs(output_dir, exist_ok=True)

	    output_data = {
	        "config": {
	            "model": config.model_name,
	            "lambda": config.lambda_contrast,
	            "alpha": config.alpha_truncation,
	        },
	        "summary": {
	            "num_samples": len(batch_result.results),
	            "total_time_s": batch_result.total_time,
	            "total_tokens": batch_result.total_tokens,
	            "avg_tokens_per_second": batch_result.avg_tokens_per_second,
	        },
	        "results": [
	            {
	                "original": r.original_text,
	                "rewritten": r.rewritten_text,
	                "tokens": r.tokens_generated,
	                "time_s": r.time_seconds,
	            }
	            for r in batch_result.results
	        ],
	    }

	    with open(args.output, "w", encoding="utf-8") as f:
	        json.dump(output_data, f, indent=2, ensure_ascii=False)

	    print(f"[CoPA] Done. {len(batch_result.results)} samples in {batch_result.total_time:.1f}s")
	    print(f"[CoPA] Saved to {args.output}")


	# Run the CLI only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()