# enhanced-replica-model-pack / scripts / inference / run_zero_shot_detectors.py
# LUCIFerace's picture
# Add files using upload-large-folder tool
# 6b6f412 verified
"""
Combined 3B zero-shot detector: FastDetectGPT + Binoculars in one run.
Loads Qwen2.5-3B and Qwen2.5-3B-Instruct once, generates both score files.
"""
import os
import json
import argparse
import torch
import numpy as np
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM
from modelscope import snapshot_download
# Repository root: the nearest path at or above this file that contains a
# "src" entry. If no such ancestor exists, the search ends at the filesystem
# root (the path's anchor), which is then used as the root.
REPO_ROOT = Path(__file__).resolve()
REPO_ROOT = next(
    (
        candidate
        for candidate in (REPO_ROOT, *REPO_ROOT.parents)
        if (candidate / "src").exists()
    ),
    Path(REPO_ROOT.anchor),
)

# Input datasets are searched for under data/dataset; score CSVs are written
# under outputs/zero_shot.
DATASET_ROOT = REPO_ROOT.joinpath("data", "dataset")
OUTPUT_ROOT = REPO_ROOT.joinpath("outputs", "zero_shot")

# Tokenizer truncation length (in tokens) and number of texts per forward pass.
MAX_LENGTH = 512
BATCH_SIZE = 16

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
def resolve_dataset_path(dataset_name: str) -> Path:
    """Find the directory for *dataset_name* somewhere below ``DATASET_ROOT``.

    A match is a directory whose name matches ``dataset_name`` (the name is
    used as an ``rglob`` pattern) and that directly contains a ``test.jsonl``
    file. The first match produced by the recursive search is returned.

    Raises:
        FileNotFoundError: if no matching directory exists.
    """
    matches = (
        candidate
        for candidate in DATASET_ROOT.rglob(dataset_name)
        if candidate.is_dir() and (candidate / "test.jsonl").exists()
    )
    found = next(matches, None)
    if found is None:
        raise FileNotFoundError(f"Dataset {dataset_name} not found under {DATASET_ROOT}")
    return found
def resolve_ms_path(model_id):
    """Return whatever ModelScope's ``snapshot_download`` yields for *model_id*.

    The callers in this file pass the result straight to ``from_pretrained``,
    so it is presumably a local directory holding the downloaded snapshot
    (verify against the ModelScope documentation).
    """
    local_snapshot = snapshot_download(model_id)
    return local_snapshot
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of aborting the whole load.

    Args:
        path: location of the ``.jsonl`` file; opened as UTF-8 text.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        # ``line.strip()`` is empty (falsy) for blank lines, which json.loads
        # would otherwise reject with "Expecting value".
        return [json.loads(line) for line in f if line.strip()]
def batch_mean_logprob(model, tokenizer, texts, device, max_length=None):
    """Return the mean next-token log-probability of each text under *model*.

    The texts are tokenized as one padded, truncated batch; the model's
    logits at position ``t`` are scored against the token at ``t + 1``, and
    the per-token log-probabilities are averaged over the scored positions
    of each row.

    A position is scored only when both the target token and the token the
    prediction is conditioned on are real (attention mask 1). This makes the
    result independent of the tokenizer's padding side: with left padding,
    the first real token would otherwise be scored from a padding position.
    Excluded positions are zero-filled rather than multiplied by 0, so a
    non-finite value at a padding position cannot leak into the sum.

    Args:
        model: called as ``model(input_ids=..., attention_mask=...)``; the
            result must expose ``.logits`` indexed as (batch, position, vocab).
        tokenizer: called with the texts plus ``return_tensors="pt"``,
            ``truncation=True``, ``padding=True`` and ``max_length``; the
            result must provide ``"input_ids"`` and ``"attention_mask"``.
        texts: batch of strings to score.
        device: device the encoded tensors are moved to before the forward
            pass.
        max_length: truncation length in tokens; ``None`` (the default) uses
            the module-level ``MAX_LENGTH``.

    Returns:
        A 1-D float32 NumPy array with one mean log-probability per text.
        A row with no scorable position (fewer than two real tokens) yields
        0.0; an empty ``texts`` yields an empty array.
    """
    if len(texts) == 0:
        # Nothing to tokenize; return an empty result of the usual dtype.
        return torch.empty(0, dtype=torch.float32).numpy()
    if max_length is None:
        max_length = MAX_LENGTH

    enc = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Shift by one: logits at position t predict the token at position t + 1.
    logits = outputs.logits[:, :-1, :]
    targets = input_ids[:, 1:]
    log_probs = torch.log_softmax(logits, dim=-1)
    # Accumulate in float32 even when the model emits lower-precision logits.
    token_lls = log_probs.gather(2, targets.unsqueeze(-1)).squeeze(-1).float()

    # Score a position only if its target and its context token are both real.
    valid = (attention_mask[:, 1:] != 0) & (attention_mask[:, :-1] != 0)
    token_lls = token_lls.masked_fill(~valid, 0.0)

    # The epsilon keeps rows without any scorable position at 0.0 instead of
    # dividing by zero.
    mean_lls = token_lls.sum(dim=1) / (valid.float().sum(dim=1) + 1e-9)
    return mean_lls.cpu().numpy()
def main():
    """Score the dev/test splits of one dataset and write both score CSVs.

    Command line: ``--dataset NAME`` (required). The dataset directory is
    located with ``resolve_dataset_path`` and results are written to
    ``OUTPUT_ROOT / NAME``.

    For every split that exists and is non-empty, each batch of texts is
    passed once through the instruct model and once through the base model
    to obtain per-text mean log-probabilities. Both score columns are derived
    from that single pair of passes:

    * ``fdgpt_score``      = base - instruct
    * ``binoculars_score`` = exp(-instruct) / exp(-base)

    NOTE(review): with these formulas ``binoculars_score`` equals
    ``exp(fdgpt_score)``, so the two files rank samples identically. If the
    published Fast-DetectGPT / Binoculars statistics are intended, confirm
    these formulas against their reference implementations.

    Each record must provide ``"text"`` and ``"label"``; ``"id"`` falls back
    to the record's position in the file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", required=True)
    args = parser.parse_args()

    ds_path = resolve_dataset_path(args.dataset)
    out_dir = OUTPUT_ROOT / args.dataset
    out_dir.mkdir(parents=True, exist_ok=True)

    # Resolve local ModelScope paths
    path_3b = resolve_ms_path("qwen/Qwen2.5-3B")
    path_3b_inst = resolve_ms_path("qwen/Qwen2.5-3B-Instruct")
    print(f"3B base path: {path_3b}")
    print(f"3B instruct path: {path_3b_inst}")

    # Load tokenizers
    tok_3b = AutoTokenizer.from_pretrained(path_3b, trust_remote_code=True)
    tok_3b_inst = AutoTokenizer.from_pretrained(path_3b_inst, trust_remote_code=True)

    # Load models (load once, reuse for both detectors)
    print("Loading 3B models...")
    model_3b = AutoModelForCausalLM.from_pretrained(
        path_3b, trust_remote_code=True, torch_dtype=torch.bfloat16,
        device_map=DEVICE, attn_implementation="eager"
    )
    model_3b_inst = AutoModelForCausalLM.from_pretrained(
        path_3b_inst, trust_remote_code=True, torch_dtype=torch.bfloat16,
        device_map=DEVICE, attn_implementation="eager"
    )
    model_3b.eval()
    model_3b_inst.eval()
    print("Models loaded.")

    for split in ["dev", "test"]:
        jsonl_path = ds_path / f"{split}.jsonl"
        if not jsonl_path.exists():
            continue
        records = load_jsonl(jsonl_path)
        if not records:
            continue
        print(f"\n[{split}] {len(records)} samples")

        # Build the per-record columns once, before any model work, so a
        # malformed record fails fast instead of after the forward passes.
        texts = [r["text"] for r in records]
        labels = [int(r["label"]) for r in records]
        ids = [r.get("id", i) for i, r in enumerate(records)]

        # --- FastDetectGPT + Binoculars ---
        # Both scores depend only on the two per-text mean log-probabilities,
        # so each model runs once per batch and the results are shared.
        fd_scores = []
        bino_scores = []
        for i in range(0, len(texts), BATCH_SIZE):
            batch = texts[i:i+BATCH_SIZE]
            inst_lls = batch_mean_logprob(model_3b_inst, tok_3b_inst, batch, DEVICE)
            base_lls = batch_mean_logprob(model_3b, tok_3b, batch, DEVICE)
            fd_scores.extend((base_lls - inst_lls).tolist())
            bino_scores.extend((np.exp(-inst_lls) / np.exp(-base_lls)).tolist())
            # Progress every 10 batches; covers both detectors since they
            # share this loop.
            if (i // BATCH_SIZE + 1) % 10 == 0:
                print(f" FD {min(i+BATCH_SIZE, len(texts))}/{len(texts)}")

        df_fd = pd.DataFrame({
            "text": texts,
            "label": labels,
            "id": ids,
            "fdgpt_score": fd_scores,
        })
        df_fd.to_csv(out_dir / f"{split}_fdgpt_scores.csv", index=False, encoding="utf-8")
        print(f" Saved FD -> {out_dir}/{split}_fdgpt_scores.csv")

        df_bino = pd.DataFrame({
            "text": texts,
            "label": labels,
            "id": ids,
            "binoculars_score": bino_scores,
        })
        df_bino.to_csv(out_dir / f"{split}_bino_scores.csv", index=False, encoding="utf-8")
        print(f" Saved Bino -> {out_dir}/{split}_bino_scores.csv")

    print("\nAll done.")
# Run the scoring pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()