Text Classification
Transformers
Safetensors
Chinese
chinese
ai-text-detection
ensemble
bert
roberta
qwen
lora
research
dataset
Instructions to use LUCIFerace/enhanced-replica-model-pack with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use LUCIFerace/enhanced-replica-model-pack with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="LUCIFerace/enhanced-replica-model-pack")

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("LUCIFerace/enhanced-replica-model-pack", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
| """ | |
| Combined 3B zero-shot detector: FastDetectGPT + Binoculars in one run. | |
| Loads Qwen2.5-3B and Qwen2.5-3B-Instruct once, generates both score files. | |
| """ | |
| import os | |
| import json | |
| import argparse | |
| import torch | |
| import numpy as np | |
| import pandas as pd | |
| from pathlib import Path | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| from modelscope import snapshot_download | |
# Locate the repository root: starting from this file's resolved path, take the
# first path (the file itself, then each ancestor) that contains a "src" entry.
# If none does, fall back to the filesystem root.
_SCRIPT_PATH = Path(__file__).resolve()
REPO_ROOT = Path(_SCRIPT_PATH.anchor)
for _candidate in (_SCRIPT_PATH, *_SCRIPT_PATH.parents):
    if (_candidate / "src").exists():
        REPO_ROOT = _candidate
        break

# Input datasets are searched under here; score files are written under OUTPUT_ROOT.
DATASET_ROOT = REPO_ROOT / "data" / "dataset"
OUTPUT_ROOT = REPO_ROOT / "outputs" / "zero_shot"

# Tokenizer truncation length and number of texts scored per forward pass.
MAX_LENGTH = 512
BATCH_SIZE = 16

# Prefer the GPU when PyTorch can see one.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
def resolve_dataset_path(dataset_name: str) -> Path:
    """Find the directory for ``dataset_name`` somewhere below ``DATASET_ROOT``.

    A match must be a directory whose name matches ``dataset_name`` (as a
    glob pattern passed to ``Path.rglob``) and that contains a ``test.jsonl``
    file. The first such match yielded by ``rglob`` is returned.

    Raises:
        FileNotFoundError: if no matching directory exists.
    """
    usable_dirs = (
        candidate
        for candidate in DATASET_ROOT.rglob(dataset_name)
        if candidate.is_dir() and (candidate / "test.jsonl").exists()
    )
    found = next(usable_dirs, None)
    if found is None:
        raise FileNotFoundError(f"Dataset {dataset_name} not found under {DATASET_ROOT}")
    return found
def resolve_ms_path(model_id):
    """Resolve a ModelScope model id via ``snapshot_download``.

    The result is returned unchanged; ``main`` hands it to ``from_pretrained``,
    so it is presumably a local snapshot directory path (confirm against the
    ModelScope documentation).
    """
    local_snapshot = snapshot_download(model_id)
    return local_snapshot
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line is decoded with ``json.loads``. Blank or
    whitespace-only lines (for example a stray empty line at the end of the
    file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        path: Path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        stripped_lines = (raw.strip() for raw in f)
        return [json.loads(line) for line in stripped_lines if line]
def batch_mean_logprob(model, tokenizer, texts, device):
    """Return the mean next-token log-probability of each text under ``model``.

    The texts are tokenized together (padded, truncated to ``MAX_LENGTH``),
    run through the model once without gradients, and for every sequence the
    log-probabilities assigned to tokens 1..end are averaged over the
    positions marked valid by the attention mask.

    Args:
        model: Callable accepting ``input_ids`` / ``attention_mask`` and
            returning an object with a ``logits`` attribute.
        tokenizer: Callable producing ``input_ids`` and ``attention_mask``
            tensors for a list of texts.
        texts: The batch of strings to score.
        device: Device the encoded tensors are moved to.

    Returns:
        A NumPy array with one mean log-probability per input text.
    """
    encoded = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
    )
    ids = encoded["input_ids"].to(device)
    attn = encoded["attention_mask"].to(device)

    with torch.no_grad():
        forward = model(input_ids=ids, attention_mask=attn)

    # The logits at position t score the token at position t + 1, so drop the
    # final logit step and the first token to line predictions up with targets.
    shifted_logits = forward.logits[:, :-1, :]
    next_tokens = ids[:, 1:]
    valid = attn[:, 1:].float()

    per_token_ll = (
        torch.log_softmax(shifted_logits, dim=-1)
        .gather(2, next_tokens.unsqueeze(-1))
        .squeeze(-1)
    )

    # Padding contributes nothing to the sum; the epsilon avoids 0/0 when a
    # sequence has no valid target positions.
    summed = (per_token_ll * valid).sum(dim=1)
    counts = valid.sum(dim=1) + 1e-9
    return (summed / counts).cpu().numpy()
def _load_causal_lm(model_path):
    """Load a causal LM from ``model_path`` in bfloat16 on DEVICE, in eval mode."""
    model = AutoModelForCausalLM.from_pretrained(
        model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,
        device_map=DEVICE, attn_implementation="eager"
    )
    model.eval()
    return model


def _save_scores(records, texts, score_column, scores, csv_path):
    """Write text, label, id and one score column for every record to ``csv_path``."""
    frame = pd.DataFrame({
        "text": texts,
        "label": [int(r["label"]) for r in records],
        # Fall back to the record's position when it carries no explicit id.
        "id": [r.get("id", i) for i, r in enumerate(records)],
        score_column: scores,
    })
    frame.to_csv(csv_path, index=False, encoding="utf-8")


def main():
    """Score the dev/test splits of a dataset with both zero-shot detectors.

    Command line: ``--dataset NAME`` (required). The dataset directory is
    located with ``resolve_dataset_path`` and results are written under
    ``OUTPUT_ROOT / NAME``.

    For every split whose ``.jsonl`` file exists and is non-empty, each batch
    of texts is scored once by the base model and once by the instruct model
    with ``batch_mean_logprob``. Both output scores are derived from that
    single pair of forward passes (previously each detector re-ran both
    models on every batch):

    * ``fdgpt_score``      = base_ll - instruct_ll
    * ``binoculars_score`` = exp(-instruct_ll) / exp(-base_ll)

    Two CSV files per split are produced: ``{split}_fdgpt_scores.csv`` and
    ``{split}_bino_scores.csv``, each with columns text, label, id and the
    score.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", required=True)
    args = parser.parse_args()

    ds_path = resolve_dataset_path(args.dataset)
    out_dir = OUTPUT_ROOT / args.dataset
    out_dir.mkdir(parents=True, exist_ok=True)

    # Resolve local ModelScope paths
    path_3b = resolve_ms_path("qwen/Qwen2.5-3B")
    path_3b_inst = resolve_ms_path("qwen/Qwen2.5-3B-Instruct")
    print(f"3B base path: {path_3b}")
    print(f"3B instruct path: {path_3b_inst}")

    # Load tokenizers
    tok_3b = AutoTokenizer.from_pretrained(path_3b, trust_remote_code=True)
    tok_3b_inst = AutoTokenizer.from_pretrained(path_3b_inst, trust_remote_code=True)

    # Load models (load once, reuse for both detectors)
    print("Loading 3B models...")
    model_3b = _load_causal_lm(path_3b)
    model_3b_inst = _load_causal_lm(path_3b_inst)
    print("Models loaded.")

    for split in ["dev", "test"]:
        jsonl_path = ds_path / f"{split}.jsonl"
        if not jsonl_path.exists():
            continue
        records = load_jsonl(jsonl_path)
        if not records:
            continue
        print(f"\n[{split}] {len(records)} samples")
        texts = [r["text"] for r in records]

        # One pass over the data: both detectors need exactly the same two
        # per-text log-likelihoods, so compute them once per batch.
        fd_scores = []
        bino_scores = []
        for i in range(0, len(texts), BATCH_SIZE):
            batch = texts[i:i+BATCH_SIZE]
            inst_lls = batch_mean_logprob(model_3b_inst, tok_3b_inst, batch, DEVICE)
            base_lls = batch_mean_logprob(model_3b, tok_3b, batch, DEVICE)
            # --- FastDetectGPT ---
            fd_scores.extend((base_lls - inst_lls).tolist())
            # --- Binoculars ---
            bino_scores.extend((np.exp(-inst_lls) / np.exp(-base_lls)).tolist())
            if (i // BATCH_SIZE + 1) % 10 == 0:
                print(f" FD+Bino {min(i+BATCH_SIZE, len(texts))}/{len(texts)}")

        _save_scores(records, texts, "fdgpt_score", fd_scores,
                     out_dir / f"{split}_fdgpt_scores.csv")
        print(f" Saved FD -> {out_dir}/{split}_fdgpt_scores.csv")

        _save_scores(records, texts, "binoculars_score", bino_scores,
                     out_dir / f"{split}_bino_scores.csv")
        print(f" Saved Bino -> {out_dir}/{split}_bino_scores.csv")

    print("\nAll done.")
# Run the scoring pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()