Spaces:
Running
Running
File size: 4,689 Bytes
13464bf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
import os
from typing import Optional

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
# ββ 1) Configuration ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
BASE_DIR = "MAS-AI-0000/Authentica"
MODEL_DIR = os.path.join(BASE_DIR, "Lib/Models/Text") # Update this path to your model location
MAX_LEN = 512
# ββ 2) Load model & tokenizer ββββββββββββββββββββββββββββββββββββββββββββββββββ
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Text prediction device: {device}")
# Global variables for model and tokenizer
tokenizer = None
model = None
ID2LABEL = {0: "human", 1: "ai"}
try:
# Config carries id2label/label2id if you saved them
config = AutoConfig.from_pretrained(MODEL_DIR)
# Loads tokenizer.json + special_tokens_map.json automatically
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
# Loads model.safetensors automatically (no extra flags needed)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, config=config)
model.eval().to(device)
# Update label mapping from config if available
ID2LABEL = model.config.id2label if getattr(model.config, "id2label", None) else {0: "human", 1: "ai"}
print(f"Text classification model loaded successfully")
print("Labels:", ID2LABEL)
except Exception as e:
print(f"Error loading text model: {e}")
print("Text prediction will return fallback responses")
# ββ 3) Inference function ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
@torch.inference_mode()
def predict_text(text: str, max_length: int = None):
"""
Predict whether the given text is human-written or AI-generated.
Args:
text (str): The text to classify
max_length (int): Maximum sequence length for tokenization (defaults to MAX_LEN)
Returns:
dict: Contains predicted_class and confidence
"""
if model is None or tokenizer is None:
return {"predicted_class": "Human", "confidence": 0}
if max_length is None:
max_length = MAX_LEN
try:
# Tokenize input
enc = tokenizer(
text,
return_tensors="pt",
truncation=True,
max_length=max_length,
)
enc = {k: v.to(device) for k, v in enc.items()}
# Get predictions
logits = model(**enc).logits
probs = torch.softmax(logits, dim=-1).squeeze(0).detach().cpu().numpy()
pred_id = int(probs.argmax(-1))
# Get label (capitalize first letter for consistency)
label = ID2LABEL.get(pred_id, str(pred_id))
label = label.capitalize() # "human" -> "Human", "ai" -> "Ai"
return {
"predicted_class": label,
"confidence": float(probs[pred_id])
}
except Exception as e:
print(f"Error during text prediction: {e}")
return {"predicted_class": "Human", "confidence": 0}
# ββ 4) Batch prediction (optional, for future use) βββββββββββββββββββββββββββββ
@torch.inference_mode()
def predict_batch(texts, batch_size=16):
"""
Predict multiple texts in batches.
Args:
texts (list): List of text strings to classify
batch_size (int): Batch size for processing
Returns:
list: List of prediction dictionaries
"""
if model is None or tokenizer is None:
return [{"predicted_class": "Human", "confidence": 0} for _ in texts]
results = []
for i in range(0, len(texts), batch_size):
chunk = texts[i:i+batch_size]
enc = tokenizer(
chunk,
return_tensors="pt",
truncation=True,
max_length=MAX_LEN,
padding=True,
)
enc = {k: v.to(device) for k, v in enc.items()}
logits = model(**enc).logits
probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()
ids = probs.argmax(-1)
for t, pid, p in zip(chunk, ids, probs):
label = ID2LABEL.get(int(pid), str(int(pid))).capitalize()
results.append({
"text": t,
"predicted_class": label,
"confidence": float(p[int(pid)])
})
return results
|