import os
from typing import Optional

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

# ── 1) Configuration ────────────────────────────────────────────────────────────
BASE_DIR = "MAS-AI-0000/Authentica"
MODEL_DIR = os.path.join(BASE_DIR, "Lib/Models/Text")  # Update this path to your model location
MAX_LEN = 512

# ── 2) Load model & tokenizer ──────────────────────────────────────────────────
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Text prediction device: {device}")

# Global variables for model and tokenizer
tokenizer = None
model = None
ID2LABEL = {0: "human", 1: "ai"}

try:
    # Config carries id2label/label2id if you saved them
    config = AutoConfig.from_pretrained(MODEL_DIR)
    
    # Loads tokenizer.json + special_tokens_map.json automatically
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
    
    # Loads model.safetensors automatically (no extra flags needed)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, config=config)
    model.eval().to(device)
    
    # Update the label mapping from the config if it carries one
    if getattr(model.config, "id2label", None):
        ID2LABEL = model.config.id2label
    
    print(f"Text classification model loaded successfully")
    print("Labels:", ID2LABEL)
except Exception as e:
    print(f"Error loading text model: {e}")
    print("Text prediction will return fallback responses")

# ── 3) Inference function ──────────────────────────────────────────────────────
@torch.inference_mode()
def predict_text(text: str, max_length: Optional[int] = None):
    """
    Predict whether the given text is human-written or AI-generated.
    
    Args:
        text (str): The text to classify
        max_length (int, optional): Maximum sequence length for tokenization (defaults to MAX_LEN)
        
    Returns:
        dict: Contains predicted_class and confidence
    """
    if model is None or tokenizer is None:
        return {"predicted_class": "Human", "confidence": 0}
    
    if max_length is None:
        max_length = MAX_LEN
    
    try:
        # Tokenize input
        enc = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
        )
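        # Move input tensors to the same device as the model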
        enc = {k: v.to(device) for k, v in enc.items()}
        
        # Get predictions
        logits = model(**enc).logits
        probs = torch.softmax(logits, dim=-1).squeeze(0).detach().cpu().numpy()
        pred_id = int(probs.argmax(-1))
        
        # Get label and normalize casing ("human" -> "Human", "ai" -> "AI")
        label = ID2LABEL.get(pred_id, str(pred_id))
        label = "AI" if label.lower() == "ai" else label.capitalize()
        
        return {
            "predicted_class": label,
            "confidence": float(probs[pred_id])
        }
    except Exception as e:
        print(f"Error during text prediction: {e}")
        return {"predicted_class": "Human", "confidence": 0}

# ── 4) Batch prediction (optional, for future use) ─────────────────────────────
@torch.inference_mode()
def predict_batch(texts, batch_size=16):
    """
    Predict multiple texts in batches.
    
    Args:
        texts (list): List of text strings to classify
        batch_size (int): Batch size for processing
        
    Returns:
        list: List of prediction dictionaries
    """
    if model is None or tokenizer is None:
        return [{"predicted_class": "Human", "confidence": 0} for _ in texts]
    
    results = []
    for i in range(0, len(texts), batch_size):
        chunk = texts[i:i+batch_size]
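        # Unlike the single-text path, batched inputs are padded to a common length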
        enc = tokenizer(
            chunk,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LEN,
            padding=True,
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        logits = model(**enc).logits
        probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()
        ids = probs.argmax(-1)
        
        for t, pid, p in zip(chunk, ids, probs):
            raw = ID2LABEL.get(int(pid), str(int(pid)))
            label = "AI" if raw.lower() == "ai" else raw.capitalize()
            results.append({
                "text": t,
                "predicted_class": label,
                "confidence": float(p[int(pid)])
            })
    return results
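
# ── 5) Smoke test ──────────────────────────────────────────────────────────────
# A minimal sketch for exercising both entry points when this module is run
# directly; the sample strings below are illustrative placeholders.
if __name__ == "__main__":
    samples = [
        "I walked to the corner shop this morning and got caught in the rain.",
        "As an AI language model, I can generate coherent text on many topics.",
    ]
    # Single-text prediction
    print(predict_text(samples[0]))
    # Batched prediction
    for result in predict_batch(samples, batch_size=2):
        print(result)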