import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from huggingface_hub import snapshot_download  # <-- needed to pull the folder

# ── 1) PATHS / VARS ────────────────────────────────────────────────────────────
REPO_ID = "MAS-AI-0000/Authentica"
TEXT_SUBFOLDER = "Lib/Models/Text"   # where config.json/model.safetensors live in the repo

# download a local snapshot of just the Text folder and point MODEL_DIR at it
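# (snapshot_download caches under the Hugging Face hub cache by default, so repeat runs reuse the local copy)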
_snapshot_dir = snapshot_download(
    repo_id=REPO_ID,
    allow_patterns=[f"{TEXT_SUBFOLDER}/*"]
)
MODEL_DIR = os.path.join(_snapshot_dir, TEXT_SUBFOLDER)

# individual file paths (in case you need them elsewhere)
CONFIG_PATH               = os.path.join(MODEL_DIR, "config.json")
MODEL_SAFETENSORS_PATH    = os.path.join(MODEL_DIR, "model.safetensors")
TOKENIZER_JSON_PATH       = os.path.join(MODEL_DIR, "tokenizer.json")
TOKENIZER_CONFIG_PATH     = os.path.join(MODEL_DIR, "tokenizer_config.json")
SPECIAL_TOKENS_MAP_PATH   = os.path.join(MODEL_DIR, "special_tokens_map.json")
TRAINING_ARGS_BIN_PATH    = os.path.join(MODEL_DIR, "training_args.bin")  # optional
TEXT_TXT_PATH             = os.path.join(MODEL_DIR, "text.txt")           # optional

MAX_LEN = 512

# ── 2) Load model & tokenizer ──────────────────────────────────────────────────
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Text prediction device: {device}")

tokenizer = None
model = None
ID2LABEL = {0: "human", 1: "ai"}  # default mapping; overridden by the model config below when available

try:
    # load directly from the local MODEL_DIR
    config = AutoConfig.from_pretrained(MODEL_DIR)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, config=config)
    model.eval().to(device)

    # override labels from config if present
    if getattr(model.config, "id2label", None):
        ID2LABEL = {int(k): v for k, v in model.config.id2label.items()}

    print("Text classification model loaded successfully")
    print("MODEL_DIR:", MODEL_DIR)
    print("Labels:", ID2LABEL)
except Exception as e:
    print(f"Error loading text model: {e}")
    print("Text prediction will return fallback responses")

# ── 3) Inference ───────────────────────────────────────────────────────────────
@torch.inference_mode()
def predict_text(text: str, max_length: int | None = None):
    if model is None or tokenizer is None:
        print("Issue 1")
        return {"predicted_class": "Human", "confidence": -100.0}

    if max_length is None:
        max_length = MAX_LEN

    try:
        enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
        enc = {k: v.to(device) for k, v in enc.items()}
        logits = model(**enc).logits
        probs = torch.softmax(logits, dim=-1).squeeze(0).detach().cpu().numpy()
        pred_id = int(probs.argmax(-1))
        label = ID2LABEL.get(pred_id, str(pred_id)).capitalize()
        return {"predicted_class": label, "confidence": float(probs[pred_id])}
    except Exception as e:
        print(f"Error during text prediction: {e}")
        return {"predicted_class": "Human", "confidence": -100.0}

# ── 4) Batch (optional) ────────────────────────────────────────────────────────
@torch.inference_mode()
def predict_batch(texts, batch_size=16):
    if model is None or tokenizer is None:
        print("Issue 2")
        return [{"predicted_class": "Human", "confidence": -100.0} for _ in texts]

    results = []
    for i in range(0, len(texts), batch_size):
        chunk = texts[i:i+batch_size]
        enc = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=MAX_LEN, padding=True)
        enc = {k: v.to(device) for k, v in enc.items()}
        probs = torch.softmax(model(**enc).logits, dim=-1).detach().cpu().numpy()
        ids = probs.argmax(-1)
        for t, pid, p in zip(chunk, ids, probs):
            label = ID2LABEL.get(int(pid), str(int(pid))).capitalize()
            results.append({"text": t, "predicted_class": label, "confidence": float(p[int(pid)])})
    return results
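
# ── 5) Example usage ───────────────────────────────────────────────────────────
# Minimal illustrative sketch of calling the two entry points above; the sample
# strings are placeholders, not part of the repo.
if __name__ == "__main__":
    print(predict_text("The quick brown fox jumps over the lazy dog."))

    samples = [
        "Large language models can generate remarkably fluent text.",
        "I scribbled this note by hand on the train this morning.",
    ]
    for result in predict_batch(samples, batch_size=2):
        print(result)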