Spaces:

MAS-AI-0000
/

Authentica

Running

App Files Files Community

MAS-AI-0000 commited on Oct 18

Commit

13464bf

verified ·

1 Parent(s): 1da3a1c

Update textPreprocess.py

Browse files

Files changed (1) hide show

textPreprocess.py +123 -123

textPreprocess.py CHANGED Viewed

@@ -1,123 +1,123 @@
-import torch
-from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
-import os
-# ── 1) Configuration ────────────────────────────────────────────────────────────
-BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-MODEL_DIR = os.path.join(BASE_DIR, "Lib/Models/Text")  # Update this path to your model location
-MAX_LEN = 512
-# ── 2) Load model & tokenizer ──────────────────────────────────────────────────
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Text prediction device: {device}")
-# Global variables for model and tokenizer
-tokenizer = None
-model = None
-ID2LABEL = {0: "human", 1: "ai"}
-try:
-    # Config carries id2label/label2id if you saved them
-    config = AutoConfig.from_pretrained(MODEL_DIR)
-    # Loads tokenizer.json + special_tokens_map.json automatically
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
-    # Loads model.safetensors automatically (no extra flags needed)
-    model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, config=config)
-    model.eval().to(device)
-    # Update label mapping from config if available
-    ID2LABEL = model.config.id2label if getattr(model.config, "id2label", None) else {0: "human", 1: "ai"}
-    print(f"Text classification model loaded successfully")
-    print("Labels:", ID2LABEL)
-except Exception as e:
-    print(f"Error loading text model: {e}")
-    print("Text prediction will return fallback responses")
-# ── 3) Inference function ──────────────────────────────────────────────────────
-@torch.inference_mode()
-def predict_text(text: str, max_length: int = None):
-    """
-    Predict whether the given text is human-written or AI-generated.
-    Args:
-        text (str): The text to classify
-        max_length (int): Maximum sequence length for tokenization (defaults to MAX_LEN)
-    Returns:
-        dict: Contains predicted_class and confidence
-    """
-    if model is None or tokenizer is None:
-        return {"predicted_class": "Human", "confidence": 0}
-    if max_length is None:
-        max_length = MAX_LEN
-    try:
-        # Tokenize input
-        enc = tokenizer(
-            text,
-            return_tensors="pt",
-            truncation=True,
-            max_length=max_length,
-        )
-        enc = {k: v.to(device) for k, v in enc.items()}
-        # Get predictions
-        logits = model(**enc).logits
-        probs = torch.softmax(logits, dim=-1).squeeze(0).detach().cpu().numpy()
-        pred_id = int(probs.argmax(-1))
-        # Get label (capitalize first letter for consistency)
-        label = ID2LABEL.get(pred_id, str(pred_id))
-        label = label.capitalize()  # "human" -> "Human", "ai" -> "Ai"
-        return {
-            "predicted_class": label,
-            "confidence": float(probs[pred_id])
-        }
-    except Exception as e:
-        print(f"Error during text prediction: {e}")
-        return {"predicted_class": "Human", "confidence": 0}
-# ── 4) Batch prediction (optional, for future use) ─────────────────────────────
-@torch.inference_mode()
-def predict_batch(texts, batch_size=16):
-    """
-    Predict multiple texts in batches.
-    Args:
-        texts (list): List of text strings to classify
-        batch_size (int): Batch size for processing
-    Returns:
-        list: List of prediction dictionaries
-    """
-    if model is None or tokenizer is None:
-        return [{"predicted_class": "Human", "confidence": 0} for _ in texts]
-    results = []
-    for i in range(0, len(texts), batch_size):
-        chunk = texts[i:i+batch_size]
-        enc = tokenizer(
-            chunk,
-            return_tensors="pt",
-            truncation=True,
-            max_length=MAX_LEN,
-            padding=True,
-        )
-        enc = {k: v.to(device) for k, v in enc.items()}
-        logits = model(**enc).logits
-        probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()
-        ids = probs.argmax(-1)
-        for t, pid, p in zip(chunk, ids, probs):
-            label = ID2LABEL.get(int(pid), str(int(pid))).capitalize()
-            results.append({
-                "text": t,
-                "predicted_class": label,
-                "confidence": float(p[int(pid)])
-            })
-    return results

+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
+import os
+# ── 1) Configuration ────────────────────────────────────────────────────────────
+BASE_DIR = "MAS-AI-0000/Authentica"
+MODEL_DIR = os.path.join(BASE_DIR, "Lib/Models/Text")  # Update this path to your model location
+MAX_LEN = 512
+# ── 2) Load model & tokenizer ──────────────────────────────────────────────────
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Text prediction device: {device}")
+# Global variables for model and tokenizer
+tokenizer = None
+model = None
+ID2LABEL = {0: "human", 1: "ai"}
+try:
+    # Config carries id2label/label2id if you saved them
+    config = AutoConfig.from_pretrained(MODEL_DIR)
+    # Loads tokenizer.json + special_tokens_map.json automatically
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
+    # Loads model.safetensors automatically (no extra flags needed)
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, config=config)
+    model.eval().to(device)
+    # Update label mapping from config if available
+    ID2LABEL = model.config.id2label if getattr(model.config, "id2label", None) else {0: "human", 1: "ai"}
+    print(f"Text classification model loaded successfully")
+    print("Labels:", ID2LABEL)
+except Exception as e:
+    print(f"Error loading text model: {e}")
+    print("Text prediction will return fallback responses")
+# ── 3) Inference function ──────────────────────────────────────────────────────
+@torch.inference_mode()
+def predict_text(text: str, max_length: int = None):
+    """
+    Predict whether the given text is human-written or AI-generated.
+    Args:
+        text (str): The text to classify
+        max_length (int): Maximum sequence length for tokenization (defaults to MAX_LEN)
+    Returns:
+        dict: Contains predicted_class and confidence
+    """
+    if model is None or tokenizer is None:
+        return {"predicted_class": "Human", "confidence": 0}
+    if max_length is None:
+        max_length = MAX_LEN
+    try:
+        # Tokenize input
+        enc = tokenizer(
+            text,
+            return_tensors="pt",
+            truncation=True,
+            max_length=max_length,
+        )
+        enc = {k: v.to(device) for k, v in enc.items()}
+        # Get predictions
+        logits = model(**enc).logits
+        probs = torch.softmax(logits, dim=-1).squeeze(0).detach().cpu().numpy()
+        pred_id = int(probs.argmax(-1))
+        # Get label (capitalize first letter for consistency)
+        label = ID2LABEL.get(pred_id, str(pred_id))
+        label = label.capitalize()  # "human" -> "Human", "ai" -> "Ai"
+        return {
+            "predicted_class": label,
+            "confidence": float(probs[pred_id])
+        }
+    except Exception as e:
+        print(f"Error during text prediction: {e}")
+        return {"predicted_class": "Human", "confidence": 0}
+# ── 4) Batch prediction (optional, for future use) ─────────────────────────────
+@torch.inference_mode()
+def predict_batch(texts, batch_size=16):
+    """
+    Predict multiple texts in batches.
+    Args:
+        texts (list): List of text strings to classify
+        batch_size (int): Batch size for processing
+    Returns:
+        list: List of prediction dictionaries
+    """
+    if model is None or tokenizer is None:
+        return [{"predicted_class": "Human", "confidence": 0} for _ in texts]
+    results = []
+    for i in range(0, len(texts), batch_size):
+        chunk = texts[i:i+batch_size]
+        enc = tokenizer(
+            chunk,
+            return_tensors="pt",
+            truncation=True,
+            max_length=MAX_LEN,
+            padding=True,
+        )
+        enc = {k: v.to(device) for k, v in enc.items()}
+        logits = model(**enc).logits
+        probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()
+        ids = probs.argmax(-1)
+        for t, pid, p in zip(chunk, ids, probs):
+            label = ID2LABEL.get(int(pid), str(int(pid))).capitalize()
+            results.append({
+                "text": t,
+                "predicted_class": label,
+                "confidence": float(p[int(pid)])
+            })
+    return results