File size: 2,767 Bytes
662dfff
8e32a8a
97f71f2
 
fa70caf
97f71f2
 
 
662dfff
a05692f
 
 
 
 
47eba64
d89415b
47eba64
8af630e
a05692f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os

# 🔹 Ensure the Hugging Face cache is writable.
# IMPORTANT: HF_HOME must be exported *before* `transformers` is imported —
# the library resolves its cache directory at import time, so setting it
# afterwards (as the original code did) silently left the default cache in
# use. The env setup therefore precedes the transformers import below.
os.environ["HF_HOME"] = "/tmp/hf_cache"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.makedirs("/tmp/hf_cache", exist_ok=True)

import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.svm import SVC
import joblib
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List

# 🔹 Device setup: prefer GPU when available, otherwise run on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ๐Ÿ”น Load tokenizer & BERT model
try:
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    bert_model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
    bert_model.eval()
except Exception as e:
    raise RuntimeError(f"Failed to load BERT model: {e}")

# ๐Ÿ”น Load SVM models
MODEL_DIR = "models"
MODEL_FILES = ["Dean of students_svm.pkl", "Registra_svm.pkl"]
author_svms = {}

for file in MODEL_FILES:
    path = os.path.join(MODEL_DIR, file)
    if not os.path.exists(path):
        raise FileNotFoundError(f"Model file not found: {path}")
    author = file.replace("_svm.pkl", "")
    try:
        clf = joblib.load(path)
        author_svms[author] = clf
    except Exception as e:
        raise RuntimeError(f"Failed to load SVM model {file}: {e}")

print(f"โœ… Loaded {len(author_svms)} author models from {MODEL_DIR}")

# ๐Ÿ”น Text embedding function
def embed_text(text: str):
    enc = tokenizer(
        [text], return_tensors="pt", truncation=True, padding=True, max_length=256
    )
    enc = {k: v.to(device) for k, v in enc.items()}
    with torch.no_grad():
        outputs = bert_model(**enc)
    pooled = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # CLS token
    return pooled

# ๐Ÿ”น Prediction function
def predict_author(text: str):
    emb = embed_text(text)
    predictions = {}
    for author, clf in author_svms.items():
        try:
            predictions[author] = clf.predict(emb)[0]
        except Exception as e:
            predictions[author] = -1
            print(f"โš ๏ธ Prediction failed for {author}: {e}")

    accepted = [author for author, pred in predictions.items() if pred == 1]
    if len(accepted) == 1:
        return accepted[0]
    elif len(accepted) > 1:
        return accepted[0]  # pick first if multiple
    else:
        return "Unknown"

# ๐Ÿ”น FastAPI app
app = FastAPI(title="Document Verification API")

class TextInput(BaseModel):
    texts: List[str]

@app.post("/predict")
def predict(input_data: TextInput):
    """Classify each submitted text and report its predicted author.

    Returns {"results": [{"text": ..., "predicted_author": ...}, ...]} in
    the same order as the input texts.
    """
    return {
        "results": [
            {"text": txt, "predicted_author": predict_author(txt)}
            for txt in input_data.texts
        ]
    }

# Liveness endpoint: returns a static OK payload once the process is serving.
@app.get("/health")
def health_check():
    """Return a constant status payload for health probes."""
    return {"status": "ok"}