Spaces:

ST-THOMAS-OF-AQUINAS
/

document_Authenification

Runtime error

App Files Community

ST-THOMAS-OF-AQUINAS committed on Sep 29, 2025

Commit

fa70caf

verified ·

1 Parent(s): 8e32a8a

Create app.py

Browse files

Files changed (1) hide show

app.py +42 -46

app.py CHANGED Viewed

@@ -1,58 +1,54 @@
-import os
-import joblib
 import torch
-from fastapi import FastAPI
-from pydantic import BaseModel
-from transformers import DistilBertTokenizerFast, DistilBertModel
-# 🔹 Config
-MODEL_DIR = "svm_models"
-# 🔹 FastAPI app
-app = FastAPI(title="Author Identification API")
-# 🔹 Load tokenizer & BERT
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
-bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)
-bert_model.eval()
-# 🔹 Load SVM models
-author_svms = {}
-for file in os.listdir(MODEL_DIR):
-    if file.endswith("_svm.pkl"):
-        author = file.replace("_svm.pkl", "")
-        clf = joblib.load(os.path.join(MODEL_DIR, file))
-        author_svms[author] = clf
-print(f"✅ Loaded {len(author_svms)} author models")
-# 🔹 Embedding function
-def embed_text(text):
     enc = tokenizer([text], return_tensors="pt", truncation=True, padding=True, max_length=256)
     enc = {k: v.to(device) for k, v in enc.items()}
     with torch.no_grad():
         outputs = bert_model(**enc)
-    pooled = outputs.last_hidden_state[:, 0, :].cpu().numpy()
-    return pooled
-# 🔹 Request schema
-class InputText(BaseModel):
-    text: str
-# 🔹 API route
-@app.post("/predict")
-def predict_author(input: InputText):
-    text = input.text
-    emb = embed_text(text)
-    scores = {}
     for author, clf in author_svms.items():
         pred = clf.predict(emb)[0]
         score = clf.decision_function(emb)[0]
-        scores[author] = float(score) if pred == 1 else -9999
-    if all(s == -9999 for s in scores.values()):
-        return {"author": "Unknown", "score": None}
-    best_author = max(scores, key=scores.get)
-    return {"author": best_author, "score": round(scores[best_author], 4)}

+from flask import Flask, request, jsonify
+from transformers import AutoTokenizer, AutoModel
 import torch
+import joblib
+app = Flask(__name__)
+# 🔹 Load model + tokenizer from Hugging Face Hub
+MODEL_NAME = "ST-THOMAS-OF-AQUINAS/Document_verification"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+bert_model = AutoModel.from_pretrained(MODEL_NAME)
+# 🔹 Load saved SVM classifiers (from your training step)
+author_svms = joblib.load("author_svms.pkl")  # saved dict of {author: svm_model}
+label_map = joblib.load("label_map.pkl")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+bert_model.to(device)
+def predict_author(text):
+    bert_model.eval()
     enc = tokenizer([text], return_tensors="pt", truncation=True, padding=True, max_length=256)
     enc = {k: v.to(device) for k, v in enc.items()}
     with torch.no_grad():
         outputs = bert_model(**enc)
+    emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
+    predictions = {}
     for author, clf in author_svms.items():
         pred = clf.predict(emb)[0]
         score = clf.decision_function(emb)[0]
+        predictions[author] = (pred, score)
+    accepted = {a: s for a, (p, s) in predictions.items() if p == 1}
+    if not accepted:
+        return "Unknown", None
+    best_author = max(accepted, key=accepted.get)
+    return best_author, accepted[best_author]
+@app.route("/predict", methods=["POST"])
+def predict():
+    data = request.json
+    text = data.get("text", "")
+    if not text:
+        return jsonify({"error": "No text provided"}), 400
+    author, score = predict_author(text)
+    return jsonify({
+        "author": author,
+        "score": score if score is not None else 0
+    })
+if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=5000)