Spaces:
Runtime error
Update app.py
app.py CHANGED
@@ -1,54 +1,78 @@
-from flask import Flask, request, jsonify
-from transformers import AutoTokenizer, AutoModel
 import torch
+from transformers import AutoTokenizer, AutoModel
+from sklearn.svm import SVC
 import joblib
+import requests
+import os
+from fastapi import FastAPI
+from pydantic import BaseModel
+from typing import List
+import tempfile
 
-# 🔹 Load
-tokenizer = AutoTokenizer.from_pretrained(
-bert_model = AutoModel.from_pretrained(
-# 🔹
+# 🔹 Hugging Face repo info
+HF_REPO = "ST-THOMAS-OF-AQUINAS/Document_verification"
+MODEL_FILES = ["author1_svm.pkl", "author2_svm.pkl"]  # replace with actual filenames
+
+# 🔹 Load tokenizer & BERT model
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+bert_model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
+bert_model.eval()
+
+# 🔹 Function to download file from HF Hub
+def download_file(repo, filename):
+    url = f"https://huggingface.co/{repo}/resolve/main/{filename}"
+    response = requests.get(url)
+    tmp_path = os.path.join(tempfile.gettempdir(), filename)
+    with open(tmp_path, "wb") as f:
+        f.write(response.content)
+    return tmp_path
+
+# 🔹 Load SVM models dynamically from Hub
+author_svms = {}
+for file in MODEL_FILES:
+    author = file.replace("_svm.pkl", "")
+    path = download_file(HF_REPO, file)
+    clf = joblib.load(path)
+    author_svms[author] = clf
+print(f"✅ Loaded {len(author_svms)} author models")
+
+# 🔹 Text embedding
+def embed_text(text):
     enc = tokenizer([text], return_tensors="pt", truncation=True, padding=True, max_length=256)
     enc = {k: v.to(device) for k, v in enc.items()}
     with torch.no_grad():
         outputs = bert_model(**enc)
-accepted =
-if
-return
+    pooled = outputs.last_hidden_state[:, 0, :].cpu().numpy()
+    return pooled
+
+# 🔹 Prediction function
+def predict_author(text):
+    emb = embed_text(text)
+    predictions = {author: clf.predict(emb)[0] for author, clf in author_svms.items()}
+
+    accepted = [author for author, pred in predictions.items() if pred == 1]
+    if len(accepted) == 1:
+        return accepted[0]
+    elif len(accepted) > 1:
+        return accepted[0]
+    else:
+        return "Unknown"
+
+# 🔹 FastAPI app
+app = FastAPI(title="Document Verification API")
+
+class TextInput(BaseModel):
+    texts: List[str]
+
+@app.post("/predict")
+def predict(input_data: TextInput):
+    results = []
+    for txt in input_data.texts:
+        author = predict_author(txt)
+        results.append({"text": txt, "predicted_author": author})
+    return {"results": results}
+
+@app.get("/health")
+def health_check():
+    return {"status": "ok"}
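
Once the Space builds, the two routes added in this commit can be exercised with a short client script. The sketch below is an assumption for quick testing, not part of the commit: the base URL and sample text are placeholders, and port 7860 is just the usual Spaces default.

import requests

# Hypothetical base URL; substitute the actual Space endpoint.
BASE_URL = "http://localhost:7860"

# The /health route defined above returns {"status": "ok"} once the models load.
print(requests.get(f"{BASE_URL}/health").json())

# /predict expects a JSON body matching the TextInput schema: {"texts": [...]}.
payload = {"texts": ["Sample paragraph whose authorship should be verified."]}
resp = requests.post(f"{BASE_URL}/predict", json=payload)

# Each result pairs the input text with a predicted author, or "Unknown"
# when none of the per-author SVMs accepts the embedding.
for item in resp.json()["results"]:
    print(item["predicted_author"], "<-", item["text"][:40])

Note that app.py never starts a server itself; on Spaces a FastAPI app is typically launched externally, e.g. uvicorn app:app --host 0.0.0.0 --port 7860.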