# NOTE(review): the three lines that preceded this file's docstring
# ("xanderabim's picture", "Upload folder using huggingface_hub", "343005c verified")
# were Hugging Face web-page scrape residue, not Python, and would raise a
# SyntaxError; preserved here as a comment for provenance.
"""
FraudFoxAI Inference API - ONNX Runtime (low memory)
Uses ONNX Runtime instead of PyTorch to stay under 512MB memory limit.
The ONNX model files must already exist in the HF model repo
(pushed from Colab after training via optimum export).
"""
import gc
import numpy as np
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSequenceClassification
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from scipy.special import softmax
# FastAPI application instance; routes are registered below via decorators.
app = FastAPI()
# HF Hub repo id holding both the tokenizer files and the pre-exported ONNX model.
MODEL_NAME = "xanderabim/fraudfoxai-phishing"
# Load tokenizer (lightweight, ~few MB)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load ONNX model directly (~80MB vs ~400MB for PyTorch)
# ONNX files are pre-exported and pushed from Colab - no torch needed here
# NOTE: from_pretrained will raise at import time if the repo lacks ONNX files.
model = ORTModelForSequenceClassification.from_pretrained(MODEL_NAME)
# Force garbage collection after loading to release transient load-time
# allocations and help stay under the 512MB memory limit noted above.
gc.collect()
class TextInput(BaseModel):
    """Request body for POST /: a single text string to classify.

    The field is named ``inputs`` to mirror the Hugging Face Inference API
    request shape.
    """
    inputs: str
@app.post("/")
async def predict(data: TextInput):
    """Classify ``data.inputs`` and return HF-pipeline-style label scores.

    Args:
        data: Request body carrying the text to classify in ``inputs``.

    Returns:
        A list containing one list of ``{"label": "LABEL_i", "score": float}``
        dicts — one entry per output logit, in index order — matching the
        Hugging Face Inference API response shape.

    Raises:
        HTTPException: 500 carrying the underlying error message on failure.
    """
    try:
        inputs = tokenizer(
            data.inputs,
            return_tensors="np",  # numpy tensors for ONNX Runtime
            truncation=True,
            max_length=512,
        )
        # DistilBERT doesn't use token_type_ids; drop them if present.
        inputs.pop("token_type_ids", None)
        outputs = model(**inputs)
        logits = outputs.logits
        # Some runtime versions hand back a framework tensor rather than a
        # plain ndarray; normalize to numpy before softmax.
        if hasattr(logits, "numpy"):
            logits = logits.numpy()
        probs = softmax(logits, axis=1)
        # Generalized over the model's number of classes instead of
        # hard-coding exactly two labels.
        return [
            [
                {"label": f"LABEL_{i}", "score": float(score)}
                for i, score in enumerate(probs[0])
            ]
        ]
    except Exception as e:
        # Boundary handler: surface any failure as a 500 with its message,
        # chaining the original exception for server-side tracebacks.
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.get("/")
async def health():
    """Health probe: report service status, model repo id, and runtime."""
    status_report = {
        "status": "healthy",
        "model": MODEL_NAME,
        "runtime": "onnxruntime",
    }
    return status_report