# =======================
# IMPORTS
# =======================
import joblib
import re
from urllib.parse import urlparse
import tldextract
from PyPDF2 import PdfReader

# =======================
# LOAD MODEL
# =======================
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code; only load model artifacts from a trusted source.
# The path is relative, so it is resolved against the current working directory.
model = joblib.load("test_model.joblib")

# Compiled once at import time instead of on every call.
# Matches any dotted quad of digit runs (e.g. "1.2.3.4"); octet ranges are not
# validated.
_IPV4_LIKE = re.compile(r"\d+\.\d+\.\d+\.\d+")

# Maximum number of PDF characters passed on (limit for cloud).
PDF_TEXT_LIMIT = 500


# =======================
# URL FEATURES
# =======================
def extract_url_features(url):
    """Compute simple lexical features for a URL string.

    Args:
        url: The URL to analyse.

    Returns:
        A dict with the keys:
            "url_length": number of characters in ``url``.
            "num_dots": number of "." characters in ``url``.
            "has_ip": True if a dotted-quad digit pattern occurs anywhere in
                ``url`` (the search is not restricted to the host part).
            "https": True if the parsed scheme is exactly "https".
            "domain_length": length of the domain label reported by
                ``tldextract``.
    """
    parsed = urlparse(url)
    ext = tldextract.extract(url)

    return {
        "url_length": len(url),
        "num_dots": url.count("."),
        "has_ip": bool(_IPV4_LIKE.search(url)),
        "https": parsed.scheme == "https",
        "domain_length": len(ext.domain),
    }


# =======================
# PDF TEXT EXTRACTION
# =======================
def extract_pdf_text(pdf_path, max_chars=PDF_TEXT_LIMIT):
    """Return the leading text of a PDF, truncated to ``max_chars`` characters.

    Pages are read in order and extraction stops as soon as enough characters
    have been collected, so long documents are not fully processed just to be
    truncated afterwards.

    Args:
        pdf_path: Path (or other source accepted by ``PdfReader``) of the PDF.
        max_chars: Maximum number of characters to return. Defaults to
            ``PDF_TEXT_LIMIT`` (500).

    Returns:
        The concatenated page text, at most ``max_chars`` characters long.
        Pages for which ``extract_text()`` returns a falsy value contribute
        an empty string.
    """
    reader = PdfReader(pdf_path)

    parts = []
    collected = 0
    for page in reader.pages:
        if collected >= max_chars:
            break  # later pages could not affect the truncated result
        page_text = page.extract_text() or ""
        parts.append(page_text)
        collected += len(page_text)

    return "".join(parts)[:max_chars]


# =======================
# PREDICTION FUNCTION
# =======================
def predict(data):
    """Run the loaded model on the request text plus optional PDF text.

    Expects JSON-like input:
        {"inputs": {"text": "...", "url": "...", "pdf_path": "..."}}
    All three fields are optional; a missing or null field is treated as an
    empty string.

    Args:
        data: Mapping with an "inputs" mapping as shown above.

    Returns:
        A dict with the keys:
            "prediction": the model's predicted label as an int.
            "probability": ``predict_proba`` value at class index 1 as a
                float (presumably the positive class - confirm against the
                trained model).
            "url_features": output of ``extract_url_features`` or ``{}`` when
                no URL was given. These features are only reported back; they
                are not fed to the model here.
            "pdf_text_sample": first 100 characters of the extracted PDF text.

    Raises:
        KeyError: If ``data`` has no "inputs" key.
    """
    inputs = data["inputs"]

    # `or ""` also covers explicit nulls, which would otherwise break the
    # string concatenation below.
    text = inputs.get("text") or ""
    url = inputs.get("url") or ""
    pdf_path = inputs.get("pdf_path") or ""

    # URL features
    url_features = extract_url_features(url) if url else {}

    # PDF text (optional)
    # NOTE(security): pdf_path is taken from the request and opened as a
    # filesystem path. If requests can come from untrusted clients, restrict
    # it to an allowed directory before exposing this endpoint.
    pdf_text = extract_pdf_text(pdf_path) if pdf_path else ""

    # Combine text + PDF text
    combined_text = text + " " + pdf_text

    # ML prediction
    # The model is given a one-element list holding the raw string.
    samples = [combined_text]
    pred = model.predict(samples)[0]
    prob = model.predict_proba(samples)[0][1]

    return {
        "prediction": int(pred),
        "probability": float(prob),
        "url_features": url_features,
        "pdf_text_sample": pdf_text[:100],  # sample only
    }