Spaces:

Perth0603
/

Random-Forest-Model-for-PhishingDetection

Sleeping

App Files Community

Perth0603 committed on Nov 10, 2025

Commit

ee00f05

verified ·

1 Parent(s): 2c14a34

Update app.py

Browse files

Files changed (1) hide show

app.py +275 -117

app.py CHANGED Viewed

@@ -19,6 +19,26 @@ try:
 except Exception:
     xgb = None
 # Environment defaults suitable for HF Spaces
 os.environ.setdefault("HOME", "/data")
@@ -47,14 +67,42 @@ AUTOCALIB_PHISHY_CSV = os.environ.get("AUTOCALIB_PHISHY_CSV", os.path.join(BASE_
 AUTOCALIB_LEGIT_CSV = os.environ.get("AUTOCALIB_LEGIT_CSV", os.path.join(BASE_DIR, "autocalib_legit.csv"))
 KNOWN_HOSTS_CSV = os.environ.get("KNOWN_HOSTS_CSV", os.path.join(BASE_DIR, "known_hosts.csv"))
-app = FastAPI(title="PhishWatch URL API", version="2.0.0")
 class PredictUrlPayload(BaseModel):
     url: str
 _url_bundle: Optional[Dict[str, Any]] = None
 _url_lock = threading.Lock()
@@ -247,38 +295,22 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
     out["max_brand_sim"] = hosts.apply(_max_brand_similarity)
     out["like_facebook"] = hosts.apply(lambda h: _like_brand(h, "facebook"))
-    # Lookalike/homoglyph detection: unusual Unicode symbols that resemble ASCII letters
-    # Examples: Cyrillic а (U+0430) looks like 'a', Greek α (U+03B1) looks like 'a', etc.
     def _detect_lookalike_chars(url: str) -> int:
-        """
-        Detects if URL contains Unicode characters that visually resemble ASCII letters.
-        Common lookalikes used in phishing:
-        - Cyrillic: а, е, о, р, с, х, у, ч, ы, ь (look like a,e,o,p,c,x,y,4,b,b)
-        - Greek: α, ο (look like a, o)
-        - Latin Extended: ɑ, ɢ, ᴅ, ɡ, ɪ, ɴ, ɪ (look like a,G,D,g,i,N,I)
-        """
         url_str = url or ""
-        # Cyrillic characters that look like ASCII letters
         lookalikes_cyrillic = {
             'а': 'a', 'е': 'e', 'о': 'o', 'р': 'p', 'с': 'c', 'х': 'x',
             'у': 'y', 'ч': '4', 'ы': 'b', 'ь': 'b', 'і': 'i', 'ї': 'yi',
             'ґ': 'g', 'ė': 'e', 'ń': 'n', 'ș': 's', 'ț': 't'
         }
-        # Greek characters that look like ASCII letters
         lookalikes_greek = {
             'α': 'a', 'ο': 'o', 'ν': 'v', 'τ': 't', 'ρ': 'p'
         }
-        # Latin Extended lookalikes
         lookalikes_latin = {
             'ɑ': 'a', 'ɢ': 'g', 'ᴅ': 'd', 'ɡ': 'g', 'ɪ': 'i',
             'ɴ': 'n', 'ᴘ': 'p', 'ᴠ': 'v', 'ᴡ': 'w', 'ɨ': 'i'
         }
         all_lookalikes = {**lookalikes_cyrillic, **lookalikes_greek, **lookalikes_latin}
         for char in url_str:
             if char in all_lookalikes:
                 return 1
@@ -286,9 +318,6 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
     out["has_lookalike_chars"] = s.apply(_detect_lookalike_chars)
-    # Return columns in the exact order expected by the model; fill any
-    # still-missing engineered columns with zeros to stay robust across
-    # model updates.
     return out.reindex(columns=feature_cols, fill_value=0)
@@ -314,17 +343,162 @@ def _normalize_url_string(url: str) -> str:
     return (url or "").strip().rstrip("/")
 @app.get("/")
 def root():
-    return {"status": "ok", "backend": "url-only"}
 @app.post("/predict-url")
 def predict_url(payload: PredictUrlPayload):
     try:
         _load_url_model()
-        # Load CSVs on every request (keeps behavior in sync without code edits)
         phishy_list = _read_urls_from_csv(AUTOCALIB_PHISHY_CSV)
         legit_list = _read_urls_from_csv(AUTOCALIB_LEGIT_CSV)
         host_map = _read_hosts_from_csv(KNOWN_HOSTS_CSV)
@@ -343,7 +517,7 @@ def predict_url(payload: PredictUrlPayload):
         if not url_str:
             return JSONResponse(status_code=400, content={"error": "Empty url"})
-        # URL-level override via CSV lists (normalized exact match, ignoring trailing slash)
         norm_url = _normalize_url_string(url_str)
         phishy_set = { _normalize_url_string(u) for u in phishy_list }
         legit_set = { _normalize_url_string(u) for u in legit_list }
@@ -355,6 +529,7 @@ def predict_url(payload: PredictUrlPayload):
             phish_proba = 0.99 if label == "PHISH" else 0.01
             score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
             return {
                 "label": label,
                 "predicted_label": int(predicted_label),
                 "score": float(score),
@@ -365,7 +540,7 @@ def predict_url(payload: PredictUrlPayload):
                 "override": {"reason": "csv_url_match"},
             }
-        # Known-host override (suffix match)
         host = (urlparse(_ensure_scheme(url_str)).hostname or "").lower()
         if host and host_map:
             for h, lbl in host_map.items():
@@ -376,6 +551,7 @@ def predict_url(payload: PredictUrlPayload):
                     phish_proba = 0.99 if label == "PHISH" else 0.01
                     score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
                     return {
                         "label": label,
                         "predicted_label": int(predicted_label),
                         "score": float(score),
@@ -383,95 +559,80 @@ def predict_url(payload: PredictUrlPayload):
                         "backend": str(model_type),
                         "threshold": 0.5,
                         "url_col": url_col,
                     }
-        # Lookalike character guard: detect homoglyph/lookalike attacks
-        try:
-            # Cyrillic characters that look like ASCII letters
-            lookalikes_cyrillic = {
-                'а': 'a', 'е': 'e', 'о': 'o', 'р': 'p', 'с': 'c', 'х': 'x',
-                'у': 'y', 'ч': '4', 'ы': 'b', 'ь': 'b', 'і': 'i', 'ї': 'yi',
-                'ґ': 'g', 'ė': 'e', 'ń': 'n', 'ș': 's', 'ț': 't'
-            }
-            # Greek characters that look like ASCII letters
-            lookalikes_greek = {
-                'α': 'a', 'ο': 'o', 'ν': 'v', 'τ': 't', 'ρ': 'p'
-            }
-            # Latin Extended lookalikes
-            lookalikes_latin = {
-                'ɑ': 'a', 'ɢ': 'g', 'ᴅ': 'd', 'ɡ': 'g', 'ɪ': 'i',
-                'ɴ': 'n', 'ᴘ': 'p', 'ᴠ': 'v', 'ᴡ': 'w', 'ɨ': 'i'
-            }
-            all_lookalikes = {**lookalikes_cyrillic, **lookalikes_greek, **lookalikes_latin}
-            for char in url_str:
-                if char in all_lookalikes:
-                    phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
-                    label = "PHISH"
-                    predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
-                    phish_proba = 0.95
-                    score = phish_proba
-                    return {
-                        "label": label,
-                        "predicted_label": int(predicted_label),
-                        "score": float(score),
-                        "phishing_probability": float(phish_proba),
-                        "backend": "lookalike_guard",
-                        "threshold": 0.5,
-                        "url_col": url_col,
-                        "rule": "lookalike_character_detected",
-                    }
-        except Exception:
-            pass
-        # Typosquat guard: mirror notebook fallback logic.
-        try:
-            s_host = (urlparse(_ensure_scheme(url_str)).hostname or "").lower()
-            s_sld = s_host.split(".")[-2] if "." in s_host else s_host
-            def _normalize_brand(s: str) -> str:
-                return re.sub(r"[^a-z]", "", s.lower())
-            s_clean = _normalize_brand(s_sld)
-            brands = [
-                "facebook","linkedin","paypal","google","amazon","apple",
-                "microsoft","instagram","netflix","twitter","whatsapp"
-            ]
-            def _sim(a: str, b: str) -> float:
-                try:
-                    from rapidfuzz import fuzz  # type: ignore
-                    return float(fuzz.ratio(a, b)) / 100.0
-                except Exception:
-                    from difflib import SequenceMatcher
-                    return SequenceMatcher(None, a, b).ratio()
-            if s_clean:
-                best = 0.0
-                for b in brands:
-                    best = max(best, _sim(s_clean, _normalize_brand(b)))
-                has_digits = bool(re.search(r"\d", s_sld))
-                has_hyphen = ("-" in s_sld)
-                is_official = any(s_host.endswith(f"{_normalize_brand(b)}.com") for b in brands)
-                if (best >= 0.90) and (has_digits or has_hyphen) and (not is_official):
-                    phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
-                    label = "PHISH"
-                    predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
-                    phish_proba = 0.90
-                    score = phish_proba
-                    return {
-                        "label": label,
-                        "predicted_label": int(predicted_label),
-                        "score": float(score),
-                        "phishing_probability": float(phish_proba),
-                        "backend": "typosquat_guard",
-                        "threshold": 0.5,
-                        "url_col": url_col,
-                        "rule": "typosquat_guard",
-                    }
-        except Exception:
-            pass
-        # Mirror inference flow for probability of class 1
         feats = _engineer_features([url_str], feature_cols)
         if model_type == "xgboost_bst":
             if xgb is None:
@@ -484,15 +645,14 @@ def predict_url(payload: PredictUrlPayload):
             pred = model.predict(feats)[0]
             raw_p_class1 = 1.0 if int(pred) == 1 else 0.0
-        # Polarity: strictly env or default (class1==PHISH)
         phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
         phish_proba = raw_p_class1 if phish_is_positive else (1.0 - raw_p_class1)
         label = "PHISH" if phish_proba >= 0.5 else "LEGIT"
         predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
         score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
         return {
             "label": label,
             "predicted_label": int(predicted_label),
             "score": float(score),
@@ -502,6 +662,4 @@ def predict_url(payload: PredictUrlPayload):
             "url_col": url_col,
         }
     except Exception as e:
-        return JSONResponse(status_code=500, content={"error": str(e)})

 except Exception:
     xgb = None
+# NLP libraries for Text Preprocessing (Module 2)
+try:
+    import nltk
+    from nltk.tokenize import word_tokenize
+    from nltk.corpus import stopwords
+    from nltk.stem import PorterStemmer, WordNetLemmatizer
+    from textblob import TextBlob
+    # Download required NLTK data on startup
+    for resource in ['punkt', 'stopwords', 'wordnet', 'omw-1.4']:
+        try:
+            nltk.data.find(f'tokenizers/{resource}' if resource == 'punkt' else f'corpora/{resource}')
+        except LookupError:
+            nltk.download(resource, quiet=True)
+    NLTK_AVAILABLE = True
+except Exception as e:
+    print(f"[WARNING] NLP libraries not available: {e}")
+    NLTK_AVAILABLE = False
 # Environment defaults suitable for HF Spaces
 os.environ.setdefault("HOME", "/data")
 AUTOCALIB_LEGIT_CSV = os.environ.get("AUTOCALIB_LEGIT_CSV", os.path.join(BASE_DIR, "autocalib_legit.csv"))
 KNOWN_HOSTS_CSV = os.environ.get("KNOWN_HOSTS_CSV", os.path.join(BASE_DIR, "known_hosts.csv"))
+# Initialize NLP components for Module 2
+if NLTK_AVAILABLE:
+    stemmer = PorterStemmer()
+    lemmatizer = WordNetLemmatizer()
+    stop_words = set(stopwords.words('english'))
+    # Phishing-specific suspicious keywords (as per methodology Section 3.7.2)
+    PHISHING_KEYWORDS = {
+        'urgent', 'verify', 'suspended', 'locked', 'confirm', 'update',
+        'click', 'prize', 'winner', 'congratulations', 'expire', 'act now',
+        'account', 'security', 'password', 'credit card', 'bank', 'payment',
+        'refund', 'tax', 'irs', 'social security', 'ssn', 'login', 'signin',
+        'alert', 'warning', 'action required', 'unusual activity', 'compromised'
+    }
+app = FastAPI(
+    title="PhishWatch Pro API",
+    version="3.0.0",
+    description="Complete phishing detection system with URL analysis (Module 4) and Text Preprocessing (Module 2)"
+)
+# Pydantic Models
 class PredictUrlPayload(BaseModel):
     url: str
+class PreprocessTextPayload(BaseModel):
+    text: str
+    include_sentiment: bool = True
+    include_stemming: bool = True
+    include_lemmatization: bool = True
+    remove_stopwords: bool = True
 _url_bundle: Optional[Dict[str, Any]] = None
 _url_lock = threading.Lock()
     out["max_brand_sim"] = hosts.apply(_max_brand_similarity)
     out["like_facebook"] = hosts.apply(lambda h: _like_brand(h, "facebook"))
+    # Lookalike/homoglyph detection
     def _detect_lookalike_chars(url: str) -> int:
         url_str = url or ""
         lookalikes_cyrillic = {
             'а': 'a', 'е': 'e', 'о': 'o', 'р': 'p', 'с': 'c', 'х': 'x',
             'у': 'y', 'ч': '4', 'ы': 'b', 'ь': 'b', 'і': 'i', 'ї': 'yi',
             'ґ': 'g', 'ė': 'e', 'ń': 'n', 'ș': 's', 'ț': 't'
         }
         lookalikes_greek = {
             'α': 'a', 'ο': 'o', 'ν': 'v', 'τ': 't', 'ρ': 'p'
         }
         lookalikes_latin = {
             'ɑ': 'a', 'ɢ': 'g', 'ᴅ': 'd', 'ɡ': 'g', 'ɪ': 'i',
             'ɴ': 'n', 'ᴘ': 'p', 'ᴠ': 'v', 'ᴡ': 'w', 'ɨ': 'i'
         }
         all_lookalikes = {**lookalikes_cyrillic, **lookalikes_greek, **lookalikes_latin}
         for char in url_str:
             if char in all_lookalikes:
                 return 1
     out["has_lookalike_chars"] = s.apply(_detect_lookalike_chars)
     return out.reindex(columns=feature_cols, fill_value=0)
     return (url or "").strip().rstrip("/")
+# ============================================================================
+# API ENDPOINTS
+# ============================================================================
 @app.get("/")
 def root():
+    return {
+        "status": "ok",
+        "service": "PhishWatch Pro API",
+        "modules": {
+            "module_2_text_preprocessing": NLTK_AVAILABLE,
+            "module_4_url_analyzer": True
+        },
+        "endpoints": [
+            "/predict-url (Module 4: URL Analysis)",
+            "/preprocess-text (Module 2: Text Preprocessing)"
+        ]
+    }
+@app.post("/preprocess-text")
+def preprocess_text(payload: PreprocessTextPayload):
+    """
+    Module 2: Text Preprocessing
+    Implements the complete NLP pipeline as per methodology Section 3.7.2:
+    - Tokenization
+    - Stemming & Lemmatization
+    - Stop word removal
+    - Sentiment analysis (emotional/persuasive language detection)
+    """
+    if not NLTK_AVAILABLE:
+        return JSONResponse(
+            status_code=503,
+            content={
+                "error": "NLP libraries not available",
+                "message": "Please install: pip install nltk textblob"
+            }
+        )
+    try:
+        text = (payload.text or "").strip()
+        if not text:
+            return JSONResponse(status_code=400, content={"error": "Empty text"})
+        # Step 1: Tokenization
+        tokens = word_tokenize(text.lower())
+        # Step 2: Stop word removal (optional)
+        if payload.remove_stopwords:
+            tokens_filtered = [t for t in tokens if t.isalnum() and t not in stop_words]
+        else:
+            tokens_filtered = [t for t in tokens if t.isalnum()]
+        # Step 3: Stemming (optional)
+        stemmed_tokens = []
+        if payload.include_stemming:
+            stemmed_tokens = [stemmer.stem(t) for t in tokens_filtered]
+        # Step 4: Lemmatization (optional)
+        lemmatized_tokens = []
+        if payload.include_lemmatization:
+            lemmatized_tokens = [lemmatizer.lemmatize(t) for t in tokens_filtered]
+        # Step 5: Sentiment Analysis & Phishing Indicators (optional)
+        sentiment_data = {}
+        phishing_indicators = {}
+        if payload.include_sentiment:
+            blob = TextBlob(text)
+            sentiment_data = {
+                "polarity": float(blob.sentiment.polarity),  # -1 (negative) to 1 (positive)
+                "subjectivity": float(blob.sentiment.subjectivity),  # 0 (objective) to 1 (subjective)
+                "classification": (
+                    "positive" if blob.sentiment.polarity > 0.1 else
+                    "negative" if blob.sentiment.polarity < -0.1 else
+                    "neutral"
+                )
+            }
+            # Detect phishing-specific emotional/persuasive language
+            text_lower = text.lower()
+            detected_keywords = [kw for kw in PHISHING_KEYWORDS if kw in text_lower]
+            # Calculate risk score based on keyword density and emotional manipulation
+            keyword_density = len(detected_keywords) / max(len(tokens_filtered), 1)
+            urgency_detected = any(kw in detected_keywords for kw in [
+                'urgent', 'expire', 'act now', 'suspended', 'locked', 'warning', 'alert'
+            ])
+            emotional_appeal = blob.sentiment.subjectivity > 0.6
+            phishing_indicators = {
+                "suspicious_keywords": detected_keywords,
+                "keyword_count": len(detected_keywords),
+                "keyword_density": float(keyword_density),
+                "urgency_detected": urgency_detected,
+                "emotional_appeal": emotional_appeal,
+                "high_subjectivity": blob.sentiment.subjectivity > 0.6,
+                "risk_score": min(1.0,
+                    len(detected_keywords) * 0.12 +
+                    (0.25 if urgency_detected else 0) +
+                    (0.20 if emotional_appeal else 0) +
+                    (keyword_density * 0.3)
+                ),
+                "risk_level": (
+                    "HIGH" if len(detected_keywords) >= 3 or urgency_detected else
+                    "MEDIUM" if len(detected_keywords) >= 1 else
+                    "LOW"
+                )
+            }
+        # Prepare cleaned text variants
+        cleaned_text = " ".join(tokens_filtered)
+        stemmed_text = " ".join(stemmed_tokens) if stemmed_tokens else None
+        lemmatized_text = " ".join(lemmatized_tokens) if lemmatized_tokens else None
+        return {
+            "module": "text_preprocessing",
+            "original_text": text,
+            "tokens": tokens[:100],  # Limit for readability
+            "token_count": len(tokens),
+            "filtered_tokens": tokens_filtered[:100],
+            "filtered_token_count": len(tokens_filtered),
+            "cleaned_text": cleaned_text,
+            "stemmed_text": stemmed_text,
+            "lemmatized_text": lemmatized_text,
+            "sentiment": sentiment_data if sentiment_data else None,
+            "phishing_indicators": phishing_indicators if phishing_indicators else None,
+            "preprocessing_applied": {
+                "tokenization": True,
+                "stopword_removal": payload.remove_stopwords,
+                "stemming": payload.include_stemming,
+                "lemmatization": payload.include_lemmatization,
+                "sentiment_analysis": payload.include_sentiment
+            }
+        }
+    except Exception as e:
+        return JSONResponse(status_code=500, content={"error": str(e)})
 @app.post("/predict-url")
 def predict_url(payload: PredictUrlPayload):
+    """
+    Module 4: URL Analyzer
+    Analyzes URLs for phishing using Random Forest model with:
+    - Structural analysis (length, symbols, patterns)
+    - Domain analysis (SLD, TLD, subdomains)
+    - Typosquatting detection
+    - Lookalike character detection
+    - Brand similarity analysis
+    """
     try:
         _load_url_model()
+        # Load CSVs on every request
         phishy_list = _read_urls_from_csv(AUTOCALIB_PHISHY_CSV)
         legit_list = _read_urls_from_csv(AUTOCALIB_LEGIT_CSV)
         host_map = _read_hosts_from_csv(KNOWN_HOSTS_CSV)
         if not url_str:
             return JSONResponse(status_code=400, content={"error": "Empty url"})
+        # URL-level override via CSV lists
         norm_url = _normalize_url_string(url_str)
         phishy_set = { _normalize_url_string(u) for u in phishy_list }
         legit_set = { _normalize_url_string(u) for u in legit_list }
             phish_proba = 0.99 if label == "PHISH" else 0.01
             score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
             return {
+                "module": "url_analyzer",
                 "label": label,
                 "predicted_label": int(predicted_label),
                 "score": float(score),
                 "override": {"reason": "csv_url_match"},
             }
+        # Known-host override
         host = (urlparse(_ensure_scheme(url_str)).hostname or "").lower()
         if host and host_map:
             for h, lbl in host_map.items():
                     phish_proba = 0.99 if label == "PHISH" else 0.01
                     score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
                     return {
+                        "module": "url_analyzer",
                         "label": label,
                         "predicted_label": int(predicted_label),
                         "score": float(score),
                         "backend": str(model_type),
                         "threshold": 0.5,
                         "url_col": url_col,
+                        "override": {"reason": "known_host_match"},
                     }
+        # Lookalike character guard
+        lookalikes_cyrillic = {
+            'а': 'a', 'е': 'e', 'о': 'o', 'р': 'p', 'с': 'c', 'х': 'x',
+            'у': 'y', 'ч': '4', 'ы': 'b', 'ь': 'b', 'і': 'i', 'ї': 'yi',
+            'ґ': 'g', 'ė': 'e', 'ń': 'n', 'ș': 's', 'ț': 't'
+        }
+        lookalikes_greek = {
+            'α': 'a', 'ο': 'o', 'ν': 'v', 'τ': 't', 'ρ': 'p'
+        }
+        lookalikes_latin = {
+            'ɑ': 'a', 'ɢ': 'g', 'ᴅ': 'd', 'ɡ': 'g', 'ɪ': 'i',
+            'ɴ': 'n', 'ᴘ': 'p', 'ᴠ': 'v', 'ᴡ': 'w', 'ɨ': 'i'
+        }
+        all_lookalikes = {**lookalikes_cyrillic, **lookalikes_greek, **lookalikes_latin}
+        for char in url_str:
+            if char in all_lookalikes:
+                phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
+                label = "PHISH"
+                predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
+                phish_proba = 0.95
+                score = phish_proba
+                return {
+                    "module": "url_analyzer",
+                    "label": label,
+                    "predicted_label": int(predicted_label),
+                    "score": float(score),
+                    "phishing_probability": float(phish_proba),
+                    "backend": "lookalike_guard",
+                    "threshold": 0.5,
+                    "url_col": url_col,
+                    "rule": "lookalike_character_detected",
+                }
+        # Typosquat guard
+        s_host = (urlparse(_ensure_scheme(url_str)).hostname or "").lower()
+        s_sld = s_host.split(".")[-2] if "." in s_host else s_host
+        def _normalize_brand(s: str) -> str:
+            return re.sub(r"[^a-z]", "", s.lower())
+        s_clean = _normalize_brand(s_sld)
+        brands = [
+            "facebook","linkedin","paypal","google","amazon","apple",
+            "microsoft","instagram","netflix","twitter","whatsapp"
+        ]
+        if s_clean:
+            best = 0.0
+            for b in brands:
+                best = max(best, SequenceMatcher(None, s_clean, _normalize_brand(b)).ratio())
+            has_digits = bool(re.search(r"\d", s_sld))
+            has_hyphen = ("-" in s_sld)
+            is_official = any(s_host.endswith(f"{_normalize_brand(b)}.com") for b in brands)
+            if (best >= 0.90) and (has_digits or has_hyphen) and (not is_official):
+                phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
+                label = "PHISH"
+                predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
+                phish_proba = 0.90
+                score = phish_proba
+                return {
+                    "module": "url_analyzer",
+                    "label": label,
+                    "predicted_label": int(predicted_label),
+                    "score": float(score),
+                    "phishing_probability": float(phish_proba),
+                    "backend": "typosquat_guard",
+                    "threshold": 0.5,
+                    "url_col": url_col,
+                    "rule": "typosquat_guard",
+                }
+        # ML model inference
         feats = _engineer_features([url_str], feature_cols)
         if model_type == "xgboost_bst":
             if xgb is None:
             pred = model.predict(feats)[0]
             raw_p_class1 = 1.0 if int(pred) == 1 else 0.0
         phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
         phish_proba = raw_p_class1 if phish_is_positive else (1.0 - raw_p_class1)
         label = "PHISH" if phish_proba >= 0.5 else "LEGIT"
         predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
         score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
         return {
+            "module": "url_analyzer",
             "label": label,
             "predicted_label": int(predicted_label),
             "score": float(score),
             "url_col": url_col,
         }
     except Exception as e:
+        return JSONResponse(status_code=500, content={"error": str(e)})