| | import os |
| | import re |
| | import json |
| | import time |
| | import sys |
| | import asyncio |
| | import socket |
| | import random |
| | import logging |
| | import warnings |
| | import unicodedata |
| | import email |
| | from email.policy import default |
| | from typing import List, Dict, Optional, Any |
| | from urllib.parse import urlparse |
| |
|
| | |
| | import httpx |
| | import uvicorn |
| | import joblib |
| | import torch |
| | import numpy as np |
| | import pandas as pd |
| | from fastapi import FastAPI, HTTPException |
| | from fastapi.middleware.cors import CORSMiddleware |
| | from pydantic import BaseModel |
| | from groq import AsyncGroq, RateLimitError, APIError |
| | from dotenv import load_dotenv |
| | from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning |
| | from playwright.async_api import async_playwright |
| |
|
| | |
| | import config |
| | from models import get_ml_models, get_dl_models, FinetunedBERT |
| | from feature_extraction import process_row |
| |
|
# Load environment variables from .env before anything reads os.environ
# (the Groq key rotator below depends on this).
load_dotenv()
# Make the Message_model package importable for the semantic predictor.
sys.path.append(os.path.join(config.BASE_DIR, 'Message_model'))


# Optional dependency: if the semantic model package is absent, the API
# still runs — the 'semantic' ensemble member is simply skipped.
try:
    from predict import PhishingPredictor
except ImportError:
    PhishingPredictor = None
| |
|
| | |
| | |
| | |
class UltraColorFormatter(logging.Formatter):
    """Logging formatter that wraps each record in ANSI colors per level."""

    GREY = "\x1b[38;5;240m"
    CYAN = "\x1b[36m"
    NEON_BLUE = "\x1b[38;5;39m"
    NEON_GREEN = "\x1b[38;5;82m"
    NEON_PURPLE = "\x1b[38;5;129m"
    YELLOW = "\x1b[33m"
    ORANGE = "\x1b[38;5;208m"
    RED = "\x1b[31m"
    BOLD_RED = "\x1b[31;1m"
    WHITE_BOLD = "\x1b[37;1m"
    RESET = "\x1b[0m"

    # Per-level templates.  INFO stays uncolored so the log_* helpers in
    # this module can inject their own colors into the message itself.
    FORMATS = {
        logging.DEBUG: GREY + " 🐞 [DEBUG] %(message)s" + RESET,
        logging.INFO: "%(message)s" + RESET,
        logging.WARNING: ORANGE + " ⚠️ [WARNING] %(message)s" + RESET,
        logging.ERROR: RED + " ❌ [ERROR] %(message)s" + RESET,
        logging.CRITICAL: BOLD_RED + "\n🚨 [CRITICAL] %(message)s\n" + RESET,
    }

    def format(self, record):
        """Render *record* with the color template matching its level."""
        template = self.FORMATS.get(record.levelno)
        return logging.Formatter(template).format(record)
| |
|
# Module logger: colored output on stdout.  Existing handlers are cleared
# first so repeated imports (e.g. uvicorn auto-reload) don't duplicate lines.
logger = logging.getLogger("PhishingAPI")
logger.setLevel(logging.INFO)
ch = logging.StreamHandler(sys.stdout)
ch.setFormatter(UltraColorFormatter())
if logger.hasHandlers():
    logger.handlers.clear()
logger.addHandler(ch)
| |
|
| | |
def log_section(title):
    """Log *title* centered inside a purple 70-character box banner."""
    purple = UltraColorFormatter.NEON_PURPLE
    bar = '─' * 70
    logger.info(f"\n{purple}┌{bar}┐")
    logger.info(f"{purple}│ {UltraColorFormatter.WHITE_BOLD}{title.center(68)}{purple} │")
    logger.info(f"{purple}└{bar}┘{UltraColorFormatter.RESET}")
| |
|
def log_step(icon, text):
    """Log a top-level progress step in cyan, prefixed with *icon*."""
    message = f"{UltraColorFormatter.CYAN} {icon} {text}{UltraColorFormatter.RESET}"
    logger.info(message)
| |
|
def log_substep(text, value=""):
    """Log an indented sub-step; a non-empty *value* is appended in green."""
    if value:
        val_str = f": {UltraColorFormatter.NEON_GREEN}{value}{UltraColorFormatter.RESET}"
    else:
        val_str = ""
    logger.info(f"{UltraColorFormatter.GREY} ├─ {text}{val_str}")
| |
|
def log_success(text):
    """Log a green success line with a check mark."""
    green = UltraColorFormatter.NEON_GREEN
    logger.info(f"{green} ✅ {text}{UltraColorFormatter.RESET}")
| |
|
def log_metric(label, value, warning=False):
    """Log a metric line; orange when *warning* is set, otherwise neon blue."""
    if warning:
        color = UltraColorFormatter.ORANGE
    else:
        color = UltraColorFormatter.NEON_BLUE
    logger.info(f" {color}📊 {label}: {UltraColorFormatter.WHITE_BOLD}{value}{UltraColorFormatter.RESET}")
| |
|
| | |
| |
|
| | |
# Service-level limits.  Some consumers live outside this chunk of the file.
MAX_INPUT_CHARS = 4000        # presumably caps analyzed text length — consumer not shown here
MAX_CONCURRENT_REQUESTS = 5   # sizes request_semaphore defined below
MAX_URLS_TO_ANALYZE = 15      # presumably caps URLs per request — consumer not shown here
LLM_MAX_RETRIES = 3           # presumably the Groq retry budget — consumer not shown here
| |
|
# FastAPI application instance; metadata is surfaced in the OpenAPI docs.
app = FastAPI(
    title="Phishing Detection API (Robust Ensemble)",
    description="Multilingual phishing detection using Weighted Ensemble (ML/DL) + LLM Semantic Analysis + Live Scraping",
    version="2.6.0"
)
| |
|
# Allow cross-origin calls from any origin.
# NOTE(review): wildcard origins combined with allow_credentials=True is
# overly permissive for production — consider an explicit origin allowlist.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
| |
|
# Global throttle: at most MAX_CONCURRENT_REQUESTS analyses in flight.
request_semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
| |
|
| | |
class MessageInput(BaseModel):
    """Request payload: raw message parts submitted for analysis."""
    sender: Optional[str] = ""
    subject: Optional[str] = ""
    text: Optional[str] = ""
    # Pydantic deep-copies mutable field defaults per instance, so {} is safe.
    metadata: Optional[Dict] = {}
| |
|
class PredictionResponse(BaseModel):
    """Response payload: verdict plus supporting explanation."""
    confidence: float        # 0-100; >50 leans phishing (see SYSTEM_PROMPT rules)
    reasoning: str
    highlighted_text: str    # message text with @@...@@ markers on red flags
    final_decision: str      # "phishing" | "legitimate"
    suggestion: str
| |
|
| | |
class SmartAPIKeyRotator:
    """Round-robin rotator over one or more Groq API keys.

    Keys are read from the GROQ_API_KEYS env var (comma-separated), falling
    back to the single GROQ_API_KEY.  One AsyncGroq client is built per key.
    """

    def __init__(self):
        keys_str = os.environ.get('GROQ_API_KEYS', '')
        self.keys = [k.strip() for k in keys_str.split(',') if k.strip()]
        if not self.keys:
            # Fallback: single-key configuration.
            single_key = os.environ.get('GROQ_API_KEY')
            if single_key:
                self.keys = [single_key]

        if not self.keys:
            logger.critical("CRITICAL: No GROQ_API_KEYS found in environment variables!")
        else:
            log_substep("API Key Rotator", f"Initialized with {len(self.keys)} keys")

        self.clients = [AsyncGroq(api_key=k) for k in self.keys]
        self.num_keys = len(self.clients)
        self.current_index = 0  # index of the next client to hand out

    def get_client_and_rotate(self):
        """Return the next client in round-robin order, or None if keyless."""
        if not self.clients:
            return None
        client = self.clients[self.current_index]
        self.current_index = (self.current_index + 1) % self.num_keys
        return client
| |
|
| | |
# Global registries populated by load_models() at startup.
ml_models = {}          # name -> fitted sklearn-style classifier
dl_models = {}          # name -> torch model in eval mode
bert_model = None       # FinetunedBERT instance, if present on disk
semantic_model = None   # PhishingPredictor instance, if importable
key_rotator: Optional[SmartAPIKeyRotator] = None
ip_cache = {}           # hostname -> geo/ISP lookup result (process lifetime)
| |
|
def clean_and_parse_json(text: str) -> Dict:
    """Best-effort parse of an LLM response into a dict.

    Tries strict JSON first, then strips markdown code fences
    (```json ... ```), and finally falls back to the outermost {...} span.
    Returns {} when nothing parseable is found.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Strip markdown code fences the LLM may wrap the payload in.  The fence
    # marker is matched explicitly (backticks included) so the word "json"
    # appearing inside values is left untouched.
    text = re.sub(r"```json\s*", "", text, flags=re.IGNORECASE)
    text = re.sub(r"```", "", text)

    try:
        # Fall back to the outermost brace-delimited span.
        start = text.find('{')
        end = text.rfind('}')
        if start != -1 and end != -1:
            json_str = text[start:end + 1]
            return json.loads(json_str)
    except Exception:
        pass
    logger.error(f"Failed to parse JSON from LLM response: {text[:50]}...")
    return {}
| |
|
class EnsembleScorer:
    """Fuses model predictions and network heuristics into one 0-100 score."""

    # Relative evidence weights.  Families can be absent at runtime, so the
    # final score is re-normalized by the weight actually accumulated.
    WEIGHTS = {'ml': 0.30, 'dl': 0.20, 'bert': 0.20, 'semantic': 0.10, 'network': 0.20}

    @staticmethod
    def calculate_technical_score(predictions: Dict, network_data: List[Dict], urls: List[str]) -> Dict:
        """Return {'score', 'details', 'network_risk'} from available evidence.

        predictions maps model name -> {'raw_score': float, presumably in
        [0, 1] — see get_model_predictions}; network_data is the list of
        geo/ISP dicts from get_network_data_raw; urls is unused here but
        kept for interface stability.
        """
        score_accum = 0.0    # sum of (score on 0-100 scale) * weight
        weight_accum = 0.0   # sum of weights that actually contributed
        details = []

        log_step("🧮", "Calculating Ensemble Weights")

        # --- Classic ML family: average the per-model probabilities. ---
        ml_scores = [p['raw_score'] for k, p in predictions.items() if k in ['logistic', 'svm', 'xgboost']]
        if ml_scores:
            avg_ml = np.mean(ml_scores)
            score_accum += avg_ml * EnsembleScorer.WEIGHTS['ml'] * 100
            weight_accum += EnsembleScorer.WEIGHTS['ml']
            details.append(f"ML Consensus: {avg_ml:.2f}")
            log_substep("ML Models Consensus", f"{avg_ml:.4f} (Weight: {EnsembleScorer.WEIGHTS['ml']})")

        # --- Deep learning family. ---
        dl_scores = [p['raw_score'] for k, p in predictions.items() if k in ['attention_blstm', 'rcnn']]
        if dl_scores:
            avg_dl = np.mean(dl_scores)
            score_accum += avg_dl * EnsembleScorer.WEIGHTS['dl'] * 100
            weight_accum += EnsembleScorer.WEIGHTS['dl']
            details.append(f"DL Consensus: {avg_dl:.2f}")
            log_substep("Deep Learning Consensus", f"{avg_dl:.4f} (Weight: {EnsembleScorer.WEIGHTS['dl']})")

        # --- Finetuned BERT. ---
        if 'bert' in predictions:
            bert_s = predictions['bert']['raw_score']
            score_accum += bert_s * EnsembleScorer.WEIGHTS['bert'] * 100
            weight_accum += EnsembleScorer.WEIGHTS['bert']
            details.append(f"BERT Score: {bert_s:.2f}")
            log_substep("BERT Finetuned", f"{bert_s:.4f} (Weight: {EnsembleScorer.WEIGHTS['bert']})")

        # --- Semantic text model (note: not appended to `details`). ---
        if 'semantic' in predictions:
            sem_s = predictions['semantic']['raw_score']
            score_accum += sem_s * EnsembleScorer.WEIGHTS['semantic'] * 100
            weight_accum += EnsembleScorer.WEIGHTS['semantic']
            log_substep("Semantic Analysis", f"{sem_s:.4f} (Weight: {EnsembleScorer.WEIGHTS['semantic']})")

        # --- Network heuristics: penalties accumulate per host, already on
        #     a 0-100 scale (so no *100 below), capped at 100. ---
        net_risk = 0.0
        net_reasons = []
        for net_info in network_data:
            if net_info.get('proxy') or net_info.get('hosting'):
                net_risk += 40
                net_reasons.append("Hosted/Proxy IP")

            org = str(net_info.get('org', '')).lower()
            isp = str(net_info.get('isp', '')).lower()
            suspicious_hosts = ['hostinger', 'namecheap', 'digitalocean', 'hetzner', 'ovh', 'flokinet']

            if any(x in org or x in isp for x in suspicious_hosts):
                net_risk += 20
                net_reasons.append(f"Cheap Cloud Provider ({org[:15]}...)")

        net_risk = min(net_risk, 100)
        # Network weight is always applied, even with no network data (risk 0).
        score_accum += net_risk * EnsembleScorer.WEIGHTS['network']
        weight_accum += EnsembleScorer.WEIGHTS['network']

        log_substep("Network Risk Calculated", f"{net_risk:.2f} (Weight: {EnsembleScorer.WEIGHTS['network']})")
        if net_reasons:
            details.append(f"Network Penalties: {', '.join(list(set(net_reasons)))}")

        # Re-normalize by the weight actually used; 50.0 = neutral fallback.
        if weight_accum == 0:
            final_score = 50.0
        else:
            final_score = score_accum / weight_accum

        return {
            "score": min(max(final_score, 0), 100),
            "details": "; ".join(details),
            "network_risk": net_risk
        }
| |
|
def load_models():
    """Load ML/DL/BERT/semantic models and initialize the API key rotator.

    Best-effort startup: every asset is optional, so the service can run
    with a reduced ensemble.  Failures were previously swallowed silently
    (`except Exception: pass`); they are now logged as warnings so missing
    ensemble members are diagnosable, while startup still proceeds.
    """
    global ml_models, dl_models, bert_model, semantic_model, key_rotator
    log_section("SYSTEM STARTUP: LOADING ASSETS")

    models_dir = config.MODELS_DIR

    # Classic ML models (sklearn-style, persisted with joblib).
    for model_name in ['logistic', 'svm', 'xgboost']:
        try:
            path = os.path.join(models_dir, f'{model_name}.joblib')
            if os.path.exists(path):
                ml_models[model_name] = joblib.load(path)
                log_substep("ML Model Loaded", model_name)
        except Exception as e:
            logger.warning(f"Could not load ML model '{model_name}': {e}")

    # Deep learning models: rebuild the architecture, then restore weights.
    for model_name in ['attention_blstm', 'rcnn']:
        try:
            path = os.path.join(models_dir, f'{model_name}.pt')
            if os.path.exists(path):
                template = get_dl_models(input_dim=len(config.NUMERICAL_FEATURES))
                model = template[model_name]
                model.load_state_dict(torch.load(path, map_location='cpu'))
                model.eval()
                dl_models[model_name] = model
                log_substep("DL Model Loaded", model_name)
        except Exception as e:
            logger.warning(f"Could not load DL model '{model_name}': {e}")

    # Finetuned BERT (directory checkpoint).
    bert_path = os.path.join(config.BASE_DIR, 'finetuned_bert')
    if os.path.exists(bert_path):
        try:
            bert_model = FinetunedBERT(bert_path)
            log_substep("BERT Model", "Loaded Successfully")
        except Exception as e:
            logger.warning(f"Could not load BERT model: {e}")

    # Semantic text model (only if the optional package imported at startup).
    sem_path = os.path.join(config.BASE_DIR, 'Message_model', 'final_semantic_model')
    if os.path.exists(sem_path) and PhishingPredictor:
        try:
            semantic_model = PhishingPredictor(model_path=sem_path)
            log_substep("Semantic Model", "Loaded Successfully")
        except Exception as e:
            logger.warning(f"Could not load semantic model: {e}")

    key_rotator = SmartAPIKeyRotator()
| |
|
| | |
def extract_visible_text_and_links(raw_email: str) -> tuple:
    """
    Parse a full raw email using Python's email library and extract:
    - extracted_text (merged plain text + HTML text + metadata)
    - links (list of all URLs found anywhere)

    Returns (extracted_text, cleaned_links); cleaned_links contains only
    deduplicated http(s) URLs.  Never raises: parsing failures degrade to
    returning the raw input text.
    """
    log_step("📨", "Parsing Email MIME Structure")
    if not raw_email:
        logger.warning("Parsing received empty email input")
        return "", []

    extracted_text_parts = []
    links = set()

    try:
        msg = email.message_from_string(raw_email, policy=default)

        # Surface key headers at the top of the extracted text.
        metadata = {
            "from": msg.get("From", ""),
            "to": msg.get("To", ""),
            "subject": msg.get("Subject", "")
        }
        for k, v in metadata.items():
            if v:
                extracted_text_parts.append(f"{k.capitalize()}: {v}")
                log_substep(f"Metadata [{k}]", v[:50] + "..." if len(v) > 50 else v)

        part_count = 0
        for part in msg.walk():
            part_count += 1
            content_type = part.get_content_type()
            content_disposition = str(part.get("Content-Disposition") or "")

            try:
                if content_type == "text/plain":
                    text_data = part.get_payload(decode=True)
                    if text_data:
                        text_str = text_data.decode(part.get_content_charset() or "utf-8", errors="ignore")
                        extracted_text_parts.append(text_str)
                        links.update(re.findall(r'https?://\S+', text_str))

                elif content_type == "text/html":
                    html_data = part.get_payload(decode=True)
                    if html_data:
                        html_str = html_data.decode(part.get_content_charset() or "utf-8", errors="ignore")
                        soup = BeautifulSoup(html_str, "html.parser")
                        extracted_text_parts.append(soup.get_text(separator="\n"))
                        # Harvest URLs from anchors and images (tracking pixels).
                        for a in soup.find_all("a", href=True):
                            links.add(a["href"])
                        for img in soup.find_all("img", src=True):
                            links.add(img["src"])

                elif "attachment" in content_disposition.lower() or "inline" in content_disposition.lower():
                    filename = part.get_filename()
                    if filename:
                        # BUG FIX: the f-string previously contained no
                        # placeholder, so the attachment name was never
                        # interpolated into the marker text.
                        extracted_text_parts.append(f"[Attachment found: {filename}]")
                        log_substep("Attachment", filename)

            except Exception as e:
                logger.warning(f"Error parsing email part: {e}")

    except Exception as e:
        logger.error(f"Email Parsing Failed: {e}")

    extracted_text = "\n".join(extracted_text_parts).strip()

    # Fallbacks when MIME parsing yielded nothing usable.
    if not extracted_text:
        if "<html" in raw_email.lower() or "<body" in raw_email.lower() or "<div" in raw_email.lower():
            log_substep("Fallback", "Input appears to be Raw HTML, stripping tags...")
            try:
                soup = BeautifulSoup(raw_email, "html.parser")
                extracted_text = soup.get_text(separator="\n")

                for a in soup.find_all("a", href=True):
                    links.add(a["href"])
                for img in soup.find_all("img", src=True):
                    links.add(img["src"])
            except Exception:
                extracted_text = raw_email
        else:
            extracted_text = raw_email

    # Regex sweep over the raw source catches URLs the parsers missed.
    links.update(re.findall(r'https?://\S+', raw_email))

    cleaned_links = []
    for link in links:
        link = link.strip().strip("<>").replace('"', "")
        if link.startswith("http://") or link.startswith("https://"):
            cleaned_links.append(link)

    log_success(f"Parsed Content. Extracted {len(cleaned_links)} unique URLs.")
    return extracted_text, cleaned_links
| |
|
async def extract_url_features(urls: List[str]) -> pd.DataFrame:
    """Extract phishing features for every URL concurrently in worker threads.

    Returns the input URLs as a DataFrame with one feature column set per
    URL appended; a URL whose extraction raised contributes an empty row.
    """
    if not urls:
        return pd.DataFrame()

    log_step("🧬", f"Extracting Features for {len(urls)} URLs")
    frame = pd.DataFrame({'url': urls})
    # Shared caches let process_row skip repeated WHOIS / SSL lookups.
    whois_cache = {}
    ssl_cache = {}

    jobs = [
        asyncio.to_thread(process_row, row, whois_cache, ssl_cache)
        for _, row in frame.iterrows()
    ]
    raw_results = await asyncio.gather(*jobs, return_exceptions=True)

    features = []
    for idx, item in enumerate(raw_results):
        if isinstance(item, Exception):
            logger.error(f"Feature extraction error on {urls[idx]}: {item}")
            features.append({})
        else:
            features.append(item)

    log_substep("Feature Extraction", "Complete")
    return pd.concat([frame, pd.DataFrame(features)], axis=1)
| |
|
def get_model_predictions(features_df: pd.DataFrame, message_text: str) -> Dict:
    """Run every loaded model and collect raw scores.

    Returns model name -> {'raw_score': float}.  URL-based models use the
    riskiest (max) score across all URLs; ML/DL failures degrade to a
    neutral 0.5 while BERT/semantic failures are simply skipped.

    Fix: the bare `except:` clauses (which also swallow KeyboardInterrupt /
    SystemExit) are narrowed to `except Exception` and now log the cause.
    """
    predictions = {}
    num_feats = config.NUMERICAL_FEATURES
    cat_feats = config.CATEGORICAL_FEATURES

    if not features_df.empty:
        try:
            log_step("🤖", "Running Machine Learning Inference")
            X = features_df[num_feats + cat_feats].copy()
            X[num_feats] = X[num_feats].fillna(-1)
            X[cat_feats] = X[cat_feats].fillna('N/A')

            # Classic ML models: probability of the positive (phishing) class.
            for name, model in ml_models.items():
                try:
                    probas = model.predict_proba(X)[:, 1]
                    raw_score = float(np.max(probas))
                    predictions[name] = {'raw_score': raw_score}
                    log_substep(f"ML: {name.ljust(10)}", f"{raw_score:.4f}")
                except Exception as e:
                    # Neutral fallback keeps the ensemble weighting stable.
                    logger.warning(f"ML model '{name}' failed: {e}")
                    predictions[name] = {'raw_score': 0.5}

            # Deep learning models share one numeric input tensor.
            if dl_models:
                X_num = torch.tensor(X[num_feats].values.astype(np.float32))
                with torch.no_grad():
                    for name, model in dl_models.items():
                        try:
                            out = model(X_num)
                            raw_score = float(torch.max(out).item())
                            predictions[name] = {'raw_score': raw_score}
                            log_substep(f"DL: {name.ljust(10)}", f"{raw_score:.4f}")
                        except Exception as e:
                            logger.warning(f"DL model '{name}' failed: {e}")
                            predictions[name] = {'raw_score': 0.5}

            # BERT scores the raw URL strings; averaged across URLs.
            if bert_model:
                try:
                    scores = bert_model.predict_proba(features_df['url'].tolist())
                    avg_score = float(np.mean([s[1] for s in scores]))
                    predictions['bert'] = {'raw_score': avg_score}
                    log_substep("BERT Inference", f"{avg_score:.4f}")
                except Exception as e:
                    logger.warning(f"BERT inference failed: {e}")
        except Exception as e:
            logger.error(f"Feature Pipeline Error: {e}")

    # Semantic model works on message text, independent of URL features.
    if semantic_model and message_text:
        try:
            log_step("🧠", "Running Semantic Text Analysis")
            res = semantic_model.predict(message_text)
            predictions['semantic'] = {'raw_score': float(res['phishing_probability'])}
            log_substep("Semantic Prob", f"{res['phishing_probability']:.4f}")
        except Exception as e:
            logger.warning(f"Semantic analysis failed: {e}")

    return predictions
| |
|
async def get_network_data_raw(urls: List[str]) -> List[Dict]:
    """Resolve up to 5 unique hostnames and geo-locate them via ip-api.com.

    Results are memoized in the module-level ip_cache for the process
    lifetime.  Lookup failures are logged and skipped; never raises.

    Fix: the bare `except:` around urlparse (which also swallowed
    KeyboardInterrupt/SystemExit) is narrowed to ValueError.
    """
    data = []
    unique_hosts = set()

    for url_str in urls:
        try:
            parsed = urlparse(url_str if url_str.startswith(('http', 'https')) else f"http://{url_str}")
            if parsed.hostname:
                unique_hosts.add(parsed.hostname)
        except ValueError:
            # Malformed URL — skip it.
            pass

    target_hosts = list(unique_hosts)[:5]
    log_step("🌍", f"Geo-Locating Hosts: {target_hosts}")

    async with httpx.AsyncClient(timeout=3.0) as client:
        for host in target_hosts:
            if host in ip_cache:
                data.append(ip_cache[host])
                log_substep("Cache Hit", host)
                continue

            try:
                # DNS resolution is blocking — run it in a worker thread.
                ip = await asyncio.to_thread(socket.gethostbyname, host)
                resp = await client.get(f"http://ip-api.com/json/{ip}?fields=status,message,country,isp,org,as,proxy,hosting")
                if resp.status_code == 200:
                    geo = resp.json()
                    if geo.get('status') == 'success':
                        geo['ip'] = ip
                        geo['host'] = host
                        data.append(geo)
                        ip_cache[host] = geo
                        log_substep(f"Resolved {host}", f"{geo.get('org', 'Unknown')} [{geo.get('country', 'UNK')}]")
            except Exception:
                # DNS failure, HTTP timeout, or bad JSON — unresolvable host.
                log_substep("Failed to resolve", host)

            # Be polite to ip-api.com's free-tier rate limit.
            await asyncio.sleep(0.2)
    return data
| |
|
async def scrape_landing_page(urls: list[str]) -> dict:
    """Fetch each URL in a headless Chromium and return visible page text.

    Returns {url: first 300 chars of normalized visible text, or an error
    string}.  Only the first 10 URLs are scraped; all fetches run
    concurrently, each inside its own browser instance.
    """
    # Cap the workload (note: a local cap of 10, not MAX_URLS_TO_ANALYZE).
    urls = urls[:10]

    results = {}

    async def scrape_single(url: str):
        # `nonlocal` is not strictly required (results is only mutated,
        # never rebound) but is harmless and kept as-is.
        nonlocal results
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(headless=True)
                # Desktop Chrome user agent to avoid trivial bot blocks.
                context = await browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
                )
                page = await context.new_page()

                try:
                    target_url = url if url.startswith(("http", "https")) else f"http://{url}"
                    await page.goto(target_url, timeout=10000, wait_until="domcontentloaded")

                    content = await page.content()

                    # Drop non-content elements before extracting text.
                    soup = BeautifulSoup(content, "html.parser")
                    for tag in soup(["script", "style", "nav", "footer", "svg", "noscript"]):
                        tag.decompose()

                    text = soup.get_text(separator=" ", strip=True)
                    # Normalize unicode tricks (full-width chars, ligatures).
                    text = unicodedata.normalize("NFKC", text)

                    # Keep only a short preview for the LLM evidence dossier.
                    results[url] = text[:300]

                except Exception as e:
                    results[url] = f"Error accessing page: {str(e)}"

                finally:
                    await browser.close()

        except Exception as e:
            results[url] = f"Scraping failed: {str(e)}"

    # Fan out all scrapes concurrently.
    tasks = [scrape_single(u) for u in urls]
    await asyncio.gather(*tasks)

    return results
| |
|
| |
|
| | |
| |
|
| | SYSTEM_PROMPT = """You are the 'Maverick', an elite, autonomous Cybersecurity Judge. Your sole purpose is to analyze the provided Evidence Dossier and return a JSON object. |
| | **Core Rules:** |
| | 1. **The "One Bad Link" Rule:** If the email contains **ANY** suspicious or malicious URL, the Final Decision MUST be "phishing" (100% Confidence), even if other links are legitimate. |
| | 2. **Prioritize Ground Truth:** You must prioritize **Scraped Content** (e.g., a page asking for credentials) and **Network Data** (e.g., a Bank hosted on DigitalOcean) over the Technical Score. |
| | 3. **Override Authority:** Even if the 'Technical Ensemble Score' is low (e.g., 20/100), if you find a Critical Threat in the Scraped Data or Forensic Scan, you MUST override with a High Score (90-100). |
| | 4. **Suspicious Indicators:** |
| | - **Scraped Data:** Login forms on non-official domains, "Verify Identity" text, urgency. |
| | - **Network:** Mismatch between Sender Domain and Hosting (e.g., Microsoft email hosted on Namecheap). |
| | - **Forensics:** Hidden H1 tags, Typosquatting (paypa1.com), Mismatched hrefs. |
| | 5. **Confidence score:** |
| | -Give confidence score between 0-100 based on all the evidences and the decision being made. The score >50 should be given if the email seems phishing and <50 should be given if the email seems legitimate. |
| | 6. **Final Decision:** |
| | -Based on the evidences and confidence score, give the final decision , if the final score seems phishing then give final decision as phishing and if the final score seems legitimate then give final decision as legitimate. |
| | **8 ROBUST FEW-SHOT EXAMPLES:** |
| | **Example 1: Phishing (Credential Harvesting - Scraped Data Override)** |
| | **Input:** |
| | Sender: security-alert@microsoft-online-verify.com |
| | Subject: Action Required: Unusual Sign-in Activity Detected |
| | Technical Score: 35 / 100 |
| | Network Intelligence: Host: 162.241.2.1 | Org: Unified Layer (Cheap Hosting) | ISP: Bluehost | Proxy: False |
| | Scraped Content: "Microsoft 365. Sign in to your account. Email, phone, or Skype. No account? Create one. Can't access your account? Sign-in options. Terms of Use Privacy & Cookies. © Microsoft 2025. NOTE: This page is for authorized users only." |
| | Forensic Scan: Link: http://microsoft-online-verify.com/login.php |
| | Message: "Microsoft Security Alert |
| | We detected a sign-in attempt from a new device or location. |
| | **Account:** user@example.com |
| | **Date:** Fri, Nov 28, 2025 10:23 AM GMT |
| | **Location:** Moscow, Russia |
| | **IP Address:** 103.22.14.2 |
| | **Browser:** Firefox on Windows 10 |
| | If this wasn't you, your account may have been compromised. Please **verify your identity immediately** to secure your account and avoid permanent suspension. |
| | [Secure My Account] |
| | Thanks, |
| | The Microsoft Account Team" |
| | **Correct Decision:** |
| | {{ |
| | "confidence": 99.0, |
| | "reasoning": "CRITICAL OVERRIDE. The Scraped Data mimics a Microsoft 365 Login portal ('Sign in to your account'), but the Network Data confirms the site is hosted on 'Unified Layer/Bluehost', NOT Microsoft's official Azure infrastructure. This is a classic credential harvesting attack using a fake security alert.", |
| | "highlighted_text": "Please @@verify your identity immediately@@ to secure your account and avoid permanent suspension. @@[Secure My Account]@@", |
| | "final_decision": "phishing", |
| | "suggestion": "Do not enter credentials. This is a fake login page hosted on non-Microsoft servers." |
| | }} |
| | **Example 2: Phishing (Hidden Malicious URL - Forensic Override)** |
| | **Input:** |
| | Sender: hr-updates@wipro.com |
| | Subject: MANDATORY: Updated Employee Handbook & Compliance Policy 2025 |
| | Technical Score: 45 / 100 |
| | Network Intelligence: Host: docs.google.com (Google LLC) |
| | Scraped Content: "Google Docs. Sign in. Employee Handbook 2025.pdf. You need permission. Request access. Switch accounts. Google Workspace." |
| | Forensic Scan: CRITICAL: Found hidden URL in H1 tag: 'http://bit.ly/malware-redirect-payload' |
| | Message: "Dear Team, |
| | As part of our annual compliance audit (ISO 27001), all employees are required to review and sign the updated Employee Handbook for the fiscal year 2025. |
| | Please access the document via the secure Google Docs link below: |
| | [docs.google.com/handbook-2025](Link) |
| | Failure to acknowledge this document by Friday may result in a temporary suspension of network access. |
| | Regards, |
| | HR Compliance Team |
| | Wipro Limited" |
| | **Correct Decision:** |
| | {{ |
| | "confidence": 98.0, |
| | "reasoning": "Phishing. While the visible body text points to a legitimate Google Docs URL, the email contains a hidden malicious URL ('bit.ly/malware-redirect-payload') embedded in the HTML header tags. This is a sophisticated evasion tactic designed to bypass filters while tricking the user.", |
| | "highlighted_text": "Please access the document via the secure Google Docs link below: [docs.google.com/handbook-2025] @@(Hidden Header URL Detected)@@", |
| | "final_decision": "phishing", |
| | "suggestion": "Do not click. A hidden malicious payload was detected in the email structure." |
| | }} |
| | **Example 3: Phishing (Typosquatting & Urgency)** |
| | **Input:** |
| | Sender: support@paypa1-resolution.com |
| | Subject: URGENT: Wallet Suspended - Case ID #99283-AX |
| | Technical Score: 88 / 100 |
| | Network Intelligence: Host: paypa1-resolution.com | Org: Namecheap Inc | ISP: Namecheap |
| | Scraped Content: "PayPal. Security Challenge. Enter your credit card number to verify ownership. Expiration Date. CVV. Billing Address. Submit. Copyright 1999-2025 PayPal. All rights reserved." |
| | Forensic Scan: Link: https://paypa1-resolution.com/verify-identity |
| | Message: "Hello Customer, |
| | Your PayPal wallet has been temporarily suspended due to suspicious transactions totaling $400.00 USD to 'Global-Tech-Solutions Ltd'. |
| | To restore full access to your funds, you must **verify your payment method immediately**. Failure to do so within 24 hours will result in the permanent closure of your account and forfeiture of remaining balance. |
| | [Resolve Issue Now] |
| | Thank you for being a valued customer." |
| | **Correct Decision:** |
| | {{ |
| | "confidence": 99.0, |
| | "reasoning": "Phishing. Typosquatting detected ('paypa1' instead of 'paypal'). The Scraped Data confirms the landing page asks for credit card details (CVV/Expiry), and the domain is registered via Namecheap, not PayPal's official infrastructure. High urgency and threat of 'forfeiture' are clear indicators.", |
| | "highlighted_text": "Your PayPal wallet has been temporarily @@suspended@@. To restore full access, you must @@verify your payment method immediately@@. Failure to do so within 24 hours will result in @@permanent closure@@.", |
| | "final_decision": "phishing", |
| | "suggestion": "Delete immediately. This is an impersonation attack stealing financial data." |
| | }} |
| | **Example 4: Legitimate (Internal Corporate Email)** |
| | **Input:** |
| | Sender: admin@internal.daiict.ac.in |
| | Subject: Scheduled Maintenance - Server Room B - Sunday Nov 30 |
| | Technical Score: 15 / 100 |
| | Network Intelligence: Host: internal.daiict.ac.in | Org: DA-IICT | ISP: NKN (National Knowledge Network) | Proxy: False |
| | Scraped Content: "DA-IICT Intranet Login. Authorized Personnel Only. User ID. Password. Forgot Password? IT Helpdesk Extension: 4444." |
| | Forensic Scan: Link: https://internal.daiict.ac.in/wiki/maintenance-schedule |
| | Message: "Dear Faculty and Staff, |
| | The central server room (Room B) will be undergoing scheduled maintenance on Sunday, Nov 30, from 2:00 AM to 4:00 AM IST. |
| | **Services affected:** |
| | 1. Moodle (LMS) |
| | 2. Faculty Webmail |
| | 3. Student Portal |
| | Please ensure all work is saved before this time. For the full maintenance schedule and alternative access points, please check the wiki: |
| | https://internal.daiict.ac.in/wiki/maintenance-schedule |
| | Regards, |
| | IT Services" |
| | **Correct Decision:** |
| | {{ |
| | "confidence": 5.0, |
| | "reasoning": "Legitimate. The sender domain, link domain, and network infrastructure (DA-IICT/NKN) all align perfectly. The scraped content matches an internal login page. The content is informational with specific details and lacks suspicious urgency or external hosting.", |
| | "highlighted_text": "The central server room (Room B) will be undergoing scheduled maintenance on Sunday. Please check the wiki: https://internal.daiict.ac.in/wiki/maintenance-schedule", |
| | "final_decision": "legitimate", |
| | "suggestion": "Safe internal communication regarding maintenance." |
| | }} |
| | **Example 5: Phishing (Legitimate Cloud Abuse - Google Forms)** |
| | **Input:** |
| | Sender: recruitment-officer.john.doe@gmail.com |
| | Subject: Job Offer: Senior Python Developer - Document Upload |
| | Technical Score: 60 / 100 |
| | Network Intelligence: Host: forms.gle (Google LLC) |
| | Scraped Content: "Untitled Form. Job Application - Senior Dev. Upload your Passport Front and Back. Upload SSN Card. Upload Drivers License. Submit. Never submit passwords through Google Forms." |
| | Forensic Scan: Link: https://forms.gle/xyz123abc |
| | Message: "Dear Candidate, |
| | Congratulations! We are pleased to offer you the Senior Python Developer position at TechSolutions Inc. |
| | To proceed with your background check and contract generation, we need you to upload the following documents immediately: |
| | 1. Scanned Copy of Passport |
| | 2. Social Security Number (SSN) Card |
| | 3. Driver's License |
| | Please upload them to our secure portal here: |
| | [Link to Google Form] |
| | We need this by EOD today." |
| | **Correct Decision:** |
| | {{ |
| | "confidence": 92.0, |
| | "reasoning": "Phishing. Although the domain is legitimate (Google Forms), the Scraped Content reveals it is soliciting highly sensitive PII (Passport/SSN). Legitimate companies do not collect SSNs via public Google Forms. The use of a generic Gmail address for a 'Recruitment Officer' is also a red flag.", |
| | "highlighted_text": "To proceed with your background check... please @@upload the following documents immediately@@: 1. Scanned Copy of Passport 2. Social Security Number (SSN) Card. Please upload them to our secure portal here: @@[Link to Google Form]@@", |
| | "final_decision": "phishing", |
| | "suggestion": "Do not upload sensitive ID documents to public forms. This is likely identity theft." |
| | }} |
| | **Example 6: Legitimate (Transactional Alert)** |
| | **Input:** |
| | Sender: alerts@hdfcbank.net |
| | Subject: Transaction Alert: INR 5,000.00 Debited |
| | Technical Score: 8 / 100 |
| | Network Intelligence: Host: hdfcbank.net | Org: HDFC Bank Ltd | ISP: Sify Technologies |
| | Scraped Content: N/A (No Links) |
| | Forensic Scan: No URLs found. |
| | Message: "Dear Customer, |
| | INR 5,000.00 was debited from your A/c XX1234 on 28-Nov-2025. |
| | **Info:** UPI-12345-AmazonPay |
| | **Available Bal:** INR 42,000.00 |
| | If this transaction was not done by you, please call our 24x7 PhoneBanking number 1800-202-6161 immediately to block your card/account. |
| | Warm Regards, |
| | HDFC Bank" |
| | **Correct Decision:** |
| | {{ |
| | "confidence": 2.0, |
| | "reasoning": "Legitimate. This is a standard text-only transactional alert. The sender domain matches HDFC Bank's official domain, and the network data confirms it. The phone number is a standard support line. There are no suspicious links.", |
| | "highlighted_text": "INR 5,000.00 was debited from your A/c XX1234 on 28-Nov-2025. Info: UPI-12345-AmazonPay.", |
| | "final_decision": "legitimate", |
| | "suggestion": "Safe transactional alert. No action needed unless the transaction is unrecognized." |
| | }} |
| | **Example 7: Phishing (CEO Fraud / BEC - No Links)** |
| | **Input:** |
| | Sender: ceo.work.private@gmail.com (Spoofed Name: "Elon Musk") |
| | Subject: Urgent Task - Confidential - DO NOT IGNORE |
| | Technical Score: 75 / 100 |
| | Network Intelligence: Host: mail.google.com (Google LLC) |
| | Scraped Content: N/A |
| | Forensic Scan: No URLs found. |
| | Message: "Akshat, |
| | I am currently in a closed-door meeting with the board of investors and cannot talk on the phone. I need a favor. |
| | I need you to purchase 5 Apple Gift Cards ($100 each) for a client gift. It is urgent and needs to be done in the next 30 minutes. I will reimburse you personally by this evening. |
| | Do not mention this to anyone else yet. Reply with the codes here as soon as you have them. |
| | Elon." |
| | **Correct Decision:** |
| | {{ |
| | "confidence": 90.0, |
| | "reasoning": "Phishing (BEC). Classic Business Email Compromise. The Sender is using a generic Gmail address to impersonate a C-level executive. The request involves financial urgency (Gift Cards), secrecy ('closed-door meeting', 'do not mention'), and bypasses standard procurement channels.", |
| | "highlighted_text": "I need you to @@purchase 5 Apple Gift Cards@@ ($100 each) for a client gift. It is urgent... @@Reply with the codes here@@ as soon as you have them.", |
| | "final_decision": "phishing", |
| | "suggestion": "Do not reply. Verify this request with the CEO via a different, verified channel (Slack/Phone/Corporate Email)." |
| | }} |
| | **Example 8: Legitimate (Marketing with Trackers)** |
| | **Input:** |
| | Sender: newsletter@coursera.org |
| | Subject: Recommended for you: Python for Everybody Specialization |
| | Technical Score: 20 / 100 |
| | Network Intelligence: Host: links.coursera.org | Org: Coursera Inc | ISP: Amazon.com |
| | Scraped Content: "Coursera. Master Python. Enroll for Free. Starts Nov 29. Financial Aid available. Top Instructors. University of Michigan. 4.8 Stars (120k ratings)." |
| | Forensic Scan: Link: https://links.coursera.org/track/click?id=12345&user=akshat |
| | Message: "Hi Student, |
| | Based on your interest in Data Science, we found a course you might like: |
| | **Python for Everybody Specialization** |
| | Offered by University of Michigan. |
| | Start learning today and build job-ready skills. |
| | [Enroll Now] |
| | See you in class, |
| | The Coursera Team |
| | 381 E. Evelyn Ave, Mountain View, CA 94041" |
| | **Correct Decision:** |
| | {{ |
| | "confidence": 10.0, |
| | "reasoning": "Legitimate. Standard marketing email from a known education platform. Network data confirms the link tracking domain belongs to Coursera (hosted on AWS). Scraped content is consistent with the offer. Address matches public records.", |
| | "highlighted_text": "Based on your interest in Data Science, we found a course you might like: Python for Everybody Specialization. [Enroll Now]", |
| | "final_decision": "legitimate", |
| | "suggestion": "Safe marketing email." |
| | }}""" |
| |
|
async def get_groq_decision(ensemble_result: Dict, network_data: List[Dict], landing_page_text: str, cleaned_text: str, original_raw_html: str, readable_display_text: str, sender: str, subject: str):
    """Ask the Groq LLM for the final phishing verdict, with a technical fallback.

    Pipeline:
      1. Render the network-intelligence summary.
      2. Forensic HTML scan: collect URLs hidden in <form action>, <img src>
         and <a href> tags plus a regex sweep of the raw text.
      3. Build the analysis prompt and call the LLM, retrying up to
         LLM_MAX_RETRIES times with exponential backoff (honoring a
         `retry-after` header on 429s when present).
      4. If every attempt fails, return a deterministic verdict derived purely
         from the ensemble technical score.

    Args:
        ensemble_result: dict with 'score' (0-100) and 'details' from the ensemble scorer.
        network_data: per-host lookup dicts (host/ip/org/isp/hosting/proxy keys).
        landing_page_text: scraped text of the linked landing pages.
        cleaned_text: URL-stripped, lowercased message text (kept for interface
            compatibility; not embedded in the prompt).
        original_raw_html: raw message body, scanned for hidden URLs.
        readable_display_text: human-readable message text shown to the LLM
            (truncated to MAX_INPUT_CHARS).
        sender: message sender address for context.
        subject: message subject line for context.

    Returns:
        Parsed JSON dict (confidence / reasoning / highlighted_text /
        final_decision / suggestion) from the LLM, or the score-based fallback.
    """
    net_str = "No Network Data"
    if network_data:
        net_str = "\n".join([
            f"- Host: {d.get('host')} | IP: {d.get('ip')} | Org: {d.get('org')} | ISP: {d.get('isp')} | Hosting/Proxy: {d.get('hosting') or d.get('proxy')}"
            for d in network_data
        ])

    log_step("🔎", "Starting Forensic HTML Scan")
    forensic_report = []
    try:
        soup = BeautifulSoup(original_raw_html, 'html.parser')

        # Form actions are the highest-risk URLs: they receive submitted credentials.
        for form in soup.find_all('form'):
            action = form.get('action')
            if action:
                forensic_report.append(f"CRITICAL: Found URL in <form action>: {action}")

        # Image sources can hide tracking pixels or content loaded from attacker hosts.
        for img in soup.find_all('img'):
            src = img.get('src')
            if src:
                forensic_report.append(f"Found URL in <img src>: {src}")

        # Anchor hrefs: the links the victim is actually asked to click.
        for a in soup.find_all('a'):
            href = a.get('href')
            if href:
                forensic_report.append(f"Found URL in <a href>: {href}")

        # Regex sweep over the raw text catches URLs outside tag attributes.
        url_pattern = r'(?:https?://|ftp://|www\.)[\w\-\.]+\.[a-zA-Z]{2,}(?:/[\w\-\._~:/?#[\]@!$&\'()*+,;=]*)?'
        all_text_urls = set(re.findall(url_pattern, original_raw_html))
        if all_text_urls:
            # Sort for a deterministic prompt (set iteration order is arbitrary,
            # which would otherwise make identical inputs produce different prompts).
            forensic_report.append(f"All URLs detected in raw text: {', '.join(sorted(all_text_urls))}")

    except Exception as e:
        # Best-effort scan: a malformed body must not abort the whole analysis.
        logger.warning(f"Forensic Scan Error: {e}")
        forensic_report.append("Forensic scan failed to parse HTML structure.")

    forensic_str = "\n".join(forensic_report) if forensic_report else "No URLs found in forensic scan."
    log_substep("Forensic Scan", f"Found {len(forensic_report)} potential indicators")

    # Cap the message text so the prompt stays within the model's context budget.
    prompt_display_text = readable_display_text[:MAX_INPUT_CHARS]

    prompt = f"""
**ANALYSIS CONTEXT**
Sender: {sender}
Subject: {subject}
**FORENSIC URL SCAN (INTERNAL HTML ANALYSIS)**
The system scanned the raw HTML and found these URLs (hidden in tags):
{forensic_str}
**TECHNICAL INDICATORS**
Calculated Ensemble Score: {ensemble_result['score']:.2f} / 100
Key Factors: {ensemble_result['details']}
**NETWORK GROUND TRUTH**
{net_str}
**LANDING PAGE PREVIEW (Scraped Text)**
"{landing_page_text}"
**MESSAGE CONTENT (READABLE VERSION)**
"{prompt_display_text}"
**TASK:**
Analyze the "FORENSIC URL SCAN" findings.
- If ANY URL in the forensic scan is NSFW/Adult or malicious, flag as PHISHING.
- If a URL looks like a generated subdomain (e.g. 643646.me) or is unrelated to the sender, FLAG AS PHISHING immediately.
- IMPORTANT: For the 'highlighted_text' field in your JSON response, use the **MESSAGE CONTENT (READABLE VERSION)** provided above. Do NOT output raw HTML tags. Just mark suspicious parts in the readable text with @@...@@.
"""

    attempts = 0
    while attempts < LLM_MAX_RETRIES:
        try:
            # Rotate through the configured API keys to spread rate-limit pressure.
            client = key_rotator.get_client_and_rotate()
            if not client:
                raise RuntimeError("No Keys")

            log_step("🚀", f"Sending LLM Request (Attempt {attempts+1}/{LLM_MAX_RETRIES})")

            completion = await client.chat.completions.create(
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": prompt}
                ],
                model="meta-llama/llama-4-scout-17b-16e-instruct",
                temperature=0.1,  # near-deterministic: we want stable verdicts
                max_tokens=4096,
                response_format={"type": "json_object"}
            )

            raw_content = completion.choices[0].message.content
            log_substep("LLM Response Received", f"Length: {len(raw_content)} chars")

            parsed_json = clean_and_parse_json(raw_content)

            if parsed_json:
                log_success("LLM Response Parsed Successfully")
                return parsed_json
            else:
                # Treat unparseable output as a failed attempt so we retry.
                raise ValueError("Empty or Invalid JSON from LLM")

        except RateLimitError as e:
            # Exponential backoff with jitter; prefer the server's retry-after hint.
            wait_time = 2 ** (attempts + 1) + random.uniform(0, 1)
            if hasattr(e, 'headers') and 'retry-after' in e.headers:
                try:
                    wait_time = float(e.headers['retry-after']) + 1
                except (ValueError, TypeError):
                    pass  # malformed header: keep the computed backoff
            logger.warning(f"LLM Rate Limit (429). Retrying in {wait_time:.2f}s...")
            await asyncio.sleep(wait_time)
            attempts += 1
        except Exception as e:
            logger.warning(f"LLM Attempt {attempts+1} failed: {e}")
            attempts += 1
            await asyncio.sleep(1)

    # All attempts exhausted: fall back to a verdict derived from the technical score.
    is_phishing = ensemble_result['score'] > 50
    return {
        "confidence": ensemble_result['score'],
        "reasoning": f"LLM Unavailable after retries. Decision based purely on Technical Score ({ensemble_result['score']:.2f}).",
        "highlighted_text": readable_display_text,
        "final_decision": "phishing" if is_phishing else "legitimate",
        "suggestion": "Exercise caution. Automated analysis detected risks." if is_phishing else "Appears safe."
    }
| |
|
@app.on_event("startup")
async def startup():
    """Print the startup banner, load every model, then announce readiness."""
    fmt = UltraColorFormatter
    banner = '=' * 70
    logger.info(f"\n{fmt.NEON_BLUE}{banner}")
    logger.info(f"{fmt.WHITE_BOLD} PHISHING DETECTION API v2.6.0 - SYSTEM STARTUP ".center(80))
    logger.info(f"{fmt.NEON_BLUE}{banner}{fmt.RESET}")
    load_models()
    logger.info(f"\n{fmt.NEON_GREEN}🚀 SYSTEM READY AND LISTENING ON PORT 8000{fmt.RESET}\n")
| |
|
@app.post("/predict", response_model=PredictionResponse)
async def predict(input_data: MessageInput):
    """Run the full phishing-analysis pipeline on a single message.

    Steps: extract readable text and links -> (if URLs present) run feature
    extraction, network lookup and landing-page scraping concurrently ->
    compute the ensemble technical score -> obtain the LLM verdict (with a
    score-based fallback) -> clamp the confidence so it never contradicts the
    verdict -> return the response model.

    Raises:
        HTTPException(500): on any unexpected failure inside the pipeline.
    """
    log_section(f"NEW REQUEST: {input_data.sender}")

    # Empty bodies carry no signal; short-circuit as benign.
    if not input_data.text or not input_data.text.strip():
        logger.warning("Received empty input text.")
        return PredictionResponse(
            confidence=0.0,
            reasoning="Empty input.",
            highlighted_text="",
            final_decision="legitimate",
            suggestion="None"
        )

    # Bound concurrent pipelines (heavy: browser scraping + model inference).
    async with request_semaphore:
        try:
            start_time = time.time()

            extracted_text, all_urls = extract_visible_text_and_links(input_data.text)

            # Strip URLs before text-model inference so the classifiers judge
            # the language itself, not the link strings.
            url_pattern_for_cleaning = r'(?:https?://|ftp://|www\.)[\w\-\.]+\.[a-zA-Z]{2,}(?:/[\w\-\._~:/?#[\]@!$&\'()*+,;=]*)?'
            cleaned_text_for_models = re.sub(url_pattern_for_cleaning, '', extracted_text)
            cleaned_text_for_models = ' '.join(cleaned_text_for_models.lower().split())

            # Cap the per-request URL workload.
            all_urls = all_urls[:MAX_URLS_TO_ANALYZE]

            if all_urls:
                log_step("🔗", f"Proceeding with {len(all_urls)} URLs")
            else:
                log_step("🚫", "No URLs Detected - Skipping Feature Extraction")

            features_df = pd.DataFrame()
            network_data_raw = []
            landing_page_text = ""

            if all_urls:
                # The three URL analyses are independent; run them concurrently.
                log_step("⚡", "Initiating Parallel Async Tasks")
                results = await asyncio.gather(
                    extract_url_features(all_urls),
                    get_network_data_raw(all_urls),
                    scrape_landing_page(all_urls)
                )
                features_df, network_data_raw, landing_page_text = results

            # Normalize the scraper's output (dict of url->text, or anything else) to one string.
            if isinstance(landing_page_text, dict):
                landing_page_text = "\n".join(f"{u}: {txt}" for u, txt in landing_page_text.items())
            else:
                landing_page_text = str(landing_page_text)

            # Model inference is CPU/GPU-bound; keep it off the event loop.
            predictions = await asyncio.to_thread(get_model_predictions, features_df, cleaned_text_for_models)
            ensemble_result = EnsembleScorer.calculate_technical_score(predictions, network_data_raw, all_urls)

            log_metric("Ensemble Technical Score", f"{ensemble_result['score']:.2f}/100", warning=ensemble_result['score']>50)

            llm_result = await get_groq_decision(
                ensemble_result,
                network_data_raw,
                landing_page_text,
                cleaned_text_for_models,
                input_data.text,
                extracted_text,
                input_data.sender,
                input_data.subject
            )

            # Sanitize the verdict: anything outside the two-label contract defaults to benign.
            final_dec = llm_result.get('final_decision', 'legitimate').lower()
            if final_dec not in ['phishing', 'legitimate']:
                final_dec = 'legitimate'

            # The LLM may return a malformed confidence (e.g. "high"); fall back
            # to the technical score rather than failing the whole request.
            try:
                final_confidence = float(llm_result.get('confidence', ensemble_result['score']))
            except (TypeError, ValueError):
                final_confidence = float(ensemble_result['score'])

            # Consistency clamp: a phishing verdict must carry confidence > 50,
            # a legitimate verdict must carry confidence <= 50. The log message
            # states the value actually applied.
            if final_dec == "phishing" and final_confidence <= 50:
                logger.warning(f"⚠️ Consistency Fix: Verdict is Phishing but Score was {final_confidence}. Raising to 60.0.")
                final_confidence = max(60.0, final_confidence)

            elif final_dec == "legitimate" and final_confidence > 50:
                logger.warning(f"⚠️ Consistency Fix: Verdict is Legitimate but Score was {final_confidence}. Lowering to 40.0.")
                final_confidence = min(40.0, final_confidence)

            elapsed = time.time() - start_time

            log_section("REQUEST COMPLETE")
            log_metric("Execution Time", f"{elapsed:.2f}s")
            log_metric("Technical Score", f"{ensemble_result['score']:.0f}")

            decision_color = UltraColorFormatter.BOLD_RED if final_dec == "phishing" else UltraColorFormatter.NEON_GREEN
            logger.info(f" ⚖️ FINAL VERDICT: {decision_color}{final_dec.upper()}{UltraColorFormatter.RESET} (Conf: {final_confidence})")

            return PredictionResponse(
                confidence=final_confidence,
                reasoning=llm_result.get('reasoning', ensemble_result['details']),
                highlighted_text=llm_result.get('highlighted_text', extracted_text),
                final_decision=final_dec,
                suggestion=llm_result.get('suggestion', 'Check details carefully.')
            )

        except Exception as e:
            # Boundary handler: log full traceback, surface a 500 to the caller.
            logger.error(f"CRITICAL FAILURE in Prediction Pipeline: {e}")
            import traceback
            traceback.print_exc()
            raise HTTPException(status_code=500, detail=str(e))