# (Hugging Face Spaces status banner captured with the source — not program code)
# Spaces: Running
"""
FYP4 SPAM DETECTION API - Loads model from Hugging Face Hub
"""
| import os | |
| import re | |
| import io | |
| import torch | |
| import torch.nn as nn | |
| from fastapi import FastAPI, File, UploadFile, HTTPException | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from pydantic import BaseModel | |
| from huggingface_hub import hf_hub_download | |
| from safetensors.torch import load_file | |
| import PyPDF2 | |
| import pdfplumber | |
| from PIL import Image | |
| from transformers import DebertaV2Model, DebertaV2Tokenizer | |
| # ================================ | |
| # CONFIGURATION | |
| # ================================ | |
| class Config: | |
| DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') | |
| TEXT_MODEL = 'microsoft/deberta-v3-base' | |
| # π₯ CHANGE THIS TO YOUR MODEL REPO! π₯ | |
| MODEL_REPO = "haroon103/fyp4-spam-model" # β Change haroon103 to YOUR username | |
| TEXT_HIDDEN_DIM = 768 | |
| FUSION_DIM = 512 | |
| NUM_CLASSES = 2 | |
| DROPOUT = 0.3 | |
| MAX_TEXT_LENGTH = 256 | |
| config = Config() | |
| # ================================ | |
| # MODEL ARCHITECTURES | |
| # ================================ | |
| class DeBERTaTextEncoder(nn.Module): | |
| def __init__(self, dropout=0.3): | |
| super(DeBERTaTextEncoder, self).__init__() | |
| self.deberta = DebertaV2Model.from_pretrained(config.TEXT_MODEL) | |
| self.projection = nn.Sequential( | |
| nn.Dropout(dropout), | |
| nn.Linear(config.TEXT_HIDDEN_DIM, config.FUSION_DIM), | |
| nn.LayerNorm(config.FUSION_DIM), | |
| nn.GELU() | |
| ) | |
| def forward(self, input_ids, attention_mask): | |
| outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask) | |
| pooled = outputs.last_hidden_state[:, 0, :] | |
| return self.projection(pooled) | |
| class TextSpamClassifier(nn.Module): | |
| def __init__(self, dropout=0.3): | |
| super(TextSpamClassifier, self).__init__() | |
| self.text_encoder = DeBERTaTextEncoder(dropout) | |
| self.classifier = nn.Sequential( | |
| nn.Linear(config.FUSION_DIM, 256), | |
| nn.LayerNorm(256), | |
| nn.GELU(), | |
| nn.Dropout(dropout), | |
| nn.Linear(256, 128), | |
| nn.LayerNorm(128), | |
| nn.GELU(), | |
| nn.Dropout(dropout), | |
| nn.Linear(128, config.NUM_CLASSES) | |
| ) | |
| def forward(self, input_ids, attention_mask): | |
| features = self.text_encoder(input_ids, attention_mask) | |
| return self.classifier(features) | |
| # ================================ | |
| # PDF EXTRACTION | |
| # ================================ | |
| class PDFExtractor: | |
| def extract_text_from_pdf(pdf_bytes): | |
| email_data = {'subject': '', 'sender': '', 'body': '', 'full_text': ''} | |
| try: | |
| pdf_file = io.BytesIO(pdf_bytes) | |
| with pdfplumber.open(pdf_file) as pdf: | |
| full_text = "" | |
| for page in pdf.pages: | |
| text = page.extract_text() | |
| if text: | |
| full_text += text + "\n" | |
| email_data['full_text'] = full_text | |
| patterns = { | |
| 'subject': [r'Subject:\s*(.+)', r'SUBJECT:\s*(.+)'], | |
| 'sender': [r'From:\s*(.+)', r'FROM:\s*(.+)'] | |
| } | |
| for field, pattern_list in patterns.items(): | |
| for pattern in pattern_list: | |
| match = re.search(pattern, full_text, re.IGNORECASE) | |
| if match: | |
| email_data[field] = match.group(1).strip()[:100] | |
| break | |
| body_match = re.search(r'(?:Subject|Date|From|To):.+?\n\n(.+)', full_text, re.DOTALL | re.IGNORECASE) | |
| if body_match: | |
| email_data['body'] = body_match.group(1).strip() | |
| else: | |
| email_data['body'] = full_text | |
| return email_data | |
| except: | |
| try: | |
| pdf_file = io.BytesIO(pdf_bytes) | |
| pdf_reader = PyPDF2.PdfReader(pdf_file) | |
| full_text = "" | |
| for page in pdf_reader.pages: | |
| text = page.extract_text() | |
| if text: | |
| full_text += text + "\n" | |
| email_data['full_text'] = full_text | |
| email_data['body'] = full_text | |
| return email_data | |
| except Exception as e: | |
| raise HTTPException(status_code=400, detail=f"Error: {str(e)}") | |
| # ================================ | |
| # TEXT PREPROCESSING | |
| # ================================ | |
| def preprocess_text(text): | |
| text = str(text).lower() | |
| text = re.sub(r'http\S+|www\.\S+', '[URL]', text) | |
| text = re.sub(r'\S+@\S+', '[EMAIL]', text) | |
| text = re.sub(r'\d+', '[NUM]', text) | |
| text = re.sub(r'\s+', ' ', text).strip() | |
| return text | |
| # ================================ | |
| # SPAM DETECTOR | |
| # ================================ | |
| class SpamDetector: | |
| def __init__(self): | |
| self.device = config.DEVICE | |
| self.tokenizer = DebertaV2Tokenizer.from_pretrained(config.TEXT_MODEL) | |
| self.model = None | |
| try: | |
| print(f"π₯ Downloading model from {config.MODEL_REPO}...") | |
| model_path = hf_hub_download( | |
| repo_id=config.MODEL_REPO, | |
| filename="model.safetensors" | |
| ) | |
| print(f"β Downloaded to: {model_path}") | |
| print("π§ Loading model...") | |
| self.model = TextSpamClassifier(dropout=config.DROPOUT).to(self.device) | |
| state_dict = load_file(model_path) | |
| self.model.load_state_dict(state_dict, strict=False) | |
| self.model.eval() | |
| print("β Model ready!") | |
| except Exception as e: | |
| print(f"β Error: {e}") | |
| self.model = None | |
| def predict_text(self, text): | |
| if not self.model: | |
| raise HTTPException(status_code=503, detail="Model not loaded") | |
| encoding = self.tokenizer( | |
| preprocess_text(text), | |
| add_special_tokens=True, | |
| max_length=config.MAX_TEXT_LENGTH, | |
| padding='max_length', | |
| truncation=True, | |
| return_attention_mask=True, | |
| return_tensors='pt' | |
| ) | |
| input_ids = encoding['input_ids'].to(self.device) | |
| attention_mask = encoding['attention_mask'].to(self.device) | |
| with torch.no_grad(): | |
| outputs = self.model(input_ids, attention_mask) | |
| probs = torch.softmax(outputs, dim=1) | |
| predicted = torch.argmax(probs, dim=1) | |
| return { | |
| 'prediction': 'SPAM' if predicted.item() == 1 else 'LEGITIMATE', | |
| 'confidence': float(probs[0, predicted.item()].item() * 100), | |
| 'spam_probability': float(probs[0, 1].item() * 100), | |
| 'ham_probability': float(probs[0, 0].item() * 100) | |
| } | |
| # ================================ | |
| # FASTAPI APPLICATION | |
| # ================================ | |
| app = FastAPI( | |
| title="FYP4 Spam Detection API", | |
| description="Email spam detection using DeBERTa", | |
| version="1.0.0" | |
| ) | |
| app.add_middleware( | |
| CORSMiddleware, | |
| allow_origins=["*"], | |
| allow_credentials=True, | |
| allow_methods=["*"], | |
| allow_headers=["*"], | |
| ) | |
| detector = None | |
| async def startup_event(): | |
| global detector | |
| print("π Starting API...") | |
| detector = SpamDetector() | |
| if detector.model: | |
| print("β API Ready!") | |
| else: | |
| print("β οΈ Model failed to load") | |
| class TextRequest(BaseModel): | |
| text: str | |
| async def root(): | |
| return { | |
| "message": "π― FYP4 Spam Detection API", | |
| "status": "running", | |
| "model": config.MODEL_REPO, | |
| "endpoints": { | |
| "POST /predict/text": "Detect spam from text", | |
| "POST /predict/pdf": "Detect spam from PDF", | |
| "GET /health": "Check API health" | |
| } | |
| } | |
| async def health(): | |
| return { | |
| "status": "healthy" if detector and detector.model else "unhealthy", | |
| "model_loaded": detector.model is not None if detector else False, | |
| "device": str(config.DEVICE) | |
| } | |
| async def predict_text(request: TextRequest): | |
| if not detector or not detector.model: | |
| raise HTTPException(status_code=503, detail="Model not available") | |
| result = detector.predict_text(request.text) | |
| return result | |
| async def predict_pdf(file: UploadFile = File(...)): | |
| if not file.filename.endswith('.pdf'): | |
| raise HTTPException(status_code=400, detail="Must be PDF file") | |
| if not detector or not detector.model: | |
| raise HTTPException(status_code=503, detail="Model not available") | |
| pdf_bytes = await file.read() | |
| email_data = PDFExtractor.extract_text_from_pdf(pdf_bytes) | |
| full_text = f"{email_data['subject']}\n\n{email_data['body']}" | |
| result = detector.predict_text(full_text) | |
| return { | |
| 'email_data': email_data, | |
| 'prediction': result | |
| } |