Spaces:

SubhaL
/

bert-phishing-detector

Runtime error

App Files Files Community

SubhaL committed on May 16, 2025

Commit

ecf2409

verified ·

1 Parent(s): 27d4105

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -26

app.py CHANGED Viewed

@@ -1,22 +1,22 @@
 import gradio as gr
 from transformers import pipeline, AutoTokenizer
 # Load model and tokenizer
 model_name = "ealvaradob/bert-finetuned-phishing"
 classifier = pipeline("text-classification", model=model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 MAX_TOKENS = 512
 def count_tokens(text):
     return len(tokenizer.encode(text, truncation=False))
 def chunk_text(text, max_tokens=MAX_TOKENS):
     words = text.split()
-    chunks = []
-    current_chunk = []
-    current_length = 0
     for word in words:
         word_length = len(tokenizer.encode(word, add_special_tokens=False))
         if current_length + word_length > max_tokens:
@@ -26,51 +26,53 @@ def chunk_text(text, max_tokens=MAX_TOKENS):
         else:
             current_chunk.append(word)
             current_length += word_length
     if current_chunk:
         chunks.append(" ".join(current_chunk))
     return chunks
 def process_chunks(chunks):
-    phishing_count = 0
-    legitimate_count = 0
-    total_score = 0
     for chunk in chunks:
         result = classifier(chunk)[0]
-        label = result['label'].lower()
-        score = result['score']
         total_score += score
         if label == "phishing":
             phishing_count += 1
         else:
             legitimate_count += 1
     final_label = "Phishing" if phishing_count > legitimate_count else "Legitimate"
     average_confidence = total_score / len(chunks)
-    return f"Prediction: {final_label}\nAverage Confidence: {average_confidence:.2%}"
 def detect_phishing(input_text):
-    token_count = count_tokens(input_text)
-    if token_count <= MAX_TOKENS:
         result = classifier(input_text)[0]
         label = "Phishing" if result['label'].lower() == "phishing" else "Legitimate"
-        return f"Prediction: {label}\nConfidence: {result['score']:.2%}"
     else:
         chunks = chunk_text(input_text)
         return process_chunks(chunks)
-# Gradio interface
 demo = gr.Interface(
-    fn=detect_phishing,
-    inputs=gr.Textbox(lines=8, placeholder="Paste email content here..."),
     outputs="text",
     title="Phishing Email Detector",
-    description="Uses a fine-tuned BERT model to classify whether the email is phishing or legitimate. Handles long emails by chunking."
 )
-demo.launch()

+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse
 import gradio as gr
 from transformers import pipeline, AutoTokenizer
+import uvicorn
 # Load model and tokenizer
 # Hugging Face model id shared by the pipeline and the tokenizer below.
 model_name = "ealvaradob/bert-finetuned-phishing"
 # Text-classification pipeline that produces the label/score predictions.
 classifier = pipeline("text-classification", model=model_name)
 # Separate tokenizer instance, used to measure token lengths of the input.
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 # Token budget: inputs above this are split into chunks before classifying.
 # NOTE(review): presumably the model's maximum sequence length — confirm.
 MAX_TOKENS = 512
+# Functions
 def count_tokens(text):
     """Return how many token ids the tokenizer produces for ``text``.

     Encoding runs with truncation disabled, so the count reflects the whole
     input rather than a clipped version of it.
     """
     token_ids = tokenizer.encode(text, truncation=False)
     return len(token_ids)
 def chunk_text(text, max_tokens=MAX_TOKENS):
     words = text.split()
+    chunks, current_chunk, current_length = [], [], 0
     for word in words:
         word_length = len(tokenizer.encode(word, add_special_tokens=False))
         if current_length + word_length > max_tokens:
         else:
             current_chunk.append(word)
             current_length += word_length
     if current_chunk:
         chunks.append(" ".join(current_chunk))
     return chunks
 def process_chunks(chunks):
     """Classify each chunk and combine the results by majority vote.

     Args:
         chunks: Text pieces to classify one at a time.

     Returns:
         A dict with ``"label"`` ("Phishing" or "Legitimate"; a tie resolves
         to "Legitimate") and ``"confidence"``, the mean per-chunk probability
         of that final label, rounded to four decimals.

     Raises:
         ValueError: If ``chunks`` is empty.
     """
     if not chunks:
         # Fail with a clear message instead of a ZeroDivisionError below.
         raise ValueError("No text chunks to classify.")

     phishing_count = 0
     phishing_probability_total = 0.0
     for chunk in chunks:
         # Chunks are sized without the special tokens the pipeline adds, so
         # truncate here to keep the model input within MAX_TOKENS.
         result = classifier(chunk, truncation=True, max_length=MAX_TOKENS)[0]
         label, score = result['label'].lower(), result['score']
         # Express every score as "probability of phishing" so that chunks
         # with opposite labels can be averaged meaningfully.
         # NOTE(review): assumes a two-class model whose scores sum to 1 —
         # confirm against the model card.
         if label == "phishing":
             phishing_count += 1
             phishing_probability_total += score
         else:
             phishing_probability_total += 1 - score

     legitimate_count = len(chunks) - phishing_count
     mean_phishing_probability = phishing_probability_total / len(chunks)
     if phishing_count > legitimate_count:
         final_label = "Phishing"
         confidence = mean_phishing_probability
     else:
         final_label = "Legitimate"
         confidence = 1 - mean_phishing_probability
     return {"label": final_label, "confidence": round(confidence, 4)}
 def detect_phishing(input_text):
     """Classify ``input_text`` and return its label and confidence.

     Inputs within the MAX_TOKENS budget are classified in a single pass;
     longer inputs are split with ``chunk_text`` and aggregated by
     ``process_chunks``.

     Returns:
         A dict with ``"label"`` ("Phishing" or "Legitimate") and
         ``"confidence"`` (score rounded to four decimals).
     """
     if count_tokens(input_text) > MAX_TOKENS:
         # Too long for one pass: split, then aggregate per-chunk results.
         return process_chunks(chunk_text(input_text))

     prediction = classifier(input_text)[0]
     is_phishing = prediction['label'].lower() == "phishing"
     return {
         "label": "Phishing" if is_phishing else "Legitimate",
         "confidence": round(prediction['score'], 4),
     }
+# FastAPI app
+api = FastAPI()
+@api.post("/predict")
+async def predict(request: Request):
+    data = await request.json()
+    input_text = data.get("text", "")
+    if not input_text:
+        return JSONResponse({"error": "No text provided."}, status_code=400)
+    result = detect_phishing(input_text)
+    return JSONResponse(result)
+# Gradio interface (optional)
 demo = gr.Interface(
+    fn=lambda x: f"{detect_phishing(x)['label']} ({detect_phishing(x)['confidence']*100:.2f}%)",
+    inputs=gr.Textbox(lines=6, label="Paste Email Text"),
     outputs="text",
     title="Phishing Email Detector",
+    description="Detects whether an email is Phishing or Legitimate using BERT."
 )
+demo.launch(server_name="0.0.0.0", server_port=7860, inline=False, share=False)