Spaces:

SubhaL
/

bert-phishing-detector

Runtime error

App Files Files Community

SubhaL commited on May 16, 2025

Commit

5286d26

verified ·

1 Parent(s): 9fac93d

Updated app.py

Browse files

Files changed (1) hide show

app.py +27 -27

app.py CHANGED Viewed

@@ -1,22 +1,22 @@
-from fastapi import FastAPI, Request
-from fastapi.responses import JSONResponse
 import gradio as gr
 from transformers import pipeline, AutoTokenizer
-import uvicorn
 # Load model and tokenizer
 model_name = "ealvaradob/bert-finetuned-phishing"
 classifier = pipeline("text-classification", model=model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 MAX_TOKENS = 512
-# Functions
 def count_tokens(text):
     return len(tokenizer.encode(text, truncation=False))
 def chunk_text(text, max_tokens=MAX_TOKENS):
     words = text.split()
-    chunks, current_chunk, current_length = [], [], 0
     for word in words:
         word_length = len(tokenizer.encode(word, add_special_tokens=False))
         if current_length + word_length > max_tokens:
@@ -26,53 +26,53 @@ def chunk_text(text, max_tokens=MAX_TOKENS):
         else:
             current_chunk.append(word)
             current_length += word_length
     if current_chunk:
         chunks.append(" ".join(current_chunk))
     return chunks
 def process_chunks(chunks):
-    phishing_count, legitimate_count, total_score = 0, 0, 0
     for chunk in chunks:
         result = classifier(chunk)[0]
-        label, score = result['label'].lower(), result['score']
         total_score += score
         if label == "phishing":
             phishing_count += 1
         else:
             legitimate_count += 1
     final_label = "Phishing" if phishing_count > legitimate_count else "Legitimate"
     average_confidence = total_score / len(chunks)
-    return {"label": final_label, "confidence": round(average_confidence, 4)}
 def detect_phishing(input_text):
-    if count_tokens(input_text) <= MAX_TOKENS:
         result = classifier(input_text)[0]
         label = "Phishing" if result['label'].lower() == "phishing" else "Legitimate"
-        return {"label": label, "confidence": round(result['score'], 4)}
     else:
         chunks = chunk_text(input_text)
         return process_chunks(chunks)
-# FastAPI app
-api = FastAPI()
-@api.post("/predict")
-async def predict(request: Request):
-    data = await request.json()
-    input_text = data.get("text", "")
-    if not input_text:
-        return JSONResponse({"error": "No text provided."}, status_code=400)
-    result = detect_phishing(input_text)
-    return JSONResponse(result)
-# Gradio interface (optional)
 demo = gr.Interface(
-    fn=lambda x: f"{detect_phishing(x)['label']} ({detect_phishing(x)['confidence']*100:.2f}%)",
-    inputs=gr.Textbox(lines=6, label="Paste Email Text"),
     outputs="text",
     title="Phishing Email Detector",
-    description="Detects whether an email is Phishing or Legitimate using BERT."
 )
-demo.launch(server_name="0.0.0.0", server_port=7860, inline=False, share=False)

 import gradio as gr
 from transformers import pipeline, AutoTokenizer
 # Load model and tokenizer
 model_name = "ealvaradob/bert-finetuned-phishing"
 classifier = pipeline("text-classification", model=model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 MAX_TOKENS = 512
 def count_tokens(text):
     return len(tokenizer.encode(text, truncation=False))
 def chunk_text(text, max_tokens=MAX_TOKENS):
     words = text.split()
+    chunks = []
+    current_chunk = []
+    current_length = 0
     for word in words:
         word_length = len(tokenizer.encode(word, add_special_tokens=False))
         if current_length + word_length > max_tokens:
         else:
             current_chunk.append(word)
             current_length += word_length
     if current_chunk:
         chunks.append(" ".join(current_chunk))
     return chunks
 def process_chunks(chunks):
+    phishing_count = 0
+    legitimate_count = 0
+    total_score = 0
     for chunk in chunks:
         result = classifier(chunk)[0]
+        label = result['label'].lower()
+        score = result['score']
         total_score += score
         if label == "phishing":
             phishing_count += 1
         else:
             legitimate_count += 1
     final_label = "Phishing" if phishing_count > legitimate_count else "Legitimate"
     average_confidence = total_score / len(chunks)
+    return f"Prediction: {final_label}\nAverage Confidence: {average_confidence:.2%}"
 def detect_phishing(input_text):
+    token_count = count_tokens(input_text)
+    if token_count <= MAX_TOKENS:
         result = classifier(input_text)[0]
         label = "Phishing" if result['label'].lower() == "phishing" else "Legitimate"
+        return f"Prediction: {label}\nConfidence: {result['score']:.2%}"
     else:
         chunks = chunk_text(input_text)
         return process_chunks(chunks)
+# Gradio interface
 demo = gr.Interface(
+    fn=detect_phishing,
+    inputs=gr.Textbox(lines=8, placeholder="Paste email content here..."),
     outputs="text",
     title="Phishing Email Detector",
+    description="Uses a fine-tuned BERT model to classify whether the email is phishing or legitimate. Handles long emails by chunking."
 )
+demo.launch()