Spaces:
Sleeping
Sleeping
Add all files for the model
Browse files- .idea/.gitignore +3 -0
- .idea/Phishing-Detector-HF.iml +8 -0
- .idea/inspectionProfiles/Project_Default.xml +29 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +7 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +6 -0
- app.py +79 -0
- model.py +29 -0
- requirements.txt +5 -0
.idea/.gitignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Default ignored files
|
| 2 |
+
/shelf/
|
| 3 |
+
/workspace.xml
|
.idea/Phishing-Detector-HF.iml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<module type="PYTHON_MODULE" version="4">
|
| 3 |
+
<component name="NewModuleRootManager">
|
| 4 |
+
<content url="file://$MODULE_DIR$" />
|
| 5 |
+
<orderEntry type="jdk" jdkName="Python 3.13 (Chatbot)" jdkType="Python SDK" />
|
| 6 |
+
<orderEntry type="sourceFolder" forTests="false" />
|
| 7 |
+
</component>
|
| 8 |
+
</module>
|
.idea/inspectionProfiles/Project_Default.xml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<component name="InspectionProjectProfileManager">
|
| 2 |
+
<profile version="1.0">
|
| 3 |
+
<option name="myName" value="Project Default" />
|
| 4 |
+
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
| 5 |
+
<option name="ignoredPackages">
|
| 6 |
+
<value>
|
| 7 |
+
<list size="9">
|
| 8 |
+
<item index="0" class="java.lang.String" itemvalue="chromadb" />
|
| 9 |
+
<item index="1" class="java.lang.String" itemvalue="protobuf" />
|
| 10 |
+
<item index="2" class="java.lang.String" itemvalue="langchain-community" />
|
| 11 |
+
<item index="3" class="java.lang.String" itemvalue="langchain" />
|
| 12 |
+
<item index="4" class="java.lang.String" itemvalue="streamlit" />
|
| 13 |
+
<item index="5" class="java.lang.String" itemvalue="langchain-huggingface" />
|
| 14 |
+
<item index="6" class="java.lang.String" itemvalue="python-dotenv" />
|
| 15 |
+
<item index="7" class="java.lang.String" itemvalue="pypdf" />
|
| 16 |
+
<item index="8" class="java.lang.String" itemvalue="pysqlite3-binary" />
|
| 17 |
+
</list>
|
| 18 |
+
</value>
|
| 19 |
+
</option>
|
| 20 |
+
</inspection_tool>
|
| 21 |
+
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
|
| 22 |
+
<option name="ignoredErrors">
|
| 23 |
+
<list>
|
| 24 |
+
<option value="N802" />
|
| 25 |
+
</list>
|
| 26 |
+
</option>
|
| 27 |
+
</inspection_tool>
|
| 28 |
+
</profile>
|
| 29 |
+
</component>
|
.idea/inspectionProfiles/profiles_settings.xml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<component name="InspectionProjectProfileManager">
|
| 2 |
+
<settings>
|
| 3 |
+
<option name="USE_PROJECT_PROFILE" value="false" />
|
| 4 |
+
<version value="1.0" />
|
| 5 |
+
</settings>
|
| 6 |
+
</component>
|
.idea/misc.xml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<project version="4">
|
| 3 |
+
<component name="Black">
|
| 4 |
+
<option name="sdkName" value="Python 3.13 (Chatbot)" />
|
| 5 |
+
</component>
|
| 6 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.13 (Chatbot)" project-jdk-type="Python SDK" />
|
| 7 |
+
</project>
|
.idea/modules.xml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<project version="4">
|
| 3 |
+
<component name="ProjectModuleManager">
|
| 4 |
+
<modules>
|
| 5 |
+
<module fileurl="file://$PROJECT_DIR$/.idea/Phishing-Detector-HF.iml" filepath="$PROJECT_DIR$/.idea/Phishing-Detector-HF.iml" />
|
| 6 |
+
</modules>
|
| 7 |
+
</component>
|
| 8 |
+
</project>
|
.idea/vcs.xml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<project version="4">
|
| 3 |
+
<component name="VcsDirectoryMappings">
|
| 4 |
+
<mapping directory="" vcs="Git" />
|
| 5 |
+
</component>
|
| 6 |
+
</project>
|
app.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download
import gradio as gr

# --- import your architecture ---
# Make sure this file is in the repo (e.g., models/deberta_lstm_classifier.py)
# and update the import path accordingly.
from model import DeBERTaLSTMClassifier  # <-- your class

# --------- Config ----------
REPO_ID = "khoa-done/phishing-detector"   # HF repo that holds the checkpoint
CKPT_NAME = "deberta_lstm_checkpoint.pt"  # the .pt file name
MODEL_NAME = "microsoft/deberta-base"     # base tokenizer/backbone
LABELS = ["benign", "phishing"]           # adjust to your classes

# If your checkpoint contains hyperparams, you can fetch them like:
# checkpoint.get("config") or checkpoint.get("model_args")
# and pass into DeBERTaLSTMClassifier(**model_args)

# --------- Load model/tokenizer once (global) ----------
# Loaded at import time so every Gradio request reuses the same objects.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Download the checkpoint from the Hub (cached locally after first call).
ckpt_path = hf_hub_download(repo_id=REPO_ID, filename=CKPT_NAME)
# NOTE(review): torch.load uses pickle under the hood; this is safe only
# because the checkpoint comes from our own repo. Consider passing
# weights_only=True (PyTorch >= 2.0) if the file ever comes from elsewhere.
checkpoint = torch.load(ckpt_path, map_location=device)

# If you saved hyperparams in the checkpoint, use them:
# falls back to {} so the class constructor's defaults apply.
model_args = checkpoint.get("model_args", {})  # e.g., {"lstm_hidden":256, "num_labels":2, ...}
model = DeBERTaLSTMClassifier(**model_args)
model.load_state_dict(checkpoint["model_state_dict"])
# eval() disables dropout etc.; no gradients are needed at inference time.
model.to(device).eval()
| 34 |
+
|
| 35 |
+
# --------- Inference function ----------
|
| 36 |
+
def predict_fn(text: str):
    """Classify *text* (a URL or message) and return label->probability.

    Args:
        text: Raw user input from the Gradio textbox.

    Returns:
        dict mapping each label name to its softmax probability — the
        exact payload shape ``gr.Label`` expects. Falls back to generic
        ``class_i`` keys if ``LABELS`` doesn't match the head size.

    Raises:
        gr.Error: if the input is empty or whitespace-only.
    """
    # BUG FIX: the previous version returned {"error": "<message>"} here,
    # but gr.Label expects str->float confidences; a string value breaks
    # Gradio's output postprocessing. Raising gr.Error is the supported
    # way to surface an input error to the user.
    if not text or not text.strip():
        raise gr.Error("Please enter a URL or text.")

    # Tokenize; padding is a no-op for a single example but harmless.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,    # single example -> becomes [1, seq_len]
        max_length=256,  # assumes 256 was used during training — TODO confirm
    )
    # DeBERTa typically doesn't use token_type_ids; drop them if present.
    inputs.pop("token_type_ids", None)
    # Move every tensor to the same device as the model.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        # model.forward accepts (input_ids, attention_mask) and returns raw logits
        logits = model(**inputs)
        probs = F.softmax(logits, dim=-1).squeeze(0).tolist()

    # Build the label->prob mapping for the gr.Label output.
    if len(LABELS) == len(probs):
        return {label: float(p) for label, p in zip(LABELS, probs)}
    return {f"class_{i}": float(p) for i, p in enumerate(probs)}
|
| 63 |
+
|
| 64 |
+
# --------- Gradio UI ----------
|
| 65 |
+
# Declarative UI: one textbox in, one label (probability bars) out.
demo = gr.Interface(
    fn=predict_fn,
    inputs=gr.Textbox(label="URL or text", placeholder="e.g., http://suspicious-site.example"),
    outputs=gr.Label(label="Prediction"),
    title="Phishing Detector (DeBERTa + LSTM)",
    description="Enter a URL/text. The model outputs class probabilities.",
    # Clickable example inputs shown under the interface.
    examples=[
        ["http://rendmoiunserviceeee.com"],
        ["https://www.google.com"],
        ["https://mail-secure-login-verify.example/path?token=..."]
    ]
)

# Launch only when run as a script; HF Spaces imports and serves `demo`.
if __name__ == "__main__":
    demo.launch()
|
model.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from transformers import AutoModel
|
| 4 |
+
|
| 5 |
+
class DeBERTaLSTMClassifier(nn.Module):
    """Frozen DeBERTa encoder feeding a trainable BiLSTM + linear head.

    The pretrained encoder acts purely as a feature extractor (all of its
    weights are frozen); only the LSTM and the classification layer learn.
    """

    def __init__(self, hidden_dim=128, num_labels=2):
        """
        Args:
            hidden_dim: LSTM hidden size per direction (output is 2x this).
            num_labels: Number of output classes.
        """
        super().__init__()

        # Pretrained backbone, downloaded from the Hub on first use.
        self.deberta = AutoModel.from_pretrained("microsoft/deberta-base")
        # Freeze every encoder weight — we lack the resources to fine-tune it.
        for weight in self.deberta.parameters():
            weight.requires_grad = False

        # Bidirectional LSTM over the encoder's per-token hidden states.
        self.lstm = nn.LSTM(
            input_size=self.deberta.config.hidden_size,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )

        # 2 * hidden_dim: forward and backward final states are concatenated.
        self.fc = nn.Linear(hidden_dim * 2, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw classification logits of shape [batch, num_labels]."""
        # The encoder is frozen, so skip building its autograd graph.
        with torch.no_grad():
            encoded = self.deberta(input_ids=input_ids, attention_mask=attention_mask)

        # sequence_out: [batch, seq_len, hidden_dim * 2]
        sequence_out, _ = self.lstm(encoded.last_hidden_state)
        # NOTE(review): this pools the state at the final *sequence position*,
        # which is a PAD position for padded batch members. Kept as-is because
        # the checkpoint was presumably trained with the same convention —
        # confirm before switching to mask-aware last-token pooling.
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
transformers==4.41.2
|
| 3 |
+
huggingface_hub==0.24.5
|
| 4 |
+
safetensors
|
| 5 |
+
gradio==4.39.0
|