Spaces:
Sleeping
Sleeping
Create attachment_analyzer.py
Browse files- attachment_analyzer.py +91 -0
attachment_analyzer.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# attachment_analyzer.py
|
| 2 |
+
import hashlib
|
| 3 |
+
import re
|
| 4 |
+
from typing import List, Dict
|
| 5 |
+
|
| 6 |
+
# Extensions of executable or script-capable files that are frequently abused
# as malicious attachments. Entries are lower-case and have no leading dot.
# NOTE: every entry needs its own comma; adjacent string literals are silently
# concatenated by Python ("jar" "svg" would become the single entry "jarsvg").
DANGEROUS_EXTENSIONS = {
    "exe", "js", "vbs", "scr", "bat", "cmd", "ps1", "hta", "jar", "svg",
}

# Archive and disk-image container formats.
ARCHIVE_EXTENSIONS = {"zip", "rar", "7z", "iso", "img"}

# Common document formats (Office, PDF and HTML).
DOCUMENT_EXTENSIONS = {"doc", "docx", "xls", "xlsx", "ppt", "pptx", "pdf", "html", "htm"}

# Matches a filename that ends with a harmless-looking extension immediately
# followed by a risky one, e.g. "invoice.pdf.exe" (case-insensitive).
DOUBLE_EXT_REGEX = re.compile(r"\.(pdf|docx|xlsx|pptx|jpg|png)\.(html|exe|js|zip)$", re.IGNORECASE)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def sha256_hash(data: bytes) -> str:
    """Return the SHA-256 digest of ``data`` as a hexadecimal string."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _content_type_mismatch(ext: str, ctype: str) -> bool:
    """Return True when the declared Content-Type does not fit the extension.

    ``ext`` is the lower-cased extension without a dot; ``ctype`` is the raw
    Content-Type value. The type is accepted when either

    * the extension occurs anywhere in the (lower-cased) Content-Type, or
    * the Content-Type, with parameters such as ``; charset=...`` removed,
      equals the MIME type that ``mimetypes`` registers for the extension
      (e.g. ``jpg`` -> ``image/jpeg``).
    """
    import mimetypes  # function-scope import keeps this block self-contained

    lowered = ctype.lower()
    if ext in lowered:
        return False

    declared = lowered.split(";", 1)[0].strip()
    guessed, _encoding = mimetypes.guess_type("x." + ext)
    return guessed is None or guessed.lower() != declared


def analyze_attachments(attachments: List[Dict]):
    """Run heuristic phishing/malware checks over e-mail attachments.

    Args:
        attachments: list of dicts of the form
            [
                {
                    "filename": str,
                    "content_type": str,
                    "size": int,
                    "data": bytes
                }
            ]
            Missing or ``None`` values are treated as empty / zero.

    Returns:
        A ``(findings, score, hashes)`` tuple:

        * ``findings`` - human-readable description of every triggered check
          (a single "No attachments detected." entry for empty input);
        * ``score`` - sum of the triggered check weights, capped at 100;
        * ``hashes`` - SHA-256 hex digest of every attachment with non-empty
          ``data``, in input order.
    """
    findings = []
    score = 0
    hashes = []

    if not attachments:
        return ["No attachments detected."], 0, []

    for att in attachments:
        name = (att.get("filename") or "").lower()
        ctype = att.get("content_type") or ""
        size = att.get("size") or 0
        data = att.get("data") or b""

        # Trailing dots and spaces are dropped by Windows when the file is
        # saved, so "evil.exe." must be checked as "evil.exe". The original
        # name is still used in the messages.
        checked_name = name.strip().rstrip(". ")
        ext = checked_name.rsplit(".", 1)[-1] if "." in checked_name else ""

        # Hashing
        if data:
            hashes.append(sha256_hash(data))

        # 1. Dangerous file extensions
        if ext in DANGEROUS_EXTENSIONS:
            findings.append(f"Attachment: Dangerous executable attachment detected ({name})")
            score += 50

        # 2. Double extension (HTML smuggling / masquerading)
        if DOUBLE_EXT_REGEX.search(checked_name):
            findings.append(f"Attachment: Double extension detected (possible HTML smuggling): {name}")
            score += 40

        # 3. HTML attachment
        if ext in {"html", "htm"}:
            findings.append(f"Attachment: HTML attachment detected ({name}) — commonly used in phishing")
            score += 35

        # 4. Archive / container abuse
        if ext in ARCHIVE_EXTENSIONS:
            findings.append(f"Attachment: Compressed or disk image attachment detected ({name})")
            score += 25

        # 5. Office documents (macro risk)
        if ext in {"docm", "xlsm", "pptm"}:
            findings.append(f"Attachment: Macro-enabled Office document detected ({name})")
            score += 45

        # 6. Suspicious size (tiny payload delivery)
        if size and size < 10_000:
            findings.append(f"Attachment: Very small attachment size ({size} bytes) — possible loader")
            score += 15

        # 7. Content-type mismatch. A bare, case-sensitive substring test would
        # flag correctly labelled files such as "photo.jpg" sent as
        # "image/jpeg", so the registered MIME type is accepted as well.
        if ctype and ext and _content_type_mismatch(ext, ctype):
            findings.append(
                f"Attachment: Content-Type mismatch ({name} reported as {ctype})"
            )
            score += 20

    # The combined score is reported on a 0-100 scale.
    score = min(score, 100)

    return findings, score, hashes
|