princemaxp committed on
Commit
a4c47a0
·
verified ·
1 Parent(s): 89a43f0

Create attachment_analyzer.py

Browse files
Files changed (1) hide show
  1. attachment_analyzer.py +91 -0
attachment_analyzer.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # attachment_analyzer.py
2
+ import hashlib
3
+ import re
4
+ from typing import List, Dict
5
+
6
# File extensions that are frequently abused to deliver executable content.
# Every entry needs its own comma: adjacent string literals are silently
# concatenated by Python ("jar" "svg" would become the single entry "jarsvg").
DANGEROUS_EXTENSIONS = {
    "exe", "js", "vbs", "scr", "bat", "cmd", "ps1", "hta", "jar", "svg",
}

# Archive and disk-image containers.
ARCHIVE_EXTENSIONS = {"zip", "rar", "7z", "iso", "img"}
# Common document formats.
DOCUMENT_EXTENSIONS = {"doc", "docx", "xls", "xlsx", "ppt", "pptx", "pdf", "html", "htm"}

# Matches a benign-looking extension immediately followed by a risky final
# extension at the end of the filename (e.g. "invoice.pdf.html").
DOUBLE_EXT_REGEX = re.compile(r"\.(pdf|docx|xlsx|pptx|jpg|png)\.(html|exe|js|zip)$", re.IGNORECASE)
15
+
16
+
17
def sha256_hash(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hex string."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
19
+
20
+
21
def _content_type_matches(ext: str, ctype: str) -> bool:
    """Return True when the declared Content-Type is plausible for *ext*.

    The declared type is accepted if the (lowercase) extension appears
    anywhere in it, or if its main type (parameters stripped) equals the MIME
    type the standard library associates with the extension.
    """
    # Local import keeps this block self-contained; the module's top-level
    # imports do not include mimetypes.
    import mimetypes

    declared = ctype.lower()
    if ext in declared:
        return True

    # A bare substring test rejects legitimate pairs such as .jpg/image/jpeg
    # or .txt/text/plain, so fall back to the registered MIME type.
    expected, _ = mimetypes.guess_type(f"attachment.{ext}")
    main_type = declared.split(";", 1)[0].strip()
    return expected is not None and expected.lower() == main_type


def analyze_attachments(attachments: List[Dict]):
    """Score e-mail attachments with simple filename/size/type heuristics.

    attachments: list of dicts
    [
        {
            "filename": str,
            "content_type": str,
            "size": int,
            "data": bytes
        }
    ]

    Returns a 3-tuple ``(findings, score, hashes)``:
      * findings -- human-readable strings, one per triggered heuristic
        (or a single "No attachments detected." entry for empty input);
      * score -- the summed heuristic weights, capped at 100;
      * hashes -- SHA-256 hex digests of every attachment with non-empty data.
    """
    findings = []
    score = 0
    hashes = []

    if not attachments:
        return ["No attachments detected."], 0, []

    for att in attachments:
        # Filenames are compared case-insensitively throughout.
        name = (att.get("filename") or "").lower()
        ctype = att.get("content_type", "")
        size = att.get("size", 0)
        data = att.get("data", b"")

        # Final extension only; empty when the name contains no dot.
        ext = name.split(".")[-1] if "." in name else ""

        # Hashing
        if data:
            hashes.append(sha256_hash(data))

        # 1. Dangerous file extensions
        if ext in DANGEROUS_EXTENSIONS:
            findings.append(f"Attachment: Dangerous executable attachment detected ({name})")
            score += 50

        # 2. Double extension (HTML smuggling / masquerading)
        if DOUBLE_EXT_REGEX.search(name):
            findings.append(f"Attachment: Double extension detected (possible HTML smuggling): {name}")
            score += 40

        # 3. HTML attachment
        if ext in {"html", "htm"}:
            findings.append(f"Attachment: HTML attachment detected ({name}) — commonly used in phishing")
            score += 35

        # 4. Archive / container abuse
        if ext in ARCHIVE_EXTENSIONS:
            findings.append(f"Attachment: Compressed or disk image attachment detected ({name})")
            score += 25

        # 5. Office documents (macro risk)
        if ext in {"docm", "xlsm", "pptm"}:
            findings.append(f"Attachment: Macro-enabled Office document detected ({name})")
            score += 45

        # 6. Suspicious size (tiny payload delivery); a missing/zero size is skipped
        if size and size < 10_000:
            findings.append(f"Attachment: Very small attachment size ({size} bytes) — possible loader")
            score += 15

        # 7. Content-type mismatch (only checked when both values are present)
        if ctype and ext and not _content_type_matches(ext, ctype):
            findings.append(
                f"Attachment: Content-Type mismatch ({name} reported as {ctype})"
            )
            score += 20

    # Several heuristics can fire per attachment; clamp the total to 100.
    score = min(score, 100)

    return findings, score, hashes