# File size: 13,633 Bytes
# 07fcfbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
#!/usr/bin/env python3
"""Programmatic NER annotation for Exploit-DB entries."""

import json
import re
import sys

# Source file read by main(): one JSON object per line. annotate_entry()
# requires the "text" and "exploit_id" keys and treats "cves" as optional.
# NOTE: both paths are hard-coded absolute paths.
INPUT = "/home/ubuntu/alkyline/data/raw/exploitdb/exploitdb_descriptions.jsonl"
# Destination file written by main(): one annotated JSON object per line.
OUTPUT = "/home/ubuntu/alkyline/data/processed/llm_annotated_exploitdb.jsonl"

# Common vulnerability type keywords.
#
# extract_vuln_from_part() scans this list in order and stops at the first
# phrase found as a substring, so longer phrases must come before the shorter
# phrases they contain ("Stack Buffer Overflow" before "Buffer Overflow").
# The literal below is kept in a readable, roughly thematic order and is
# sorted longest-first immediately afterwards to guarantee that property.
VULN_TYPES = [
    "Unauthenticated Remote Code Execution",
    "Authenticated Remote Code Execution",
    "Remote Code Execution (RCE)",
    "Unrestricted File Upload + RCE",
    "Stored Cross-Site Scripting (XSS)",
    "Reflected Cross-Site Scripting (XSS)",
    "Persistent Cross-Site Scripting",
    "Multiple Stored Cross-Site Scripting (XSS)",
    "Stored Cross-Site Scripting via SVG File Upload (Authenticated)",
    "Stored Cross Site Scripting",
    "Stored Cross-Site Scripting",
    "Reflected Cross-Site Scripting",
    "Cross-Site Scripting (XSS)",
    "Cross Site Scripting",
    "Cross-Site Scripting",
    "XML External Entity Injection",
    "Remote Code Execution",
    "Local Privilege Escalation",
    "Privilege Escalation",
    "Remote Buffer Overflow",
    "Buffer Overflow",
    "Stack Buffer Overflow",
    "Heap Buffer Overflow",
    "Stack-based Buffer Overflow",
    "Heap-based Buffer Overflow",
    "Integer Overflow",
    "Authentication Bypass",
    "Authorization Bypass",
    "Directory Traversal",
    "Path Traversal",
    "SQL Injection",
    "SQL injection",
    "Blind SQL Injection",
    "Time Based Blind SQL Injection",
    "Command Injection",
    "OS Command Injection",
    "Code Injection",
    "LDAP Injection",
    "SSTI",
    "Server Side Template Injection",
    "Server-Side Template Injection",
    "Server Side Request Forgery",
    "Server-Side Request Forgery (SSRF)",
    "Server-Side Request Forgery",
    "SSRF",
    "Remote File Inclusion",
    "Local File Inclusion",
    "File Inclusion",
    "Arbitrary File Upload",
    "Arbitrary File Read",
    "Arbitrary File Write",
    "Arbitrary File Download",
    "Arbitrary File Deletion",
    "Arbitrary Code Execution",
    "Remote Command Execution",
    "Insecure Direct Object Reference",
    "Insecure Permissions",
    "Insecure File Permissions",
    "Information Disclosure",
    "Credential Disclosure",
    "Remote Configuration Disclosure",
    "Password Disclosure",
    "Denial of Service (DoS)",
    "Denial of Service (PoC)",
    "Denial of Service",
    "Use-After-Free",
    "Use After Free",
    "Double Free",
    "Type Confusion",
    "Out-of-Bounds Write",
    "Out-of-Bounds Read",
    "Out of Bounds Write",
    "Out of Bounds Read",
    "Null Pointer Dereference",
    "Memory Corruption",
    "Format String",
    "Open Redirect",
    "CSRF",
    "Cross-Site Request Forgery",
    "IDOR",
    "XXE",
    "XSS",
    "SQLi",
    "RCE",
    "LFI",
    "RFI",
    "Remote Root Backdoor",
    "Remote Password Reset",
    "Unrestricted File Upload",
    "File Upload",
    "Persistent XSS",
    "Stored XSS",
    "Reflected XSS",
    "DOM XSS",
]
# list.sort is stable, so phrases of equal length keep their written order.
VULN_TYPES.sort(key=len, reverse=True)

# Known tool names. annotate_entry() labels every occurrence of these strings
# in the text as TOOL; the search itself is not restricted to parentheses.
KNOWN_TOOLS = ["Metasploit"]

# Regex patterns used by annotate_entry() for the pattern-based entity types.

# Four dot-separated groups of 1-3 digits. The 0-255 range check and the
# "is this really a version number?" heuristics live in annotate_entry().
IP_RE = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')
# One or more dot-terminated labels followed by a TLD from a fixed list.
DOMAIN_RE = re.compile(r'\b(?:[a-zA-Z0-9-]+\.)+(?:com|net|org|io|gov|edu|co|uk|de|fr|ru|cn|jp|info|biz)\b')
# http:// or https:// followed by everything up to whitespace or one of
# the characters < > " ' ) + ,
URL_RE = re.compile(r'https?://[^\s<>"\')+,]+')
# Either at least two consecutive "/segment" components, or a Windows
# drive-letter path such as C:\dir\file.
FILEPATH_RE = re.compile(r'(?:/[a-zA-Z0-9_.+-]+){2,}|[a-zA-Z]:\\(?:[a-zA-Z0-9_.+-]+\\)*[a-zA-Z0-9_.+-]+')
# local-part@domain.tld with a TLD of at least two letters.
EMAIL_RE = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
# A standalone run of 32 to 64 hexadecimal characters.
HASH_RE = re.compile(r'\b[a-fA-F0-9]{32,64}\b')
# CVE-<4 digit year>-<4 or more digits>.
CVE_RE = re.compile(r'CVE-\d{4}-\d{4,}')

# Quoted parameter names that look like file paths but aren't
# NOTE(review): not referenced by any function in the visible part of this
# file -- confirm whether it is still needed.
PARAM_IN_QUOTES = re.compile(r"'[a-zA-Z0-9_./]+'")


def find_all(text, substring):
    """Return the offsets of every occurrence of substring in text.

    Occurrences are reported left to right. The search resumes one character
    after the start of each hit, so overlapping occurrences are all reported:
    "aa" in "aaa" yields [[0, 2], [1, 3]].

    Args:
        text: The string to search in.
        substring: The string to look for.

    Returns:
        A list of [start, end] lists (end exclusive). The list is empty when
        substring is empty or does not occur in text.
    """
    if not substring:
        # str.find("") succeeds at every index, which would report a
        # zero-width "match" for each character; an empty needle has no
        # meaningful occurrences.
        return []

    width = len(substring)
    spans = []
    idx = text.find(substring)
    while idx != -1:
        spans.append([idx, idx + width])
        idx = text.find(substring, idx + 1)
    return spans


def parse_title(text):
    """Split an Exploit-DB style title into product and vulnerability parts.

    Titles are expected to look like 'Product Version - Vuln Type (extras)'.
    The title is cut at one of its ' - ' delimiters, chosen as follows:

    * the right-most delimiter whose tail starts with an entry of VULN_TYPES,
      or equals one once a trailing parenthetical is removed;
    * if there is none, the left-most delimiter whose tail is a quoted
      parameter ('name' plus whitespace) followed by such an entry;
    * if there is none either, the first delimiter.

    Returns:
        A (system_part, vuln_part) pair of stripped strings. When the title
        contains no ' - ' at all, the pair is (text.strip(), None).
    """
    pieces = text.split(' - ')
    if len(pieces) < 2:
        return text.strip(), None

    known = tuple(VULN_TYPES)
    cut = None
    for idx in range(1, len(pieces)):
        tail = ' - '.join(pieces[idx:])
        bare_tail = re.sub(r'\s*\(.*?\)\s*$', '', tail).strip()

        # A tail that is (or begins with) a known vulnerability phrase always
        # moves the cut, so the last such delimiter wins.
        if bare_tail in known or tail.startswith(known):
            cut = idx
            continue

        # The quoted-parameter form ('param' SQL Injection) is only a
        # fallback: it is considered while no cut has been chosen yet.
        if cut is not None or not re.match(r"'[^']+'\s+", tail):
            continue
        rest = re.sub(r"^'[^']+'\s+", "", tail)
        bare_rest = re.sub(r'\s*\(.*?\)\s*$', '', rest).strip()
        if rest.strip().startswith(known) or bare_rest in known:
            cut = idx

    if cut is None:
        # Nothing recognisable: treat the first ' - ' as the boundary.
        cut = 1

    return ' - '.join(pieces[:cut]).strip(), ' - '.join(pieces[cut:]).strip()


def extract_vuln_from_part(text, vuln_part):
    """Locate the vulnerability entity named by the vuln half of a title.

    Args:
        text: The full title; returned offsets refer to this string.
        vuln_part: The vulnerability half from parse_title(), or None/empty.

    Returns:
        Either an empty list or a one-element list holding
        ("VULNERABILITY", entity, spans), where spans are the find_all()
        offsets of entity anywhere in text.
    """
    if not vuln_part:
        return []

    # A trailing "(Metasploit)" names a tool, not part of the vulnerability.
    # Other parentheticals such as (Authenticated) or (PoC) are left in place
    # for the known-phrase search below.
    body = re.sub(r'\s*\(Metasploit\)\s*$', '', vuln_part).strip()

    # A leading quoted parameter ('username' ...) is not part of the phrase.
    quoted = re.match(r"'[^']+'\s+", body)
    haystack = body[quoted.end():] if quoted else body

    # First choice: the first VULN_TYPES entry contained in the vuln part
    # that can also be located in the full text.
    for vt in VULN_TYPES:
        if vt not in haystack:
            continue
        hits = find_all(text, vt)
        if hits:
            return [("VULNERABILITY", vt, hits)]

    # Fallback: use the vuln part itself, minus a trailing parenthetical and
    # the quoted parameter, provided it is non-empty and shorter than 80
    # characters.
    candidate = re.sub(r'\s*\(.*?\)\s*$', '', body).strip()
    if quoted:
        candidate = re.sub(r"^'[^']+'\s+", "", candidate).strip()
    if not candidate or len(candidate) >= 80:
        return []

    hits = find_all(text, candidate)
    return [("VULNERABILITY", candidate, hits)] if hits else []


def annotate_entry(entry):
    """Build the span annotations for one Exploit-DB record.

    Reads entry["text"] (the title), the optional entry["cves"] list and
    entry["exploit_id"]. Entities are collected in a fixed order: SYSTEM,
    VULNERABILITY, CVE_ID, TOOL, IP_ADDRESS, URL, EMAIL, DOMAIN, FILEPATH,
    HASH. That order is also the key order of the returned "spans" dict.

    Returns:
        A dict with:
          "text":  the original text,
          "spans": {"LABEL: entity": [[start, end], ...]} with end exclusive,
          "info":  {"source": "exploitdb", "exploit_id": entry["exploit_id"]}.

    Raises:
        KeyError: if a dict entry lacks "text" or "exploit_id".
    """
    text = entry["text"]
    cves = entry.get("cves", [])
    spans_dict = {}  # "LABEL: entity" -> [[start, end], ...]

    def add_span(label, entity, positions):
        """Record positions under "label: entity", skipping duplicates."""
        key = f"{label}: {entity}"
        if key not in spans_dict:
            spans_dict[key] = []
        for pos in positions:
            if pos not in spans_dict[key]:
                spans_dict[key].append(pos)

    # 1. Parse title structure
    system_part, vuln_part = parse_title(text)

    # 2. SYSTEM entity -- the product/system name (every occurrence in text)
    if system_part:
        sys_spans = find_all(text, system_part)
        if sys_spans:
            add_span("SYSTEM", system_part, sys_spans)

    # 3. VULNERABILITY entity
    vuln_results = extract_vuln_from_part(text, vuln_part)
    for label, entity, positions in vuln_results:
        add_span(label, entity, positions)

    # 4. CVE_ID from cves field -- only those that literally occur in text
    for cve in cves:
        cve_spans = find_all(text, cve)
        if cve_spans:
            add_span("CVE_ID", cve, cve_spans)
        # CVEs from the field that do not appear in the text are skipped:
        # nothing is recorded for them in spans_dict.

    # Also find CVEs in text that might not be in the cves field
    # (add_span de-duplicates positions already recorded above)
    for m in CVE_RE.finditer(text):
        cve_text = m.group()
        add_span("CVE_ID", cve_text, [[m.start(), m.end()]])

    # 5. TOOL -- any occurrence of a KNOWN_TOOLS name, e.g. "(Metasploit)"
    for tool in KNOWN_TOOLS:
        tool_spans = find_all(text, tool)
        if tool_spans:
            add_span("TOOL", tool, tool_spans)

    # 6. IP_ADDRESS -- but skip version numbers embedded in product names
    for m in IP_RE.finditer(text):
        val = m.group()
        parts_ip = val.split('.')
        # IP_RE only checks the digit layout; enforce the 0-255 octet range.
        if all(0 <= int(p) <= 255 for p in parts_ip):
            # Heuristic: if it's inside the SYSTEM part of the title, it's a version
            # Also skip if preceded/followed by version-like context
            start_pos = m.start()
            # Check if this IP-like string is part of the system/product portion
            # NOTE(review): the comparison uses len(system_part) as an offset,
            # i.e. it treats the system part as starting at index 0 of text;
            # the "+ 3" presumably allows for the ' - ' delimiter -- confirm.
            if system_part and start_pos < len(system_part) + 3:
                continue  # Almost certainly a version number
            # Check surrounding context for version indicators:
            # up to 10 characters before and up to 5 characters after.
            before = text[max(0, start_pos-10):start_pos]
            after = text[m.end():m.end()+5]
            if re.search(r'[vV]\s*$|version\s*$|\d\s*$', before) or re.search(r'^\.\d', after):
                continue
            # If it's in the vuln part preceded by a letter/digit, likely a version
            if start_pos > 0 and text[start_pos-1].isalnum():
                continue
            add_span("IP_ADDRESS", val, [[m.start(), m.end()]])

    # 7. URL
    for m in URL_RE.finditer(text):
        add_span("URL", m.group(), [[m.start(), m.end()]])

    # 8. EMAIL
    for m in EMAIL_RE.finditer(text):
        add_span("EMAIL", m.group(), [[m.start(), m.end()]])

    # 9. DOMAIN (only if not already captured as part of URL/EMAIL)
    for m in DOMAIN_RE.finditer(text):
        # Skip if fully contained in an already recorded URL or EMAIL span
        skip = False
        for key in spans_dict:
            if key.startswith("URL:") or key.startswith("EMAIL:"):
                for s, e in spans_dict[key]:
                    if s <= m.start() and m.end() <= e:
                        skip = True
                        break
        if not skip:
            add_span("DOMAIN", m.group(), [[m.start(), m.end()]])

    # 10. FILEPATH -- look for paths in the text
    for m in FILEPATH_RE.finditer(text):
        val = m.group()
        # Skip if it's fully contained in an already recorded URL span
        skip = False
        for key in spans_dict:
            if key.startswith("URL:"):
                for s, e in spans_dict[key]:
                    if s <= m.start() and m.end() <= e:
                        skip = True
        # Skip if inside the SYSTEM span (product names with slashes like KZTech/JatonTec)
        # As in step 6, this compares against len(system_part) as an offset.
        if system_part and m.start() < len(system_part):
            skip = True
        if not skip and len(val) > 3:
            add_span("FILEPATH", val, [[m.start(), m.end()]])

    # 11. HASH
    for m in HASH_RE.finditer(text):
        val = m.group()
        # Skip matches where a CVE identifier starts 4 characters before the
        # hex run. This CVE-prefix test is the only filter applied here.
        if not CVE_RE.match(text[max(0,m.start()-4):m.end()]):
            add_span("HASH", val, [[m.start(), m.end()]])

    return {
        "text": text,
        "spans": spans_dict,
        "info": {
            "source": "exploitdb",
            "exploit_id": entry["exploit_id"],
        }
    }


def verify_offsets(result):
    """Check that every recorded span really covers its entity text.

    Args:
        result: A dict with "text" and a "spans" mapping of
            "LABEL: entity" keys to lists of [start, end] offsets.

    Returns:
        A list of human-readable error strings, one per bad span: either the
        span lies outside the text, or the slice it selects differs from the
        entity named in the key. An empty list means every span is correct.
    """
    text = result["text"]
    text_len = len(text)
    problems = []

    for key, positions in result["spans"].items():
        # Only the entity half of the key is needed for the comparison.
        _label, entity = key.split(": ", 1)
        for start, end in positions:
            if start < 0 or end > text_len:
                problems.append(f"Out of bounds: {key} [{start},{end}] in text len {text_len}")
                continue
            covered = text[start:end]
            if covered != entity:
                problems.append(f"Mismatch: {key} [{start},{end}] = '{covered}' != '{entity}'")

    return problems


def main():
    """Annotate every record in INPUT and write the results to OUTPUT.

    Reads INPUT as JSON Lines, runs annotate_entry() on each record, checks
    the produced offsets with verify_offsets(), and writes one JSON object
    per line to OUTPUT. It then prints a summary: up to 20 offset errors, the
    number of distinct entities per label, and how many entries received at
    least one span.

    Offset errors are only reported; the output file is written regardless.
    """
    # Both files are opened as UTF-8 explicitly. The output is serialised
    # with ensure_ascii=False, so falling back to the platform's locale
    # encoding could raise UnicodeEncodeError or mis-decode non-ASCII titles.
    with open(INPUT, encoding="utf-8") as f:
        # Blank lines (for instance a trailing empty line) are not valid
        # JSON documents, so they are skipped rather than passed to loads().
        entries = [json.loads(line) for line in f if line.strip()]

    print(f"Processing {len(entries)} entries...")

    all_errors = []
    results = []

    for entry in entries:
        result = annotate_entry(entry)
        errors = verify_offsets(result)
        if errors:
            all_errors.extend((entry["exploit_id"], e) for e in errors)
        results.append(result)

    # Write output
    with open(OUTPUT, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in results)

    print(f"Wrote {len(results)} annotated entries to {OUTPUT}")

    if all_errors:
        print(f"\n{len(all_errors)} offset errors found:")
        # Cap the listing so a systematic problem does not flood the console.
        for eid, err in all_errors[:20]:
            print(f"  [{eid}] {err}")
        if len(all_errors) > 20:
            print(f"  ... and {len(all_errors)-20} more")
    else:
        print("All offsets verified correct!")

    # Stats: number of distinct "LABEL: entity" keys per label, over all
    # entries (a key with several positions counts once per entry).
    label_counts = {}
    for r in results:
        for key in r["spans"]:
            label = key.split(": ", 1)[0]
            label_counts[label] = label_counts.get(label, 0) + 1

    print("\nEntity type distribution:")
    for label, count in sorted(label_counts.items(), key=lambda x: -x[1]):
        print(f"  {label}: {count}")

    entries_with_spans = sum(1 for r in results if r["spans"])
    print(f"\nEntries with at least one span: {entries_with_spans}/{len(results)}")


# Run the annotation pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()