File size: 16,396 Bytes
90b3b3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
import re
import json

def extract_entities(log_content: str) -> dict:
    """Extract named entities from raw log text using regular expressions.

    The text is scanned both as a whole and line by line for timestamps,
    host names, services, PIDs, IP addresses, user names, firewall fields
    (``SRC=``/``DST=``/``SPT=``/``DPT=``/``TTL=``), kernel boot details and
    similar items.

    Args:
        log_content: The log text; lines are separated by ``"\\n"``.

    Returns:
        A dict mapping an entity type name (e.g. ``"DateTime"``,
        ``"IPAddress"``) to a list of unique string values, in the order they
        were first found. Entity types with no matches are omitted.
    """
    # Dotted-quad fragment shared by every IPv4-based pattern below.
    ipv4 = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'

    def keywords_present(keywords, flags=0):
        """Return the keywords that occur as whole words in the log text."""
        return [
            word for word in keywords
            if re.search(r'\b' + re.escape(word) + r'\b', log_content, flags)
        ]

    entities = {
        name: []
        for name in (
            "DateTime", "System", "Service", "Process", "Action",
            "IPAddress", "DNSName", "Username", "Role", "Metadata",
            "Status", "Error", "Severity", "SessionID", "SessionStatus",
            "FileName", "Object", "ApplicationSpecific",
            "AuthenticationType", "ResourceType", "ResourceUsage",
            "TimeServer", "Port", "SourcePort", "DestinationPort",
            "Protocol", "Interface", "InterfaceType", "Subnet", "Rule",
            "TTL", "MAC", "Flags", "CPU", "MemoryInfo", "Hypervisor",
            "Device", "FileSystem", "DataBus", "EventID", "CMD",
        )
    }

    # Line-anchored patterns need the individual lines.
    lines = log_content.split('\n')

    # ---- DateTime ---------------------------------------------------------
    # ISO timestamps inside JSON documents: "@timestamp": "<value>"
    entities["DateTime"].extend(
        re.findall(r'@timestamp\":\s*\"([^\"]+)\"', log_content))

    iso_line_re = re.compile(r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s')
    syslog_time_re = re.compile(
        r'^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'
        r'\s+(\d{1,2})\s+(\d{2}:\d{2}:\d{2})\s')
    for line in lines:
        # "YYYY-MM-DD HH:MM:SS " at the start of the line
        iso_match = iso_line_re.search(line)
        if iso_match:
            entities["DateTime"].append(iso_match.group(1))

        # "Mon DD HH:MM:SS " at the start of the line (whitespace normalized)
        syslog_match = syslog_time_re.match(line)
        if syslog_match:
            month, day, clock = syslog_match.groups()
            entities["DateTime"].append(f"{month} {day} {clock}")

    # Audit records: msg=audit(<epoch.millis>:<serial>)
    entities["DateTime"].extend(
        re.findall(r'msg=audit\((\d+\.\d+):', log_content))

    # ---- System -----------------------------------------------------------
    entities["System"].extend(
        re.findall(r'\"hostname\":\s*\"([^\"]+)\"', log_content))
    entities["System"].extend(
        re.findall(r'\"host\":\s*\{[^}]*\"name\":\s*\"([^\"]+)\"', log_content))

    # Syslog: the token right after the timestamp is the host name.
    syslog_host_re = re.compile(
        r'^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'
        r'\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+([^\s:]+)\s')
    for line in lines:
        host_match = syslog_host_re.match(line)
        if host_match and host_match.group(1) not in ["?", "-"]:
            entities["System"].append(host_match.group(1))

    # ---- Service ----------------------------------------------------------
    # Syslog: the token after the host name, up to "[pid]:" or ":".
    syslog_service_re = re.compile(
        r'^\w+\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+[^\s]+\s+([^:\[\s]+)(?:\[\d+\])?:')
    for line in lines:
        service_match = syslog_service_re.search(line)
        if service_match and service_match.group(1) not in ["?", "-"]:
            entities["Service"].append(service_match.group(1))

    entities["Service"].extend(
        re.findall(r'\"service\":\s*\{\s*\"type\":\s*\"([^\"]+)\"', log_content))
    entities["Service"].extend(
        re.findall(r'\"agent\":[^}]*\"type\":\s*\"([^\"]+)\"', log_content))

    # ---- Process ----------------------------------------------------------
    # PIDs in "name[pid]" form for a fixed set of daemons. The pattern cannot
    # span a newline, so scanning the whole text equals scanning each line.
    entities["Process"].extend(re.findall(
        r'(?:sshd|dnsmasq|cron|systemd|openvpn|metricbeat)\[(\d+)\]',
        log_content))
    entities["Process"].extend(re.findall(r'pid=(\d+)\s', log_content))

    # ---- Action -----------------------------------------------------------
    entities["Action"].extend(re.findall(
        r'session\s+(opened|closed)(?:\s+for\s+user|\s)', log_content))

    # dnsmasq actions. The optional "[TYPE]" suffix lets "query[A] name ..."
    # match, consistent with the DNSName pattern further down; without it the
    # "query" action could never be followed directly by whitespace.
    dns_action_re = re.compile(
        r'dnsmasq\[\d+\]:\s+(query|forwarded|reply|cached|NODATA-IPv[46])'
        r'(?:\[[A-Z]+\])?(?:\s)')
    for line in lines:
        if "dnsmasq" in line:
            action_match = dns_action_re.search(line)
            if action_match:
                entities["Action"].append(action_match.group(1))

    entities["Action"].extend(re.findall(
        r'(?:TLS|VERIFY)\s+(OK|soft\s+reset)(?:\s|$)', log_content))

    # ---- IPAddress --------------------------------------------------------
    ip_from_re = re.compile(r'from\s+(' + ipv4 + r')(?:\s|$|:)')
    ip_to_re = re.compile(r'to\s+(' + ipv4 + r')(?:\s|$)')
    ip_is_re = re.compile(r'is\s+(' + ipv4 + r')(?:\s|$)')
    for line in lines:
        entities["IPAddress"].extend(ip_from_re.findall(line))
        entities["IPAddress"].extend(ip_to_re.findall(line))
        entities["IPAddress"].extend(ip_is_re.findall(line))

    # "<user>/<ip>:" as written by the VPN log lines this was built for.
    entities["IPAddress"].extend(
        re.findall(r'[a-zA-Z0-9]+/(' + ipv4 + r'):', log_content))
    entities["IPAddress"].extend(
        re.findall(r'SRC=(' + ipv4 + r')', log_content))
    entities["IPAddress"].extend(
        re.findall(r'DST=(' + ipv4 + r')', log_content))

    # ---- DNSName ----------------------------------------------------------
    dns_name_re = re.compile(
        r'(?:query\[[A-Z]+\]|forwarded|reply)\s+'
        r'([-a-zA-Z0-9.*_/]+(?:\.[a-zA-Z0-9.*_/-]+)+)'
        r'(?:\s+from|\s+to|\s+is)')
    for line in lines:
        if "dnsmasq" in line:
            name_match = dns_name_re.search(line)
            if name_match:
                entities["DNSName"].append(name_match.group(1))

    # ---- Username ---------------------------------------------------------
    entities["Username"].extend(
        re.findall(r'(?:user|acct)=\"([^\"]+)\"', log_content))
    entities["Username"].extend(
        re.findall(r'(\w+)/' + ipv4 + r':', log_content))
    entities["Username"].extend(
        re.findall(r'for\s+user\s+(\w+)(?:\s|$|by)', log_content))
    entities["Username"].extend(
        re.findall(r'Accepted\s+\w+\s+for\s+(\w+)\s+from', log_content))

    # ---- TimeServer: "<ip>:123 (<name>)" ----------------------------------
    for ip, name in re.findall(
            r'(' + ipv4 + r'):123\s+\(([^)]+)\)', log_content):
        entities["TimeServer"].append(f"{ip}:123 ({name})")

    # ---- Ports ------------------------------------------------------------
    # "port N" or ":N". The lookbehind rejects a colon preceded by "D:DD" so
    # the seconds of an "HH:MM:SS " timestamp are not reported as a port.
    entities["Port"].extend(re.findall(
        r'(?:port\s+|(?<!\d:\d\d):)(\d+)(?:\s|$|,|\))', log_content))
    entities["SourcePort"].extend(re.findall(r'SPT=(\d+)', log_content))
    entities["DestinationPort"].extend(re.findall(r'DPT=(\d+)', log_content))

    # ---- Protocol ---------------------------------------------------------
    # Must start with a letter, so purely numeric protocol ids are skipped.
    entities["Protocol"].extend(re.findall(
        r'(?:PROTO=|protocol\s+)([a-zA-Z]+\d*)', log_content))
    # The search is case-insensitive, so "tcp" and "TCP" are both appended
    # whenever either spelling occurs; kept as-is for output compatibility.
    entities["Protocol"].extend(keywords_present(
        ["tcp", "udp", "icmp", "TCP", "IPv4", "IPv6"], re.IGNORECASE))

    # ---- Network details --------------------------------------------------
    entities["Interface"].extend(re.findall(
        r'(?:interface|dev)\s+(ens\d+|eth\d+|wlan\d+|lo)', log_content))
    entities["InterfaceType"].extend(re.findall(
        r'(?:zone|type)\s+(inet|lan|dmz|wan)', log_content, re.IGNORECASE))
    entities["Subnet"].extend(
        re.findall(r'(' + ipv4 + r'/\d{1,2})', log_content))
    entities["Rule"].extend(
        re.findall(r'(DNAT|ACCEPT|REJECT|DROP|Policy)\s', log_content))
    entities["TTL"].extend(re.findall(r'TTL=(\d+)', log_content))
    entities["MAC"].extend(re.findall(
        r'((?:[0-9a-fA-F]{2}:){5}[0-9a-fA-F]{2}|(?:[0-9a-fA-F]{2}-){5}[0-9a-fA-F]{2})',
        log_content))

    entities["Flags"].extend(
        re.findall(r'(?:flags|FLAG)\s+(\w+)', log_content))
    entities["Flags"].extend(keywords_present(
        ["RST", "DF", "ACK", "SYN", "FIN", "PSH", "URG"]))

    # ---- Hardware / kernel details ----------------------------------------
    entities["CPU"].extend(re.findall(
        r'(Intel GenuineIntel|AMD AuthenticAMD|Centaur CentaurHauls)',
        log_content))
    entities["MemoryInfo"].extend(re.findall(
        r'mem\s+0x[0-9a-f]+-0x[0-9a-f]+\s+([a-z]+)', log_content))
    entities["Hypervisor"].extend(
        re.findall(r'Hypervisor detected:\s+(\w+)', log_content))

    entities["Device"].extend(re.findall(
        r'(?:device|component):\s+([a-zA-Z0-9_-]+)', log_content))
    entities["Device"].extend(keywords_present(
        ["PCI-DMA", "ehci_hcd", "usb", "rtc_cmos", "virtio-pci", "i8042",
         "ata_piix"]))

    entities["FileSystem"].extend(
        re.findall(r'(squashfs|ext4|xfs|btrfs):\s', log_content))
    entities["DataBus"].extend(keywords_present(
        ["PCI", "USB", "i2c", "PS/2", "Serial", "ATA", "SATA", "TUN/TAP"]))

    # Bracketed decimal numbers, e.g. "[12.345678]".
    entities["EventID"].extend(re.findall(r'\[(\d+\.\d+)\]', log_content))
    entities["CMD"].extend(re.findall(r'CMD\s+\(([^)]+)\)', log_content))

    # ---- FileName ---------------------------------------------------------
    entities["FileName"].extend(
        re.findall(r'(/etc/[a-zA-Z0-9_/.-]+)', log_content))
    # Only the first whitespace-delimited file name per line is taken.
    file_name_re = re.compile(
        r'(?<!\S)([a-zA-Z0-9_-]+\.(?:xlsx|txt|java|log|csv|pdf|docx|cfg|conf))'
        r'(?:\s|$|\.)')
    for line in lines:
        file_match = file_name_re.search(line)
        if file_match:
            entities["FileName"].append(file_match.group(1))

    # ---- Severity / session / error / auth / status -----------------------
    entities["Severity"].extend(keywords_present(
        ["warning", "info", "error", "debug", "notice", "critical", "alert",
         "emergency"], re.IGNORECASE))
    entities["SessionStatus"].extend(
        re.findall(r'session\s+(opened|closed)', log_content))
    entities["Error"].extend(re.findall(
        r'(?:error|failure|failed):\s+([^,\n]+)', log_content, re.IGNORECASE))
    entities["AuthenticationType"].extend(
        re.findall(r'Accepted\s+(\w+)', log_content))

    entities["Status"].extend(
        re.findall(r'res=(\w+)(?:\s|\'|\")', log_content))
    entities["Status"].extend(re.findall(
        r'VERIFY\s+(OK|FAILED|KU OK|EKU OK)(?:\s|$)', log_content))

    # ---- ResourceType -----------------------------------------------------
    entities["ResourceType"].extend(re.findall(
        r'(?:resource|type):\s+([a-zA-Z0-9_-]+)', log_content))
    # A JSON "system": {"cpu": ...} section counts as a CPU resource.
    if re.search(r'\"system\":\s*\{\s*\"cpu\":', log_content):
        entities["ResourceType"].append("CPU")

    # ---- SessionID --------------------------------------------------------
    entities["SessionID"].extend(re.findall(
        r'session(?:_id|Id)=([a-zA-Z0-9-]+)(?:\s|,|$)', log_content))
    entities["SessionID"].extend(
        re.findall(r'session\s+(\d+)(?:\s|$)', log_content))
    entities["SessionID"].extend(
        re.findall(r'ses=(\d+)(?:\s|,|$)', log_content))

    # ---- key=value style fields -------------------------------------------
    entities["Object"].extend(
        re.findall(r'object=([a-zA-Z0-9_-]+)(?:\s|,|$)', log_content))
    entities["Object"].extend(
        re.findall(r'unit=([a-zA-Z0-9_-]+)(?:\s|,|$)', log_content))

    entities["ApplicationSpecific"].extend(
        re.findall(r'app=([a-zA-Z0-9_-]+)(?:\s|,|$)', log_content))
    if "OpenVPN" in log_content:
        entities["ApplicationSpecific"].append("OpenVPN")
    if "metricbeat" in log_content:
        entities["ApplicationSpecific"].append("metricbeat")

    entities["Role"].extend(
        re.findall(r'role=([a-zA-Z0-9_-]+)(?:\s|,|$)', log_content))
    entities["Metadata"].extend(
        re.findall(r'metadata=\{([^}]+)\}', log_content))

    # ---- ResourceUsage: JSON "cpu": {"pct": <number> ----------------------
    entities["ResourceUsage"].extend(
        f"CPU: {usage}%"
        for usage in re.findall(
            r'\"cpu\":\s*{\s*\"pct\":\s*([0-9.]+)', log_content))

    # Drop empty entity types and de-duplicate the rest. dict.fromkeys keeps
    # first-seen order, so the result is identical from run to run (a set
    # would return the values in arbitrary order).
    return {
        name: list(dict.fromkeys(values))
        for name, values in entities.items()
        if values
    }