Spaces:
Sleeping
Sleeping
File size: 16,396 Bytes
90b3b3f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 | import re
import json
def extract_entities(log_content):
    """Extract entities from raw log text using regular-expression heuristics.

    The text is scanned both as a whole and line by line for timestamps,
    host names, services, PIDs, IP addresses, user names, firewall fields,
    hardware/kernel keywords and similar tokens.

    Args:
        log_content: The complete log text as a single string; lines are
            separated by ``\\n``.

    Returns:
        A dict mapping an entity type name (for example ``"DateTime"`` or
        ``"IPAddress"``) to the list of unique strings matched for that
        type, in first-seen order. Entity types without any match are
        omitted, so an input with no recognizable content yields ``{}``.
    """
    # One bucket per entity type; the insertion order here is also the key
    # order of the returned dict.
    entities = {
        "DateTime": [],
        "System": [],
        "Service": [],
        "Process": [],
        "Action": [],
        "IPAddress": [],
        "DNSName": [],
        "Username": [],
        "Role": [],
        "Metadata": [],
        "Status": [],
        "Error": [],
        "Severity": [],
        "SessionID": [],
        "SessionStatus": [],
        "FileName": [],
        "Object": [],
        "ApplicationSpecific": [],
        "AuthenticationType": [],
        "ResourceType": [],
        "ResourceUsage": [],
        "TimeServer": [],
        "Port": [],
        "SourcePort": [],
        "DestinationPort": [],
        "Protocol": [],
        "Interface": [],
        "InterfaceType": [],
        "Subnet": [],
        "Rule": [],
        "TTL": [],
        "MAC": [],
        "Flags": [],
        "CPU": [],
        "MemoryInfo": [],
        "Hypervisor": [],
        "Device": [],
        "FileSystem": [],
        "DataBus": [],
        "EventID": [],
        "CMD": [],
    }

    # Several patterns are anchored to the start of a line, so keep a
    # per-line view next to the whole-text view.
    lines = log_content.split('\n')

    # ----------------------------------------------------------------- DateTime
    # ISO timestamps embedded in JSON: "@timestamp": "<value>"
    iso_timestamps = re.findall(r'@timestamp\":\s*\"([^\"]+)\"', log_content)
    entities["DateTime"].extend(iso_timestamps)

    for line in lines:
        # "YYYY-MM-DD HH:MM:SS " at the start of the line.
        date_match = re.search(r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s', line)
        if date_match:
            entities["DateTime"].append(date_match.group(1))

        # "Mon DD HH:MM:SS " at the start of the line; runs of whitespace
        # between the parts are collapsed to single spaces in the output.
        syslog_match = re.match(r'^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+(\d{1,2})\s+(\d{2}:\d{2}:\d{2})\s', line)
        if syslog_match:
            month, day, clock = syslog_match.groups()
            entities["DateTime"].append(f"{month} {day} {clock}")

    # Audit records: msg=audit(<seconds>.<fraction>:
    audit_timestamps = re.findall(r'msg=audit\((\d+\.\d+):', log_content)
    entities["DateTime"].extend(audit_timestamps)

    # ------------------------------------------------------------------- System
    # JSON "hostname": "<value>"
    hostnames = re.findall(r'\"hostname\":\s*\"([^\"]+)\"', log_content)
    entities["System"].extend(hostnames)

    # JSON "host": {... "name": "<value>"
    host_names = re.findall(r'\"host\":\s*\{[^}]*\"name\":\s*\"([^\"]+)\"', log_content)
    entities["System"].extend(host_names)

    # Token that follows the "Mon DD HH:MM:SS" prefix of a line.
    for line in lines:
        syslog_match = re.match(r'^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+([^\s:]+)\s', line)
        if syslog_match and syslog_match.group(1) not in ["?", "-"]:
            entities["System"].append(syslog_match.group(1))

    # ------------------------------------------------------------------ Service
    # Token after the host field, terminated by ":" or "[pid]:".
    for line in lines:
        service_match = re.search(r'^\w+\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+[^\s]+\s+([^:\[\s]+)(?:\[\d+\])?:', line)
        if service_match and service_match.group(1) not in ["?", "-"]:
            entities["Service"].append(service_match.group(1))

    # JSON "service": {"type": "<value>"
    service_types = re.findall(r'\"service\":\s*\{\s*\"type\":\s*\"([^\"]+)\"', log_content)
    entities["Service"].extend(service_types)

    # JSON "agent": ... "type": "<value>"
    agent_types = re.findall(r'\"agent\":[^}]*\"type\":\s*\"([^\"]+)\"', log_content)
    entities["Service"].extend(agent_types)

    # ------------------------------------------------------------------ Process
    # PIDs in brackets after one of the listed program names.
    for line in lines:
        pid_matches = re.findall(r'(?:sshd|dnsmasq|cron|systemd|openvpn|metricbeat)\[(\d+)\]', line)
        entities["Process"].extend(pid_matches)

    # pid=<digits> followed by whitespace.
    pid_audit = re.findall(r'pid=(\d+)\s', log_content)
    entities["Process"].extend(pid_audit)

    # ------------------------------------------------------------------- Action
    # "session opened" / "session closed"
    session_actions = re.findall(r'session\s+(opened|closed)(?:\s+for\s+user|\s)', log_content)
    entities["Action"].extend(session_actions)

    # First keyword after "dnsmasq[pid]:" (only the first hit per line).
    for line in lines:
        if "dnsmasq" in line:
            dns_match = re.search(r'dnsmasq\[\d+\]:\s+(query|forwarded|reply|cached|NODATA-IPv[46])(?:\s)', line)
            if dns_match:
                entities["Action"].append(dns_match.group(1))

    # "TLS"/"VERIFY" followed by "OK" or "soft reset".
    vpn_actions = re.findall(r'(?:TLS|VERIFY)\s+(OK|soft\s+reset)(?:\s|$)', log_content)
    entities["Action"].extend(vpn_actions)

    # ---------------------------------------------------------------- IPAddress
    # Dotted quads introduced by "from", "to" or "is".
    for line in lines:
        ip_from = re.findall(r'from\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?:\s|$|:)', line)
        entities["IPAddress"].extend(ip_from)
        ip_to = re.findall(r'to\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?:\s|$)', line)
        entities["IPAddress"].extend(ip_to)
        ip_is = re.findall(r'is\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?:\s|$)', line)
        entities["IPAddress"].extend(ip_is)

    # "<name>/<ip>:" pairs.
    vpn_ips = re.findall(r'[a-zA-Z0-9]+/(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):', log_content)
    entities["IPAddress"].extend(vpn_ips)

    # SRC=<ip> and DST=<ip> fields.
    src_ips = re.findall(r'SRC=(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', log_content)
    entities["IPAddress"].extend(src_ips)
    dst_ips = re.findall(r'DST=(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', log_content)
    entities["IPAddress"].extend(dst_ips)

    # ------------------------------------------------------------------ DNSName
    # Dotted name after query[TYPE]/forwarded/reply and before from/to/is
    # (only the first hit per dnsmasq line).
    for line in lines:
        if "dnsmasq" in line:
            dns_match = re.search(r'(?:query\[[A-Z]+\]|forwarded|reply)\s+([-a-zA-Z0-9.*_/]+(?:\.[a-zA-Z0-9.*_/-]+)+)(?:\s+from|\s+to|\s+is)', line)
            if dns_match:
                entities["DNSName"].append(dns_match.group(1))

    # ----------------------------------------------------------------- Username
    # user="<name>" / acct="<name>"
    usernames_quoted = re.findall(r'(?:user|acct)=\"([^\"]+)\"', log_content)
    entities["Username"].extend(usernames_quoted)

    # "<name>/<ip>:" pairs (name part).
    vpn_users = re.findall(r'(\w+)/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:', log_content)
    entities["Username"].extend(vpn_users)

    # "for user <name>"
    usernames_session = re.findall(r'for\s+user\s+(\w+)(?:\s|$|by)', log_content)
    entities["Username"].extend(usernames_session)

    # "Accepted <method> for <name> from"
    usernames_ssh = re.findall(r'Accepted\s+\w+\s+for\s+(\w+)\s+from', log_content)
    entities["Username"].extend(usernames_ssh)

    # --------------------------------------------------------------- TimeServer
    # "<ip>:123 (<name>)"
    time_servers = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):123\s+\(([^)]+)\)', log_content)
    for ip, name in time_servers:
        entities["TimeServer"].append(f"{ip}:123 ({name})")

    # -------------------------------------------------------------------- Ports
    # NOTE(review): the ":<digits>" alternative also matches the seconds
    # field of an "HH:MM:SS " timestamp; kept as-is to preserve results.
    port_patterns = re.findall(r'(?:port\s+|:)(\d+)(?:\s|$|,|\))', log_content)
    entities["Port"].extend(port_patterns)

    source_ports = re.findall(r'SPT=(\d+)', log_content)
    entities["SourcePort"].extend(source_ports)

    dest_ports = re.findall(r'DPT=(\d+)', log_content)
    entities["DestinationPort"].extend(dest_ports)

    # ----------------------------------------------------------------- Protocol
    # Value must start with a letter, so purely numeric protocols are skipped.
    protocols = re.findall(r'(?:PROTO=|protocol\s+)([a-zA-Z]+\d*)', log_content)
    entities["Protocol"].extend(protocols)

    # Keyword search is case-insensitive and records the keyword as spelled
    # in this list.
    # NOTE(review): "tcp" and "TCP" are both listed, so any mention of TCP
    # yields both spellings; kept as-is to preserve results.
    for proto in ["tcp", "udp", "icmp", "TCP", "IPv4", "IPv6"]:
        if re.search(r'\b' + re.escape(proto) + r'\b', log_content, re.IGNORECASE):
            entities["Protocol"].append(proto)

    # ------------------------------------------------------- Interface / Subnet
    interfaces = re.findall(r'(?:interface|dev)\s+(ens\d+|eth\d+|wlan\d+|lo)', log_content)
    entities["Interface"].extend(interfaces)

    interface_types = re.findall(r'(?:zone|type)\s+(inet|lan|dmz|wan)', log_content, re.IGNORECASE)
    entities["InterfaceType"].extend(interface_types)

    subnets = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/\d{1,2})', log_content)
    entities["Subnet"].extend(subnets)

    # ---------------------------------------------------- Firewall-style fields
    rules = re.findall(r'(DNAT|ACCEPT|REJECT|DROP|Policy)\s', log_content)
    entities["Rule"].extend(rules)

    ttls = re.findall(r'TTL=(\d+)', log_content)
    entities["TTL"].extend(ttls)

    # Six hex octets separated by ":" or "-".
    macs = re.findall(r'((?:[0-9a-fA-F]{2}:){5}[0-9a-fA-F]{2}|(?:[0-9a-fA-F]{2}-){5}[0-9a-fA-F]{2})', log_content)
    entities["MAC"].extend(macs)

    flags = re.findall(r'(?:flags|FLAG)\s+(\w+)', log_content)
    entities["Flags"].extend(flags)

    # Case-sensitive whole-word search for the listed flag names.
    for flag in ["RST", "DF", "ACK", "SYN", "FIN", "PSH", "URG"]:
        if re.search(r'\b' + re.escape(flag) + r'\b', log_content):
            entities["Flags"].append(flag)

    # ------------------------------------------------- Hardware / kernel tokens
    cpu_info = re.findall(r'(Intel GenuineIntel|AMD AuthenticAMD|Centaur CentaurHauls)', log_content)
    entities["CPU"].extend(cpu_info)

    # Word following a "mem 0x...-0x..." range.
    memory_info = re.findall(r'mem\s+0x[0-9a-f]+-0x[0-9a-f]+\s+([a-z]+)', log_content)
    entities["MemoryInfo"].extend(memory_info)

    hypervisor = re.findall(r'Hypervisor detected:\s+(\w+)', log_content)
    entities["Hypervisor"].extend(hypervisor)

    devices = re.findall(r'(?:device|component):\s+([a-zA-Z0-9_-]+)', log_content)
    entities["Device"].extend(devices)

    # re.escape keeps the keyword literal (e.g. the "-" in "PCI-DMA").
    for device in ["PCI-DMA", "ehci_hcd", "usb", "rtc_cmos", "virtio-pci", "i8042", "ata_piix"]:
        if re.search(r'\b' + re.escape(device) + r'\b', log_content):
            entities["Device"].append(device)

    filesystems = re.findall(r'(squashfs|ext4|xfs|btrfs):\s', log_content)
    entities["FileSystem"].extend(filesystems)

    for bus in ["PCI", "USB", "i2c", "PS/2", "Serial", "ATA", "SATA", "TUN/TAP"]:
        if re.search(r'\b' + re.escape(bus) + r'\b', log_content):
            entities["DataBus"].append(bus)

    # Bracketed "<digits>.<digits>" values.
    event_ids = re.findall(r'\[(\d+\.\d+)\]', log_content)
    entities["EventID"].extend(event_ids)

    # Text inside "CMD (...)".
    cmd_patterns = re.findall(r'CMD\s+\(([^)]+)\)', log_content)
    entities["CMD"].extend(cmd_patterns)

    # ----------------------------------------------------------------- FileName
    # Paths under /etc/.
    file_paths = re.findall(r'(/etc/[a-zA-Z0-9_/.-]+)', log_content)
    entities["FileName"].extend(file_paths)

    # Bare file names with one of the listed extensions (first hit per line).
    for line in lines:
        file_match = re.search(r'(?<!\S)([a-zA-Z0-9_-]+\.(?:xlsx|txt|java|log|csv|pdf|docx|cfg|conf))(?:\s|$|\.)', line)
        if file_match:
            entities["FileName"].append(file_match.group(1))

    # ------------------------------------------------ Severity / status / error
    # Case-insensitive whole-word search; the lower-case keyword is recorded.
    for severity in ["warning", "info", "error", "debug", "notice", "critical", "alert", "emergency"]:
        if re.search(r'\b' + re.escape(severity) + r'\b', log_content, re.IGNORECASE):
            entities["Severity"].append(severity)

    session_status = re.findall(r'session\s+(opened|closed)', log_content)
    entities["SessionStatus"].extend(session_status)

    # Text after "error:" / "failure:" / "failed:" up to a comma or newline.
    error_patterns = re.findall(r'(?:error|failure|failed):\s+([^,\n]+)', log_content, re.IGNORECASE)
    entities["Error"].extend(error_patterns)

    # Word following "Accepted".
    auth_types = re.findall(r'Accepted\s+(\w+)', log_content)
    entities["AuthenticationType"].extend(auth_types)

    # res=<word> terminated by whitespace or a quote.
    status_pam = re.findall(r'res=(\w+)(?:\s|\'|\")', log_content)
    entities["Status"].extend(status_pam)

    status_verify = re.findall(r'VERIFY\s+(OK|FAILED|KU OK|EKU OK)(?:\s|$)', log_content)
    entities["Status"].extend(status_verify)

    # ------------------------------------------------------------- ResourceType
    resource_types = re.findall(r'(?:resource|type):\s+([a-zA-Z0-9_-]+)', log_content)
    entities["ResourceType"].extend(resource_types)

    # JSON "system": {"cpu": ... present -> record "CPU".
    if re.search(r'\"system\":\s*\{\s*\"cpu\":', log_content):
        entities["ResourceType"].append("CPU")

    # ---------------------------------------------------------------- SessionID
    session_id_patterns = re.findall(r'session(?:_id|Id)=([a-zA-Z0-9-]+)(?:\s|,|$)', log_content)
    entities["SessionID"].extend(session_id_patterns)

    session_numbers = re.findall(r'session\s+(\d+)(?:\s|$)', log_content)
    entities["SessionID"].extend(session_numbers)

    ses_patterns = re.findall(r'ses=(\d+)(?:\s|,|$)', log_content)
    entities["SessionID"].extend(ses_patterns)

    # ------------------------------------------- Object / application / role
    object_patterns = re.findall(r'object=([a-zA-Z0-9_-]+)(?:\s|,|$)', log_content)
    entities["Object"].extend(object_patterns)

    unit_objects = re.findall(r'unit=([a-zA-Z0-9_-]+)(?:\s|,|$)', log_content)
    entities["Object"].extend(unit_objects)

    app_specific_patterns = re.findall(r'app=([a-zA-Z0-9_-]+)(?:\s|,|$)', log_content)
    entities["ApplicationSpecific"].extend(app_specific_patterns)

    # Plain, case-sensitive substring checks.
    if "OpenVPN" in log_content:
        entities["ApplicationSpecific"].append("OpenVPN")
    if "metricbeat" in log_content:
        entities["ApplicationSpecific"].append("metricbeat")

    role_patterns = re.findall(r'role=([a-zA-Z0-9_-]+)(?:\s|,|$)', log_content)
    entities["Role"].extend(role_patterns)

    # Text inside metadata={...}
    metadata_patterns = re.findall(r'metadata=\{([^}]+)\}', log_content)
    entities["Metadata"].extend(metadata_patterns)

    # ------------------------------------------------------------ ResourceUsage
    # JSON "cpu": {"pct": <number>; the captured number is emitted verbatim
    # with a "%" suffix.
    cpu_usage = re.findall(r'\"cpu\":\s*{\s*\"pct\":\s*([0-9.]+)', log_content)
    if cpu_usage:
        entities["ResourceUsage"].extend([f"CPU: {usage}%" for usage in cpu_usage])

    # Drop empty entity types and de-duplicate the rest. dict.fromkeys keeps
    # first-seen order, so the result is identical from run to run, unlike
    # list(set(...)), whose order depends on string hash randomization.
    return {
        entity_type: list(dict.fromkeys(values))
        for entity_type, values in entities.items()
        if values
    }
|