import re import json def extract_entities(log_content): # Initialize entity dictionaries entities = { "DateTime": [], "System": [], "Service": [], "Process": [], "Action": [], "IPAddress": [], "DNSName": [], "Username": [], "Role": [], "Metadata": [], "Status": [], "Error": [], "Severity": [], "SessionID": [], "SessionStatus": [], "FileName": [], "Object": [], "ApplicationSpecific": [], "AuthenticationType": [], "ResourceType": [], "ResourceUsage": [], # Add new entity types "TimeServer": [], "Port": [], "SourcePort": [], "DestinationPort": [], "Protocol": [], "Interface": [], "InterfaceType": [], "Subnet": [], "Rule": [], "TTL": [], "MAC": [], "Flags": [], "CPU": [], "MemoryInfo": [], "Hypervisor": [], "Device": [], "FileSystem": [], "DataBus": [], "EventID": [], "CMD": [] } # Process log content line by line to maintain proper context lines = log_content.split('\n') # DateTime patterns with clear start/end markers # ISO timestamps in JSON - starts with @timestamp": " and ends with " iso_timestamps = re.findall(r'@timestamp\":\s*\"([^\"]+)\"', log_content) entities["DateTime"].extend(iso_timestamps) # Log timestamps - starts with year or month and ends with space for line in lines: # YYYY-MM-DD HH:MM:SS format (starts with digit, ends with space) date_match = re.search(r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s', line) if date_match: entities["DateTime"].append(date_match.group(1)) # Month DD HH:MM:SS format (starts with month name, ends with space) syslog_match = re.match(r'^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+(\d{1,2})\s+(\d{2}:\d{2}:\d{2})\s', line) if syslog_match: month, day, time = syslog_match.groups() entities["DateTime"].append(f"{month} {day} {time}") # Audit timestamps - starts with msg=audit( and ends with : audit_timestamps = re.findall(r'msg=audit\((\d+\.\d+):', log_content) entities["DateTime"].extend(audit_timestamps) # System/Node patterns - with clear start/end markers # Extract hostname from JSON - starts with "hostname": " and ends with " hostnames = re.findall(r'\"hostname\":\s*\"([^\"]+)\"', log_content) entities["System"].extend(hostnames) # Extract hostname from JSON host field - starts with "host": {"name": " and ends with " host_names = re.findall(r'\"host\":\s*\{[^}]*\"name\":\s*\"([^\"]+)\"', log_content) entities["System"].extend(host_names) # Extract hostname from syslog format - after timestamp and before service for line in lines: syslog_match = re.match(r'^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+([^\s:]+)\s', line) if syslog_match and syslog_match.group(1) not in ["?", "-"]: entities["System"].append(syslog_match.group(1)) # Service patterns - with clear start/end markers # Extract service from syslog format - after hostname and before colon or bracket for line in lines: service_match = re.search(r'^\w+\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+[^\s]+\s+([^:\[\s]+)(?:\[\d+\])?:', line) if service_match and service_match.group(1) not in ["?", "-"]: entities["Service"].append(service_match.group(1)) # Extract service from JSON - starts with "service": {"type": " and ends with " service_types = re.findall(r'\"service\":\s*\{\s*\"type\":\s*\"([^\"]+)\"', log_content) entities["Service"].extend(service_types) # Extract agent types - starts with "agent": and contains "type": " and ends with " agent_types = re.findall(r'\"agent\":[^}]*\"type\":\s*\"([^\"]+)\"', log_content) entities["Service"].extend(agent_types) # Process IDs - with clear start/end markers # Extract PIDs from brackets - starts with [ and ends with ] for line in lines: pid_matches = re.findall(r'(?:sshd|dnsmasq|cron|systemd|openvpn|metricbeat)\[(\d+)\]', line) entities["Process"].extend(pid_matches) # Extract PIDs from audit logs - starts with pid= and ends with space pid_audit = re.findall(r'pid=(\d+)\s', log_content) entities["Process"].extend(pid_audit) # Action patterns - with clear start/end markers # Session actions - starts with session and ends with for user or space session_actions = re.findall(r'session\s+(opened|closed)(?:\s+for\s+user|\s)', log_content) entities["Action"].extend(session_actions) # DNS actions - starts with dnsmasq[PID]: and ends with space for line in lines: if "dnsmasq" in line: dns_match = re.search(r'dnsmasq\[\d+\]:\s+(query|forwarded|reply|cached|NODATA-IPv[46])(?:\s)', line) if dns_match: entities["Action"].append(dns_match.group(1)) # VPN actions - starts with clear identifier and ends with space vpn_actions = re.findall(r'(?:TLS|VERIFY)\s+(OK|soft\s+reset)(?:\s|$)', log_content) entities["Action"].extend(vpn_actions) # IP Address patterns - with clear start/end markers # Find IPs with context - starts with from/to/is and ends with space or port for line in lines: ip_from = re.findall(r'from\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?:\s|$|:)', line) entities["IPAddress"].extend(ip_from) ip_to = re.findall(r'to\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?:\s|$)', line) entities["IPAddress"].extend(ip_to) ip_is = re.findall(r'is\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?:\s|$)', line) entities["IPAddress"].extend(ip_is) # Find IPs in VPN logs - starts with username/ and ends with : vpn_ips = re.findall(r'[a-zA-Z0-9]+/(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):', log_content) entities["IPAddress"].extend(vpn_ips) # Extract Source IP more comprehensively src_ips = re.findall(r'SRC=(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', log_content) entities["IPAddress"].extend(src_ips) # Extract Destination IP more comprehensively dst_ips = re.findall(r'DST=(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', log_content) entities["IPAddress"].extend(dst_ips) # DNS Name patterns - with clear start/end markers # Find domains in DNS queries - starts with query/forwarded/reply and ends with from/to/is for line in lines: if "dnsmasq" in line: dns_match = re.search(r'(?:query\[[A-Z]+\]|forwarded|reply)\s+([-a-zA-Z0-9.*_/]+(?:\.[a-zA-Z0-9.*_/-]+)+)(?:\s+from|\s+to|\s+is)', line) if dns_match: entities["DNSName"].append(dns_match.group(1)) # Username patterns - with clear start/end markers # Extract usernames from quotes - starts with user=" or acct=" and ends with " usernames_quoted = re.findall(r'(?:user|acct)=\"([^\"]+)\"', log_content) entities["Username"].extend(usernames_quoted) # Extract usernames from VPN logs - username before slash and IP vpn_users = re.findall(r'(\w+)/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:', log_content) entities["Username"].extend(vpn_users) # Extract usernames from session logs - starts with for user and ends with space or by usernames_session = re.findall(r'for\s+user\s+(\w+)(?:\s|$|by)', log_content) entities["Username"].extend(usernames_session) # Extract usernames from SSH logs - starts with Accepted type for and ends with from usernames_ssh = re.findall(r'Accepted\s+\w+\s+for\s+(\w+)\s+from', log_content) entities["Username"].extend(usernames_ssh) # Time Server patterns - NTP servers with port 123 time_servers = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):123\s+\(([^)]+)\)', log_content) for ip, name in time_servers: entities["TimeServer"].append(f"{ip}:123 ({name})") # Port patterns - explicit port mentions port_patterns = re.findall(r'(?:port\s+|:)(\d+)(?:\s|$|,|\))', log_content) entities["Port"].extend(port_patterns) # Source Port patterns source_ports = re.findall(r'SPT=(\d+)', log_content) entities["SourcePort"].extend(source_ports) # Destination Port patterns dest_ports = re.findall(r'DPT=(\d+)', log_content) entities["DestinationPort"].extend(dest_ports) # Protocol patterns protocols = re.findall(r'(?:PROTO=|protocol\s+)([a-zA-Z]+\d*)', log_content) # Modified to avoid numeric-only protocols entities["Protocol"].extend(protocols) # Add common protocols if mentioned for proto in ["tcp", "udp", "icmp", "TCP", "IPv4", "IPv6"]: if re.search(r'\b' + proto + r'\b', log_content, re.IGNORECASE): entities["Protocol"].append(proto) # Interface patterns interfaces = re.findall(r'(?:interface|dev)\s+(ens\d+|eth\d+|wlan\d+|lo)', log_content) entities["Interface"].extend(interfaces) # Interface Type patterns interface_types = re.findall(r'(?:zone|type)\s+(inet|lan|dmz|wan)', log_content, re.IGNORECASE) entities["InterfaceType"].extend(interface_types) # Subnet patterns subnets = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/\d{1,2})', log_content) entities["Subnet"].extend(subnets) # Rule patterns rules = re.findall(r'(DNAT|ACCEPT|REJECT|DROP|Policy)\s', log_content) entities["Rule"].extend(rules) # TTL patterns ttls = re.findall(r'TTL=(\d+)', log_content) entities["TTL"].extend(ttls) # MAC Address patterns macs = re.findall(r'((?:[0-9a-fA-F]{2}:){5}[0-9a-fA-F]{2}|(?:[0-9a-fA-F]{2}-){5}[0-9a-fA-F]{2})', log_content) entities["MAC"].extend(macs) # Flags patterns flags = re.findall(r'(?:flags|FLAG)\s+(\w+)', log_content) entities["Flags"].extend(flags) # Add specific flags if found for flag in ["RST", "DF", "ACK", "SYN", "FIN", "PSH", "URG"]: if re.search(r'\b' + flag + r'\b', log_content): entities["Flags"].append(flag) # CPU patterns cpu_info = re.findall(r'(Intel GenuineIntel|AMD AuthenticAMD|Centaur CentaurHauls)', log_content) entities["CPU"].extend(cpu_info) # Memory Info patterns memory_info = re.findall(r'mem\s+0x[0-9a-f]+-0x[0-9a-f]+\s+([a-z]+)', log_content) entities["MemoryInfo"].extend(memory_info) # Hypervisor patterns hypervisor = re.findall(r'Hypervisor detected:\s+(\w+)', log_content) entities["Hypervisor"].extend(hypervisor) # Device/Component patterns devices = re.findall(r'(?:device|component):\s+([a-zA-Z0-9_-]+)', log_content) entities["Device"].extend(devices) # Add common devices if found for device in ["PCI-DMA", "ehci_hcd", "usb", "rtc_cmos", "virtio-pci", "i8042", "ata_piix"]: if re.search(r'\b' + device + r'\b', log_content): entities["Device"].append(device) # File System patterns filesystems = re.findall(r'(squashfs|ext4|xfs|btrfs):\s', log_content) entities["FileSystem"].extend(filesystems) # Data Bus patterns for bus in ["PCI", "USB", "i2c", "PS/2", "Serial", "ATA", "SATA", "TUN/TAP"]: if re.search(r'\b' + bus + r'\b', log_content): entities["DataBus"].append(bus) # Event ID patterns event_ids = re.findall(r'\[(\d+\.\d+)\]', log_content) entities["EventID"].extend(event_ids) # CMD patterns cmd_patterns = re.findall(r'CMD\s+\(([^)]+)\)', log_content) entities["CMD"].extend(cmd_patterns) # Enhanced file path detection file_paths = re.findall(r'(/etc/[a-zA-Z0-9_/.-]+)', log_content) entities["FileName"].extend(file_paths) # File name patterns - with clear file extensions for line in lines: # Look for common file extensions - starts with word character and ends with known extension file_match = re.search(r'(?