Fix log parsing: JSON arrays, Zeek format, and beacon detection priority
Browse files- c2sentinel.py +95 -16
c2sentinel.py
CHANGED
|
@@ -1051,14 +1051,18 @@ class LogParser:
|
|
| 1051 |
def parse_zeek_conn(log_line: str) -> Optional[Dict]:
|
| 1052 |
"""Parse Zeek/Bro conn.log format."""
|
| 1053 |
try:
|
|
|
|
|
|
|
|
|
|
| 1054 |
parts = log_line.strip().split('\t')
|
| 1055 |
-
|
|
|
|
| 1056 |
return {
|
| 1057 |
'timestamp': float(parts[0]),
|
| 1058 |
'src_ip': parts[2],
|
| 1059 |
-
'src_port': int(parts[3]),
|
| 1060 |
'dst_ip': parts[4],
|
| 1061 |
-
'dst_port': int(parts[5]),
|
| 1062 |
'protocol': parts[6],
|
| 1063 |
'duration': float(parts[8]) if parts[8] != '-' else 0,
|
| 1064 |
'bytes_sent': int(parts[9]) if parts[9] != '-' else 0,
|
|
@@ -1518,6 +1522,7 @@ class C2Sentinel:
|
|
| 1518 |
# PHASE 4: Behavioral refinement
|
| 1519 |
# ================================================================
|
| 1520 |
|
|
|
|
| 1521 |
dst_ips = set(conn.get('dst_ip', '') for conn in connections)
|
| 1522 |
bytes_recv = [conn.get('bytes_recv', 0) for conn in connections]
|
| 1523 |
bytes_sent = [conn.get('bytes_sent', 0) for conn in connections]
|
|
@@ -1567,15 +1572,63 @@ class C2Sentinel:
|
|
| 1567 |
c2_prob = min(1.0, c2_prob * 1.5)
|
| 1568 |
result.risk_factors.append("APT-style slow beacon pattern")
|
| 1569 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1570 |
# ================================================================
|
| 1571 |
-
# PHASE 5: Apply legitimate pattern discount
|
| 1572 |
# ================================================================
|
| 1573 |
|
| 1574 |
if matched_pattern and pattern_confidence > 0.5:
|
| 1575 |
-
#
|
| 1576 |
-
|
| 1577 |
-
|
| 1578 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1579 |
|
| 1580 |
# ================================================================
|
| 1581 |
# PHASE 6: Apply context inference (always check whitelist/blacklist)
|
|
@@ -1667,14 +1720,40 @@ class C2Sentinel:
|
|
| 1667 |
) -> List[Dict]:
|
| 1668 |
"""Analyze raw log lines for C2 activity."""
|
| 1669 |
connections = []
|
| 1670 |
-
|
| 1671 |
-
|
| 1672 |
-
|
| 1673 |
-
|
| 1674 |
-
|
| 1675 |
-
|
| 1676 |
-
|
| 1677 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1678 |
|
| 1679 |
if not connections:
|
| 1680 |
return []
|
|
|
|
| 1051 |
def parse_zeek_conn(log_line: str) -> Optional[Dict]:
|
| 1052 |
"""Parse Zeek/Bro conn.log format."""
|
| 1053 |
try:
|
| 1054 |
+
# Skip header lines
|
| 1055 |
+
if log_line.startswith('#'):
|
| 1056 |
+
return None
|
| 1057 |
parts = log_line.strip().split('\t')
|
| 1058 |
+
# Minimum fields: ts, uid, orig_h, orig_p, resp_h, resp_p, proto, service, duration, orig_bytes, resp_bytes
|
| 1059 |
+
if len(parts) >= 11:
|
| 1060 |
return {
|
| 1061 |
'timestamp': float(parts[0]),
|
| 1062 |
'src_ip': parts[2],
|
| 1063 |
+
'src_port': int(parts[3]) if parts[3] != '-' else 0,
|
| 1064 |
'dst_ip': parts[4],
|
| 1065 |
+
'dst_port': int(parts[5]) if parts[5] != '-' else 0,
|
| 1066 |
'protocol': parts[6],
|
| 1067 |
'duration': float(parts[8]) if parts[8] != '-' else 0,
|
| 1068 |
'bytes_sent': int(parts[9]) if parts[9] != '-' else 0,
|
|
|
|
| 1522 |
# PHASE 4: Behavioral refinement
|
| 1523 |
# ================================================================
|
| 1524 |
|
| 1525 |
+
beacon_indicators = 0 # Initialize here so it's always defined
|
| 1526 |
dst_ips = set(conn.get('dst_ip', '') for conn in connections)
|
| 1527 |
bytes_recv = [conn.get('bytes_recv', 0) for conn in connections]
|
| 1528 |
bytes_sent = [conn.get('bytes_sent', 0) for conn in connections]
|
|
|
|
| 1572 |
c2_prob = min(1.0, c2_prob * 1.5)
|
| 1573 |
result.risk_factors.append("APT-style slow beacon pattern")
|
| 1574 |
|
| 1575 |
+
# ============================================================
|
| 1576 |
+
# CRITICAL: Explicit beacon pattern override
|
| 1577 |
+
# When at least 4 of the 5 classic beacon indicators are present, force detection
|
| 1578 |
+
# ============================================================
|
| 1579 |
+
beacon_indicators = 0
|
| 1580 |
+
|
| 1581 |
+
# Indicator 1: Very regular timing (CV < 0.15)
|
| 1582 |
+
if interval_cv < 0.15:
|
| 1583 |
+
beacon_indicators += 1
|
| 1584 |
+
|
| 1585 |
+
# Indicator 2: Very consistent sizes (both sent and recv CV < 0.15)
|
| 1586 |
+
if recv_cv < 0.15 and sent_cv < 0.15:
|
| 1587 |
+
beacon_indicators += 1
|
| 1588 |
+
|
| 1589 |
+
# Indicator 3: Small packet sizes (typical heartbeat)
|
| 1590 |
+
mean_sent = np.mean(bytes_sent) if bytes_sent else 0
|
| 1591 |
+
mean_recv = np.mean(bytes_recv) if bytes_recv else 0
|
| 1592 |
+
if mean_sent < 500 and mean_recv < 500:
|
| 1593 |
+
beacon_indicators += 1
|
| 1594 |
+
|
| 1595 |
+
# Indicator 4: Mean interval falls in the typical beacon range (5s - 300s)
|
| 1596 |
+
if 5 <= mean_interval <= 300:
|
| 1597 |
+
beacon_indicators += 1
|
| 1598 |
+
|
| 1599 |
+
# Indicator 5: Sufficient sample size
|
| 1600 |
+
if len(connections) >= 8:
|
| 1601 |
+
beacon_indicators += 1
|
| 1602 |
+
|
| 1603 |
+
# If 4+ of 5 indicators present, this is almost certainly C2
|
| 1604 |
+
if beacon_indicators >= 4:
|
| 1605 |
+
c2_prob = max(c2_prob, 0.85) # Force high probability
|
| 1606 |
+
result.risk_factors.append(f"Classic C2 beacon pattern detected ({beacon_indicators}/5 indicators)")
|
| 1607 |
+
result.detection_method = DetectionMethod.BEHAVIORAL.value
|
| 1608 |
+
elif beacon_indicators >= 3:
|
| 1609 |
+
c2_prob = max(c2_prob, 0.65) # Likely C2
|
| 1610 |
+
result.risk_factors.append(f"Probable C2 beacon pattern ({beacon_indicators}/5 indicators)")
|
| 1611 |
+
|
| 1612 |
# ================================================================
|
| 1613 |
+
# PHASE 5: Apply legitimate pattern discount (but respect strong beacon signals)
|
| 1614 |
# ================================================================
|
| 1615 |
|
| 1616 |
if matched_pattern and pattern_confidence > 0.5:
|
| 1617 |
+
# If strong beacon indicators are present, skip or reduce the legitimate pattern discount
|
| 1618 |
+
# This prevents false negatives when C2 mimics legitimate patterns
|
| 1619 |
+
if beacon_indicators >= 4:
|
| 1620 |
+
# Strong C2 signal - don't discount
|
| 1621 |
+
result.mitigating_factors.append(f"Matches {matched_pattern.name} pattern but beacon indicators override")
|
| 1622 |
+
elif beacon_indicators >= 3:
|
| 1623 |
+
# Moderate C2 signal - apply reduced discount
|
| 1624 |
+
discount = 1.0 - (pattern_confidence * 0.3) # Max 30% reduction
|
| 1625 |
+
c2_prob *= discount
|
| 1626 |
+
result.mitigating_factors.append(f"Legitimate pattern match reduces probability by {(1-discount)*100:.0f}%")
|
| 1627 |
+
else:
|
| 1628 |
+
# Weak C2 signal - apply full discount
|
| 1629 |
+
discount = 1.0 - (pattern_confidence * 0.7) # Up to 70% reduction
|
| 1630 |
+
c2_prob *= discount
|
| 1631 |
+
result.mitigating_factors.append(f"Legitimate pattern match reduces probability by {(1-discount)*100:.0f}%")
|
| 1632 |
|
| 1633 |
# ================================================================
|
| 1634 |
# PHASE 6: Apply context inference (always check whitelist/blacklist)
|
|
|
|
| 1720 |
) -> List[Dict]:
|
| 1721 |
"""Analyze raw log lines for C2 activity."""
|
| 1722 |
connections = []
|
| 1723 |
+
|
| 1724 |
+
# First try to parse as a complete JSON array
|
| 1725 |
+
full_content = ''.join(log_lines)
|
| 1726 |
+
try:
|
| 1727 |
+
data = json.loads(full_content)
|
| 1728 |
+
if isinstance(data, list):
|
| 1729 |
+
for item in data:
|
| 1730 |
+
if isinstance(item, dict):
|
| 1731 |
+
conn = {
|
| 1732 |
+
'timestamp': item.get('timestamp', item.get('@timestamp', 0)),
|
| 1733 |
+
'src_ip': item.get('src_ip', item.get('source_ip', '')),
|
| 1734 |
+
'dst_ip': item.get('dst_ip', item.get('dest_ip', '')),
|
| 1735 |
+
'src_port': int(item.get('src_port', item.get('source_port', 0))),
|
| 1736 |
+
'dst_port': int(item.get('dst_port', item.get('dest_port', 0))),
|
| 1737 |
+
'protocol': item.get('protocol', 'tcp'),
|
| 1738 |
+
'bytes_sent': int(item.get('bytes_sent', item.get('bytes_out', 0))),
|
| 1739 |
+
'bytes_recv': int(item.get('bytes_recv', item.get('bytes_in', 0))),
|
| 1740 |
+
'duration': float(item.get('duration', 0))
|
| 1741 |
+
}
|
| 1742 |
+
if conn.get('dst_ip'):
|
| 1743 |
+
connections.append(conn)
|
| 1744 |
+
except (json.JSONDecodeError, TypeError):
|
| 1745 |
+
pass
|
| 1746 |
+
|
| 1747 |
+
# Fall back to line-by-line parsing
|
| 1748 |
+
if not connections:
|
| 1749 |
+
for line in log_lines:
|
| 1750 |
+
conn = self.log_parser.parse_json(line)
|
| 1751 |
+
if not conn:
|
| 1752 |
+
conn = self.log_parser.parse_zeek_conn(line)
|
| 1753 |
+
if not conn:
|
| 1754 |
+
conn = self.log_parser.parse_syslog(line)
|
| 1755 |
+
if conn:
|
| 1756 |
+
connections.append(conn)
|
| 1757 |
|
| 1758 |
if not connections:
|
| 1759 |
return []
|