"""
Pattern-based security vulnerability detector using regular expressions.

Detects hardcoded secrets, credentials, and sensitive information patterns.
"""

import re
from typing import Dict, List, Any

# Security patterns with regex, severity, and descriptions.
# Every regex is applied line-by-line with re.IGNORECASE by scan_patterns().
SECURITY_PATTERNS = {
    "aws_access_key": {
        "regex": r"(?:AWS_ACCESS_KEY_ID|aws_access_key_id)\s*[:=]\s*['\"]?(AKIA[0-9A-Z]{16})['\"]?",
        "severity": "CRITICAL",
        "title": "Hardcoded AWS Access Key detected",
        "description": "AWS Access Key is hardcoded in the source code.",
    },
    "aws_secret_key": {
        "regex": r"(?:AWS_SECRET_ACCESS_KEY|aws_secret_access_key)\s*[:=]\s*['\"]?([A-Za-z0-9/+=]{40})['\"]?",
        "severity": "CRITICAL",
        "title": "Hardcoded AWS Secret Key detected",
        "description": "AWS Secret Access Key is hardcoded in the source code.",
    },
    "api_key": {
        "regex": r"(?:api[_-]?key|apikey|api[_-]?secret)\s*[:=]\s*['\"]([a-zA-Z0-9_\-]{20,})['\"]",
        "severity": "HIGH",
        "title": "Hardcoded API key detected",
        "description": "API key is directly hardcoded in the source code.",
    },
    "github_token": {
        "regex": r"\b(gh[ps]_[a-zA-Z0-9]{36,})\b",
        "severity": "HIGH",
        "title": "GitHub Personal Access Token detected",
        "description": "GitHub personal access token is exposed in the source code.",
    },
    "jwt_token": {
        "regex": r"\b(eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]+)\b",
        "severity": "HIGH",
        "title": "Hardcoded JWT token detected",
        "description": "JWT token is hardcoded in the source code.",
    },
    "password": {
        "regex": r"(?:password|passwd|pwd)\s*[:=]\s*['\"]([^'\"]{4,})['\"]",
        "severity": "MEDIUM",
        "title": "Hardcoded password detected",
        "description": "Password is directly written in the source code.",
    },
    "ssn": {
        "regex": r"\b(\d{6}[-]\d{7})\b",
        "severity": "MEDIUM",
        "title": "Social Security Number pattern detected",
        "description": "Data matching SSN format found in the source code.",
    },
    "credit_card": {
        "regex": r"\b(\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4})\b",
        "severity": "MEDIUM",
        "title": "Credit card number pattern detected",
        "description": "Data matching credit card number format found.",
    },
    "phone_number": {
        "regex": r"\b(0\d{1,2}[-\s]?\d{3,4}[-\s]?\d{4})\b",
        "severity": "LOW",
        "title": "Phone number pattern detected",
        "description": "Phone number is included in the source code.",
    },
    "database_url": {
        "regex": r"(?:postgresql|mysql|mongodb|redis)://([^:]+):([^@]+)@",
        "severity": "CRITICAL",
        "title": "Database connection string contains credentials",
        "description": "Database connection string includes username and password.",
    },
    "private_key": {
        "regex": r"-----BEGIN (?:RSA |EC |DSA )?PRIVATE KEY-----",
        "severity": "CRITICAL",
        "title": "Hardcoded private key detected",
        "description": "Cryptographic private key is directly included in the source code.",
    },
}

# Substrings that mark a line as an example/test value. They are compared
# against the lower-cased line, so every entry MUST be lower case (upper-case
# entries such as "TODO" could never match).
_TEST_INDICATORS = (
    "example",
    "test",
    "dummy",
    "fake",
    "sample",
    "mock",
    "placeholder",
    "todo",
    "fixme",
    "xxx",
    "000",
)

# Display limit for the code snippet attached to each finding.
_MAX_SNIPPET_LENGTH = 100


def is_false_positive(line: str, pattern_type: str) -> bool:
    """
    Check if a detected pattern is likely a false positive.

    Args:
        line: The line of code containing the match
        pattern_type: Type of pattern detected (a key of SECURITY_PATTERNS)

    Returns:
        True if likely a false positive, False otherwise
    """
    # Lines that are entirely a "#" comment are always skipped.
    if line.strip().startswith("#"):
        return True

    # Skip obvious example/test values (case-insensitive substring match).
    line_lower = line.lower()
    if any(indicator in line_lower for indicator in _TEST_INDICATORS):
        return True

    # Pattern-specific false positive checks
    if pattern_type == "credit_card":
        # Common false positive: date ranges, version numbers (year-like digits)
        return re.search(r"(19|20)\d{2}", line) is not None

    if pattern_type == "phone_number":
        # Skip if looks like a date or other numeric pattern
        return "date" in line_lower or "time" in line_lower

    if pattern_type == "password":
        # Skip if the line ends in an empty assignment (no actual password)
        return re.search(r'password\s*[:=]\s*["\']?\s*["\']?$', line) is not None

    return False


def _build_snippet(line: str) -> str:
    """Return the stripped line, truncated to the display limit with '...'."""
    snippet = line.strip()
    if len(snippet) > _MAX_SNIPPET_LENGTH:
        snippet = snippet[: _MAX_SNIPPET_LENGTH - 3] + "..."
    return snippet


def scan_patterns(file_path: str, code: str) -> List[Dict[str, Any]]:
    """
    Scan code for security vulnerability patterns.

    The code is split on "\\n" and every line is tested against every regex
    in SECURITY_PATTERNS (case-insensitively). One entry is produced per
    regex match, so a line with two matches of the same pattern yields two
    entries.

    NOTE: "code_snippet" holds the stripped (and possibly truncated) source
    line as-is; the secret value is NOT masked, so reports must be treated
    as sensitive.

    Args:
        file_path: Path to the file being scanned (for reference)
        code: Source code to scan

    Returns:
        List of vulnerability dictionaries
    """
    # Compile once per call instead of once per line; compiling here (rather
    # than at import time) keeps runtime changes to SECURITY_PATTERNS visible.
    compiled_patterns = [
        (name, info, re.compile(info["regex"], re.IGNORECASE))
        for name, info in SECURITY_PATTERNS.items()
    ]

    vulnerabilities: List[Dict[str, Any]] = []
    for line_num, line in enumerate(code.split("\n"), start=1):
        for pattern_name, pattern_info, pattern in compiled_patterns:
            match_count = sum(1 for _ in pattern.finditer(line))
            if not match_count:
                continue

            # The false-positive verdict depends only on the line and the
            # pattern type, so evaluate it once rather than once per match.
            if is_false_positive(line, pattern_name):
                continue

            code_snippet = _build_snippet(line)
            for _ in range(match_count):
                vulnerabilities.append(
                    {
                        "id": f"pattern-{pattern_name}",
                        "severity": pattern_info["severity"],
                        "title": pattern_info["title"],
                        "description": pattern_info["description"],
                        "line_number": line_num,
                        "code_snippet": code_snippet,
                        "pattern_type": pattern_name,
                        "file_path": file_path,
                        "scanner": "pattern_detector",
                    }
                )

    return vulnerabilities


def get_pattern_info(pattern_type: str) -> Dict[str, str]:
    """
    Get information about a specific pattern type.

    Args:
        pattern_type: Type of security pattern

    Returns:
        The SECURITY_PATTERNS entry for the pattern, or a generic MEDIUM
        severity fallback (without a "regex" key) when the type is unknown
    """
    return SECURITY_PATTERNS.get(
        pattern_type,
        {
            "severity": "MEDIUM",
            "title": "Security pattern detected",
            "description": "Unknown security pattern found.",
        },
    )


def list_available_patterns() -> List[str]:
    """
    List all available security patterns.

    Returns:
        List of pattern names, in SECURITY_PATTERNS definition order
    """
    return list(SECURITY_PATTERNS.keys())


def get_patterns_by_severity(severity: str) -> List[str]:
    """
    Get patterns filtered by severity level.

    Args:
        severity: Severity level (CRITICAL, HIGH, MEDIUM, LOW); matched
            case-insensitively

    Returns:
        List of pattern names with matching severity
    """
    wanted = severity.upper()
    return [
        name for name, info in SECURITY_PATTERNS.items() if info["severity"] == wanted
    ]