Spaces:

MCP-1st-Birthday
/

simple-security-scanner

Running

File size: 7,546 Bytes

"""
Pattern-based security vulnerability detector using regular expressions.

Detects hardcoded secrets, credentials, and sensitive information patterns.
"""

import re
from typing import Dict, List, Any

# Security patterns with regex, severity, and descriptions.
#
# Each entry maps a pattern name to a dict with four keys:
#   regex       - pattern searched for in a single line of source text
#   severity    - one of CRITICAL / HIGH / MEDIUM / LOW
#   title       - short headline for a finding
#   description - one-sentence explanation of the finding
#
# The regexes are written to be applied line by line; several of them rely
# on case-insensitive matching for the keyword part (e.g. "API_KEY").
SECURITY_PATTERNS = {
    "aws_access_key": {
        "regex": r"(?:AWS_ACCESS_KEY_ID|aws_access_key_id)\s*[:=]\s*['\"]?(AKIA[0-9A-Z]{16})['\"]?",
        "severity": "CRITICAL",
        "title": "Hardcoded AWS Access Key detected",
        "description": "AWS Access Key is hardcoded in the source code.",
    },
    "aws_secret_key": {
        "regex": r"(?:AWS_SECRET_ACCESS_KEY|aws_secret_access_key)\s*[:=]\s*['\"]?([A-Za-z0-9/+=]{40})['\"]?",
        "severity": "CRITICAL",
        "title": "Hardcoded AWS Secret Key detected",
        "description": "AWS Secret Access Key is hardcoded in the source code.",
    },
    "api_key": {
        "regex": r"(?:api[_-]?key|apikey|api[_-]?secret)\s*[:=]\s*['\"]([a-zA-Z0-9_\-]{20,})['\"]",
        "severity": "HIGH",
        "title": "Hardcoded API key detected",
        "description": "API key is directly hardcoded in the source code.",
    },
    "github_token": {
        # GitHub issues tokens with the prefixes ghp_ (personal), gho_ (OAuth),
        # ghu_ (user-to-server), ghs_ (server-to-server) and ghr_ (refresh).
        "regex": r"\b(gh[pousr]_[a-zA-Z0-9]{36,})\b",
        "severity": "HIGH",
        "title": "GitHub Personal Access Token detected",
        "description": "GitHub personal access token is exposed in the source code.",
    },
    "jwt_token": {
        "regex": r"\b(eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]+)\b",
        "severity": "HIGH",
        "title": "Hardcoded JWT token detected",
        "description": "JWT token is hardcoded in the source code.",
    },
    "password": {
        "regex": r"(?:password|passwd|pwd)\s*[:=]\s*['\"]([^'\"]{4,})['\"]",
        "severity": "MEDIUM",
        "title": "Hardcoded password detected",
        "description": "Password is directly written in the source code.",
    },
    "ssn": {
        # Matches 6 digits, a hyphen, then 7 digits.
        # NOTE(review): this is not the US SSN layout (3-2-4 digits); it looks
        # like a national ID number of the 6-7 form - confirm the intended locale.
        "regex": r"\b(\d{6}[-]\d{7})\b",
        "severity": "MEDIUM",
        "title": "Social Security Number pattern detected",
        "description": "Data matching SSN format found in the source code.",
    },
    "credit_card": {
        "regex": r"\b(\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4})\b",
        "severity": "MEDIUM",
        "title": "Credit card number pattern detected",
        "description": "Data matching credit card number format found.",
    },
    "phone_number": {
        "regex": r"\b(0\d{1,2}[-\s]?\d{3,4}[-\s]?\d{4})\b",
        "severity": "LOW",
        "title": "Phone number pattern detected",
        "description": "Phone number is included in the source code.",
    },
    "database_url": {
        # Accepts the common scheme spellings (postgres://, mongodb+srv://) and
        # an empty user name (redis://:secret@host). Neither credential part
        # may contain whitespace, so the match cannot run on to an unrelated
        # "@" later on the same line (e.g. an e-mail address in a comment).
        "regex": r"(?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis)://([^:/@\s]*):([^@\s]+)@",
        "severity": "CRITICAL",
        "title": "Database connection string contains credentials",
        "description": "Database connection string includes username and password.",
    },
    "private_key": {
        # Covers PKCS#8 ("PRIVATE KEY", "ENCRYPTED PRIVATE KEY"), the legacy
        # RSA/EC/DSA headers and the OpenSSH private key header.
        "regex": r"-----BEGIN (?:RSA |EC |DSA |OPENSSH |ENCRYPTED )?PRIVATE KEY-----",
        "severity": "CRITICAL",
        "title": "Hardcoded private key detected",
        "description": "Cryptographic private key is directly included in the source code.",
    },
}


def is_false_positive(line: str, pattern_type: str) -> bool:
    """
    Check if a detected pattern is likely a false positive.

    The checks are heuristic and look at the whole line, not at the matched
    text: a line is dismissed if it is a "#" comment, if it contains any
    placeholder/test marker, or if a pattern-specific rule applies.

    Args:
        line: The line of code containing the match
        pattern_type: Type of pattern detected

    Returns:
        True if likely a false positive, False otherwise
    """
    # Lines whose first non-blank character is "#" are treated as comments
    # and always dismissed.
    if line.strip().startswith("#"):
        return True

    # Markers of example/test values. They are compared against the
    # lower-cased line, so every marker must itself be lower-case
    # (upper-case "TODO"/"FIXME" could never match).
    test_indicators = (
        "example",
        "test",
        "dummy",
        "fake",
        "sample",
        "mock",
        "placeholder",
        "todo",
        "fixme",
        "xxx",
        "000",
    )

    line_lower = line.lower()
    if any(indicator in line_lower for indicator in test_indicators):
        return True

    # Pattern-specific false positive checks
    if pattern_type == "credit_card":
        # Common false positive: date ranges, version numbers.
        # Any 19xx/20xx digit run anywhere on the line triggers this.
        if re.search(r"(19|20)\d{2}", line):  # Year pattern
            return True

    if pattern_type == "phone_number":
        # Skip if looks like a date or other numeric pattern
        if "date" in line_lower or "time" in line_lower:
            return True

    if pattern_type == "password":
        # Skip if the line ends in an empty assignment (no actual password)
        if re.search(r'password\s*[:=]\s*["\']?\s*["\']?$', line):
            return True

    return False


def scan_patterns(file_path: str, code: str) -> List[Dict[str, Any]]:
    """
    Scan code for security vulnerability patterns.

    Every line is tested against every entry of SECURITY_PATTERNS using
    case-insensitive matching. One finding is emitted per regex match, unless
    is_false_positive() dismisses the line for that pattern.

    Note that "code_snippet" is the stripped source line, truncated to 100
    characters; the matched secret is NOT masked in it.

    Args:
        file_path: Path to the file being scanned (for reference)
        code: Source code to scan; empty or None yields no findings

    Returns:
        List of vulnerability dictionaries
    """
    vulnerabilities: List[Dict[str, Any]] = []
    if not code:
        return vulnerabilities

    # Compile each pattern once per call rather than looking it up again for
    # every (line, pattern) pair.
    compiled_patterns = [
        (name, info, re.compile(info["regex"], re.IGNORECASE))
        for name, info in SECURITY_PATTERNS.items()
    ]

    for line_num, line in enumerate(code.split("\n"), start=1):
        code_snippet = None  # built lazily, only for lines that produce a finding

        for pattern_name, pattern_info, pattern in compiled_patterns:
            match_count = sum(1 for _ in pattern.finditer(line))
            if not match_count:
                continue

            # The false-positive verdict depends only on the line and the
            # pattern type, so it is evaluated once rather than per match.
            if is_false_positive(line, pattern_name):
                continue

            if code_snippet is None:
                code_snippet = line.strip()
                if len(code_snippet) > 100:
                    # Truncate long lines for display
                    code_snippet = code_snippet[:97] + "..."

            # One (independent) finding dict per match on this line.
            for _ in range(match_count):
                vulnerabilities.append(
                    {
                        "id": f"pattern-{pattern_name}",
                        "severity": pattern_info["severity"],
                        "title": pattern_info["title"],
                        "description": pattern_info["description"],
                        "line_number": line_num,
                        "code_snippet": code_snippet,
                        "pattern_type": pattern_name,
                        "file_path": file_path,
                        "scanner": "pattern_detector",
                    }
                )

    return vulnerabilities


def get_pattern_info(pattern_type: str) -> Dict[str, str]:
    """
    Look up the metadata registered for a pattern type.

    For a known type the SECURITY_PATTERNS entry itself is returned (it also
    carries the "regex" key). For an unknown type a generic fallback with
    only "severity", "title" and "description" is returned.

    Args:
        pattern_type: Type of security pattern

    Returns:
        Dictionary with pattern information
    """
    if pattern_type in SECURITY_PATTERNS:
        return SECURITY_PATTERNS[pattern_type]

    # Unknown pattern: generic MEDIUM-severity placeholder.
    fallback_info = {
        "severity": "MEDIUM",
        "title": "Security pattern detected",
        "description": "Unknown security pattern found.",
    }
    return fallback_info


def list_available_patterns() -> List[str]:
    """
    Return the names of every registered security pattern.

    Returns:
        List of pattern names, in SECURITY_PATTERNS definition order
    """
    # Iterating a dict yields its keys in insertion order.
    return [pattern_name for pattern_name in SECURITY_PATTERNS]


def get_patterns_by_severity(severity: str) -> List[str]:
    """
    Select the patterns registered with a given severity.

    The comparison is case-insensitive on the argument: it is upper-cased
    before being compared with each pattern's "severity" value.

    Args:
        severity: Severity level (CRITICAL, HIGH, MEDIUM, LOW)

    Returns:
        List of pattern names with matching severity
    """
    wanted = severity.upper()
    selected = []
    for pattern_name, pattern_info in SECURITY_PATTERNS.items():
        if pattern_info["severity"] == wanted:
            selected.append(pattern_name)
    return selected