Spaces:

MCP-1st-Birthday
/

simple-security-scanner

Running

App Files Files Community

simple-security-scanner / src /scanner /pattern_detector.py

garibong

Translate scanner messages to English

41329d5 about 2 months ago

raw

history blame contribute delete

7.55 kB

	"""
	Pattern-based security vulnerability detector using regular expressions.

	Detects hardcoded secrets, credentials, and sensitive information patterns.
	"""

	import re
	from typing import Dict, List, Any

	# Registry of secret / PII detectors, keyed by pattern name.
	#
	# Each entry holds:
	#   regex       - pattern applied to one source line at a time
	#   severity    - one of CRITICAL, HIGH, MEDIUM, LOW
	#   title       - short headline for a finding
	#   description - one-sentence explanation of the finding
	#
	# Alternatives inside a group are separated by a bare ``|`` (an escaped
	# ``\|`` would match a literal pipe character), and the whitespace around
	# the ``=`` / ``:`` separator is optional (``\s*``) so that both
	# ``key="value"`` and ``key = "value"`` are caught.
	SECURITY_PATTERNS = {
	    "aws_access_key": {
	        "regex": r"(?:AWS_ACCESS_KEY_ID|aws_access_key_id)\s*[:=]\s*['\"]?(AKIA[0-9A-Z]{16})['\"]?",
	        "severity": "CRITICAL",
	        "title": "Hardcoded AWS Access Key detected",
	        "description": "AWS Access Key is hardcoded in the source code.",
	    },
	    "aws_secret_key": {
	        "regex": r"(?:AWS_SECRET_ACCESS_KEY|aws_secret_access_key)\s*[:=]\s*['\"]?([A-Za-z0-9/+=]{40})['\"]?",
	        "severity": "CRITICAL",
	        "title": "Hardcoded AWS Secret Key detected",
	        "description": "AWS Secret Access Key is hardcoded in the source code.",
	    },
	    "api_key": {
	        # The value must be quoted and at least 20 characters long.
	        "regex": r"(?:api[_-]?key|apikey|api[_-]?secret)\s*[:=]\s*['\"]([a-zA-Z0-9_\-]{20,})['\"]",
	        "severity": "HIGH",
	        "title": "Hardcoded API key detected",
	        "description": "API key is directly hardcoded in the source code.",
	    },
	    "github_token": {
	        # Matches the ghp_ / ghs_ prefixes followed by 36+ alphanumerics.
	        "regex": r"\b(gh[ps]_[a-zA-Z0-9]{36,})\b",
	        "severity": "HIGH",
	        "title": "GitHub Personal Access Token detected",
	        "description": "GitHub personal access token is exposed in the source code.",
	    },
	    "jwt_token": {
	        # Three dot-separated base64url segments; the first two start with
	        # "eyJ" (the base64 encoding of '{"').
	        "regex": r"\b(eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]+)\b",
	        "severity": "HIGH",
	        "title": "Hardcoded JWT token detected",
	        "description": "JWT token is hardcoded in the source code.",
	    },
	    "password": {
	        # The value must be quoted and at least 4 characters long.
	        "regex": r"(?:password|passwd|pwd)\s*[:=]\s*['\"]([^'\"]{4,})['\"]",
	        "severity": "MEDIUM",
	        "title": "Hardcoded password detected",
	        "description": "Password is directly written in the source code.",
	    },
	    "ssn": {
	        # NOTE(review): this is a 6-digit / 7-digit layout, not the US
	        # 3-2-4 layout - presumably a national ID format; confirm intent.
	        "regex": r"\b(\d{6}[-]\d{7})\b",
	        "severity": "MEDIUM",
	        "title": "Social Security Number pattern detected",
	        "description": "Data matching SSN format found in the source code.",
	    },
	    "credit_card": {
	        # Four groups of four digits, optionally separated by "-" or space.
	        "regex": r"\b(\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4})\b",
	        "severity": "MEDIUM",
	        "title": "Credit card number pattern detected",
	        "description": "Data matching credit card number format found.",
	    },
	    "phone_number": {
	        # Numbers with a leading 0, e.g. 0XX-XXXX-XXXX.
	        "regex": r"\b(0\d{1,2}[-\s]?\d{3,4}[-\s]?\d{4})\b",
	        "severity": "LOW",
	        "title": "Phone number pattern detected",
	        "description": "Phone number is included in the source code.",
	    },
	    "database_url": {
	        # scheme://user:password@ - only URLs that embed credentials.
	        "regex": r"(?:postgresql|mysql|mongodb|redis)://([^:]+):([^@]+)@",
	        "severity": "CRITICAL",
	        "title": "Database connection string contains credentials",
	        "description": "Database connection string includes username and password.",
	    },
	    "private_key": {
	        # PEM header, with or without an RSA / EC / DSA qualifier.
	        "regex": r"-----BEGIN (?:RSA |EC |DSA )?PRIVATE KEY-----",
	        "severity": "CRITICAL",
	        "title": "Hardcoded private key detected",
	        "description": "Cryptographic private key is directly included in the source code.",
	    },
	}


	def is_false_positive(line: str, pattern_type: str) -> bool:
	    """
	    Decide whether a pattern hit on ``line`` should be discarded.

	    A hit is treated as a false positive when the line is a ``#`` comment,
	    when it contains a placeholder/test marker, or when a check specific to
	    ``pattern_type`` fires.

	    Args:
	        line: The line of code containing the match
	        pattern_type: Type of pattern detected

	    Returns:
	        True if likely a false positive, False otherwise
	    """
	    # Whole-line comments are skipped outright.
	    if line.strip().startswith("#"):
	        return True

	    # Markers that usually indicate example or test values. They are
	    # compared against the lower-cased line, so every entry must itself be
	    # lower case (upper-case "TODO"/"FIXME" could never match).
	    test_indicators = (
	        "example",
	        "test",
	        "dummy",
	        "fake",
	        "sample",
	        "mock",
	        "placeholder",
	        "todo",
	        "fixme",
	        "xxx",
	        "000",
	    )

	    line_lower = line.lower()
	    if any(indicator in line_lower for indicator in test_indicators):
	        return True

	    # Pattern-specific checks.
	    if pattern_type == "credit_card":
	        # Digit runs that contain a 19xx/20xx year are more likely date
	        # ranges or version numbers than card numbers.
	        return re.search(r"(19|20)\d{2}", line) is not None

	    if pattern_type == "phone_number":
	        # Lines that talk about dates or times tend to hold timestamps.
	        return "date" in line_lower or "time" in line_lower

	    if pattern_type == "password":
	        # The line ends in an empty assignment (password = "" or
	        # password =), i.e. no literal password is present.
	        return re.search(r'password\s*[:=]\s*["\']?\s*["\']?$', line) is not None

	    return False


	def scan_patterns(file_path: str, code: str) -> List[Dict[str, Any]]:
	    """
	    Scan code line by line for security vulnerability patterns.

	    Every regex in SECURITY_PATTERNS is applied case-insensitively to each
	    line. One finding is reported per regex match, unless
	    is_false_positive() rejects the (line, pattern) combination.

	    Args:
	        file_path: Path to the file being scanned (copied into each finding)
	        code: Source code to scan

	    Returns:
	        List of vulnerability dictionaries with the keys ``id``,
	        ``severity``, ``title``, ``description``, ``line_number`` (1-based),
	        ``code_snippet``, ``pattern_type``, ``file_path`` and ``scanner``.
	    """
	    vulnerabilities: List[Dict[str, Any]] = []

	    for line_num, line in enumerate(code.split("\n"), start=1):
	        for pattern_name, pattern_info in SECURITY_PATTERNS.items():
	            match_count = sum(
	                1 for _ in re.finditer(pattern_info["regex"], line, re.IGNORECASE)
	            )

	            # The false-positive verdict depends only on the line and the
	            # pattern name, so it is evaluated once rather than per match.
	            if match_count == 0 or is_false_positive(line, pattern_name):
	                continue

	            # Trim the displayed line to at most 100 characters.
	            # NOTE(review): the snippet is NOT masked - it still contains
	            # the detected secret verbatim.
	            code_snippet = line.strip()
	            if len(code_snippet) > 100:
	                code_snippet = code_snippet[:97] + "..."

	            # One separate finding per match on the line.
	            for _ in range(match_count):
	                vulnerabilities.append(
	                    {
	                        "id": f"pattern-{pattern_name}",
	                        "severity": pattern_info["severity"],
	                        "title": pattern_info["title"],
	                        "description": pattern_info["description"],
	                        "line_number": line_num,
	                        "code_snippet": code_snippet,
	                        "pattern_type": pattern_name,
	                        "file_path": file_path,
	                        "scanner": "pattern_detector",
	                    }
	                )

	    return vulnerabilities


	def get_pattern_info(pattern_type: str) -> Dict[str, str]:
	    """
	    Look up the registry entry for a pattern type.

	    Args:
	        pattern_type: Type of security pattern

	    Returns:
	        The SECURITY_PATTERNS entry for a known pattern type; for an
	        unknown one, a generic MEDIUM-severity placeholder holding only
	        ``severity``, ``title`` and ``description``.
	    """
	    try:
	        return SECURITY_PATTERNS[pattern_type]
	    except KeyError:
	        # Unknown pattern name: fall back to a generic description.
	        return {
	            "severity": "MEDIUM",
	            "title": "Security pattern detected",
	            "description": "Unknown security pattern found.",
	        }


	def list_available_patterns() -> List[str]:
	    """
	    Return the names of every registered security pattern.

	    Returns:
	        A new list of the SECURITY_PATTERNS keys, in registry order
	    """
	    pattern_names = list(SECURITY_PATTERNS)
	    return pattern_names


	def get_patterns_by_severity(severity: str) -> List[str]:
	    """
	    Get patterns filtered by severity level.

	    The comparison is case-insensitive on the argument: ``"high"`` and
	    ``"HIGH"`` select the same patterns.

	    Args:
	        severity: Severity level (CRITICAL, HIGH, MEDIUM, LOW)

	    Returns:
	        List of pattern names with matching severity, in registry order;
	        empty when no pattern has that severity
	    """
	    # Normalise once instead of once per registry entry.
	    wanted = severity.upper()
	    return [
	        name
	        for name, info in SECURITY_PATTERNS.items()
	        if info["severity"] == wanted
	    ]