"""
Code Extraction Detector
Detects attempts to extract source code, scripts, or implementation details
from websites or systems. This is a form of intellectual property theft
and reverse engineering.
"""
from __future__ import annotations
import re
from typing import Any, Dict, Optional
from ..base import ScannerPlugin, ScanResult, PluginMetadata
# Regular expressions that signal code-extraction or reverse-engineering
# intent. Long patterns are assembled from adjacent raw-string literals
# (verb group, ".*" gap, object group); the resulting strings are unchanged.
CODE_EXTRACTION_PATTERNS = [
    # --- Code extraction intent ---
    # A bare "extract", or pull/get ... source, or download ... code.
    r"\b(extract|pull.*source|get.*source|download.*code)\b",
    # An acquisition verb followed later by a code-related noun.
    (
        r"\b(extract|grab|pull|download|fetch)\b"
        r".*"
        r"\b(javascript|js|html|css|code|source|implementation)\b"
    ),
    # --- Reverse engineering intent ---
    # Explicit reverse-engineering vocabulary.
    (
        r"\b(understand.*code|understand.*implementation"
        r"|reverse.*engineer|decompile|disassemble)\b"
    ),
    # "understand"/"analyze" followed later by a how-it-works style phrase.
    (
        r"\b(understand|analyze)\b"
        r".*"
        r"\b(how.*work|mechanism|implementation)\b"
    ),
    # --- Script harvesting ---
    # A script-like artifact followed later by an acquisition word.
    (
        r"\b(script|plugin|extension)\b"
        r".*"
        r"\b(extract|pull|grab|download|source)\b"
    ),
]
# Words that indicate the request is about code or implementation details.
CODE_KEYWORDS = {
    # Generic code vocabulary
    "code",
    "source",
    "script",
    "function",
    "logic",
    "algorithm",
    "implementation",
    # Web languages
    "javascript",
    "js",
    "html",
    "css",
    # Front-end frameworks
    "react",
    "vue",
    "angular",
}
# Words that indicate a third-party site or application is being targeted.
TARGET_KEYWORDS = {
    # Generic targets
    "website",
    "site",
    "application",
    "app",
    "platform",
    # Competitor references
    "competitor",
    "competitor's",
    # Named services
    "eventbrite",
    "ticketmaster",
    "meetup",
}
class CodeExtractionDetector(ScannerPlugin):
    """Detects attempts to extract source code and implementation details.

    The detector flattens the tool arguments and optional LLM context into a
    single string and runs four independent checks over it. Each check that
    fires appends a reason, sets a flag and adds a fixed weight to the risk
    score, which is capped at 1.0 in the returned result.
    """

    # Plain substrings (checked against the lower-cased context) that signal
    # reverse-engineering intent.
    _REVERSE_ENGINEERING_PHRASES = (
        "reverse",
        "decompile",
        "disassemble",
        "how it works",
        "how they work",
    )

    # Regular expressions for script/component harvesting. These contain
    # ".*" and therefore must be evaluated with `re`, not with `in`.
    _SCRIPT_EXTRACTION_PATTERNS = (
        r"extract.*script",
        r"extract.*component",
        r"pull.*source",
    )

    def __init__(self):
        """Register the plugin metadata with the base scanner class."""
        super().__init__(
            metadata=PluginMetadata(
                name="CodeExtractionDetector",
                version="1.0.0",
                description="Detects source code extraction and reverse engineering attempts",
                author="SecurityGateway",
            )
        )

    def scan(
        self,
        user_id: Optional[str],
        server_key: str,
        tool: str,
        arguments: Dict[str, Any],
        llm_context: Optional[str] = None,
    ) -> ScanResult:
        """
        Scan for code extraction patterns.

        Detects:
        - Source code extraction from websites
        - Script/plugin harvesting
        - Reverse engineering intent
        - Implementation detail extraction

        Args:
            user_id: User identifier (not read by this detector).
            server_key: Server key (not read by this detector).
            tool: Tool name (not read by this detector).
            arguments: Tool arguments; nested dict/list values are flattened
                to text and scanned.
            llm_context: Optional context text scanned together with the
                arguments.

        Returns:
            ScanResult with code extraction detection: ``detected`` is True
            when at least one check fired, ``risk_score`` is the sum of the
            fired checks' weights capped at 1.0, and ``reasons``/``flags``
            describe which checks fired.
        """
        context = (llm_context or "") + " " + self._flatten_json(arguments)
        context_lower = context.lower()

        reasons = []
        flags = {}
        risk_score = 0.0

        # 1) Pattern matching for code extraction
        if self._contains_pattern(context, CODE_EXTRACTION_PATTERNS):
            reasons.append("Code extraction or reverse engineering attempt detected.")
            flags["code_extraction"] = True
            risk_score += 0.45

        # 2) Keyword combination: code + target (plain substring checks)
        code_found = any(keyword in context_lower for keyword in CODE_KEYWORDS)
        target_found = any(keyword in context_lower for keyword in TARGET_KEYWORDS)
        if code_found and target_found:
            reasons.append("Source code extraction from target website/application detected.")
            flags["code_extraction_targeted"] = True
            risk_score += 0.50

        # 3) Reverse engineering intent (plain substring checks)
        if any(phrase in context_lower for phrase in self._REVERSE_ENGINEERING_PHRASES):
            reasons.append("Reverse engineering intent detected.")
            flags["reverse_engineering"] = True
            risk_score += 0.55

        # 4) Script/component extraction. The entries are regular
        # expressions, so they are evaluated with re.search; a substring
        # test would only ever match the literal characters ".*".
        if self._contains_pattern(context, self._SCRIPT_EXTRACTION_PATTERNS):
            reasons.append("Script or component source extraction detected.")
            flags["script_extraction"] = True
            risk_score += 0.40

        detected = bool(reasons)
        return ScanResult(
            plugin_name=self.get_metadata().name,
            detected=detected,
            # Several checks can fire at once; clamp the summed weights.
            risk_score=min(1.0, risk_score),
            reasons=reasons if reasons else ["No code extraction detected."],
            flags=flags,
        )

    def _flatten_json(self, value: Any) -> str:
        """Flatten nested structures to string for pattern matching.

        Dicts contribute their values only (keys are not included), lists
        contribute each element, and any other value is converted with
        ``str``. Pieces are joined with single spaces.
        """
        if isinstance(value, dict):
            return " ".join(self._flatten_json(v) for v in value.values())
        if isinstance(value, list):
            return " ".join(self._flatten_json(v) for v in value)
        return str(value)

    def _contains_pattern(self, text: str, patterns: list) -> bool:
        """Check if text matches any pattern.

        Args:
            text: Text to search.
            patterns: Iterable of regular-expression strings.

        Returns:
            True if at least one pattern is found anywhere in ``text``
            (case-insensitive search), otherwise False.
        """
        return any(re.search(pat, text, flags=re.IGNORECASE) for pat in patterns)
# Export as module-level plugin for auto-loading.
# The detector is instantiated once, when this module is imported.
# NOTE(review): the loader that looks up the `plugin` name is not visible in
# this file — confirm the expected attribute name against the plugin loader.
plugin = CodeExtractionDetector()