yuki-sui's picture
Upload 169 files
ed71b0e verified
"""
Code Extraction Detector
Detects attempts to extract source code, scripts, or implementation details
from websites or systems. This is a form of intellectual property theft
and reverse engineering.
"""
from __future__ import annotations
import re
from typing import Any, Dict, Optional
from ..base import ScannerPlugin, ScanResult, PluginMetadata
# Regular expressions that signal an attempt to obtain or reverse engineer
# code. Long patterns are split into adjacent raw-string literals, which Python
# joins at compile time, so each list entry is still a single pattern string.
CODE_EXTRACTION_PATTERNS = [
    # --- Code extraction intent ---
    # A bare "extract", or pull/get ... source, or download ... code.
    r"\b(extract|pull.*source|get.*source|download.*code)\b",
    # An acquisition verb followed later by a code-related noun.
    (
        r"\b(extract|grab|pull|download|fetch)\b"
        r".*"
        r"\b(javascript|js|html|css|code|source|implementation)\b"
    ),
    # --- Reverse engineering intent ---
    # Understanding code/implementation, or explicit reverse-engineering verbs.
    (
        r"\b(understand.*code|understand.*implementation"
        r"|reverse.*engineer|decompile|disassemble)\b"
    ),
    # "understand"/"analyze" followed later by how-it-works style wording.
    (
        r"\b(understand|analyze)\b"
        r".*"
        r"\b(how.*work|mechanism|implementation)\b"
    ),
    # --- Script harvesting ---
    # A script/plugin/extension noun followed later by an acquisition word.
    (
        r"\b(script|plugin|extension)\b"
        r".*"
        r"\b(extract|pull|grab|download|source)\b"
    ),
]
# Lower-case terms that indicate the text is talking about code. The detector
# in this file checks them with a substring test against lower-cased text.
CODE_KEYWORDS = {
    # Generic code / source wording
    "code",
    "source",
    "script",
    # Web languages
    "javascript",
    "js",
    "html",
    "css",
    # Front-end framework names
    "react",
    "vue",
    "angular",
    # Implementation-detail wording
    "implementation",
    "algorithm",
    "logic",
    "function",
}
# Lower-case terms that name something code could be taken from. The detector
# in this file checks them with a substring test against lower-cased text.
TARGET_KEYWORDS = {
    # Generic targets
    "website",
    "site",
    "application",
    "app",
    "platform",
    # Specific named services
    "eventbrite",
    "ticketmaster",
    "meetup",
    # Competitor wording (plain and possessive)
    "competitor",
    "competitor's",
}
class CodeExtractionDetector(ScannerPlugin):
    """Detects attempts to extract source code and implementation details.

    ``scan`` runs four independent heuristics over the LLM context combined
    with the flattened tool arguments. Every heuristic that fires contributes
    a reason, a flag and a fixed weight; the summed weight is capped at 1.0.
    """

    # Substrings treated as reverse-engineering intent (heuristic 3).
    _REVERSE_ENGINEERING_TERMS = (
        "reverse",
        "decompile",
        "disassemble",
        "how it works",
        "how they work",
    )

    # Regular expressions for script/component source extraction
    # (heuristic 4). They contain ``.*`` and therefore must be evaluated with
    # ``re.search``: a plain ``in`` substring test would look for the literal
    # characters ".*" and never fire on ordinary text.
    _SCRIPT_EXTRACTION_PATTERNS = [
        r"extract.*script",
        r"extract.*component",
        r"pull.*source",
    ]

    def __init__(self):
        """Pass this plugin's metadata to the base-class constructor."""
        super().__init__(
            metadata=PluginMetadata(
                name="CodeExtractionDetector",
                version="1.0.0",
                description="Detects source code extraction and reverse engineering attempts",
                author="SecurityGateway",
            )
        )

    def scan(
        self,
        user_id: Optional[str],
        server_key: str,
        tool: str,
        arguments: Dict[str, Any],
        llm_context: Optional[str] = None,
    ) -> ScanResult:
        """
        Scan for code extraction patterns.

        Detects:
        - Source code extraction from websites
        - Script/plugin harvesting
        - Reverse engineering intent
        - Implementation detail extraction

        Args:
            user_id: User identifier (not used by this detector).
            server_key: Server key (not used by this detector).
            tool: Tool name (not used by this detector).
            arguments: Tool arguments; nested dict values and list items are
                flattened to text and scanned.
            llm_context: Optional context text, scanned together with the
                arguments. ``None`` is treated as an empty string.

        Returns:
            ScanResult whose ``detected`` is True when at least one heuristic
            fired, ``risk_score`` is the sum of the fired weights capped at
            1.0, ``reasons`` lists one message per fired heuristic (or a
            single "no detection" message), and ``flags`` maps each fired
            heuristic's key to True.
        """
        context = (llm_context or "") + " " + self._flatten_json(arguments)
        context_lower = context.lower()

        reasons = []
        flags = {}
        risk_score = 0.0

        # 1) Pattern matching for code extraction
        if self._contains_pattern(context, CODE_EXTRACTION_PATTERNS):
            reasons.append("Code extraction or reverse engineering attempt detected.")
            flags["code_extraction"] = True
            risk_score += 0.45

        # 2) Keyword combination: code + target. These are substring tests, so
        # a keyword also matches inside a longer word.
        code_found = any(keyword in context_lower for keyword in CODE_KEYWORDS)
        target_found = any(keyword in context_lower for keyword in TARGET_KEYWORDS)
        if code_found and target_found:
            reasons.append("Source code extraction from target website/application detected.")
            flags["code_extraction_targeted"] = True
            risk_score += 0.50

        # 3) Reverse engineering intent
        if any(term in context_lower for term in self._REVERSE_ENGINEERING_TERMS):
            reasons.append("Reverse engineering intent detected.")
            flags["reverse_engineering"] = True
            risk_score += 0.55

        # 4) Script/component extraction. The patterns are regexes, so they
        # are matched with the regex helper rather than a substring test.
        if self._contains_pattern(context_lower, self._SCRIPT_EXTRACTION_PATTERNS):
            reasons.append("Script or component source extraction detected.")
            flags["script_extraction"] = True
            risk_score += 0.40

        detected = bool(reasons)
        return ScanResult(
            plugin_name=self.get_metadata().name,
            detected=detected,
            # Individual weights can sum past 1.0; clamp to the upper bound.
            risk_score=min(1.0, risk_score),
            reasons=reasons if reasons else ["No code extraction detected."],
            flags=flags,
        )

    def _flatten_json(self, value: Any) -> str:
        """Flatten nested structures to a string for pattern matching.

        Dict values (keys are ignored) and list items are flattened
        recursively and joined with single spaces; any other value is
        converted with ``str``.
        """
        if isinstance(value, dict):
            return " ".join(self._flatten_json(v) for v in value.values())
        if isinstance(value, list):
            return " ".join(self._flatten_json(v) for v in value)
        return str(value)

    def _contains_pattern(self, text: str, patterns: list) -> bool:
        """Return True if any regex in ``patterns`` matches anywhere in ``text``.

        Matching is case-insensitive and stops at the first pattern that hits.
        """
        return any(re.search(pat, text, flags=re.IGNORECASE) for pat in patterns)
# Export as module-level plugin for auto-loading.
# NOTE(review): the detector is instantiated at import time; the loader that
# looks up this `plugin` name is not visible in this file — confirm against
# the plugin registry.
plugin = CodeExtractionDetector()