Spaces:

MCP-1st-Birthday
/

Eventure-Project-Overview

Sleeping

File size: 5,051 Bytes

ed71b0e

"""

Code Extraction Detector



Detects attempts to extract source code, scripts, or implementation details

from websites or systems. This is a form of intellectual property theft

and reverse engineering.

"""

from __future__ import annotations

import re
from typing import Any, Dict, Optional

from ..base import ScannerPlugin, ScanResult, PluginMetadata


# Regular expressions describing code-extraction phrasing. Long patterns are
# split across adjacent raw-string literals, which Python concatenates at
# compile time, so each list entry is still a single pattern string.
CODE_EXTRACTION_PATTERNS = [
    # -- Extraction intent: an extraction verb, optionally followed later on
    #    the same line by a code-related noun.
    r"\b(extract|pull.*source|get.*source|download.*code)\b",
    (
        r"\b(extract|grab|pull|download|fetch)\b"
        r".*"
        r"\b(javascript|js|html|css|code|source|implementation)\b"
    ),

    # -- Reverse-engineering intent: wanting to understand or take apart an
    #    implementation.
    (
        r"\b(understand.*code|understand.*implementation"
        r"|reverse.*engineer|decompile|disassemble)\b"
    ),
    (
        r"\b(understand|analyze)\b"
        r".*"
        r"\b(how.*work|mechanism|implementation)\b"
    ),

    # -- Script harvesting: a script-like noun followed by an extraction verb.
    (
        r"\b(script|plugin|extension)\b"
        r".*"
        r"\b(extract|pull|grab|download|source)\b"
    ),
]

# Words indicating that a request concerns code or implementation artefacts.
CODE_KEYWORDS = {
    # languages and markup
    "javascript",
    "js",
    "html",
    "css",
    # front-end frameworks
    "react",
    "vue",
    "angular",
    # generic code terms
    "code",
    "source",
    "script",
    "implementation",
    "algorithm",
    "logic",
    "function",
}

# Words naming the thing a request is aimed at: a site, a named event
# platform, or a competitor's product.
TARGET_KEYWORDS = {
    # generic targets
    "website",
    "site",
    "application",
    "app",
    "platform",
    # named event platforms
    "eventbrite",
    "ticketmaster",
    "meetup",
    # competitors
    "competitor",
    "competitor's",
}


class CodeExtractionDetector(ScannerPlugin):
    """Detects attempts to extract source code and implementation details.

    ``scan`` runs four independent checks over the LLM context plus the
    flattened tool arguments. Every check that fires appends a reason, sets a
    flag and adds a fixed weight to the risk score, which is capped at 1.0.
    """

    # Phrases that signal reverse-engineering intent. They are matched as plain
    # substrings of the lower-cased context.
    _REVERSE_ENGINEERING_TERMS = (
        "reverse",
        "decompile",
        "disassemble",
        "how it works",
        "how they work",
    )

    # Script/component source extraction. These are regular expressions, so
    # they must be applied with ``re`` rather than the ``in`` operator: as
    # literal substrings they would only match text containing ".*" verbatim.
    _SCRIPT_EXTRACTION_RE = re.compile(
        r"extract.*script|extract.*component|pull.*source"
    )

    def __init__(self):
        super().__init__(
            metadata=PluginMetadata(
                name="CodeExtractionDetector",
                version="1.0.0",
                description="Detects source code extraction and reverse engineering attempts",
                author="SecurityGateway",
            )
        )

    def scan(
        self,
        user_id: Optional[str],
        server_key: str,
        tool: str,
        arguments: Dict[str, Any],
        llm_context: Optional[str] = None,
    ) -> ScanResult:
        """Scan a tool call for code extraction patterns.

        Checks performed (weight added to the risk score in parentheses):

        1. Any regex in ``CODE_EXTRACTION_PATTERNS`` matches (0.45).
        2. A code keyword and a target keyword both occur (0.50).
        3. A reverse-engineering phrase occurs (0.55).
        4. A script/component extraction regex matches (0.40).

        Args:
            user_id: User identifier (not used by this detector).
            server_key: Server key (not used by this detector).
            tool: Tool name (not used by this detector).
            arguments: Tool arguments; their values are flattened to text
                and scanned.
            llm_context: Optional context text, scanned together with the
                flattened arguments.

        Returns:
            ScanResult whose ``detected`` is True when at least one check
            fired, with ``risk_score`` capped at 1.0, the collected
            ``reasons`` (or a single "nothing detected" message) and the
            per-check ``flags``.
        """
        context = (llm_context or "") + " " + self._flatten_json(arguments)
        context_lower = context.lower()

        reasons: list[str] = []
        flags: Dict[str, bool] = {}
        risk_score = 0.0

        # 1) Pattern matching for code extraction
        if self._contains_pattern(context, CODE_EXTRACTION_PATTERNS):
            reasons.append("Code extraction or reverse engineering attempt detected.")
            flags["code_extraction"] = True
            risk_score += 0.45

        # 2) Keyword combination: code + target
        # NOTE(review): these are substring tests, so short keywords also hit
        # inside longer words (e.g. "js" in "json", "app" in "happy"). Kept
        # as-is to preserve recall; consider word-boundary matching if false
        # positives become a problem.
        code_found = any(keyword in context_lower for keyword in CODE_KEYWORDS)
        target_found = any(keyword in context_lower for keyword in TARGET_KEYWORDS)

        if code_found and target_found:
            reasons.append("Source code extraction from target website/application detected.")
            flags["code_extraction_targeted"] = True
            risk_score += 0.50

        # 3) Reverse engineering intent
        if any(term in context_lower for term in self._REVERSE_ENGINEERING_TERMS):
            reasons.append("Reverse engineering intent detected.")
            flags["reverse_engineering"] = True
            risk_score += 0.55

        # 4) Script/component extraction
        # Fixed: the patterns are now evaluated as regexes. Previously they
        # were tested with ``in`` and therefore never matched real input.
        if self._SCRIPT_EXTRACTION_RE.search(context_lower):
            reasons.append("Script or component source extraction detected.")
            flags["script_extraction"] = True
            risk_score += 0.40

        detected = bool(reasons)

        return ScanResult(
            plugin_name=self.get_metadata().name,
            detected=detected,
            risk_score=min(1.0, risk_score),
            reasons=reasons if reasons else ["No code extraction detected."],
            flags=flags,
        )

    def _flatten_json(self, value: Any) -> str:
        """Flatten nested structures to a single string for pattern matching.

        Dict values and list items are flattened recursively and joined with
        spaces; dict keys are not included. Any other value is converted
        with ``str``.
        """
        if isinstance(value, dict):
            return " ".join(self._flatten_json(v) for v in value.values())
        if isinstance(value, list):
            return " ".join(self._flatten_json(v) for v in value)
        return str(value)

    def _contains_pattern(self, text: str, patterns: list) -> bool:
        """Return True if any regex in ``patterns`` matches ``text``.

        Matching is case-insensitive and uses ``re.search``, so a pattern may
        match anywhere in the text.
        """
        return any(re.search(pat, text, flags=re.IGNORECASE) for pat in patterns)


# Export as module-level plugin for auto-loading.
# The detector is instantiated once, at import time, so importing this module
# runs CodeExtractionDetector.__init__.
# NOTE(review): presumably the plugin loader discovers detectors through this
# `plugin` name — confirm against the loader before renaming it.
plugin = CodeExtractionDetector()