Upload 14 files

Browse files

Files changed (15) hide show

#readme.txt +17 -0
.gitattributes +1 -0
inference_bert_url.py +215 -0
moniter.py +410 -0
static/banner.jpg +3 -0
static/style.css +145 -0
templates/index.html +72 -0
test_server.py +48 -0
xss_detect_trained/config.json +24 -0
xss_detect_trained/model.safetensors +3 -0
xss_detect_trained/special_tokens_map.json +7 -0
xss_detect_trained/tokenizer.json +0 -0
xss_detect_trained/tokenizer_config.json +56 -0
xss_detect_trained/training_args.bin +3 -0
xss_detect_trained/vocab.txt +0 -0

#readme.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+*requirements:
+Install Python first, then run:
+pip install torch
+pip install transformers
+pip install flask
+*how to use (run each command in its own terminal; both keep running):
+python test_server.py
+python moniter.py
+Open a browser and enter:
+http://127.0.0.1:8080/?q=abcde	: OK
+http://127.0.0.1:8080/?q=<img src='x' onerror='alert("xss")'>	: this will be detected and logged.
+*inference_bert_url.py
+A simple CLI tool that judges whether your input contains an XSS payload.

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+static/banner.jpg filter=lfs diff=lfs merge=lfs -text

inference_bert_url.py ADDED Viewed

	@@ -0,0 +1,215 @@

+# -*- coding: utf-8 -*-
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
+from urllib.parse import (
+    urlparse,
+    parse_qs,
+    unquote
+)
+#################################################
+# Model path: directory passed to from_pretrained() for both the
+# tokenizer and the classifier
+#################################################
+model_path = "xss_detect_trained"
+#################################################
+# URL 여부 확인
+#################################################
+def is_url(text):
+    return text.startswith("http://") or text.startswith("https://")
+#################################################
+# URL에서 parameter value 추출
+#################################################
+def extract_url_payload(url):
+    try:
+        parsed = urlparse(url)
+        # query parameter 파싱
+        params = parse_qs(parsed.query)
+        extracted = []
+        for key, values in params.items():
+            for value in values:
+                # URL decode
+                decoded = unquote(value)
+                extracted.append(decoded)
+        # parameter 없으면 path 사용
+        if not extracted:
+            return parsed.path
+        # 여러 parameter면 합침
+        return " ".join(extracted)
+    except:
+        return url
+#################################################
+# 의심 코드 존재 여부 검사
+#################################################
+def contains_suspicious_code(text):
+    suspicious_patterns = [
+        # HTML / JS
+        "<",
+        ">",
+        "script",
+        "javascript:",
+        "onerror",
+        "onclick",
+        "onload",
+        "iframe",
+        "svg",
+        # JS 실행
+        "eval(",
+        "alert(",
+        "prompt(",
+        "confirm(",
+        "document.cookie",
+        "document.domain",
+        "window.location",
+        # 난독화 / 우회
+        "constructor",
+        "fromcharcode",
+        "\\x",
+        "%3c",
+        "%3e",
+        "&#",
+        "base64",
+        "atob(",
+        # 특수 실행 패턴
+        "srcdoc",
+        "data:text/html",
+        "vbscript:",
+        "expression("
+    ]
+    text_lower = text.lower()
+    for pattern in suspicious_patterns:
+        if pattern in text_lower:
+            return True
+    return False
+#################################################
+# Load the model
+#################################################
+# Both the tokenizer and the classifier are read from model_path.
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+model = AutoModelForSequenceClassification.from_pretrained(model_path)
+# Inference is pinned to the CPU.
+device = torch.device("cpu")
+model.to(device)
+# Put the model into evaluation mode before inference.
+model.eval()
+#################################################
+# Labels: map the predicted class index to its display name
+#################################################
+labels = {
+    0: "NORMAL",
+    1: "XSS"
+}
+#################################################
+# 테스트
+#################################################
+print("\n테스트 시작 (exit 입력 시 종료)\n")
+while True:
+    text = input("입력: ")
+    if text.lower() == "exit":
+        break
+    #################################################
+    # 기본값
+    #################################################
+    target_text = text
+    #################################################
+    # URL 처리
+    #################################################
+    if is_url(text):
+        target_text = extract_url_payload(text)
+        print(f"[추출된 parameter]: {target_text}")
+        #################################################
+        # 의심 코드 없으면 바로 NORMAL
+        #################################################
+        if not contains_suspicious_code(target_text):
+            print("결과: NORMAL")
+            print("신뢰도: heuristic\n")
+            continue
+    #################################################
+    # 토크나이즈
+    #################################################
+    MAX_INPUT_LENGTH = 2000
+    if len(target_text) > MAX_INPUT_LENGTH:
+        print("입력 길이 초과\n")
+        continue
+    inputs = tokenizer(
+        target_text,
+        return_tensors="pt",
+        truncation=True,
+        padding=True,
+        max_length=128
+    ).to(device)
+    #################################################
+    # 추론
+    #################################################
+    with torch.no_grad():
+        outputs = model(**inputs)
+    logits = outputs.logits
+    probs = torch.softmax(logits, dim=1)
+    confidence, pred = torch.max(probs, dim=1)
+    pred = pred.item()
+    confidence = confidence.item()
+    label = labels[pred]
+    #################################################
+    # 출력
+    #################################################
+    print(f"결과: {label}")
+    print(f"신뢰도: {confidence:.4f}\n")

moniter.py ADDED Viewed

	@@ -0,0 +1,410 @@

+# -*- coding: utf-8 -*-
+#################################################
+# XSS Log Monitor + BERT Detector
+#################################################
+import re
+import time
+import sqlite3
+import unicodedata
+from urllib.parse import (
+    urlparse,
+    parse_qs,
+    unquote
+)
+import torch
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification
+)
+#################################################
+# Settings
+#################################################
+# Log file that is tailed for new request lines.
+LOG_FILE = "access.log"
+# Directory passed to from_pretrained() for the tokenizer and the model.
+MODEL_PATH = "xss_detect_trained"
+# Extracted payloads longer than this many characters are skipped.
+MAX_INPUT_LENGTH = 2000
+# Seconds to sleep between polls when the log has no new line.
+CHECK_INTERVAL = 0.2
+#################################################
+# SQLite initialization
+#################################################
+conn = sqlite3.connect("xss_detection.db")
+cursor = conn.cursor()
+# One row per classified request; the table is created on first run only.
+cursor.execute("""
+CREATE TABLE IF NOT EXISTS detections (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    timestamp TEXT,
+    ip TEXT,
+    url TEXT,
+    payload TEXT,
+    prediction TEXT,
+    confidence REAL
+)
+""")
+conn.commit()
+#################################################
+# Load the model
+#################################################
+print("[+] 모델 로드 중...")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
+# Inference is pinned to the CPU.
+device = torch.device("cpu")
+model.to(device)
+# Put the model into evaluation mode before inference.
+model.eval()
+print("[+] 모델 로드 완료")
+#################################################
+# Labels: map the predicted class index to its display name
+#################################################
+labels = {
+    0: "NORMAL",
+    1: "XSS"
+}
+#################################################
+# URL 여부
+#################################################
+def is_url(text):
+    return (
+        text.startswith("http://")
+        or text.startswith("https://")
+        or text.startswith("/")
+    )
+#################################################
+# Unicode 정규화
+#################################################
+def normalize_unicode(text):
+    return unicodedata.normalize("NFKC", text)
+#################################################
+# URL payload 추출
+#################################################
+def extract_url_payload(url):
+    try:
+        parsed = urlparse(url)
+        raw_query = unquote(parsed.query)
+        params = parse_qs(parsed.query)
+        extracted = []
+        #################################################
+        # parameter value 추출
+        #################################################
+        for key, values in params.items():
+            for value in values:
+                decoded = unquote(value)
+                extracted.append(decoded)
+        #################################################
+        # query 자체에 suspicious code 존재 시 추가
+        #################################################
+        if contains_suspicious_code(raw_query):
+            extracted.append(raw_query)
+        #################################################
+        # parameter 없으면 path 사용
+        #################################################
+        if not extracted:
+            return parsed.path
+        return " ".join(extracted)
+    except:
+        return url
+#################################################
+# suspicious code 존재 여부
+#################################################
+def contains_suspicious_code(text):
+    suspicious_patterns = [
+        # HTML / JS
+        "<",
+        ">",
+        "script",
+        "javascript:",
+        "onerror",
+        "onclick",
+        "onload",
+        "iframe",
+        "svg",
+        # JS 실행
+        "eval(",
+        "alert(",
+        "prompt(",
+        "confirm(",
+        "document.cookie",
+        "document.domain",
+        "window.location",
+        # 난독화 / 우회
+        "constructor",
+        "fromcharcode",
+        "\\x",
+        "%3c",
+        "%3e",
+        "&#",
+        "base64",
+        "atob(",
+        # 특수 실행
+        "srcdoc",
+        "data:text/html",
+        "vbscript:",
+        "expression("
+    ]
+    text_lower = text.lower()
+    for pattern in suspicious_patterns:
+        if pattern in text_lower:
+            return True
+    return False
+#################################################
+# 로그 한 줄 파싱
+#################################################
+def parse_log_line(line):
+    """
+    Apache/Nginx common log format 대응
+    """
+    try:
+        ip_match = re.search(r'^(\S+)', line)
+        request_match = re.search(
+            r'\"(GET|POST|PUT|DELETE|HEAD|OPTIONS)\s+(.+?)\s+HTTP',
+            line
+        )
+        if not ip_match or not request_match:
+            return None, None
+        ip = ip_match.group(1)
+        url = request_match.group(2)
+        return ip, url
+    except:
+        return None, None
+#################################################
+# BERT 추론
+#################################################
+def predict_xss(text):
+    inputs = tokenizer(
+        text,
+        return_tensors="pt",
+        truncation=True,
+        padding=True,
+        max_length=128
+    ).to(device)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    logits = outputs.logits
+    probs = torch.softmax(logits, dim=1)
+    confidence, pred = torch.max(probs, dim=1)
+    pred = pred.item()
+    confidence = confidence.item()
+    label = labels[pred]
+    return label, confidence
+#################################################
+# 로그 감시
+#################################################
+def follow(thefile):
+    thefile.seek(0, 2)
+    while True:
+        line = thefile.readline()
+        if not line:
+            time.sleep(CHECK_INTERVAL)
+            continue
+        yield line
+#################################################
+# 메인
+#################################################
+print(f"[+] 로그 감시 시작: {LOG_FILE}")
+with open(LOG_FILE, "r", encoding="utf-8", errors="ignore") as logfile:
+    loglines = follow(logfile)
+    for line in loglines:
+        try:
+            ip, url = parse_log_line(line)
+            if not url:
+                continue
+            #################################################
+            # unicode normalization
+            #################################################
+            url = normalize_unicode(url)
+            #################################################
+            # URL payload 추출
+            #################################################
+            if is_url(url):
+                target_text = extract_url_payload(url)
+            else:
+                target_text = url
+            #################################################
+            # 길이 제한
+            #################################################
+            if len(target_text) > MAX_INPUT_LENGTH:
+                continue
+            #################################################
+            # suspicious fragment 없으면 skip
+            #################################################
+            if not contains_suspicious_code(target_text):
+                continue
+            #################################################
+            # ML 추론
+            #################################################
+            label, confidence = predict_xss(target_text)
+            #################################################
+            # XSS 탐지 시 출력
+            #################################################
+            if label == "XSS":
+                print("\n==============================")
+                print("[XSS DETECTED]")
+                print(f"IP         : {ip}")
+                print(f"URL        : {url}")
+                print(f"Payload    : {target_text}")
+                print(f"Confidence : {confidence:.4f}")
+                print("==============================\n")
+            #################################################
+            # DB 저장
+            #################################################
+            cursor.execute("""
+            INSERT INTO detections (
+                timestamp,
+                ip,
+                url,
+                payload,
+                prediction,
+                confidence
+            )
+            VALUES (
+                datetime('now'),
+                ?,
+                ?,
+                ?,
+                ?,
+                ?
+            )
+            """, (
+                ip,
+                url,
+                target_text,
+                label,
+                confidence
+            ))
+            conn.commit()
+        except Exception as e:
+            print(f"[ERROR] {e}")

static/banner.jpg ADDED Viewed

Git LFS Details

SHA256: e5167e5860110c51cd6388d7451904cfac3ce2d85b64e69b00841b9053bcfbbb
Pointer size: 132 Bytes
Size of remote file: 1.97 MB

static/style.css ADDED Viewed

	@@ -0,0 +1,145 @@

+/* Page-wide defaults */
+body {
+    margin: 0;
+    background: #f5f5f5;
+    color: #222;
+    font-family: "Segoe UI", sans-serif;
+}
+/* Banner area with the background image */
+.hero {
+    height: 320px;
+    background-image: url("/static/banner.jpg");
+    background-size: cover;
+    background-position: center;
+    position: relative;
+}
+/* Semi-transparent dark layer that centers the banner text */
+.overlay {
+    width: 100%;
+    height: 100%;
+    background: rgba(0,0,0,0.45);
+    display: flex;
+    flex-direction: column;
+    justify-content: center;
+    align-items: center;
+    color: white;
+}
+.overlay h1 {
+    font-size: 52px;
+    margin-bottom: 10px;
+}
+.overlay p {
+    font-size: 18px;
+    opacity: 0.9;
+}
+/* Centered white card holding the form and the output */
+.container {
+    max-width: 900px;
+    margin: auto;
+    margin-top: 40px;
+    background: white;
+    padding: 40px;
+    border-radius: 12px;
+    box-shadow: 0 4px 20px rgba(0,0,0,0.08);
+}
+h2 {
+    margin-top: 0;
+}
+/* Form controls */
+input {
+    width: 100%;
+    padding: 14px;
+    font-size: 16px;
+    border-radius: 8px;
+    border: 1px solid #ccc;
+    margin-top: 10px;
+}
+button {
+    margin-top: 15px;
+    padding: 12px 24px;
+    border: none;
+    border-radius: 8px;
+    background: #222;
+    color: white;
+    font-size: 15px;
+    cursor: pointer;
+}
+button:hover {
+    background: #444;
+}
+/* Reflected-output section */
+.result {
+    margin-top: 40px;
+}
+.output {
+    margin-top: 15px;
+    padding: 20px;
+    background: #fafafa;
+    border-radius: 8px;
+    border: 1px solid #ddd;
+    word-break: break-word;
+}
+footer {
+    text-align: center;
+    padding: 30px;
+    color: #777;
+}

templates/index.html ADDED Viewed

	@@ -0,0 +1,72 @@

+<!DOCTYPE html>
+<html lang="ko">
+<head>
+    <meta charset="utf-8">
+    <title>MINSUNG's XSS TEST PAGE</title>
+    <link
+        rel="stylesheet"
+        href="{{ url_for('static', filename='style.css') }}"
+    >
+</head>
+<body>
+<div class="hero">
+    <div class="overlay">
+        <h1>Great Memories~~</h1>
+        <p>
+           MINSUNG's xss test environment
+        </p>
+    </div>
+</div>
+<div class="container">
+    <h2>Input</h2>
+    <form method="GET" action="/">
+        <input
+            type="text"
+            name="q"
+            placeholder="Enter payload..."
+            value="{{ q }}"
+        >
+        <button type="submit">
+            Submit
+        </button>
+    </form>
+    <div class="result">
+        <h3>Reflected Output</h3>
+        <div class="output">
+            {{ q|safe }}{# NOTE(review): |safe marks q as safe so it is rendered unescaped; presumably deliberate for this XSS test page - never reuse in a real application #}
+        </div>
+    </div>
+</div>
+<footer>
+</footer>
+</body>
+</html>

test_server.py ADDED Viewed

	@@ -0,0 +1,48 @@

+# -*- coding: utf-8 -*-
+from flask import (
+    Flask,
+    request,
+    render_template
+)
+app = Flask(__name__)
+@app.route("/", methods=["GET", "POST"])
+def home():
+    #################################################
+    # 로그 저장
+    #################################################
+    with open("access.log", "a", encoding="utf-8") as f:
+        log = (
+            f'{request.remote_addr} - '
+            f'"{request.method} {request.full_path} HTTP/1.1"\n'
+        )
+        f.write(log)
+        f.flush()
+    #################################################
+    # q 파라미터 출력
+    #################################################
+    q = request.args.get("q", "")
+    #################################################
+    # html render
+    #################################################
+    return render_template(
+        "index.html",
+        q=q
+    )
+# NOTE(review): host "0.0.0.0" listens on every network interface, while
+# index.html renders the q parameter unescaped. Presumably intended for an
+# isolated lab only - confirm before running on a shared network.
+if __name__ == "__main__":
+    app.run(
+        host="0.0.0.0",
+        port=8080,
+        debug=False
+    )

xss_detect_trained/config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "dtype": "float32",
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "transformers_version": "4.57.3",
+  "vocab_size": 30522
+}

xss_detect_trained/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f67b36e0aa339dc914d178da974f6507e6cf79cddf3c643f7a2a80d2cab7bbea
+size 267832560

xss_detect_trained/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

xss_detect_trained/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

xss_detect_trained/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,56 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "DistilBertTokenizer",
+  "unk_token": "[UNK]"
+}

xss_detect_trained/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:248f94b765fe79a616bfe9dad2106bf7c617d3f5011ce8a209c8997268e9b6ad
+size 5841

xss_detect_trained/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff