Upload 21 files
Browse files- .gitattributes +1 -0
- app.py +83 -0
- inference.py +293 -0
- requirements.txt +10 -0
- saved/masking/spacy_config.json +8 -0
- saved/masking/statistical_config.json +215 -0
- saved/model/feature_spec.json +0 -0
- saved/model/metrics.json +32 -0
- saved/model/model.json +0 -0
- saved/model/threshold.json +3 -0
- saved/model/training_config.json +26 -0
- saved/ngram_features/char_vectorizer.pkl +3 -0
- saved/ngram_features/ngram_config.json +18 -0
- saved/ngram_features/pos_vectorizer.pkl +3 -0
- saved/normalization/normalization_config.json +4 -0
- saved/tfidf_features/tfidf_config.json +23 -0
- saved/tfidf_features/vectorizer.pkl +3 -0
- static/app.js +124 -0
- static/cdf.png +0 -0
- static/image.png +3 -0
- static/styles.css +445 -0
- templates/index.html +122 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
static/image.png filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
|
| 3 |
+
import sys
|
| 4 |
+
from functools import lru_cache
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
from flask import Flask, jsonify, render_template, request
|
| 9 |
+
|
| 10 |
+
BASE_DIR = Path(__file__).resolve().parent
|
| 11 |
+
SRC_DIR = BASE_DIR / "src"
|
| 12 |
+
SAVED_DIR = BASE_DIR / "saved"
|
| 13 |
+
|
| 14 |
+
if str(SRC_DIR) not in sys.path:
|
| 15 |
+
sys.path.insert(0, str(SRC_DIR))
|
| 16 |
+
|
| 17 |
+
from inference import Inference
|
| 18 |
+
from helpers import load_json
|
| 19 |
+
|
| 20 |
+
app = Flask(__name__)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _compute_model_metrics(metrics_payload: dict[str, Any]) -> dict[str, float]:
|
| 25 |
+
test_metrics = metrics_payload.get("test") or {}
|
| 26 |
+
tn = float(test_metrics.get("tn", 0.0))
|
| 27 |
+
fp = float(test_metrics.get("fp", 0.0))
|
| 28 |
+
tp = float(test_metrics.get("tp", 0.0))
|
| 29 |
+
fn = float(test_metrics.get("fn", 0.0))
|
| 30 |
+
specificity = tn / (tn + fp) if (tn + fp) else 0.0
|
| 31 |
+
sensitivity = tp / (tp + fn) if (tp + fn) else float(test_metrics.get("recall", 0.0))
|
| 32 |
+
youden_j = sensitivity + specificity - 1.0
|
| 33 |
+
return {
|
| 34 |
+
"f1": float(test_metrics.get("f1", 0.0)),
|
| 35 |
+
"youden_j": round(youden_j, 5),
|
| 36 |
+
"auc_roc": float(test_metrics.get("roc_auc", 0.0)),
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
@lru_cache(maxsize=1)
def get_metrics() -> dict[str, float]:
    """Return the summarised model metrics, reading the file only on the first call.

    The result of the first call is memoised by ``lru_cache``; later calls
    return that same cached dict.
    """
    metrics_path = SAVED_DIR / "model" / "metrics.json"
    raw_metrics = load_json(metrics_path)
    return _compute_model_metrics(raw_metrics)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@lru_cache(maxsize=1)
def get_service() -> Inference:
    """Return the process-wide ``Inference`` instance, building it on first use.

    ``lru_cache`` keeps the single instance alive so the saved artifacts are
    loaded once rather than per request.
    """
    service = Inference(project_root=BASE_DIR)
    return service
|
| 48 |
+
|
| 49 |
+
def predict(text1: str, text2: str) -> dict[str, Any]:
    """Score one pair of texts and return the result as a plain dict.

    Delegates to the cached inference service and converts its result with
    ``to_dict()``.
    """
    outcome = get_service().predict(text1, text2)
    return outcome.to_dict()
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@app.route("/", methods=["GET"])
def home_route():
    """Serve the single-page UI rendered from ``index.html``."""
    page = render_template("index.html")
    return page
|
| 56 |
+
|
| 57 |
+
@app.route("/predict", methods=["POST"])
def predict_route():
    """Score the two texts posted as JSON and return the prediction.

    Expects a JSON object with string fields ``text1`` and ``text2``.

    Returns:
        * 200 with the prediction dict on success.
        * 400 when the body is not a JSON object, or when either text is
          missing, not a string, or blank after stripping.
        * 500 with an ``"error"`` message when inference raises.
    """
    # silent=True yields None for a malformed body instead of raising, so all
    # bad input takes the same JSON 400 path below.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    raw_text1 = data.get("text1")
    raw_text2 = data.get("text2")
    # Non-string values (numbers, lists, null) are treated as missing rather
    # than crashing on ``.strip()``.
    text1 = raw_text1.strip() if isinstance(raw_text1, str) else ""
    text2 = raw_text2.strip() if isinstance(raw_text2, str) else ""
    if not text1 or not text2:
        return jsonify({"error": "Both text fields are required."}), 400

    try:
        result = predict(text1, text2)
    except Exception as exc:
        # Top-level boundary: keep the traceback in the server log before
        # turning the failure into a JSON error response.
        app.logger.exception("Inference failed")
        return jsonify({"error": f"Inference failed: {exc}"}), 500
    return jsonify(result)
|
| 67 |
+
|
| 68 |
+
@app.route("/metrics", methods=["GET"])
def metrics_route():
    """Return the summarised model metrics as JSON, or a 500 error payload."""
    try:
        response = jsonify(get_metrics())
    except Exception as exc:
        failure = {"error": f"Failed to load metrics: {exc}"}
        return jsonify(failure), 500
    return response
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# Lightweight health-check endpoint, pinged by a cron job.
|
| 77 |
+
@app.route("/ping")
def ping():
    """Answer with a constant ``{"status": "ok"}`` body and HTTP 200."""
    health = {"status": "ok"}
    return health, 200
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
if __name__ == "__main__":
    import os

    # Debug mode turns on the interactive debugger, which lets anyone who can
    # reach the server run arbitrary code.  It is therefore opt-in through the
    # FLASK_DEBUG environment variable instead of being hard-coded on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled, port=5000)
|
inference.py
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from textstat import textstat
|
| 10 |
+
from xgboost import XGBClassifier
|
| 11 |
+
|
| 12 |
+
from helpers import load_json, load_pickle
|
| 13 |
+
|
| 14 |
+
from masking_regex import mask_split as regex_mask_split
|
| 15 |
+
from masking_spacy import Config as SpacyMaskingConfig, _apply_ner_mask, _build_linguistic_record, load_nlp_model
|
| 16 |
+
from normalization import normalize_text, Config as NormalizationConfig
|
| 17 |
+
from features_statistical import extract_split_statistics, Config as StatisticalConfig
|
| 18 |
+
from features_tfidf import record_to_tfidf_text, Config as TFIDFConfig
|
| 19 |
+
from features_ngram import Config as NGramConfig, build_space_free_char_ngrams, record_to_pos_sequence
|
| 20 |
+
from model_training import Config as TrainingConfig
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _coerce_tfidf_config(payload: dict[str, Any]) -> TFIDFConfig:
    """Build a ``TFIDFConfig`` from its JSON payload.

    JSON has no tuple type, so a list-valued ``ngram_range`` is converted to
    a tuple first.  The caller's dict is left untouched.
    """
    settings = dict(payload)
    ngram_range = settings.get("ngram_range")
    if isinstance(ngram_range, list):
        settings["ngram_range"] = tuple(ngram_range)
    return TFIDFConfig(**settings)
|
| 29 |
+
def _coerce_ngram_config(payload: dict[str, Any]) -> NGramConfig:
    """Build an ``NGramConfig`` from its JSON payload.

    A list-valued ``pos_ngram_range`` is converted to a tuple before the
    config is constructed.  The caller's dict is left untouched.
    """
    settings = dict(payload)
    pos_range = settings.get("pos_ngram_range")
    if isinstance(pos_range, list):
        settings["pos_ngram_range"] = tuple(pos_range)
    return NGramConfig(**settings)
|
| 34 |
+
def _coerce_statistical_config(payload: dict[str, Any]) -> StatisticalConfig:
    """Build a ``StatisticalConfig`` from its JSON payload.

    A list-valued ``phrase_role_dependency_labels`` is converted to a tuple
    before the config is constructed.  The caller's dict is left untouched.
    """
    settings = dict(payload)
    labels = settings.get("phrase_role_dependency_labels")
    if isinstance(labels, list):
        settings["phrase_role_dependency_labels"] = tuple(labels)
    return StatisticalConfig(**settings)
|
| 39 |
+
def _coerce_training_config(payload: dict[str, Any]) -> TrainingConfig:
    """Build a ``TrainingConfig`` from a shallow copy of its JSON payload."""
    return TrainingConfig(**dict(payload))
|
| 42 |
+
|
| 43 |
+
@dataclass(slots=True)
|
| 44 |
+
class PredictionResult:
|
| 45 |
+
probability_same: float
|
| 46 |
+
predicted_label: int
|
| 47 |
+
threshold: float
|
| 48 |
+
normalized_text1: str
|
| 49 |
+
normalized_text2: str
|
| 50 |
+
masked_text1: str
|
| 51 |
+
masked_text2: str
|
| 52 |
+
def to_dict(self) -> dict[str, Any]:
|
| 53 |
+
label = "Same author" if self.predicted_label == 1 else "Different author"
|
| 54 |
+
return {
|
| 55 |
+
"label": label,
|
| 56 |
+
"probability": self.probability_same,
|
| 57 |
+
"threshold": self.threshold,
|
| 58 |
+
"normalized_text1": self.normalized_text1,
|
| 59 |
+
"normalized_text2": self.normalized_text2,
|
| 60 |
+
"masked_text1": self.masked_text1,
|
| 61 |
+
"masked_text2": self.masked_text2,
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# STAND-ALONE PIPELINE TO PERFORM INFERENCE USING THE TRAINED MODEL
|
| 67 |
+
class Inference:
    """Stand-alone pipeline that scores one text pair with the trained model.

    Construction reads the saved configs, the three pickled vectorizers, the
    decision threshold, the feature specification and the metrics from
    ``<project_root>/saved``.  The XGBoost model and the spaCy pipeline are
    loaded lazily, on the first call that needs them.
    """

    def __init__(self, project_root: str | Path | None = None) -> None:
        """Load the saved artifacts needed for inference.

        Args:
            project_root: Directory that contains ``saved/``.  Defaults to
                the parent of the directory holding this file.
        """
        self.project_root = Path(project_root) if project_root is not None else Path(__file__).resolve().parents[1]
        self.saved_dir = self.project_root / "saved"
        self.model_dir = self.saved_dir / "model"

        # The steps follow src/pipeline.py, adapted for inference instead of
        # training.  Saved configs are reused with verbosity switched off.
        self.normalization_config = NormalizationConfig(
            **load_json(self.saved_dir / "normalization" / "normalization_config.json")
        )

        spacy_payload = load_json(self.saved_dir / "masking" / "spacy_config.json")
        spacy_payload["verbose"] = False
        spacy_payload["nlp_n_process"] = 1  # texts are processed one at a time here
        self.spacy_config = SpacyMaskingConfig(**spacy_payload)

        statistical_payload = load_json(self.saved_dir / "masking" / "statistical_config.json")
        statistical_payload["verbose"] = False
        self.statistical_config = _coerce_statistical_config(statistical_payload)

        tfidf_payload = load_json(self.saved_dir / "tfidf_features" / "tfidf_config.json")
        tfidf_payload["verbose"] = False
        self.tfidf_config = _coerce_tfidf_config(tfidf_payload)

        ngram_payload = load_json(self.saved_dir / "ngram_features" / "ngram_config.json")
        ngram_payload["verbose"] = False
        self.ngram_config = _coerce_ngram_config(ngram_payload)

        training_payload = load_json(self.saved_dir / "model" / "training_config.json")
        self.training_config = _coerce_training_config(training_payload)

        # NOTE(security): unpickling executes code embedded in the file, so
        # these artifacts must only ever come from a trusted source.
        self.tfidf_vectorizer = load_pickle(self.saved_dir / "tfidf_features" / "vectorizer.pkl")
        self.char_vectorizer = load_pickle(self.saved_dir / "ngram_features" / "char_vectorizer.pkl")
        self.pos_vectorizer = load_pickle(self.saved_dir / "ngram_features" / "pos_vectorizer.pkl")

        self.model = None  # loaded on demand by _load_model()
        self.threshold = float(load_json(self.model_dir / "threshold.json")["threshold"])

        feature_spec = load_json(self.model_dir / "feature_spec.json")
        self.suffixes: list[str] = feature_spec["suffixes"]
        # One (text1 column, text2 column) pair per feature suffix.
        self.pairwise_column_pairs = [(f"text1_{suffix}", f"text2_{suffix}") for suffix in self.suffixes]

        self.metrics = load_json(self.model_dir / "metrics.json")
        self.nlp = None  # loaded on demand by _mask_one_text()

    def _load_model(self) -> XGBClassifier:
        """Return the classifier, loading ``model.json`` on the first call.

        Raises:
            FileNotFoundError: If the model file does not exist.
        """
        if self.model is None:
            model_path = self.model_dir / "model.json"
            if not model_path.exists():
                raise FileNotFoundError(f"Missing '{model_path}'")
            model = XGBClassifier()
            model.load_model(model_path)
            self.model = model
        return self.model

    def _predict_positive_proba(self, X: np.ndarray) -> float:
        """Return the class-1 probability for the first row of *X*."""
        model = self._load_model()
        return float(model.predict_proba(X)[0, 1])

    def _mask_one_text(self, text: str) -> tuple[str, dict[str, Any]]:
        """Run spaCy on *text* and return ``(masked_text, linguistic_record)``.

        The spaCy pipeline is created on the first call and reused afterwards.
        """
        if self.nlp is None:
            self.nlp = load_nlp_model(config=self.spacy_config)
        doc = self.nlp(text)
        masked_text, _ = _apply_ner_mask(text, doc)
        record = _build_linguistic_record(doc)
        return masked_text, record

    def _build_pairwise_vector(self, feature_df: pd.DataFrame) -> np.ndarray:
        """Build the per-feature pairwise block from the first row of *feature_df*.

        For every suffix the output holds two consecutive float32 values: the
        absolute difference and the product of the ``text1_``/``text2_``
        values.  Columns absent from *feature_df* count as 0.0.

        Returns:
            Array of shape ``(1, 2 * len(self.pairwise_column_pairs))``.
        """
        row_values = feature_df.iloc[0].to_dict()
        left = np.array(
            [row_values.get(left_col, 0.0) for left_col, _ in self.pairwise_column_pairs],
            dtype=np.float32,
        )
        right = np.array(
            [row_values.get(right_col, 0.0) for _, right_col in self.pairwise_column_pairs],
            dtype=np.float32,
        )

        # Interleave the two operations: [|d0|, p0, |d1|, p1, ...].
        X_pair = np.empty((1, 2 * left.size), dtype=np.float32)
        X_pair[0, 0::2] = np.abs(left - right)
        X_pair[0, 1::2] = left * right
        return X_pair

    def _family_suffix_groups(self) -> dict[str, list[str]]:
        """Partition ``self.suffixes`` into the four feature families.

        Returns:
            Dict with the keys ``"tfidf"``, ``"char_ngrams"``,
            ``"pos_ngrams"`` and ``"scalar"`` (in that order); each list
            keeps the order the suffixes have in ``self.suffixes``.
        """
        groups: dict[str, list[str]] = {"tfidf": [], "char_ngrams": [], "pos_ngrams": [], "scalar": []}
        for suffix in self.suffixes:
            if suffix.startswith("tfidf_"):
                family = "tfidf"
            elif suffix.startswith("char") and "_tfidf_" in suffix:
                family = "char_ngrams"
            elif suffix.startswith("pos") and "_tfidf_" in suffix:
                family = "pos_ngrams"
            else:
                family = "scalar"
            groups[family].append(suffix)
        return groups

    def _build_global_pairwise_vector(self, feature_df: pd.DataFrame) -> np.ndarray:
        """Build the per-family similarity block from the first row of *feature_df*.

        Every non-empty feature family contributes three values: cosine
        similarity (0.0 when either vector has zero norm), L1 distance and
        L2 distance between the ``text1_`` and ``text2_`` vectors.  Columns
        absent from *feature_df* count as 0.0.

        Returns:
            float32 array of shape ``(1, 3 * number_of_non_empty_families)``.
        """
        row_values = feature_df.iloc[0].to_dict()
        values: list[float] = []

        for family_suffixes in self._family_suffix_groups().values():
            if not family_suffixes:
                continue

            left = np.array([row_values.get(f"text1_{suffix}", 0.0) for suffix in family_suffixes], dtype=np.float32)
            right = np.array([row_values.get(f"text2_{suffix}", 0.0) for suffix in family_suffixes], dtype=np.float32)

            denominator = float(np.linalg.norm(left) * np.linalg.norm(right))
            cosine = float(np.dot(left, right) / denominator) if denominator > 0 else 0.0
            diff = left - right
            l1 = float(np.abs(diff).sum())
            l2 = float(np.linalg.norm(diff))

            values.extend([cosine, l1, l2])

        return np.array(values, dtype=np.float32).reshape(1, -1)

    @staticmethod
    def _vectorized_frame(vectorizer: Any, docs: list[str], prefix: str) -> pd.DataFrame:
        """Transform *docs* and return a dense frame with ``<prefix>NNNNN`` columns."""
        matrix = vectorizer.transform(docs).toarray()
        columns = [f"{prefix}{index:05d}" for index in range(matrix.shape[1])]
        return pd.DataFrame(matrix, columns=columns)

    @staticmethod
    def _readability_scores(text: str) -> dict[str, float]:
        """Return the four textstat readability scores for *text*, keyed by feature name."""
        return {
            "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
            "gunning_fog": textstat.gunning_fog(text),
            "smog": textstat.smog_index(text),
            "coleman_liau": textstat.coleman_liau_index(text),
        }

    def _build_feature_frame(self, masked_df: pd.DataFrame, split_cache: dict[str, list[Any]]) -> pd.DataFrame:
        """Assemble the one-row frame of per-text features enabled in the training config.

        Each enabled feature family produces its own one-row frame.  The
        frames are joined column-wise once at the end, in the order:
        statistical, TF-IDF, character/POS n-grams, readability.
        """
        frames: list[pd.DataFrame] = []

        # ======== statistical features ===========
        if self.training_config.include_statistical:
            frames.append(
                extract_split_statistics(
                    masked_df,
                    split_cache=split_cache,
                    split_name="inference",
                    config=self.statistical_config,
                )
            )

        # ======== TF-IDF features ===========
        if self.training_config.include_tfidf:
            for column in ("text1", "text2"):
                docs = [record_to_tfidf_text(record, config=self.tfidf_config) for record in split_cache[column]]
                frames.append(self._vectorized_frame(self.tfidf_vectorizer, docs, f"{column}_tfidf_"))

        # ======== n-gram features ===========
        for column in ("text1", "text2"):
            if self.training_config.include_char_ngrams:
                char_n = self.ngram_config.char_ngram_n
                char_docs = [
                    " ".join(build_space_free_char_ngrams(text, n=char_n))
                    for text in masked_df[column].tolist()
                ]
                frames.append(
                    self._vectorized_frame(self.char_vectorizer, char_docs, f"{column}_char{char_n}_tfidf_")
                )

            if self.training_config.include_pos_ngrams:
                pos_docs = [" ".join(record_to_pos_sequence(record)) for record in split_cache[column]]
                frames.append(
                    self._vectorized_frame(
                        self.pos_vectorizer,
                        pos_docs,
                        f"{column}_pos{self.ngram_config.pos_ngram_range}_tfidf_",
                    )
                )

        # ======== readability features ===========
        if self.training_config.include_readability:
            readability_row: dict[str, float] = {}
            for column in ("text1", "text2"):
                scores = self._readability_scores(masked_df.iloc[0][column])
                for name, score in scores.items():
                    readability_row[f"{column}_readability_{name}"] = round(score, 5)
            frames.append(pd.DataFrame([readability_row]))

        if not frames:
            return pd.DataFrame()
        # A single concat avoids re-copying the growing wide frame once per family.
        return pd.concat([frame.reset_index(drop=True) for frame in frames], axis=1)

    def predict(self, text1: str, text2: str, threshold: float | None = None) -> PredictionResult:
        """Predict whether two user-supplied texts share an author.

        Args:
            text1: First text.
            text2: Second text.
            threshold: Optional decision threshold; the saved threshold is
                used when this is ``None``.

        Returns:
            A ``PredictionResult`` holding the probability, the label, the
            threshold used, and the normalized and masked versions of both texts.

        Raises:
            ValueError: If the training config enables neither the local nor
                the global pairwise block.
        """
        threshold_value = self.threshold if threshold is None else float(threshold)

        pair_df = pd.DataFrame([{
            "text1": normalize_text(text1, config=self.normalization_config),
            "text2": normalize_text(text2, config=self.normalization_config),
            "same": 0,
        }])

        regex_masked_df, _ = regex_mask_split(pair_df)

        # spaCy masking, one text at a time (nlp.pipe is not used here).
        masked_text1, record1 = self._mask_one_text(regex_masked_df.iloc[0]["text1"])
        masked_text2, record2 = self._mask_one_text(regex_masked_df.iloc[0]["text2"])

        # Combine regex and spaCy masking in one frame.
        masked_df = regex_masked_df.copy()
        masked_df.at[0, "text1"] = masked_text1
        masked_df.at[0, "text2"] = masked_text2

        split_cache = {"text1": [record1], "text2": [record2]}  # the linguistic cache
        feature_df = self._build_feature_frame(masked_df, split_cache)

        blocks: list[np.ndarray] = []
        if self.training_config.include_local_pairwise:
            blocks.append(self._build_pairwise_vector(feature_df))
        if self.training_config.include_global_pairwise:
            blocks.append(self._build_global_pairwise_vector(feature_df))
        if not blocks:
            raise ValueError("At least one of include_local_pairwise or include_global_pairwise must be True.")

        X = np.hstack(blocks).astype(np.float32)
        probability_same = self._predict_positive_proba(X)
        # 1 when the probability reaches the threshold (ties included), otherwise 0.
        predicted_label = int(probability_same >= threshold_value)

        return PredictionResult(
            probability_same=probability_same,
            predicted_label=predicted_label,
            threshold=threshold_value,
            normalized_text1=pair_df.iloc[0]["text1"],
            normalized_text2=pair_df.iloc[0]["text2"],
            masked_text1=masked_df.iloc[0]["text1"],
            masked_text2=masked_df.iloc[0]["text2"],
        )
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Flask==3.1.3
|
| 2 |
+
numpy==2.4.4
|
| 3 |
+
pandas==3.0.2
|
| 4 |
+
scipy==1.17.1
|
| 5 |
+
scikit-learn==1.8.0
|
| 6 |
+
xgboost==3.2.0
|
| 7 |
+
spacy==3.8.14
|
| 8 |
+
ftfy==6.3.1
|
| 9 |
+
textstat==0.7.13
|
| 10 |
+
tqdm==4.67.3
|
saved/masking/spacy_config.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"verbose": true,
|
| 3 |
+
"use_gpu": false,
|
| 4 |
+
"nlp_model": "en_core_web_lg",
|
| 5 |
+
"nlp_batch_size": 150,
|
| 6 |
+
"nlp_n_process": 2,
|
| 7 |
+
"checkpoint_dir": "/Users/salirafi/Documents/Personal Project/Text Similarity/saved/masking/spacy_checkpoints"
|
| 8 |
+
}
|
saved/masking/statistical_config.json
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"verbose": true,
|
| 3 |
+
"include_function_word_rate": true,
|
| 4 |
+
"exclude_placeholders_from_avg_word_length": true,
|
| 5 |
+
"phrase_role_dependency_labels": [
|
| 6 |
+
"acl",
|
| 7 |
+
"advcl",
|
| 8 |
+
"ccomp",
|
| 9 |
+
"pcomp",
|
| 10 |
+
"relcl",
|
| 11 |
+
"xcomp"
|
| 12 |
+
],
|
| 13 |
+
"pos_roles": {
|
| 14 |
+
"adjective": [
|
| 15 |
+
"ADJ"
|
| 16 |
+
],
|
| 17 |
+
"adposition": [
|
| 18 |
+
"ADP"
|
| 19 |
+
],
|
| 20 |
+
"adverb": [
|
| 21 |
+
"ADV"
|
| 22 |
+
],
|
| 23 |
+
"auxiliary": [
|
| 24 |
+
"AUX"
|
| 25 |
+
],
|
| 26 |
+
"conjunction": [
|
| 27 |
+
"CONJ"
|
| 28 |
+
],
|
| 29 |
+
"coordinating_conjunction": [
|
| 30 |
+
"CCONJ"
|
| 31 |
+
],
|
| 32 |
+
"determiner": [
|
| 33 |
+
"DET"
|
| 34 |
+
],
|
| 35 |
+
"interjection": [
|
| 36 |
+
"INTJ"
|
| 37 |
+
],
|
| 38 |
+
"noun": [
|
| 39 |
+
"NOUN"
|
| 40 |
+
],
|
| 41 |
+
"numeral": [
|
| 42 |
+
"NUM"
|
| 43 |
+
],
|
| 44 |
+
"particle": [
|
| 45 |
+
"PART"
|
| 46 |
+
],
|
| 47 |
+
"pronoun": [
|
| 48 |
+
"PRON"
|
| 49 |
+
],
|
| 50 |
+
"proper_noun": [
|
| 51 |
+
"PROPN"
|
| 52 |
+
],
|
| 53 |
+
"punctuation": [
|
| 54 |
+
"PUNCT"
|
| 55 |
+
],
|
| 56 |
+
"subordinating_conjunction": [
|
| 57 |
+
"SCONJ"
|
| 58 |
+
],
|
| 59 |
+
"symbol": [
|
| 60 |
+
"SYM"
|
| 61 |
+
],
|
| 62 |
+
"verb": [
|
| 63 |
+
"VERB"
|
| 64 |
+
],
|
| 65 |
+
"other": [
|
| 66 |
+
"X"
|
| 67 |
+
],
|
| 68 |
+
"space": [
|
| 69 |
+
"SPACE"
|
| 70 |
+
]
|
| 71 |
+
},
|
| 72 |
+
"dep_roles": {
|
| 73 |
+
"root": [
|
| 74 |
+
"ROOT"
|
| 75 |
+
],
|
| 76 |
+
"adjectival_clause": [
|
| 77 |
+
"acl"
|
| 78 |
+
],
|
| 79 |
+
"adjectival_complement": [
|
| 80 |
+
"acomp"
|
| 81 |
+
],
|
| 82 |
+
"adverbial_clause": [
|
| 83 |
+
"advcl"
|
| 84 |
+
],
|
| 85 |
+
"adverbial_modifier": [
|
| 86 |
+
"advmod"
|
| 87 |
+
],
|
| 88 |
+
"agent": [
|
| 89 |
+
"agent"
|
| 90 |
+
],
|
| 91 |
+
"adjectival_modifier": [
|
| 92 |
+
"amod"
|
| 93 |
+
],
|
| 94 |
+
"apposition": [
|
| 95 |
+
"appos"
|
| 96 |
+
],
|
| 97 |
+
"attribute": [
|
| 98 |
+
"attr"
|
| 99 |
+
],
|
| 100 |
+
"auxiliary": [
|
| 101 |
+
"aux"
|
| 102 |
+
],
|
| 103 |
+
"passive_auxiliary": [
|
| 104 |
+
"auxpass"
|
| 105 |
+
],
|
| 106 |
+
"case_marker": [
|
| 107 |
+
"case"
|
| 108 |
+
],
|
| 109 |
+
"coordinating_conjunction": [
|
| 110 |
+
"cc"
|
| 111 |
+
],
|
| 112 |
+
"clausal_complement": [
|
| 113 |
+
"ccomp"
|
| 114 |
+
],
|
| 115 |
+
"compound": [
|
| 116 |
+
"compound"
|
| 117 |
+
],
|
| 118 |
+
"conjunct": [
|
| 119 |
+
"conj"
|
| 120 |
+
],
|
| 121 |
+
"clausal_subject": [
|
| 122 |
+
"csubj"
|
| 123 |
+
],
|
| 124 |
+
"passive_clausal_subject": [
|
| 125 |
+
"csubjpass"
|
| 126 |
+
],
|
| 127 |
+
"dative": [
|
| 128 |
+
"dative"
|
| 129 |
+
],
|
| 130 |
+
"dependency_unspecified": [
|
| 131 |
+
"dep"
|
| 132 |
+
],
|
| 133 |
+
"determiner": [
|
| 134 |
+
"det"
|
| 135 |
+
],
|
| 136 |
+
"direct_object": [
|
| 137 |
+
"dobj"
|
| 138 |
+
],
|
| 139 |
+
"expletive": [
|
| 140 |
+
"expl"
|
| 141 |
+
],
|
| 142 |
+
"indirect_object": [
|
| 143 |
+
"iobj"
|
| 144 |
+
],
|
| 145 |
+
"interjection": [
|
| 146 |
+
"intj"
|
| 147 |
+
],
|
| 148 |
+
"marker": [
|
| 149 |
+
"mark"
|
| 150 |
+
],
|
| 151 |
+
"meta": [
|
| 152 |
+
"meta"
|
| 153 |
+
],
|
| 154 |
+
"negation": [
|
| 155 |
+
"neg"
|
| 156 |
+
],
|
| 157 |
+
"nominal_modifier": [
|
| 158 |
+
"nmod"
|
| 159 |
+
],
|
| 160 |
+
"noun_phrase_adverbial_modifier": [
|
| 161 |
+
"npadvmod"
|
| 162 |
+
],
|
| 163 |
+
"nominal_subject": [
|
| 164 |
+
"nsubj"
|
| 165 |
+
],
|
| 166 |
+
"passive_nominal_subject": [
|
| 167 |
+
"nsubjpass"
|
| 168 |
+
],
|
| 169 |
+
"numeric_modifier": [
|
| 170 |
+
"nummod"
|
| 171 |
+
],
|
| 172 |
+
"object": [
|
| 173 |
+
"obj"
|
| 174 |
+
],
|
| 175 |
+
"object_predicate": [
|
| 176 |
+
"oprd"
|
| 177 |
+
],
|
| 178 |
+
"parataxis": [
|
| 179 |
+
"parataxis"
|
| 180 |
+
],
|
| 181 |
+
"prepositional_complement": [
|
| 182 |
+
"pcomp"
|
| 183 |
+
],
|
| 184 |
+
"object_of_preposition": [
|
| 185 |
+
"pobj"
|
| 186 |
+
],
|
| 187 |
+
"possessive_modifier": [
|
| 188 |
+
"poss"
|
| 189 |
+
],
|
| 190 |
+
"preconjunct": [
|
| 191 |
+
"preconj"
|
| 192 |
+
],
|
| 193 |
+
"predeterminer": [
|
| 194 |
+
"predet"
|
| 195 |
+
],
|
| 196 |
+
"prepositional_modifier": [
|
| 197 |
+
"prep"
|
| 198 |
+
],
|
| 199 |
+
"particle": [
|
| 200 |
+
"prt"
|
| 201 |
+
],
|
| 202 |
+
"punctuation": [
|
| 203 |
+
"punct"
|
| 204 |
+
],
|
| 205 |
+
"quantifier_modifier": [
|
| 206 |
+
"quantmod"
|
| 207 |
+
],
|
| 208 |
+
"relative_clause_modifier": [
|
| 209 |
+
"relcl"
|
| 210 |
+
],
|
| 211 |
+
"open_clausal_complement": [
|
| 212 |
+
"xcomp"
|
| 213 |
+
]
|
| 214 |
+
}
|
| 215 |
+
}
|
saved/model/feature_spec.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
saved/model/metrics.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"validation": {
|
| 3 |
+
"threshold": 0.58,
|
| 4 |
+
"accuracy": 0.78197,
|
| 5 |
+
"precision": 0.8444,
|
| 6 |
+
"recall": 0.73342,
|
| 7 |
+
"f1": 0.785,
|
| 8 |
+
"balanced_accuracy": 0.78651,
|
| 9 |
+
"specificity": 0.8396,
|
| 10 |
+
"youden_j": 0.57302,
|
| 11 |
+
"roc_auc": 0.87306,
|
| 12 |
+
"tn": 2858,
|
| 13 |
+
"fp": 546,
|
| 14 |
+
"fn": 1077,
|
| 15 |
+
"tp": 2963
|
| 16 |
+
},
|
| 17 |
+
"test": {
|
| 18 |
+
"threshold": 0.58,
|
| 19 |
+
"accuracy": 0.79001,
|
| 20 |
+
"precision": 0.84661,
|
| 21 |
+
"recall": 0.73118,
|
| 22 |
+
"f1": 0.78467,
|
| 23 |
+
"balanced_accuracy": 0.79288,
|
| 24 |
+
"specificity": 0.85459,
|
| 25 |
+
"youden_j": 0.58576,
|
| 26 |
+
"roc_auc": 0.87719,
|
| 27 |
+
"tn": 3009,
|
| 28 |
+
"fp": 512,
|
| 29 |
+
"fn": 1039,
|
| 30 |
+
"tp": 2826
|
| 31 |
+
}
|
| 32 |
+
}
|
saved/model/model.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
saved/model/threshold.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"threshold": 0.5799999833106995
|
| 3 |
+
}
|
saved/model/training_config.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"include_statistical": true,
|
| 3 |
+
"include_tfidf": true,
|
| 4 |
+
"include_char_ngrams": true,
|
| 5 |
+
"include_pos_ngrams": true,
|
| 6 |
+
"include_readability": true,
|
| 7 |
+
"include_local_pairwise": true,
|
| 8 |
+
"include_global_pairwise": true,
|
| 9 |
+
"threshold_metric": "youden_j",
|
| 10 |
+
"threshold_grid_step": 0.01,
|
| 11 |
+
"model_params": {
|
| 12 |
+
"objective": "binary:logistic",
|
| 13 |
+
"eval_metric": "logloss",
|
| 14 |
+
"n_estimators": 500,
|
| 15 |
+
"max_depth": 4,
|
| 16 |
+
"learning_rate": 0.05,
|
| 17 |
+
"subsample": 0.8,
|
| 18 |
+
"colsample_bytree": 0.3,
|
| 19 |
+
"min_child_weight": 3,
|
| 20 |
+
"reg_lambda": 5.0,
|
| 21 |
+
"reg_alpha": 1.0,
|
| 22 |
+
"random_state": 42,
|
| 23 |
+
"n_jobs": 2,
|
| 24 |
+
"tree_method": "hist"
|
| 25 |
+
}
|
| 26 |
+
}
|
saved/ngram_features/char_vectorizer.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ccb3af06aa45f1da5fe4526c3c7963390b24bc4f0499a825543062128569400
|
| 3 |
+
size 1724313
|
saved/ngram_features/ngram_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"verbose": true,
|
| 3 |
+
"char_ngram_n": 4,
|
| 4 |
+
"char_tfidf_min_df": 2,
|
| 5 |
+
"char_tfidf_max_df": 0.95,
|
| 6 |
+
"char_tfidf_max_features": 50000,
|
| 7 |
+
"pos_ngram_range": [
|
| 8 |
+
2,
|
| 9 |
+
3
|
| 10 |
+
],
|
| 11 |
+
"pos_tfidf_min_df": 2,
|
| 12 |
+
"pos_tfidf_max_df": 0.95,
|
| 13 |
+
"pos_tfidf_max_features": 5000,
|
| 14 |
+
"sublinear_tf": true,
|
| 15 |
+
"norm": "l2",
|
| 16 |
+
"include_readability": true,
|
| 17 |
+
"dense_output": true
|
| 18 |
+
}
|
saved/ngram_features/pos_vectorizer.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cb2c23ef3aaaa48c6c754a2c1e94eac5b1507282607258f48a81daa9f73bb88b
|
| 3 |
+
size 196682
|
saved/normalization/normalization_config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"verbose": true,
|
| 3 |
+
"unicode_form": "NFC"
|
| 4 |
+
}
|
saved/tfidf_features/tfidf_config.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"verbose": true,
|
| 3 |
+
"allowed_pos_tags": [
|
| 4 |
+
"NOUN",
|
| 5 |
+
"PROPN",
|
| 6 |
+
"VERB",
|
| 7 |
+
"ADJ",
|
| 8 |
+
"ADV",
|
| 9 |
+
"CONJ",
|
| 10 |
+
"AUX"
|
| 11 |
+
],
|
| 12 |
+
"min_token_length": 2,
|
| 13 |
+
"ngram_range": [
|
| 14 |
+
1,
|
| 15 |
+
2
|
| 16 |
+
],
|
| 17 |
+
"min_df": 2,
|
| 18 |
+
"max_df": 0.95,
|
| 19 |
+
"max_features": 25000,
|
| 20 |
+
"sublinear_tf": true,
|
| 21 |
+
"norm": "l2",
|
| 22 |
+
"dense_output": true
|
| 23 |
+
}
|
saved/tfidf_features/vectorizer.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ac5200b840b2db60e359d9818416187fcc8b190ed6b482d629a5e405f694f971
|
| 3 |
+
size 977747
|
static/app.js
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
const errorEl = document.getElementById("error");
|
| 2 |
+
const resultCard = document.getElementById("result-card");
|
| 3 |
+
const probabilityRingEl = document.getElementById("probability-ring");
|
| 4 |
+
const probabilityEl = document.getElementById("probability");
|
| 5 |
+
const classificationConfidenceEl = document.getElementById("classification-confidence");
|
| 6 |
+
const classificationDecisionEl = document.getElementById("classification-decision");
|
| 7 |
+
const buttonEl = document.getElementById("predict-btn");
|
| 8 |
+
const decisionThreshold = 0.58;
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
// number formatting
|
| 12 |
+
/**
 * Format a metric for display.
 *
 * @param {*} value - Raw value; anything `Number()` can convert is accepted.
 * @returns {string} The value with four decimal places, or "-" when the
 *   value is null/undefined or does not convert to a number.
 */
function formatNumber(value) {
  // Missing values are shown as a dash placeholder.
  if (value === null || value === undefined) {
    return "-";
  }

  const numeric = Number(value);
  if (Number.isNaN(numeric)) {
    return "-";
  }

  return numeric.toFixed(4);
}
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
/**
 * Display an error message in the error element.
 *
 * @param {string} message - Text to show to the user.
 */
function showError(message) {
  const { classList } = errorEl;
  errorEl.textContent = message;
  // Dropping the "hidden" class makes the element visible again.
  classList.remove("hidden");
}
|
| 21 |
+
/**
 * Empty the error element and hide it.
 */
function clearError() {
  const { classList } = errorEl;
  errorEl.textContent = "";
  classList.add("hidden");
}
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
// function metricRating(value) {
|
| 28 |
+
// const score = Number(value);
|
| 29 |
+
// if (Number.isNaN(score)) return "-";
|
| 30 |
+
// if (score >= 0.9) return "Very strong";
|
| 31 |
+
// if (score >= 0.8) return "Good";
|
| 32 |
+
// if (score >= 0.7) return "Fairly good";
|
| 33 |
+
// if (score >= 0.6) return "Moderate";
|
| 34 |
+
// return "Limited";
|
| 35 |
+
// }
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
/**
 * Map a probability to an HSL colour string.
 *
 * The input is clamped to [0, 1]; the hue runs linearly from 8 (red/orange)
 * at probability 0 to 134 (green) at probability 1.
 *
 * @param {*} probability - Value convertible with `Number()`.
 * @returns {string} CSS colour of the form `hsl(<hue> 72% 46%)`.
 */
function probabilityColor(probability) {
  const BASE_HUE = 8;
  const HUE_SPAN = 126;

  const raw = Number(probability);
  const clamped = Math.min(1, Math.max(0, raw));
  const hue = clamped * HUE_SPAN + BASE_HUE;

  return `hsl(${hue} 72% 46%)`;
}
|
| 43 |
+
|
| 44 |
+
/**
 * Describe how far a probability lies from the undecided middle.
 *
 * Bands are symmetric around 0.5 and checked from the outermost inward, so
 * the first matching band wins.
 *
 * @param {number} probability - Probability in [0, 1].
 * @returns {string} "Surely", "Very Likely", "Likely" or "Uncertain".
 */
function classificationConfidence(probability) {
  const bands = [
    { upper: 0.9, lower: 0.1, label: "Surely" },
    { upper: 0.75, lower: 0.25, label: "Very Likely" },
    { upper: 0.6, lower: 0.4, label: "Likely" },
  ];

  const hit = bands.find(
    ({ upper, lower }) => probability >= upper || probability <= lower
  );

  return hit ? hit.label : "Uncertain";
}
|
| 50 |
+
|
| 51 |
+
/**
 * Update the probability ring, the percentage label and (optionally) the
 * classification text.
 *
 * @param {*} probability - Value convertible with `Number()`; clamped to [0, 1].
 * @param {boolean} [showClassification=true] - When false, the confidence and
 *   decision labels are blanked instead of filled in.
 */
function renderProbability(probability, showClassification = true) {
  const clamped = Math.min(1, Math.max(0, Number(probability)));
  const isSameAuthor = clamped >= decisionThreshold;
  const ringStyle = probabilityRingEl.style;

  // The ring is drawn from these two CSS custom properties.
  ringStyle.setProperty("--ring-angle", `${(clamped * 360).toFixed(2)}deg`);
  ringStyle.setProperty("--ring-color", probabilityColor(clamped));
  probabilityEl.textContent = `${(clamped * 100).toFixed(1)}%`;

  // Always reset the decision colour before deciding whether to set it again.
  classificationDecisionEl.classList.remove("is-same", "is-different");

  if (!showClassification) {
    classificationConfidenceEl.textContent = "";
    classificationDecisionEl.textContent = "";
    return;
  }

  classificationConfidenceEl.textContent = classificationConfidence(clamped);
  classificationDecisionEl.textContent = isSameAuthor ? "Same author" : "Different author";
  classificationDecisionEl.classList.add(isSameAuthor ? "is-same" : "is-different");
}
|
| 73 |
+
|
| 74 |
+
/**
 * Fetch model metrics from `/metrics` and write them into the page.
 *
 * Failures (network error, non-2xx status, invalid JSON) are logged to the
 * console and leave the existing placeholder text untouched.
 */
async function loadMetrics() {
  // Write text only when the target element exists, so optional markup
  // can be removed from the template without breaking this function.
  const setText = (id, text) => {
    const el = document.getElementById(id);
    if (el) {
      el.textContent = text;
    }
  };

  try {
    const response = await fetch("/metrics");
    if (!response.ok) {
      throw new Error(`Metrics request failed with status ${response.status}`);
    }
    const metrics = await response.json();

    setText("metric-f1", formatNumber(metrics.f1));
    setText("metric-youden", formatNumber(metrics.youden_j));
    setText("metric-auc", formatNumber(metrics.auc_roc));

    // The rating helper and its target elements are optional. Calling the
    // helper unconditionally threw on every page load when it was not
    // defined, so only fill ratings in when the helper is available.
    if (typeof metricRating === "function") {
      setText("metric-f1-rating", metricRating(metrics.f1));
      setText("metric-youden-rating", metricRating(metrics.youden_j));
      setText("metric-auc-rating", metricRating(metrics.auc_roc));
    }
  } catch (error) {
    console.error("Failed to load metrics", error);
  }
}
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
/**
 * Click handler for the predict button.
 *
 * Reads both text areas, posts them as JSON to `/predict` and renders the
 * returned probability. Any failure is shown in the error element; the
 * button is re-enabled in every case.
 */
async function handlePredict() {
  clearError();

  const text1 = document.getElementById("text1").value.trim();
  const text2 = document.getElementById("text2").value.trim();

  if (!text1 || !text2) {
    showError("Please fill in both text fields.");
    return;
  }

  // Block repeated clicks while the request is in flight.
  buttonEl.disabled = true;
  buttonEl.textContent = "Running...";

  try {
    const response = await fetch("/predict", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ text1, text2 }),
    });

    // An error response may not carry a JSON body; do not let the parse
    // failure replace the user-facing message.
    let result = null;
    try {
      result = await response.json();
    } catch (parseError) {
      result = null;
    }

    if (!response.ok || !result || result.error) {
      throw new Error((result && result.error) || "Request failed.");
    }

    // Number(null) is 0, so a missing value must be rejected explicitly;
    // otherwise a bad payload would render as "0.0%" or "NaN%".
    const probability = Number(result.probability);
    if (result.probability == null || !Number.isFinite(probability)) {
      throw new Error("Server returned an invalid probability.");
    }

    renderProbability(probability);
  } catch (error) {
    showError(error.message || "Request failed.");
  } finally {
    buttonEl.disabled = false;
    buttonEl.textContent = "Predict";
  }
}
|
| 120 |
+
|
| 121 |
+
buttonEl.addEventListener("click", handlePredict);
|
| 122 |
+
|
| 123 |
+
renderProbability(0, false);
|
| 124 |
+
loadMetrics(); // always show performance metrics
|
static/cdf.png
ADDED
|
static/image.png
ADDED
|
Git LFS Details
|
static/styles.css
ADDED
|
@@ -0,0 +1,445 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
:root {
|
| 2 |
+
color-scheme: light;
|
| 3 |
+
--bg: #f7f7f5;
|
| 4 |
+
--panel: #ffffff;
|
| 5 |
+
--border: #e0dfd9;
|
| 6 |
+
--border-strong: #d4d4d0;
|
| 7 |
+
--text: #1a1a1a;
|
| 8 |
+
--muted: #6b6b6b;
|
| 9 |
+
--muted-soft: #9b9b93;
|
| 10 |
+
--success: #1a7f4b;
|
| 11 |
+
--danger: #b84040;
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
* {
|
| 15 |
+
box-sizing: border-box;
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
/* body {
|
| 19 |
+
margin: 0;
|
| 20 |
+
min-height: 100vh;
|
| 21 |
+
background: #ffffff;
|
| 22 |
+
color: var(--text);
|
| 23 |
+
font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
|
| 24 |
+
} */
|
| 25 |
+
body {
|
| 26 |
+
margin: 0;
|
| 27 |
+
min-height: 100vh;
|
| 28 |
+
background: #f6f3ee;
|
| 29 |
+
color: var(--text);
|
| 30 |
+
font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
.page {
|
| 36 |
+
width: 100%;
|
| 37 |
+
max-width: 1700px;
|
| 38 |
+
margin: 0 auto;
|
| 39 |
+
padding: 28px 48px 36px;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
.header-bar {
|
| 43 |
+
width: 100%;
|
| 44 |
+
padding: 10px 30px;
|
| 45 |
+
background:
|
| 46 |
+
linear-gradient(180deg, rgba(255, 253, 250, 0.644) 0%, rgba(246, 243, 238, 0.72) 100%),
|
| 47 |
+
linear-gradient(90deg, #c5c1ba 0%, #cfc4b1 52%, #dfe8e2 100%);
|
| 48 |
+
border-bottom: 1px solid rgba(76, 76, 74, 0.202);
|
| 49 |
+
box-shadow: 0 14px 28px rgba(149, 149, 149, 0.08);
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
.header-inner {
|
| 53 |
+
width: 100%;
|
| 54 |
+
max-width: 1540px;
|
| 55 |
+
margin: 0 auto;
|
| 56 |
+
display: flex;
|
| 57 |
+
align-items: baseline;
|
| 58 |
+
gap: 18px;
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
.app-shell {
|
| 62 |
+
width: 100%;
|
| 63 |
+
max-width: 1540px;
|
| 64 |
+
margin: 0 auto;
|
| 65 |
+
min-height: auto;
|
| 66 |
+
display: grid;
|
| 67 |
+
grid-template-columns: minmax(960px, 1fr) 420px;
|
| 68 |
+
gap: 44px;
|
| 69 |
+
align-items: start;
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
.main-pane,
|
| 73 |
+
.side-pane {
|
| 74 |
+
display: flex;
|
| 75 |
+
flex-direction: column;
|
| 76 |
+
gap: 22px;
|
| 77 |
+
}
|
| 78 |
+
.side-pane {
|
| 79 |
+
padding-top: 0;
|
| 80 |
+
min-height: 100%;
|
| 81 |
+
justify-content: center;
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
.title {
  margin: 0;
  font-size: 39px;
  font-weight: 500;
  /* Named faces first; the generic `serif` keyword always matches, so it
     must come last or the named faces are never reached. */
  font-family: Georgia, "Times New Roman", Times, serif;
  letter-spacing: 0.02em;
  white-space: nowrap;
}
|
| 93 |
+
|
| 94 |
+
.subtitle {
|
| 95 |
+
margin: 0;
|
| 96 |
+
max-width: 62ch;
|
| 97 |
+
font-size: 15px;
|
| 98 |
+
color: var(--muted);
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
.card {
|
| 102 |
+
background: color-mix(in srgb, var(--panel) 88%, #faf6ef 12%);
|
| 103 |
+
border: 1px solid var(--border);
|
| 104 |
+
border-radius: 18px;
|
| 105 |
+
padding: 28px 28px 24px;
|
| 106 |
+
box-shadow: 0 18px 48px rgba(48, 42, 31, 0.06);
|
| 107 |
+
}
|
| 108 |
+
#result-card {
|
| 109 |
+
background: transparent;
|
| 110 |
+
border: none;
|
| 111 |
+
box-shadow: none;
|
| 112 |
+
padding: 8px 0 0;
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
.bottom-panel {
|
| 117 |
+
width: 100%;
|
| 118 |
+
max-width: 1540px;
|
| 119 |
+
margin: 28px auto 0;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
.workspace-card {
|
| 123 |
+
padding: 0 0 18px;
|
| 124 |
+
background: transparent;
|
| 125 |
+
border: none;
|
| 126 |
+
box-shadow: none;
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
.section-title {
|
| 130 |
+
margin: 0 0 8px;
|
| 131 |
+
font-size: 16px;
|
| 132 |
+
font-weight: 600;
|
| 133 |
+
letter-spacing: 0.08em;
|
| 134 |
+
text-transform: uppercase;
|
| 135 |
+
color: var(--muted);
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
.card-note {
|
| 139 |
+
margin: -2px 0 10px;
|
| 140 |
+
font-size: 14px;
|
| 141 |
+
color: var(--muted-soft);
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
.stack {
|
| 145 |
+
display: grid;
|
| 146 |
+
grid-template-columns: minmax(0, 1fr) minmax(0, 1fr);
|
| 147 |
+
gap: 18px;
|
| 148 |
+
margin-bottom: 18px;
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
.input-block {
|
| 152 |
+
display: flex;
|
| 153 |
+
flex-direction: column;
|
| 154 |
+
gap: 8px;
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
.label {
|
| 158 |
+
font-size: 13px;
|
| 159 |
+
font-weight: 600;
|
| 160 |
+
letter-spacing: 0.07em;
|
| 161 |
+
text-transform: uppercase;
|
| 162 |
+
color: var(--muted);
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
textarea {
|
| 167 |
+
width: 100%;
|
| 168 |
+
min-height: 450px;
|
| 169 |
+
padding: 16px 18px;
|
| 170 |
+
font-size: 16px;
|
| 171 |
+
line-height: 1.55;
|
| 172 |
+
font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
|
| 173 |
+
border: 1px solid var(--border-strong);
|
| 174 |
+
border-radius: 14px;
|
| 175 |
+
resize: vertical;
|
| 176 |
+
background: rgba(255, 255, 255, 0.82);
|
| 177 |
+
color: var(--text);
|
| 178 |
+
outline: none;
|
| 179 |
+
transition: border-color 140ms ease, box-shadow 140ms ease, background-color 140ms ease;
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
textarea:focus {
|
| 184 |
+
border-color: #a48b68;
|
| 185 |
+
background: #fffdfa;
|
| 186 |
+
box-shadow: 0 0 0 4px rgba(164, 139, 104, 0.12);
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
button {
|
| 190 |
+
display: inline-flex;
|
| 191 |
+
align-items: center;
|
| 192 |
+
justify-content: center;
|
| 193 |
+
width: auto;
|
| 194 |
+
min-width: 146px;
|
| 195 |
+
padding: 14px 22px;
|
| 196 |
+
font-size: 16px;
|
| 197 |
+
font-weight: 500;
|
| 198 |
+
letter-spacing: 0.04em;
|
| 199 |
+
background: var(--text);
|
| 200 |
+
color: #ffffff;
|
| 201 |
+
border: none;
|
| 202 |
+
border-radius: 999px;
|
| 203 |
+
cursor: pointer;
|
| 204 |
+
box-shadow: 0 10px 24px rgba(26, 26, 26, 0.18);
|
| 205 |
+
transition: transform 140ms ease, box-shadow 140ms ease, background-color 140ms ease;
|
| 206 |
+
}
|
| 207 |
+
button:hover:not(:disabled) {
|
| 208 |
+
transform: translateY(-1px);
|
| 209 |
+
box-shadow: 0 14px 28px rgba(26, 26, 26, 0.2);
|
| 210 |
+
}
|
| 211 |
+
button:disabled {
|
| 212 |
+
background: var(--muted-soft);
|
| 213 |
+
cursor: not-allowed;
|
| 214 |
+
box-shadow: none;
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
.error-msg {
|
| 218 |
+
margin: 0 0 8px;
|
| 219 |
+
font-size: 13px;
|
| 220 |
+
color: var(--danger);
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
/* hide component */
|
| 226 |
+
.hidden {
|
| 227 |
+
display: none;
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
.result-visual {
|
| 232 |
+
display: flex;
|
| 233 |
+
align-items: center;
|
| 234 |
+
gap: 28px;
|
| 235 |
+
padding: 4px 0 0;
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
.result-meter {
|
| 239 |
+
display: flex;
|
| 240 |
+
flex-direction: column;
|
| 241 |
+
align-items: center;
|
| 242 |
+
gap: 14px;
|
| 243 |
+
flex-shrink: 0;
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
+
.probability-ring {
|
| 247 |
+
--ring-angle: 0deg;
|
| 248 |
+
--ring-color: hsl(8 72% 46%);
|
| 249 |
+
width: 138px;
|
| 250 |
+
aspect-ratio: 1;
|
| 251 |
+
border-radius: 50%;
|
| 252 |
+
background:
|
| 253 |
+
conic-gradient(var(--ring-color) 0 var(--ring-angle), #e8e8e4a4 var(--ring-angle) 360deg);
|
| 254 |
+
display: grid;
|
| 255 |
+
place-items: center;
|
| 256 |
+
flex-shrink: 0;
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
.probability-ring-inner {
|
| 260 |
+
width: calc(100% - 16px);
|
| 261 |
+
height: calc(100% - 16px);
|
| 262 |
+
border-radius: 50%;
|
| 263 |
+
background: #f6f3ee;
|
| 264 |
+
color: var(--text);
|
| 265 |
+
display: grid;
|
| 266 |
+
place-items: center;
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
.probability-value {
|
| 270 |
+
font-size: 28px;
|
| 271 |
+
font-weight: 600;
|
| 272 |
+
letter-spacing: -0.03em;
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
+
.result-copy {
|
| 276 |
+
display: flex;
|
| 277 |
+
flex-direction: column;
|
| 278 |
+
gap: 8px;
|
| 279 |
+
max-width: 360px;
|
| 280 |
+
}
|
| 281 |
+
.result-eyebrow {
|
| 282 |
+
margin: 0;
|
| 283 |
+
font-size: 14px;
|
| 284 |
+
font-weight: 600;
|
| 285 |
+
letter-spacing: 0.08em;
|
| 286 |
+
text-transform: uppercase;
|
| 287 |
+
color: var(--muted);
|
| 288 |
+
}
|
| 289 |
+
|
| 290 |
+
.result-description {
|
| 291 |
+
margin: 0;
|
| 292 |
+
font-size: 16px;
|
| 293 |
+
line-height: 1.5;
|
| 294 |
+
color: var(--muted);
|
| 295 |
+
}
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
.classification {
|
| 300 |
+
display: flex;
|
| 301 |
+
flex-direction: column;
|
| 302 |
+
align-items: center;
|
| 303 |
+
gap: 6px;
|
| 304 |
+
text-align: center;
|
| 305 |
+
}
|
| 306 |
+
.classification-decision {
|
| 307 |
+
margin: 0;
|
| 308 |
+
font-size: 22px;
|
| 309 |
+
font-weight: 600;
|
| 310 |
+
color: var(--text);
|
| 311 |
+
}
|
| 312 |
+
/* green for same author */
|
| 313 |
+
.classification-decision.is-same {
|
| 314 |
+
color: var(--success);
|
| 315 |
+
}
|
| 316 |
+
/* red for different author */
|
| 317 |
+
.classification-decision.is-different {
|
| 318 |
+
color: var(--danger);
|
| 319 |
+
}
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
.sidebar-stats {
|
| 324 |
+
display: grid;
|
| 325 |
+
grid-template-columns: repeat(3, minmax(210px, 250px));
|
| 326 |
+
justify-content: start;
|
| 327 |
+
gap: 10px;
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
.performance-card {
|
| 332 |
+
background: transparent;
|
| 333 |
+
border: none;
|
| 334 |
+
box-shadow: none;
|
| 335 |
+
padding: 0;
|
| 336 |
+
}
|
| 337 |
+
|
| 338 |
+
.metric-main {
|
| 339 |
+
display: flex;
|
| 340 |
+
flex-direction: column;
|
| 341 |
+
gap: 8px;
|
| 342 |
+
align-items: flex-start;
|
| 343 |
+
}
|
| 344 |
+
|
| 345 |
+
.metric-summary {
|
| 346 |
+
display: flex;
|
| 347 |
+
flex-direction: column;
|
| 348 |
+
gap: 3px;
|
| 349 |
+
}
|
| 350 |
+
.meta-stat {
|
| 351 |
+
display: flex;
|
| 352 |
+
flex-direction: column;
|
| 353 |
+
gap: 2px;
|
| 354 |
+
padding-right: 0;
|
| 355 |
+
}
|
| 356 |
+
.meta-value {
|
| 357 |
+
font-size: 22px;
|
| 358 |
+
letter-spacing: -0.2px;
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
.meta-label {
|
| 362 |
+
font-size: 13px;
|
| 363 |
+
color: var(--muted);
|
| 364 |
+
letter-spacing: 0.03em;
|
| 365 |
+
}
|
| 366 |
+
.metric-rating {
|
| 367 |
+
font-size: 13px;
|
| 368 |
+
font-weight: 600;
|
| 369 |
+
color: #8b7355;
|
| 370 |
+
letter-spacing: 0.02em;
|
| 371 |
+
}
|
| 372 |
+
|
| 373 |
+
.metric-note {
|
| 374 |
+
margin: 0;
|
| 375 |
+
font-size: 11px;
|
| 376 |
+
line-height: 1.45;
|
| 377 |
+
color: var(--muted-soft);
|
| 378 |
+
}
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
@media (max-width: 760px) {
|
| 383 |
+
.app-shell {
|
| 384 |
+
grid-template-columns: 1fr;
|
| 385 |
+
gap: 18px;
|
| 386 |
+
}
|
| 387 |
+
.side-pane {
|
| 388 |
+
padding-top: 0;
|
| 389 |
+
}
|
| 390 |
+
.bottom-panel {
|
| 391 |
+
margin-top: 20px;
|
| 392 |
+
}
|
| 393 |
+
.result-visual {
|
| 394 |
+
align-items: flex-start;
|
| 395 |
+
}
|
| 396 |
+
|
| 397 |
+
.stack {
|
| 398 |
+
grid-template-columns: 1fr;
|
| 399 |
+
}
|
| 400 |
+
.sidebar-stats {
|
| 401 |
+
grid-template-columns: 1fr;
|
| 402 |
+
gap: 14px;
|
| 403 |
+
justify-content: stretch;
|
| 404 |
+
}
|
| 405 |
+
.metric-main {
|
| 406 |
+
gap: 6px;
|
| 407 |
+
}
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
@media (max-width: 720px) {
|
| 411 |
+
.page {
|
| 412 |
+
padding: 20px 16px 28px;
|
| 413 |
+
}
|
| 414 |
+
.card {
|
| 415 |
+
padding: 20px 18px;
|
| 416 |
+
}
|
| 417 |
+
.header-bar {
|
| 418 |
+
padding: 12px 30px;
|
| 419 |
+
}
|
| 420 |
+
.header-inner {
|
| 421 |
+
align-items: flex-start;
|
| 422 |
+
flex-direction: column;
|
| 423 |
+
gap: 3px;
|
| 424 |
+
}
|
| 425 |
+
.title {
|
| 426 |
+
font-size: 26px;
|
| 427 |
+
white-space: normal;
|
| 428 |
+
}
|
| 429 |
+
textarea {
|
| 430 |
+
min-height: 220px;
|
| 431 |
+
}
|
| 432 |
+
.result-visual {
|
| 433 |
+
flex-direction: column;
|
| 434 |
+
gap: 16px;
|
| 435 |
+
}
|
| 436 |
+
.probability-ring {
|
| 437 |
+
width: 120px;
|
| 438 |
+
}
|
| 439 |
+
.probability-value {
|
| 440 |
+
font-size: 26px;
|
| 441 |
+
}
|
| 442 |
+
|
| 443 |
+
.metric-main {
|
| 444 |
+
gap: 6px;
|
| 445 |
+
}}
|
templates/index.html
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!doctype html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
|
| 4 |
+
<head>
|
| 5 |
+
<meta charset="utf-8">
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 7 |
+
<title>AVeri</title>
|
| 8 |
+
<link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}">
|
| 9 |
+
</head>
|
| 10 |
+
|
| 11 |
+
<body>
|
| 12 |
+
|
| 13 |
+
<header class="header-bar">
|
| 14 |
+
<div class="header-inner">
|
| 15 |
+
<h1 class="title">AVeri: Author Verifier</h1>
|
| 16 |
+
<!-- <p class="subtitle">Are your two texts written by the same author?</p> -->
|
| 17 |
+
</div>
|
| 18 |
+
</header>
|
| 19 |
+
|
| 20 |
+
<div class="page">
|
| 21 |
+
<div class="app-shell">
|
| 22 |
+
<main class="main-pane">
|
| 23 |
+
|
| 24 |
+
<section class="card workspace-card">
|
| 25 |
+
<div class="stack">
|
| 26 |
+
<div class="input-block">
|
| 27 |
+
<label class="label" for="text1">Text A</label>
|
| 28 |
+
<textarea id="text1" placeholder="Paste first text here..."></textarea>
|
| 29 |
+
</div>
|
| 30 |
+
|
| 31 |
+
<div class="input-block">
|
| 32 |
+
<label class="label" for="text2">Text B</label>
|
| 33 |
+
<textarea id="text2" placeholder="Paste second text here..."></textarea>
|
| 34 |
+
</div>
|
| 35 |
+
</div>
|
| 36 |
+
|
| 37 |
+
<p id="error" class="error-msg hidden"></p>
|
| 38 |
+
<button id="predict-btn" type="button">Predict</button>
|
| 39 |
+
</section>
|
| 40 |
+
|
| 41 |
+
</main>
|
| 42 |
+
|
| 43 |
+
<aside class="side-pane">
|
| 44 |
+
|
| 45 |
+
<section id="result-card" class="card">
|
| 46 |
+
<h2 class="section-title">Result</h2>
|
| 47 |
+
<div class="result-visual">
|
| 48 |
+
|
| 49 |
+
<div class="result-meter">
  <!-- The ring holds the only rendering of the probability value, so it
       must stay visible to assistive technology (no aria-hidden here). -->
  <div id="probability-ring" class="probability-ring">
    <div class="probability-ring-inner">
      <span id="probability" class="probability-value">0.0%</span>
    </div>
  </div>

  <div class="classification">
    <p id="classification-confidence" class="result-eyebrow"></p>
    <p id="classification-decision" class="classification-decision"></p>
  </div>
</div>
|
| 61 |
+
|
| 62 |
+
<div class="result-copy">
|
| 63 |
+
<p class="result-eyebrow">Probability</p>
|
| 64 |
+
<p class="result-description">Probability both texts were written by the same author.</p>
|
| 65 |
+
</div>
|
| 66 |
+
|
| 67 |
+
</div>
|
| 68 |
+
</section>
|
| 69 |
+
|
| 70 |
+
</aside>
|
| 71 |
+
</div>
|
| 72 |
+
|
| 73 |
+
<section class="bottom-panel">
|
| 74 |
+
<section class="card performance-card">
|
| 75 |
+
|
| 76 |
+
<h2 class="section-title">Model Performance</h2>
|
| 77 |
+
|
| 78 |
+
<div class="sidebar-stats">
|
| 79 |
+
|
| 80 |
+
<div class="meta-stat">
|
| 81 |
+
<div class="metric-main">
|
| 82 |
+
<div class="metric-summary">
|
| 83 |
+
<span id="metric-f1" class="meta-value">-</span>
|
| 84 |
+
<span class="meta-label">F1 Score</span>
|
| 85 |
+
<!-- <span id="metric-f1-rating" class="metric-rating">-</span> -->
|
| 86 |
+
</div>
|
| 87 |
+
<p class="metric-note">The model is good at prediction without raising too many false alarms.</p>
|
| 88 |
+
</div>
|
| 89 |
+
</div>
|
| 90 |
+
|
| 91 |
+
<div class="meta-stat">
|
| 92 |
+
<div class="metric-main">
|
| 93 |
+
<div class="metric-summary">
|
| 94 |
+
<span id="metric-youden" class="meta-value">-</span>
|
| 95 |
+
<span class="meta-label">Youden J</span>
|
| 96 |
+
<!-- <span id="metric-youden-rating" class="metric-rating">-</span> -->
|
| 97 |
+
</div>
|
| 98 |
+
<p class="metric-note">The model has moderate ability to correctly separate same- and different-author cases.</p>
|
| 99 |
+
</div>
|
| 100 |
+
</div>
|
| 101 |
+
|
| 102 |
+
<div class="meta-stat">
|
| 103 |
+
<div class="metric-main">
|
| 104 |
+
<div class="metric-summary">
|
| 105 |
+
<span id="metric-auc" class="meta-value">-</span>
|
| 106 |
+
<span class="meta-label">AUC-ROC</span>
|
| 107 |
+
<!-- <span id="metric-auc-rating" class="metric-rating">-</span> -->
|
| 108 |
+
</div>
|
| 109 |
+
<p class="metric-note">The model is strongly reliable at distinguishing between same- and different-author cases overall.</p>
|
| 110 |
+
</div>
|
| 111 |
+
</div>
|
| 112 |
+
|
| 113 |
+
</div>
|
| 114 |
+
|
| 115 |
+
</section>
|
| 116 |
+
</section>
|
| 117 |
+
|
| 118 |
+
</div>
|
| 119 |
+
|
| 120 |
+
<script src="{{ url_for('static', filename='app.js') }}"></script>
|
| 121 |
+
</body>
|
| 122 |
+
</html>
|