salirafi commited on
Commit
ad19081
·
verified ·
1 Parent(s): 337a134

Upload 21 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ static/image.png filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import sys
4
+ from functools import lru_cache
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from flask import Flask, jsonify, render_template, request
9
+
10
# Absolute directory that contains this file; the paths below are derived from it.
+ BASE_DIR = Path(__file__).resolve().parent
11
# Directory that is prepended to sys.path below.
+ SRC_DIR = BASE_DIR / "src"
12
# Root of the saved artifacts; get_metrics() reads model/metrics.json beneath it.
+ SAVED_DIR = BASE_DIR / "saved"
13
+
14
# Put SRC_DIR at the front of sys.path (only once) before the project imports that follow.
+ if str(SRC_DIR) not in sys.path:
15
+ sys.path.insert(0, str(SRC_DIR))
16
+
17
# Project-local imports; they are placed after the sys.path adjustment above.
+ from inference import Inference
18
+ from helpers import load_json
19
+
20
# Flask application object that the route decorators below register on.
+ app = Flask(__name__)
21
+
22
+
23
+
24
+ def _compute_model_metrics(metrics_payload: dict[str, Any]) -> dict[str, float]:
25
+ test_metrics = metrics_payload.get("test") or {}
26
+ tn = float(test_metrics.get("tn", 0.0))
27
+ fp = float(test_metrics.get("fp", 0.0))
28
+ tp = float(test_metrics.get("tp", 0.0))
29
+ fn = float(test_metrics.get("fn", 0.0))
30
+ specificity = tn / (tn + fp) if (tn + fp) else 0.0
31
+ sensitivity = tp / (tp + fn) if (tp + fn) else float(test_metrics.get("recall", 0.0))
32
+ youden_j = sensitivity + specificity - 1.0
33
+ return {
34
+ "f1": float(test_metrics.get("f1", 0.0)),
35
+ "youden_j": round(youden_j, 5),
36
+ "auc_roc": float(test_metrics.get("roc_auc", 0.0)),
37
+ }
38
+
39
@lru_cache(maxsize=1)
def get_metrics() -> dict[str, float]:
    """Return the headline metrics computed from ``saved/model/metrics.json``.

    The result is memoised, so the file is read on the first call only.
    """
    metrics_path = SAVED_DIR / "model" / "metrics.json"
    raw_payload = load_json(metrics_path)
    return _compute_model_metrics(raw_payload)
42
+
43
+
44
+
45
@lru_cache(maxsize=1)
def get_service() -> Inference:
    """Return the shared ``Inference`` instance, building it on the first call."""
    service = Inference(project_root=BASE_DIR)
    return service
48
+
49
def predict(text1: str, text2: str) -> dict[str, Any]:
    """Score a pair of texts with the shared service and return the result as a dict."""
    service = get_service()
    outcome = service.predict(text1, text2)
    return outcome.to_dict()
51
+
52
+
53
@app.route("/", methods=["GET"])
def home_route():
    """Render the ``index.html`` template for the root URL."""
    page = render_template("index.html")
    return page
56
+
57
@app.route("/predict", methods=["POST"])
def predict_route():
    """Score the two texts sent as a JSON body and return the prediction as JSON.

    Expects an object with string fields ``text1`` and ``text2``.

    Returns:
        200 with the prediction dict on success, 400 with ``{"error": ...}``
        when the body is not a JSON object or a text field is missing, blank or
        not a string, and 500 with ``{"error": ...}`` when inference raises.
    """
    # silent=True yields None for malformed JSON instead of raising, so every
    # client error is answered below with the same JSON error shape.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    raw_text1 = data.get("text1") or ""
    raw_text2 = data.get("text2") or ""
    if not isinstance(raw_text1, str) or not isinstance(raw_text2, str):
        return jsonify({"error": "Both text fields must be strings."}), 400

    text1 = raw_text1.strip()
    text2 = raw_text2.strip()
    if not text1 or not text2:
        return jsonify({"error": "Both text fields are required."}), 400

    try:
        result = predict(text1, text2)
    except Exception as exc:
        # Top-level boundary: keep the traceback in the server log as well.
        app.logger.exception("Inference failed")
        return jsonify({"error": f"Inference failed: {exc}"}), 500
    return jsonify(result)
67
+
68
@app.route("/metrics", methods=["GET"])
def metrics_route():
    """Return the cached model metrics as JSON, or a 500 JSON error if they cannot be produced."""
    try:
        payload = get_metrics()
        response = jsonify(payload)
    except Exception as exc:
        failure = {"error": f"Failed to load metrics: {exc}"}
        return jsonify(failure), 500
    return response
74
+
75
+
76
+ # ping for cron job
77
@app.route("/ping")
def ping():
    """Answer with a fixed ``{"status": "ok"}`` body and HTTP 200."""
    body = {"status": "ok"}
    return body, 200
80
+
81
+
82
if __name__ == "__main__":
    import os

    # The Werkzeug debugger lets anyone who can reach the server execute
    # arbitrary code, so debug mode is opt-in via FLASK_DEBUG instead of
    # being switched on unconditionally.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled, port=5000)
inference.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from textstat import textstat
10
+ from xgboost import XGBClassifier
11
+
12
+ from helpers import load_json, load_pickle
13
+
14
+ from masking_regex import mask_split as regex_mask_split
15
+ from masking_spacy import Config as SpacyMaskingConfig, _apply_ner_mask, _build_linguistic_record, load_nlp_model
16
+ from normalization import normalize_text, Config as NormalizationConfig
17
+ from features_statistical import extract_split_statistics, Config as StatisticalConfig
18
+ from features_tfidf import record_to_tfidf_text, Config as TFIDFConfig
19
+ from features_ngram import Config as NGramConfig, build_space_free_char_ngrams, record_to_pos_sequence
20
+ from model_training import Config as TrainingConfig
21
+
22
+
23
+
24
def _coerce_tfidf_config(payload: dict[str, Any]) -> TFIDFConfig:
    """Build a ``TFIDFConfig`` from a JSON payload, turning a list ``ngram_range`` into a tuple.

    The caller's dict is not modified; a shallow copy is used.
    """
    options = {**payload}
    ngram_range = options.get("ngram_range")
    if isinstance(ngram_range, list):
        options["ngram_range"] = tuple(ngram_range)
    return TFIDFConfig(**options)
29
def _coerce_ngram_config(payload: dict[str, Any]) -> NGramConfig:
    """Build an ``NGramConfig`` from a JSON payload, turning a list ``pos_ngram_range`` into a tuple.

    The caller's dict is not modified; a shallow copy is used.
    """
    options = {**payload}
    pos_range = options.get("pos_ngram_range")
    if isinstance(pos_range, list):
        options["pos_ngram_range"] = tuple(pos_range)
    return NGramConfig(**options)
34
def _coerce_statistical_config(payload: dict[str, Any]) -> StatisticalConfig:
    """Build a ``StatisticalConfig`` from a JSON payload.

    A list-valued ``phrase_role_dependency_labels`` entry is converted to a
    tuple. The caller's dict is not modified; a shallow copy is used.
    """
    options = {**payload}
    labels = options.get("phrase_role_dependency_labels")
    if isinstance(labels, list):
        options["phrase_role_dependency_labels"] = tuple(labels)
    return StatisticalConfig(**options)
39
def _coerce_training_config(payload: dict[str, Any]) -> TrainingConfig:
    """Build a ``TrainingConfig`` from a shallow copy of the JSON payload."""
    options = {**payload}
    return TrainingConfig(**options)
42
+
43
+ @dataclass(slots=True)
44
+ class PredictionResult:
45
+ probability_same: float
46
+ predicted_label: int
47
+ threshold: float
48
+ normalized_text1: str
49
+ normalized_text2: str
50
+ masked_text1: str
51
+ masked_text2: str
52
+ def to_dict(self) -> dict[str, Any]:
53
+ label = "Same author" if self.predicted_label == 1 else "Different author"
54
+ return {
55
+ "label": label,
56
+ "probability": self.probability_same,
57
+ "threshold": self.threshold,
58
+ "normalized_text1": self.normalized_text1,
59
+ "normalized_text2": self.normalized_text2,
60
+ "masked_text1": self.masked_text1,
61
+ "masked_text2": self.masked_text2,
62
+ }
63
+
64
+
65
+
66
+ # STAND-ALONE PIPELINE TO PERFORM INFERENCE USING THE TRAINED MODEL
67
class Inference:
    """Stand-alone pipeline that scores a pair of texts with the trained model.

    Construction loads the saved configs, fitted vectorizers, decision
    threshold and feature spec from ``<project_root>/saved``. The XGBoost model
    and the spaCy pipeline are loaded on first use.
    """

    def __init__(self, project_root: str | Path | None = None) -> None:
        """Load every saved artifact except the model and the spaCy pipeline.

        Args:
            project_root: Directory that contains ``saved/``. When omitted, the
                parent of the directory holding this file is used.
        """
        self.project_root = Path(project_root) if project_root is not None else Path(__file__).resolve().parents[1]
        self.saved_dir = self.project_root / "saved"
        self.model_dir = self.saved_dir / "model"

        # The pipeline follows what is done in src/pipeline.py but is adapted
        # to do inference instead of training.

        self.normalization_config = NormalizationConfig(
            **load_json(self.saved_dir / "normalization" / "normalization_config.json")
        )

        # Inference handles one text at a time, so the saved settings are
        # overridden to run quietly in a single process.
        spacy_payload = load_json(self.saved_dir / "masking" / "spacy_config.json")
        spacy_payload["verbose"] = False
        spacy_payload["nlp_n_process"] = 1
        self.spacy_config = SpacyMaskingConfig(**spacy_payload)

        statistical_payload = load_json(self.saved_dir / "masking" / "statistical_config.json")
        statistical_payload["verbose"] = False
        self.statistical_config = _coerce_statistical_config(statistical_payload)

        tfidf_payload = load_json(self.saved_dir / "tfidf_features" / "tfidf_config.json")
        tfidf_payload["verbose"] = False
        self.tfidf_config = _coerce_tfidf_config(tfidf_payload)

        ngram_payload = load_json(self.saved_dir / "ngram_features" / "ngram_config.json")
        ngram_payload["verbose"] = False
        self.ngram_config = _coerce_ngram_config(ngram_payload)

        training_payload = load_json(self.saved_dir / "model" / "training_config.json")
        self.training_config = _coerce_training_config(training_payload)

        # NOTE(review): these are unpickled from disk; only load artifacts that
        # come from a trusted source.
        self.tfidf_vectorizer = load_pickle(self.saved_dir / "tfidf_features" / "vectorizer.pkl")
        self.char_vectorizer = load_pickle(self.saved_dir / "ngram_features" / "char_vectorizer.pkl")
        self.pos_vectorizer = load_pickle(self.saved_dir / "ngram_features" / "pos_vectorizer.pkl")

        self.model = None  # filled by _load_model() on first prediction
        self.threshold = float(load_json(self.model_dir / "threshold.json")["threshold"])

        feature_spec = load_json(self.model_dir / "feature_spec.json")
        self.suffixes: list[str] = feature_spec["suffixes"]
        # self.pairwise_operations: tuple[str, ...] = tuple(feature_spec["pairwise_operations"])
        self.pairwise_column_pairs = [(f"text1_{suffix}", f"text2_{suffix}") for suffix in self.suffixes]

        self.metrics = load_json(self.model_dir / "metrics.json")
        self.nlp = None  # filled by _mask_one_text() on first use

    def _load_model(self) -> XGBClassifier:
        """Return the XGBoost classifier, loading ``model.json`` on the first call.

        Raises:
            FileNotFoundError: If ``model.json`` is missing from the model directory.
        """
        if self.model is None:
            model_path = self.model_dir / "model.json"
            if not model_path.exists():
                raise FileNotFoundError(f"Missing '{model_path}'")
            model = XGBClassifier()
            model.load_model(model_path)
            self.model = model
        return self.model

    def _predict_positive_proba(self, X: np.ndarray) -> float:
        """Return the positive-class probability for the first row of ``X``."""
        model = self._load_model()
        return float(model.predict_proba(X)[0, 1])

    def _mask_one_text(self, text: str) -> tuple[str, dict[str, Any]]:
        """Run spaCy on ``text`` and return the NER-masked text and its linguistic record.

        The spaCy pipeline is loaded on the first call and reused afterwards.
        """
        if self.nlp is None:
            self.nlp = load_nlp_model(config=self.spacy_config)
        doc = self.nlp(text)
        masked_text, _ = _apply_ner_mask(text, doc)
        record = _build_linguistic_record(doc)
        return masked_text, record

    def _build_pairwise_vector(self, feature_df: pd.DataFrame) -> np.ndarray:
        """Build the local pairwise block from the first row of ``feature_df``.

        For every suffix in the feature spec two values are emitted, in this
        order: ``|text1 - text2|`` and ``text1 * text2``. Columns that are
        absent from ``feature_df`` count as 0.0.

        Returns:
            A float32 array of shape ``(1, 2 * number_of_suffixes)``.
        """
        row_values = feature_df.iloc[0].to_dict()
        pairs = self.pairwise_column_pairs
        left = np.array([row_values.get(left_col, 0.0) for left_col, _ in pairs], dtype=np.float32)
        right = np.array([row_values.get(right_col, 0.0) for _, right_col in pairs], dtype=np.float32)

        X_pair = np.empty((1, len(pairs) * 2), dtype=np.float32)
        X_pair[0, 0::2] = np.abs(left - right)  # even slots: absolute difference
        X_pair[0, 1::2] = left * right  # odd slots: product
        return X_pair

    def _family_suffix_groups(self) -> dict[str, list[str]]:
        """Partition the feature suffixes into families in a single pass.

        Returns:
            A dict with the keys ``"tfidf"``, ``"char_ngrams"``, ``"pos_ngrams"``
            and ``"scalar"`` (in that order); every suffix lands in exactly one
            list and keeps its relative order.
        """
        groups: dict[str, list[str]] = {"tfidf": [], "char_ngrams": [], "pos_ngrams": [], "scalar": []}
        for suffix in self.suffixes:
            if suffix.startswith("tfidf_"):
                family = "tfidf"
            elif suffix.startswith("char") and "_tfidf_" in suffix:
                family = "char_ngrams"
            elif suffix.startswith("pos") and "_tfidf_" in suffix:
                family = "pos_ngrams"
            else:
                family = "scalar"
            groups[family].append(suffix)
        return groups

    def _build_global_pairwise_vector(self, feature_df: pd.DataFrame) -> np.ndarray:
        """Build the global pairwise block from the first row of ``feature_df``.

        For every non-empty feature family three values are emitted: cosine
        similarity (0.0 when either vector has zero norm), L1 distance and L2
        distance between the text1 and text2 vectors of that family.

        Returns:
            A float32 array of shape ``(1, 3 * number_of_non_empty_families)``.
        """
        row_values = feature_df.iloc[0].to_dict()
        values: list[float] = []

        for family_suffixes in self._family_suffix_groups().values():
            if not family_suffixes:
                continue

            left = np.array([row_values.get(f"text1_{suffix}", 0.0) for suffix in family_suffixes], dtype=np.float32)
            right = np.array([row_values.get(f"text2_{suffix}", 0.0) for suffix in family_suffixes], dtype=np.float32)

            denominator = float(np.linalg.norm(left) * np.linalg.norm(right))
            cosine = float(np.dot(left, right) / denominator) if denominator > 0 else 0.0
            diff = left - right
            l1 = float(np.abs(diff).sum())
            l2 = float(np.linalg.norm(diff))

            values.extend([cosine, l1, l2])

        return np.array(values, dtype=np.float32).reshape(1, -1)

    @staticmethod
    def _matrix_to_frame(matrix: np.ndarray, prefix: str) -> pd.DataFrame:
        """Wrap a dense matrix in a DataFrame with columns ``<prefix>_00000``, ``<prefix>_00001``, ..."""
        columns = [f"{prefix}_{index:05d}" for index in range(matrix.shape[1])]
        return pd.DataFrame(matrix, columns=columns)

    @staticmethod
    def _readability_frame(masked_df: pd.DataFrame) -> pd.DataFrame:
        """Return a one-row frame with four readability scores for each text of the first row."""
        row: dict[str, float] = {}
        for column in ("text1", "text2"):
            text = masked_df.iloc[0][column]
            row[f"{column}_readability_flesch_kincaid_grade"] = round(textstat.flesch_kincaid_grade(text), 5)
            row[f"{column}_readability_gunning_fog"] = round(textstat.gunning_fog(text), 5)
            row[f"{column}_readability_smog"] = round(textstat.smog_index(text), 5)
            row[f"{column}_readability_coleman_liau"] = round(textstat.coleman_liau_index(text), 5)
        return pd.DataFrame([row])

    # predict probability and classification of two given texts (input from the user)
    def predict(self, text1: str, text2: str, threshold: float | None = None) -> PredictionResult:
        """Score whether two texts share an author.

        Args:
            text1: First text.
            text2: Second text.
            threshold: Optional decision threshold; the saved threshold is used
                when omitted.

        Returns:
            A ``PredictionResult`` with the positive-class probability, the
            label (1 when the probability is >= the threshold, else 0), the
            threshold used, and the normalized and masked texts.

        Raises:
            ValueError: If the training config enables neither local nor
                global pairwise features.
        """
        threshold_value = self.threshold if threshold is None else float(threshold)

        pair_df = pd.DataFrame([{
            "text1": normalize_text(text1, config=self.normalization_config),
            "text2": normalize_text(text2, config=self.normalization_config),
            "same": 0,
        }])

        regex_masked_df, _ = regex_mask_split(pair_df)

        # spaCy masking; not using nlp.pipe
        masked_text1, record1 = self._mask_one_text(regex_masked_df.iloc[0]["text1"])
        masked_text2, record2 = self._mask_one_text(regex_masked_df.iloc[0]["text2"])

        # combining regex and spaCy masking
        masked_df = regex_masked_df.copy()
        masked_df.at[0, "text1"] = masked_text1
        masked_df.at[0, "text2"] = masked_text2

        split_cache = {"text1": [record1], "text2": [record2]}  # the linguistic cache

        # Feature frames are collected and joined once at the end; the pairwise
        # builders look columns up by name, so column order does not matter.
        feature_frames: list[pd.DataFrame] = []

        # ======== statistical features ===========
        if self.training_config.include_statistical:
            feature_frames.append(extract_split_statistics(
                masked_df,
                split_cache=split_cache,
                split_name="inference",
                config=self.statistical_config,
            ))

        # ======== TF-IDF features ===========
        if self.training_config.include_tfidf:
            for column in ("text1", "text2"):
                docs = [record_to_tfidf_text(record, config=self.tfidf_config) for record in split_cache[column]]
                tfidf_matrix = self.tfidf_vectorizer.transform(docs).toarray()
                feature_frames.append(self._matrix_to_frame(tfidf_matrix, f"{column}_tfidf"))

        # ======== n-gram features ===========
        for column in ("text1", "text2"):
            if self.training_config.include_char_ngrams:
                char_docs = [
                    " ".join(build_space_free_char_ngrams(text, n=self.ngram_config.char_ngram_n))
                    for text in masked_df[column].tolist()
                ]
                char_matrix = self.char_vectorizer.transform(char_docs).toarray()
                feature_frames.append(
                    self._matrix_to_frame(char_matrix, f"{column}_char{self.ngram_config.char_ngram_n}_tfidf")
                )

            if self.training_config.include_pos_ngrams:
                pos_docs = [" ".join(record_to_pos_sequence(record)) for record in split_cache[column]]
                pos_matrix = self.pos_vectorizer.transform(pos_docs).toarray()
                feature_frames.append(
                    self._matrix_to_frame(pos_matrix, f"{column}_pos{self.ngram_config.pos_ngram_range}_tfidf")
                )

        # ======== readability features ===========
        if self.training_config.include_readability:
            feature_frames.append(self._readability_frame(masked_df))

        if feature_frames:
            feature_df = pd.concat([frame.reset_index(drop=True) for frame in feature_frames], axis=1)
        else:
            feature_df = pd.DataFrame()

        blocks: list[np.ndarray] = []
        if self.training_config.include_local_pairwise:
            blocks.append(self._build_pairwise_vector(feature_df))
        if self.training_config.include_global_pairwise:
            blocks.append(self._build_global_pairwise_vector(feature_df))
        if not blocks:
            raise ValueError("At least one of include_local_pairwise or include_global_pairwise must be True.")

        X = np.hstack(blocks).astype(np.float32)
        probability_same = self._predict_positive_proba(X)
        predicted_label = int(probability_same >= threshold_value)  # 1 if >= threshold, otherwise 0

        return PredictionResult(
            probability_same=probability_same,
            predicted_label=predicted_label,
            threshold=threshold_value,
            normalized_text1=pair_df.iloc[0]["text1"],
            normalized_text2=pair_df.iloc[0]["text2"],
            masked_text1=masked_df.iloc[0]["text1"],
            masked_text2=masked_df.iloc[0]["text2"],
        )
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Flask==3.1.3
2
+ numpy==2.4.4
3
+ pandas==3.0.2
4
+ scipy==1.17.1
5
+ scikit-learn==1.8.0
6
+ xgboost==3.2.0
7
+ spacy==3.8.14
8
+ ftfy==6.3.1
9
+ textstat==0.7.13
10
+ tqdm==4.67.3
saved/masking/spacy_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "verbose": true,
3
+ "use_gpu": false,
4
+ "nlp_model": "en_core_web_lg",
5
+ "nlp_batch_size": 150,
6
+ "nlp_n_process": 2,
7
+ "checkpoint_dir": "/Users/salirafi/Documents/Personal Project/Text Similarity/saved/masking/spacy_checkpoints"
8
+ }
saved/masking/statistical_config.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "verbose": true,
3
+ "include_function_word_rate": true,
4
+ "exclude_placeholders_from_avg_word_length": true,
5
+ "phrase_role_dependency_labels": [
6
+ "acl",
7
+ "advcl",
8
+ "ccomp",
9
+ "pcomp",
10
+ "relcl",
11
+ "xcomp"
12
+ ],
13
+ "pos_roles": {
14
+ "adjective": [
15
+ "ADJ"
16
+ ],
17
+ "adposition": [
18
+ "ADP"
19
+ ],
20
+ "adverb": [
21
+ "ADV"
22
+ ],
23
+ "auxiliary": [
24
+ "AUX"
25
+ ],
26
+ "conjunction": [
27
+ "CONJ"
28
+ ],
29
+ "coordinating_conjunction": [
30
+ "CCONJ"
31
+ ],
32
+ "determiner": [
33
+ "DET"
34
+ ],
35
+ "interjection": [
36
+ "INTJ"
37
+ ],
38
+ "noun": [
39
+ "NOUN"
40
+ ],
41
+ "numeral": [
42
+ "NUM"
43
+ ],
44
+ "particle": [
45
+ "PART"
46
+ ],
47
+ "pronoun": [
48
+ "PRON"
49
+ ],
50
+ "proper_noun": [
51
+ "PROPN"
52
+ ],
53
+ "punctuation": [
54
+ "PUNCT"
55
+ ],
56
+ "subordinating_conjunction": [
57
+ "SCONJ"
58
+ ],
59
+ "symbol": [
60
+ "SYM"
61
+ ],
62
+ "verb": [
63
+ "VERB"
64
+ ],
65
+ "other": [
66
+ "X"
67
+ ],
68
+ "space": [
69
+ "SPACE"
70
+ ]
71
+ },
72
+ "dep_roles": {
73
+ "root": [
74
+ "ROOT"
75
+ ],
76
+ "adjectival_clause": [
77
+ "acl"
78
+ ],
79
+ "adjectival_complement": [
80
+ "acomp"
81
+ ],
82
+ "adverbial_clause": [
83
+ "advcl"
84
+ ],
85
+ "adverbial_modifier": [
86
+ "advmod"
87
+ ],
88
+ "agent": [
89
+ "agent"
90
+ ],
91
+ "adjectival_modifier": [
92
+ "amod"
93
+ ],
94
+ "apposition": [
95
+ "appos"
96
+ ],
97
+ "attribute": [
98
+ "attr"
99
+ ],
100
+ "auxiliary": [
101
+ "aux"
102
+ ],
103
+ "passive_auxiliary": [
104
+ "auxpass"
105
+ ],
106
+ "case_marker": [
107
+ "case"
108
+ ],
109
+ "coordinating_conjunction": [
110
+ "cc"
111
+ ],
112
+ "clausal_complement": [
113
+ "ccomp"
114
+ ],
115
+ "compound": [
116
+ "compound"
117
+ ],
118
+ "conjunct": [
119
+ "conj"
120
+ ],
121
+ "clausal_subject": [
122
+ "csubj"
123
+ ],
124
+ "passive_clausal_subject": [
125
+ "csubjpass"
126
+ ],
127
+ "dative": [
128
+ "dative"
129
+ ],
130
+ "dependency_unspecified": [
131
+ "dep"
132
+ ],
133
+ "determiner": [
134
+ "det"
135
+ ],
136
+ "direct_object": [
137
+ "dobj"
138
+ ],
139
+ "expletive": [
140
+ "expl"
141
+ ],
142
+ "indirect_object": [
143
+ "iobj"
144
+ ],
145
+ "interjection": [
146
+ "intj"
147
+ ],
148
+ "marker": [
149
+ "mark"
150
+ ],
151
+ "meta": [
152
+ "meta"
153
+ ],
154
+ "negation": [
155
+ "neg"
156
+ ],
157
+ "nominal_modifier": [
158
+ "nmod"
159
+ ],
160
+ "noun_phrase_adverbial_modifier": [
161
+ "npadvmod"
162
+ ],
163
+ "nominal_subject": [
164
+ "nsubj"
165
+ ],
166
+ "passive_nominal_subject": [
167
+ "nsubjpass"
168
+ ],
169
+ "numeric_modifier": [
170
+ "nummod"
171
+ ],
172
+ "object": [
173
+ "obj"
174
+ ],
175
+ "object_predicate": [
176
+ "oprd"
177
+ ],
178
+ "parataxis": [
179
+ "parataxis"
180
+ ],
181
+ "prepositional_complement": [
182
+ "pcomp"
183
+ ],
184
+ "object_of_preposition": [
185
+ "pobj"
186
+ ],
187
+ "possessive_modifier": [
188
+ "poss"
189
+ ],
190
+ "preconjunct": [
191
+ "preconj"
192
+ ],
193
+ "predeterminer": [
194
+ "predet"
195
+ ],
196
+ "prepositional_modifier": [
197
+ "prep"
198
+ ],
199
+ "particle": [
200
+ "prt"
201
+ ],
202
+ "punctuation": [
203
+ "punct"
204
+ ],
205
+ "quantifier_modifier": [
206
+ "quantmod"
207
+ ],
208
+ "relative_clause_modifier": [
209
+ "relcl"
210
+ ],
211
+ "open_clausal_complement": [
212
+ "xcomp"
213
+ ]
214
+ }
215
+ }
saved/model/feature_spec.json ADDED
The diff for this file is too large to render. See raw diff
 
saved/model/metrics.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "validation": {
3
+ "threshold": 0.58,
4
+ "accuracy": 0.78197,
5
+ "precision": 0.8444,
6
+ "recall": 0.73342,
7
+ "f1": 0.785,
8
+ "balanced_accuracy": 0.78651,
9
+ "specificity": 0.8396,
10
+ "youden_j": 0.57302,
11
+ "roc_auc": 0.87306,
12
+ "tn": 2858,
13
+ "fp": 546,
14
+ "fn": 1077,
15
+ "tp": 2963
16
+ },
17
+ "test": {
18
+ "threshold": 0.58,
19
+ "accuracy": 0.79001,
20
+ "precision": 0.84661,
21
+ "recall": 0.73118,
22
+ "f1": 0.78467,
23
+ "balanced_accuracy": 0.79288,
24
+ "specificity": 0.85459,
25
+ "youden_j": 0.58576,
26
+ "roc_auc": 0.87719,
27
+ "tn": 3009,
28
+ "fp": 512,
29
+ "fn": 1039,
30
+ "tp": 2826
31
+ }
32
+ }
saved/model/model.json ADDED
The diff for this file is too large to render. See raw diff
 
saved/model/threshold.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "threshold": 0.5799999833106995
3
+ }
saved/model/training_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "include_statistical": true,
3
+ "include_tfidf": true,
4
+ "include_char_ngrams": true,
5
+ "include_pos_ngrams": true,
6
+ "include_readability": true,
7
+ "include_local_pairwise": true,
8
+ "include_global_pairwise": true,
9
+ "threshold_metric": "youden_j",
10
+ "threshold_grid_step": 0.01,
11
+ "model_params": {
12
+ "objective": "binary:logistic",
13
+ "eval_metric": "logloss",
14
+ "n_estimators": 500,
15
+ "max_depth": 4,
16
+ "learning_rate": 0.05,
17
+ "subsample": 0.8,
18
+ "colsample_bytree": 0.3,
19
+ "min_child_weight": 3,
20
+ "reg_lambda": 5.0,
21
+ "reg_alpha": 1.0,
22
+ "random_state": 42,
23
+ "n_jobs": 2,
24
+ "tree_method": "hist"
25
+ }
26
+ }
saved/ngram_features/char_vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ccb3af06aa45f1da5fe4526c3c7963390b24bc4f0499a825543062128569400
3
+ size 1724313
saved/ngram_features/ngram_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "verbose": true,
3
+ "char_ngram_n": 4,
4
+ "char_tfidf_min_df": 2,
5
+ "char_tfidf_max_df": 0.95,
6
+ "char_tfidf_max_features": 50000,
7
+ "pos_ngram_range": [
8
+ 2,
9
+ 3
10
+ ],
11
+ "pos_tfidf_min_df": 2,
12
+ "pos_tfidf_max_df": 0.95,
13
+ "pos_tfidf_max_features": 5000,
14
+ "sublinear_tf": true,
15
+ "norm": "l2",
16
+ "include_readability": true,
17
+ "dense_output": true
18
+ }
saved/ngram_features/pos_vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb2c23ef3aaaa48c6c754a2c1e94eac5b1507282607258f48a81daa9f73bb88b
3
+ size 196682
saved/normalization/normalization_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "verbose": true,
3
+ "unicode_form": "NFC"
4
+ }
saved/tfidf_features/tfidf_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "verbose": true,
3
+ "allowed_pos_tags": [
4
+ "NOUN",
5
+ "PROPN",
6
+ "VERB",
7
+ "ADJ",
8
+ "ADV",
9
+ "CONJ",
10
+ "AUX"
11
+ ],
12
+ "min_token_length": 2,
13
+ "ngram_range": [
14
+ 1,
15
+ 2
16
+ ],
17
+ "min_df": 2,
18
+ "max_df": 0.95,
19
+ "max_features": 25000,
20
+ "sublinear_tf": true,
21
+ "norm": "l2",
22
+ "dense_output": true
23
+ }
saved/tfidf_features/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac5200b840b2db60e359d9818416187fcc8b190ed6b482d629a5e405f694f971
3
+ size 977747
static/app.js ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// DOM handles looked up once at script load and reused by the functions below.
+ const errorEl = document.getElementById("error");
2
// NOTE(review): resultCard is not referenced anywhere else in this script.
+ const resultCard = document.getElementById("result-card");
3
+ const probabilityRingEl = document.getElementById("probability-ring");
4
+ const probabilityEl = document.getElementById("probability");
5
+ const classificationConfidenceEl = document.getElementById("classification-confidence");
6
+ const classificationDecisionEl = document.getElementById("classification-decision");
7
+ const buttonEl = document.getElementById("predict-btn");
8
// Client-side cutoff used by renderProbability() to choose between "Same author" and "Different author".
// NOTE(review): the /predict response also carries a `threshold` field - confirm the two stay in sync.
+ const decisionThreshold = 0.58;
9
+
10
+
11
+ // number formatting
12
/**
 * Format a value with four decimals; null, undefined and non-numeric values become "-".
 */
function formatNumber(value) {
  if (value == null) {
    return "-";
  }
  const numeric = Number(value);
  if (Number.isNaN(numeric)) {
    return "-";
  }
  return numeric.toFixed(4);
}
15
+
16
+
17
/** Put `message` into the error element and make it visible. */
function showError(message) {
  const { classList } = errorEl;
  errorEl.textContent = message;
  classList.remove("hidden"); // remove hidden CSS class
}
21
/** Empty the error element and hide it again. */
function clearError() {
  const { classList } = errorEl;
  errorEl.textContent = "";
  classList.add("hidden");
}
25
+
26
+
27
+ // function metricRating(value) {
28
+ // const score = Number(value);
29
+ // if (Number.isNaN(score)) return "-";
30
+ // if (score >= 0.9) return "Very strong";
31
+ // if (score >= 0.8) return "Good";
32
+ // if (score >= 0.7) return "Fairly good";
33
+ // if (score >= 0.6) return "Moderate";
34
+ // return "Limited";
35
+ // }
36
+
37
+
38
/**
 * Map a probability (clamped to [0, 1]) to an HSL colour string.
 * Hue runs from 8 (low probability, near red/orange) to 134 (high probability, more green).
 */
function probabilityColor(probability) {
  const bounded = Math.min(1, Math.max(0, Number(probability)));
  const hueStart = 8;
  const hueSpan = 126;
  const hue = hueStart + bounded * hueSpan;
  return `hsl(${hue} 72% 46%)`;
}
43
+
44
/**
 * Describe how far a probability sits from the middle of the scale.
 * Bands are checked from the outermost inwards; anything strictly between 0.4 and 0.6 is "Uncertain".
 */
function classificationConfidence(probability) {
  const bands = [
    [0.9, 0.1, "Surely"],
    [0.75, 0.25, "Very Likely"],
    [0.6, 0.4, "Likely"],
  ];
  for (const [upper, lower, label] of bands) {
    if (probability >= upper || probability <= lower) {
      return label;
    }
  }
  return "Uncertain";
}
50
+
51
/**
 * Draw the probability ring and, optionally, the classification text.
 *
 * @param {number} probability Same-author probability; clamped to [0, 1], NaN is drawn as 0.
 * @param {boolean} [showClassification=true] When false, the confidence and decision texts are cleared.
 * @param {number} [threshold=decisionThreshold] Decision cutoff; pass the value returned by the
 *   server so the displayed decision cannot disagree with it. Missing or non-numeric values fall
 *   back to the client-side default.
 */
function renderProbability(probability, showClassification = true, threshold = decisionThreshold) {
  const numeric = Number(probability);
  // NaN would otherwise propagate through Math.min/Math.max and render as "NaN%".
  const clamped = Number.isNaN(numeric) ? 0 : Math.max(0, Math.min(1, numeric));

  const candidate = threshold == null ? NaN : Number(threshold);
  const cutoff = Number.isNaN(candidate) ? decisionThreshold : candidate;
  const isSame = clamped >= cutoff;

  probabilityRingEl.style.setProperty("--ring-angle", `${(clamped * 360).toFixed(2)}deg`);
  probabilityRingEl.style.setProperty("--ring-color", probabilityColor(clamped));
  probabilityEl.textContent = `${(clamped * 100).toFixed(1)}%`;
  classificationDecisionEl.classList.remove("is-same", "is-different");

  if (showClassification) {
    classificationConfidenceEl.textContent = classificationConfidence(clamped);
    classificationDecisionEl.textContent = isSame ? "Same author" : "Different author";
    classificationDecisionEl.classList.add(isSame ? "is-same" : "is-different");
  } else {
    classificationConfidenceEl.textContent = "";
    classificationDecisionEl.textContent = "";
  }
}
73
+
74
/**
 * Fetch /metrics and fill in the metric values and their verbal ratings.
 *
 * The rating helper lives inside this function because no top-level `metricRating` is defined
 * in this script (it only exists as commented-out code), so calling it threw a ReferenceError
 * on every load. Elements that are not present in the page are skipped.
 */
async function loadMetrics() {
  const rateMetric = (value) => {
    const score = Number(value);
    if (Number.isNaN(score)) return "-";
    if (score >= 0.9) return "Very strong";
    if (score >= 0.8) return "Good";
    if (score >= 0.7) return "Fairly good";
    if (score >= 0.6) return "Moderate";
    return "Limited";
  };
  const setText = (id, text) => {
    const element = document.getElementById(id);
    if (element) {
      element.textContent = text;
    }
  };

  try {
    const response = await fetch("/metrics");
    const metrics = await response.json();
    const rows = [
      ["metric-f1", metrics.f1],
      ["metric-youden", metrics.youden_j],
      ["metric-auc", metrics.auc_roc],
    ];
    for (const [id, value] of rows) {
      setText(id, formatNumber(value));
      setText(`${id}-rating`, rateMetric(value));
    }
  } catch (error) {
    console.error("Failed to load metrics", error);
  }
}
87
+
88
+
89
/**
 * Click handler for the Predict button: validate both inputs, POST them to /predict and render
 * the returned probability. Any failure is shown in the error element, and the button is
 * re-enabled whatever the outcome.
 */
async function handlePredict() {
  clearError();

  const text1 = document.getElementById("text1").value.trim();
  const text2 = document.getElementById("text2").value.trim();

  if (!text1 || !text2) {
    showError("Please fill in both text fields.");
    return;
  }

  buttonEl.disabled = true; // disable click
  buttonEl.textContent = "Running...";

  try {
    const response = await fetch("/predict", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ text1, text2 }),
    });

    // An error response may not be JSON at all; treat an unparsable body as a
    // failed request instead of surfacing a raw parse error to the user.
    let result = null;
    try {
      result = await response.json();
    } catch (parseError) {
      result = null;
    }

    if (!response.ok || !result || result.error) {
      throw new Error((result && result.error) || "Request failed.");
    }
    // Hand over the server's threshold so the displayed decision matches its label.
    renderProbability(result.probability, true, result.threshold);
  } catch (error) {
    showError(error.message || "Request failed.");
  } finally {
    buttonEl.disabled = false;
    buttonEl.textContent = "Predict";
  }
}
120
+
121
// Run a prediction whenever the Predict button is clicked.
+ buttonEl.addEventListener("click", handlePredict);
122
+
123
// Initial state: draw the ring at 0 with the classification texts cleared.
+ renderProbability(0, false);
124
+ loadMetrics(); // always show performance metrics
static/cdf.png ADDED
static/image.png ADDED

Git LFS Details

  • SHA256: 4b1e7a0f962e931d962f613727e174c6890fb518e037dd9ac3d1504305541960
  • Pointer size: 131 Bytes
  • Size of remote file: 814 kB
static/styles.css ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ :root {
2
+ color-scheme: light;
3
+ --bg: #f7f7f5;
4
+ --panel: #ffffff;
5
+ --border: #e0dfd9;
6
+ --border-strong: #d4d4d0;
7
+ --text: #1a1a1a;
8
+ --muted: #6b6b6b;
9
+ --muted-soft: #9b9b93;
10
+ --success: #1a7f4b;
11
+ --danger: #b84040;
12
+ }
13
+
14
+ * {
15
+ box-sizing: border-box;
16
+ }
17
+
18
+ /* body {
19
+ margin: 0;
20
+ min-height: 100vh;
21
+ background: #ffffff;
22
+ color: var(--text);
23
+ font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
24
+ } */
25
+ body {
26
+ margin: 0;
27
+ min-height: 100vh;
28
+ background: #f6f3ee;
29
+ color: var(--text);
30
+ font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
31
+ }
32
+
33
+
34
+
35
+ .page {
36
+ width: 100%;
37
+ max-width: 1700px;
38
+ margin: 0 auto;
39
+ padding: 28px 48px 36px;
40
+ }
41
+
42
+ .header-bar {
43
+ width: 100%;
44
+ padding: 10px 30px;
45
+ background:
46
+ linear-gradient(180deg, rgba(255, 253, 250, 0.644) 0%, rgba(246, 243, 238, 0.72) 100%),
47
+ linear-gradient(90deg, #c5c1ba 0%, #cfc4b1 52%, #dfe8e2 100%);
48
+ border-bottom: 1px solid rgba(76, 76, 74, 0.202);
49
+ box-shadow: 0 14px 28px rgba(149, 149, 149, 0.08);
50
+ }
51
+
52
+ .header-inner {
53
+ width: 100%;
54
+ max-width: 1540px;
55
+ margin: 0 auto;
56
+ display: flex;
57
+ align-items: baseline;
58
+ gap: 18px;
59
+ }
60
+
61
/* Two-column layout: flexible main pane plus a fixed 420px side pane. */
.app-shell {
  width: 100%;
  max-width: 1540px;
  margin: 0 auto;
  min-height: auto;
  display: grid;
  /* minmax(0, 1fr) lets the main column shrink with the viewport. A 960px
     floor (960 + 44 gap + 420 = 1424px) forced horizontal scrolling on every
     viewport between the 760px single-column breakpoint and full width. */
  grid-template-columns: minmax(0, 1fr) 420px;
  gap: 44px;
  align-items: start;
}
71
+
72
+ .main-pane,
73
+ .side-pane {
74
+ display: flex;
75
+ flex-direction: column;
76
+ gap: 22px;
77
+ }
78
+ .side-pane {
79
+ padding-top: 0;
80
+ min-height: 100%;
81
+ justify-content: center;
82
+ }
83
+
84
+
85
/* Main heading shown in the header bar. */
.title {
  margin: 0;
  font-size: 39px;
  font-weight: 500;
  /* The generic family must come last: when `serif` is listed first it
     always matches, so the named fonts after it are never tried. */
  font-family: Georgia, "Times New Roman", Times, serif;
  letter-spacing: 0.02em;
  white-space: nowrap;
}
93
+
94
+ .subtitle {
95
+ margin: 0;
96
+ max-width: 62ch;
97
+ font-size: 15px;
98
+ color: var(--muted);
99
+ }
100
+
101
+ .card {
102
+ background: color-mix(in srgb, var(--panel) 88%, #faf6ef 12%);
103
+ border: 1px solid var(--border);
104
+ border-radius: 18px;
105
+ padding: 28px 28px 24px;
106
+ box-shadow: 0 18px 48px rgba(48, 42, 31, 0.06);
107
+ }
108
+ #result-card {
109
+ background: transparent;
110
+ border: none;
111
+ box-shadow: none;
112
+ padding: 8px 0 0;
113
+ }
114
+
115
+
116
+ .bottom-panel {
117
+ width: 100%;
118
+ max-width: 1540px;
119
+ margin: 28px auto 0;
120
+ }
121
+
122
+ .workspace-card {
123
+ padding: 0 0 18px;
124
+ background: transparent;
125
+ border: none;
126
+ box-shadow: none;
127
+ }
128
+
129
+ .section-title {
130
+ margin: 0 0 8px;
131
+ font-size: 16px;
132
+ font-weight: 600;
133
+ letter-spacing: 0.08em;
134
+ text-transform: uppercase;
135
+ color: var(--muted);
136
+ }
137
+
138
+ .card-note {
139
+ margin: -2px 0 10px;
140
+ font-size: 14px;
141
+ color: var(--muted-soft);
142
+ }
143
+
144
+ .stack {
145
+ display: grid;
146
+ grid-template-columns: minmax(0, 1fr) minmax(0, 1fr);
147
+ gap: 18px;
148
+ margin-bottom: 18px;
149
+ }
150
+
151
+ .input-block {
152
+ display: flex;
153
+ flex-direction: column;
154
+ gap: 8px;
155
+ }
156
+
157
+ .label {
158
+ font-size: 13px;
159
+ font-weight: 600;
160
+ letter-spacing: 0.07em;
161
+ text-transform: uppercase;
162
+ color: var(--muted);
163
+ }
164
+
165
+
166
+ textarea {
167
+ width: 100%;
168
+ min-height: 450px;
169
+ padding: 16px 18px;
170
+ font-size: 16px;
171
+ line-height: 1.55;
172
+ font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
173
+ border: 1px solid var(--border-strong);
174
+ border-radius: 14px;
175
+ resize: vertical;
176
+ background: rgba(255, 255, 255, 0.82);
177
+ color: var(--text);
178
+ outline: none;
179
+ transition: border-color 140ms ease, box-shadow 140ms ease, background-color 140ms ease;
180
+ }
181
+
182
+
183
+ textarea:focus {
184
+ border-color: #a48b68;
185
+ background: #fffdfa;
186
+ box-shadow: 0 0 0 4px rgba(164, 139, 104, 0.12);
187
+ }
188
+
189
+ button {
190
+ display: inline-flex;
191
+ align-items: center;
192
+ justify-content: center;
193
+ width: auto;
194
+ min-width: 146px;
195
+ padding: 14px 22px;
196
+ font-size: 16px;
197
+ font-weight: 500;
198
+ letter-spacing: 0.04em;
199
+ background: var(--text);
200
+ color: #ffffff;
201
+ border: none;
202
+ border-radius: 999px;
203
+ cursor: pointer;
204
+ box-shadow: 0 10px 24px rgba(26, 26, 26, 0.18);
205
+ transition: transform 140ms ease, box-shadow 140ms ease, background-color 140ms ease;
206
+ }
207
+ button:hover:not(:disabled) {
208
+ transform: translateY(-1px);
209
+ box-shadow: 0 14px 28px rgba(26, 26, 26, 0.2);
210
+ }
211
+ button:disabled {
212
+ background: var(--muted-soft);
213
+ cursor: not-allowed;
214
+ box-shadow: none;
215
+ }
216
+
217
+ .error-msg {
218
+ margin: 0 0 8px;
219
+ font-size: 13px;
220
+ color: var(--danger);
221
+ }
222
+
223
+
224
+
225
+ /* hide component */
226
+ .hidden {
227
+ display: none;
228
+ }
229
+
230
+
231
+ .result-visual {
232
+ display: flex;
233
+ align-items: center;
234
+ gap: 28px;
235
+ padding: 4px 0 0;
236
+ }
237
+
238
+ .result-meter {
239
+ display: flex;
240
+ flex-direction: column;
241
+ align-items: center;
242
+ gap: 14px;
243
+ flex-shrink: 0;
244
+ }
245
+
246
+ .probability-ring {
247
+ --ring-angle: 0deg;
248
+ --ring-color: hsl(8 72% 46%);
249
+ width: 138px;
250
+ aspect-ratio: 1;
251
+ border-radius: 50%;
252
+ background:
253
+ conic-gradient(var(--ring-color) 0 var(--ring-angle), #e8e8e4a4 var(--ring-angle) 360deg);
254
+ display: grid;
255
+ place-items: center;
256
+ flex-shrink: 0;
257
+ }
258
+
259
+ .probability-ring-inner {
260
+ width: calc(100% - 16px);
261
+ height: calc(100% - 16px);
262
+ border-radius: 50%;
263
+ background: #f6f3ee;
264
+ color: var(--text);
265
+ display: grid;
266
+ place-items: center;
267
+ }
268
+
269
+ .probability-value {
270
+ font-size: 28px;
271
+ font-weight: 600;
272
+ letter-spacing: -0.03em;
273
+ }
274
+
275
+ .result-copy {
276
+ display: flex;
277
+ flex-direction: column;
278
+ gap: 8px;
279
+ max-width: 360px;
280
+ }
281
+ .result-eyebrow {
282
+ margin: 0;
283
+ font-size: 14px;
284
+ font-weight: 600;
285
+ letter-spacing: 0.08em;
286
+ text-transform: uppercase;
287
+ color: var(--muted);
288
+ }
289
+
290
+ .result-description {
291
+ margin: 0;
292
+ font-size: 16px;
293
+ line-height: 1.5;
294
+ color: var(--muted);
295
+ }
296
+
297
+
298
+
299
+ .classification {
300
+ display: flex;
301
+ flex-direction: column;
302
+ align-items: center;
303
+ gap: 6px;
304
+ text-align: center;
305
+ }
306
+ .classification-decision {
307
+ margin: 0;
308
+ font-size: 22px;
309
+ font-weight: 600;
310
+ color: var(--text);
311
+ }
312
+ /* green for same author */
313
+ .classification-decision.is-same {
314
+ color: var(--success);
315
+ }
316
+ /* red for different author */
317
+ .classification-decision.is-different {
318
+ color: var(--danger);
319
+ }
320
+
321
+
322
+
323
+ .sidebar-stats {
324
+ display: grid;
325
+ grid-template-columns: repeat(3, minmax(210px, 250px));
326
+ justify-content: start;
327
+ gap: 10px;
328
+ }
329
+
330
+
331
+ .performance-card {
332
+ background: transparent;
333
+ border: none;
334
+ box-shadow: none;
335
+ padding: 0;
336
+ }
337
+
338
+ .metric-main {
339
+ display: flex;
340
+ flex-direction: column;
341
+ gap: 8px;
342
+ align-items: flex-start;
343
+ }
344
+
345
+ .metric-summary {
346
+ display: flex;
347
+ flex-direction: column;
348
+ gap: 3px;
349
+ }
350
+ .meta-stat {
351
+ display: flex;
352
+ flex-direction: column;
353
+ gap: 2px;
354
+ padding-right: 0;
355
+ }
356
+ .meta-value {
357
+ font-size: 22px;
358
+ letter-spacing: -0.2px;
359
+ }
360
+
361
+ .meta-label {
362
+ font-size: 13px;
363
+ color: var(--muted);
364
+ letter-spacing: 0.03em;
365
+ }
366
+ .metric-rating {
367
+ font-size: 13px;
368
+ font-weight: 600;
369
+ color: #8b7355;
370
+ letter-spacing: 0.02em;
371
+ }
372
+
373
+ .metric-note {
374
+ margin: 0;
375
+ font-size: 11px;
376
+ line-height: 1.45;
377
+ color: var(--muted-soft);
378
+ }
379
+
380
+
381
+
382
+ @media (max-width: 760px) {
383
+ .app-shell {
384
+ grid-template-columns: 1fr;
385
+ gap: 18px;
386
+ }
387
+ .side-pane {
388
+ padding-top: 0;
389
+ }
390
+ .bottom-panel {
391
+ margin-top: 20px;
392
+ }
393
+ .result-visual {
394
+ align-items: flex-start;
395
+ }
396
+
397
+ .stack {
398
+ grid-template-columns: 1fr;
399
+ }
400
+ .sidebar-stats {
401
+ grid-template-columns: 1fr;
402
+ gap: 14px;
403
+ justify-content: stretch;
404
+ }
405
+ .metric-main {
406
+ gap: 6px;
407
+ }
408
+ }
409
+
410
+ @media (max-width: 720px) {
411
+ .page {
412
+ padding: 20px 16px 28px;
413
+ }
414
+ .card {
415
+ padding: 20px 18px;
416
+ }
417
+ .header-bar {
418
+ padding: 12px 30px;
419
+ }
420
+ .header-inner {
421
+ align-items: flex-start;
422
+ flex-direction: column;
423
+ gap: 3px;
424
+ }
425
+ .title {
426
+ font-size: 26px;
427
+ white-space: normal;
428
+ }
429
+ textarea {
430
+ min-height: 220px;
431
+ }
432
+ .result-visual {
433
+ flex-direction: column;
434
+ gap: 16px;
435
+ }
436
+ .probability-ring {
437
+ width: 120px;
438
+ }
439
+ .probability-value {
440
+ font-size: 26px;
441
+ }
442
+
443
+ .metric-main {
444
+ gap: 6px;
445
+ }}
templates/index.html ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!doctype html>
<html lang="en">

<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>AVeri</title>
  <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}">
</head>

<body>

  <header class="header-bar">
    <div class="header-inner">
      <h1 class="title">AVeri: Author Verifier</h1>
      <!-- <p class="subtitle">Are your two texts written by the same author?</p> -->
    </div>
  </header>

  <div class="page">
    <div class="app-shell">
      <main class="main-pane">

        <!-- Input workspace: the two texts to compare and the submit button. -->
        <section class="card workspace-card">
          <div class="stack">
            <div class="input-block">
              <label class="label" for="text1">Text A</label>
              <textarea id="text1" placeholder="Paste first text here..."></textarea>
            </div>

            <div class="input-block">
              <label class="label" for="text2">Text B</label>
              <textarea id="text2" placeholder="Paste second text here..."></textarea>
            </div>
          </div>

          <!-- role="alert" so validation/request errors are announced when shown. -->
          <p id="error" class="error-msg hidden" role="alert"></p>
          <button id="predict-btn" type="button">Predict</button>
        </section>

      </main>

      <aside class="side-pane">

        <section id="result-card" class="card">
          <h2 class="section-title">Result</h2>
          <div class="result-visual">

            <div class="result-meter">
              <!-- Not aria-hidden: the ring contains the numeric probability,
                   which must stay reachable for assistive technology. -->
              <div id="probability-ring" class="probability-ring">
                <div class="probability-ring-inner">
                  <span id="probability" class="probability-value">0.0%</span>
                </div>
              </div>

              <div class="classification">
                <p id="classification-confidence" class="result-eyebrow"></p>
                <p id="classification-decision" class="classification-decision"></p>
              </div>
            </div>

            <div class="result-copy">
              <p class="result-eyebrow">Probability</p>
              <p class="result-description">Probability both texts were written by the same author.</p>
            </div>

          </div>
        </section>

      </aside>
    </div>

    <section class="bottom-panel">
      <section class="card performance-card">

        <h2 class="section-title">Model Performance</h2>

        <div class="sidebar-stats">

          <div class="meta-stat">
            <div class="metric-main">
              <div class="metric-summary">
                <span id="metric-f1" class="meta-value">-</span>
                <span class="meta-label">F1 Score</span>
                <!-- app.js writes the rating text into this span; it stays in
                     the DOM but hidden so the lookup by id does not return null. -->
                <span id="metric-f1-rating" class="metric-rating" hidden>-</span>
              </div>
              <p class="metric-note">The model is good at prediction without raising too many false alarms.</p>
            </div>
          </div>

          <div class="meta-stat">
            <div class="metric-main">
              <div class="metric-summary">
                <span id="metric-youden" class="meta-value">-</span>
                <span class="meta-label">Youden J</span>
                <!-- Hidden target for the rating text written by app.js. -->
                <span id="metric-youden-rating" class="metric-rating" hidden>-</span>
              </div>
              <p class="metric-note">The model has moderate ability to correctly separate same- and different-author cases.</p>
            </div>
          </div>

          <div class="meta-stat">
            <div class="metric-main">
              <div class="metric-summary">
                <span id="metric-auc" class="meta-value">-</span>
                <span class="meta-label">AUC-ROC</span>
                <!-- Hidden target for the rating text written by app.js. -->
                <span id="metric-auc-rating" class="metric-rating" hidden>-</span>
              </div>
              <p class="metric-note">The model is strongly reliable at distinguishing between same- and different-author cases overall.</p>
            </div>
          </div>

        </div>

      </section>
    </section>

  </div>

  <script src="{{ url_for('static', filename='app.js') }}"></script>
</body>
</html>