Spaces:

salirafi
/

AVeri

Sleeping

App Files Files Community

salirafi committed on Apr 21

Commit

ad19081

verified ·

1 Parent(s): 337a134

Upload 21 files

Browse files

Files changed (22) hide show

.gitattributes +1 -0
app.py +83 -0
inference.py +293 -0
requirements.txt +10 -0
saved/masking/spacy_config.json +8 -0
saved/masking/statistical_config.json +215 -0
saved/model/feature_spec.json +0 -0
saved/model/metrics.json +32 -0
saved/model/model.json +0 -0
saved/model/threshold.json +3 -0
saved/model/training_config.json +26 -0
saved/ngram_features/char_vectorizer.pkl +3 -0
saved/ngram_features/ngram_config.json +18 -0
saved/ngram_features/pos_vectorizer.pkl +3 -0
saved/normalization/normalization_config.json +4 -0
saved/tfidf_features/tfidf_config.json +23 -0
saved/tfidf_features/vectorizer.pkl +3 -0
static/app.js +124 -0
static/cdf.png +0 -0
static/image.png +3 -0
static/styles.css +445 -0
templates/index.html +122 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+static/image.png filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,83 @@

+#!/usr/bin/env python3
+import sys
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+from flask import Flask, jsonify, render_template, request
+BASE_DIR = Path(__file__).resolve().parent
+SRC_DIR = BASE_DIR / "src"
+SAVED_DIR = BASE_DIR / "saved"
+if str(SRC_DIR) not in sys.path:
+    sys.path.insert(0, str(SRC_DIR))
+from inference import Inference
+from helpers import load_json
+app = Flask(__name__)
+def _compute_model_metrics(metrics_payload: dict[str, Any]) -> dict[str, float]:
+    test_metrics = metrics_payload.get("test") or {}
+    tn = float(test_metrics.get("tn", 0.0))
+    fp = float(test_metrics.get("fp", 0.0))
+    tp = float(test_metrics.get("tp", 0.0))
+    fn = float(test_metrics.get("fn", 0.0))
+    specificity = tn / (tn + fp) if (tn + fp) else 0.0
+    sensitivity = tp / (tp + fn) if (tp + fn) else float(test_metrics.get("recall", 0.0))
+    youden_j = sensitivity + specificity - 1.0
+    return {
+        "f1": float(test_metrics.get("f1", 0.0)),
+        "youden_j": round(youden_j, 5),
+        "auc_roc": float(test_metrics.get("roc_auc", 0.0)),
+    }
+# Cached: the metrics file is read and reduced on the first call only, then the same dict is reused.
+@lru_cache(maxsize=1)
+def get_metrics() -> dict[str, float]:
+    # Loads model/metrics.json from SAVED_DIR and reduces it to the summary metrics.
+    return _compute_model_metrics(load_json(SAVED_DIR / "model" / "metrics.json"))
+# Cached: the Inference pipeline is constructed on the first call and the same instance is reused afterwards.
+@lru_cache(maxsize=1)
+def get_service() -> Inference:
+    return Inference(project_root=BASE_DIR)
+def predict(text1: str, text2: str) -> dict[str, Any]:
+    # Runs the cached Inference service on the text pair and returns its result as a plain dict.
+    return get_service().predict(text1, text2).to_dict()
+# Landing page: renders the index.html template.
+@app.route("/", methods=["GET"])
+def home_route():
+    return render_template("index.html")
+@app.route("/predict", methods=["POST"])
+def predict_route():
+    data = request.get_json(force=True)
+    text1 = (data.get("text1") or "").strip()
+    text2 = (data.get("text2") or "").strip()
+    if not text1 or not text2: return jsonify({"error": "Both text fields are required."}), 400
+    try: result = predict(text1, text2)
+    except Exception as exc:
+        return jsonify({"error": f"Inference failed: {exc}"}), 500
+    return jsonify(result)
+# Serves the summary metrics as JSON; any failure while loading them is reported as a 500 JSON error.
+@app.route("/metrics", methods=["GET"])
+def metrics_route():
+    try:
+        return jsonify(get_metrics())
+    except Exception as exc:
+        return jsonify({"error": f"Failed to load metrics: {exc}"}), 500
+# Lightweight endpoint for a cron job to ping (presumably to keep the Space from sleeping; confirm).
+# Does no work beyond returning a fixed payload.
+@app.route("/ping")
+def ping():
+    return {"status": "ok"}, 200
+if __name__ == "__main__":
+    app.run(debug=True, port=5000)

inference.py ADDED Viewed

	@@ -0,0 +1,293 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+import numpy as np
+import pandas as pd
+from textstat import textstat
+from xgboost import XGBClassifier
+from helpers import load_json, load_pickle
+from masking_regex import mask_split as regex_mask_split
+from masking_spacy import Config as SpacyMaskingConfig, _apply_ner_mask, _build_linguistic_record, load_nlp_model
+from normalization import normalize_text, Config as NormalizationConfig
+from features_statistical import extract_split_statistics, Config as StatisticalConfig
+from features_tfidf import record_to_tfidf_text, Config as TFIDFConfig
+from features_ngram import Config as NGramConfig, build_space_free_char_ngrams, record_to_pos_sequence
+from model_training import Config as TrainingConfig
+def _coerce_tfidf_config(payload: dict[str, Any]) -> TFIDFConfig:
+    payload = dict(payload)
+    if isinstance(payload.get("ngram_range"), list):
+        payload["ngram_range"] = tuple(payload["ngram_range"])
+    return TFIDFConfig(**payload)
+def _coerce_ngram_config(payload: dict[str, Any]) -> NGramConfig:
+    payload = dict(payload)
+    if isinstance(payload.get("pos_ngram_range"), list):
+        payload["pos_ngram_range"] = tuple(payload["pos_ngram_range"])
+    return NGramConfig(**payload)
+def _coerce_statistical_config(payload: dict[str, Any]) -> StatisticalConfig:
+    payload = dict(payload)
+    if isinstance(payload.get("phrase_role_dependency_labels"), list):
+        payload["phrase_role_dependency_labels"] = tuple(payload["phrase_role_dependency_labels"])
+    return StatisticalConfig(**payload)
+def _coerce_training_config(payload: dict[str, Any]) -> TrainingConfig:
+    payload = dict(payload)
+    return TrainingConfig(**payload)
+@dataclass(slots=True)
+class PredictionResult:
+    probability_same: float
+    predicted_label: int
+    threshold: float
+    normalized_text1: str
+    normalized_text2: str
+    masked_text1: str
+    masked_text2: str
+    def to_dict(self) -> dict[str, Any]:
+        label = "Same author" if self.predicted_label == 1 else "Different author"
+        return {
+            "label": label,
+            "probability": self.probability_same,
+            "threshold": self.threshold,
+            "normalized_text1": self.normalized_text1,
+            "normalized_text2": self.normalized_text2,
+            "masked_text1": self.masked_text1,
+            "masked_text2": self.masked_text2,
+        }
+# STAND-ALONE PIPELINE TO PERFORM INFERENCE USING THE TRAINED MODEL
+class Inference:
+    def __init__(self, project_root: str | Path | None = None) -> None:
+        # Loads every saved config, the three pickled vectorizers, the decision threshold,
+        # the feature spec and the metrics from <project_root>/saved.
+        # The classifier (self.model) and the spaCy pipeline (self.nlp) start as None and
+        # are loaded on first use by _load_model / _mask_one_text.
+        # Without an explicit project_root, the directory one level above this file's folder is used.
+        self.project_root = Path(project_root) if project_root is not None else Path(__file__).resolve().parents[1]
+        self.saved_dir = self.project_root / "saved"
+        self.model_dir = self.saved_dir / "model"
+        # =============================
+        # the pipeline follows what is done in src/pipeline.py but adapted to do inference instead of training
+        # =============================
+        self.normalization_config = NormalizationConfig(**load_json(self.saved_dir / "normalization" / "normalization_config.json"))
+        # The saved spaCy settings are overridden for serving: verbose off, single process.
+        spacy_payload = load_json(self.saved_dir / "masking" / "spacy_config.json")
+        spacy_payload["verbose"] = False
+        spacy_payload["nlp_n_process"] = 1
+        self.spacy_config = SpacyMaskingConfig(**spacy_payload)
+        # The remaining feature configs are loaded as saved, with only "verbose" forced off.
+        statistical_payload = load_json(self.saved_dir / "masking" / "statistical_config.json")
+        statistical_payload["verbose"] = False
+        self.statistical_config = _coerce_statistical_config(statistical_payload)
+        tfidf_payload = load_json(self.saved_dir / "tfidf_features" / "tfidf_config.json")
+        tfidf_payload["verbose"] = False
+        self.tfidf_config = _coerce_tfidf_config(tfidf_payload)
+        ngram_payload = load_json(self.saved_dir / "ngram_features" / "ngram_config.json")
+        ngram_payload["verbose"] = False
+        self.ngram_config = _coerce_ngram_config(ngram_payload)
+        training_payload = load_json(self.saved_dir / "model" / "training_config.json")
+        self.training_config = _coerce_training_config(training_payload)
+        # NOTE(review): these are pickles; only load files that ship with the app, never user-supplied ones.
+        self.tfidf_vectorizer = load_pickle(self.saved_dir / "tfidf_features" / "vectorizer.pkl")
+        self.char_vectorizer = load_pickle(self.saved_dir / "ngram_features" / "char_vectorizer.pkl")
+        self.pos_vectorizer = load_pickle(self.saved_dir / "ngram_features" / "pos_vectorizer.pkl")
+        self.model = None
+        self.threshold = float(load_json(self.model_dir / "threshold.json")["threshold"])
+        feature_spec = load_json(self.model_dir / "feature_spec.json")
+        self.suffixes: list[str] = feature_spec["suffixes"]
+        # self.pairwise_operations: tuple[str, ...] = tuple(feature_spec["pairwise_operations"])
+        # One (text1_<suffix>, text2_<suffix>) column pair per suffix, in feature-spec order.
+        self.pairwise_column_pairs = [(f"text1_{suffix}", f"text2_{suffix}") for suffix in self.suffixes]
+        self.metrics = load_json(self.model_dir / "metrics.json")
+        self.nlp = None
+    def _load_model(self) -> XGBClassifier:
+        if self.model is None:
+            model_path = self.model_dir / "model.json"
+            if not model_path.exists():
+                raise FileNotFoundError(f"Missing '{model_path}'")
+            model = XGBClassifier()
+            model.load_model(model_path)
+            self.model = model
+        return self.model
+    def _predict_positive_proba(self, X: np.ndarray) -> float:
+        # Returns predict_proba's entry at row 0, column 1 as a Python float; predict()
+        # uses it as the "same author" probability. Only the first row of X is read.
+        model = self._load_model()
+        return float(model.predict_proba(X)[0, 1])
+    def _mask_one_text(self, text: str) -> tuple[str, dict[str, Any]]:
+        # Processes one text with the NLP model and returns (NER-masked text, linguistic record).
+        # The NLP model is loaded on the first call and reused afterwards.
+        if self.nlp is None:
+            self.nlp = load_nlp_model(config=self.spacy_config)
+        doc = self.nlp(text)
+        masked_text, _ = _apply_ner_mask(text, doc)  # second return value is not needed here
+        record = _build_linguistic_record(doc)
+        return masked_text, record
+    def _build_pairwise_vector(self, feature_df: pd.DataFrame) -> np.ndarray:
+        row_values = feature_df.iloc[0].to_dict()
+        width = len(self.pairwise_column_pairs) * 2 # two pairwise operations: abs. diff & dot product
+        X_pair = np.empty((1, width), dtype=np.float32)
+        column_index = 0
+        for left_col, right_col in self.pairwise_column_pairs:
+            left = np.float32(row_values.get(left_col, 0.0))
+            right = np.float32(row_values.get(right_col, 0.0))
+            diff = left - right
+            X_pair[0, column_index] = abs(diff)
+            X_pair[0, column_index + 1] = left * right
+            column_index += 2
+        return X_pair
+    def _family_suffix_groups(self) -> dict[str, list[str]]:
+        return {
+            "tfidf": [s for s in self.suffixes if s.startswith("tfidf_")],
+            "char_ngrams": [s for s in self.suffixes if s.startswith("char") and "_tfidf_" in s],
+            "pos_ngrams": [s for s in self.suffixes if s.startswith("pos") and "_tfidf_" in s],
+            "scalar": [s for s in self.suffixes if not (
+                s.startswith("tfidf_")
+                or (s.startswith("char") and "_tfidf_" in s)
+                or (s.startswith("pos") and "_tfidf_" in s)
+            )],
+        }
+    def _build_global_pairwise_vector(self, feature_df: pd.DataFrame) -> np.ndarray:
+        row_values = feature_df.iloc[0].to_dict()
+        values: list[float] = []
+        for family_suffixes in self._family_suffix_groups().values():
+            if not family_suffixes:
+                continue
+            left = np.array([row_values.get(f"text1_{suffix}", 0.0) for suffix in family_suffixes], dtype=np.float32)
+            right = np.array([row_values.get(f"text2_{suffix}", 0.0) for suffix in family_suffixes], dtype=np.float32)
+            denominator = float(np.linalg.norm(left) * np.linalg.norm(right))
+            cosine = float(np.dot(left, right) / denominator) if denominator > 0 else 0.0
+            diff = left - right
+            l1 = float(np.abs(diff).sum())
+            l2 = float(np.linalg.norm(diff))
+            values.extend([cosine, l1, l2])
+        return np.array(values, dtype=np.float32).reshape(1, -1)
+    # Predict the same-author probability and the classification for two user-supplied texts.
+    # Steps: normalize -> regex masking -> spaCy NER masking -> per-text features (each family
+    # gated by training_config) -> pairwise blocks -> classifier probability -> thresholded label.
+    # `threshold` overrides the saved threshold when given. Returns a PredictionResult carrying
+    # the probability, the 0/1 label, the threshold used, and the normalized and masked texts.
+    def predict(self, text1: str, text2: str, threshold: float | None = None) -> PredictionResult:
+        threshold_value = self.threshold if threshold is None else float(threshold)
+        pair_df = pd.DataFrame([{
+                    "text1": normalize_text(text1, config=self.normalization_config),
+                    "text2": normalize_text(text2, config=self.normalization_config),
+                    "same": 0,  # placeholder label; presumably required by the masking helpers' expected columns (confirm)
+                }])
+        regex_masked_df, _ = regex_mask_split(pair_df)
+        # spaCy masking; not using nlp.pipe
+        masked_text1, record1 = self._mask_one_text(regex_masked_df.iloc[0]["text1"])
+        masked_text2, record2 = self._mask_one_text(regex_masked_df.iloc[0]["text2"])
+        masked_df = regex_masked_df.copy()
+        masked_df.at[0, "text1"] = masked_text1 # combining regex and spaCy masking
+        masked_df.at[0, "text2"] = masked_text2 # ...
+        split_cache = {"text1": [record1], "text2": [record2]} # the linguistic cache
+        feature_df = pd.DataFrame() # initialize empty dataframe for the features
+        # ======== statistical features ===========
+        if self.training_config.include_statistical:
+            feature_df = extract_split_statistics(
+                masked_df,
+                split_cache=split_cache,
+                split_name="inference",
+                config=self.statistical_config,
+            )
+        # ======== TF-IDF features ===========
+        # Columns are named <column>_tfidf_<5-digit index> and appended side by side.
+        if self.training_config.include_tfidf:
+            for column in ("text1", "text2"):
+                docs = [record_to_tfidf_text(record, config=self.tfidf_config) for record in split_cache[column]]
+                tfidf_matrix = self.tfidf_vectorizer.transform(docs).toarray()
+                tfidf_cols = [f"{column}_tfidf_{index:05d}" for index in range(tfidf_matrix.shape[1])]
+                tfidf_df = pd.DataFrame(tfidf_matrix, columns=tfidf_cols)
+                feature_df = pd.concat([feature_df.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)
+        # ======== n-gram features ===========
+        for column in ("text1", "text2"):
+            if self.training_config.include_char_ngrams:
+                char_docs = [
+                    " ".join(build_space_free_char_ngrams(text, n=self.ngram_config.char_ngram_n))
+                    for text in masked_df[column].tolist()]
+                char_matrix = self.char_vectorizer.transform(char_docs).toarray()
+                char_cols = [
+                    f"{column}_char{self.ngram_config.char_ngram_n}_tfidf_{index:05d}"
+                    for index in range(char_matrix.shape[1])]
+                char_df = pd.DataFrame(char_matrix, columns=char_cols)
+                feature_df = pd.concat([feature_df.reset_index(drop=True), char_df.reset_index(drop=True)], axis=1)
+            if self.training_config.include_pos_ngrams:
+                pos_docs = [" ".join(record_to_pos_sequence(record)) for record in split_cache[column]]
+                pos_matrix = self.pos_vectorizer.transform(pos_docs).toarray()
+                # NOTE(review): pos_ngram_range is interpolated as-is (a tuple renders like "(2, 3)");
+                # these names must match the suffixes in feature_spec.json, so verify against training.
+                pos_cols = [
+                    f"{column}_pos{self.ngram_config.pos_ngram_range}_tfidf_{index:05d}"
+                    for index in range(pos_matrix.shape[1])]
+                pos_df = pd.DataFrame(pos_matrix, columns=pos_cols)
+                feature_df = pd.concat([feature_df.reset_index(drop=True), pos_df.reset_index(drop=True)], axis=1)
+            continue  # no-op: already the last statement of the loop body
+        # ======== readability features ===========
+        # Computed with textstat on the masked texts, rounded to 5 decimals.
+        if self.training_config.include_readability:
+            readability_df = pd.DataFrame([{
+                        "text1_readability_flesch_kincaid_grade": round(textstat.flesch_kincaid_grade(masked_df.iloc[0]["text1"]), 5),
+                        "text1_readability_gunning_fog": round(textstat.gunning_fog(masked_df.iloc[0]["text1"]), 5),
+                        "text1_readability_smog": round(textstat.smog_index(masked_df.iloc[0]["text1"]), 5),
+                        "text1_readability_coleman_liau": round(textstat.coleman_liau_index(masked_df.iloc[0]["text1"]), 5),
+                        "text2_readability_flesch_kincaid_grade": round(textstat.flesch_kincaid_grade(masked_df.iloc[0]["text2"]), 5),
+                        "text2_readability_gunning_fog": round(textstat.gunning_fog(masked_df.iloc[0]["text2"]), 5),
+                        "text2_readability_smog": round(textstat.smog_index(masked_df.iloc[0]["text2"]), 5),
+                        "text2_readability_coleman_liau": round(textstat.coleman_liau_index(masked_df.iloc[0]["text2"]), 5)
+                    }])
+            feature_df = pd.concat([feature_df.reset_index(drop=True), readability_df.reset_index(drop=True)], axis=1)
+        # Model input: local block first, then global block, concatenated column-wise.
+        blocks: list[np.ndarray] = []
+        if self.training_config.include_local_pairwise:
+            blocks.append(self._build_pairwise_vector(feature_df)) # optimized
+        if self.training_config.include_global_pairwise:
+            blocks.append(self._build_global_pairwise_vector(feature_df))
+        if not blocks:
+            raise ValueError("At least one of include_local_pairwise or include_global_pairwise must be True.")
+        X = np.hstack(blocks).astype(np.float32)
+        probability_same = self._predict_positive_proba(X)
+        predicted_label = int(probability_same >= threshold_value) # 1 if >= threshold, otherwise 0
+        return PredictionResult(
+            probability_same=probability_same,
+            predicted_label=predicted_label,
+            threshold=threshold_value,
+            normalized_text1=pair_df.iloc[0]["text1"],
+            normalized_text2=pair_df.iloc[0]["text2"],
+            masked_text1=masked_df.iloc[0]["text1"],
+            masked_text2=masked_df.iloc[0]["text2"],
+        )

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+Flask==3.1.3
+numpy==2.4.4
+pandas==3.0.2
+scipy==1.17.1
+scikit-learn==1.8.0
+xgboost==3.2.0
+spacy==3.8.14
+ftfy==6.3.1
+textstat==0.7.13
+tqdm==4.67.3

saved/masking/spacy_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "verbose": true,
+  "use_gpu": false,
+  "nlp_model": "en_core_web_lg",
+  "nlp_batch_size": 150,
+  "nlp_n_process": 2,
+  "checkpoint_dir": "/Users/salirafi/Documents/Personal Project/Text Similarity/saved/masking/spacy_checkpoints"
+}

saved/masking/statistical_config.json ADDED Viewed

	@@ -0,0 +1,215 @@

+{
+  "verbose": true,
+  "include_function_word_rate": true,
+  "exclude_placeholders_from_avg_word_length": true,
+  "phrase_role_dependency_labels": [
+    "acl",
+    "advcl",
+    "ccomp",
+    "pcomp",
+    "relcl",
+    "xcomp"
+  ],
+  "pos_roles": {
+    "adjective": [
+      "ADJ"
+    ],
+    "adposition": [
+      "ADP"
+    ],
+    "adverb": [
+      "ADV"
+    ],
+    "auxiliary": [
+      "AUX"
+    ],
+    "conjunction": [
+      "CONJ"
+    ],
+    "coordinating_conjunction": [
+      "CCONJ"
+    ],
+    "determiner": [
+      "DET"
+    ],
+    "interjection": [
+      "INTJ"
+    ],
+    "noun": [
+      "NOUN"
+    ],
+    "numeral": [
+      "NUM"
+    ],
+    "particle": [
+      "PART"
+    ],
+    "pronoun": [
+      "PRON"
+    ],
+    "proper_noun": [
+      "PROPN"
+    ],
+    "punctuation": [
+      "PUNCT"
+    ],
+    "subordinating_conjunction": [
+      "SCONJ"
+    ],
+    "symbol": [
+      "SYM"
+    ],
+    "verb": [
+      "VERB"
+    ],
+    "other": [
+      "X"
+    ],
+    "space": [
+      "SPACE"
+    ]
+  },
+  "dep_roles": {
+    "root": [
+      "ROOT"
+    ],
+    "adjectival_clause": [
+      "acl"
+    ],
+    "adjectival_complement": [
+      "acomp"
+    ],
+    "adverbial_clause": [
+      "advcl"
+    ],
+    "adverbial_modifier": [
+      "advmod"
+    ],
+    "agent": [
+      "agent"
+    ],
+    "adjectival_modifier": [
+      "amod"
+    ],
+    "apposition": [
+      "appos"
+    ],
+    "attribute": [
+      "attr"
+    ],
+    "auxiliary": [
+      "aux"
+    ],
+    "passive_auxiliary": [
+      "auxpass"
+    ],
+    "case_marker": [
+      "case"
+    ],
+    "coordinating_conjunction": [
+      "cc"
+    ],
+    "clausal_complement": [
+      "ccomp"
+    ],
+    "compound": [
+      "compound"
+    ],
+    "conjunct": [
+      "conj"
+    ],
+    "clausal_subject": [
+      "csubj"
+    ],
+    "passive_clausal_subject": [
+      "csubjpass"
+    ],
+    "dative": [
+      "dative"
+    ],
+    "dependency_unspecified": [
+      "dep"
+    ],
+    "determiner": [
+      "det"
+    ],
+    "direct_object": [
+      "dobj"
+    ],
+    "expletive": [
+      "expl"
+    ],
+    "indirect_object": [
+      "iobj"
+    ],
+    "interjection": [
+      "intj"
+    ],
+    "marker": [
+      "mark"
+    ],
+    "meta": [
+      "meta"
+    ],
+    "negation": [
+      "neg"
+    ],
+    "nominal_modifier": [
+      "nmod"
+    ],
+    "noun_phrase_adverbial_modifier": [
+      "npadvmod"
+    ],
+    "nominal_subject": [
+      "nsubj"
+    ],
+    "passive_nominal_subject": [
+      "nsubjpass"
+    ],
+    "numeric_modifier": [
+      "nummod"
+    ],
+    "object": [
+      "obj"
+    ],
+    "object_predicate": [
+      "oprd"
+    ],
+    "parataxis": [
+      "parataxis"
+    ],
+    "prepositional_complement": [
+      "pcomp"
+    ],
+    "object_of_preposition": [
+      "pobj"
+    ],
+    "possessive_modifier": [
+      "poss"
+    ],
+    "preconjunct": [
+      "preconj"
+    ],
+    "predeterminer": [
+      "predet"
+    ],
+    "prepositional_modifier": [
+      "prep"
+    ],
+    "particle": [
+      "prt"
+    ],
+    "punctuation": [
+      "punct"
+    ],
+    "quantifier_modifier": [
+      "quantmod"
+    ],
+    "relative_clause_modifier": [
+      "relcl"
+    ],
+    "open_clausal_complement": [
+      "xcomp"
+    ]
+  }
+}

saved/model/feature_spec.json ADDED Viewed

The diff for this file is too large to render. See raw diff

saved/model/metrics.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "validation": {
+    "threshold": 0.58,
+    "accuracy": 0.78197,
+    "precision": 0.8444,
+    "recall": 0.73342,
+    "f1": 0.785,
+    "balanced_accuracy": 0.78651,
+    "specificity": 0.8396,
+    "youden_j": 0.57302,
+    "roc_auc": 0.87306,
+    "tn": 2858,
+    "fp": 546,
+    "fn": 1077,
+    "tp": 2963
+  },
+  "test": {
+    "threshold": 0.58,
+    "accuracy": 0.79001,
+    "precision": 0.84661,
+    "recall": 0.73118,
+    "f1": 0.78467,
+    "balanced_accuracy": 0.79288,
+    "specificity": 0.85459,
+    "youden_j": 0.58576,
+    "roc_auc": 0.87719,
+    "tn": 3009,
+    "fp": 512,
+    "fn": 1039,
+    "tp": 2826
+  }
+}

saved/model/model.json ADDED Viewed

The diff for this file is too large to render. See raw diff

saved/model/threshold.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "threshold": 0.5799999833106995
+}

saved/model/training_config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "include_statistical": true,
+  "include_tfidf": true,
+  "include_char_ngrams": true,
+  "include_pos_ngrams": true,
+  "include_readability": true,
+  "include_local_pairwise": true,
+  "include_global_pairwise": true,
+  "threshold_metric": "youden_j",
+  "threshold_grid_step": 0.01,
+  "model_params": {
+    "objective": "binary:logistic",
+    "eval_metric": "logloss",
+    "n_estimators": 500,
+    "max_depth": 4,
+    "learning_rate": 0.05,
+    "subsample": 0.8,
+    "colsample_bytree": 0.3,
+    "min_child_weight": 3,
+    "reg_lambda": 5.0,
+    "reg_alpha": 1.0,
+    "random_state": 42,
+    "n_jobs": 2,
+    "tree_method": "hist"
+  }
+}

saved/ngram_features/char_vectorizer.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6ccb3af06aa45f1da5fe4526c3c7963390b24bc4f0499a825543062128569400
+size 1724313

saved/ngram_features/ngram_config.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "verbose": true,
+  "char_ngram_n": 4,
+  "char_tfidf_min_df": 2,
+  "char_tfidf_max_df": 0.95,
+  "char_tfidf_max_features": 50000,
+  "pos_ngram_range": [
+    2,
+    3
+  ],
+  "pos_tfidf_min_df": 2,
+  "pos_tfidf_max_df": 0.95,
+  "pos_tfidf_max_features": 5000,
+  "sublinear_tf": true,
+  "norm": "l2",
+  "include_readability": true,
+  "dense_output": true
+}

saved/ngram_features/pos_vectorizer.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cb2c23ef3aaaa48c6c754a2c1e94eac5b1507282607258f48a81daa9f73bb88b
+size 196682

saved/normalization/normalization_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "verbose": true,
+  "unicode_form": "NFC"
+}

saved/tfidf_features/tfidf_config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "verbose": true,
+  "allowed_pos_tags": [
+    "NOUN",
+    "PROPN",
+    "VERB",
+    "ADJ",
+    "ADV",
+    "CONJ",
+    "AUX"
+  ],
+  "min_token_length": 2,
+  "ngram_range": [
+    1,
+    2
+  ],
+  "min_df": 2,
+  "max_df": 0.95,
+  "max_features": 25000,
+  "sublinear_tf": true,
+  "norm": "l2",
+  "dense_output": true
+}

saved/tfidf_features/vectorizer.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac5200b840b2db60e359d9818416187fcc8b190ed6b482d629a5e405f694f971
+size 977747

static/app.js ADDED Viewed

	@@ -0,0 +1,124 @@

+const errorEl = document.getElementById("error");
+const resultCard = document.getElementById("result-card");
+const probabilityRingEl = document.getElementById("probability-ring");
+const probabilityEl = document.getElementById("probability");
+const classificationConfidenceEl = document.getElementById("classification-confidence");
+const classificationDecisionEl = document.getElementById("classification-decision");
+const buttonEl = document.getElementById("predict-btn");
+const decisionThreshold = 0.58;
+// number formatting
+function formatNumber(value) {
+  return value == null || Number.isNaN(Number(value)) ? "-" : Number(value).toFixed(4);
+}
+// Put `message` into the error element and make that element visible.
+function showError(message) {
+  errorEl.textContent = message;
+  errorEl.classList.remove("hidden"); // remove hidden CSS class
+}
+// Empty the error element and hide it again.
+function clearError() { // clear the error
+  errorEl.textContent = "";
+  errorEl.classList.add("hidden");
+}
+// function metricRating(value) {
+//   const score = Number(value);
+//   if (Number.isNaN(score)) return "-";
+//   if (score >= 0.9) return "Very strong";
+//   if (score >= 0.8) return "Good";
+//   if (score >= 0.7) return "Fairly good";
+//   if (score >= 0.6) return "Moderate";
+//   return "Limited";
+// }
+function probabilityColor(probability) {
+  const clamped = Math.max(0, Math.min(1, Number(probability)));
+  const hue = 8 + clamped * 126; // low prob is near red/orange; high prob more green
+  return `hsl(${hue} 72% 46%)`;
+}
+function classificationConfidence(probability) {
+  if (probability >= 0.9 || probability <= 0.1) return "Surely";
+  if (probability >= 0.75 || probability <= 0.25) return "Very Likely";
+  if (probability >= 0.6 || probability <= 0.4) return "Likely";
+  return "Uncertain";
+}
+function renderProbability(probability, showClassification = true) {
+  const clamped = Math.max(0, Math.min(1, Number(probability)));
+  const angle = `${(clamped * 360).toFixed(2)}deg`;
+  const color = probabilityColor(clamped);
+  const decision = clamped >= decisionThreshold ? "Same author" : "Different author";
+  const decisionClass = clamped >= decisionThreshold ? "is-same" : "is-different";
+  probabilityRingEl.style.setProperty("--ring-angle", angle);
+  probabilityRingEl.style.setProperty("--ring-color", color);
+  probabilityEl.textContent = `${(clamped * 100).toFixed(1)}%`;
+  classificationDecisionEl.classList.remove("is-same", "is-different");
+  if (showClassification) {
+    classificationConfidenceEl.textContent = classificationConfidence(clamped);
+    classificationDecisionEl.textContent = decision;
+    classificationDecisionEl.classList.add(decisionClass);
+  } else {
+    classificationConfidenceEl.textContent = "";
+    classificationDecisionEl.textContent = "";
+  }
+}
+async function loadMetrics() {
+  try {
+    const response = await fetch("/metrics");
+    const metrics = await response.json();
+    document.getElementById("metric-f1").textContent = formatNumber(metrics.f1);
+    document.getElementById("metric-youden").textContent = formatNumber(metrics.youden_j);
+    document.getElementById("metric-auc").textContent = formatNumber(metrics.auc_roc);
+    document.getElementById("metric-f1-rating").textContent = metricRating(metrics.f1);
+    document.getElementById("metric-youden-rating").textContent = metricRating(metrics.youden_j);
+    document.getElementById("metric-auc-rating").textContent = metricRating(metrics.auc_roc);
+  } catch (error) {
+    console.error("Failed to load metrics", error);
+  }}
+// Click handler for the predict button: validates both inputs, posts them to /predict,
+// then renders the returned probability or shows the error message.
+async function handlePredict() {
+  clearError();
+  const text1 = document.getElementById("text1").value.trim();
+  const text2 = document.getElementById("text2").value.trim();
+  // Client-side check: the server rejects blank texts as well.
+  if (!text1 || !text2) {
+    showError("Please fill in both text fields.");
+    return;
+  }
+  buttonEl.disabled = true; // disable click
+  buttonEl.textContent = "Running...";
+  try {
+    const response = await fetch("/predict", {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify({ text1, text2 }),
+    });
+    const result = await response.json();
+    // Treat both a non-2xx status and an "error" field in the body as failure.
+    if (!response.ok || result.error) {
+      throw new Error(result.error || "Request failed.");
+    }
+    renderProbability(result.probability);
+  } catch (error) {
+    showError(error.message || "Request failed.");
+  } finally {
+    // Always restore the button, whether the request succeeded or failed.
+    buttonEl.disabled = false;
+    buttonEl.textContent = "Predict";
+  }}
+buttonEl.addEventListener("click", handlePredict);
+renderProbability(0, false);
+loadMetrics(); // always show performance metrics

static/cdf.png ADDED Viewed

static/image.png ADDED Viewed

Git LFS Details

SHA256: 4b1e7a0f962e931d962f613727e174c6890fb518e037dd9ac3d1504305541960
Pointer size: 131 Bytes
Size of remote file: 814 kB

static/styles.css ADDED Viewed

	@@ -0,0 +1,445 @@

+:root {
+  color-scheme: light;
+  --bg: #f7f7f5;
+  --panel: #ffffff;
+  --border: #e0dfd9;
+  --border-strong: #d4d4d0;
+  --text: #1a1a1a;
+  --muted: #6b6b6b;
+  --muted-soft: #9b9b93;
+  --success: #1a7f4b;
+  --danger: #b84040;
+}
+* {
+  box-sizing: border-box;
+}
+/* body {
+  margin: 0;
+  min-height: 100vh;
+  background: #ffffff;
+  color: var(--text);
+  font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
+} */
+body {
+  margin: 0;
+  min-height: 100vh;
+  background: #f6f3ee;
+  color: var(--text);
+  font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
+}
+.page {
+  width: 100%;
+  max-width: 1700px;
+  margin: 0 auto;
+  padding: 28px 48px 36px;
+}
+.header-bar {
+  width: 100%;
+  padding: 10px 30px;
+  background:
+    linear-gradient(180deg, rgba(255, 253, 250, 0.644) 0%, rgba(246, 243, 238, 0.72) 100%),
+    linear-gradient(90deg, #c5c1ba 0%, #cfc4b1 52%, #dfe8e2 100%);
+  border-bottom: 1px solid rgba(76, 76, 74, 0.202);
+  box-shadow: 0 14px 28px rgba(149, 149, 149, 0.08);
+}
+.header-inner {
+  width: 100%;
+  max-width: 1540px;
+  margin: 0 auto;
+  display: flex;
+  align-items: baseline;
+  gap: 18px;
+}
+.app-shell {
+  width: 100%;
+  max-width: 1540px;
+  margin: 0 auto;
+  min-height: auto;
+  display: grid;
+  grid-template-columns: minmax(960px, 1fr) 420px;
+  gap: 44px;
+  align-items: start;
+}
+.main-pane,
+.side-pane {
+  display: flex;
+  flex-direction: column;
+  gap: 22px;
+}
+.side-pane {
+  padding-top: 0;
+  min-height: 100%;
+  justify-content: center;
+}
+.title {
+  margin: 0;
+  font-size: 39px;
+  font-weight: 500;
+  /* Generic family goes last: listed first, `serif` always matches and the named fonts are never tried. */
+  font-family: Georgia, "Times New Roman", Times, serif;
+  letter-spacing: 0.02em;
+  white-space: nowrap;
+}
+.subtitle {
+  margin: 0;
+  max-width: 62ch;
+  font-size: 15px;
+  color: var(--muted);
+}
+.card { /* Base panel: tinted surface, hairline border, rounded corners, soft shadow. */
+  background: color-mix(in srgb, var(--panel) 88%, #faf6ef 12%); /* Mostly --panel with a 12% warm tint. */
+  border: 1px solid var(--border);
+  border-radius: 18px;
+  padding: 28px 28px 24px;
+  box-shadow: 0 18px 48px rgba(48, 42, 31, 0.06);
+}
+#result-card { /* Strips the .card chrome so the result sits directly on the page background. */
+  background: transparent;
+  border: none;
+  box-shadow: none;
+  padding: 8px 0 0;
+}
+.bottom-panel { /* Full-width strip under the two columns; the 1540px cap matches .app-shell. */
+  width: 100%;
+  max-width: 1540px;
+  margin: 28px auto 0;
+}
+.workspace-card { /* Input card rendered without the .card chrome. */
+  padding: 0 0 18px;
+  background: transparent;
+  border: none;
+  box-shadow: none;
+}
+.section-title { /* Small uppercase, letter-spaced heading used above each section. */
+  margin: 0 0 8px;
+  font-size: 16px;
+  font-weight: 600;
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+  color: var(--muted);
+}
+.card-note { /* Soft helper text; the negative top margin pulls it up towards the preceding element. */
+  margin: -2px 0 10px;
+  font-size: 14px;
+  color: var(--muted-soft);
+}
+.stack { /* Two equal columns holding the Text A / Text B inputs. */
+  display: grid;
+  grid-template-columns: minmax(0, 1fr) minmax(0, 1fr); /* A 0 minimum lets both columns shrink instead of overflowing. */
+  gap: 18px;
+  margin-bottom: 18px;
+}
+.input-block { /* Label stacked above its textarea. */
+  display: flex;
+  flex-direction: column;
+  gap: 8px;
+}
+.label { /* Small uppercase field label. */
+  font-size: 13px;
+  font-weight: 600;
+  letter-spacing: 0.07em;
+  text-transform: uppercase;
+  color: var(--muted);
+}
+textarea { /* Tall text input for pasted documents. */
+  width: 100%;
+  min-height: 450px; /* Reduced to 220px in the max-width: 720px media query. */
+  padding: 16px 18px;
+  font-size: 16px;
+  line-height: 1.55;
+  font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; /* Same stack as body, set explicitly on the control. */
+  border: 1px solid var(--border-strong);
+  border-radius: 14px;
+  resize: vertical; /* Users may change the height but not the width. */
+  background: rgba(255, 255, 255, 0.82);
+  color: var(--text);
+  outline: none; /* Default focus outline removed; textarea:focus supplies a border and box-shadow instead. */
+  transition: border-color 140ms ease, box-shadow 140ms ease, background-color 140ms ease;
+}
+textarea:focus { /* Custom focus indicator: tan border plus a soft 4px halo. */
+  border-color: #a48b68;
+  background: #fffdfa;
+  box-shadow: 0 0 0 4px rgba(164, 139, 104, 0.12);
+}
+button { /* Dark pill-shaped button; the selector applies to every <button> on the page. */
+  display: inline-flex;
+  align-items: center;
+  justify-content: center;
+  width: auto;
+  min-width: 146px;
+  padding: 14px 22px;
+  font-size: 16px;
+  font-weight: 500;
+  letter-spacing: 0.04em;
+  background: var(--text);
+  color: #ffffff;
+  border: none;
+  border-radius: 999px; /* Oversized radius yields fully rounded ends. */
+  cursor: pointer;
+  box-shadow: 0 10px 24px rgba(26, 26, 26, 0.18);
+  transition: transform 140ms ease, box-shadow 140ms ease, background-color 140ms ease;
+}
+button:hover:not(:disabled) { /* Lift effect on hover, skipped while the button is disabled. */
+  transform: translateY(-1px);
+  box-shadow: 0 14px 28px rgba(26, 26, 26, 0.2);
+}
+button:disabled { /* Flat grey appearance with a not-allowed cursor. */
+  background: var(--muted-soft);
+  cursor: not-allowed;
+  box-shadow: none;
+}
+.error-msg { /* Small red message; in index.html it sits just above the Predict button. */
+  margin: 0 0 8px;
+  font-size: 13px;
+  color: var(--danger);
+}
+/* hide component */
+.hidden { /* Utility class; index.html applies it to #error in the initial markup. */
+  display: none;
+}
+.result-visual { /* Row layout: meter on the left, explanatory copy on the right. */
+  display: flex;
+  align-items: center;
+  gap: 28px;
+  padding: 4px 0 0;
+}
+.result-meter { /* Ring with the classification text stacked and centred beneath it. */
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  gap: 14px;
+  flex-shrink: 0; /* Keep the meter at its natural size when the row is tight. */
+}
+.probability-ring { /* Circular gauge drawn with a conic gradient. */
+  --ring-angle: 0deg; /* Default sweep of the coloured arc; presumably overridden from app.js — confirm. */
+  --ring-color: hsl(8 72% 46%); /* Default arc colour; presumably overridden from app.js — confirm. */
+  width: 138px;
+  aspect-ratio: 1; /* Height follows width so the element stays square. */
+  border-radius: 50%;
+  background: /* Coloured from 0 to --ring-angle, translucent grey for the rest of the circle. */
+    conic-gradient(var(--ring-color) 0 var(--ring-angle), #e8e8e4a4 var(--ring-angle) 360deg);
+  display: grid;
+  place-items: center; /* Centre the inner disc. */
+  flex-shrink: 0;
+}
+.probability-ring-inner { /* Disc that masks the gradient centre, leaving an 8px ring visible. */
+  width: calc(100% - 16px);
+  height: calc(100% - 16px);
+  border-radius: 50%;
+  background: #f6f3ee; /* Same value as the body background so the disc blends into the page. */
+  color: var(--text);
+  display: grid;
+  place-items: center;
+}
+.probability-value { /* Percentage text shown inside the ring. */
+  font-size: 28px;
+  font-weight: 600;
+  letter-spacing: -0.03em;
+}
+.result-copy { /* Text column next to the meter, capped at 360px. */
+  display: flex;
+  flex-direction: column;
+  gap: 8px;
+  max-width: 360px;
+}
+.result-eyebrow { /* Small uppercase caption; used for "Probability" and for #classification-confidence. */
+  margin: 0;
+  font-size: 14px;
+  font-weight: 600;
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+  color: var(--muted);
+}
+.result-description { /* Muted explanatory sentence under the caption. */
+  margin: 0;
+  font-size: 16px;
+  line-height: 1.5;
+  color: var(--muted);
+}
+.classification { /* Centred stack under the ring: caption line plus decision line. */
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  gap: 6px;
+  text-align: center;
+}
+.classification-decision { /* Decision text; neutral colour until a state class below is added. */
+  margin: 0;
+  font-size: 22px;
+  font-weight: 600;
+  color: var(--text);
+}
+/* green for same author */
+.classification-decision.is-same {
+  color: var(--success);
+}
+/* red for different author */
+.classification-decision.is-different {
+  color: var(--danger);
+}
+.sidebar-stats { /* Three metric columns, each between 210px and 250px wide, packed to the left. */
+  display: grid;
+  grid-template-columns: repeat(3, minmax(210px, 250px));
+  justify-content: start;
+  gap: 10px;
+}
+.performance-card { /* Strips the .card chrome from the model-performance section. */
+  background: transparent;
+  border: none;
+  box-shadow: none;
+  padding: 0;
+}
+.metric-main { /* One metric: summary block followed by its explanatory note. */
+  display: flex;
+  flex-direction: column;
+  gap: 8px; /* Tightened to 6px in the narrow-screen media queries. */
+  align-items: flex-start;
+}
+.metric-summary { /* Value and label stacked tightly. */
+  display: flex;
+  flex-direction: column;
+  gap: 3px;
+}
+.meta-stat { /* Wrapper for a single metric column. */
+  display: flex;
+  flex-direction: column;
+  gap: 2px;
+  padding-right: 0;
+}
+.meta-value { /* The metric number. */
+  font-size: 22px;
+  letter-spacing: -0.2px;
+}
+.meta-label { /* Metric name shown under the value. */
+  font-size: 13px;
+  color: var(--muted);
+  letter-spacing: 0.03em;
+}
+.metric-rating { /* Rating text; in templates/index.html the only .metric-rating elements are commented out. */
+  font-size: 13px;
+  font-weight: 600;
+  color: #8b7355;
+  letter-spacing: 0.02em;
+}
+.metric-note { /* Fine-print explanation under each metric. */
+  margin: 0;
+  font-size: 11px;
+  line-height: 1.45;
+  color: var(--muted-soft);
+}
+@media (max-width: 760px) { /* Narrow layout: every grid collapses to a single column. */
+  .app-shell {
+    grid-template-columns: 1fr; /* Sidebar drops below the main pane. */
+    gap: 18px;
+  }
+  .side-pane {
+    padding-top: 0;
+  }
+  .bottom-panel {
+    margin-top: 20px;
+  }
+  .result-visual {
+    align-items: flex-start; /* Top-align the meter and copy instead of centring them. */
+  }
+  .stack {
+    grid-template-columns: 1fr; /* Text A and Text B stack vertically. */
+  }
+  .sidebar-stats {
+    grid-template-columns: 1fr; /* One metric per row. */
+    gap: 14px;
+    justify-content: stretch;
+  }
+  .metric-main {
+    gap: 6px;
+  }
+}
+/* Phone-sized layout: tighter padding, stacked header and result, smaller
+   title, textarea and ring. The max-width: 760px query above also applies
+   at these widths and already sets .metric-main { gap: 6px }, so that rule
+   is not repeated here. */
+@media (max-width: 720px) {
+  .page {
+    padding: 20px 16px 28px;
+  }
+  .card {
+    padding: 20px 18px;
+  }
+  .header-bar {
+    padding: 12px 30px;
+  }
+  .header-inner {
+    align-items: flex-start;
+    flex-direction: column;
+    gap: 3px;
+  }
+  .title {
+    font-size: 26px;
+    white-space: normal; /* Allow the title to wrap on small screens. */
+  }
+  textarea {
+    min-height: 220px;
+  }
+  .result-visual {
+    flex-direction: column; /* Meter above the explanatory copy. */
+    gap: 16px;
+  }
+  .probability-ring {
+    width: 120px;
+  }
+  .probability-value {
+    font-size: 26px;
+  }
+}

templates/index.html ADDED Viewed

	@@ -0,0 +1,122 @@

+<!doctype html>
+<html lang="en">
+<head> <!-- Document metadata and the stylesheet link. -->
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1"> <!-- Lay out at device width so the max-width media queries in styles.css take effect. -->
+  <title>AVeri</title>
+  <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}"> <!-- url_for is evaluated by the template engine to build the static asset URL. -->
+</head>
+<body>
+  <header class="header-bar"> <!-- Full-width banner; styled by .header-bar and .header-inner. -->
+    <div class="header-inner">
+      <h1 class="title">AVeri: Author Verifier</h1>
+      <!-- <p class="subtitle">Are your two texts written by the same author?</p> -->
+    </div>
+  </header>
+  <div class="page">
+    <div class="app-shell">
+      <main class="main-pane">
+        <section class="card workspace-card">
+          <!-- The two inputs to compare; .stack lays them out side by side and
+               collapses to one column at max-width: 760px. -->
+          <div class="stack">
+            <div class="input-block">
+              <label class="label" for="text1">Text A</label>
+              <textarea id="text1" placeholder="Paste first text here..."></textarea>
+            </div>
+            <div class="input-block">
+              <label class="label" for="text2">Text B</label>
+              <textarea id="text2" placeholder="Paste second text here..."></textarea>
+            </div>
+          </div>
+          <!-- Starts hidden and empty. role="alert" turns it into an assertive
+               live region so assistive technology announces the message once
+               it is filled in and shown. -->
+          <p id="error" class="error-msg hidden" role="alert"></p>
+          <button id="predict-btn" type="button">Predict</button>
+        </section>
+      </main>
+      <aside class="side-pane">
+        <section id="result-card" class="card">
+          <h2 class="section-title">Result</h2>
+          <div class="result-visual">
+            <!-- role="status" makes the meter a polite live region, so a newly
+                 written probability and decision are announced to
+                 screen-reader users. -->
+            <div class="result-meter" role="status">
+              <!-- The ring is not aria-hidden: it wraps the probability text,
+                   which is the primary result and must stay reachable by
+                   assistive technology. -->
+              <div id="probability-ring" class="probability-ring">
+                <div class="probability-ring-inner">
+                  <span id="probability" class="probability-value">0.0%</span>
+                </div>
+              </div>
+              <div class="classification">
+                <p id="classification-confidence" class="result-eyebrow"></p>
+                <p id="classification-decision" class="classification-decision"></p>
+              </div>
+            </div>
+            <div class="result-copy">
+              <p class="result-eyebrow">Probability</p>
+              <p class="result-description">Probability both texts were written by the same author.</p>
+            </div>
+          </div>
+        </section>
+      </aside>
+    </div>
+    <section class="bottom-panel"> <!-- Model-performance strip rendered below the two-column workspace. -->
+      <section class="card performance-card">
+        <h2 class="section-title">Model Performance</h2>
+        <div class="sidebar-stats"> <!-- Three metric columns; each value starts as the "-" placeholder, presumably replaced by static/app.js — confirm. -->
+          <div class="meta-stat">
+            <div class="metric-main">
+              <div class="metric-summary">
+                <span id="metric-f1" class="meta-value">-</span>
+                <span class="meta-label">F1 Score</span>
+                <!-- <span id="metric-f1-rating" class="metric-rating">-</span> -->
+              </div>
+              <p class="metric-note">The model is good at prediction without raising too many false alarms.</p> <!-- NOTE(review): this wording is static and does not follow the displayed value. -->
+            </div>
+          </div>
+          <div class="meta-stat">
+            <div class="metric-main">
+              <div class="metric-summary">
+                <span id="metric-youden" class="meta-value">-</span>
+                <span class="meta-label">Youden J</span>
+                <!-- <span id="metric-youden-rating" class="metric-rating">-</span> -->
+              </div>
+              <p class="metric-note">The model has moderate ability to correctly separate same- and different-author cases.</p> <!-- NOTE(review): static wording, same caveat as the F1 note. -->
+            </div>
+          </div>
+          <div class="meta-stat">
+            <div class="metric-main">
+              <div class="metric-summary">
+                <span id="metric-auc" class="meta-value">-</span>
+                <span class="meta-label">AUC-ROC</span>
+                <!-- <span id="metric-auc-rating" class="metric-rating">-</span> -->
+              </div>
+              <p class="metric-note">The model is strongly reliable at distinguishing between same- and different-author cases overall.</p> <!-- NOTE(review): static wording, same caveat as the F1 note. -->
+            </div>
+          </div>
+        </div>
+      </section>
+    </section>
+  </div>
+  <script src="{{ url_for('static', filename='app.js') }}"></script>
+</body>
+</html>