Spaces:

salirafi
/

AVeri

Sleeping

App Files Files Community

salirafi committed on Apr 21

Commit

2e821b5

verified ·

1 Parent(s): 66242b8

Sync from GitHub via hub-sync

Browse files

Files changed (29) hide show

README.md +1 -10
data/raw/.gitkeep +0 -0
data/raw/authorship_verification_test/.gitkeep +1 -0
data/raw/authorship_verification_train/.gitkeep +0 -0
data/raw/authorship_verification_validation/.gitkeep +0 -0
inference.py +0 -293
requirements.txt +7 -0
saved/.gitkeep +0 -0
saved/audit/.gitkeep +0 -0
saved/audit/dataframes/.gitkeep +0 -0
saved/dimensionality_reduction/.gitkeep +0 -0
saved/dimensionality_reduction/dataframes/.gitkeep +0 -0
saved/masking/.gitkeep +0 -0
saved/masking/dataframes/.gitkeep +0 -0
saved/masking/spacy_checkpoints/.gitkeep +0 -0
saved/masking/spacy_checkpoints/text1/.gitkeep +0 -0
saved/masking/spacy_checkpoints/text2/.gitkeep +0 -0
saved/model/.gitkeep +0 -0
saved/ngram_features/.gitkeep +0 -0
saved/ngram_features/dataframes/.gitkeep +0 -0
saved/normalization/.gitkeep +0 -0
saved/normalization/dataframes/.gitkeep +0 -0
saved/pairwise_baseline/.gitkeep +0 -0
saved/pairwise_baseline/predictions/.gitkeep +0 -0
saved/statistical_features/.gitkeep +0 -0
saved/statistical_features/dataframes/.gitkeep +0 -0
saved/tfidf_features/.gitkeep +0 -0
saved/tfidf_features/dataframes/.gitkeep +0 -0
src/testing.ipynb +0 -0

README.md CHANGED Viewed

@@ -1,12 +1,3 @@
----
-license: mit
-title: 'AVeri: Author Verifier'
-sdk: docker
-emoji: 📚
-colorFrom: gray
-colorTo: indigo
-short_description: An NLP-based author verifier tool.
----
 # AVeri: Author Verification
 This repository contains the source code for an *authorship verifier* tool, which predicts whether a given pair of texts was written by the same author based purely on stylistic and lexical characteristics (not semantic ones, which convey meaning or topic). The repository includes an end-to-end machine learning pipeline for preparing paired texts, extracting stylometric and lexical features, training a binary classifier, and serving the trained model through a small Flask web app.
@@ -422,4 +413,4 @@ For a new pair of texts, inference repeats the training-time transformations:
 This is a personal project intended to be a portfolio. I am not currently planning to push into production except if there are some interested collaborators, in which case, please feel free to contact me at salirafi8@gmail.com :)
-The use of generative AI includes GitHub Copilot, which helped with code syntax and with identifying bugs and errors. Everything else, including problem formulation and framework of thinking, code logic and writing, from database management to web development, was done mostly by the author.

 # AVeri: Author Verification
 This repository contains the source code for an *authorship verifier* tool, which predicts whether a given pair of texts was written by the same author based purely on stylistic and lexical characteristics (not semantic ones, which convey meaning or topic). The repository includes an end-to-end machine learning pipeline for preparing paired texts, extracting stylometric and lexical features, training a binary classifier, and serving the trained model through a small Flask web app.
 This is a personal project intended to be a portfolio. I am not currently planning to push into production except if there are some interested collaborators, in which case, please feel free to contact me at salirafi8@gmail.com :)
+The use of generative AI includes GitHub Copilot, which helped with code syntax and with identifying bugs and errors. Everything else, including problem formulation and framework of thinking, code logic and writing, from database management to web development, was done mostly by the author.

data/raw/.gitkeep ADDED Viewed

File without changes

data/raw/authorship_verification_test/.gitkeep ADDED Viewed

	@@ -0,0 +1 @@


1	+

data/raw/authorship_verification_train/.gitkeep ADDED Viewed

File without changes

data/raw/authorship_verification_validation/.gitkeep ADDED Viewed

File without changes

inference.py DELETED Viewed

@@ -1,293 +0,0 @@
-from __future__ import annotations
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any
-import numpy as np
-import pandas as pd
-from textstat import textstat
-from xgboost import XGBClassifier
-from helpers import load_json, load_pickle
-from masking_regex import mask_split as regex_mask_split
-from masking_spacy import Config as SpacyMaskingConfig, _apply_ner_mask, _build_linguistic_record, load_nlp_model
-from normalization import normalize_text, Config as NormalizationConfig
-from features_statistical import extract_split_statistics, Config as StatisticalConfig
-from features_tfidf import record_to_tfidf_text, Config as TFIDFConfig
-from features_ngram import Config as NGramConfig, build_space_free_char_ngrams, record_to_pos_sequence
-from model_training import Config as TrainingConfig
-def _coerce_tfidf_config(payload: dict[str, Any]) -> TFIDFConfig:
-    payload = dict(payload)
-    if isinstance(payload.get("ngram_range"), list):
-        payload["ngram_range"] = tuple(payload["ngram_range"])
-    return TFIDFConfig(**payload)
-def _coerce_ngram_config(payload: dict[str, Any]) -> NGramConfig:
-    payload = dict(payload)
-    if isinstance(payload.get("pos_ngram_range"), list):
-        payload["pos_ngram_range"] = tuple(payload["pos_ngram_range"])
-    return NGramConfig(**payload)
-def _coerce_statistical_config(payload: dict[str, Any]) -> StatisticalConfig:
-    payload = dict(payload)
-    if isinstance(payload.get("phrase_role_dependency_labels"), list):
-        payload["phrase_role_dependency_labels"] = tuple(payload["phrase_role_dependency_labels"])
-    return StatisticalConfig(**payload)
-def _coerce_training_config(payload: dict[str, Any]) -> TrainingConfig:
-    payload = dict(payload)
-    return TrainingConfig(**payload)
-@dataclass(slots=True)
-class PredictionResult:
-    probability_same: float
-    predicted_label: int
-    threshold: float
-    normalized_text1: str
-    normalized_text2: str
-    masked_text1: str
-    masked_text2: str
-    def to_dict(self) -> dict[str, Any]:
-        label = "Same author" if self.predicted_label == 1 else "Different author"
-        return {
-            "label": label,
-            "probability": self.probability_same,
-            "threshold": self.threshold,
-            "normalized_text1": self.normalized_text1,
-            "normalized_text2": self.normalized_text2,
-            "masked_text1": self.masked_text1,
-            "masked_text2": self.masked_text2,
-        }
-# STAND-ALONE PIPELINE TO PERFORM INFERENCE USING THE TRAINED MODEL
-class Inference:
-    def __init__(self, project_root: str | Path | None = None) -> None:
-        self.project_root = Path(project_root) if project_root is not None else Path(__file__).resolve().parents[1]
-        self.saved_dir = self.project_root / "saved"
-        self.model_dir = self.saved_dir / "model"
-        # =============================
-        # the pipeline follows what is done in src/pipeline.py but adapted to do inference instead of training
-        # =============================
-        self.normalization_config = NormalizationConfig(**load_json(self.saved_dir / "normalization" / "normalization_config.json"))
-        spacy_payload = load_json(self.saved_dir / "masking" / "spacy_config.json")
-        spacy_payload["verbose"] = False
-        spacy_payload["nlp_n_process"] = 1
-        self.spacy_config = SpacyMaskingConfig(**spacy_payload)
-        statistical_payload = load_json(self.saved_dir / "masking" / "statistical_config.json")
-        statistical_payload["verbose"] = False
-        self.statistical_config = _coerce_statistical_config(statistical_payload)
-        tfidf_payload = load_json(self.saved_dir / "tfidf_features" / "tfidf_config.json")
-        tfidf_payload["verbose"] = False
-        self.tfidf_config = _coerce_tfidf_config(tfidf_payload)
-        ngram_payload = load_json(self.saved_dir / "ngram_features" / "ngram_config.json")
-        ngram_payload["verbose"] = False
-        self.ngram_config = _coerce_ngram_config(ngram_payload)
-        training_payload = load_json(self.saved_dir / "model" / "training_config.json")
-        self.training_config = _coerce_training_config(training_payload)
-        self.tfidf_vectorizer = load_pickle(self.saved_dir / "tfidf_features" / "vectorizer.pkl")
-        self.char_vectorizer = load_pickle(self.saved_dir / "ngram_features" / "char_vectorizer.pkl")
-        self.pos_vectorizer = load_pickle(self.saved_dir / "ngram_features" / "pos_vectorizer.pkl")
-        self.model = None
-        self.threshold = float(load_json(self.model_dir / "threshold.json")["threshold"])
-        feature_spec = load_json(self.model_dir / "feature_spec.json")
-        self.suffixes: list[str] = feature_spec["suffixes"]
-        # self.pairwise_operations: tuple[str, ...] = tuple(feature_spec["pairwise_operations"])
-        self.pairwise_column_pairs = [(f"text1_{suffix}", f"text2_{suffix}") for suffix in self.suffixes]
-        self.metrics = load_json(self.model_dir / "metrics.json")
-        self.nlp = None
-    def _load_model(self) -> XGBClassifier:
-        if self.model is None:
-            model_path = self.model_dir / "model.json"
-            if not model_path.exists():
-                raise FileNotFoundError(f"Missing '{model_path}'")
-            model = XGBClassifier()
-            model.load_model(model_path)
-            self.model = model
-        return self.model
-    def _predict_positive_proba(self, X: np.ndarray) -> float:
-        model = self._load_model()
-        return float(model.predict_proba(X)[0, 1])
-    def _mask_one_text(self, text: str) -> tuple[str, dict[str, Any]]:
-        if self.nlp is None:
-            self.nlp = load_nlp_model(config=self.spacy_config)
-        doc = self.nlp(text)
-        masked_text, _ = _apply_ner_mask(text, doc)
-        record = _build_linguistic_record(doc)
-        return masked_text, record
-    def _build_pairwise_vector(self, feature_df: pd.DataFrame) -> np.ndarray:
-        row_values = feature_df.iloc[0].to_dict()
-        width = len(self.pairwise_column_pairs) * 2 # two pairwise operations: abs. diff & dot product
-        X_pair = np.empty((1, width), dtype=np.float32)
-        column_index = 0
-        for left_col, right_col in self.pairwise_column_pairs:
-            left = np.float32(row_values.get(left_col, 0.0))
-            right = np.float32(row_values.get(right_col, 0.0))
-            diff = left - right
-            X_pair[0, column_index] = abs(diff)
-            X_pair[0, column_index + 1] = left * right
-            column_index += 2
-        return X_pair
-    def _family_suffix_groups(self) -> dict[str, list[str]]:
-        return {
-            "tfidf": [s for s in self.suffixes if s.startswith("tfidf_")],
-            "char_ngrams": [s for s in self.suffixes if s.startswith("char") and "_tfidf_" in s],
-            "pos_ngrams": [s for s in self.suffixes if s.startswith("pos") and "_tfidf_" in s],
-            "scalar": [s for s in self.suffixes if not (
-                s.startswith("tfidf_")
-                or (s.startswith("char") and "_tfidf_" in s)
-                or (s.startswith("pos") and "_tfidf_" in s)
-            )],
-        }
-    def _build_global_pairwise_vector(self, feature_df: pd.DataFrame) -> np.ndarray:
-        row_values = feature_df.iloc[0].to_dict()
-        values: list[float] = []
-        for family_suffixes in self._family_suffix_groups().values():
-            if not family_suffixes:
-                continue
-            left = np.array([row_values.get(f"text1_{suffix}", 0.0) for suffix in family_suffixes], dtype=np.float32)
-            right = np.array([row_values.get(f"text2_{suffix}", 0.0) for suffix in family_suffixes], dtype=np.float32)
-            denominator = float(np.linalg.norm(left) * np.linalg.norm(right))
-            cosine = float(np.dot(left, right) / denominator) if denominator > 0 else 0.0
-            diff = left - right
-            l1 = float(np.abs(diff).sum())
-            l2 = float(np.linalg.norm(diff))
-            values.extend([cosine, l1, l2])
-        return np.array(values, dtype=np.float32).reshape(1, -1)
-    # predict probability and classification of two given texts (input from the user)
-    def predict(self, text1: str, text2: str, threshold: float | None = None) -> PredictionResult:
-        threshold_value = self.threshold if threshold is None else float(threshold)
-        pair_df = pd.DataFrame([{
-                    "text1": normalize_text(text1, config=self.normalization_config),
-                    "text2": normalize_text(text2, config=self.normalization_config),
-                    "same": 0,
-                }])
-        regex_masked_df, _ = regex_mask_split(pair_df)
-        # spaCy masking; not using nlp.pipe
-        masked_text1, record1 = self._mask_one_text(regex_masked_df.iloc[0]["text1"])
-        masked_text2, record2 = self._mask_one_text(regex_masked_df.iloc[0]["text2"])
-        masked_df = regex_masked_df.copy()
-        masked_df.at[0, "text1"] = masked_text1 # combining regex and spaCy masking
-        masked_df.at[0, "text2"] = masked_text2 # ...
-        split_cache = {"text1": [record1], "text2": [record2]} # the linguistic cache
-        feature_df = pd.DataFrame() # initialize empty dataframe for the features
-        # ======== statistical features ===========
-        if self.training_config.include_statistical:
-            feature_df = extract_split_statistics(
-                masked_df,
-                split_cache=split_cache,
-                split_name="inference",
-                config=self.statistical_config,
-            )
-        # ======== TF-IDF features ===========
-        if self.training_config.include_tfidf:
-            for column in ("text1", "text2"):
-                docs = [record_to_tfidf_text(record, config=self.tfidf_config) for record in split_cache[column]]
-                tfidf_matrix = self.tfidf_vectorizer.transform(docs).toarray()
-                tfidf_cols = [f"{column}_tfidf_{index:05d}" for index in range(tfidf_matrix.shape[1])]
-                tfidf_df = pd.DataFrame(tfidf_matrix, columns=tfidf_cols)
-                feature_df = pd.concat([feature_df.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)
-        # ======== n-gram features ===========
-        for column in ("text1", "text2"):
-            if self.training_config.include_char_ngrams:
-                char_docs = [
-                    " ".join(build_space_free_char_ngrams(text, n=self.ngram_config.char_ngram_n))
-                    for text in masked_df[column].tolist()]
-                char_matrix = self.char_vectorizer.transform(char_docs).toarray()
-                char_cols = [
-                    f"{column}_char{self.ngram_config.char_ngram_n}_tfidf_{index:05d}"
-                    for index in range(char_matrix.shape[1])]
-                char_df = pd.DataFrame(char_matrix, columns=char_cols)
-                feature_df = pd.concat([feature_df.reset_index(drop=True), char_df.reset_index(drop=True)], axis=1)
-            if self.training_config.include_pos_ngrams:
-                pos_docs = [" ".join(record_to_pos_sequence(record)) for record in split_cache[column]]
-                pos_matrix = self.pos_vectorizer.transform(pos_docs).toarray()
-                pos_cols = [
-                    f"{column}_pos{self.ngram_config.pos_ngram_range}_tfidf_{index:05d}"
-                    for index in range(pos_matrix.shape[1])]
-                pos_df = pd.DataFrame(pos_matrix, columns=pos_cols)
-                feature_df = pd.concat([feature_df.reset_index(drop=True), pos_df.reset_index(drop=True)], axis=1)
-            continue
-        # ======== readability features ===========
-        if self.training_config.include_readability:
-            readability_df = pd.DataFrame([{
-                        "text1_readability_flesch_kincaid_grade": round(textstat.flesch_kincaid_grade(masked_df.iloc[0]["text1"]), 5),
-                        "text1_readability_gunning_fog": round(textstat.gunning_fog(masked_df.iloc[0]["text1"]), 5),
-                        "text1_readability_smog": round(textstat.smog_index(masked_df.iloc[0]["text1"]), 5),
-                        "text1_readability_coleman_liau": round(textstat.coleman_liau_index(masked_df.iloc[0]["text1"]), 5),
-                        "text2_readability_flesch_kincaid_grade": round(textstat.flesch_kincaid_grade(masked_df.iloc[0]["text2"]), 5),
-                        "text2_readability_gunning_fog": round(textstat.gunning_fog(masked_df.iloc[0]["text2"]), 5),
-                        "text2_readability_smog": round(textstat.smog_index(masked_df.iloc[0]["text2"]), 5),
-                        "text2_readability_coleman_liau": round(textstat.coleman_liau_index(masked_df.iloc[0]["text2"]), 5)
-                    }])
-            feature_df = pd.concat([feature_df.reset_index(drop=True), readability_df.reset_index(drop=True)], axis=1)
-        blocks: list[np.ndarray] = []
-        if self.training_config.include_local_pairwise:
-            blocks.append(self._build_pairwise_vector(feature_df)) # optimized
-        if self.training_config.include_global_pairwise:
-            blocks.append(self._build_global_pairwise_vector(feature_df))
-        if not blocks:
-            raise ValueError("At least one of include_local_pairwise or include_global_pairwise must be True.")
-        X = np.hstack(blocks).astype(np.float32)
-        probability_same = self._predict_positive_proba(X)
-        predicted_label = int(probability_same >= threshold_value) # 1 if >= threshold, otherwise 0
-        return PredictionResult(
-            probability_same=probability_same,
-            predicted_label=predicted_label,
-            threshold=threshold_value,
-            normalized_text1=pair_df.iloc[0]["text1"],
-            normalized_text2=pair_df.iloc[0]["text2"],
-            masked_text1=masked_df.iloc[0]["text1"],
-            masked_text2=masked_df.iloc[0]["text2"],
-        )

requirements.txt CHANGED Viewed

@@ -5,6 +5,13 @@ scipy==1.17.1
 scikit-learn==1.8.0
 xgboost==3.2.0
 spacy==3.8.14
 ftfy==6.3.1
 textstat==0.7.13
 tqdm==4.67.3

 scikit-learn==1.8.0
 xgboost==3.2.0
 spacy==3.8.14
+datasets==4.8.4
 ftfy==6.3.1
 textstat==0.7.13
 tqdm==4.67.3
+pyarrow==23.0.1
+# after installing these, install the spaCy model used by the saved config:
+# python -m spacy download en_core_web_lg

saved/.gitkeep ADDED Viewed

File without changes

saved/audit/.gitkeep ADDED Viewed

File without changes

saved/audit/dataframes/.gitkeep ADDED Viewed

File without changes

saved/dimensionality_reduction/.gitkeep ADDED Viewed

File without changes

saved/dimensionality_reduction/dataframes/.gitkeep ADDED Viewed

File without changes

saved/masking/.gitkeep ADDED Viewed

File without changes

saved/masking/dataframes/.gitkeep ADDED Viewed

File without changes

saved/masking/spacy_checkpoints/.gitkeep ADDED Viewed

File without changes

saved/masking/spacy_checkpoints/text1/.gitkeep ADDED Viewed

File without changes

saved/masking/spacy_checkpoints/text2/.gitkeep ADDED Viewed

File without changes

saved/model/.gitkeep ADDED Viewed

File without changes

saved/ngram_features/.gitkeep ADDED Viewed

File without changes

saved/ngram_features/dataframes/.gitkeep ADDED Viewed

File without changes

saved/normalization/.gitkeep ADDED Viewed

File without changes

saved/normalization/dataframes/.gitkeep ADDED Viewed

File without changes

saved/pairwise_baseline/.gitkeep ADDED Viewed

File without changes

saved/pairwise_baseline/predictions/.gitkeep ADDED Viewed

File without changes

saved/statistical_features/.gitkeep ADDED Viewed

File without changes

saved/statistical_features/dataframes/.gitkeep ADDED Viewed

File without changes

saved/tfidf_features/.gitkeep ADDED Viewed

File without changes

saved/tfidf_features/dataframes/.gitkeep ADDED Viewed

File without changes

src/testing.ipynb DELETED Viewed

The diff for this file is too large to render. See raw diff