salirafi committed on
Commit
2e821b5
·
verified ·
1 Parent(s): 66242b8

Sync from GitHub via hub-sync

Browse files
README.md CHANGED
@@ -1,12 +1,3 @@
1
- ---
2
- license: mit
3
- title: 'AVeri: Author Verifier'
4
- sdk: docker
5
- emoji: 📚
6
- colorFrom: gray
7
- colorTo: indigo
8
- short_description: An NLP-based author verifier tool.
9
- ---
10
  # AVeri: Author Verification
11
 
12
  This repository contains the source code for an *authorship verifier* tool, which predicts whether a given pair of texts was written by the same author based purely on stylistic and lexical characteristics (not semantic ones, which convey meaning or topic). The repository includes an end-to-end machine learning pipeline for preparing paired texts, extracting stylometric and lexical features, training a binary classifier, and serving the trained model through a small Flask web app.
@@ -422,4 +413,4 @@ For a new pair of texts, inference repeats the training-time transformations:
422
 
423
  This is a personal project intended as a portfolio piece. I am not currently planning to push it into production unless there are interested collaborators, in which case please feel free to contact me at salirafi8@gmail.com :)
424
 
425
- Generative AI use: GitHub Copilot was used to help with code syntax and with identifying bugs and errors. Everything else, including problem formulation and framework of thinking, the logical reasoning behind and writing of the code, from database management to web development, was done mostly by the author.
 
 
 
 
 
 
 
 
 
 
1
  # AVeri: Author Verification
2
 
3
  This repository contains the source code for an *authorship verifier* tool, which predicts whether a given pair of texts was written by the same author based purely on stylistic and lexical characteristics (not semantic ones, which convey meaning or topic). The repository includes an end-to-end machine learning pipeline for preparing paired texts, extracting stylometric and lexical features, training a binary classifier, and serving the trained model through a small Flask web app.
 
413
 
414
  This is a personal project intended as a portfolio piece. I am not currently planning to push it into production unless there are interested collaborators, in which case please feel free to contact me at salirafi8@gmail.com :)
415
 
416
+ Generative AI use: GitHub Copilot was used to help with code syntax and with identifying bugs and errors. Everything else, including problem formulation and framework of thinking, the logical reasoning behind and writing of the code, from database management to web development, was done mostly by the author.
data/raw/.gitkeep ADDED
File without changes
data/raw/authorship_verification_test/.gitkeep ADDED
@@ -0,0 +1 @@
 
 
1
+
data/raw/authorship_verification_train/.gitkeep ADDED
File without changes
data/raw/authorship_verification_validation/.gitkeep ADDED
File without changes
inference.py DELETED
@@ -1,293 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from dataclasses import dataclass
4
- from pathlib import Path
5
- from typing import Any
6
-
7
- import numpy as np
8
- import pandas as pd
9
- from textstat import textstat
10
- from xgboost import XGBClassifier
11
-
12
- from helpers import load_json, load_pickle
13
-
14
- from masking_regex import mask_split as regex_mask_split
15
- from masking_spacy import Config as SpacyMaskingConfig, _apply_ner_mask, _build_linguistic_record, load_nlp_model
16
- from normalization import normalize_text, Config as NormalizationConfig
17
- from features_statistical import extract_split_statistics, Config as StatisticalConfig
18
- from features_tfidf import record_to_tfidf_text, Config as TFIDFConfig
19
- from features_ngram import Config as NGramConfig, build_space_free_char_ngrams, record_to_pos_sequence
20
- from model_training import Config as TrainingConfig
21
-
22
-
23
-
24
- def _coerce_tfidf_config(payload: dict[str, Any]) -> TFIDFConfig:
25
- payload = dict(payload)
26
- if isinstance(payload.get("ngram_range"), list):
27
- payload["ngram_range"] = tuple(payload["ngram_range"])
28
- return TFIDFConfig(**payload)
29
- def _coerce_ngram_config(payload: dict[str, Any]) -> NGramConfig:
30
- payload = dict(payload)
31
- if isinstance(payload.get("pos_ngram_range"), list):
32
- payload["pos_ngram_range"] = tuple(payload["pos_ngram_range"])
33
- return NGramConfig(**payload)
34
- def _coerce_statistical_config(payload: dict[str, Any]) -> StatisticalConfig:
35
- payload = dict(payload)
36
- if isinstance(payload.get("phrase_role_dependency_labels"), list):
37
- payload["phrase_role_dependency_labels"] = tuple(payload["phrase_role_dependency_labels"])
38
- return StatisticalConfig(**payload)
39
- def _coerce_training_config(payload: dict[str, Any]) -> TrainingConfig:
40
- payload = dict(payload)
41
- return TrainingConfig(**payload)
42
-
43
- @dataclass(slots=True)
44
- class PredictionResult:
45
- probability_same: float
46
- predicted_label: int
47
- threshold: float
48
- normalized_text1: str
49
- normalized_text2: str
50
- masked_text1: str
51
- masked_text2: str
52
- def to_dict(self) -> dict[str, Any]:
53
- label = "Same author" if self.predicted_label == 1 else "Different author"
54
- return {
55
- "label": label,
56
- "probability": self.probability_same,
57
- "threshold": self.threshold,
58
- "normalized_text1": self.normalized_text1,
59
- "normalized_text2": self.normalized_text2,
60
- "masked_text1": self.masked_text1,
61
- "masked_text2": self.masked_text2,
62
- }
63
-
64
-
65
-
66
- # STAND-ALONE PIPELINE TO PERFORM INFERENCE USING THE TRAINED MODEL
67
- class Inference:
68
- def __init__(self, project_root: str | Path | None = None) -> None:
69
-
70
- self.project_root = Path(project_root) if project_root is not None else Path(__file__).resolve().parents[1]
71
- self.saved_dir = self.project_root / "saved"
72
- self.model_dir = self.saved_dir / "model"
73
-
74
- # =============================
75
- # the pipeline follows what is done in src/pipeline.py but adapted to do inference instead of training
76
- # =============================
77
-
78
- self.normalization_config = NormalizationConfig(**load_json(self.saved_dir / "normalization" / "normalization_config.json"))
79
-
80
- spacy_payload = load_json(self.saved_dir / "masking" / "spacy_config.json")
81
- spacy_payload["verbose"] = False
82
- spacy_payload["nlp_n_process"] = 1
83
- self.spacy_config = SpacyMaskingConfig(**spacy_payload)
84
-
85
- statistical_payload = load_json(self.saved_dir / "masking" / "statistical_config.json")
86
- statistical_payload["verbose"] = False
87
- self.statistical_config = _coerce_statistical_config(statistical_payload)
88
-
89
- tfidf_payload = load_json(self.saved_dir / "tfidf_features" / "tfidf_config.json")
90
- tfidf_payload["verbose"] = False
91
- self.tfidf_config = _coerce_tfidf_config(tfidf_payload)
92
-
93
- ngram_payload = load_json(self.saved_dir / "ngram_features" / "ngram_config.json")
94
- ngram_payload["verbose"] = False
95
- self.ngram_config = _coerce_ngram_config(ngram_payload)
96
-
97
- training_payload = load_json(self.saved_dir / "model" / "training_config.json")
98
- self.training_config = _coerce_training_config(training_payload)
99
-
100
- self.tfidf_vectorizer = load_pickle(self.saved_dir / "tfidf_features" / "vectorizer.pkl")
101
- self.char_vectorizer = load_pickle(self.saved_dir / "ngram_features" / "char_vectorizer.pkl")
102
- self.pos_vectorizer = load_pickle(self.saved_dir / "ngram_features" / "pos_vectorizer.pkl")
103
-
104
- self.model = None
105
- self.threshold = float(load_json(self.model_dir / "threshold.json")["threshold"])
106
-
107
- feature_spec = load_json(self.model_dir / "feature_spec.json")
108
- self.suffixes: list[str] = feature_spec["suffixes"]
109
- # self.pairwise_operations: tuple[str, ...] = tuple(feature_spec["pairwise_operations"])
110
- self.pairwise_column_pairs = [(f"text1_{suffix}", f"text2_{suffix}") for suffix in self.suffixes]
111
-
112
- self.metrics = load_json(self.model_dir / "metrics.json")
113
- self.nlp = None
114
-
115
- def _load_model(self) -> XGBClassifier:
116
- if self.model is None:
117
- model_path = self.model_dir / "model.json"
118
- if not model_path.exists():
119
- raise FileNotFoundError(f"Missing '{model_path}'")
120
- model = XGBClassifier()
121
- model.load_model(model_path)
122
- self.model = model
123
- return self.model
124
-
125
-
126
- def _predict_positive_proba(self, X: np.ndarray) -> float:
127
- model = self._load_model()
128
- return float(model.predict_proba(X)[0, 1])
129
-
130
- def _mask_one_text(self, text: str) -> tuple[str, dict[str, Any]]:
131
- if self.nlp is None:
132
- self.nlp = load_nlp_model(config=self.spacy_config)
133
- doc = self.nlp(text)
134
- masked_text, _ = _apply_ner_mask(text, doc)
135
- record = _build_linguistic_record(doc)
136
- return masked_text, record
137
-
138
- def _build_pairwise_vector(self, feature_df: pd.DataFrame) -> np.ndarray:
139
- row_values = feature_df.iloc[0].to_dict()
140
- width = len(self.pairwise_column_pairs) * 2 # two pairwise operations: abs. diff & dot product
141
- X_pair = np.empty((1, width), dtype=np.float32)
142
- column_index = 0
143
- for left_col, right_col in self.pairwise_column_pairs:
144
- left = np.float32(row_values.get(left_col, 0.0))
145
- right = np.float32(row_values.get(right_col, 0.0))
146
- diff = left - right
147
- X_pair[0, column_index] = abs(diff)
148
- X_pair[0, column_index + 1] = left * right
149
- column_index += 2
150
-
151
- return X_pair
152
-
153
- def _family_suffix_groups(self) -> dict[str, list[str]]:
154
- return {
155
- "tfidf": [s for s in self.suffixes if s.startswith("tfidf_")],
156
- "char_ngrams": [s for s in self.suffixes if s.startswith("char") and "_tfidf_" in s],
157
- "pos_ngrams": [s for s in self.suffixes if s.startswith("pos") and "_tfidf_" in s],
158
- "scalar": [s for s in self.suffixes if not (
159
- s.startswith("tfidf_")
160
- or (s.startswith("char") and "_tfidf_" in s)
161
- or (s.startswith("pos") and "_tfidf_" in s)
162
- )],
163
- }
164
-
165
- def _build_global_pairwise_vector(self, feature_df: pd.DataFrame) -> np.ndarray:
166
- row_values = feature_df.iloc[0].to_dict()
167
- values: list[float] = []
168
-
169
-
170
- for family_suffixes in self._family_suffix_groups().values():
171
- if not family_suffixes:
172
- continue
173
-
174
- left = np.array([row_values.get(f"text1_{suffix}", 0.0) for suffix in family_suffixes], dtype=np.float32)
175
- right = np.array([row_values.get(f"text2_{suffix}", 0.0) for suffix in family_suffixes], dtype=np.float32)
176
-
177
- denominator = float(np.linalg.norm(left) * np.linalg.norm(right))
178
- cosine = float(np.dot(left, right) / denominator) if denominator > 0 else 0.0
179
- diff = left - right
180
- l1 = float(np.abs(diff).sum())
181
- l2 = float(np.linalg.norm(diff))
182
-
183
- values.extend([cosine, l1, l2])
184
-
185
- return np.array(values, dtype=np.float32).reshape(1, -1)
186
-
187
- # predict probability and classification of two given texts (input from the user)
188
- def predict(self, text1: str, text2: str, threshold: float | None = None) -> PredictionResult:
189
-
190
- threshold_value = self.threshold if threshold is None else float(threshold)
191
-
192
- pair_df = pd.DataFrame([{
193
- "text1": normalize_text(text1, config=self.normalization_config),
194
- "text2": normalize_text(text2, config=self.normalization_config),
195
- "same": 0,
196
- }])
197
-
198
- regex_masked_df, _ = regex_mask_split(pair_df)
199
-
200
- # spaCy masking; not using nlp.pipe
201
- masked_text1, record1 = self._mask_one_text(regex_masked_df.iloc[0]["text1"])
202
- masked_text2, record2 = self._mask_one_text(regex_masked_df.iloc[0]["text2"])
203
-
204
- masked_df = regex_masked_df.copy()
205
- masked_df.at[0, "text1"] = masked_text1 # combining regex and spaCy masking
206
- masked_df.at[0, "text2"] = masked_text2 # ...
207
-
208
- split_cache = {"text1": [record1], "text2": [record2]} # the linguistic cache
209
- feature_df = pd.DataFrame() # initialize empty dataframe for the features
210
-
211
- # ======== statistical features ===========
212
-
213
- if self.training_config.include_statistical:
214
- feature_df = extract_split_statistics(
215
- masked_df,
216
- split_cache=split_cache,
217
- split_name="inference",
218
- config=self.statistical_config,
219
- )
220
-
221
- # ======== TF-IDF features ===========
222
-
223
- if self.training_config.include_tfidf:
224
- for column in ("text1", "text2"):
225
- docs = [record_to_tfidf_text(record, config=self.tfidf_config) for record in split_cache[column]]
226
- tfidf_matrix = self.tfidf_vectorizer.transform(docs).toarray()
227
- tfidf_cols = [f"{column}_tfidf_{index:05d}" for index in range(tfidf_matrix.shape[1])]
228
- tfidf_df = pd.DataFrame(tfidf_matrix, columns=tfidf_cols)
229
- feature_df = pd.concat([feature_df.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)
230
-
231
- # ======== n-gram features ===========
232
-
233
- for column in ("text1", "text2"):
234
-
235
- if self.training_config.include_char_ngrams:
236
- char_docs = [
237
- " ".join(build_space_free_char_ngrams(text, n=self.ngram_config.char_ngram_n))
238
- for text in masked_df[column].tolist()]
239
- char_matrix = self.char_vectorizer.transform(char_docs).toarray()
240
- char_cols = [
241
- f"{column}_char{self.ngram_config.char_ngram_n}_tfidf_{index:05d}"
242
- for index in range(char_matrix.shape[1])]
243
- char_df = pd.DataFrame(char_matrix, columns=char_cols)
244
- feature_df = pd.concat([feature_df.reset_index(drop=True), char_df.reset_index(drop=True)], axis=1)
245
-
246
- if self.training_config.include_pos_ngrams:
247
- pos_docs = [" ".join(record_to_pos_sequence(record)) for record in split_cache[column]]
248
- pos_matrix = self.pos_vectorizer.transform(pos_docs).toarray()
249
- pos_cols = [
250
- f"{column}_pos{self.ngram_config.pos_ngram_range}_tfidf_{index:05d}"
251
- for index in range(pos_matrix.shape[1])]
252
- pos_df = pd.DataFrame(pos_matrix, columns=pos_cols)
253
- feature_df = pd.concat([feature_df.reset_index(drop=True), pos_df.reset_index(drop=True)], axis=1)
254
-
255
- continue
256
-
257
- # ======== readability features ===========
258
-
259
- if self.training_config.include_readability:
260
- readability_df = pd.DataFrame([{
261
- "text1_readability_flesch_kincaid_grade": round(textstat.flesch_kincaid_grade(masked_df.iloc[0]["text1"]), 5),
262
- "text1_readability_gunning_fog": round(textstat.gunning_fog(masked_df.iloc[0]["text1"]), 5),
263
- "text1_readability_smog": round(textstat.smog_index(masked_df.iloc[0]["text1"]), 5),
264
- "text1_readability_coleman_liau": round(textstat.coleman_liau_index(masked_df.iloc[0]["text1"]), 5),
265
-
266
- "text2_readability_flesch_kincaid_grade": round(textstat.flesch_kincaid_grade(masked_df.iloc[0]["text2"]), 5),
267
- "text2_readability_gunning_fog": round(textstat.gunning_fog(masked_df.iloc[0]["text2"]), 5),
268
- "text2_readability_smog": round(textstat.smog_index(masked_df.iloc[0]["text2"]), 5),
269
- "text2_readability_coleman_liau": round(textstat.coleman_liau_index(masked_df.iloc[0]["text2"]), 5)
270
- }])
271
- feature_df = pd.concat([feature_df.reset_index(drop=True), readability_df.reset_index(drop=True)], axis=1)
272
-
273
- blocks: list[np.ndarray] = []
274
- if self.training_config.include_local_pairwise:
275
- blocks.append(self._build_pairwise_vector(feature_df)) # optimized
276
- if self.training_config.include_global_pairwise:
277
- blocks.append(self._build_global_pairwise_vector(feature_df))
278
- if not blocks:
279
- raise ValueError("At least one of include_local_pairwise or include_global_pairwise must be True.")
280
-
281
- X = np.hstack(blocks).astype(np.float32)
282
- probability_same = self._predict_positive_proba(X)
283
- predicted_label = int(probability_same >= threshold_value) # 1 if > threshold, otherwise 0
284
-
285
- return PredictionResult(
286
- probability_same=probability_same,
287
- predicted_label=predicted_label,
288
- threshold=threshold_value,
289
- normalized_text1=pair_df.iloc[0]["text1"],
290
- normalized_text2=pair_df.iloc[0]["text2"],
291
- masked_text1=masked_df.iloc[0]["text1"],
292
- masked_text2=masked_df.iloc[0]["text2"],
293
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -5,6 +5,13 @@ scipy==1.17.1
5
  scikit-learn==1.8.0
6
  xgboost==3.2.0
7
  spacy==3.8.14
 
8
  ftfy==6.3.1
9
  textstat==0.7.13
10
  tqdm==4.67.3
 
 
 
 
 
 
 
5
  scikit-learn==1.8.0
6
  xgboost==3.2.0
7
  spacy==3.8.14
8
+ datasets==4.8.4
9
  ftfy==6.3.1
10
  textstat==0.7.13
11
  tqdm==4.67.3
12
+ pyarrow==23.0.1
13
+
14
+
15
+
16
+ # after installing these, install the spaCy model used by the saved config:
17
+ # python -m spacy download en_core_web_lg
saved/.gitkeep ADDED
File without changes
saved/audit/.gitkeep ADDED
File without changes
saved/audit/dataframes/.gitkeep ADDED
File without changes
saved/dimensionality_reduction/.gitkeep ADDED
File without changes
saved/dimensionality_reduction/dataframes/.gitkeep ADDED
File without changes
saved/masking/.gitkeep ADDED
File without changes
saved/masking/dataframes/.gitkeep ADDED
File without changes
saved/masking/spacy_checkpoints/.gitkeep ADDED
File without changes
saved/masking/spacy_checkpoints/text1/.gitkeep ADDED
File without changes
saved/masking/spacy_checkpoints/text2/.gitkeep ADDED
File without changes
saved/model/.gitkeep ADDED
File without changes
saved/ngram_features/.gitkeep ADDED
File without changes
saved/ngram_features/dataframes/.gitkeep ADDED
File without changes
saved/normalization/.gitkeep ADDED
File without changes
saved/normalization/dataframes/.gitkeep ADDED
File without changes
saved/pairwise_baseline/.gitkeep ADDED
File without changes
saved/pairwise_baseline/predictions/.gitkeep ADDED
File without changes
saved/statistical_features/.gitkeep ADDED
File without changes
saved/statistical_features/dataframes/.gitkeep ADDED
File without changes
saved/tfidf_features/.gitkeep ADDED
File without changes
saved/tfidf_features/dataframes/.gitkeep ADDED
File without changes
src/testing.ipynb DELETED
The diff for this file is too large to render. See raw diff