""" AIFinder Feature Extraction TF-IDF pipeline + stylometric features. Supports CoT-aware and no-CoT text preprocessing. """ import re import numpy as np from scipy.sparse import hstack, csr_matrix from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.preprocessing import MaxAbsScaler from sklearn.base import BaseEstimator, TransformerMixin from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS # --- Text Preprocessing --- def strip_cot(text): """Remove ... blocks from text.""" return re.sub(r".*?", "", text, flags=re.DOTALL).strip() def has_cot(text): """Check if text contains ... blocks.""" return bool(re.search(r".*?", text, flags=re.DOTALL)) def cot_ratio(text): """Ratio of thinking text to total text length.""" think_matches = re.findall(r"(.*?)", text, flags=re.DOTALL) if not think_matches or len(text) == 0: return 0.0 think_len = sum(len(m) for m in think_matches) return think_len / len(text) # --- Stylometric Features --- class StylometricFeatures(BaseEstimator, TransformerMixin): """Extract stylometric features from text.""" def fit(self, X, y=None): return self def transform(self, X): features = [] for text in X: features.append(self._extract(text)) return csr_matrix(np.array(features, dtype=np.float32)) def _extract(self, text): sentences = re.split(r'[.!?]+', text) sentences = [s.strip() for s in sentences if s.strip()] words = text.split() n_chars = max(len(text), 1) n_words = max(len(words), 1) n_sentences = max(len(sentences), 1) # Basic stats avg_word_len = np.mean([len(w) for w in words]) if words else 0 avg_sent_len = n_words / n_sentences # Punctuation densities n_commas = text.count(",") / n_chars n_semicolons = text.count(";") / n_chars n_colons = text.count(":") / n_chars n_exclaim = text.count("!") / n_chars n_question = text.count("?") / n_chars n_ellipsis = text.count("...") / n_chars n_dash = (text.count("—") + text.count("--")) / n_chars # Markdown elements n_headers = len(re.findall(r'^#{1,6}\s', text, re.MULTILINE)) / n_sentences n_bold = len(re.findall(r'\*\*.*?\*\*', text)) / n_sentences n_italic = len(re.findall(r'(? blocks for TF-IDF so n-grams learn style, not CoT texts_no_cot = [strip_cot(t) for t in texts] t0 = time.time() word_features = self.word_tfidf.fit_transform(texts_no_cot) print(f" word tfidf: {word_features.shape[1]} features ({time.time()-t0:.1f}s)") t0 = time.time() char_features = self.char_tfidf.fit_transform(texts_no_cot) print(f" char tfidf: {char_features.shape[1]} features ({time.time()-t0:.1f}s)") # Stylometric uses original text (has_think, think_ratio still work) t0 = time.time() stylo_features = self.stylo.fit_transform(texts) print(f" stylometric: {stylo_features.shape[1]} features ({time.time()-t0:.1f}s)") combined = hstack([word_features, char_features, stylo_features]) combined = self.scaler.fit_transform(combined) print(f" Combined feature matrix: {combined.shape}") return combined def transform(self, texts): """Transform texts into feature matrix (after fitting).""" texts_no_cot = [strip_cot(t) for t in texts] word_features = self.word_tfidf.transform(texts_no_cot) char_features = self.char_tfidf.transform(texts_no_cot) stylo_features = self.stylo.transform(texts) combined = hstack([word_features, char_features, stylo_features]) return self.scaler.transform(combined)