"""
AIFinder Feature Extraction
TF-IDF pipeline + stylometric features.
Supports CoT-aware and no-CoT text preprocessing.
"""
import re
import numpy as np
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.base import BaseEstimator, TransformerMixin
from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS
# --- Text Preprocessing ---
def strip_cot(text):
    """Remove <think>...</think> chain-of-thought blocks from *text*.

    Returns the text with every CoT block deleted and surrounding
    whitespace stripped.

    Bug fix: the pattern had lost its literal ``<think>``/``</think>``
    tags (an HTML-stripping artifact), leaving ``r".*?"`` which matches
    only empty strings — the substitution was a no-op.
    """
    # DOTALL so CoT blocks spanning multiple lines are removed too.
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
def has_cot(text):
    """Return True if *text* contains a <think>...</think> block.

    Bug fix: the pattern had lost its literal ``<think>``/``</think>``
    tags, so ``re.search(r".*?", text)`` matched the empty string at
    position 0 and the function always returned True.
    """
    # DOTALL: a block must be closed, but may span multiple lines.
    return bool(re.search(r"<think>.*?</think>", text, flags=re.DOTALL))
def cot_ratio(text):
    """Ratio of <think> chain-of-thought text length to total text length.

    Returns 0.0 when there are no <think>...</think> blocks or the text
    is empty.

    Bug fix: the pattern had lost its literal tags, so
    ``findall(r"(.*?)")`` captured only empty strings and the ratio was
    always 0.0.
    """
    # Capture only the *contents* of each block (tags excluded from the count).
    think_matches = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    if not think_matches or len(text) == 0:
        return 0.0
    think_len = sum(len(m) for m in think_matches)
    return think_len / len(text)
# --- Stylometric Features ---
class StylometricFeatures(BaseEstimator, TransformerMixin):
"""Extract stylometric features from text."""
def fit(self, X, y=None):
    """No-op fit: stylometric extraction is stateless, so there is
    nothing to learn. Returns the estimator itself (sklearn contract)."""
    return self
def transform(self, X):
    """Map each text in *X* to its stylometric feature row and return
    the result as a sparse float32 matrix (one row per input text)."""
    rows = [self._extract(doc) for doc in X]
    return csr_matrix(np.array(rows, dtype=np.float32))
def _extract(self, text):
    """Compute per-text stylometric statistics (lengths, punctuation
    densities, markdown-element rates).

    NOTE(review): the tail of this method (remaining features and the
    return statement) was lost to an extraction artifact — see the
    corrupted line flagged below. Recover it from version control
    before relying on this code.
    """
    # Split on sentence-ending punctuation; drop empty fragments.
    sentences = re.split(r'[.!?]+', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    words = text.split()
    # Clamp denominators to 1 so the densities below are safe on empty text.
    n_chars = max(len(text), 1)
    n_words = max(len(words), 1)
    n_sentences = max(len(sentences), 1)
    # Basic stats
    avg_word_len = np.mean([len(w) for w in words]) if words else 0
    avg_sent_len = n_words / n_sentences
    # Punctuation densities (counts normalized by character count)
    n_commas = text.count(",") / n_chars
    n_semicolons = text.count(";") / n_chars
    n_colons = text.count(":") / n_chars
    n_exclaim = text.count("!") / n_chars
    n_question = text.count("?") / n_chars
    n_ellipsis = text.count("...") / n_chars
    # Em-dash and double-hyphen treated as one feature.
    n_dash = (text.count("—") + text.count("--")) / n_chars
    # Markdown elements (counts normalized per sentence)
    n_headers = len(re.findall(r'^#{1,6}\s', text, re.MULTILINE)) / n_sentences
    n_bold = len(re.findall(r'\*\*.*?\*\*', text)) / n_sentences
    # NOTE(review): the line below is CORRUPTED — an HTML-stripping pass
    # swallowed everything between the '<' of a regex lookbehind
    # (presumably r'(?<!\*)\*[^*]+\*(?!\*)' for italics) and a '>' much
    # later in the file, deleting the rest of this method, the
    # FeatureExtractor class header/__init__, and the `def fit_transform`
    # line. Kept verbatim; do not "fix" blindly — restore from VCS.
    n_italic = len(re.findall(r'(? blocks for TF-IDF so n-grams learn style, not CoT
texts_no_cot = [strip_cot(t) for t in texts]
t0 = time.time()
word_features = self.word_tfidf.fit_transform(texts_no_cot)
print(f" word tfidf: {word_features.shape[1]} features ({time.time()-t0:.1f}s)")
t0 = time.time()
char_features = self.char_tfidf.fit_transform(texts_no_cot)
print(f" char tfidf: {char_features.shape[1]} features ({time.time()-t0:.1f}s)")
# Stylometric uses original text (has_think, think_ratio still work)
t0 = time.time()
stylo_features = self.stylo.fit_transform(texts)
print(f" stylometric: {stylo_features.shape[1]} features ({time.time()-t0:.1f}s)")
combined = hstack([word_features, char_features, stylo_features])
combined = self.scaler.fit_transform(combined)
print(f" Combined feature matrix: {combined.shape}")
return combined
def transform(self, texts):
    """Transform texts into feature matrix (after fitting).

    TF-IDF branches see CoT-stripped text; the stylometric branch sees
    the originals. The three sparse blocks are concatenated column-wise
    and passed through the already-fitted scaler.
    """
    cleaned = [strip_cot(doc) for doc in texts]
    blocks = [
        self.word_tfidf.transform(cleaned),
        self.char_tfidf.transform(cleaned),
        self.stylo.transform(texts),
    ]
    return self.scaler.transform(hstack(blocks))