"""
AIFinder Feature Extraction
TF-IDF pipeline + stylometric features.
Supports CoT-aware and no-CoT text preprocessing.
"""
| |
|
| | import re |
| | import numpy as np |
| | from scipy.sparse import hstack, csr_matrix |
| | from sklearn.feature_extraction.text import TfidfVectorizer |
| | from sklearn.preprocessing import MaxAbsScaler |
| | from sklearn.base import BaseEstimator, TransformerMixin |
| |
|
| | from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS |
| |
|
| |
|
| | |
| |
|
def strip_cot(text):
    """Drop every <think>...</think> block from *text* and trim whitespace."""
    pattern = re.compile(r"<think>.*?</think>", re.DOTALL)
    return pattern.sub("", text).strip()
| |
|
| |
|
def has_cot(text):
    """Return True when *text* contains at least one complete <think>...</think> block."""
    match = re.search(r"<think>.*?</think>", text, flags=re.DOTALL)
    return match is not None
| |
|
| |
|
def cot_ratio(text):
    """Fraction of *text*'s characters that sit inside <think>...</think> blocks.

    Returns 0.0 for empty text or when no complete block is present.
    """
    if not text:
        return 0.0
    inner = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    if not inner:
        return 0.0
    return sum(map(len, inner)) / len(text)
| |
|
| |
|
| | |
| |
|
class StylometricFeatures(BaseEstimator, TransformerMixin):
    """Extract hand-crafted stylometric features from raw text.

    Emits one row per input text with ``N_FEATURES`` float32 columns:
    length statistics, per-character punctuation rates, per-sentence
    markdown-structure rates, lexical diversity, paragraph shape,
    assistant-phrasing flags, and chain-of-thought markers.
    """

    # Number of columns produced per text — keep in sync with _extract's
    # return list.
    N_FEATURES = 25

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn, returns self."""
        return self

    def transform(self, X):
        """Return a ``(len(X), N_FEATURES)`` float32 CSR matrix for texts *X*.

        Fix: for empty *X*, ``np.array([])`` is 1-D with shape ``(0,)`` and
        ``csr_matrix`` would promote it to shape ``(1, 0)`` — one phantom row
        and zero columns — which breaks downstream ``hstack``. Return an
        explicit ``(0, N_FEATURES)`` matrix instead.
        """
        rows = [self._extract(text) for text in X]
        if not rows:
            return csr_matrix(np.empty((0, self.N_FEATURES), dtype=np.float32))
        return csr_matrix(np.array(rows, dtype=np.float32))

    def _extract(self, text):
        """Compute the feature vector (plain list of floats) for one text."""
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        words = text.split()

        # Clamp denominators to 1 so rates on empty text are 0, not NaN.
        n_chars = max(len(text), 1)
        n_words = max(len(words), 1)
        n_sentences = max(len(sentences), 1)

        # Length statistics.
        avg_word_len = np.mean([len(w) for w in words]) if words else 0
        avg_sent_len = n_words / n_sentences

        # Punctuation rates, normalized per character.
        n_commas = text.count(",") / n_chars
        n_semicolons = text.count(";") / n_chars
        n_colons = text.count(":") / n_chars
        n_exclaim = text.count("!") / n_chars
        n_question = text.count("?") / n_chars
        n_ellipsis = text.count("...") / n_chars
        n_dash = (text.count("—") + text.count("--")) / n_chars

        # Markdown-structure rates, normalized per sentence.
        # NOTE: ``` fences are counted individually, so one fenced code
        # block contributes 2 to the raw count — a consistent signal even
        # if not a literal block count.
        n_headers = len(re.findall(r'^#{1,6}\s', text, re.MULTILINE)) / n_sentences
        n_bold = len(re.findall(r'\*\*.*?\*\*', text)) / n_sentences
        n_italic = len(re.findall(r'(?<!\*)\*(?!\*).*?(?<!\*)\*(?!\*)', text)) / n_sentences
        n_code_blocks = len(re.findall(r'```', text)) / n_sentences
        n_inline_code = len(re.findall(r'`[^`]+`', text)) / n_sentences
        n_bullet = len(re.findall(r'^[\s]*[-*+]\s', text, re.MULTILINE)) / n_sentences
        n_numbered = len(re.findall(r'^\s*\d+[.)]\s', text, re.MULTILINE)) / n_sentences

        # Lexical diversity (type-token ratio, case-insensitive).
        unique_words = len(set(w.lower() for w in words))
        ttr = unique_words / n_words

        # Paragraph shape: blank-line-separated chunks.
        paragraphs = text.split("\n\n")
        n_paragraphs = len([p for p in paragraphs if p.strip()])
        avg_para_len = n_words / max(n_paragraphs, 1)

        # Assistant-phrasing flags (typical AI openers / disclaimers).
        starts_with_certainly = 1.0 if re.match(r'^(Certainly|Of course|Sure|Absolutely|Great question)', text, re.IGNORECASE) else 0.0
        has_disclaimer = 1.0 if re.search(r"(I'm an AI|as an AI|language model|I cannot|I can't help)", text, re.IGNORECASE) else 0.0

        # Chain-of-thought markers (module-level helpers).
        has_think = 1.0 if has_cot(text) else 0.0
        think_ratio = cot_ratio(text)

        return [
            avg_word_len, avg_sent_len,
            n_commas, n_semicolons, n_colons, n_exclaim, n_question,
            n_ellipsis, n_dash,
            n_headers, n_bold, n_italic, n_code_blocks, n_inline_code,
            n_bullet, n_numbered,
            ttr, n_paragraphs, avg_para_len,
            starts_with_certainly, has_disclaimer,
            has_think, think_ratio,
            n_chars, n_words,
        ]
| |
|
| |
|
| | |
| |
|
class FeaturePipeline:
    """Combined TF-IDF + stylometric feature pipeline.

    Word and char TF-IDF run on CoT-stripped text; stylometric features
    keep the raw text so chain-of-thought markers remain visible.
    """

    def __init__(self):
        self.word_tfidf = TfidfVectorizer(**TFIDF_WORD_PARAMS)
        self.char_tfidf = TfidfVectorizer(**TFIDF_CHAR_PARAMS)
        self.stylo = StylometricFeatures()
        self.scaler = MaxAbsScaler()

    def fit_transform(self, texts):
        """Fit every sub-extractor on *texts* and return the scaled matrix."""
        import time

        print(f" Input: {len(texts)} texts")

        # TF-IDF operates on responses with chain-of-thought removed.
        stripped = [strip_cot(t) for t in texts]

        started = time.time()
        word_features = self.word_tfidf.fit_transform(stripped)
        print(f" word tfidf: {word_features.shape[1]} features ({time.time()-started:.1f}s)")

        started = time.time()
        char_features = self.char_tfidf.fit_transform(stripped)
        print(f" char tfidf: {char_features.shape[1]} features ({time.time()-started:.1f}s)")

        # Stylometrics see the raw text, CoT blocks included.
        started = time.time()
        stylo_features = self.stylo.fit_transform(texts)
        print(f" stylometric: {stylo_features.shape[1]} features ({time.time()-started:.1f}s)")

        combined = self.scaler.fit_transform(
            hstack([word_features, char_features, stylo_features])
        )
        print(f" Combined feature matrix: {combined.shape}")
        return combined

    def transform(self, texts):
        """Project *texts* through the already-fitted extractors and scaler."""
        stripped = [strip_cot(t) for t in texts]
        parts = [
            self.word_tfidf.transform(stripped),
            self.char_tfidf.transform(stripped),
            self.stylo.transform(texts),
        ]
        return self.scaler.transform(hstack(parts))
| |
|