"""
AIFinder Feature Extraction
TF-IDF pipeline + stylometric features.
Supports CoT-aware and no-CoT text preprocessing.
"""
import re
import numpy as np
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.base import BaseEstimator, TransformerMixin
from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS
# --- Text Preprocessing ---
def strip_cot(text):
    """Return *text* with every <think>...</think> block removed and whitespace trimmed."""
    without_cot = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    return without_cot.strip()
def has_cot(text):
    """Return True when *text* contains at least one closed <think>...</think> block."""
    match = re.search(r"<think>.*?</think>", text, flags=re.DOTALL)
    return match is not None
def cot_ratio(text):
    """Fraction of *text*'s characters that sit inside <think>...</think> blocks.

    Returns 0.0 for empty input or when no closed think block is present.
    """
    if not text:
        return 0.0
    spans = [m.group(1) for m in re.finditer(r"<think>(.*?)</think>", text, flags=re.DOTALL)]
    if not spans:
        return 0.0
    return sum(map(len, spans)) / len(text)
# --- Stylometric Features ---
class StylometricFeatures(BaseEstimator, TransformerMixin):
    """Hand-crafted stylometric features: punctuation densities, markdown
    structure, vocabulary richness, and chain-of-thought markers."""

    def fit(self, X, y=None):
        """Stateless transformer — nothing to learn."""
        return self

    def transform(self, X):
        """Map an iterable of texts to a sparse float32 feature matrix (one row per text)."""
        rows = [self._extract(doc) for doc in X]
        return csr_matrix(np.array(rows, dtype=np.float32))

    def _extract(self, text):
        """Compute the fixed-order stylometric feature vector for one document."""
        sents = [chunk.strip() for chunk in re.split(r'[.!?]+', text)]
        sents = [s for s in sents if s]
        tokens = text.split()
        # Clamp denominators to 1 so the densities below never divide by zero.
        char_n = max(len(text), 1)
        word_n = max(len(tokens), 1)
        sent_n = max(len(sents), 1)

        # Length statistics.
        mean_word_len = np.mean([len(t) for t in tokens]) if tokens else 0
        mean_sent_len = word_n / sent_n

        # Punctuation frequency per character.
        def char_density(mark):
            return text.count(mark) / char_n

        comma_d = char_density(",")
        semicolon_d = char_density(";")
        colon_d = char_density(":")
        exclaim_d = char_density("!")
        question_d = char_density("?")
        ellipsis_d = char_density("...")
        dash_d = (text.count("—") + text.count("--")) / char_n

        # Markdown structure, normalized per sentence.
        def per_sentence(pattern, flags=0):
            return len(re.findall(pattern, text, flags)) / sent_n

        header_r = per_sentence(r'^#{1,6}\s', re.MULTILINE)
        bold_r = per_sentence(r'\*\*.*?\*\*')
        italic_r = per_sentence(r'(?<!\*)\*(?!\*).*?(?<!\*)\*(?!\*)')
        fence_r = per_sentence(r'```')  # counts fence markers, so one block contributes 2
        inline_code_r = per_sentence(r'`[^`]+`')
        bullet_r = per_sentence(r'^[\s]*[-*+]\s', re.MULTILINE)
        numbered_r = per_sentence(r'^\s*\d+[.)]\s', re.MULTILINE)

        # Vocabulary richness: type-token ratio on lowercased tokens.
        ttr = len({t.lower() for t in tokens}) / word_n

        # Paragraph structure (blank-line separated).
        para_n = len([p for p in text.split("\n\n") if p.strip()])
        mean_para_len = word_n / max(para_n, 1)

        # Characteristic AI-assistant phrasing.
        sycophantic_open = 1.0 if re.match(r'^(Certainly|Of course|Sure|Absolutely|Great question)', text, re.IGNORECASE) else 0.0
        disclaimer = 1.0 if re.search(r"(I'm an AI|as an AI|language model|I cannot|I can't help)", text, re.IGNORECASE) else 0.0

        # Chain-of-thought markers (zero when text has no <think> blocks).
        think_flag = 1.0 if has_cot(text) else 0.0
        think_frac = cot_ratio(text)

        return [
            mean_word_len, mean_sent_len,
            comma_d, semicolon_d, colon_d, exclaim_d, question_d,
            ellipsis_d, dash_d,
            header_r, bold_r, italic_r, fence_r, inline_code_r,
            bullet_r, numbered_r,
            ttr, para_n, mean_para_len,
            sycophantic_open, disclaimer,
            think_flag, think_frac,
            char_n, word_n,
        ]
# --- Feature Pipeline ---
class FeaturePipeline:
    """Fits word/char TF-IDF plus stylometric features and scales the stacked matrix."""

    def __init__(self):
        # Sub-extractors; TF-IDF hyperparameters come from the project config.
        self.word_tfidf = TfidfVectorizer(**TFIDF_WORD_PARAMS)
        self.char_tfidf = TfidfVectorizer(**TFIDF_CHAR_PARAMS)
        self.stylo = StylometricFeatures()
        self.scaler = MaxAbsScaler()

    def fit_transform(self, texts):
        """Fit every sub-extractor on *texts* and return the scaled combined matrix."""
        import time
        print(f" Input: {len(texts)} texts")
        # TF-IDF sees CoT-stripped text so n-grams capture answer style, not CoT.
        stripped = [strip_cot(t) for t in texts]

        def timed_fit(label, fit_fn, data):
            # Fit one extractor and report its width and wall time.
            start = time.time()
            matrix = fit_fn(data)
            print(f" {label}: {matrix.shape[1]} features ({time.time()-start:.1f}s)")
            return matrix

        word_m = timed_fit("word tfidf", self.word_tfidf.fit_transform, stripped)
        char_m = timed_fit("char tfidf", self.char_tfidf.fit_transform, stripped)
        # Stylometric features get the original text so CoT markers survive.
        stylo_m = timed_fit("stylometric", self.stylo.fit_transform, texts)

        scaled = self.scaler.fit_transform(hstack([word_m, char_m, stylo_m]))
        print(f" Combined feature matrix: {scaled.shape}")
        return scaled

    def transform(self, texts):
        """Transform *texts* with the already-fitted extractors and scaler."""
        stripped = [strip_cot(t) for t in texts]
        blocks = [
            self.word_tfidf.transform(stripped),
            self.char_tfidf.transform(stripped),
            self.stylo.transform(texts),
        ]
        return self.scaler.transform(hstack(blocks))