File size: 6,005 Bytes
f52234e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""
AIFinder Feature Extraction
TF-IDF pipeline + stylometric features.
Supports CoT-aware and no-CoT text preprocessing.
"""

import re
import numpy as np
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.base import BaseEstimator, TransformerMixin

from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS


# --- Text Preprocessing ---

def strip_cot(text):
    """Drop every <think>...</think> block and trim surrounding whitespace."""
    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    return cleaned.strip()


def has_cot(text):
    """Return True when the text contains at least one complete <think>...</think> block."""
    return re.search(r"<think>.*?</think>", text, flags=re.DOTALL) is not None


def cot_ratio(text):
    """Fraction of the text's characters that sit inside <think>...</think> blocks.

    Returns 0.0 when there are no complete blocks (which also covers empty
    input, since an empty string cannot contain a match).
    """
    segments = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    if not segments:
        return 0.0
    inside = sum(len(seg) for seg in segments)
    return inside / len(text)


# --- Stylometric Features ---

class StylometricFeatures(BaseEstimator, TransformerMixin):
    """Hand-crafted stylometric features as an sklearn-compatible transformer.

    Each text maps to a fixed-order 25-column row: length statistics,
    punctuation densities, markdown-structure rates, vocabulary richness,
    paragraph shape, AI-tell patterns, and CoT presence/ratio.  The column
    order is relied on downstream, so it must never change.
    """

    def fit(self, X, y=None):
        """Stateless transformer — nothing to learn."""
        return self

    def transform(self, X):
        """Return a csr_matrix with one feature row per input text."""
        rows = [self._extract(text) for text in X]
        return csr_matrix(np.array(rows, dtype=np.float32))

    def _extract(self, text):
        """Compute the fixed-order feature row for a single text."""
        words = text.split()
        pieces = [p.strip() for p in re.split(r'[.!?]+', text) if p.strip()]

        # Denominators floored at 1 so empty text never divides by zero.
        n_chars = max(len(text), 1)
        n_words = max(len(words), 1)
        n_sentences = max(len(pieces), 1)

        def per_char(token):
            # Occurrence count of a literal token, normalized by character count.
            return text.count(token) / n_chars

        def per_sentence(pattern, flags=0):
            # Regex match count normalized by sentence count.
            return len(re.findall(pattern, text, flags)) / n_sentences

        # Basic length statistics.
        avg_word_len = np.mean([len(w) for w in words]) if words else 0
        avg_sent_len = n_words / n_sentences

        # Punctuation densities (per character).
        n_commas = per_char(",")
        n_semicolons = per_char(";")
        n_colons = per_char(":")
        n_exclaim = per_char("!")
        n_question = per_char("?")
        n_ellipsis = per_char("...")
        n_dash = (text.count("—") + text.count("--")) / n_chars

        # Markdown structure rates (per sentence).
        n_headers = per_sentence(r'^#{1,6}\s', re.MULTILINE)
        n_bold = per_sentence(r'\*\*.*?\*\*')
        n_italic = per_sentence(r'(?<!\*)\*(?!\*).*?(?<!\*)\*(?!\*)')
        n_code_blocks = per_sentence(r'```')
        n_inline_code = per_sentence(r'`[^`]+`')
        n_bullet = per_sentence(r'^[\s]*[-*+]\s', re.MULTILINE)
        n_numbered = per_sentence(r'^\s*\d+[.)]\s', re.MULTILINE)

        # Vocabulary richness: type-token ratio over lowercased tokens.
        ttr = len({w.lower() for w in words}) / n_words

        # Paragraph structure: blank-line-delimited, non-empty chunks only.
        blocks = [p for p in text.split("\n\n") if p.strip()]
        n_paragraphs = len(blocks)
        avg_para_len = n_words / max(n_paragraphs, 1)

        # AI-tell patterns: sycophantic opener / explicit AI disclaimer.
        starts_with_certainly = 1.0 if re.match(r'^(Certainly|Of course|Sure|Absolutely|Great question)', text, re.IGNORECASE) else 0.0
        has_disclaimer = 1.0 if re.search(r"(I'm an AI|as an AI|language model|I cannot|I can't help)", text, re.IGNORECASE) else 0.0

        # CoT indicators — simply zero when the text has no <think> blocks.
        has_think = 1.0 if has_cot(text) else 0.0
        think_ratio = cot_ratio(text)

        return [
            avg_word_len, avg_sent_len,
            n_commas, n_semicolons, n_colons, n_exclaim, n_question,
            n_ellipsis, n_dash,
            n_headers, n_bold, n_italic, n_code_blocks, n_inline_code,
            n_bullet, n_numbered,
            ttr, n_paragraphs, avg_para_len,
            starts_with_certainly, has_disclaimer,
            has_think, think_ratio,
            n_chars, n_words,
        ]


# --- Feature Pipeline ---

class FeaturePipeline:
    """Combined TF-IDF + stylometric feature pipeline.

    Two TF-IDF views (word and char n-grams) are fit on CoT-stripped text so
    the n-grams model surface style rather than <think> content, while the
    stylometric extractor sees the raw text so its CoT indicators survive.
    All three blocks are horizontally stacked and MaxAbs-scaled.
    """

    def __init__(self):
        # Sub-extractors; TF-IDF hyperparameters come from config.
        self.word_tfidf = TfidfVectorizer(**TFIDF_WORD_PARAMS)
        self.char_tfidf = TfidfVectorizer(**TFIDF_CHAR_PARAMS)
        self.stylo = StylometricFeatures()
        self.scaler = MaxAbsScaler()

    def fit_transform(self, texts):
        """Fit every sub-extractor on `texts` and return the scaled feature matrix."""
        import time
        print(f"    Input: {len(texts)} texts")

        # TF-IDF must not memorize CoT vocabulary, so strip <think> blocks first.
        stripped = [strip_cot(t) for t in texts]

        start = time.time()
        word_mat = self.word_tfidf.fit_transform(stripped)
        print(f"    word tfidf: {word_mat.shape[1]} features ({time.time()-start:.1f}s)")

        start = time.time()
        char_mat = self.char_tfidf.fit_transform(stripped)
        print(f"    char tfidf: {char_mat.shape[1]} features ({time.time()-start:.1f}s)")

        # Stylometric features read the untouched text (has_think/think_ratio need it).
        start = time.time()
        stylo_mat = self.stylo.fit_transform(texts)
        print(f"    stylometric: {stylo_mat.shape[1]} features ({time.time()-start:.1f}s)")

        stacked = hstack([word_mat, char_mat, stylo_mat])
        stacked = self.scaler.fit_transform(stacked)
        print(f"    Combined feature matrix: {stacked.shape}")
        return stacked

    def transform(self, texts):
        """Project new texts through the already-fitted extractors and scaler."""
        stripped = [strip_cot(t) for t in texts]
        parts = [
            self.word_tfidf.transform(stripped),
            self.char_tfidf.transform(stripped),
            self.stylo.transform(texts),
        ]
        return self.scaler.transform(hstack(parts))