# AIFinder / features.py
# NOTE(review): the lines below were Hugging Face upload-page residue
# ("CompactAI's picture / Upload 13 files / f52234e verified") — kept as a
# comment so the module parses as valid Python.
"""
AIFinder Feature Extraction
TF-IDF pipeline + stylometric features.
Supports CoT-aware and no-CoT text preprocessing.
"""
import re
import numpy as np
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.base import BaseEstimator, TransformerMixin
from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS
# --- Text Preprocessing ---
def strip_cot(text):
    """Return *text* with every complete <think>...</think> block removed.

    DOTALL lets blocks span newlines; leftover surrounding whitespace is
    trimmed from the result.
    """
    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    return cleaned.strip()
def has_cot(text):
    """Return True when *text* contains at least one closed <think>...</think> block."""
    match = re.search(r"<think>.*?</think>", text, flags=re.DOTALL)
    return match is not None
def cot_ratio(text):
    """Fraction of *text*'s characters that lie inside <think>...</think> blocks.

    Only the inner content (not the tags themselves) counts toward the
    numerator; the denominator is the full text length. Returns 0.0 for
    empty text or text without any closed think block.
    """
    if not text:
        return 0.0
    blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    if not blocks:
        return 0.0
    return sum(map(len, blocks)) / len(text)
# --- Stylometric Features ---
class StylometricFeatures(BaseEstimator, TransformerMixin):
    """Per-document stylometric feature extractor (sklearn-compatible).

    Produces a fixed-length row of 25 signals per text: punctuation
    densities, markdown usage, vocabulary richness, paragraph structure,
    AI-typical phrasing flags, and chain-of-thought markers. Stateless:
    ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        """No-op; present only for sklearn pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a sparse float32 matrix with one feature row per text in X."""
        rows = [self._extract(doc) for doc in X]
        return csr_matrix(np.array(rows, dtype=np.float32))

    def _extract(self, text):
        """Compute the 25-element feature row for a single document."""
        sents = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
        tokens = text.split()
        # Floor every denominator at 1 so empty input yields zeros, not NaN/inf.
        char_count = max(len(text), 1)
        word_count = max(len(tokens), 1)
        sent_count = max(len(sents), 1)

        # Length statistics.
        mean_word_len = np.mean([len(t) for t in tokens]) if tokens else 0
        mean_sent_len = word_count / sent_count

        # Punctuation density (per character).
        comma_d = text.count(",") / char_count
        semi_d = text.count(";") / char_count
        colon_d = text.count(":") / char_count
        exclaim_d = text.count("!") / char_count
        question_d = text.count("?") / char_count
        ellipsis_d = text.count("...") / char_count
        dash_d = (text.count("—") + text.count("--")) / char_count

        # Markdown markers (per sentence). Note: fence_d counts ``` markers,
        # so one code block contributes two.
        header_d = len(re.findall(r'^#{1,6}\s', text, re.MULTILINE)) / sent_count
        bold_d = len(re.findall(r'\*\*.*?\*\*', text)) / sent_count
        italic_d = len(re.findall(r'(?<!\*)\*(?!\*).*?(?<!\*)\*(?!\*)', text)) / sent_count
        fence_d = len(re.findall(r'```', text)) / sent_count
        inline_code_d = len(re.findall(r'`[^`]+`', text)) / sent_count
        bullet_d = len(re.findall(r'^[\s]*[-*+]\s', text, re.MULTILINE)) / sent_count
        numbered_d = len(re.findall(r'^\s*\d+[.)]\s', text, re.MULTILINE)) / sent_count

        # Vocabulary richness: type-token ratio over case-folded tokens.
        type_token_ratio = len({t.lower() for t in tokens}) / word_count

        # Paragraph structure (blank-line separated).
        para_count = len([p for p in text.split("\n\n") if p.strip()])
        mean_para_len = word_count / max(para_count, 1)

        # AI-typical phrasing flags.
        opener_flag = 1.0 if re.match(r'^(Certainly|Of course|Sure|Absolutely|Great question)', text, re.IGNORECASE) else 0.0
        disclaimer_flag = 1.0 if re.search(r"(I'm an AI|as an AI|language model|I cannot|I can't help)", text, re.IGNORECASE) else 0.0

        # Chain-of-thought markers (zero when text carries no <think> block).
        think_flag = 1.0 if has_cot(text) else 0.0
        think_frac = cot_ratio(text)

        # Feature order is part of the contract — downstream models index by
        # position. Do not reorder.
        return [
            mean_word_len, mean_sent_len,
            comma_d, semi_d, colon_d, exclaim_d, question_d,
            ellipsis_d, dash_d,
            header_d, bold_d, italic_d, fence_d, inline_code_d,
            bullet_d, numbered_d,
            type_token_ratio, para_count, mean_para_len,
            opener_flag, disclaimer_flag,
            think_flag, think_frac,
            char_count, word_count,
        ]
# --- Feature Pipeline ---
class FeaturePipeline:
    """Word + char TF-IDF combined with stylometric features, jointly scaled.

    <think> blocks are stripped before TF-IDF fitting so the n-grams learn
    writing style rather than chain-of-thought content, while the
    stylometric extractor sees the raw text so its CoT indicators
    (has_think / think_ratio) remain meaningful.
    """

    def __init__(self):
        self.word_tfidf = TfidfVectorizer(**TFIDF_WORD_PARAMS)
        self.char_tfidf = TfidfVectorizer(**TFIDF_CHAR_PARAMS)
        self.stylo = StylometricFeatures()
        self.scaler = MaxAbsScaler()

    def fit_transform(self, texts):
        """Fit every sub-extractor on *texts* and return the scaled sparse matrix."""
        import time
        print(f" Input: {len(texts)} texts")
        # CoT content is removed for TF-IDF only.
        stripped = [strip_cot(t) for t in texts]

        started = time.time()
        word_features = self.word_tfidf.fit_transform(stripped)
        print(f" word tfidf: {word_features.shape[1]} features ({time.time()-started:.1f}s)")

        started = time.time()
        char_features = self.char_tfidf.fit_transform(stripped)
        print(f" char tfidf: {char_features.shape[1]} features ({time.time()-started:.1f}s)")

        # Stylometric features run on the untouched text.
        started = time.time()
        stylo_features = self.stylo.fit_transform(texts)
        print(f" stylometric: {stylo_features.shape[1]} features ({time.time()-started:.1f}s)")

        combined = self.scaler.fit_transform(
            hstack([word_features, char_features, stylo_features])
        )
        print(f" Combined feature matrix: {combined.shape}")
        return combined

    def transform(self, texts):
        """Project new *texts* through the already-fitted extractors and scaler."""
        stripped = [strip_cot(t) for t in texts]
        parts = [
            self.word_tfidf.transform(stripped),
            self.char_tfidf.transform(stripped),
            self.stylo.transform(texts),
        ]
        return self.scaler.transform(hstack(parts))