Spaces:

Arko007
/

security

Running

security/backend/app/services/stylometry.py

GitHub Actions

Deploy backend from GitHub 43a4c2cb381254b3c2fd54acd891b54847bb81d1

2f073d3 about 1 month ago

5.15 kB

	"""
	Stylometry analysis service.
	Lightweight CPU-only feature extraction: character n-grams, function-word
	frequencies, punctuation patterns, sentence-length spread, and readability heuristics.
	No external models required.
	"""
	import re
	import math
	from collections import Counter
	from typing import Dict

	# Lowercase English function words; tokens are matched against this set when
	# computing the function-word ratio.
	FUNCTION_WORDS = set(
	    """
	    the be to of and a in that have i
	    it for not on with he as you do at
	    this but his by from they we say her she
	    or an will my one all would there their
	    what so up out if about who get which go
	    me when make can like time no just him
	    know take people into year your good some
	    """.split()
	)


	def _char_ngrams(text: str, n: int = 3) -> Dict[str, int]:
	"""Extract character n-gram frequency distribution."""
	ngrams = Counter()
	text_lower = text.lower()
	for i in range(len(text_lower) - n + 1):
	ngrams[text_lower[i:i + n]] += 1
	return dict(ngrams)


	def _function_word_freq(text: str) -> float:
	"""Compute ratio of function words to total words."""
	words = text.lower().split()
	if not words:
	return 0.0
	fw_count = sum(1 for w in words if w.strip(".,!?;:\"'") in FUNCTION_WORDS)
	return fw_count / len(words)


	def _punctuation_pattern(text: str) -> Dict[str, float]:
	"""Extract punctuation density and diversity metrics."""
	if not text:
	return {"density": 0.0, "diversity": 0.0}
	puncts = re.findall(r"[^\w\s]", text)
	density = len(puncts) / len(text)
	diversity = len(set(puncts)) / max(len(puncts), 1)
	return {"density": round(density, 4), "diversity": round(diversity, 4)}


	def _readability_score(text: str) -> float:
	"""Simple Automated Readability Index approximation."""
	sentences = max(len(re.split(r"[.!?]+", text)), 1)
	words = text.split()
	word_count = max(len(words), 1)
	char_count = sum(len(w) for w in words)
	ari = 4.71 * (char_count / word_count) + 0.5 * (word_count / sentences) - 21.43
	return max(0, min(20, ari))


	def _sentence_length_variance(text: str) -> float:
	"""Compute variance in sentence lengths (words per sentence)."""
	sentences = re.split(r"[.!?]+", text)
	lengths = [len(s.split()) for s in sentences if s.strip()]
	if len(lengths) < 2:
	return 0.0
	mean = sum(lengths) / len(lengths)
	variance = sum((l - mean) ** 2 for l in lengths) / len(lengths)
	return round(math.sqrt(variance), 4)


	def _trigram_entropy_anomaly(text: str) -> float:
	    """Map the normalized Shannon entropy of character trigrams onto 0-1."""
	    ngrams = _char_ngrams(text, 3)
	    if not ngrams:
	        return 0.0
	    total = sum(ngrams.values())
	    probs = [count / total for count in ngrams.values()]
	    entropy = -sum(p * math.log2(p) for p in probs if p > 0)
	    # Largest entropy possible for this many distinct trigrams (uniform case).
	    max_entropy = math.log2(len(ngrams))
	    norm_entropy = entropy / max_entropy if max_entropy > 0 else 0.0
	    # Only the upper half of the normalized range contributes, rescaled to 0-1.
	    return max(0.0, norm_entropy - 0.5) * 2


	def compute_stylometry_score(text: str) -> float:
	    """Compute a heuristic stylometry anomaly score between 0 and 1.

	    Higher scores indicate writing patterns that deviate more from the
	    hand-tuned baselines below, which the heuristic treats as a hint of
	    AI-generated or coordinated text. Five per-feature anomaly values in
	    ``[0, 1]`` are combined with fixed weights:

	    * function-word ratio, relative distance from 0.47 (weight 0.25)
	    * punctuation density, relative distance from 0.06 (weight 0.15)
	    * sentence-length spread, lower spread scores higher (weight 0.25)
	    * readability, closeness of the ARI to 10 (weight 0.15)
	    * character-trigram entropy, normalized (weight 0.20)

	    Args:
	        text: Text to score.

	    Returns:
	        The weighted score rounded to four decimals. ``0.0`` is returned for
	        empty input and for input with fewer than 20 characters once
	        surrounding whitespace is ignored.
	    """
	    # Measure the length without padding: otherwise a blank string of 20+
	    # characters passes the guard and is scored as strongly anomalous.
	    if not text or len(text.strip()) < 20:
	        return 0.0

	    # Feature 1: relative distance of the function-word ratio from 0.47.
	    fw_ratio = _function_word_freq(text)
	    fw_anomaly = min(1.0, abs(fw_ratio - 0.47) / 0.47)

	    # Feature 2: relative distance of punctuation density from 0.06.
	    # Text with no punctuation at all gets a fixed mid-level value.
	    density = _punctuation_pattern(text)["density"]
	    punct_anomaly = min(1.0, abs(density - 0.06) / 0.06) if density > 0 else 0.5

	    # Feature 3: low spread in sentence length counts as anomalous; a spread
	    # of 10 words or more contributes nothing.
	    spread = _sentence_length_variance(text)
	    slv_anomaly = max(0.0, 1.0 - spread / 10.0)

	    # Feature 4: readability in the middle of the 0-20 range scores highest.
	    ari = _readability_score(text)
	    ari_anomaly = max(0.0, 1.0 - abs(ari - 10) / 10)

	    # Feature 5: normalized character-trigram entropy.
	    entropy_anomaly = _trigram_entropy_anomaly(text)

	    weighted_features = (
	        (fw_anomaly, 0.25),
	        (punct_anomaly, 0.15),
	        (slv_anomaly, 0.25),
	        (ari_anomaly, 0.15),
	        (entropy_anomaly, 0.20),
	    )
	    score = sum(value * weight for value, weight in weighted_features)
	    return round(min(1.0, max(0.0, score)), 4)


	def extract_features(text: str) -> Dict:
	    """Gather every stylometry measurement for ``text`` into one dict.

	    Returns:
	        A dict with the keys ``"function_word_ratio"``, ``"punctuation"``,
	        ``"readability_ari"``, ``"sentence_length_variance"`` and
	        ``"stylometry_score"``, each holding the result of the matching
	        helper in this module.
	    """
	    features: Dict = {}
	    features["function_word_ratio"] = _function_word_freq(text)
	    features["punctuation"] = _punctuation_pattern(text)
	    features["readability_ari"] = _readability_score(text)
	    features["sentence_length_variance"] = _sentence_length_variance(text)
	    features["stylometry_score"] = compute_stylometry_score(text)
	    return features