import spacy
import numpy as np
from typing import Dict, Any, List

try:
    import textstat
except ImportError:
    # textstat is optional; fall back to neutral readability scores below.
    textstat = None


class StructuralEngine:
    """
    Forensic Structural Engine v10.5 (Human-First Architecture)

    Analyzes dependency-tree entropy, clause complexity, and structural cadence.
    AI text tends toward balanced, 'clean' parse trees (high uniformity), while
    human text is 'lopsided' and irregular (structural strength).
    """

    def __init__(self):
        try:
            self.nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer", "textcat"])
            self.enabled = True
        except Exception as e:
            print(f"[StructuralEngine] Error loading spaCy: {e}")
            self.enabled = False

    def analyze(self, text: str) -> Dict[str, Any]:
        if not self.enabled:
            return {"structural_strength": 0.5, "high_uniformity": 0.5,
                    "structural_variation": 0.5, "details": {"error": "spaCy not loaded"}}

        doc = self.nlp(text)
        sentences = list(doc.sents)
        if len(sentences) < 2:
            return {"structural_strength": 0.5, "high_uniformity": 0.5,
                    "structural_variation": 0.5,
                    "details": {"warning": "Too few sentences for structural analysis"}}

        # Per-sentence raw measurements.
        tree_depths = []        # max dependency-tree depth per sentence
        clause_ratios = []      # subordinate clauses per token
        branching_factors = []  # mean children per branching token
        punct_counts = []       # punctuation tokens per sentence
        lengths = []            # sentence length in tokens

        for sent in sentences:
            depths = self._get_token_depths(sent.root)
            tree_depths.append(max(depths) if depths else 0)
            sub_clauses = sum(1 for token in sent if token.dep_ in ("advcl", "relcl", "ccomp", "xcomp"))
            clause_ratios.append(sub_clauses / (len(sent) + 1e-9))
            child_counts = [len(list(token.children)) for token in sent]
            branches = [c for c in child_counts if c > 0]
            branching_factors.append(np.mean(branches) if branches else 0)
            punct_counts.append(sum(1 for token in sent if token.is_punct))
            lengths.append(len(sent))

        # --- NEW HUMAN-CENTRIC METRICS ---
        # A. Depth variance: humans mix shallow and deeply nested parse trees.
        depth_var = float(np.var(tree_depths))
        depth_var_norm = float(np.clip(depth_var / 5.0, 0.0, 1.0))  # higher is more human

        # B. Punctuation randomness (coefficient of variation).
        punct_cv = float(np.std(punct_counts) / (np.mean(punct_counts) + 1e-9))
        punct_cv_norm = float(np.clip(punct_cv / 1.5, 0.0, 1.0))  # higher is more human

        # C. Cadence (sentence-length variation).
        len_cv = float(np.std(lengths) / (np.mean(lengths) + 1e-9))
        len_cv_norm = float(np.clip(len_cv / 0.8, 0.0, 1.0))  # higher is more human

        # D. Readability complexity (via textstat, when available).
        if textstat is not None:
            try:
                flesch = textstat.flesch_reading_ease(text)
                readability_complexity = float(np.clip(1.0 - (flesch / 100.0), 0.0, 1.0))
            except Exception:
                readability_complexity = 0.5
        else:
            readability_complexity = 0.5

        # E. Type-token ratio (lexical diversity).
        words = [t.text.lower() for t in doc if not t.is_punct]
        if words:
            ttr = len(set(words)) / len(words)
            ttr_norm = float(np.clip(ttr / 0.8, 0.0, 1.0))  # higher is more human/diverse
        else:
            ttr = 0.5
            ttr_norm = 0.5

        # F. POS entropy: AI typically has lower part-of-speech entropy
        # (a more predictable grammatical structure).
        pos_counts: Dict[str, int] = {}
        for t in doc:
            pos_counts[t.pos_] = pos_counts.get(t.pos_, 0) + 1
        pos_total = sum(pos_counts.values())
        pos_probs = [c / pos_total for c in pos_counts.values()]
        pos_entropy = float(-sum(p * np.log2(p) for p in pos_probs if p > 0))
        pos_entropy_norm = float(np.clip((pos_entropy - 2.0) / 1.5, 0.0, 1.0))

        # G. Structural entropy (uniformity of branching).
        flat_branches = [b for b in branching_factors if b > 0]
        if flat_branches:
            hist, _ = np.histogram(flat_branches, bins=5, range=(0, 5))
            probs = hist / (hist.sum() + 1e-9)
            entropy = -sum(p * np.log2(p + 1e-9) for p in probs)
            # AI typically lands below ~1.0 bits, human text above ~1.5,
            # so a higher entropy_norm means strictly organized (AI-like).
            entropy_norm = float(np.clip((1.5 - entropy) / 1.0, 0.0, 1.0))
        else:
            entropy_norm = 0.5

        # Final aggregates. The v16.5 weighting focuses on cadence, entropy,
        # and diversity; entropy and depth variance carry the most signal.
        structural_strength = (
            depth_var_norm * 0.25 + punct_cv_norm * 0.2 + len_cv_norm * 0.25
            + ttr_norm * 0.2 + pos_entropy_norm * 0.1
        )
        high_uniformity = (
            (1.0 - len_cv_norm) * 0.3 + (1.0 - punct_cv_norm) * 0.2
            + entropy_norm * 0.3 + (1.0 - ttr_norm) * 0.2
        )
        structural_variation = (depth_var_norm + len_cv_norm + ttr_norm) / 3.0

        return {
            "structural_strength": round(float(structural_strength), 4),
            "high_uniformity": round(float(high_uniformity), 4),
            "structural_variation": round(float(structural_variation), 4),
            "details": {
                "avg_depth": round(float(np.mean(tree_depths)), 2),
                "depth_variance": round(depth_var, 3),
                "avg_clause_ratio": round(float(np.mean(clause_ratios)), 3),
                "sentence_cadence_cv": round(len_cv, 3),
                "punctuation_randomness": round(punct_cv, 3),
                "ttr": round(ttr, 3),
                "pos_entropy": round(pos_entropy, 3),
                "readability_complexity": round(readability_complexity, 3)
            }
        }

    def _get_token_depths(self, token, depth: int = 0) -> List[int]:
        """Recursively collect the depth of every token in a dependency subtree."""
        depths = [depth]
        for child in token.children:
            depths.extend(self._get_token_depths(child, depth + 1))
        return depths
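

# A minimal usage sketch (illustrative, not part of the original engine):
# running this module directly scores a short sample passage. It assumes the
# `en_core_web_sm` model is installed (`python -m spacy download en_core_web_sm`);
# the sample text and the keys printed below are examples, not project fixtures.
if __name__ == "__main__":
    engine = StructuralEngine()
    sample = (
        "The rain came sideways, fast. Nobody on the platform moved, though "
        "the 8:14 was already ten minutes late and the departure board had "
        "given up pretending otherwise."
    )
    result = engine.analyze(sample)
    print("structural_strength:", result["structural_strength"])
    print("high_uniformity:", result["high_uniformity"])
    print("details:", result["details"])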