# fakeshield-api/app/models/structural_engine.py
import spacy
import numpy as np
from typing import Dict, Any

try:
    import textstat
except ImportError:
    textstat = None  # optional dependency; readability scoring degrades gracefully below

class StructuralEngine:
    """
    Forensic Structural Engine v10.5 (Human-First Architecture)

    Analyzes dependency-tree entropy, clause complexity, and structural cadence.
    AI-generated text tends toward balanced, "clean" parse trees (high uniformity),
    while human text is lopsided and irregular, which this engine scores as
    structural strength.
    """

    def __init__(self):
        try:
            # ner/lemmatizer/textcat are not needed for dependency parsing;
            # disabling them speeds up loading and inference.
            self.nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer", "textcat"])
            self.enabled = True
        except Exception as e:
            print(f"[StructuralEngine] Error loading spaCy: {e}")
            self.enabled = False
    def analyze(self, text: str) -> Dict[str, Any]:
        if not self.enabled:
            return {"structural_strength": 0.5, "high_uniformity": 0.5,
                    "structural_variation": 0.5, "details": {"error": "spaCy not loaded"}}

        doc = self.nlp(text)
        sentences = list(doc.sents)
        if len(sentences) < 2:
            return {"structural_strength": 0.5, "high_uniformity": 0.5,
                    "structural_variation": 0.5,
                    "details": {"warning": "Too few sentences for structural analysis"}}

        tree_depths = []
        clause_ratios = []
        branching_factors = []
        punct_counts = []
        lengths = []

        for sent in sentences:
            depths = self._get_token_depths(sent.root)
            tree_depths.append(max(depths) if depths else 0)
            # Subordinate clauses: adverbial, relative, and clausal complements.
            # (clause_ratios is collected for clause complexity but is not yet
            # folded into the aggregate scores below.)
            sub_clauses = sum(1 for token in sent if token.dep_ in ("advcl", "relcl", "ccomp", "xcomp"))
            clause_ratios.append(sub_clauses / (len(sent) + 1e-9))
            branches = [len(list(token.children)) for token in sent if len(list(token.children)) > 0]
            branching_factors.append(np.mean(branches) if branches else 0)
            punct_counts.append(sum(1 for token in sent if token.is_punct))
            lengths.append(len(sent))
        # --- NEW HUMAN-CENTRIC METRICS ---

        # A. Depth Variance
        depth_var = float(np.var(tree_depths))
        depth_var_norm = float(np.clip(depth_var / 5.0, 0.0, 1.0))  # higher is more human

        # B. Punctuation Randomness (coefficient of variation of per-sentence counts)
        punct_cv = float(np.std(punct_counts) / (np.mean(punct_counts) + 1e-9))
        punct_cv_norm = float(np.clip(punct_cv / 1.5, 0.0, 1.0))  # higher is more human

        # C. Cadence (Sentence Length Variation)
        len_cv = float(np.std(lengths) / (np.mean(lengths) + 1e-9))
        len_cv_norm = float(np.clip(len_cv / 0.8, 0.0, 1.0))  # higher is more human
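        # Worked example (illustrative numbers, not real data): sentence lengths
        # [8, 23, 11] give mean = 14 and population std ≈ 6.48, so len_cv ≈ 0.46
        # and len_cv_norm ≈ 0.46 / 0.8 ≈ 0.58. Perfectly even lengths such as
        # [14, 14, 14] give len_cv = 0.0, i.e. AI-like cadence.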
        # D. Readability Complexity (via textstat, when the optional dependency
        # is present; not yet folded into the aggregates below)
        if textstat is not None:
            try:
                flesch = textstat.flesch_reading_ease(text)
                readability_complexity = float(np.clip(1.0 - (flesch / 100.0), 0.0, 1.0))
            except Exception:
                readability_complexity = 0.5
        else:
            readability_complexity = 0.5
        # E. Type-Token Ratio (Lexical Diversity)
        words = [t.text.lower() for t in doc if not t.is_punct]
        if words:
            ttr = len(set(words)) / (len(words) + 1e-9)
            ttr_norm = float(np.clip(ttr / 0.8, 0.0, 1.0))  # higher is more human/diverse
        else:
            ttr = 0.5
            ttr_norm = 0.5
        # F. POS Entropy: Shannon entropy (bits) of the part-of-speech distribution
        pos_counts = {}
        for t in doc:
            pos_counts[t.pos_] = pos_counts.get(t.pos_, 0) + 1
        pos_total = sum(pos_counts.values())
        pos_probs = [c / pos_total for c in pos_counts.values()]
        pos_entropy = -sum(p * np.log2(p) for p in pos_probs if p > 0)
        # Normalize: AI typically has lower POS entropy (more predictable structure)
        pos_entropy_norm = float(np.clip((pos_entropy - 2.0) / 1.5, 0.0, 1.0))
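        # Worked example (illustrative): four POS tags in equal proportion give
        # entropy -4 * 0.25 * log2(0.25) = 2.0 bits, which normalizes to 0.0;
        # eight equally frequent tags give 3.0 bits, i.e. (3.0 - 2.0) / 1.5 ≈ 0.67,
        # a more varied, human-leaning score.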
        # G. Structural Entropy (uniformity of per-sentence branching factors)
        flat_branches = [b for b in branching_factors if b > 0]
        if flat_branches:
            hist, _ = np.histogram(flat_branches, bins=5, range=(0, 5))
            probs = hist / (sum(hist) + 1e-9)
            entropy = -sum(p * np.log2(p + 1e-9) for p in probs)
            # Normalize: AI typically has entropy < 1.0; human > 1.5.
            entropy_norm = float(np.clip((1.5 - entropy) / 1.0, 0.0, 1.0))  # higher = more rigidly organized (AI-like)
        else:
            entropy_norm = 0.5
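        # Worked example (illustrative): if every sentence's mean branching factor
        # lands in the same histogram bin, entropy ≈ 0 and entropy_norm clips to
        # 1.0 (maximally uniform, AI-like); branching spread evenly across all
        # five bins gives entropy = log2(5) ≈ 2.32, so entropy_norm clips to 0.0
        # (irregular, human-like).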
        # --- Final Aggregates (v16.5 weighting: cadence, entropy, and diversity) ---
        # Each weight vector sums to 1.0, so all three scores stay in [0, 1].
        # Depth variance and cadence carry the largest weights in structural_strength;
        # branching entropy and cadence uniformity dominate high_uniformity.
        structural_strength = (depth_var_norm * 0.25) + (punct_cv_norm * 0.2) + (len_cv_norm * 0.25) + (ttr_norm * 0.2) + (pos_entropy_norm * 0.1)
        high_uniformity = (1.0 - len_cv_norm) * 0.3 + (1.0 - punct_cv_norm) * 0.2 + (entropy_norm * 0.3) + (1.0 - ttr_norm) * 0.2
        structural_variation = (depth_var_norm + len_cv_norm + ttr_norm) / 3.0
        return {
            "structural_strength": round(float(structural_strength), 4),
            "high_uniformity": round(float(high_uniformity), 4),
            "structural_variation": round(float(structural_variation), 4),
            "details": {
                "avg_depth": round(float(np.mean(tree_depths)), 2),
                "depth_variance": round(depth_var, 3),
                "sentence_cadence_cv": round(len_cv, 3),
                "punctuation_randomness": round(punct_cv, 3),
                "ttr": round(ttr, 3),
                "pos_entropy": round(float(pos_entropy), 3)  # cast from np.float64 keeps the value JSON-serializable
            }
        }
    def _get_token_depths(self, token, depth=0):
        # Recursively collect the depth of every token in the subtree rooted at
        # `token`; max() of the result is the sentence's parse-tree depth.
        depths = [depth]
        for child in token.children:
            depths.extend(self._get_token_depths(child, depth + 1))
        return depths
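
# Minimal usage sketch (assumes en_core_web_sm has been installed, e.g. via
# `python -m spacy download en_core_web_sm`; the sample text is illustrative):
if __name__ == "__main__":
    engine = StructuralEngine()
    sample = (
        "The storm hit before anyone expected it. Power lines came down across "
        "half the county, and crews, already stretched thin, worked through the "
        "night. Schools closed. By morning, the damage was clear."
    )
    result = engine.analyze(sample)
    print(result["structural_strength"], result["details"])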