import spacy
import numpy as np
from typing import Dict, Any, List

try:
    import textstat
except ImportError:
    textstat = None  # Optional dependency; readability falls back to a neutral score below.

class StructuralEngine:
    """
    Forensic Structural Engine v10.5 (Human-First Architecture)
    Analyzes Dependency Tree Entropy, Clause Complexity, and Structural Cadence.
    AI focuses on balanced 'clean' trees (high uniformity); Human text is 'lopsided' and irregular (structural strength).
    """
    def __init__(self):
        try:
            self.nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer", "textcat"])
            self.enabled = True
        except Exception as e:
            print(f"[StructuralEngine] Error loading spaCy: {e}")
            self.enabled = False

    def analyze(self, text: str) -> Dict[str, Any]:
        if not self.enabled:
            return {"structural_strength": 0.5, "high_uniformity": 0.5, "structural_variation": 0.5, "details": {"error": "spaCy not loaded"}}

        doc = self.nlp(text)
        sentences = list(doc.sents)
        if len(sentences) < 2:
            return {"structural_strength": 0.5, "high_uniformity": 0.5, "structural_variation": 0.5, "details": {"warning": "Too few sentences for structural analysis"}}

        tree_depths = []
        clause_ratios = []
        branching_factors = []
        punct_counts = []
        lengths = []
        
        for sent in sentences:
            depths = self._get_token_depths(sent.root)
            tree_depths.append(max(depths) if depths else 0)
            
            sub_clauses = sum(1 for token in sent if token.dep_ in ("advcl", "relcl", "ccomp", "xcomp"))
            clause_ratios.append(sub_clauses / (len(sent) + 1e-9))
            
            # Count children per token once, then keep only branching (non-leaf) tokens.
            child_counts = [len(list(token.children)) for token in sent]
            branches = [c for c in child_counts if c > 0]
            branching_factors.append(np.mean(branches) if branches else 0)
            
            punct_counts.append(sum(1 for token in sent if token.is_punct))
            lengths.append(len(sent))

        # --- NEW HUMAN-CENTRIC METRICS ---
        
        # A. Depth Variance
        depth_var = float(np.var(tree_depths))
        depth_var_norm = float(np.clip(depth_var / 5.0, 0.0, 1.0)) # Higher is more human
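        # Illustrative check (made-up numbers): depths [2, 6, 3] give a population
        # variance of ~2.89, which normalizes to ~0.58 after dividing by 5.0.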
        
        # B. Punctuation Randomness
        punct_cv = float(np.std(punct_counts) / (np.mean(punct_counts) + 1e-9))
        punct_cv_norm = float(np.clip(punct_cv / 1.5, 0.0, 1.0)) # Higher is more human
        
        # C. Cadence (Sentence Length Variation)
        len_cv = float(np.std(lengths) / (np.mean(lengths) + 1e-9))
        len_cv_norm = float(np.clip(len_cv / 0.8, 0.0, 1.0)) # Higher is more human
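        # Illustrative check (made-up numbers): lengths [5, 22, 9] give a mean of 12
        # and a std of ~7.26, so CV ≈ 0.60, which normalizes to ~0.76 after / 0.8.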
        
        # D. Readability Entropy (via textstat, when available)
        if textstat is not None:
            try:
                flesch = textstat.flesch_reading_ease(text)
                # Lower Flesch score = harder text = higher complexity.
                readability_complexity = float(np.clip(1.0 - (flesch / 100.0), 0.0, 1.0))
            except Exception:
                readability_complexity = 0.5
        else:
            readability_complexity = 0.5

        # E. Type-Token Ratio (Lexical Diversity)
        words = [t.text.lower() for t in doc if not t.is_punct]
        if words:
            ttr = len(set(words)) / (len(words) + 1e-9)
            ttr_norm = float(np.clip(ttr / 0.8, 0.0, 1.0)) # Higher is more human/diverse
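            # Illustrative check: 40 unique words out of 60 gives TTR ≈ 0.667,
            # which normalizes to ~0.83 after dividing by the 0.8 ceiling.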
        else:
            ttr = 0.5
            ttr_norm = 0.5

        # F. POS Entropy
        pos_counts = {}
        for t in doc:
            pos_counts[t.pos_] = pos_counts.get(t.pos_, 0) + 1
        pos_total = sum(pos_counts.values())
        pos_probs = [c / pos_total for c in pos_counts.values()]
        pos_entropy = -sum(p * np.log2(p) for p in pos_probs if p > 0)
        # Normalize: AI typically has lower POS entropy (more predictable structure)
        pos_entropy_norm = float(np.clip((pos_entropy - 2.0) / 1.5, 0.0, 1.0))
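        # Illustrative check (made-up counts): {"NOUN": 4, "VERB": 3, "ADJ": 2, "ADP": 1}
        # yields probs [0.4, 0.3, 0.2, 0.1] and entropy ≈ 1.85 bits, which maps to
        # (1.85 - 2.0) / 1.5 and clips to 0.0 (AI-leaning).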

        # G. Structural Entropy (Uniformity of branching)
        flat_branches = [b for b in branching_factors if b > 0]
        if flat_branches:
            hist, _ = np.histogram(flat_branches, bins=5, range=(0, 5))
            probs = hist / (sum(hist) + 1e-9)
            entropy = -sum(p * np.log2(p + 1e-9) for p in probs)
            # Normalize: AI typically has entropy < 1.0; Human > 1.5
            entropy_norm = float(np.clip((1.5 - entropy) / 1.0, 0.0, 1.0))  # Higher means strictly organized (AI)
        else:
            entropy_norm = 0.5
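        # Illustrative check: if every sentence's mean branching lands in one histogram
        # bin, entropy ≈ 0 and entropy_norm clips to 1.0 (maximally uniform, AI-like).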

        # Final Aggregates
        # v10.5 weighting: emphasize cadence, entropy, and lexical diversity.
        # Each weight set sums to 1.0, so the aggregates stay in [0, 1].
        structural_strength = (depth_var_norm * 0.25) + (punct_cv_norm * 0.2) + (len_cv_norm * 0.25) + (ttr_norm * 0.2) + (pos_entropy_norm * 0.1)
        high_uniformity = (1.0 - len_cv_norm) * 0.3 + (1.0 - punct_cv_norm) * 0.2 + (entropy_norm * 0.3) + (1.0 - ttr_norm) * 0.2
        structural_variation = (depth_var_norm + len_cv_norm + ttr_norm) / 3.0

        # Depth variance and sentence cadence carry the largest weights in structural_strength.
        return {
            "structural_strength": round(float(structural_strength), 4),
            "high_uniformity": round(float(high_uniformity), 4),
            "structural_variation": round(float(structural_variation), 4),
            "details": {
                "avg_depth": round(float(np.mean(tree_depths)), 2),
                "depth_variance": round(depth_var, 3),
                "sentence_cadence_cv": round(len_cv, 3),
                "punctuation_randomness": round(punct_cv, 3),
                "ttr": round(ttr, 3),
                "pos_entropy": round(pos_entropy, 3)
            }
        }

    def _get_token_depths(self, token, depth: int = 0) -> List[int]:
        """Recursively collect the depth of every token in the subtree rooted at `token`."""
        depths = [depth]
        for child in token.children:
            depths.extend(self._get_token_depths(child, depth + 1))
        return depths
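
# Minimal usage sketch (illustrative, not part of the module's API): assumes the
# en_core_web_sm model is installed (`python -m spacy download en_core_web_sm`).
# The sample text is made up and only exists to exercise analyze().
if __name__ == "__main__":
    engine = StructuralEngine()
    sample = (
        "The storm broke before dawn. Rain hammered the tin roof for hours, "
        "and the river, already swollen from a week of bad weather, finally "
        "slipped its banks. Nobody slept."
    )
    result = engine.analyze(sample)
    print(result["structural_strength"], result["high_uniformity"])
    print(result["details"])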