import spacy
import numpy as np
from typing import Dict, Any, List

try:
    import textstat
except ImportError:
    # textstat is optional; fall back to neutral readability scores below.
    textstat = None


class StructuralEngine:
    """
    Forensic Structural Engine v10.5 (Human-First Architecture)

    Analyzes dependency-tree entropy, clause complexity, and structural cadence.
    AI text tends toward balanced, 'clean' parse trees (high uniformity), while
    human text is 'lopsided' and irregular (structural strength).
    """

    def __init__(self):
        try:
            self.nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer", "textcat"])
            self.enabled = True
        except Exception as e:
            print(f"[StructuralEngine] Error loading spaCy: {e}")
            self.enabled = False

    def analyze(self, text: str) -> Dict[str, Any]:
        if not self.enabled:
            return {"structural_strength": 0.5, "high_uniformity": 0.5,
                    "structural_variation": 0.5, "details": {"error": "spaCy not loaded"}}

        doc = self.nlp(text)
        sentences = list(doc.sents)
        if len(sentences) < 2:
            return {"structural_strength": 0.5, "high_uniformity": 0.5,
                    "structural_variation": 0.5,
                    "details": {"warning": "Too few sentences for structural analysis"}}

        # Per-sentence raw measurements.
        tree_depths = []        # max dependency-tree depth per sentence
        clause_ratios = []      # subordinate clauses per token
        branching_factors = []  # mean children per branching token
        punct_counts = []       # punctuation tokens per sentence
        lengths = []            # sentence length in tokens

        for sent in sentences:
            depths = self._get_token_depths(sent.root)
            tree_depths.append(max(depths) if depths else 0)
            sub_clauses = sum(1 for token in sent if token.dep_ in ("advcl", "relcl", "ccomp", "xcomp"))
            clause_ratios.append(sub_clauses / (len(sent) + 1e-9))
            child_counts = [len(list(token.children)) for token in sent]
            branches = [c for c in child_counts if c > 0]
            branching_factors.append(np.mean(branches) if branches else 0)
            punct_counts.append(sum(1 for token in sent if token.is_punct))
            lengths.append(len(sent))

        # --- NEW HUMAN-CENTRIC METRICS ---
        # A. Depth variance: humans mix shallow and deeply nested parse trees.
        depth_var = float(np.var(tree_depths))
        depth_var_norm = float(np.clip(depth_var / 5.0, 0.0, 1.0))  # higher is more human

        # B. Punctuation randomness (coefficient of variation).
        punct_cv = float(np.std(punct_counts) / (np.mean(punct_counts) + 1e-9))
        punct_cv_norm = float(np.clip(punct_cv / 1.5, 0.0, 1.0))  # higher is more human

        # C. Cadence (sentence-length variation).
        len_cv = float(np.std(lengths) / (np.mean(lengths) + 1e-9))
        len_cv_norm = float(np.clip(len_cv / 0.8, 0.0, 1.0))  # higher is more human

        # D. Readability complexity (via textstat, when available).
        if textstat is not None:
            try:
                flesch = textstat.flesch_reading_ease(text)
                readability_complexity = float(np.clip(1.0 - (flesch / 100.0), 0.0, 1.0))
            except Exception:
                readability_complexity = 0.5
        else:
            readability_complexity = 0.5

        # E. Type-token ratio (lexical diversity).
        words = [t.text.lower() for t in doc if not t.is_punct]
        if words:
            ttr = len(set(words)) / len(words)
            ttr_norm = float(np.clip(ttr / 0.8, 0.0, 1.0))  # higher is more human/diverse
        else:
            ttr = 0.5
            ttr_norm = 0.5

        # F. POS entropy: AI typically has lower part-of-speech entropy
        # (a more predictable grammatical structure).
        pos_counts: Dict[str, int] = {}
        for t in doc:
            pos_counts[t.pos_] = pos_counts.get(t.pos_, 0) + 1
        pos_total = sum(pos_counts.values())
        pos_probs = [c / pos_total for c in pos_counts.values()]
        pos_entropy = float(-sum(p * np.log2(p) for p in pos_probs if p > 0))
        pos_entropy_norm = float(np.clip((pos_entropy - 2.0) / 1.5, 0.0, 1.0))

        # G. Structural entropy (uniformity of branching).
        flat_branches = [b for b in branching_factors if b > 0]
        if flat_branches:
            hist, _ = np.histogram(flat_branches, bins=5, range=(0, 5))
            probs = hist / (hist.sum() + 1e-9)
            entropy = -sum(p * np.log2(p + 1e-9) for p in probs)
            # AI typically lands below ~1.0 bits, human text above ~1.5,
            # so a higher entropy_norm means strictly organized (AI-like).
            entropy_norm = float(np.clip((1.5 - entropy) / 1.0, 0.0, 1.0))
        else:
            entropy_norm = 0.5

        # Final aggregates. The v16.5 weighting focuses on cadence, entropy,
        # and diversity; entropy and depth variance carry the most signal.
        structural_strength = (
            depth_var_norm * 0.25 + punct_cv_norm * 0.2 + len_cv_norm * 0.25
            + ttr_norm * 0.2 + pos_entropy_norm * 0.1
        )
        high_uniformity = (
            (1.0 - len_cv_norm) * 0.3 + (1.0 - punct_cv_norm) * 0.2
            + entropy_norm * 0.3 + (1.0 - ttr_norm) * 0.2
        )
        structural_variation = (depth_var_norm + len_cv_norm + ttr_norm) / 3.0

        return {
            "structural_strength": round(float(structural_strength), 4),
            "high_uniformity": round(float(high_uniformity), 4),
            "structural_variation": round(float(structural_variation), 4),
            "details": {
                "avg_depth": round(float(np.mean(tree_depths)), 2),
                "depth_variance": round(depth_var, 3),
                "avg_clause_ratio": round(float(np.mean(clause_ratios)), 3),
                "sentence_cadence_cv": round(len_cv, 3),
                "punctuation_randomness": round(punct_cv, 3),
                "ttr": round(ttr, 3),
                "pos_entropy": round(pos_entropy, 3),
                "readability_complexity": round(readability_complexity, 3)
            }
        }

    def _get_token_depths(self, token, depth: int = 0) -> List[int]:
        """Recursively collect the depth of every token in a dependency subtree."""
        depths = [depth]
        for child in token.children:
            depths.extend(self._get_token_depths(child, depth + 1))
        return depths
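

# A minimal usage sketch (illustrative, not part of the original engine):
# running this module directly scores a short sample passage. It assumes the
# `en_core_web_sm` model is installed (`python -m spacy download en_core_web_sm`);
# the sample text and the keys printed below are examples, not project fixtures.
if __name__ == "__main__":
    engine = StructuralEngine()
    sample = (
        "The rain came sideways, fast. Nobody on the platform moved, though "
        "the 8:14 was already ten minutes late and the departure board had "
        "given up pretending otherwise."
    )
    result = engine.analyze(sample)
    print("structural_strength:", result["structural_strength"])
    print("high_uniformity:", result["high_uniformity"])
    print("details:", result["details"])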