File size: 4,629 Bytes
4e9b744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d379dd9
 
 
 
 
 
4e9b744
d379dd9
 
4e9b744
d379dd9
 
 
 
 
 
 
 
 
 
 
4e9b744
d379dd9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4e9b744
d379dd9
 
4e9b744
 
d379dd9
4e9b744
 
 
 
 
 
 
d379dd9
4e9b744
 
 
 
 
 
 
 
 
 
 
d379dd9
4e9b744
d379dd9
 
 
 
 
 
 
4e9b744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d379dd9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import logging
import math
import numpy as np
from textblob import TextBlob
import textstat
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import torch
import re

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)

class NLPService:
    """Singleton service computing simple text metrics.

    Every call to ``NLPService()`` returns the same instance, so the
    DistilGPT2 model and tokenizer used for perplexity are loaded at most
    once per instance and only when perplexity is first requested.
    """

    _instance = None
    # Filled in together by ``_load_model``; both stay ``None`` until the
    # first successful load.
    _perplex_model = None
    _perplex_tokenizer = None

    # Maximum number of characters of input scored for perplexity. The same
    # number is also passed to the tokenizer as its truncation limit (there
    # it counts tokens, not characters).
    MAX_PERPLEXITY_CHARS = 50000

    # Number of tokens the scoring window advances on each step.
    PERPLEXITY_STRIDE = 512

    def __new__(cls):
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def _load_model(self):
        """Lazy load the model to avoid huge startup time.

        The model and tokenizer are loaded into locals first and only
        published on ``self`` once both succeeded, so a failure while
        loading the second one cannot leave a half-initialised service
        that skips reloading on the next call.

        Raises:
            Exception: whatever ``from_pretrained`` raised; it is logged
                and re-raised unchanged.
        """
        if self._perplex_model is not None and self._perplex_tokenizer is not None:
            return

        logger.info("Loading NLP models (DistilGPT2)...")
        model_id = 'distilgpt2'
        try:
            model = GPT2LMHeadModel.from_pretrained(model_id)
            tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
        except Exception as e:
            logger.exception("Failed to load NLP models: %s", e)
            raise

        self._perplex_model = model
        self._perplex_tokenizer = tokenizer
        logger.info("NLP models loaded successfully.")

    def calculate_perplexity(self, text: str) -> float:
        """
        Calculate perplexity of the text using a small GPT-2 model.
        Lower perplexity = more likely to be generated by AI.

        The text is scored with a sliding window: each window is at most
        ``config.n_positions`` tokens wide and advances by
        ``PERPLEXITY_STRIDE`` tokens; tokens already scored by a previous
        window are masked out (label ``-100``) and serve only as context.
        Window losses are combined weighted by the number of tokens each
        window actually scored, so a short final window does not count as
        much as a full one.

        Args:
            text: Text to score. Only the first ``MAX_PERPLEXITY_CHARS``
                characters are used.

        Returns:
            The perplexity rounded to 2 decimals, or ``0.0`` when the text
            is empty, has fewer than 10 non-padding characters, or yields
            fewer than two tokens (nothing can be predicted).

        Raises:
            Exception: propagated from ``_load_model`` if the model cannot
                be loaded.
        """
        if not text or len(text.strip()) < 10:
            return 0.0

        text = text[:self.MAX_PERPLEXITY_CHARS]

        self._load_model()
        encodings = self._perplex_tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.MAX_PERPLEXITY_CHARS,
        )
        token_ids = encodings.input_ids
        seq_len = token_ids.size(1)

        # With a single token there is no next-token prediction to score and
        # the model loss would be NaN.
        if seq_len < 2:
            return 0.0

        window = self._perplex_model.config.n_positions
        stride = self.PERPLEXITY_STRIDE

        nll_sum = 0.0
        scored_tokens = 0
        prev_end_loc = 0

        for begin_loc in range(0, seq_len, stride):
            end_loc = min(begin_loc + window, seq_len)
            # Tokens new to this window; may differ from ``stride`` on the
            # first and last iteration.
            trg_len = end_loc - prev_end_loc

            input_ids = token_ids[:, begin_loc:end_loc]
            target_ids = input_ids.clone()
            # Everything before the last ``trg_len`` tokens is context only.
            target_ids[:, :-trg_len] = -100

            with torch.no_grad():
                outputs = self._perplex_model(input_ids, labels=target_ids)

            # The model shifts labels left by one internally, so the label at
            # position 0 never contributes to the (mean) loss.
            n_scored = int((target_ids[:, 1:] != -100).sum())
            if n_scored:
                nll_sum += float(outputs.loss) * n_scored
                scored_tokens += n_scored

            prev_end_loc = end_loc
            if end_loc == seq_len:
                break

        if scored_tokens == 0:
            return 0.0

        return round(math.exp(nll_sum / scored_tokens), 2)

    def analyze_sentiment(self, text: str) -> dict:
        """Returns Polarity (-1 to 1) and Subjectivity (0 to 1).

        Args:
            text: Text to analyse.

        Returns:
            A dict with keys ``"polarity"`` and ``"subjectivity"``, both
            rounded to 2 decimals. Empty or whitespace-only text yields
            ``0.0`` for both without invoking TextBlob.
        """
        if not text or not text.strip():
            return {"polarity": 0.0, "subjectivity": 0.0}

        sentiment = TextBlob(text).sentiment
        return {
            "polarity": round(sentiment.polarity, 2),
            "subjectivity": round(sentiment.subjectivity, 2),
        }

    def calculate_lexical_diversity(self, text: str) -> float:
        """Type-Token Ratio (TTR). Higher = richer vocabulary.

        Args:
            text: Text to analyse; words are ``\\w+`` runs of the
                lower-cased text.

        Returns:
            Unique words divided by total words, rounded to 3 decimals, or
            ``0.0`` when the text is empty or contains no words.
        """
        if not text:
            return 0.0

        words = re.findall(r'\w+', text.lower())
        if not words:
            return 0.0

        return round(len(set(words)) / len(words), 3)

    def calculate_burstiness(self, text: str) -> float:
        """Variation in sentence length. proxy for AI detection.

        Computed as the standard deviation of the per-sentence word counts
        divided by their mean.

        Args:
            text: Text to analyse.

        Returns:
            The ratio rounded to 3 decimals, or ``0.0`` when the text is
            empty, has fewer than two sentences, has a mean sentence length
            of zero, or when TextBlob/NLTK tokenization fails.
        """
        if not text or not text.strip():
            return 0.0

        blob = TextBlob(text)
        # Guarded use of TextBlob tokenization (needs the NLTK ``punkt_tab``
        # data). ``sentence.words`` tokenizes lazily as well, so it is kept
        # inside the same guard as ``blob.sentences``.
        try:
            lengths = [len(sentence.words) for sentence in blob.sentences]
        except Exception as e:
            logger.error("TextBlob/NLTK error: %s", e)
            return 0.0

        if len(lengths) < 2:
            return 0.0

        mean = np.mean(lengths)
        if mean == 0:
            return 0.0

        return round(float(np.std(lengths) / mean), 3)

    def compute_all_metrics(self, text: str) -> dict:
        """Run every metric on ``text`` and collect the results.

        Returns:
            A dict with keys ``"perplexity"``, ``"sentiment"``,
            ``"lexical_diversity"``, ``"burstiness"`` and ``"readability"``
            (the latter from ``textstat.flesch_reading_ease``).
        """
        return {
            "perplexity": self.calculate_perplexity(text),
            "sentiment": self.analyze_sentiment(text),
            "lexical_diversity": self.calculate_lexical_diversity(text),
            "burstiness": self.calculate_burstiness(text),
            "readability": textstat.flesch_reading_ease(text)
        }