import logging
import math
import numpy as np
from textblob import TextBlob
import textstat
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import torch
import re
logger = logging.getLogger(__name__)
class NLPService:
    """Singleton service computing text metrics used for AI-text detection:
    perplexity (DistilGPT2), sentiment, lexical diversity, burstiness and
    readability.
    """

    _instance = None
    # Lazily-loaded DistilGPT2 model/tokenizer, shared via the singleton.
    _perplex_model = None
    _perplex_tokenizer = None

    # Hard cap on the amount of text fed to the perplexity model.
    # NOTE(review): reused below as the tokenizer's max_length, i.e. a *token*
    # count, while here it bounds *characters* — works as a generous
    # over-estimate, but the two units are conflated; confirm intent.
    MAX_PERPLEXITY_CHARS = 50000

    def __new__(cls):
        """Return the single shared instance (created on first call)."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def _load_model(self):
        """Lazy load the model to avoid huge startup time."""
        if self._perplex_model is None:
            logger.info("Loading NLP models (DistilGPT2)...")
            try:
                model_id = 'distilgpt2'
                self._perplex_model = GPT2LMHeadModel.from_pretrained(model_id)
                self._perplex_tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
                logger.info("NLP models loaded successfully.")
            except Exception as e:
                logger.error(f"Failed to load NLP models: {e}")
                raise  # bare re-raise preserves the original traceback

    def calculate_perplexity(self, text: str) -> float:
        """
        Calculate perplexity of the text using a small GPT-2 model.
        Lower perplexity = more likely to be generated by AI.

        Returns 0.0 for empty/near-empty input or when no window was scored.
        """
        if not text or len(text.strip()) < 10:
            return 0.0
        if len(text) > self.MAX_PERPLEXITY_CHARS:
            text = text[:self.MAX_PERPLEXITY_CHARS]
        self._load_model()
        encodings = self._perplex_tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.MAX_PERPLEXITY_CHARS
        )
        max_length = self._perplex_model.config.n_positions  # model context window
        stride = 512  # window overlap: each scored token keeps some left context
        seq_len = encodings.input_ids.size(1)
        nlls = []
        prev_end_loc = 0
        # Sliding-window perplexity (standard Hugging Face recipe): each window
        # scores only the tokens not already scored by the previous window.
        for begin_loc in range(0, seq_len, stride):
            end_loc = min(begin_loc + max_length, seq_len)
            trg_len = end_loc - prev_end_loc  # tokens newly scored this window
            input_ids = encodings.input_ids[:, begin_loc:end_loc]
            # Extra guard so we never exceed the model window; by construction
            # end_loc - begin_loc <= max_length, so this should never fire.
            if input_ids.size(1) > max_length:
                input_ids = input_ids[:, :max_length]
            target_ids = input_ids.clone()
            # Mask out (-100) context tokens already scored by a prior window.
            target_ids[:, :-trg_len] = -100
            with torch.no_grad():
                outputs = self._perplex_model(input_ids, labels=target_ids)
                neg_log_likelihood = outputs.loss
            nlls.append(neg_log_likelihood)
            prev_end_loc = end_loc
            if end_loc == seq_len:
                break
        if not nlls:
            return 0.0
        ppl = torch.exp(torch.stack(nlls).mean())
        return round(float(ppl), 2)

    def analyze_sentiment(self, text: str) -> dict:
        """Returns Polarity (-1 to 1) and Subjectivity (0 to 1)."""
        blob = TextBlob(text)
        return {
            "polarity": round(blob.sentiment.polarity, 2),
            "subjectivity": round(blob.sentiment.subjectivity, 2)
        }

    def calculate_lexical_diversity(self, text: str) -> float:
        """Type-Token Ratio (TTR). Higher = richer vocabulary."""
        if not text:
            return 0.0
        words = re.findall(r'\w+', text.lower())
        if not words:
            return 0.0
        return round(len(set(words)) / len(words), 3)

    def calculate_burstiness(self, text: str) -> float:
        """Variation in sentence length (std/mean); proxy for AI detection."""
        blob = TextBlob(text)
        # Sentence splitting needs NLTK 'punkt' data; fail soft if it is missing.
        try:
            sentences = blob.sentences
        except Exception as e:
            logger.error(f"TextBlob/NLTK error: {e}")
            return 0.0
        if len(sentences) < 2:  # also covers an empty sentence list
            return 0.0
        lengths = [len(s.words) for s in sentences]
        mean = np.mean(lengths)
        if mean == 0:
            return 0.0
        return round(float(np.std(lengths) / mean), 3)

    def compute_all_metrics(self, text: str) -> dict:
        """Aggregate every metric into a single dict for callers."""
        return {
            "perplexity": self.calculate_perplexity(text),
            "sentiment": self.analyze_sentiment(text),
            "lexical_diversity": self.calculate_lexical_diversity(text),
            "burstiness": self.calculate_burstiness(text),
            # NOTE(review): textstat behavior on empty text is not shown here —
            # verify upstream validation guarantees non-empty input.
            "readability": textstat.flesch_reading_ease(text)
        }