# interview_agents_api/src/services/nlp_service.py
# Author: quentinL52 — "Initial commit" (4e9b744)
import logging
import math
import numpy as np
from textblob import TextBlob
import textstat
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import torch
import re
logger = logging.getLogger(__name__)
class NLPService:
    """Singleton service computing text metrics used for AI-text detection.

    Metrics:
      - perplexity under DistilGPT2 (lazy-loaded on first use)
      - sentiment polarity/subjectivity (TextBlob)
      - lexical diversity (type-token ratio)
      - burstiness (coefficient of variation of sentence lengths)
      - readability (Flesch reading ease via textstat)
    """

    _instance = None           # singleton instance
    _perplex_model = None      # lazily-loaded GPT2LMHeadModel
    _perplex_tokenizer = None  # lazily-loaded GPT2TokenizerFast

    # Hard cap on input size for perplexity to bound memory and latency.
    MAX_PERPLEXITY_CHARS = 50000

    def __new__(cls):
        # Classic singleton. NOTE(review): not thread-safe; assumed to be
        # first constructed from a single thread — confirm against callers.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def _load_model(self):
        """Lazily load DistilGPT2 so service startup stays fast."""
        if self._perplex_model is None:
            logger.info("Loading NLP models (DistilGPT2)...")
            try:
                model_id = 'distilgpt2'
                self._perplex_model = GPT2LMHeadModel.from_pretrained(model_id)
                self._perplex_tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
                # Explicit eval mode: disables dropout for deterministic scoring.
                self._perplex_model.eval()
                logger.info("NLP models loaded successfully.")
            except Exception:
                # logger.exception records the traceback; bare `raise`
                # preserves the original exception and its traceback.
                logger.exception("Failed to load NLP models")
                raise

    def calculate_perplexity(self, text: str) -> float:
        """Sliding-window perplexity of *text* under DistilGPT2.

        Lower perplexity = more predictable text (AI-like or very standard
        human prose). Returns 0.0 for missing or too-short input.

        Follows the Hugging Face fixed-length-model perplexity recipe:
        each strided window only scores tokens not covered by the previous
        window, and window losses are weighted by their target length so
        the short final window is not over-weighted.
        """
        if not text or len(text.strip()) < 10:
            return 0.0
        # Truncate to avoid memory blow-up on very long inputs.
        if len(text) > self.MAX_PERPLEXITY_CHARS:
            text = text[:self.MAX_PERPLEXITY_CHARS]
        self._load_model()
        encodings = self._perplex_tokenizer(text, return_tensors='pt')
        max_length = self._perplex_model.config.n_positions
        stride = 512
        seq_len = encodings.input_ids.size(1)

        nll_sum = 0.0
        n_target_tokens = 0
        prev_end_loc = 0
        for begin_loc in range(0, seq_len, stride):
            end_loc = min(begin_loc + max_length, seq_len)
            trg_len = end_loc - prev_end_loc  # may differ from stride on last loop
            input_ids = encodings.input_ids[:, begin_loc:end_loc]
            target_ids = input_ids.clone()
            # Mask context-only tokens (-100 is ignored by the CE loss).
            target_ids[:, :-trg_len] = -100
            with torch.no_grad():
                outputs = self._perplex_model(input_ids, labels=target_ids)
                # outputs.loss is the mean NLL over this window's targets;
                # weight by trg_len to recover the window's total NLL.
                nll_sum += float(outputs.loss) * trg_len
                n_target_tokens += trg_len
            prev_end_loc = end_loc
            if end_loc == seq_len:
                break

        if n_target_tokens == 0:
            return 0.0
        return float(math.exp(nll_sum / n_target_tokens))

    def analyze_sentiment(self, text: str) -> dict:
        """Return TextBlob polarity (-1..1) and subjectivity (0..1)."""
        sentiment = TextBlob(text).sentiment
        return {
            "polarity": round(sentiment.polarity, 2),
            "subjectivity": round(sentiment.subjectivity, 2),
        }

    def calculate_lexical_diversity(self, text: str) -> float:
        """Type-Token Ratio: unique words / total words. Higher = richer vocabulary."""
        if not text:
            return 0.0
        words = re.findall(r'\w+', text.lower())
        if not words:
            return 0.0
        return round(len(set(words)) / len(words), 3)

    def calculate_burstiness(self, text: str) -> float:
        """Coefficient of variation of sentence lengths (burstiness proxy).

        AI text tends to be more regular (low value); human text more
        chaotic. Returns 0.0 when there are fewer than two sentences.
        """
        sentences = TextBlob(text).sentences
        if not sentences or len(sentences) < 2:
            return 0.0
        lengths = [len(s.words) for s in sentences]
        mean = np.mean(lengths)
        if mean == 0:
            return 0.0
        return round(float(np.std(lengths) / mean), 3)

    def compute_all_metrics(self, text: str) -> dict:
        """Compute every metric for *text* and return them as one dict."""
        return {
            "perplexity": self.calculate_perplexity(text),
            "sentiment": self.analyze_sentiment(text),
            "lexical_diversity": self.calculate_lexical_diversity(text),
            "burstiness": self.calculate_burstiness(text),
            # Guard: textstat can misbehave on empty/whitespace-only input,
            # and every other metric already handles empties explicitly.
            "readability": textstat.flesch_reading_ease(text) if text and text.strip() else 0.0,
        }