# Source: Hugging Face Space "QuentinL52/interview_agents_api" (status: Running).
# File size: 4,629 bytes.

import logging
import math
import numpy as np
from textblob import TextBlob
import textstat
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import torch
import re

logger = logging.getLogger(__name__)

class NLPService:
    """Singleton service that computes text-quality metrics.

    Provides perplexity (DistilGPT2), sentiment (TextBlob), lexical diversity
    (type-token ratio), burstiness (sentence-length variation) and a
    readability score (textstat). Every instantiation returns the same
    object, so the language model is loaded at most once.
    """

    # The shared instance handed out by __new__.
    _instance = None
    # Language model and tokenizer used for perplexity; loaded on first use.
    _perplex_model = None
    _perplex_tokenizer = None

    # Input cap for perplexity. Used both to truncate the raw text (characters)
    # and as the tokenizer's ``max_length`` (tokens).
    MAX_PERPLEXITY_CHARS = 50000

    def __new__(cls):
        """Return the shared instance, creating it on the first call."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def _load_model(self):
        """Lazily load the DistilGPT2 model and tokenizer.

        Loading is deferred until first needed to avoid a long startup time.
        The model and tokenizer are stored only after *both* loaded, so a
        failure never leaves the service half-initialised and a later call
        retries the full load.

        Raises:
            Exception: whatever ``from_pretrained`` raised, re-raised after
                being logged.
        """
        if self._perplex_model is not None and self._perplex_tokenizer is not None:
            return

        logger.info("Loading NLP models (DistilGPT2)...")
        try:
            model_id = 'distilgpt2'
            model = GPT2LMHeadModel.from_pretrained(model_id)
            tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
        except Exception as e:
            logger.error("Failed to load NLP models: %s", e)
            raise

        self._perplex_model = model
        self._perplex_tokenizer = tokenizer
        logger.info("NLP models loaded successfully.")

    def calculate_perplexity(self, text: str) -> float:
        """Calculate the perplexity of ``text`` with a small GPT-2 model.

        The token sequence is scored with a sliding window (stride 512) no
        wider than the model's context size. Each window only scores the
        tokens not already scored by a previous window, and the final value
        is ``exp`` of the token-weighted mean negative log-likelihood.

        Lower perplexity = more likely to be generated by AI.

        Args:
            text: Text to score. Truncated to ``MAX_PERPLEXITY_CHARS``.

        Returns:
            The perplexity rounded to 2 decimals, or 0.0 when the text is
            empty, shorter than 10 non-blank characters, or yields no token
            that can be scored.
        """
        if not text or len(text.strip()) < 10:
            return 0.0

        if len(text) > self.MAX_PERPLEXITY_CHARS:
            text = text[:self.MAX_PERPLEXITY_CHARS]

        self._load_model()
        encodings = self._perplex_tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.MAX_PERPLEXITY_CHARS,
        )

        max_length = self._perplex_model.config.n_positions
        stride = 512
        seq_len = encodings.input_ids.size(1)

        total_nll = 0.0
        total_tokens = 0
        prev_end_loc = 0

        for begin_loc in range(0, seq_len, stride):
            end_loc = min(begin_loc + max_length, seq_len)
            # Number of tokens in this window not scored by an earlier window.
            trg_len = end_loc - prev_end_loc

            input_ids = encodings.input_ids[:, begin_loc:end_loc]
            target_ids = input_ids.clone()
            # Tokens before the last ``trg_len`` are context only: -100 makes
            # the loss ignore them.
            target_ids[:, :-trg_len] = -100

            # The model shifts labels by one position internally, so the first
            # label never contributes to the loss.
            scored = int((target_ids[:, 1:] != -100).sum())
            if scored > 0:
                with torch.no_grad():
                    outputs = self._perplex_model(input_ids, labels=target_ids)
                # ``outputs.loss`` is a mean over the scored tokens; weight it
                # by their count so windows of different size contribute
                # proportionally to the overall average.
                total_nll += float(outputs.loss) * scored
                total_tokens += scored

            prev_end_loc = end_loc
            if end_loc == seq_len:
                break

        if total_tokens == 0:
            return 0.0

        ppl = torch.exp(torch.tensor(total_nll / total_tokens))
        return round(float(ppl), 2)

    def analyze_sentiment(self, text: str) -> dict:
        """Return polarity (-1 to 1) and subjectivity (0 to 1) of ``text``.

        Empty or missing text yields neutral values (0.0 for both) without
        invoking TextBlob.
        """
        if not text:
            return {"polarity": 0.0, "subjectivity": 0.0}

        sentiment = TextBlob(text).sentiment
        return {
            "polarity": round(sentiment.polarity, 2),
            "subjectivity": round(sentiment.subjectivity, 2),
        }

    def calculate_lexical_diversity(self, text: str) -> float:
        """Type-Token Ratio (TTR). Higher = richer vocabulary.

        Words are runs of ``\\w`` characters, compared case-insensitively.
        Returns 0.0 when the text contains no word; otherwise the ratio of
        unique words to total words, rounded to 3 decimals.
        """
        if not text:
            return 0.0

        words = re.findall(r'\w+', text.lower())
        if not words:
            return 0.0

        return round(len(set(words)) / len(words), 3)

    def calculate_burstiness(self, text: str) -> float:
        """Variation in sentence length, used as a proxy for AI detection.

        Computed as the coefficient of variation (population standard
        deviation divided by mean) of the per-sentence word counts.

        Returns:
            The ratio rounded to 3 decimals, or 0.0 when the text is empty,
            has fewer than two sentences, has a mean sentence length of zero,
            or sentence/word tokenization fails.
        """
        if not text:
            return 0.0

        # Sentence and word tokenization rely on NLTK data (punkt_tab); a
        # missing resource is treated as "no measurement" rather than a crash.
        try:
            sentences = TextBlob(text).sentences
            if not sentences or len(sentences) < 2:
                return 0.0
            lengths = [len(s.words) for s in sentences]
        except Exception as e:
            logger.error("TextBlob/NLTK error: %s", e)
            return 0.0

        mean = np.mean(lengths)
        if mean == 0:
            return 0.0

        return round(float(np.std(lengths) / mean), 3)

    def compute_all_metrics(self, text: str) -> dict:
        """Compute every metric for ``text`` and return them in one dict.

        Keys: ``perplexity``, ``sentiment``, ``lexical_diversity``,
        ``burstiness`` and ``readability`` (textstat Flesch reading ease).
        """
        return {
            "perplexity": self.calculate_perplexity(text),
            "sentiment": self.analyze_sentiment(text),
            "lexical_diversity": self.calculate_lexical_diversity(text),
            "burstiness": self.calculate_burstiness(text),
            "readability": textstat.flesch_reading_ease(text)
        }