quentinL52 committed
Commit · d379dd9
Parent(s): ae5d0c1
adding API key

Files changed:
- Dockerfile +0 -2
- src/services/nlp_service.py +59 -53
- src/tools/analysis_tools.py +8 -1
Dockerfile
CHANGED
@@ -4,7 +4,6 @@ RUN useradd -m -u 1000 user
 USER user
 ENV PATH="/home/user/.local/bin:$PATH"
 
-# Explicitly set the NLTK data directory to avoid surprises
 ENV NLTK_DATA="/home/user/nltk_data"
 
 WORKDIR /app
@@ -12,7 +11,6 @@ WORKDIR /app
 COPY --chown=user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
 
-# Create the data directory and download the specific corpora
 RUN mkdir -p /home/user/nltk_data && \
     python -m textblob.download_corpora && \
     python -m nltk.downloader punkt_tab
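The two comments removed here documented the intent of this block: NLTK_DATA pins the corpus location, and the final RUN step pre-downloads the TextBlob corpora plus the punkt_tab tokenizer so sentence splitting works offline at runtime. A quick sanity check that could be run inside the built image (an illustrative snippet, not part of this commit):

# Illustrative sanity check: confirm the corpora baked into the image are visible to NLTK.
import os
import nltk

# NLTK_DATA is set in the Dockerfile; fall back to the same path it uses.
nltk.data.path.insert(0, os.getenv("NLTK_DATA", "/home/user/nltk_data"))

# Raises LookupError if punkt_tab was not downloaded during the image build.
nltk.data.find("tokenizers/punkt_tab")
print("punkt_tab tokenizer found")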
src/services/nlp_service.py
CHANGED
@@ -35,54 +35,61 @@ class NLPService:
     MAX_PERPLEXITY_CHARS = 50000
 
     def calculate_perplexity(self, text: str) -> float:
-        self._load_model()
-        encodings = self._perplex_tokenizer(text, return_tensors='pt', truncation=True, max_length=self.MAX_PERPLEXITY_CHARS)
-        max_length = self._perplex_model.config.n_positions
-        stride = 512
-        seq_len = encodings.input_ids.size(1)
-        nlls = []
-        prev_end_loc = 0
-        for begin_loc in range(0, seq_len, stride):
-            end_loc = min(begin_loc + max_length, seq_len)
-            trg_len = end_loc - prev_end_loc
-            input_ids = encodings.input_ids[:, begin_loc:end_loc]
-            if input_ids.size(1) > max_length:
-                input_ids = input_ids[:, :max_length]
-            target_ids = input_ids.clone()
-            target_ids[:, :-trg_len] = -100
-            with torch.no_grad():
-                outputs = self._perplex_model(input_ids, labels=target_ids)
-                neg_log_likelihood = outputs.loss
-            if end_loc == seq_len:
-                break
+        """
+        Calculate perplexity of the text using a small GPT-2 model.
+        Lower perplexity = more likely to be generated by AI.
+        """
+        if not text or len(text.strip()) < 10:
+            return 0.0
+
+        if len(text) > self.MAX_PERPLEXITY_CHARS:
+            text = text[:self.MAX_PERPLEXITY_CHARS]
+
+        self._load_model()
+        encodings = self._perplex_tokenizer(
+            text,
+            return_tensors='pt',
+            truncation=True,
+            max_length=self.MAX_PERPLEXITY_CHARS
+        )
+
+        max_length = self._perplex_model.config.n_positions
+        stride = 512
+        seq_len = encodings.input_ids.size(1)
+
+        nlls = []
+        prev_end_loc = 0
+
+        for begin_loc in range(0, seq_len, stride):
+            end_loc = min(begin_loc + max_length, seq_len)
+            trg_len = end_loc - prev_end_loc
+
+            input_ids = encodings.input_ids[:, begin_loc:end_loc]
+
+            # Extra safety so the input never exceeds the model's context window
+            if input_ids.size(1) > max_length:
+                input_ids = input_ids[:, :max_length]
+
+            target_ids = input_ids.clone()
+            target_ids[:, :-trg_len] = -100
+
+            with torch.no_grad():
+                outputs = self._perplex_model(input_ids, labels=target_ids)
+                neg_log_likelihood = outputs.loss
+
+            nlls.append(neg_log_likelihood)
+            prev_end_loc = end_loc
+            if end_loc == seq_len:
+                break
+
+        if not nlls:
+            return 0.0
+
+        ppl = torch.exp(torch.stack(nlls).mean())
+        return round(float(ppl), 2)
 
     def analyze_sentiment(self, text: str) -> dict:
-        """
-        Returns Polarity (-1 to 1) and Subjectivity (0 to 1).
-        """
+        """Returns Polarity (-1 to 1) and Subjectivity (0 to 1)."""
         blob = TextBlob(text)
         return {
             "polarity": round(blob.sentiment.polarity, 2),
@@ -90,10 +97,7 @@ class NLPService:
         }
 
     def calculate_lexical_diversity(self, text: str) -> float:
-        """
-        Type-Token Ratio (TTR).
-        Higher = richer vocabulary.
-        """
+        """Type-Token Ratio (TTR). Higher = richer vocabulary."""
         if not text:
             return 0.0
 
@@ -105,12 +109,15 @@ class NLPService:
         return round(len(unique_words) / len(words), 3)
 
     def calculate_burstiness(self, text: str) -> float:
-        """
-        Burstiness is usually defined by the variation in sentence length.
-        AI text tends to be more regular (low std dev), humans more chaotic.
-        """
+        """Variation in sentence length; a proxy for AI detection."""
         blob = TextBlob(text)
-        sentences = blob.sentences
+        # Safe use of blob.sentences (requires the punkt_tab tokenizer)
+        try:
+            sentences = blob.sentences
+        except Exception as e:
+            logger.error(f"TextBlob/NLTK error: {e}")
+            return 0.0
+
         if not sentences or len(sentences) < 2:
             return 0.0
 
@@ -118,7 +125,6 @@ class NLPService:
         std_dev = np.std(lengths)
        mean = np.mean(lengths)
 
-        # Coefficient of variation can be a proxy for burstiness
         if mean == 0:
             return 0.0
 
@@ -131,4 +137,4 @@ class NLPService:
             "lexical_diversity": self.calculate_lexical_diversity(text),
             "burstiness": self.calculate_burstiness(text),
             "readability": textstat.flesch_reading_ease(text)
-        }
+        }
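The rewritten calculate_perplexity follows the standard sliding-window perplexity recipe for fixed-context causal language models: score overlapping chunks, mask the already-scored overlap with -100 so only new tokens contribute to the loss, then exponentiate the mean negative log-likelihood. A minimal standalone sketch of the same idea (the distilgpt2 checkpoint is an illustrative assumption; the service's actual model and tokenizer are loaded internally by _load_model):

# Standalone sliding-window perplexity sketch (assumes torch and transformers are installed;
# "distilgpt2" is an illustrative choice, not necessarily the checkpoint this service loads).
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2")
model.eval()

def perplexity(text: str, stride: int = 512) -> float:
    enc = tokenizer(text, return_tensors="pt")
    max_len = model.config.n_positions          # context window of the model (1024 for GPT-2)
    seq_len = enc.input_ids.size(1)
    nlls, prev_end = [], 0
    for begin in range(0, seq_len, stride):
        end = min(begin + max_len, seq_len)
        trg_len = end - prev_end                # tokens not yet scored by a previous window
        input_ids = enc.input_ids[:, begin:end]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100         # ignore the overlapping prefix in the loss
        with torch.no_grad():
            nlls.append(model(input_ids, labels=target_ids).loss)
        prev_end = end
        if end == seq_len:
            break
    return float(torch.exp(torch.stack(nlls).mean()))

print(perplexity("The quick brown fox jumps over the lazy dog. " * 20))

Masking with -100 is what keeps the stride-based loop from double-counting tokens that an earlier window already scored.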
src/tools/analysis_tools.py
CHANGED
@@ -11,6 +11,7 @@ import httpx
 logger = logging.getLogger(__name__)
 
 BACKEND_API_URL = os.getenv("BACKEND_API_URL", "http://localhost:8000")
+INTERNAL_API_KEY = os.getenv("INTERNAL_API_KEY")
 
 class InterviewAnalysisArgs(BaseModel):
     """Arguments for the trigger_interview_analysis tool."""
@@ -49,7 +50,13 @@ def trigger_interview_analysis(user_id: str, job_offer_id: str, job_description:
     }
 
     try:
+        headers = {"X-Internal-API-Key": INTERNAL_API_KEY} if INTERNAL_API_KEY else {}
+        response = httpx.post(
+            f"{BACKEND_API_URL}/api/v1/feedback/",
+            json=feedback_payload,
+            headers=headers,
+            timeout=30.0
+        )
         response.raise_for_status()
         logger.info("Feedback saved to Backend API successfully.")
     except Exception as api_err:
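Since the tool now sends an X-Internal-API-Key header, the backend behind BACKEND_API_URL presumably validates it before accepting the feedback payload. A possible server-side counterpart, purely as a hypothetical sketch (the backend framework, FastAPI here, and all route/function names are assumptions; only the header name, the /api/v1/feedback/ path, and the INTERNAL_API_KEY variable come from this commit):

# Hypothetical backend-side validation of the X-Internal-API-Key header.
import os
from fastapi import Depends, FastAPI, Header, HTTPException

app = FastAPI()
INTERNAL_API_KEY = os.getenv("INTERNAL_API_KEY")

def verify_internal_key(x_internal_api_key: str | None = Header(default=None)) -> None:
    # Reject the request unless the shared secret matches the one the agent sends.
    if not INTERNAL_API_KEY or x_internal_api_key != INTERNAL_API_KEY:
        raise HTTPException(status_code=401, detail="Invalid internal API key")

@app.post("/api/v1/feedback/")
def create_feedback(payload: dict, _: None = Depends(verify_internal_key)) -> dict:
    # Persist the feedback payload here.
    return {"status": "saved"}

With this pattern, both services read the same INTERNAL_API_KEY environment variable, so the key is shared through configuration rather than hard-coded, which fits the commit's goal of adding an API key.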