quentinL52 committed on
Commit
ae5d0c1
·
1 Parent(s): ef16617

perplexity update

Browse files
Files changed (1) hide show
  1. src/services/nlp_service.py +38 -36
src/services/nlp_service.py CHANGED
@@ -35,47 +35,49 @@ class NLPService:
35
  MAX_PERPLEXITY_CHARS = 50000
36
 
37
  def calculate_perplexity(self, text: str) -> float:
38
- """
39
- Calculate perplexity of the text using a small GPT-2 model.
40
- Lower perplexity = more likely to be generated by AI (or very standard human text).
41
- """
42
- if not text or len(text.strip()) < 10:
43
- return 0.0
44
-
45
- # Truncate to avoid memory overflow on very long inputs
46
- if len(text) > self.MAX_PERPLEXITY_CHARS:
47
- text = text[:self.MAX_PERPLEXITY_CHARS]
48
-
49
- self._load_model()
50
-
51
- encodings = self._perplex_tokenizer(text, return_tensors='pt')
52
- max_length = self._perplex_model.config.n_positions
53
- stride = 512
54
- seq_len = encodings.input_ids.size(1)
 
 
 
 
 
 
 
 
 
 
55
 
56
- nlls = []
57
- prev_end_loc = 0
58
- for begin_loc in range(0, seq_len, stride):
59
- end_loc = min(begin_loc + max_length, seq_len)
60
- trg_len = end_loc - prev_end_loc # may be different from stride on last loop
61
- input_ids = encodings.input_ids[:, begin_loc:end_loc]
62
- target_ids = input_ids.clone()
63
- target_ids[:, :-trg_len] = -100
64
 
65
- with torch.no_grad():
66
- outputs = self._perplex_model(input_ids, labels=target_ids)
67
- neg_log_likelihood = outputs.loss
 
68
 
69
- nlls.append(neg_log_likelihood)
70
- prev_end_loc = end_loc
71
- if end_loc == seq_len:
72
- break
73
 
74
- if not nlls:
75
- return 0.0
76
 
77
- ppl = torch.exp(torch.stack(nlls).mean())
78
- return float(ppl)
79
 
80
  def analyze_sentiment(self, text: str) -> dict:
81
  """
 
35
  MAX_PERPLEXITY_CHARS = 50000
36
 
37
  def calculate_perplexity(self, text: str) -> float:
38
+ if not text or len(text.strip()) < 10:
39
+ return 0.0
40
+
41
+ if len(text) > self.MAX_PERPLEXITY_CHARS:
42
+ text = text[:self.MAX_PERPLEXITY_CHARS]
43
+
44
+ self._load_model()
45
+
46
+ encodings = self._perplex_tokenizer(text, return_tensors='pt', truncation=True, max_length=self.MAX_PERPLEXITY_CHARS)
47
+
48
+ max_length = self._perplex_model.config.n_positions
49
+ stride = 512
50
+ seq_len = encodings.input_ids.size(1)
51
+
52
+ nlls = []
53
+ prev_end_loc = 0
54
+
55
+ for begin_loc in range(0, seq_len, stride):
56
+ end_loc = min(begin_loc + max_length, seq_len)
57
+ trg_len = end_loc - prev_end_loc
58
+
59
+ input_ids = encodings.input_ids[:, begin_loc:end_loc]
60
+ if input_ids.size(1) > max_length:
61
+ input_ids = input_ids[:, :max_length]
62
+
63
+ target_ids = input_ids.clone()
64
+ target_ids[:, :-trg_len] = -100
65
 
66
+ with torch.no_grad():
67
+ outputs = self._perplex_model(input_ids, labels=target_ids)
68
+ neg_log_likelihood = outputs.loss
 
 
 
 
 
69
 
70
+ nlls.append(neg_log_likelihood)
71
+ prev_end_loc = end_loc
72
+ if end_loc == seq_len:
73
+ break
74
 
75
+ if not nlls:
76
+ return 0.0
 
 
77
 
 
 
78
 
79
+ ppl = torch.exp(torch.stack(nlls).mean())
80
+ return round(float(ppl), 2)
81
 
82
  def analyze_sentiment(self, text: str) -> dict:
83
  """