quentinL52 committed on
Commit
d379dd9
·
1 Parent(s): ae5d0c1

adding API key

Browse files
Dockerfile CHANGED
@@ -4,7 +4,6 @@ RUN useradd -m -u 1000 user
4
  USER user
5
  ENV PATH="/home/user/.local/bin:$PATH"
6
 
7
- # Définir explicitement le dossier de données NLTK pour éviter les surprises
8
  ENV NLTK_DATA="/home/user/nltk_data"
9
 
10
  WORKDIR /app
@@ -12,7 +11,6 @@ WORKDIR /app
12
  COPY --chown=user ./requirements.txt requirements.txt
13
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
 
15
- # Créer le dossier et télécharger les corpus spécifiques
16
  RUN mkdir -p /home/user/nltk_data && \
17
  python -m textblob.download_corpora && \
18
  python -m nltk.downloader punkt_tab
 
4
  USER user
5
  ENV PATH="/home/user/.local/bin:$PATH"
6
 
 
7
  ENV NLTK_DATA="/home/user/nltk_data"
8
 
9
  WORKDIR /app
 
11
  COPY --chown=user ./requirements.txt requirements.txt
12
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
13
 
 
14
  RUN mkdir -p /home/user/nltk_data && \
15
  python -m textblob.download_corpora && \
16
  python -m nltk.downloader punkt_tab
src/services/nlp_service.py CHANGED
@@ -35,54 +35,61 @@ class NLPService:
35
  MAX_PERPLEXITY_CHARS = 50000
36
 
37
  def calculate_perplexity(self, text: str) -> float:
38
- if not text or len(text.strip()) < 10:
39
- return 0.0
40
-
41
- if len(text) > self.MAX_PERPLEXITY_CHARS:
42
- text = text[:self.MAX_PERPLEXITY_CHARS]
43
-
44
- self._load_model()
45
-
46
- encodings = self._perplex_tokenizer(text, return_tensors='pt', truncation=True, max_length=self.MAX_PERPLEXITY_CHARS)
47
-
48
- max_length = self._perplex_model.config.n_positions
49
- stride = 512
50
- seq_len = encodings.input_ids.size(1)
51
-
52
- nlls = []
53
- prev_end_loc = 0
54
-
55
- for begin_loc in range(0, seq_len, stride):
56
- end_loc = min(begin_loc + max_length, seq_len)
57
- trg_len = end_loc - prev_end_loc
58
-
59
- input_ids = encodings.input_ids[:, begin_loc:end_loc]
60
- if input_ids.size(1) > max_length:
61
- input_ids = input_ids[:, :max_length]
62
-
63
- target_ids = input_ids.clone()
64
- target_ids[:, :-trg_len] = -100
65
-
66
- with torch.no_grad():
67
- outputs = self._perplex_model(input_ids, labels=target_ids)
68
- neg_log_likelihood = outputs.loss
69
 
70
- nlls.append(neg_log_likelihood)
71
- prev_end_loc = end_loc
72
- if end_loc == seq_len:
73
- break
74
 
75
- if not nlls:
76
- return 0.0
 
 
 
 
 
 
 
 
 
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
- ppl = torch.exp(torch.stack(nlls).mean())
80
- return round(float(ppl), 2)
81
 
82
  def analyze_sentiment(self, text: str) -> dict:
83
- """
84
- Returns Polarity (-1 to 1) and Subjectivity (0 to 1).
85
- """
86
  blob = TextBlob(text)
87
  return {
88
  "polarity": round(blob.sentiment.polarity, 2),
@@ -90,10 +97,7 @@ class NLPService:
90
  }
91
 
92
  def calculate_lexical_diversity(self, text: str) -> float:
93
- """
94
- Type-Token Ratio (TTR).
95
- Higher = richer vocabulary.
96
- """
97
  if not text:
98
  return 0.0
99
 
@@ -105,12 +109,15 @@ class NLPService:
105
  return round(len(unique_words) / len(words), 3)
106
 
107
  def calculate_burstiness(self, text: str) -> float:
108
- """
109
- Burstiness is usually defined by the variation in sentence length.
110
- AI text tends to be more regular (low std dev), humans more chaotic.
111
- """
112
  blob = TextBlob(text)
113
- sentences = blob.sentences
 
 
 
 
 
 
114
  if not sentences or len(sentences) < 2:
115
  return 0.0
116
 
@@ -118,7 +125,6 @@ class NLPService:
118
  std_dev = np.std(lengths)
119
  mean = np.mean(lengths)
120
 
121
- # Coefficient of variation can be a proxy for burstiness
122
  if mean == 0:
123
  return 0.0
124
 
@@ -131,4 +137,4 @@ class NLPService:
131
  "lexical_diversity": self.calculate_lexical_diversity(text),
132
  "burstiness": self.calculate_burstiness(text),
133
  "readability": textstat.flesch_reading_ease(text)
134
- }
 
35
  MAX_PERPLEXITY_CHARS = 50000
36
 
37
  def calculate_perplexity(self, text: str) -> float:
38
+ """
39
+ Calculate perplexity of the text using a small GPT-2 model.
40
+ Lower perplexity = more likely to be generated by AI.
41
+ """
42
+ if not text or len(text.strip()) < 10:
43
+ return 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
+ if len(text) > self.MAX_PERPLEXITY_CHARS:
46
+ text = text[:self.MAX_PERPLEXITY_CHARS]
 
 
47
 
48
+ self._load_model()
49
+ encodings = self._perplex_tokenizer(
50
+ text,
51
+ return_tensors='pt',
52
+ truncation=True,
53
+ max_length=self.MAX_PERPLEXITY_CHARS
54
+ )
55
+
56
+ max_length = self._perplex_model.config.n_positions
57
+ stride = 512
58
+ seq_len = encodings.input_ids.size(1)
59
 
60
+ nlls = []
61
+ prev_end_loc = 0
62
+
63
+ for begin_loc in range(0, seq_len, stride):
64
+ end_loc = min(begin_loc + max_length, seq_len)
65
+ trg_len = end_loc - prev_end_loc
66
+
67
+ input_ids = encodings.input_ids[:, begin_loc:end_loc]
68
+
69
+ # Sécurité supplémentaire pour ne jamais dépasser la fenêtre du modèle
70
+ if input_ids.size(1) > max_length:
71
+ input_ids = input_ids[:, :max_length]
72
+
73
+ target_ids = input_ids.clone()
74
+ target_ids[:, :-trg_len] = -100
75
+
76
+ with torch.no_grad():
77
+ outputs = self._perplex_model(input_ids, labels=target_ids)
78
+ neg_log_likelihood = outputs.loss
79
+
80
+ nlls.append(neg_log_likelihood)
81
+ prev_end_loc = end_loc
82
+ if end_loc == seq_len:
83
+ break
84
+
85
+ if not nlls:
86
+ return 0.0
87
 
88
+ ppl = torch.exp(torch.stack(nlls).mean())
89
+ return round(float(ppl), 2)
90
 
91
  def analyze_sentiment(self, text: str) -> dict:
92
+ """Returns Polarity (-1 to 1) and Subjectivity (0 to 1)."""
 
 
93
  blob = TextBlob(text)
94
  return {
95
  "polarity": round(blob.sentiment.polarity, 2),
 
97
  }
98
 
99
  def calculate_lexical_diversity(self, text: str) -> float:
100
+ """Type-Token Ratio (TTR). Higher = richer vocabulary."""
 
 
 
101
  if not text:
102
  return 0.0
103
 
 
109
  return round(len(unique_words) / len(words), 3)
110
 
111
  def calculate_burstiness(self, text: str) -> float:
112
+ """Variation in sentence length. proxy for AI detection."""
 
 
 
113
  blob = TextBlob(text)
114
+ # Utilisation sécurisée de blob.sentences (nécessite punkt_tab)
115
+ try:
116
+ sentences = blob.sentences
117
+ except Exception as e:
118
+ logger.error(f"TextBlob/NLTK error: {e}")
119
+ return 0.0
120
+
121
  if not sentences or len(sentences) < 2:
122
  return 0.0
123
 
 
125
  std_dev = np.std(lengths)
126
  mean = np.mean(lengths)
127
 
 
128
  if mean == 0:
129
  return 0.0
130
 
 
137
  "lexical_diversity": self.calculate_lexical_diversity(text),
138
  "burstiness": self.calculate_burstiness(text),
139
  "readability": textstat.flesch_reading_ease(text)
140
+ }
src/tools/analysis_tools.py CHANGED
@@ -11,6 +11,7 @@ import httpx
11
  logger = logging.getLogger(__name__)
12
 
13
  BACKEND_API_URL = os.getenv("BACKEND_API_URL", "http://localhost:8000")
 
14
 
15
  class InterviewAnalysisArgs(BaseModel):
16
  """Arguments for the trigger_interview_analysis tool."""
@@ -49,7 +50,13 @@ def trigger_interview_analysis(user_id: str, job_offer_id: str, job_description:
49
  }
50
 
51
  try:
52
- response = httpx.post(f"{BACKEND_API_URL}/api/v1/feedback/", json=feedback_payload, timeout=30.0)
 
 
 
 
 
 
53
  response.raise_for_status()
54
  logger.info("Feedback saved to Backend API successfully.")
55
  except Exception as api_err:
 
11
  logger = logging.getLogger(__name__)
12
 
13
  BACKEND_API_URL = os.getenv("BACKEND_API_URL", "http://localhost:8000")
14
+ INTERNAL_API_KEY = os.getenv("INTERNAL_API_KEY")
15
 
16
  class InterviewAnalysisArgs(BaseModel):
17
  """Arguments for the trigger_interview_analysis tool."""
 
50
  }
51
 
52
  try:
53
+ headers = {"X-Internal-API-Key": INTERNAL_API_KEY} if INTERNAL_API_KEY else {}
54
+ response = httpx.post(
55
+ f"{BACKEND_API_URL}/api/v1/feedback/",
56
+ json=feedback_payload,
57
+ headers=headers,
58
+ timeout=30.0
59
+ )
60
  response.raise_for_status()
61
  logger.info("Feedback saved to Backend API successfully.")
62
  except Exception as api_err: