pythonprincess committed on
Commit
da17827
·
verified ·
1 Parent(s): a90cb07

Upload 2 files

Browse files
models/sentiment/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Sentiment Analysis Model Package
2
+
models/sentiment/sentiment_utils.py ADDED
@@ -0,0 +1,450 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # models/sentiment/sentiment_utils.py
2
+
3
+ """
4
+ Sentiment Analysis Model Utilities for PENNY Project
5
+ Handles text sentiment classification for user input analysis and content moderation.
6
+ Provides async sentiment analysis with structured error handling and logging.
7
+ """
8
+
9
+ import asyncio
10
+ import time
11
+ import os
12
+ import httpx
13
+ from typing import Dict, Any, Optional, List
14
+
15
+ # --- Logging Imports ---
16
+ from app.logging_utils import log_interaction, sanitize_for_logging
17
+
18
# --- Hugging Face API Configuration ---
# Hosted Inference API endpoint of the sentiment model queried by this module.
HF_API_URL = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment"
# API token, read once at import time. Surrounding whitespace (for example a
# trailing newline picked up from a secrets file) is stripped so it cannot end
# up inside the Authorization header; an unset or blank value becomes None.
HF_TOKEN = (os.getenv("HF_TOKEN") or "").strip() or None

# Name identifying this agent.
AGENT_NAME = "penny-sentiment-agent"
23
+
24
+
25
def is_sentiment_available() -> bool:
    """
    Report whether the sentiment API has been configured.

    Only the presence of a non-empty ``HF_TOKEN`` is checked; the remote
    service itself is not contacted.

    Returns:
        bool: True when ``HF_TOKEN`` holds a non-empty value, False otherwise.
    """
    # None and "" are both falsy, so truthiness covers the two "missing" cases.
    return bool(HF_TOKEN)
33
+
34
+
35
def _sentiment_error_result(
    tenant_id: Optional[str],
    log_error: str,
    message: str,
    *,
    label: str = "ERROR",
    available: bool = True,
    response_time_ms: Optional[int] = None,
    error_detail: Optional[str] = None,
    **log_fields: Any,
) -> Dict[str, Any]:
    """Log a failed single-text analysis and build the error payload for the caller."""
    if response_time_ms is not None:
        log_fields["response_time_ms"] = response_time_ms
    log_interaction(
        intent="sentiment_analysis",
        tenant_id=tenant_id,
        success=False,
        error=log_error,
        **log_fields,
    )
    payload: Dict[str, Any] = {
        "label": label,
        "score": 0.0,
        "available": available,
        "message": message,
    }
    if error_detail is not None:
        payload["error"] = error_detail
    if response_time_ms is not None:
        payload["response_time_ms"] = response_time_ms
    return payload


async def get_sentiment_analysis(
    text: str,
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Classify the sentiment of a single text via the Hugging Face Inference API.

    The text is POSTed to ``HF_API_URL`` and the highest-scoring prediction is
    returned. Failures are logged through ``log_interaction`` and reported in
    the returned dictionary rather than raised; the only exception that
    propagates is ``asyncio.CancelledError``.

    Args:
        text: The string of text to analyze. Empty, whitespace-only, or
            non-string values are rejected, as are texts longer than
            10,000 characters.
        tenant_id: Optional tenant identifier for logging.

    Returns:
        A dictionary containing:
            - label (str): "NEGATIVE", "NEUTRAL" or "POSITIVE" on success (an
              unmapped model label is passed through unchanged), "UNKNOWN"
              when the API is not configured, "ERROR" on any other failure
            - score (float): Confidence score of the prediction (0.0 on failure)
            - available (bool): Whether the service was available
            - message (str, optional): Human-readable message if analysis failed
            - error (str, optional): Exception text for unexpected failures
            - response_time_ms (int, optional): Elapsed time in milliseconds,
              present whenever a request was attempted

    Raises:
        asyncio.CancelledError: Re-raised after logging when the task is cancelled.
    """
    max_chars = 10000          # upper bound on the input size
    slow_threshold_ms = 3000   # requests slower than this get an extra log entry
    unexpected_format = "Sentiment analysis returned unexpected format."

    # A monotonic clock keeps the measured duration immune to wall-clock jumps.
    started = time.monotonic()

    def elapsed_ms() -> int:
        return int((time.monotonic() - started) * 1000)

    # Check availability
    if not is_sentiment_available():
        return _sentiment_error_result(
            tenant_id,
            "Sentiment API not configured (missing HF_TOKEN)",
            "Sentiment analysis is temporarily unavailable.",
            label="UNKNOWN",
            available=False,
            fallback_used=True,
        )

    # Validate input. Whitespace-only text carries nothing to classify, so it
    # is rejected here instead of being sent to the API.
    if not isinstance(text, str) or not text.strip():
        return _sentiment_error_result(
            tenant_id,
            "Invalid text input",
            "Invalid text input provided.",
        )

    # Check text length (prevent processing extremely long texts)
    if len(text) > max_chars:
        return _sentiment_error_result(
            tenant_id,
            f"Text too long: {len(text)} characters",
            "Text is too long for sentiment analysis (max 10,000 characters).",
            text_preview=sanitize_for_logging(text[:100]),
        )

    try:
        headers = {"Authorization": f"Bearer {HF_TOKEN}"}
        payload = {"inputs": text}

        # Call Hugging Face Inference API
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(HF_API_URL, json=payload, headers=headers)

        response_time_ms = elapsed_ms()

        if response.status_code != 200:
            return _sentiment_error_result(
                tenant_id,
                f"API returned status {response.status_code}",
                f"Sentiment API error: {response.status_code}",
                available=False,
                response_time_ms=response_time_ms,
                text_preview=sanitize_for_logging(text[:100]),
                fallback_used=True,
            )

        results = response.json()

        # Expected shape: [[{"label": "LABEL_2", "score": 0.95}, ...]]
        if not isinstance(results, list) or not results:
            return _sentiment_error_result(
                tenant_id,
                "Empty or invalid model output",
                unexpected_format,
                response_time_ms=response_time_ms,
                text_preview=sanitize_for_logging(text[:100]),
            )

        # Accept both the nested form and a flat list of predictions.
        candidates = results[0] if isinstance(results[0], list) else results
        if not candidates:
            return _sentiment_error_result(
                tenant_id,
                "Empty result list",
                unexpected_format,
                response_time_ms=response_time_ms,
                text_preview=sanitize_for_logging(text[:100]),
            )

        scored = [
            entry for entry in candidates
            if isinstance(entry, dict) and "label" in entry and "score" in entry
        ]
        if not scored:
            return _sentiment_error_result(
                tenant_id,
                "Invalid result structure",
                unexpected_format,
                response_time_ms=response_time_ms,
                text_preview=sanitize_for_logging(text[:100]),
            )

        # Pick the best prediction explicitly instead of relying on the API
        # returning its predictions sorted by score.
        best = max(scored, key=lambda entry: float(entry["score"]))

        # Map RoBERTa labels to readable format
        # LABEL_0 = NEGATIVE, LABEL_1 = NEUTRAL, LABEL_2 = POSITIVE
        label_mapping = {
            "LABEL_0": "NEGATIVE",
            "LABEL_1": "NEUTRAL",
            "LABEL_2": "POSITIVE",
        }
        label = label_mapping.get(best["label"], best["label"])

        # Log slow analysis
        if response_time_ms > slow_threshold_ms:
            log_interaction(
                intent="sentiment_analysis_slow",
                tenant_id=tenant_id,
                success=True,
                response_time_ms=response_time_ms,
                details="Slow sentiment analysis detected",
                text_length=len(text),
            )

        log_interaction(
            intent="sentiment_analysis",
            tenant_id=tenant_id,
            success=True,
            response_time_ms=response_time_ms,
            sentiment_label=label,
            sentiment_score=best["score"],
            text_length=len(text),
        )

        return {
            "label": label,
            "score": float(best["score"]),
            "available": True,
            "response_time_ms": response_time_ms,
        }

    except httpx.TimeoutException:
        return _sentiment_error_result(
            tenant_id,
            "Sentiment analysis request timed out",
            "Sentiment analysis request timed out.",
            available=False,
            response_time_ms=elapsed_ms(),
            text_preview=sanitize_for_logging(text[:100]),
            fallback_used=True,
        )

    except asyncio.CancelledError:
        # Cancellation must propagate to the caller; it is only recorded here.
        log_interaction(
            intent="sentiment_analysis",
            tenant_id=tenant_id,
            success=False,
            error="Analysis cancelled",
        )
        raise

    except Exception as e:
        # Boundary handler: any other failure (network error, invalid JSON,
        # non-numeric score, ...) is reported to the caller, never raised.
        return _sentiment_error_result(
            tenant_id,
            str(e),
            "An error occurred during sentiment analysis.",
            available=False,
            response_time_ms=elapsed_ms(),
            error_detail=str(e),
            text_preview=sanitize_for_logging(text[:100]),
            fallback_used=True,
        )
276
+
277
+
278
def _batch_sentiment_error_result(
    tenant_id: Optional[str],
    log_error: str,
    message: str,
    *,
    available: bool = True,
    response_time_ms: Optional[int] = None,
    error_detail: Optional[str] = None,
    **log_fields: Any,
) -> Dict[str, Any]:
    """Log a failed batch analysis and build the empty-result payload for the caller."""
    if response_time_ms is not None:
        log_fields["response_time_ms"] = response_time_ms
    log_interaction(
        intent="sentiment_batch_analysis",
        tenant_id=tenant_id,
        success=False,
        error=log_error,
        **log_fields,
    )
    payload: Dict[str, Any] = {
        "results": [],
        "available": available,
        "total_analyzed": 0,
        "message": message,
    }
    if error_detail is not None:
        payload["error"] = error_detail
    if response_time_ms is not None:
        payload["response_time_ms"] = response_time_ms
    return payload


async def analyze_sentiment_batch(
    texts: List[str],
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Classify the sentiment of several texts with one Hugging Face API request.

    Entries that are not strings, are blank, or exceed 10,000 characters are
    skipped, and at most the first 100 remaining entries are sent. Because
    entries can be skipped, every result carries the ``index`` of the text it
    belongs to in the original ``texts`` list. Failures are logged through
    ``log_interaction`` and reported in the returned dictionary, not raised.

    Args:
        texts: List of text strings to analyze.
        tenant_id: Optional tenant identifier for logging.

    Returns:
        A dictionary containing:
            - results (list): One dict per analyzed text with
                - index (int): Position of the text in ``texts``
                - label (str): "NEGATIVE", "NEUTRAL" or "POSITIVE" (an
                  unmapped model label is passed through unchanged)
                - score (float): Confidence score of the prediction
            - available (bool): Whether the service was available
            - total_analyzed (int): Number of texts successfully analyzed
            - message (str, optional): Human-readable message on failure
            - error (str, optional): Failure detail for timeouts and
              unexpected exceptions
            - response_time_ms (int, optional): Total batch analysis time,
              present whenever a request was attempted
    """
    max_batch_size = 100   # batch size limit
    max_chars = 10000      # same per-text limit as the single-text analysis

    # A monotonic clock keeps the measured duration immune to wall-clock jumps.
    started = time.monotonic()

    def elapsed_ms() -> int:
        return int((time.monotonic() - started) * 1000)

    # Check availability
    if not is_sentiment_available():
        return _batch_sentiment_error_result(
            tenant_id,
            "Sentiment API not configured (missing HF_TOKEN)",
            "Sentiment analysis is temporarily unavailable.",
            available=False,
            # Guarded so a malformed `texts` value cannot raise while logging.
            batch_size=len(texts) if hasattr(texts, "__len__") else 0,
        )

    # Validate input
    if not texts or not isinstance(texts, list):
        return _batch_sentiment_error_result(
            tenant_id,
            "Invalid texts input",
            "Invalid batch input provided.",
        )

    # Keep (original index, text) pairs so results can be traced back to inputs.
    eligible = [
        (position, entry)
        for position, entry in enumerate(texts)
        if isinstance(entry, str) and entry.strip() and len(entry) <= max_chars
    ][:max_batch_size]

    if not eligible:
        return _batch_sentiment_error_result(
            tenant_id,
            "No valid texts in batch",
            "No valid texts provided for analysis.",
        )

    batch_inputs = [entry for _, entry in eligible]

    try:
        headers = {"Authorization": f"Bearer {HF_TOKEN}"}
        payload = {"inputs": batch_inputs}

        # Call Hugging Face Inference API (longer timeout for batch)
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(HF_API_URL, json=payload, headers=headers)

        response_time_ms = elapsed_ms()

        if response.status_code != 200:
            return _batch_sentiment_error_result(
                tenant_id,
                f"API returned status {response.status_code}",
                f"Sentiment API error: {response.status_code}",
                available=False,
                response_time_ms=response_time_ms,
                batch_size=len(batch_inputs),
            )

        results = response.json()

        # LABEL_0 = NEGATIVE, LABEL_1 = NEUTRAL, LABEL_2 = POSITIVE
        label_mapping = {
            "LABEL_0": "NEGATIVE",
            "LABEL_1": "NEUTRAL",
            "LABEL_2": "POSITIVE",
        }

        processed_results = []
        if isinstance(results, list):
            # Assumes the API returns one prediction list per input, in input order.
            for (position, _), predictions in zip(eligible, results):
                if not isinstance(predictions, list):
                    continue
                labelled = [
                    entry for entry in predictions
                    if isinstance(entry, dict) and "label" in entry
                ]
                if not labelled:
                    continue
                # Pick the best prediction explicitly rather than trusting order.
                top = max(labelled, key=lambda entry: float(entry.get("score", 0.0)))
                processed_results.append({
                    "index": position,
                    "label": label_mapping.get(top["label"], top["label"]),
                    "score": float(top.get("score", 0.0)),
                })

        if not processed_results:
            # A 200 response with nothing usable is a failure, not a success.
            return _batch_sentiment_error_result(
                tenant_id,
                "Empty or invalid model output",
                "Sentiment analysis returned unexpected format.",
                response_time_ms=response_time_ms,
                batch_size=len(batch_inputs),
            )

        log_interaction(
            intent="sentiment_batch_analysis",
            tenant_id=tenant_id,
            success=True,
            response_time_ms=response_time_ms,
            batch_size=len(batch_inputs),
            total_analyzed=len(processed_results),
        )

        return {
            "results": processed_results,
            "available": True,
            "total_analyzed": len(processed_results),
            "response_time_ms": response_time_ms,
        }

    except httpx.TimeoutException:
        return _batch_sentiment_error_result(
            tenant_id,
            "Batch sentiment analysis timed out",
            "Batch sentiment analysis timed out.",
            available=False,
            response_time_ms=elapsed_ms(),
            error_detail="Request timeout",
            batch_size=len(batch_inputs),
        )

    except Exception as e:
        # Boundary handler: any other failure (network error, invalid JSON,
        # non-numeric score, ...) is reported to the caller, never raised.
        return _batch_sentiment_error_result(
            tenant_id,
            str(e),
            "An error occurred during batch sentiment analysis.",
            available=False,
            response_time_ms=elapsed_ms(),
            error_detail=str(e),
            batch_size=len(batch_inputs),
        )