kushvanth committed on
Commit
1195e16
·
verified ·
1 Parent(s): 083e0cf

Upload folder using huggingface_hub

Browse files
Files changed (17) hide show
  1. Dockerfile +22 -0
  2. fastapi_example.py +763 -0
  3. model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/.no_exist/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/added_tokens.json +0 -0
  4. model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/.no_exist/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/chat_template.jinja +0 -0
  5. model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/.no_exist/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/model.safetensors +0 -0
  6. model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/.no_exist/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/model.safetensors.index.json +0 -0
  7. model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/.no_exist/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/tokenizer.json +0 -0
  8. model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/.no_exist/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/tokenizer_config.json +0 -0
  9. model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/refs/main +1 -0
  10. model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/refs/refs/pr/19 +1 -0
  11. model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/5ecd13590450ab163e43547492b0cfc49f16629b/model.safetensors +3 -0
  12. model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/config.json +33 -0
  13. model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/merges.txt +0 -0
  14. model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/pytorch_model.bin +3 -0
  15. model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/special_tokens_map.json +1 -0
  16. model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/vocab.json +0 -0
  17. requirements.txt +10 -0
Dockerfile ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.12-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Upgrade pip
6
+ RUN python -m pip install --upgrade pip
7
+
8
+ # Copy dependencies and install
9
+ COPY requirements.txt .
10
+ RUN pip install --no-cache-dir -r requirements.txt
11
+
12
+ # Download NLTK data to /tmp (writable directory)
13
+ RUN python -m nltk.downloader -d /tmp/nltk_data vader_lexicon punkt stopwords wordnet omw-1.4
14
+
15
+ # Copy application code
16
+ COPY . .
17
+
18
+ # Expose port 7860 (Hugging Face requirement)
19
+ EXPOSE 7860
20
+
21
+ # Run FastAPI
22
+ CMD ["uvicorn", "fastapi_example:app", "--host", "0.0.0.0", "--port", "7860"]
fastapi_example.py ADDED
@@ -0,0 +1,763 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enhanced FastAPI Service for Comment Sentiment Analysis
3
+ with improved performance, validation, and configuration management
4
+ """
5
+
6
+ from fastapi import FastAPI, HTTPException, Depends
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from pydantic import BaseModel, Field, validator
9
+ from pydantic_settings import BaseSettings
10
+ from typing import List, Dict, Any, Optional
11
+ from functools import lru_cache
12
+ import uvicorn
13
+ import pandas as pd
14
+ import numpy as np
15
+ import os
16
+ import re
17
+ from datetime import datetime
18
+ import logging
19
+
20
+ # Configure logging FIRST
21
+ logging.basicConfig(
22
+ level=logging.INFO,
23
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
24
+ )
25
+ logger = logging.getLogger(__name__)
26
+
27
+ # CRITICAL: Download NLTK data BEFORE importing NLTK components
28
+ import nltk
29
+ import ssl
30
+
31
+ try:
32
+ _create_unverified_https_context = ssl._create_unverified_context
33
+ except AttributeError:
34
+ pass
35
+ else:
36
+ ssl._create_default_https_context = _create_unverified_https_context
37
+
38
+ # Set NLTK data path
39
+ nltk_data_dir = '/tmp/nltk_data'
40
+ os.makedirs(nltk_data_dir, exist_ok=True)
41
+ nltk.data.path.insert(0, nltk_data_dir)
42
+
43
+ # Download required NLTK data
44
def ensure_nltk_data():
    """Make sure every NLTK resource used by this module is available locally.

    Each resource is looked up with ``nltk.data.find``. A resource that cannot
    be found is downloaded into ``nltk_data_dir``. A failed download is logged
    and does not raise, so the remaining resources are still processed.
    """
    # Resources that do not live under ``corpora/`` need an explicit path.
    special_paths = {
        'vader_lexicon': 'sentiment/vader_lexicon.zip',
        'punkt': 'tokenizers/punkt',
    }

    for resource in ['vader_lexicon', 'punkt', 'stopwords', 'wordnet', 'omw-1.4']:
        lookup_path = special_paths.get(resource, f'corpora/{resource}')

        try:
            nltk.data.find(lookup_path)
        except LookupError:
            pass
        else:
            logger.info(f"✓ NLTK resource '{resource}' already available")
            continue

        # Reaching this point means the lookup failed: fetch the resource.
        logger.info(f"Downloading NLTK resource '{resource}'...")
        try:
            nltk.download(resource, download_dir=nltk_data_dir, quiet=False)
            logger.info(f"✓ Successfully downloaded '{resource}'")
        except Exception as e:
            logger.error(f"✗ Failed to download '{resource}': {e}")
65
+
66
+ # Download NLTK data immediately
67
+ logger.info("Ensuring NLTK data is available...")
68
+ ensure_nltk_data()
69
+
70
+ # NOW import NLTK components
71
+ from nltk.sentiment import SentimentIntensityAnalyzer
72
+
73
+ # Import transformers after NLTK setup
74
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
75
+ from scipy.special import softmax
76
+ import torch
77
+
78
+ # Configuration Management
79
+ class Settings(BaseSettings):
80
+ """Application settings with environment variable support"""
81
+ # API Settings
82
+ app_name: str = "Comment Analysis API"
83
+ app_version: str = "2.0.0"
84
+ debug_mode: bool = False
85
+
86
+ # Request Limits
87
+ max_comments_per_request: int = 1000
88
+ max_comment_length: int = 5000
89
+ min_comment_words: int = 2
90
+
91
+ # Sentiment Thresholds
92
+ vader_pos_threshold: float = 0.2
93
+ vader_neg_threshold: float = -0.2
94
+ roberta_pos_threshold: float = 0.55
95
+ roberta_neg_threshold: float = 0.45
96
+ combined_weight_vader: float = 0.5
97
+ combined_weight_roberta: float = 0.5
98
+
99
+ # Model Settings
100
+ model_cache_dir: str = "./model_cache"
101
+ roberta_model_name: str = "cardiffnlp/twitter-roberta-base-sentiment"
102
+ use_abstractive_summary: bool = False
103
+ summarizer_model: str = "facebook/bart-large-cnn"
104
+ max_summary_length: int = 100
105
+ min_summary_length: int = 25
106
+
107
+ # Performance
108
+ enable_caching: bool = True
109
+ cache_size: int = 500
110
+ batch_size: int = 32
111
+
112
+ class Config:
113
+ env_file = ".env"
114
+ env_file_encoding = 'utf-8'
115
+ extra = 'ignore'
116
+
117
+ @lru_cache()
118
+ def get_settings() -> Settings:
119
+ """Cached settings instance"""
120
+ return Settings()
121
+
122
+ # Pydantic Models
123
+ class FacultyInfo(BaseModel):
124
+ faculty_name: str = Field(..., min_length=1, max_length=200)
125
+ staff_id: str = Field(..., min_length=1, max_length=50)
126
+ course_code: str = Field(..., min_length=1, max_length=50)
127
+ course_name: str = Field(..., min_length=1, max_length=200)
128
+
129
+ class CommentAnalysisRequest(BaseModel):
130
+ comments: List[str] = Field(..., min_items=1)
131
+ faculty_info: FacultyInfo
132
+
133
+ @validator('comments')
134
+ def validate_comments(cls, v):
135
+ settings = get_settings()
136
+
137
+ if len(v) > settings.max_comments_per_request:
138
+ raise ValueError(
139
+ f'Maximum {settings.max_comments_per_request} comments per request'
140
+ )
141
+
142
+ for idx, comment in enumerate(v):
143
+ if len(comment) > settings.max_comment_length:
144
+ raise ValueError(
145
+ f'Comment {idx} exceeds maximum length of {settings.max_comment_length} characters'
146
+ )
147
+
148
+ return v
149
+
150
+ class SentimentDistribution(BaseModel):
151
+ positive_percentage: float
152
+ negative_percentage: float
153
+ neutral_percentage: float
154
+
155
+ class DetailedScores(BaseModel):
156
+ average_positive: float
157
+ average_negative: float
158
+ average_neutral: float
159
+ average_compound: Optional[float] = None
160
+
161
+ class DetailedAnalysis(BaseModel):
162
+ vader_scores: DetailedScores
163
+ roberta_scores: DetailedScores
164
+
165
+ class AnalysisResult(BaseModel):
166
+ total_comments: int
167
+ positive_comments: int
168
+ negative_comments: int
169
+ neutral_comments: int
170
+ positive_sentiment: float
171
+ negative_sentiment: float
172
+ neutral_sentiment: float
173
+ overall_sentiment: str
174
+ sentiment_distribution: SentimentDistribution
175
+ negative_comments_summary: str
176
+ negative_comments_list: List[str]
177
+ key_insights: List[str]
178
+ recommendations: List[str]
179
+ detailed_analysis: DetailedAnalysis
180
+ faculty_info: Dict[str, str]
181
+ analysis_timestamp: str
182
+
183
+ class CommentAnalysisResponse(BaseModel):
184
+ success: bool
185
+ analysis: Optional[AnalysisResult] = None
186
+ message: str
187
+
188
+ # Initialize FastAPI app
189
+ app = FastAPI(
190
+ title=get_settings().app_name,
191
+ version=get_settings().app_version,
192
+ description="Advanced sentiment analysis service for educational feedback"
193
+ )
194
+
195
+ # Add CORS middleware
196
+ app.add_middleware(
197
+ CORSMiddleware,
198
+ allow_origins=["*"],
199
+ allow_credentials=True,
200
+ allow_methods=["*"],
201
+ allow_headers=["*"],
202
+ )
203
+
204
+ # Global variables for models
205
+ sia = None
206
+ tokenizer = None
207
+ model = None
208
+ device = None
209
+ summarizer = None
210
+
211
+ # Heuristic phrase/regex rules for explicit negative feedback
212
+ NEGATIVE_PHRASES = [
213
+ 'need more staff',
214
+ 'need more faculty',
215
+ 'insufficient staff',
216
+ 'lack of staff',
217
+ 'lack of knowledge',
218
+ 'better knowledge needed',
219
+ 'poor knowledge',
220
+ 'not good',
221
+ 'not satisfied',
222
+ 'not satisfactory',
223
+ 'no classes',
224
+ 'no regular classes',
225
+ 'boring class',
226
+ 'boring classes',
227
+ 'waste of time',
228
+ 'bad teacher',
229
+ 'bad teaching',
230
+ 'poor teaching',
231
+ 'improve class',
232
+ 'improvement needed',
233
+ 'needs improvement',
234
+ 'not helpful',
235
+ 'not clear',
236
+ 'communication skills need improvement',
237
+ 'improve communication',
238
+ 'lectures are going fast',
239
+ 'going too fast',
240
+ 'too fast',
241
+ 'lacking',
242
+ 'is lacking',
243
+ 'knowledge is lacking',
244
+ 'practical knowledge lacking',
245
+ 'no practical',
246
+ 'lack of practical',
247
+ 'no hands-on',
248
+ 'no real world'
249
+ ]
250
+
251
+ NEGATIVE_REGEXES = [
252
+ re.compile(r"\bno\s+(proper|sufficient)\s+(classes|notes|support)\b", re.IGNORECASE),
253
+ re.compile(r"\bno\s+staff\b", re.IGNORECASE),
254
+ re.compile(r"\bneed(s)?\s+more\s+(staff|faculty|support)\b", re.IGNORECASE),
255
+ re.compile(r"\b(lecture|lectures|class|classes|teaching)\s+(are\s+)?(too|very)\s+fast\b", re.IGNORECASE),
256
+ re.compile(r"\blectures?\s+are\s+going\s+fast\b", re.IGNORECASE),
257
+ re.compile(r"\b(require|needs?|needed)\s+(some\s+)?improv(e|ement)s?\s+(in|of)?\s*communication(\s+skills?)?\b", re.IGNORECASE),
258
+ re.compile(r"\b(is\s+)?lacking\b", re.IGNORECASE),
259
+ re.compile(r"\bno\s+(practical|hands-on|real-world)\b", re.IGNORECASE)
260
+ ]
261
+
262
+ META_COMMENT_PATTERNS = [
263
+ re.compile(r"^no\s+(other\s+)?(comments?|remarks?|feedback)$", re.IGNORECASE),
264
+ re.compile(r"^no\s+remarks?\s+(about|on)", re.IGNORECASE),
265
+ re.compile(r"^nil$", re.IGNORECASE),
266
+ re.compile(r"^none$", re.IGNORECASE),
267
+ re.compile(r"^n/?a$", re.IGNORECASE)
268
+ ]
269
+
270
def is_meta_comment(text: str) -> bool:
    """Tell whether a comment is a placeholder instead of real feedback.

    Args:
        text: The comment to inspect.

    Returns:
        True when the stripped text matches, from its start, any pattern in
        ``META_COMMENT_PATTERNS``; False otherwise, including for empty input.
    """
    if not text:
        return False

    stripped = text.strip()
    return any(rule.match(stripped) for rule in META_COMMENT_PATTERNS)
281
+
282
def is_explicit_negative(text: str) -> bool:
    """Tell whether a comment trips one of the hand-written negative rules.

    Args:
        text: The comment to inspect.

    Returns:
        True when the lower-cased text contains any entry of
        ``NEGATIVE_PHRASES`` as a substring, or when any pattern in
        ``NEGATIVE_REGEXES`` is found in the text; False otherwise, including
        for empty input.
    """
    if not text:
        return False

    lowered = text.lower()
    if any(phrase in lowered for phrase in NEGATIVE_PHRASES):
        return True

    # The regexes are searched against the original (not lower-cased) text.
    return any(rule.search(text) for rule in NEGATIVE_REGEXES)
297
+
298
+ def initialize_models():
299
+ """Initialize sentiment analysis models with caching support"""
300
+ global sia, tokenizer, model, device, summarizer
301
+
302
+ try:
303
+ settings = get_settings()
304
+ logger.info("Initializing sentiment analysis models...")
305
+
306
+ # Initialize VADER (NLTK data already downloaded)
307
+ sia = SentimentIntensityAnalyzer()
308
+ logger.info("VADER initialized")
309
+
310
+ # Initialize RoBERTa with caching
311
+ cache_dir = settings.model_cache_dir
312
+ os.makedirs(cache_dir, exist_ok=True)
313
+
314
+ tokenizer = AutoTokenizer.from_pretrained(
315
+ settings.roberta_model_name,
316
+ cache_dir=cache_dir
317
+ )
318
+ model = AutoModelForSequenceClassification.from_pretrained(
319
+ settings.roberta_model_name,
320
+ cache_dir=cache_dir
321
+ )
322
+
323
+ device = "cuda" if torch.cuda.is_available() else "cpu"
324
+ model.to(device)
325
+ model.eval()
326
+ logger.info(f"RoBERTa initialized on device: {device}")
327
+
328
+ # Initialize summarizer (optional)
329
+ if settings.use_abstractive_summary:
330
+ try:
331
+ summarizer = pipeline(
332
+ "summarization",
333
+ model=settings.summarizer_model,
334
+ device=0 if device == "cuda" else -1
335
+ )
336
+ logger.info("Summarizer initialized")
337
+ except Exception as e:
338
+ logger.warning(f"Summarizer initialization failed: {e}")
339
+ summarizer = None
340
+
341
+ logger.info("All models initialized successfully")
342
+
343
+ except Exception as e:
344
+ logger.error(f"Error initializing models: {e}")
345
+ raise e
346
+
347
# The cache capacity comes from the ``cache_size`` setting instead of a
# hard-coded 500, so overriding that setting actually takes effect.
@lru_cache(maxsize=get_settings().cache_size)
def vader_sentiment_cached(text: str) -> tuple:
    """Memoised VADER scoring for a single piece of text.

    Args:
        text: The text to score; it is also the cache key.

    Returns:
        A tuple ``(neg, neu, pos, compound)`` taken from
        ``sia.polarity_scores(text)``.
    """
    scores = sia.polarity_scores(text)
    return (scores['neg'], scores['neu'], scores['pos'], scores['compound'])
352
+
353
def vader_sentiment(text: str) -> Dict[str, float]:
    """Score one text with VADER, going through the cache when it is enabled.

    Args:
        text: The text to score.

    Returns:
        A dict with the keys ``vader_neg``, ``vader_neu``, ``vader_pos`` and
        ``vader_compound``. If scoring raises, a warning is logged and a fully
        neutral result is returned instead.
    """
    try:
        if get_settings().enable_caching:
            neg, neu, pos, compound = vader_sentiment_cached(text)
        else:
            raw = sia.polarity_scores(text)
            neg, neu, pos, compound = raw['neg'], raw['neu'], raw['pos'], raw['compound']

        return {
            'vader_neg': neg,
            'vader_neu': neu,
            'vader_pos': pos,
            'vader_compound': compound
        }
    except Exception as e:
        logger.warning(f"VADER analysis failed for text: {e}")
        return {'vader_neg': 0.0, 'vader_neu': 1.0, 'vader_pos': 0.0, 'vader_compound': 0.0}
376
+
377
def roberta_sentiment_batch(texts: List[str]) -> List[Dict[str, float]]:
    """Score many texts with the RoBERTa model, ``batch_size`` texts at a time.

    Args:
        texts: The texts to score.

    Returns:
        One dict per input text, in input order, with the keys
        ``roberta_neg``, ``roberta_neu`` and ``roberta_pos`` (softmax of the
        model logits). If anything raises, a warning is logged and a neutral
        result is returned for every text.
    """
    try:
        chunk_size = get_settings().batch_size
        scored = []

        for start in range(0, len(texts), chunk_size):
            chunk = texts[start:start + chunk_size]

            inputs = tokenizer(
                chunk,
                return_tensors='pt',
                truncation=True,
                max_length=512,
                padding=True
            )
            # Move every input tensor to the device the model lives on.
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

            with torch.no_grad():
                logits = model(**inputs).logits

            for row in logits:
                probabilities = softmax(row.cpu().numpy())
                scored.append({
                    'roberta_neg': float(probabilities[0]),
                    'roberta_neu': float(probabilities[1]),
                    'roberta_pos': float(probabilities[2])
                })

        return scored

    except Exception as e:
        logger.warning(f"RoBERTa batch analysis failed: {e}")
        neutral = {'roberta_neg': 0.0, 'roberta_neu': 1.0, 'roberta_pos': 0.0}
        return [dict(neutral) for _ in texts]
411
+
412
def roberta_sentiment(text: str) -> Dict[str, float]:
    """Score a single text with the RoBERTa model.

    Args:
        text: The text to score.

    Returns:
        A dict with the keys ``roberta_neg``, ``roberta_neu`` and
        ``roberta_pos``. If scoring raises, a warning is logged and a neutral
        result is returned.
    """
    try:
        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        with torch.no_grad():
            result = model(**inputs)

        # result[0] is the first model output; [0] selects the only row.
        probabilities = softmax(result[0][0].cpu().numpy())
        return {
            'roberta_neg': float(probabilities[0]),
            'roberta_neu': float(probabilities[1]),
            'roberta_pos': float(probabilities[2])
        }
    except Exception as e:
        logger.warning(f"RoBERTa analysis failed for text: {e}")
        return {'roberta_neg': 0.0, 'roberta_neu': 1.0, 'roberta_pos': 0.0}
430
+
431
def overall_sentiment(row: pd.Series, settings: Settings) -> str:
    """Label one scored comment as 'Negative', 'Positive' or 'Neutral'.

    Negative signals are checked before positive ones, so a row that meets
    both sets of conditions is labelled 'Negative'.

    Args:
        row: A row holding the combined, VADER and RoBERTa scores plus the
            ``heuristic_negative`` flag; missing scores default to 0.0.
        settings: Supplies the VADER and RoBERTa thresholds.

    Returns:
        The sentiment label as a string.
    """
    # An explicit heuristic hit overrides every model score.
    if row.get('heuristic_negative') is True:
        return 'Negative'

    pos = row.get('combined_pos', 0.0)
    neg = row.get('combined_neg', 0.0)
    neu = row.get('combined_neu', 0.0)
    compound = row.get('vader_compound', 0.0)
    model_neg = row.get('roberta_neg', 0.0)
    model_pos = row.get('roberta_pos', 0.0)

    leans_negative = (
        compound <= settings.vader_neg_threshold
        or model_neg >= settings.roberta_neg_threshold
        or neg >= max(pos, neu)
    )
    if leans_negative:
        return 'Negative'

    leans_positive = (
        compound >= settings.vader_pos_threshold
        or model_pos >= settings.roberta_pos_threshold
        or pos >= max(neg, neu)
    )
    return 'Positive' if leans_positive else 'Neutral'
458
+
459
def sanitize_text(text: str) -> str:
    """Remove control characters and normalise whitespace in a comment.

    Control characters are removed before whitespace is collapsed. Doing it
    the other way round left a double space wherever a control character
    stood alone between two words (``"a \\x00 b"`` became ``"a  b"``).

    Args:
        text: Raw comment text; may be empty or None.

    Returns:
        The text with characters below code point 32 removed, every run of
        whitespace replaced by one space, and no leading or trailing
        whitespace. Empty or None input yields ``""``.
    """
    if not text:
        return ""

    # Whitespace characters below code point 32 (tab, newline, ...) are kept
    # here so the split below still treats them as word separators.
    visible = ''.join(ch for ch in text if ord(ch) >= 32 or ch.isspace())
    return ' '.join(visible.split())
466
+
467
def _summarize_negative_comments(negative_rows: pd.DataFrame, settings: "Settings"):
    """Return ``(summary, comments)`` for the rows labelled Negative."""
    if len(negative_rows) == 0:
        return "", []

    all_negative = negative_rows['comment'].tolist()

    try:
        # Summarise only the three comments with the highest negative score.
        strongest = negative_rows['combined_neg'].nlargest(3).index
        top_comments = negative_rows.loc[strongest, 'comment'].tolist()

        if settings.use_abstractive_summary and summarizer is not None:
            # The summarizer input is capped at 1000 characters.
            negative_text = " ".join(top_comments)[:1000]
            summary_result = summarizer(
                negative_text,
                max_length=settings.max_summary_length,
                min_length=settings.min_summary_length,
                do_sample=False
            )
            return summary_result[0]['summary_text'], all_negative

        return "; ".join(top_comments), all_negative
    except Exception as e:
        logger.warning(f"Summary generation failed: {e}")
        return "; ".join(all_negative[:3]), all_negative


def _insights_for_label(label: str, positive_count: int, negative_count: int, total_comments: int):
    """Return ``(insights, recommendations)`` text for the overall label."""
    if label == "Positive":
        insights = [
            "Students have positive feedback overall",
            "Teaching methods are well-received",
            f"{positive_count}/{total_comments} comments are positive"
        ]
        recommendations = [
            "Continue current teaching approach",
            "Maintain student engagement strategies",
            "Share successful practices with colleagues"
        ]
    elif label == "Negative":
        insights = [
            "Students have concerns that need attention",
            "Some aspects of teaching may need improvement",
            f"{negative_count}/{total_comments} comments indicate issues"
        ]
        recommendations = [
            "Review teaching methods and materials",
            "Consider additional student support",
            "Schedule meetings to address student concerns",
            "Focus on areas mentioned in negative feedback"
        ]
    else:
        insights = [
            "Mixed feedback from students",
            "Some areas performing well, others need attention",
            "Balance of positive and negative responses"
        ]
        recommendations = [
            "Focus on areas with negative feedback",
            "Maintain strengths while addressing weaknesses",
            "Gather more specific feedback on improvement areas"
        ]
    return insights, recommendations


def analyze_comments_sentiment(comments: List[str]) -> Dict[str, Any]:
    """Run the full sentiment pipeline over a list of comments.

    Steps: sanitise each comment, drop comments that are too short, too long
    or meta-comments, score the rest with VADER and RoBERTa, combine the
    scores with the configured weights, label every comment, and aggregate.

    Two filter defects are fixed here. The upper bound used to compare the
    *word* count against ``max_comment_length``, which is a character limit
    (see the request validator); it now compares the character count. The
    lower bound used a strict ``<``, which dropped comments with exactly
    ``min_comment_words`` words; it is now inclusive.

    Args:
        comments: Raw comment strings.

    Returns:
        When no comment survives filtering, a dict with ``total_comments`` set
        to 0 and a ``message``. Otherwise a dict with counts, average combined
        scores, the overall label, percentage distribution, negative-comment
        summary and list, insights, recommendations, per-model averages and
        an ISO timestamp.

    Raises:
        Exception: Any error raised during analysis is logged and re-raised.
    """
    try:
        settings = get_settings()
        logger.info(f"Received {len(comments)} comments for analysis")

        sanitized_comments = [sanitize_text(comment) for comment in comments]

        filtered_comments = [
            comment for comment in sanitized_comments
            if len(comment.split()) >= settings.min_comment_words
            and len(comment) <= settings.max_comment_length
            and not is_meta_comment(comment)
        ]

        logger.info(f"After filtering: {len(filtered_comments)} valid comments")

        if not filtered_comments:
            return {
                "total_comments": 0,
                "message": "No valid comments found for analysis"
            }

        df = pd.DataFrame({'comment': filtered_comments})
        df['heuristic_negative'] = df['comment'].apply(is_explicit_negative)

        vader_df = pd.DataFrame([vader_sentiment(text) for text in df['comment']])
        roberta_df = pd.DataFrame(roberta_sentiment_batch(df['comment'].tolist()))
        final_df = pd.concat([df.reset_index(drop=True), vader_df, roberta_df], axis=1)

        # Weighted blend of both models for each polarity.
        for polarity in ('pos', 'neg', 'neu'):
            final_df[f'combined_{polarity}'] = (
                settings.combined_weight_vader * final_df[f'vader_{polarity}'] +
                settings.combined_weight_roberta * final_df[f'roberta_{polarity}']
            )

        final_df['Overall_Sentiment'] = final_df.apply(
            lambda row: overall_sentiment(row, settings),
            axis=1
        )

        labels = final_df['Overall_Sentiment']
        total_comments = len(final_df)
        positive_count = int((labels == 'Positive').sum())
        negative_count = int((labels == 'Negative').sum())
        neutral_count = int((labels == 'Neutral').sum())

        logger.info(
            f"Results: {positive_count} positive, "
            f"{negative_count} negative, {neutral_count} neutral"
        )

        avg_positive = float(final_df['combined_pos'].mean())
        avg_negative = float(final_df['combined_neg'].mean())
        avg_neutral = float(final_df['combined_neu'].mean())

        # The overall label follows the largest average; ties fall to Neutral.
        if avg_positive > max(avg_negative, avg_neutral):
            overall_sentiment_label = "Positive"
        elif avg_negative > max(avg_positive, avg_neutral):
            overall_sentiment_label = "Negative"
        else:
            overall_sentiment_label = "Neutral"

        negative_summary, negative_comments_list = _summarize_negative_comments(
            final_df[labels == 'Negative'], settings
        )
        insights, recommendations = _insights_for_label(
            overall_sentiment_label, positive_count, negative_count, total_comments
        )

        return {
            "total_comments": total_comments,
            "positive_comments": positive_count,
            "negative_comments": negative_count,
            "neutral_comments": neutral_count,
            "positive_sentiment": round(avg_positive, 3),
            "negative_sentiment": round(avg_negative, 3),
            "neutral_sentiment": round(avg_neutral, 3),
            "overall_sentiment": overall_sentiment_label,
            "sentiment_distribution": {
                "positive_percentage": round((positive_count / total_comments) * 100, 1),
                "negative_percentage": round((negative_count / total_comments) * 100, 1),
                "neutral_percentage": round((neutral_count / total_comments) * 100, 1)
            },
            "negative_comments_summary": negative_summary,
            "negative_comments_list": negative_comments_list,
            "key_insights": insights,
            "recommendations": recommendations,
            "detailed_analysis": {
                "vader_scores": {
                    "average_positive": round(final_df['vader_pos'].mean(), 3),
                    "average_negative": round(final_df['vader_neg'].mean(), 3),
                    "average_neutral": round(final_df['vader_neu'].mean(), 3),
                    "average_compound": round(final_df['vader_compound'].mean(), 3)
                },
                "roberta_scores": {
                    "average_positive": round(final_df['roberta_pos'].mean(), 3),
                    "average_negative": round(final_df['roberta_neg'].mean(), 3),
                    "average_neutral": round(final_df['roberta_neu'].mean(), 3)
                }
            },
            "analysis_timestamp": datetime.utcnow().isoformat()
        }

    except Exception as e:
        logger.error(f"Sentiment analysis failed: {e}", exc_info=True)
        raise
645
+
646
+ @app.on_event("startup")
647
+ async def startup_event():
648
+ """Initialize models on startup"""
649
+ try:
650
+ logger.info("===== Application Startup at {} =====".format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
651
+ initialize_models()
652
+ logger.info("Service started successfully")
653
+ except Exception as e:
654
+ logger.error(f"Startup failed: {e}")
655
+ raise e
656
+
657
+ @app.on_event("shutdown")
658
+ async def shutdown_event():
659
+ """Cleanup on shutdown"""
660
+ logger.info("Service shutting down")
661
+
662
+ @app.get("/")
663
+ async def root():
664
+ """Root endpoint"""
665
+ return {
666
+ "service": get_settings().app_name,
667
+ "version": get_settings().app_version,
668
+ "status": "running"
669
+ }
670
+
671
@app.get("/health")
async def health_check():
    """Report whether the sentiment models are loaded.

    Returns:
        A dict with the service status ("healthy" only when the VADER
        analyzer, the model and the tokenizer are all set), the app version,
        the device in use and a UTC timestamp.
    """
    models_loaded = all(
        component is not None for component in (sia, model, tokenizer)
    )
    status = "healthy" if models_loaded else "unhealthy"

    return {
        "status": status,
        "service": "comment-analysis",
        "version": get_settings().app_version,
        "models_loaded": models_loaded,
        "device": device or "not initialized",
        "timestamp": datetime.utcnow().isoformat()
    }
684
+
685
@app.post("/analyze-comments", response_model=CommentAnalysisResponse)
async def analyze_comments(
    request: CommentAnalysisRequest,
    settings: Settings = Depends(get_settings)
):
    """
    Analyze comments for sentiment analysis using VADER and RoBERTa models

    Returns a ``CommentAnalysisResponse``. ``success`` is False when there are
    no comments or none survive filtering. A ``ValueError`` raised during
    analysis becomes HTTP 400; any other exception is logged and becomes
    HTTP 500 with a generic message.
    """
    try:
        submitted = request.comments
        faculty = request.faculty_info

        if not submitted:
            return CommentAnalysisResponse(
                success=False,
                analysis=None,
                message="No comments provided for analysis"
            )

        logger.info(
            f"Analyzing {len(submitted)} comments for "
            f"{faculty.faculty_name} ({faculty.course_code})"
        )

        outcome = analyze_comments_sentiment(submitted)

        if outcome.get("total_comments", 0) == 0:
            return CommentAnalysisResponse(
                success=False,
                analysis=None,
                message=outcome.get("message", "No valid comments to analyze")
            )

        # Echo the faculty details back alongside the analysis.
        faculty_fields = ("faculty_name", "staff_id", "course_code", "course_name")
        outcome["faculty_info"] = {
            field: getattr(faculty, field) for field in faculty_fields
        }

        return CommentAnalysisResponse(
            success=True,
            analysis=outcome,
            message=f"Successfully analyzed {outcome['total_comments']} comments"
        )

    except ValueError as ve:
        logger.warning(f"Validation error: {ve}")
        raise HTTPException(status_code=400, detail=str(ve))
    except Exception as e:
        logger.error(f"Analysis failed: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail="Analysis failed. Please try again later."
        )
740
+
741
@app.get("/config")
async def get_config(settings: Settings = Depends(get_settings)):
    """Expose a subset of the active settings, only when debug mode is on.

    Raises:
        HTTPException: 404 when ``debug_mode`` is off.
    """
    if not settings.debug_mode:
        raise HTTPException(status_code=404, detail="Not found")

    exposed = (
        "max_comments_per_request",
        "vader_pos_threshold",
        "vader_neg_threshold",
        "roberta_pos_threshold",
        "roberta_neg_threshold",
        "enable_caching",
        "batch_size",
    )
    return {name: getattr(settings, name) for name in exposed}
756
+
757
+ if __name__ == "__main__":
758
+ uvicorn.run(
759
+ app,
760
+ host="0.0.0.0",
761
+ port=8000,
762
+ log_level="info"
763
+ )
model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/.no_exist/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/added_tokens.json ADDED
File without changes
model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/.no_exist/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/chat_template.jinja ADDED
File without changes
model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/.no_exist/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/model.safetensors ADDED
File without changes
model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/.no_exist/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/model.safetensors.index.json ADDED
File without changes
model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/.no_exist/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/tokenizer.json ADDED
File without changes
model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/.no_exist/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/tokenizer_config.json ADDED
File without changes
model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/refs/main ADDED
@@ -0,0 +1 @@
 
 
1
+ daefdd1f6ae931839bce4d0f3db0a1a4265cd50f
model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/refs/refs/pr/19 ADDED
@@ -0,0 +1 @@
 
 
1
+ 5ecd13590450ab163e43547492b0cfc49f16629b
model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/5ecd13590450ab163e43547492b0cfc49f16629b/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:727b715c623b78b1842f8e257b2f6b4b314a86c0c944d46d0784ce3009982a68
3
+ size 498620100
model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "tweeteval_new/roberta-base-rt-sentiment/",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "eos_token_id": 2,
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "LABEL_0",
15
+ "1": "LABEL_1",
16
+ "2": "LABEL_2"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "label2id": {
21
+ "LABEL_0": 0,
22
+ "LABEL_1": 1,
23
+ "LABEL_2": 2
24
+ },
25
+ "layer_norm_eps": 1e-05,
26
+ "max_position_embeddings": 514,
27
+ "model_type": "roberta",
28
+ "num_attention_heads": 12,
29
+ "num_hidden_layers": 12,
30
+ "pad_token_id": 1,
31
+ "type_vocab_size": 1,
32
+ "vocab_size": 50265
33
+ }
model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c37a3484c55954cd75b336a85f1e0c023ae874f3a73b05d2418dd04828e293b1
3
+ size 498679497
model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": "<mask>"}
model_cache/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.111.0
2
+ uvicorn==0.30.1
3
+ pydantic==2.8.2
4
+ pydantic-settings==2.10.1
5
+ pandas==2.3.1
6
+ numpy==2.1.3
7
+ nltk==3.9.1
8
+ torch==2.8.0
9
+ transformers==4.56.2
10
+ scipy==1.16.2