# iqac_fast_api / fastapi_example.py
"""
Enhanced FastAPI Service for Comment Sentiment Analysis
Version 3.0.0 - Major accuracy improvements with advanced classification
Features:
- Multi-stage sentiment detection
- Context-aware negative pattern matching
- Improved neutral/meta-comment detection
- Enhanced accuracy through ensemble approach
"""
from fastapi import FastAPI, HTTPException, Depends
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field, validator
from pydantic_settings import BaseSettings
from typing import List, Dict, Any, Optional
from functools import lru_cache
import uvicorn
import pandas as pd
import numpy as np
import os
import re
from datetime import datetime
import logging
# Configure logging
# Root-logger config: INFO level with timestamped, module-tagged lines.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger used throughout this service.
logger = logging.getLogger(__name__)
# NLTK Setup
import nltk
import ssl
# Work around environments with broken/missing SSL certificate chains so
# nltk.download() (HTTPS) does not fail; Pythons without
# _create_unverified_context are left untouched.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
# Keep NLTK data in a writable, container-friendly location and make it
# the first path NLTK searches.
nltk_data_dir = '/tmp/nltk_data'
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.insert(0, nltk_data_dir)
def ensure_nltk_data():
    """Download any missing NLTK resources into the writable data dir.

    Each resource is probed with nltk.data.find() first so startup stays
    fast when everything is already cached. Download failures are logged
    but do not abort startup.
    """
    # Resource name -> path used by nltk.data.find() for the existence check.
    lookup_paths = {
        'vader_lexicon': 'sentiment/vader_lexicon.zip',
        'punkt': 'tokenizers/punkt',
        'stopwords': 'corpora/stopwords',
        'wordnet': 'corpora/wordnet',
        'omw-1.4': 'corpora/omw-1.4',
    }
    for name, find_path in lookup_paths.items():
        try:
            nltk.data.find(find_path)
            logger.info(f"✓ NLTK resource '{name}' already available")
        except LookupError:
            logger.info(f"Downloading NLTK resource '{name}'...")
            try:
                nltk.download(name, download_dir=nltk_data_dir, quiet=False)
                logger.info(f"✓ Successfully downloaded '{name}'")
            except Exception as e:
                logger.error(f"✗ Failed to download '{name}': {e}")
logger.info("Ensuring NLTK data is available...")
ensure_nltk_data()
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from scipy.special import softmax
import torch
# Configuration
class Settings(BaseSettings):
    """Application settings with defaults, overridable via env vars / .env.

    NOTE(review): values below are defaults only; pydantic-settings reads
    matching environment variables at construction — verify deployment env.
    """
    app_name: str = "Comment Analysis API"
    app_version: str = "3.0.0"
    debug_mode: bool = False
    # Request-size guards, enforced by CommentAnalysisRequest validation.
    max_comments_per_request: int = 1000
    max_comment_length: int = 5000  # characters per comment
    min_comment_words: int = 1      # minimum word count per comment
    # Enhanced thresholds for better accuracy
    # VADER compound-score thresholds (compound ranges over -1..1).
    vader_strong_pos_threshold: float = 0.5
    vader_pos_threshold: float = 0.2
    vader_neg_threshold: float = -0.2
    vader_strong_neg_threshold: float = -0.5
    # RoBERTa per-class probability thresholds (0..1).
    roberta_strong_pos_threshold: float = 0.70
    roberta_pos_threshold: float = 0.55
    roberta_neg_threshold: float = 0.40
    roberta_strong_neg_threshold: float = 0.60
    # Adjusted weights for better accuracy; get_settings() normalizes these
    # to sum to 1.0 if they drift.
    combined_weight_vader: float = 0.4
    combined_weight_roberta: float = 0.6
    model_cache_dir: str = "/tmp/model_cache"
    roberta_model_name: str = "cardiffnlp/twitter-roberta-base-sentiment"
    # Optional BART-based abstractive summary of negative comments;
    # when disabled, an extractive "join top-3" summary is used instead.
    use_abstractive_summary: bool = False
    summarizer_model: str = "facebook/bart-large-cnn"
    max_summary_length: int = 100
    min_summary_length: int = 25
    # VADER result caching (see vader_sentiment_cached).
    enable_caching: bool = True
    cache_size: int = 500
    batch_size: int = 32  # RoBERTa inference batch size

    class Config:
        env_file = ".env"
        env_file_encoding = 'utf-8'
        extra = 'ignore'
@lru_cache()
def get_settings() -> Settings:
    """Build and cache the singleton Settings instance.

    Also normalizes the two ensemble weights so they always sum to 1.0,
    logging a warning when the configured values drift outside tolerance.
    """
    settings = Settings()
    weight_sum = settings.combined_weight_vader + settings.combined_weight_roberta
    drifted = weight_sum < 0.99 or weight_sum > 1.01
    if drifted:
        logger.warning(f"Weights sum to {weight_sum}, normalizing to 1.0")
        settings.combined_weight_vader /= weight_sum
        settings.combined_weight_roberta /= weight_sum
    return settings
# Pydantic Models
class FacultyInfo(BaseModel):
    """Identifies the faculty member and course the comments belong to."""
    faculty_name: str = Field(..., min_length=1, max_length=200)
    staff_id: str = Field(..., min_length=1, max_length=50)
    course_code: str = Field(..., min_length=1, max_length=50)
    course_name: str = Field(..., min_length=1, max_length=200)
class CommentAnalysisRequest(BaseModel):
    """Request body for POST /analyze-comments."""
    # NOTE(review): `validator` / `min_items` are pydantic-v1 style APIs;
    # if the project moves to pydantic v2 semantics, switch to
    # field_validator / min_length.
    comments: List[str] = Field(..., min_items=1)
    faculty_info: FacultyInfo

    @validator('comments')
    def validate_comments(cls, v):
        """Enforce batch-size and per-comment length limits from Settings."""
        settings = get_settings()
        if len(v) > settings.max_comments_per_request:
            raise ValueError(f'Maximum {settings.max_comments_per_request} comments per request')
        for idx, comment in enumerate(v):
            if len(comment) > settings.max_comment_length:
                raise ValueError(f'Comment {idx} exceeds maximum length of {settings.max_comment_length} characters')
        return v
class SentimentDistribution(BaseModel):
    """Percentage split of comments per sentiment class (sums to ~100)."""
    positive_percentage: float
    negative_percentage: float
    neutral_percentage: float
class DetailedScores(BaseModel):
    """Average per-model sentiment scores across all analyzed comments."""
    average_positive: float
    average_negative: float
    average_neutral: float
    # VADER-only; RoBERTa has no compound score, hence Optional.
    average_compound: Optional[float] = None
class DetailedAnalysis(BaseModel):
    """Per-model score breakdown included in the analysis payload."""
    vader_scores: DetailedScores
    roberta_scores: DetailedScores
class AnalysisResult(BaseModel):
    """Full analysis payload returned inside CommentAnalysisResponse."""
    total_comments: int
    positive_comments: int
    negative_comments: int
    neutral_comments: int
    # Blended (VADER + RoBERTa) average scores, rounded to 3 decimals.
    positive_sentiment: float
    negative_sentiment: float
    neutral_sentiment: float
    overall_sentiment: str  # "Positive" | "Negative" | "Neutral"
    sentiment_distribution: SentimentDistribution
    negative_comments_summary: str
    negative_comments_list: List[str]
    key_insights: List[str]
    recommendations: List[str]
    detailed_analysis: DetailedAnalysis
    faculty_info: Dict[str, str]
    analysis_timestamp: str  # ISO-8601 timestamp, set server-side
class CommentAnalysisResponse(BaseModel):
    """Envelope for /analyze-comments: analysis is None when success is False."""
    success: bool
    analysis: Optional[AnalysisResult] = None
    message: str
# Initialize FastAPI
# Initialize FastAPI
app = FastAPI(
    title=get_settings().app_name,
    version=get_settings().app_version,
    description="Advanced sentiment analysis service for educational feedback"
)
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# wide open; tighten to known origins before exposing this publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Global model variables
sia = None
tokenizer = None
model = None
device = None
summarizer = None
# ============================================================================
# ENHANCED PATTERN DETECTION FOR BETTER ACCURACY
# ============================================================================
# Meta-comments (not actual feedback - should be NEUTRAL)
# Anchored (^...$) whole-comment match: the ENTIRE comment must be one of
# these boilerplate phrases to count as "meta" (no real feedback given).
META_PATTERNS = re.compile(
    r'^(no\s+(negative\s+)?(more\s+)?(comments?|feedback|remarks?|issues?|problems?|complaints?)|'
    r'(everything|all)\s+(is\s+)?(good|fine|ok(ay)?|great|perfect|excellent)|'
    r'nothing(\s+to\s+(say|comment|mention|add))?|'
    r'(nil|none|na|n/a|nill)\.?|'
    # NOTE(review): the inner '^' below is redundant inside an already
    # ^-anchored alternation; harmless, kept as-is.
    r'^(all\s+)?(good|fine|ok(ay)?|great|nice)\.?|'
    r'no\s+remarks?|'
    r'everything\s+at\s+the\s+too\s+only)$',
    re.IGNORECASE
)
# Strong NEGATIVE indicators (should override model scores)
# Strong NEGATIVE indicators (should override model scores).
# Word-boundary-anchored alternation of explicit-criticism phrasings,
# grouped by theme; a single .search() hit marks the comment strongly
# negative regardless of the model scores.
STRONG_NEGATIVE_PATTERN = re.compile(
    r'\b('
    # Direct criticism
    r'(very|extremely|quite|so|too)\s+(poor|bad|weak|terrible|awful|horrible)|'
    r'poor\s+(teaching|teacher|faculty|knowledge|communication|quality|explanation)|'
    r'bad\s+(teaching|teacher|faculty|quality|explanation)|'
    r'terrible|horrible|awful|pathetic|useless|waste\s+of\s+time|'
    # Teaching quality issues
    r'(teaching|knowledge)\s+(is\s+)?(poor|bad|weak|lacking|insufficient|not\s+good)|'
    r'cannot\s+teach|can\'?t\s+teach|doesn\'?t\s+know\s+how\s+to\s+teach|'
    r'not\s+teaching\s+properly|teaching\s+method\s+is\s+(poor|bad)|'
    # Boring/disengagement
    r'(boring|dull|monotonous)\s+(class|classes|subject|lecture|lectures|sessions?)|'
    r'(class|classes|subject|lectures?)\s+(is|are)\s+(boring|dull|monotonous|uninteresting)|'
    r'sleeping\s+in\s+class|fall\s+asleep|makes?\s+us\s+sleep|'
    # Communication issues
    r'(low|soft|quiet|unclear)\s+voice|voice\s+(is\s+)?(low|soft|quiet|not\s+clear)|'
    r'(cannot|can\'?t|cant|unable\s+to)\s+hear|difficult\s+to\s+hear|'
    r'(not|poor|bad)\s+(communication|explaining|explanation)|'
    # Understanding issues
    r'(cannot|can\'?t|cant|unable\s+to|difficult\s+to|hard\s+to)\s+understand|'
    r'(not|never|don\'?t)\s+(able\s+to\s+)?understand|'
    r'(concepts?|topics?|subjects?)\s+(are\s+)?(difficult|hard|tough|impossible)\s+to\s+understand|'
    r'makes?\s+(no|little)\s+sense|doesn\'?t\s+make\s+sense|'
    # Improvement needed
    r'(need|needs|require|requires)\s+(urgent|serious|immediate|much|lot\s+of)?\s*improvement|'
    r'(should|must|have\s+to)\s+improve\s+(a\s+lot|more|urgently)|'
    # Pace issues
    r'(lectures?|class(es)?|teaching)\s+(is|are|going)\s+(too|very)\s+(fast|slow)|'
    r'(too|very|extremely)\s+(fast|slow|rush|rushed)|'
    r'(lag|lagging)\s+in\s+teaching|teaching\s+(is\s+)?lagging|'
    # Time management
    # BUGFIX: was '(managing|managing)' — a duplicated alternative; deduped.
    # Matches the exact same strings as before.
    r'(not|poor|bad|terrible)\s+managing\s+time|'
    r'time\s+management\s+(is\s+)?(poor|bad|terrible|lacking)|'
    r'always\s+(late|wasting\s+time)|waste\s+(our|class)\s+time|'
    # Lack of resources/support
    r'(no|not|insufficient|lack\s+of)\s+(proper|sufficient|enough|regular)?\s*(classes|notes|support|help)|'
    r'need\s+more\s+(staff|faculty|classes|support|help)|'
    r'no\s+(practical|hands[-\s]?on|lab|real[-\s]?world)|lack\s+of\s+practical|'
    # Attendance/engagement issues
    r'(just|only)\s+(for|going\s+for)\s+attendance|'
    r'going\s+(to|for)\s+(her|his|their)\s+class\s+(just|only)\s+for\s+attendance|'
    r'(not|no)\s+(interested|engaging|helpful|useful|at\s+all)|'
    r'no\s+interest\s+in\s+teaching|'
    # Administrative issues
    r'military\s+rules|too\s+strict|very\s+strict|'
    r'attendance\s+(issue|problem)|not\s+providing\s+attendance|'
    # Workload issues
    r'too\s+many\s+projects|many\s+projects\s+review|'
    r'placement\s+activities\s+(and|with)\s+attendance'
    r')\b',
    re.IGNORECASE
)
# Positive indicators (help identify positive comments)
# Positive indicators (help identify positive comments).
# Unlike STRONG_NEGATIVE_PATTERN, a hit here does NOT decide on its own:
# classify_sentiment_enhanced() also requires supporting model scores.
POSITIVE_PATTERN = re.compile(
    r'\b('
    r'(very|extremely|really|so|truly)\s+(good|great|excellent|amazing|wonderful|fantastic|helpful|knowledgeable|clear)|'
    r'excellent|outstanding|amazing|wonderful|fantastic|brilliant|superb|'
    r'(great|good|best|wonderful)\s+(teaching|teacher|faculty|knowledge|explanation|professor|sir|madam)|'
    r'(teaching|explanation|knowledge)\s+(is\s+)?(excellent|outstanding|very\s+good|great|clear)|'
    r'explains?\s+(very\s+)?(well|clearly|nicely|perfectly)|'
    r'(easy|easier)\s+to\s+understand|clear\s+explanation|'
    r'(very\s+)?(helpful|supportive|friendly|approachable|patient)|'
    r'(good|strong|deep|vast)\s+(knowledge|understanding)|'
    r'(love|like|enjoy|appreciate)\s+(the\s+)?(class|classes|teaching|subject|course|lectures?)|'
    r'learned?\s+(a\s+lot|so\s+much|many\s+things)|'
    r'inspired?|inspiring|motivating|motivated|encouraged|'
    r'(best|favourite|favorite)\s+(teacher|faculty|professor)|'
    r'highly\s+recommend|strongly\s+recommend|'
    r'grateful|thankful|blessed|lucky\s+to\s+have|'
    r'satisfied|happy\s+with|pleased\s+with|'
    r'(always|very)\s+(available|accessible|helpful)|'
    r'patient|caring|dedicated|passionate|'
    r'interactive\s+class|engaging\s+class|interesting\s+class'
    r')\b',
    re.IGNORECASE
)
# Weak negative indicators (suggestions/mild criticism - might be NEUTRAL)
# Weak negative indicators (suggestions / mild criticism). A hit here with
# low blended negative scores is classified Neutral rather than Negative.
WEAK_NEGATIVE_PATTERN = re.compile(
    r'\b('
    r'could\s+(be\s+)?better|'
    r'can\s+improve|'
    r'would\s+be\s+good\s+if|'
    r'suggest|suggestion|'
    r'maybe|perhaps|'
    r'slightly|a\s+bit|'
    r'sometimes|occasionally'
    r')\b',
    re.IGNORECASE
)
def is_meta_comment(text: str) -> bool:
    """Return True for comments that are not actual feedback.

    Empty / near-empty strings and boilerplate like "no comments" or "nil"
    (see META_PATTERNS) count as meta; they are later classified Neutral.
    """
    if not text:
        return True
    stripped = text.strip()
    if len(stripped) < 3:
        return True
    return META_PATTERNS.match(stripped) is not None
def detect_strong_negative(text: str) -> bool:
    """True when the comment contains explicit, strong criticism.

    Meta-comments are excluded first so boilerplate like "no negative
    comments" never trips the negative regex.
    """
    if not text:
        return False
    if is_meta_comment(text):
        return False
    return STRONG_NEGATIVE_PATTERN.search(text) is not None
def detect_positive(text: str) -> bool:
    """True when the comment contains explicitly positive phrasing.

    Meta-comments ("everything is good", etc.) are excluded first, since
    they are boilerplate rather than genuine praise.
    """
    if not text:
        return False
    if is_meta_comment(text):
        return False
    return POSITIVE_PATTERN.search(text) is not None
def detect_weak_negative(text: str) -> bool:
    """True when the comment reads as a mild suggestion rather than criticism.

    Used by the classifier to steer borderline comments toward Neutral.
    """
    if not text:
        return False
    if is_meta_comment(text):
        return False
    return WEAK_NEGATIVE_PATTERN.search(text) is not None
# ============================================================================
# MODEL INITIALIZATION
# ============================================================================
def initialize_models():
    """Load all models into the module-level globals (called at startup).

    Loads VADER, the RoBERTa sentiment tokenizer/model (GPU if available),
    and optionally the abstractive summarizer. The summarizer is
    best-effort — its failure is logged and analysis falls back to the
    extractive summary. Any other failure is re-raised so startup aborts.
    """
    global sia, tokenizer, model, device, summarizer
    try:
        settings = get_settings()
        logger.info("Initializing sentiment analysis models...")
        # VADER (lexicon downloaded earlier by ensure_nltk_data()).
        sia = SentimentIntensityAnalyzer()
        logger.info("✓ VADER initialized")
        # RoBERTa, cached on disk so restarts avoid re-downloading.
        cache_dir = settings.model_cache_dir
        os.makedirs(cache_dir, exist_ok=True)
        tokenizer = AutoTokenizer.from_pretrained(
            settings.roberta_model_name,
            cache_dir=cache_dir
        )
        model = AutoModelForSequenceClassification.from_pretrained(
            settings.roberta_model_name,
            cache_dir=cache_dir
        )
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)
        model.eval()  # inference mode (disables dropout etc.)
        logger.info(f"✓ RoBERTa initialized on device: {device}")
        # Summarizer (optional, behind the use_abstractive_summary flag).
        if settings.use_abstractive_summary:
            try:
                summarizer = pipeline(
                    "summarization",
                    model=settings.summarizer_model,
                    device=0 if device == "cuda" else -1
                )
                logger.info("✓ Summarizer initialized")
            except Exception as e:
                logger.warning(f"Summarizer initialization failed: {e}")
                summarizer = None
        logger.info("✓ All models initialized successfully")
    except Exception as e:
        logger.error(f"Error initializing models: {e}")
        raise e
# ============================================================================
# SENTIMENT ANALYSIS FUNCTIONS
# ============================================================================
@lru_cache(maxsize=500)
def vader_sentiment_cached(text: str) -> tuple:
    """Memoized VADER polarity scores as a (neg, neu, pos, compound) tuple.

    A tuple (rather than VADER's dict) is returned so the cached value is
    immutable.
    """
    polarity = sia.polarity_scores(text)
    return (polarity['neg'], polarity['neu'], polarity['pos'], polarity['compound'])
def vader_sentiment(text: str) -> Dict[str, float]:
    """Run VADER on one comment, returning 'vader_*'-prefixed scores.

    Uses the memoized path when caching is enabled. Falls back to a fully
    neutral score dict if VADER raises, so one bad comment cannot abort a
    batch.
    """
    try:
        if get_settings().enable_caching:
            neg, neu, pos, compound = vader_sentiment_cached(text)
        else:
            raw = sia.polarity_scores(text)
            neg, neu, pos, compound = raw['neg'], raw['neu'], raw['pos'], raw['compound']
        return {
            'vader_neg': neg,
            'vader_neu': neu,
            'vader_pos': pos,
            'vader_compound': compound,
        }
    except Exception as e:
        logger.warning(f"VADER analysis failed: {e}")
        return {'vader_neg': 0.0, 'vader_neu': 1.0, 'vader_pos': 0.0, 'vader_compound': 0.0}
def roberta_sentiment_batch(texts: List[str]) -> List[Dict[str, float]]:
    """Score all texts with RoBERTa in settings.batch_size chunks.

    Returns one dict of softmax probabilities per input text. The index
    order assumes the cardiffnlp twitter-roberta-base-sentiment label
    layout [negative, neutral, positive] — TODO confirm if the model name
    is ever changed via settings. On any failure the entire batch falls
    back to all-neutral scores so the pipeline keeps running.
    """
    try:
        settings = get_settings()
        results = []
        for i in range(0, len(texts), settings.batch_size):
            batch = texts[i:i + settings.batch_size]
            encoded = tokenizer(
                batch,
                return_tensors='pt',
                truncation=True,  # clip inputs to the model's token limit
                max_length=512,
                padding=True
            )
            # Move input tensors onto the same device as the model.
            encoded = {k: v.to(device) for k, v in encoded.items()}
            with torch.no_grad():
                outputs = model(**encoded)
            for output in outputs.logits:
                scores = softmax(output.cpu().numpy())
                results.append({
                    'roberta_neg': float(scores[0]),
                    'roberta_neu': float(scores[1]),
                    'roberta_pos': float(scores[2])
                })
        return results
    except Exception as e:
        logger.warning(f"RoBERTa batch analysis failed: {e}")
        return [{'roberta_neg': 0.0, 'roberta_neu': 1.0, 'roberta_pos': 0.0} for _ in texts]
def classify_sentiment_enhanced(row: pd.Series, settings: Settings) -> str:
    """Classify one scored comment as 'Positive' / 'Negative' / 'Neutral'.

    Multi-stage decision, evaluated in priority order:
      1. meta-comment flag                       -> Neutral
      2. strong-negative regex hit               -> Negative (overrides models)
      3. positive regex hit + strong model score -> Positive
      4. VADER/RoBERTa ensemble thresholds
      5. fallback                                -> Neutral
    """
    # Stage 1: meta-comments carry no sentiment signal.
    if row.get('is_meta', False):
        return 'Neutral'

    compound = row.get('vader_compound', 0.0)
    rob_pos = row.get('roberta_pos', 0.0)
    rob_neg = row.get('roberta_neg', 0.0)
    mix_pos = row.get('combined_pos', 0.0)
    mix_neg = row.get('combined_neg', 0.0)
    mix_neu = row.get('combined_neu', 0.0)

    # Stage 2: explicit criticism always wins.
    if row.get('has_strong_negative', False):
        return 'Negative'

    # Stage 3: positive phrasing confirmed by at least one strong model score.
    strong_model_positive = (
        compound >= settings.vader_strong_pos_threshold
        or rob_pos >= settings.roberta_strong_pos_threshold
        or (compound >= settings.vader_pos_threshold
            and rob_pos >= settings.roberta_pos_threshold)
    )
    if row.get('has_positive', False) and strong_model_positive:
        return 'Positive'

    # Stage 4a: strongly negative model agreement.
    if (compound <= settings.vader_strong_neg_threshold
            or rob_neg >= settings.roberta_strong_neg_threshold
            or (compound <= settings.vader_neg_threshold
                and rob_neg >= settings.roberta_neg_threshold)):
        return 'Negative'

    # Stage 4b: the blended score clearly dominates (0.35 clarity floor).
    if mix_neg > mix_pos and mix_neg > mix_neu and mix_neg > 0.35:
        return 'Negative'
    if mix_pos > mix_neg and mix_pos > mix_neu and mix_pos > 0.35:
        return 'Positive'

    # Suggestions / mild criticism with weak negative scores read as neutral.
    if row.get('has_weak_negative', False) and mix_neg < 0.5:
        return 'Neutral'

    # Stage 5: default when nothing decisive fired.
    return 'Neutral'
def sanitize_text(text: str) -> str:
    """Strip control characters and collapse all whitespace to single spaces.

    Falsy input (None, empty string) yields "".
    """
    if not text:
        return ""
    # Remove ASCII control characters except \t, \n, \r (those are then
    # collapsed by split/join below anyway).
    without_controls = re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]', '', text)
    # split() drops leading/trailing whitespace, so no extra strip() needed.
    return ' '.join(without_controls.split())
# ============================================================================
# MAIN ANALYSIS FUNCTION
# ============================================================================
def _sentiment_insights(label: str, positive_count: int, negative_count: int,
                        neutral_count: int, total_comments: int):
    """Return (insights, recommendations) lists for the overall label."""
    if label == "Positive":
        insights = [
            f"Strong positive feedback: {positive_count}/{total_comments} comments ({round(positive_count/total_comments*100, 1)}%)",
            "Students are satisfied with the teaching approach",
            "High engagement and learning outcomes reported"
        ]
        recommendations = [
            "Continue current effective teaching methods",
            "Document successful practices for future reference",
            "Share best practices with colleagues"
        ]
    elif label == "Negative":
        insights = [
            f"Concerns identified: {negative_count}/{total_comments} negative comments ({round(negative_count/total_comments*100, 1)}%)",
            "Students facing challenges with current approach",
            "Immediate attention needed to address feedback"
        ]
        recommendations = [
            "Review and analyze specific negative feedback points",
            "Consider adjusting teaching pace or methods",
            "Increase student engagement and support",
            "Schedule student feedback sessions",
            "Focus on communication clarity and accessibility"
        ]
    else:
        insights = [
            f"Mixed feedback: {positive_count} positive, {negative_count} negative, {neutral_count} neutral",
            "Room for improvement while maintaining strengths",
            "Students have varied experiences"
        ]
        recommendations = [
            "Address specific concerns raised in negative feedback",
            "Build on positive aspects appreciated by students",
            "Gather more detailed feedback for neutral areas"
        ]
    return insights, recommendations


def analyze_comments_sentiment(comments: List[str]) -> Dict[str, Any]:
    """Run the full multi-stage sentiment pipeline over a list of comments.

    Pipeline:
      1. sanitize + length-filter the raw comments
      2. regex pattern flags (meta / strong-negative / positive / weak-negative)
      3. VADER + RoBERTa scoring, blended with the configured weights
      4. per-comment classification (classify_sentiment_enhanced)
      5. aggregate stats, negative-comment summary, insights, recommendations

    Returns a JSON-serializable dict matching AnalysisResult (minus
    faculty_info, which the endpoint injects), or a stub dict with
    total_comments == 0 when nothing survives filtering.

    Raises: re-raises any unexpected internal error after logging it.
    """
    try:
        settings = get_settings()
        logger.info(f"Received {len(comments)} comments for analysis")

        # --- 1. Sanitize and filter -------------------------------------
        sanitized_comments = [sanitize_text(comment) for comment in comments]
        # BUGFIX: the old filter compared the WORD count against
        # settings.max_comment_length (a CHARACTER limit), so the upper
        # bound checked the wrong unit and was effectively a no-op for
        # requests that passed validation. Each limit now uses its own unit.
        filtered_comments = [
            comment for comment in sanitized_comments
            if len(comment.split()) >= settings.min_comment_words
            and len(comment) <= settings.max_comment_length
        ]
        logger.info(f"After filtering: {len(filtered_comments)} valid comments")
        if not filtered_comments:
            return {
                "total_comments": 0,
                "message": "No valid comments found for analysis"
            }

        # --- 2. Regex pattern flags -------------------------------------
        df = pd.DataFrame({'comment': filtered_comments})
        df['is_meta'] = df['comment'].apply(is_meta_comment)
        df['has_strong_negative'] = df['comment'].apply(detect_strong_negative)
        df['has_positive'] = df['comment'].apply(detect_positive)
        df['has_weak_negative'] = df['comment'].apply(detect_weak_negative)
        logger.info(f"Meta: {df['is_meta'].sum()}, "
                    f"Strong Neg: {df['has_strong_negative'].sum()}, "
                    f"Positive: {df['has_positive'].sum()}, "
                    f"Weak Neg: {df['has_weak_negative'].sum()}")

        # --- 3. Model scoring + weighted blend --------------------------
        vader_df = pd.DataFrame([vader_sentiment(text) for text in df['comment']])
        roberta_df = pd.DataFrame(roberta_sentiment_batch(df['comment'].tolist()))
        final_df = pd.concat([df.reset_index(drop=True), vader_df, roberta_df], axis=1)
        for label in ('pos', 'neg', 'neu'):
            final_df[f'combined_{label}'] = (
                settings.combined_weight_vader * final_df[f'vader_{label}'] +
                settings.combined_weight_roberta * final_df[f'roberta_{label}']
            )

        # --- 4. Per-comment classification ------------------------------
        final_df['Overall_Sentiment'] = final_df.apply(
            lambda row: classify_sentiment_enhanced(row, settings),
            axis=1
        )

        # --- 5. Aggregate statistics ------------------------------------
        total_comments = len(final_df)
        counts = final_df['Overall_Sentiment'].value_counts()
        positive_count = int(counts.get('Positive', 0))
        negative_count = int(counts.get('Negative', 0))
        neutral_count = int(counts.get('Neutral', 0))
        logger.info(f"Classification Results - Pos: {positive_count}, Neg: {negative_count}, Neu: {neutral_count}")
        avg_positive = float(final_df['combined_pos'].mean())
        avg_negative = float(final_df['combined_neg'].mean())
        avg_neutral = float(final_df['combined_neu'].mean())
        # Overall label: the dominant blended average; ties fall to Neutral.
        if avg_positive > max(avg_negative, avg_neutral):
            overall_sentiment_label = "Positive"
        elif avg_negative > max(avg_positive, avg_neutral):
            overall_sentiment_label = "Negative"
        else:
            overall_sentiment_label = "Neutral"

        # --- Negative-comment summary -----------------------------------
        negative_summary = ""
        negative_comments_list = []
        negative_comments = final_df[final_df['Overall_Sentiment'] == 'Negative']
        if len(negative_comments) > 0:
            negative_comments_list = negative_comments['comment'].tolist()
            try:
                # Summarize the (up to) 3 most negative comments by blended score.
                top_idx = negative_comments['combined_neg'].nlargest(min(3, len(negative_comments))).index
                top_comments = negative_comments.loc[top_idx, 'comment'].tolist()
                if settings.use_abstractive_summary and summarizer is not None:
                    # Keep summarizer input bounded.
                    negative_text = " ".join(top_comments)[:1000]
                    summary_result = summarizer(
                        negative_text,
                        max_length=settings.max_summary_length,
                        min_length=settings.min_summary_length,
                        do_sample=False
                    )
                    negative_summary = summary_result[0]['summary_text']
                else:
                    negative_summary = "; ".join(top_comments)
            except Exception as e:
                # Best-effort: fall back to the first few negatives verbatim.
                logger.warning(f"Summary generation failed: {e}")
                negative_summary = "; ".join(negative_comments_list[:3])

        # --- Insights & recommendations ---------------------------------
        insights, recommendations = _sentiment_insights(
            overall_sentiment_label, positive_count, negative_count,
            neutral_count, total_comments
        )
        strong_neg_hits = int(df['has_strong_negative'].sum())
        if strong_neg_hits > 0:
            insights.append(f"{strong_neg_hits} comments contain explicit criticism requiring attention")
        positive_hits = int(df['has_positive'].sum())
        if positive_hits > 0:
            insights.append(f"{positive_hits} comments contain strong positive appreciation")

        return {
            "total_comments": total_comments,
            "positive_comments": positive_count,
            "negative_comments": negative_count,
            "neutral_comments": neutral_count,
            "positive_sentiment": round(avg_positive, 3),
            "negative_sentiment": round(avg_negative, 3),
            "neutral_sentiment": round(avg_neutral, 3),
            "overall_sentiment": overall_sentiment_label,
            "sentiment_distribution": {
                "positive_percentage": round((positive_count / total_comments) * 100, 1),
                "negative_percentage": round((negative_count / total_comments) * 100, 1),
                "neutral_percentage": round((neutral_count / total_comments) * 100, 1)
            },
            "negative_comments_summary": negative_summary,
            "negative_comments_list": negative_comments_list,
            "key_insights": insights,
            "recommendations": recommendations,
            "detailed_analysis": {
                # float() so the payload carries plain JSON numbers, matching
                # the combined averages above (pandas means are numpy scalars).
                "vader_scores": {
                    "average_positive": round(float(final_df['vader_pos'].mean()), 3),
                    "average_negative": round(float(final_df['vader_neg'].mean()), 3),
                    "average_neutral": round(float(final_df['vader_neu'].mean()), 3),
                    "average_compound": round(float(final_df['vader_compound'].mean()), 3)
                },
                "roberta_scores": {
                    "average_positive": round(float(final_df['roberta_pos'].mean()), 3),
                    "average_negative": round(float(final_df['roberta_neg'].mean()), 3),
                    "average_neutral": round(float(final_df['roberta_neu'].mean()), 3)
                }
            },
            # NOTE(review): utcnow() is deprecated in Python 3.12+; kept to
            # preserve the existing naive-UTC timestamp format.
            "analysis_timestamp": datetime.utcnow().isoformat()
        }
    except Exception as e:
        logger.error(f"Sentiment analysis failed: {e}", exc_info=True)
        raise e
# ============================================================================
# API ENDPOINTS
# ============================================================================
@app.on_event("startup")
async def startup_event():
    """Load all models before serving traffic; abort startup on failure.

    NOTE(review): @app.on_event is deprecated in recent FastAPI versions in
    favor of lifespan handlers — consider migrating.
    """
    try:
        logger.info("=" * 80)
        logger.info(f"Application Startup at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        logger.info("=" * 80)
        initialize_models()
        logger.info("✓ Service started successfully")
        logger.info("=" * 80)
    except Exception as e:
        # Re-raise so the server refuses to start without working models.
        logger.error(f"✗ Startup failed: {e}")
        raise e
@app.on_event("shutdown")
async def shutdown_event():
    """Log shutdown; models need no explicit teardown."""
    logger.info("Service shutting down")
@app.get("/")
async def root():
    """Service metadata plus a map of the available endpoints."""
    settings = get_settings()
    return {
        "service": settings.app_name,
        "version": settings.app_version,
        "status": "running",
        "endpoints": {
            "health": "/health",
            "analyze": "/analyze-comments",
            "test": "/test",
        },
    }
@app.get("/health")
async def health_check():
    """Liveness/readiness probe: reports whether all models are loaded."""
    ready = all(obj is not None for obj in (sia, model, tokenizer))
    return {
        "status": "healthy" if ready else "unhealthy",
        "service": "comment-analysis",
        "version": get_settings().app_version,
        "models_loaded": ready,
        "device": device if device else "not initialized",
        "timestamp": datetime.utcnow().isoformat(),
    }
@app.post("/analyze-comments", response_model=CommentAnalysisResponse)
async def analyze_comments(
    request: CommentAnalysisRequest,
    settings: Settings = Depends(get_settings)
):
    """Run the sentiment pipeline over the request's comments and wrap the
    result (with faculty metadata attached) in a CommentAnalysisResponse."""
    try:
        info = request.faculty_info
        if not request.comments:
            return CommentAnalysisResponse(
                success=False,
                analysis=None,
                message="No comments provided for analysis"
            )
        logger.info(
            f"Analyzing {len(request.comments)} comments for {info.faculty_name} ({info.course_code})"
        )
        result = analyze_comments_sentiment(request.comments)
        if result.get("total_comments", 0) == 0:
            # Everything was filtered out (empty / too short / too long).
            return CommentAnalysisResponse(
                success=False,
                analysis=None,
                message=result.get("message", "No valid comments to analyze")
            )
        # Attach the faculty metadata the pipeline itself doesn't know about.
        result["faculty_info"] = {
            "faculty_name": info.faculty_name,
            "staff_id": info.staff_id,
            "course_code": info.course_code,
            "course_name": info.course_name,
        }
        return CommentAnalysisResponse(
            success=True,
            analysis=result,
            message=f"Successfully analyzed {result['total_comments']} comments"
        )
    except ValueError as ve:
        logger.warning(f"Validation error: {ve}")
        raise HTTPException(status_code=400, detail=str(ve))
    except Exception as e:
        logger.error(f"Analysis failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Analysis failed. Please try again later.")
@app.get("/test")
async def test_endpoint():
    """Pattern-detector smoke test over a fixed set of sample comments."""
    test_cases = [
        # Meta-comments (should be Neutral)
        "No negative comments",
        "Everything is good",
        "Nothing to say",
        "Nil",
        # Strong Negative (should be Negative)
        "Very poor teaching quality",
        "Boring class, waste of time",
        "Cannot understand anything",
        "Teaching is terrible and voice is too low",
        "Poor knowledge and bad teaching method",
        # Positive (should be Positive)
        "Excellent teacher with great knowledge",
        "Very helpful and explains clearly",
        "Amazing teaching style, learned a lot",
        "Best professor, highly recommend",
        # Weak negative/Neutral
        "Could be better",
        "Sometimes hard to understand",
        "Overall good but too lag",
        # Mixed
        "Good teacher but classes are boring",
        "Knowledgeable but voice is low"
    ]
    results = []
    for text in test_cases:
        flags = {
            "is_meta": is_meta_comment(text),
            "strong_negative": detect_strong_negative(text),
            "positive": detect_positive(text),
            "weak_negative": detect_weak_negative(text),
        }
        # Predict with the same priority order used by the real classifier.
        if flags["is_meta"]:
            predicted = "Neutral (meta-comment)"
        elif flags["strong_negative"]:
            predicted = "Negative (strong pattern)"
        elif flags["positive"]:
            predicted = "Positive (likely)"
        elif flags["weak_negative"]:
            predicted = "Neutral/Negative (weak)"
        else:
            predicted = "Requires full analysis"
        results.append({"text": text, **flags, "predicted": predicted})
    return {
        "test_results": results,
        "note": "Predictions based on pattern matching. Full analysis uses VADER + RoBERTa ensemble."
    }
# Entry point for local development: serve the app with uvicorn,
# listening on all interfaces at port 8000.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")
# """
# Enhanced FastAPI Service for Comment Sentiment Analysis
# with improved performance, validation, and configuration management
# Version 2.1.0 - Updated with bug fixes and improvements
# """
# from fastapi import FastAPI, HTTPException, Depends
# from fastapi.middleware.cors import CORSMiddleware
# from pydantic import BaseModel, Field, validator
# from pydantic_settings import BaseSettings
# from typing import List, Dict, Any, Optional
# from functools import lru_cache
# import uvicorn
# import pandas as pd
# import numpy as np
# import os
# import re
# from datetime import datetime
# import logging
# # Configure logging FIRST
# logging.basicConfig(
# level=logging.INFO,
# format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
# )
# logger = logging.getLogger(__name__)
# # CRITICAL: Download NLTK data BEFORE importing NLTK components
# import nltk
# import ssl
# try:
# _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
# pass
# else:
# ssl._create_default_https_context = _create_unverified_https_context
# # Set NLTK data path
# nltk_data_dir = '/tmp/nltk_data'
# os.makedirs(nltk_data_dir, exist_ok=True)
# nltk.data.path.insert(0, nltk_data_dir)
# # Download required NLTK data
# def ensure_nltk_data():
# """Ensure all required NLTK data is downloaded"""
# resources = ['vader_lexicon', 'punkt', 'stopwords', 'wordnet', 'omw-1.4']
# for resource in resources:
# try:
# # Try to find the resource
# if resource == 'vader_lexicon':
# nltk.data.find('sentiment/vader_lexicon.zip')
# elif resource == 'punkt':
# nltk.data.find('tokenizers/punkt')
# elif resource in ['stopwords', 'wordnet', 'omw-1.4']:
# nltk.data.find(f'corpora/{resource}')
# logger.info(f"✓ NLTK resource '{resource}' already available")
# except LookupError:
# logger.info(f"Downloading NLTK resource '{resource}'...")
# try:
# nltk.download(resource, download_dir=nltk_data_dir, quiet=False)
# logger.info(f"✓ Successfully downloaded '{resource}'")
# except Exception as e:
# logger.error(f"✗ Failed to download '{resource}': {e}")
# # Download NLTK data immediately
# logger.info("Ensuring NLTK data is available...")
# ensure_nltk_data()
# # NOW import NLTK components
# from nltk.sentiment import SentimentIntensityAnalyzer
# # Import transformers after NLTK setup
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
# from scipy.special import softmax
# import torch
# # Configuration Management
# class Settings(BaseSettings):
# """Application settings with environment variable support"""
# # API Settings
# app_name: str = "Comment Analysis API"
# app_version: str = "2.1.0"
# debug_mode: bool = False
# # Request Limits
# max_comments_per_request: int = 1000
# max_comment_length: int = 5000
# min_comment_words: int = 1
# # Sentiment Thresholds
# vader_pos_threshold: float = 0.2
# vader_neg_threshold: float = -0.2
# roberta_pos_threshold: float = 0.55
# roberta_neg_threshold: float = 0.45
# combined_weight_vader: float = 0.5
# combined_weight_roberta: float = 0.5
# # Model Settings
# model_cache_dir: str = "/tmp/model_cache"
# roberta_model_name: str = "cardiffnlp/twitter-roberta-base-sentiment"
# use_abstractive_summary: bool = False
# summarizer_model: str = "facebook/bart-large-cnn"
# max_summary_length: int = 100
# min_summary_length: int = 25
# # Performance
# enable_caching: bool = True
# cache_size: int = 500
# batch_size: int = 32
# class Config:
# env_file = ".env"
# env_file_encoding = 'utf-8'
# extra = 'ignore'
# @validator('min_comment_words')
# def validate_min_words(cls, v):
# if v < 0:
# raise ValueError('min_comment_words must be non-negative')
# return v
# @validator('combined_weight_vader', 'combined_weight_roberta')
# def validate_weights(cls, v):
# if not 0 <= v <= 1:
# raise ValueError('Weights must be between 0 and 1')
# return v
# @lru_cache()
# def get_settings() -> Settings:
# """Cached settings instance"""
# settings = Settings()
# # Normalize weights if needed
# total = settings.combined_weight_vader + settings.combined_weight_roberta
# if not (0.99 <= total <= 1.01):
# logger.warning(f"Weights sum to {total}, normalizing to 1.0")
# settings.combined_weight_vader /= total
# settings.combined_weight_roberta /= total
# return settings
# # Pydantic Models
# class FacultyInfo(BaseModel):
# faculty_name: str = Field(..., min_length=1, max_length=200)
# staff_id: str = Field(..., min_length=1, max_length=50)
# course_code: str = Field(..., min_length=1, max_length=50)
# course_name: str = Field(..., min_length=1, max_length=200)
# class CommentAnalysisRequest(BaseModel):
# comments: List[str] = Field(..., min_items=1)
# faculty_info: FacultyInfo
# @validator('comments')
# def validate_comments(cls, v):
# settings = get_settings()
# if len(v) > settings.max_comments_per_request:
# raise ValueError(
# f'Maximum {settings.max_comments_per_request} comments per request'
# )
# for idx, comment in enumerate(v):
# if len(comment) > settings.max_comment_length:
# raise ValueError(
# f'Comment {idx} exceeds maximum length of {settings.max_comment_length} characters'
# )
# return v
# class SentimentDistribution(BaseModel):
# positive_percentage: float
# negative_percentage: float
# neutral_percentage: float
# class DetailedScores(BaseModel):
# average_positive: float
# average_negative: float
# average_neutral: float
# average_compound: Optional[float] = None
# class DetailedAnalysis(BaseModel):
# vader_scores: DetailedScores
# roberta_scores: DetailedScores
# class AnalysisResult(BaseModel):
# total_comments: int
# positive_comments: int
# negative_comments: int
# neutral_comments: int
# positive_sentiment: float
# negative_sentiment: float
# neutral_sentiment: float
# overall_sentiment: str
# sentiment_distribution: SentimentDistribution
# negative_comments_summary: str
# negative_comments_list: List[str]
# key_insights: List[str]
# recommendations: List[str]
# detailed_analysis: DetailedAnalysis
# faculty_info: Dict[str, str]
# analysis_timestamp: str
# class CommentAnalysisResponse(BaseModel):
# success: bool
# analysis: Optional[AnalysisResult] = None
# message: str
# # Initialize FastAPI app
# app = FastAPI(
# title=get_settings().app_name,
# version=get_settings().app_version,
# description="Advanced sentiment analysis service for educational feedback"
# )
# # Add CORS middleware
# app.add_middleware(
# CORSMiddleware,
# allow_origins=["*"],
# allow_credentials=True,
# allow_methods=["*"],
# allow_headers=["*"],
# )
# # Global variables for models
# sia = None
# tokenizer = None
# model = None
# device = None
# summarizer = None
# # Enhanced heuristic phrase/regex rules for explicit negative feedback
# NEGATIVE_PHRASES = [
# # Teaching quality issues
# 'very poor',
# 'extremely poor',
# 'poor in teaching',
# 'poor teaching level',
# 'poor teaching',
# 'bad teacher',
# 'bad teaching',
# 'not good', # Keep but check it's not "no negative"
# 'not satisfied',
# 'not satisfactory',
# # Content/delivery issues
# 'boring class',
# 'boring classes',
# 'boring subject',
# 'subject is boring',
# 'low voice',
# 'voice is low',
# 'cannot hear',
# "can't hear",
# 'speak louder',
# # Resource/support issues
# 'need more staff',
# 'need more faculty',
# 'insufficient staff',
# 'lack of staff',
# 'not sufficient',
# 'insufficient',
# 'not enough',
# 'no classes',
# 'no regular classes',
# 'not sufficient classes',
# # Knowledge/understanding issues
# 'lack of knowledge',
# 'better knowledge needed',
# 'poor knowledge',
# 'knowledge is lacking',
# 'practical knowledge lacking',
# 'no practical',
# 'lack of practical',
# 'no hands-on',
# 'no real world',
# 'did not understand',
# "didn't understand",
# 'not able to understand',
# 'unable to understand',
# 'difficult to understand',
# 'hard to understand',
# 'concepts are difficult',
# 'concepts difficult',
# 'cant understand',
# "can't understand",
# 'not understandable',
# # Improvement needed
# 'improve class',
# 'improvement needed',
# 'needs improvement',
# 'need improvement',
# 'should improve',
# 'must improve',
# 'not helpful',
# 'not clear',
# 'communication skills need improvement',
# 'improve communication',
# # Pace/time issues
# 'lectures are going fast',
# 'going too fast',
# 'too fast',
# 'too slow',
# 'too lag',
# 'lag',
# 'lagging',
# 'lag in teaching',
# 'not managing time',
# 'poor time management',
# 'time management issue',
# # Engagement issues
# 'not interested',
# 'no interest',
# 'going for attendance',
# 'just for attendance',
# 'only for attendance',
# 'not at all',
# 'nothing learnt',
# 'learned nothing',
# 'no improvement',
# 'same teaching',
# 'monotonous',
# 'sleeping in class',
# # Value/utility issues
# 'waste of time',
# 'wasting time',
# 'waste our time',
# 'no use',
# 'useless',
# # Administrative issues
# 'military rules',
# 'strict rules',
# 'too strict',
# 'very strict',
# 'attendance issue',
# 'attendance problem',
# 'not providing attendance',
# 'claim od',
# # Workload issues
# 'too many projects',
# 'many projects review',
# 'trouble to make',
# 'difficult to make',
# 'hard to make',
# 'placement activities', # When context is negative
# ]
# NEGATIVE_REGEXES = [
# # Teaching quality patterns
# re.compile(r"\b(very|extremely|quite|so)\s+(poor|bad|weak)\s+(in\s+)?(teaching|knowledge|communication)", re.IGNORECASE),
# re.compile(r"\bpoor\s+(teaching|teacher|faculty|knowledge|communication)", re.IGNORECASE),
# re.compile(r"\b(teaching|knowledge)\s+(is\s+)?(poor|bad|weak|lacking)", re.IGNORECASE),
# # Boring/engagement patterns
# re.compile(r"\b(boring|dull|monotonous)\s+(class|classes|subject|lecture|lectures)", re.IGNORECASE),
# re.compile(r"\b(class|classes|subject|lecture|lectures)\s+(is|are)\s+(boring|dull|monotonous)", re.IGNORECASE),
# # Voice/communication patterns
# re.compile(r"\b(low|soft|quiet)\s+voice\b", re.IGNORECASE),
# re.compile(r"\bvoice\s+(is\s+)?(low|soft|quiet|not clear)", re.IGNORECASE),
# re.compile(r"\b(cannot|can't|cant|unable to)\s+hear", re.IGNORECASE),
# # Resource/support patterns
# re.compile(r"\b(no|not|insufficient|lack of)\s+(proper|sufficient|enough|regular)?\s*(classes|notes|support|staff|faculty)", re.IGNORECASE),
# re.compile(r"\bneed(s)?\s+more\s+(staff|faculty|support|classes)", re.IGNORECASE),
# # Understanding/clarity patterns
# re.compile(r"\b(cannot|can't|cant|unable to|difficult to|hard to)\s+understand", re.IGNORECASE),
# re.compile(r"\b(not|difficult|hard)\s+(able\s+to\s+)?understand(\s+the)?(\s+(concepts?|teaching|lectures?))?", re.IGNORECASE),
# re.compile(r"\bconcepts?\s+(are\s+)?(difficult|hard|tough|complex)\s+to\s+understand", re.IGNORECASE),
# # Improvement patterns
# re.compile(r"\b(need|needs|needed|require|requires)\s+(some\s+)?(improvement|to improve)", re.IGNORECASE),
# re.compile(r"\b(should|must|have to)\s+improve", re.IGNORECASE),
# re.compile(r"\bimprovement\s+(is\s+)?need(ed)?", re.IGNORECASE),
# # Pace patterns
# re.compile(r"\b(lecture|lectures|class|classes|teaching)\s+(is|are|going)\s+(too|very)\s+(fast|slow)", re.IGNORECASE),
# re.compile(r"\b(too|very)\s+(fast|slow|lag|lagging)", re.IGNORECASE),
# # Time management patterns
# re.compile(r"\b(not|poor|bad)\s+(managing|managing)\s+time", re.IGNORECASE),
# re.compile(r"\btime\s+management\s+(is\s+)?(poor|bad|lacking)", re.IGNORECASE),
# # Attendance/engagement patterns
# re.compile(r"\b(just|only)\s+(for|going for)\s+attendance", re.IGNORECASE),
# re.compile(r"\b(going|attend|attending)\s+(to|for)\s+(her|his|their)\s+class\s+(just|only)\s+for\s+attendance", re.IGNORECASE),
# re.compile(r"\bnot\s+(at\s+all\s+)?(interested|engaging|helpful)", re.IGNORECASE),
# # Value patterns
# re.compile(r"\b(waste|wasting)\s+(of\s+)?time", re.IGNORECASE),
# re.compile(r"\b(no\s+use|useless|not useful)", re.IGNORECASE),
# # Workload patterns
# re.compile(r"\b(too\s+)?many\s+projects", re.IGNORECASE),
# re.compile(r"\btrouble\s+to\s+(make|complete|do)", re.IGNORECASE),
# # Administrative patterns
# re.compile(r"\bmilitary\s+rules", re.IGNORECASE),
# re.compile(r"\b(too|very)\s+strict", re.IGNORECASE),
# re.compile(r"\battendance\s+(issue|problem)", re.IGNORECASE),
# re.compile(r"\bnot\s+providing\s+attendance", re.IGNORECASE),
# re.compile(r"\bclaim\s+od", re.IGNORECASE),
# # Placement/scheduling patterns
# re.compile(r"\bplacement\s+activities\s+(and|with)\s+(attendance|issue|problem)", re.IGNORECASE),
# re.compile(r"\b(class|classes)\s+(intersecting|conflicting)\s+with\s+placement", re.IGNORECASE),
# ]
# META_COMMENT_PATTERNS = [
# re.compile(r"^no\s+negative\s+(comments?|feedback|remarks?)", re.IGNORECASE),
# re.compile(r"^no\s+negative\s+comments?\s+on\s+the\s+(faculty|teacher|staff|course)", re.IGNORECASE),
# re.compile(r"^no\s+(issues?|problems?|complaints?)\.?$", re.IGNORECASE),
# re.compile(r"^no\s+(issues?|problems?|complaints?)\s+(at\s+all|whatsoever)", re.IGNORECASE),
# # "Everything is good" patterns
# re.compile(r"^(everything|all)\s+(is\s+)?(good|fine|ok|okay|great|perfect|excellent)", re.IGNORECASE),
# re.compile(r"^no,?\s+(everything|all)\s+(is\s+)?(good|fine|ok|okay)", re.IGNORECASE),
# re.compile(r"^(all\s+)?good\.?$", re.IGNORECASE),
# re.compile(r"^everything\s+at\s+the\s+too\s+only", re.IGNORECASE), # From your data
# # "Nothing" patterns
# re.compile(r"^nothing\.?$", re.IGNORECASE),
# re.compile(r"^nothing\s+(to\s+)?(say|comment|mention|add)", re.IGNORECASE),
# re.compile(r"^nothing,?\s+(and\s+)?(all|everything)\s+(is\s+)?(good|fine)", re.IGNORECASE),
# # "No more comments" patterns
# re.compile(r"^no\s+more\s+(comments?|remarks?|feedback)", re.IGNORECASE),
# re.compile(r"^no\s+(other\s+)?(comments?|remarks?|feedback)", re.IGNORECASE),
# re.compile(r"^no\s+remarks?(\s+(about|on))?", re.IGNORECASE),
# # Empty/nil responses
# re.compile(r"^(nil|none|na|n/a|nill)\.?$", re.IGNORECASE),
# re.compile(r"^(no|nothing|none)\.?$", re.IGNORECASE),
# # Positive meta-comments (not actual feedback)
# re.compile(r"^(it's\s+|its\s+)?(all\s+)?good\.?$", re.IGNORECASE),
# re.compile(r"^fine\.?$", re.IGNORECASE),
# re.compile(r"^ok(ay)?\.?$", re.IGNORECASE),
# re.compile(r"^great\.?$", re.IGNORECASE),
# re.compile(r"^nice\.?$", re.IGNORECASE),
# ]
# def is_meta_comment(text: str) -> bool:
# """
# Check if comment is a meta-comment (not actual feedback).
# These are generic statements that don't provide substantive feedback.
# """
# if not text:
# return True # Empty text is meta
# text = text.strip()
# # Check length - very short comments are likely meta
# if len(text) < 3:
# logger.debug(f"Meta-comment (too short): '{text}'")
# return True
# # Check against patterns
# for pattern in META_COMMENT_PATTERNS:
# if pattern.match(text):
# logger.debug(f"Meta-comment detected: '{text[:50]}...'")
# return True
# return False
# def is_explicit_negative(text: str) -> bool:
# """
# Check if text contains explicit negative phrases.
# IMPORTANT: Must check if it's a meta-comment FIRST.
# """
# if not text:
# return False
# # CRITICAL: Don't classify meta-comments as negative
# if is_meta_comment(text):
# return False
# lower = text.lower()
# # Check phrases
# for phrase in NEGATIVE_PHRASES:
# if phrase in lower:
# # Double-check it's not a false positive like "no negative comments"
# if phrase == 'not good' and 'no negative' in lower:
# continue
# if phrase == 'no interest' and 'no negative' in lower:
# continue
# logger.debug(f"Negative phrase detected: '{phrase}' in '{text[:50]}...'")
# return True
# # Check regexes
# for regex in NEGATIVE_REGEXES:
# if regex.search(text):
# logger.debug(f"Negative pattern matched: {regex.pattern} in '{text[:50]}...'")
# return True
# return False
# def initialize_models():
# """Initialize sentiment analysis models with caching support"""
# global sia, tokenizer, model, device, summarizer
# try:
# settings = get_settings()
# logger.info("Initializing sentiment analysis models...")
# # Initialize VADER (NLTK data already downloaded)
# sia = SentimentIntensityAnalyzer()
# logger.info("✓ VADER initialized")
# # Initialize RoBERTa with caching
# cache_dir = settings.model_cache_dir
# os.makedirs(cache_dir, exist_ok=True)
# tokenizer = AutoTokenizer.from_pretrained(
# settings.roberta_model_name,
# cache_dir=cache_dir
# )
# model = AutoModelForSequenceClassification.from_pretrained(
# settings.roberta_model_name,
# cache_dir=cache_dir
# )
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)
# model.eval()
# logger.info(f"✓ RoBERTa initialized on device: {device}")
# # Initialize summarizer (optional)
# if settings.use_abstractive_summary:
# try:
# summarizer = pipeline(
# "summarization",
# model=settings.summarizer_model,
# device=0 if device == "cuda" else -1
# )
# logger.info("✓ Summarizer initialized")
# except Exception as e:
# logger.warning(f"Summarizer initialization failed: {e}")
# summarizer = None
# logger.info("✓ All models initialized successfully")
# except Exception as e:
# logger.error(f"Error initializing models: {e}")
# raise e
# @lru_cache(maxsize=500)
# def vader_sentiment_cached(text: str) -> tuple:
# """Cached VADER sentiment analysis"""
# scores = sia.polarity_scores(text)
# return (scores['neg'], scores['neu'], scores['pos'], scores['compound'])
# def vader_sentiment(text: str) -> Dict[str, float]:
# """VADER sentiment analysis with caching support"""
# try:
# settings = get_settings()
# if settings.enable_caching:
# neg, neu, pos, compound = vader_sentiment_cached(text)
# return {
# 'vader_neg': neg,
# 'vader_neu': neu,
# 'vader_pos': pos,
# 'vader_compound': compound
# }
# else:
# scores = sia.polarity_scores(text)
# return {
# 'vader_neg': scores['neg'],
# 'vader_neu': scores['neu'],
# 'vader_pos': scores['pos'],
# 'vader_compound': scores['compound']
# }
# except Exception as e:
# logger.warning(f"VADER analysis failed for text: {e}")
# return {'vader_neg': 0.0, 'vader_neu': 1.0, 'vader_pos': 0.0, 'vader_compound': 0.0}
# def roberta_sentiment_batch(texts: List[str]) -> List[Dict[str, float]]:
# """Batch RoBERTa sentiment analysis for better performance"""
# try:
# settings = get_settings()
# results = []
# for i in range(0, len(texts), settings.batch_size):
# batch = texts[i:i + settings.batch_size]
# encoded = tokenizer(
# batch,
# return_tensors='pt',
# truncation=True,
# max_length=512,
# padding=True
# )
# encoded = {k: v.to(device) for k, v in encoded.items()}
# with torch.no_grad():
# outputs = model(**encoded)
# for output in outputs.logits:
# scores = softmax(output.cpu().numpy())
# results.append({
# 'roberta_neg': float(scores[0]),
# 'roberta_neu': float(scores[1]),
# 'roberta_pos': float(scores[2])
# })
# return results
# except Exception as e:
# logger.warning(f"RoBERTa batch analysis failed: {e}")
# return [{'roberta_neg': 0.0, 'roberta_neu': 1.0, 'roberta_pos': 0.0} for _ in texts]
# def roberta_sentiment(text: str) -> Dict[str, float]:
# """Single text RoBERTa sentiment analysis"""
# try:
# encoded_text = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
# encoded_text = {k: v.to(device) for k, v in encoded_text.items()}
# with torch.no_grad():
# output = model(**encoded_text)
# scores = softmax(output[0][0].cpu().numpy())
# return {
# 'roberta_neg': float(scores[0]),
# 'roberta_neu': float(scores[1]),
# 'roberta_pos': float(scores[2])
# }
# except Exception as e:
# logger.warning(f"RoBERTa analysis failed for text: {e}")
# return {'roberta_neg': 0.0, 'roberta_neu': 1.0, 'roberta_pos': 0.0}
# def overall_sentiment(row: pd.Series, settings: Settings) -> str:
# """Determine overall sentiment using combined scores with configurable thresholds"""
# combined_pos = row.get('combined_pos', 0.0)
# combined_neg = row.get('combined_neg', 0.0)
# combined_neu = row.get('combined_neu', 0.0)
# vader_compound = row.get('vader_compound', 0.0)
# roberta_neg = row.get('roberta_neg', 0.0)
# roberta_pos = row.get('roberta_pos', 0.0)
# # Priority 1: Heuristic negative patterns override everything
# if row.get('heuristic_negative') is True:
# return 'Negative'
# # Priority 2: Strong negative signals
# if (
# vader_compound <= settings.vader_neg_threshold or
# roberta_neg >= settings.roberta_neg_threshold or
# combined_neg >= max(combined_pos, combined_neu)
# ):
# return 'Negative'
# # Priority 3: Positive signals
# if (
# vader_compound >= settings.vader_pos_threshold or
# roberta_pos >= settings.roberta_pos_threshold or
# combined_pos >= max(combined_neg, combined_neu)
# ):
# return 'Positive'
# # Default: Neutral
# return 'Neutral'
# def sanitize_text(text: str) -> str:
# """Sanitize input text while preserving emojis"""
# if not text:
# return ""
# # Remove control characters but keep printable characters and emojis
# text = re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]', '', text)
# # Normalize whitespace
# text = ' '.join(text.split())
# return text.strip()
# def analyze_comments_sentiment(comments: List[str]) -> Dict[str, Any]:
# """Main sentiment analysis function with enhanced performance"""
# try:
# settings = get_settings()
# logger.info(f"Received {len(comments)} comments for analysis")
# # Sanitize comments
# sanitized_comments = [sanitize_text(comment) for comment in comments]
# # FIXED: Changed < to <= to properly handle min_comment_words
# filtered_comments = [
# comment for comment in sanitized_comments
# if (settings.min_comment_words <= len(comment.split()) <= settings.max_comment_length)
# ]
# logger.info(f"After filtering: {len(filtered_comments)} valid comments")
# if not filtered_comments:
# return {
# "total_comments": 0,
# "message": "No valid comments found for analysis"
# }
# # Create dataframe
# df = pd.DataFrame({'comment': filtered_comments})
# # Detect meta-comments and explicit negatives
# df['is_meta'] = df['comment'].apply(is_meta_comment)
# df['heuristic_negative'] = df['comment'].apply(is_explicit_negative)
# # Log detection results
# meta_count = df['is_meta'].sum()
# heuristic_neg_count = df['heuristic_negative'].sum()
# logger.info(f"Detected {meta_count} meta-comments and {heuristic_neg_count} heuristic negatives")
# # VADER sentiment analysis
# vader_results = []
# for text in df['comment']:
# vader_results.append(vader_sentiment(text))
# # RoBERTa sentiment analysis (batch)
# roberta_results = roberta_sentiment_batch(df['comment'].tolist())
# # Combine results
# vader_df = pd.DataFrame(vader_results)
# roberta_df = pd.DataFrame(roberta_results)
# final_df = pd.concat([df.reset_index(drop=True), vader_df, roberta_df], axis=1)
# # Calculate combined scores
# final_df['combined_pos'] = (
# settings.combined_weight_vader * final_df['vader_pos'] +
# settings.combined_weight_roberta * final_df['roberta_pos']
# )
# final_df['combined_neg'] = (
# settings.combined_weight_vader * final_df['vader_neg'] +
# settings.combined_weight_roberta * final_df['roberta_neg']
# )
# final_df['combined_neu'] = (
# settings.combined_weight_vader * final_df['vader_neu'] +
# settings.combined_weight_roberta * final_df['roberta_neu']
# )
# # Classify overall sentiment (meta-comments become Neutral)
# final_df['Overall_Sentiment'] = final_df.apply(
# lambda row: 'Neutral' if row.get('is_meta') else overall_sentiment(row, settings),
# axis=1
# )
# # Calculate statistics
# total_comments = len(final_df)
# positive_count = len(final_df[final_df['Overall_Sentiment'] == 'Positive'])
# negative_count = len(final_df[final_df['Overall_Sentiment'] == 'Negative'])
# neutral_count = len(final_df[final_df['Overall_Sentiment'] == 'Neutral'])
# logger.info(
# f"Results: {positive_count} positive, "
# f"{negative_count} negative, {neutral_count} neutral"
# )
# # Average scores
# avg_positive = float(final_df['combined_pos'].mean())
# avg_negative = float(final_df['combined_neg'].mean())
# avg_neutral = float(final_df['combined_neu'].mean())
# # Determine overall sentiment label
# if avg_positive > max(avg_negative, avg_neutral):
# overall_sentiment_label = "Positive"
# elif avg_negative > max(avg_positive, avg_neutral):
# overall_sentiment_label = "Negative"
# else:
# overall_sentiment_label = "Neutral"
# # Process negative comments
# negative_summary = ""
# negative_comments_list = []
# negative_comments = final_df[final_df['Overall_Sentiment'] == 'Negative']
# if len(negative_comments) > 0:
# negative_comments_list = negative_comments['comment'].tolist()
# try:
# # Get top negative comments
# top_idx = negative_comments['combined_neg'].nlargest(3).index
# top_comments = negative_comments.loc[top_idx, 'comment'].tolist()
# if settings.use_abstractive_summary and summarizer is not None:
# negative_text = " ".join(top_comments)
# if len(negative_text) > 1000:
# negative_text = negative_text[:1000]
# summary_result = summarizer(
# negative_text,
# max_length=settings.max_summary_length,
# min_length=settings.min_summary_length,
# do_sample=False
# )
# negative_summary = summary_result[0]['summary_text']
# else:
# # Extractive summary
# negative_summary = "; ".join(top_comments)
# except Exception as e:
# logger.warning(f"Summary generation failed: {e}")
# negative_summary = "; ".join(negative_comments_list[:3])
# # Generate insights and recommendations
# insights = []
# recommendations = []
# if overall_sentiment_label == "Positive":
# insights.extend([
# "Students have positive feedback overall",
# "Teaching methods are well-received",
# f"{positive_count}/{total_comments} comments are positive"
# ])
# recommendations.extend([
# "Continue current teaching approach",
# "Maintain student engagement strategies",
# "Share successful practices with colleagues"
# ])
# elif overall_sentiment_label == "Negative":
# insights.extend([
# "Students have concerns that need attention",
# "Some aspects of teaching may need improvement",
# f"{negative_count}/{total_comments} comments indicate issues"
# ])
# recommendations.extend([
# "Review teaching methods and materials",
# "Consider additional student support",
# "Schedule meetings to address student concerns",
# "Focus on areas mentioned in negative feedback"
# ])
# else:
# insights.extend([
# "Mixed feedback from students",
# "Some areas performing well, others need attention",
# "Balance of positive and negative responses"
# ])
# recommendations.extend([
# "Focus on areas with negative feedback",
# "Maintain strengths while addressing weaknesses",
# "Gather more specific feedback on improvement areas"
# ])
# return {
# "total_comments": total_comments,
# "positive_comments": positive_count,
# "negative_comments": negative_count,
# "neutral_comments": neutral_count,
# "positive_sentiment": round(avg_positive, 3),
# "negative_sentiment": round(avg_negative, 3),
# "neutral_sentiment": round(avg_neutral, 3),
# "overall_sentiment": overall_sentiment_label,
# "sentiment_distribution": {
# "positive_percentage": round((positive_count / total_comments) * 100, 1),
# "negative_percentage": round((negative_count / total_comments) * 100, 1),
# "neutral_percentage": round((neutral_count / total_comments) * 100, 1)
# },
# "negative_comments_summary": negative_summary,
# "negative_comments_list": negative_comments_list,
# "key_insights": insights,
# "recommendations": recommendations,
# "detailed_analysis": {
# "vader_scores": {
# "average_positive": round(final_df['vader_pos'].mean(), 3),
# "average_negative": round(final_df['vader_neg'].mean(), 3),
# "average_neutral": round(final_df['vader_neu'].mean(), 3),
# "average_compound": round(final_df['vader_compound'].mean(), 3)
# },
# "roberta_scores": {
# "average_positive": round(final_df['roberta_pos'].mean(), 3),
# "average_negative": round(final_df['roberta_neg'].mean(), 3),
# "average_neutral": round(final_df['roberta_neu'].mean(), 3)
# }
# },
# "analysis_timestamp": datetime.utcnow().isoformat()
# }
# except Exception as e:
# logger.error(f"Sentiment analysis failed: {e}", exc_info=True)
# raise e
# @app.on_event("startup")
# async def startup_event():
# """Initialize models on startup"""
# try:
# logger.info("=" * 80)
# logger.info(f"Application Startup at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
# logger.info("=" * 80)
# initialize_models()
# logger.info("✓ Service started successfully")
# logger.info("=" * 80)
# except Exception as e:
# logger.error(f"✗ Startup failed: {e}")
# raise e
# @app.on_event("shutdown")
# async def shutdown_event():
# """Cleanup on shutdown"""
# logger.info("Service shutting down")
# @app.get("/")
# async def root():
# """Root endpoint"""
# return {
# "service": get_settings().app_name,
# "version": get_settings().app_version,
# "status": "running",
# "endpoints": {
# "health": "/health",
# "analyze": "/analyze-comments",
# "config": "/config (debug mode only)",
# "test": "/test"
# }
# }
# @app.get("/health")
# async def health_check():
# """Health check endpoint"""
# models_loaded = sia is not None and model is not None and tokenizer is not None
# return {
# "status": "healthy" if models_loaded else "unhealthy",
# "service": "comment-analysis",
# "version": get_settings().app_version,
# "models_loaded": models_loaded,
# "device": device if device else "not initialized",
# "timestamp": datetime.utcnow().isoformat()
# }
# @app.post("/analyze-comments", response_model=CommentAnalysisResponse)
# async def analyze_comments(
# request: CommentAnalysisRequest,
# settings: Settings = Depends(get_settings)
# ):
# """
# Analyze comments for sentiment analysis using VADER and RoBERTa models
# """
# try:
# comments = request.comments
# faculty_info = request.faculty_info
# if not comments:
# return CommentAnalysisResponse(
# success=False,
# analysis=None,
# message="No comments provided for analysis"
# )
# logger.info(
# f"Analyzing {len(comments)} comments for "
# f"{faculty_info.faculty_name} ({faculty_info.course_code})"
# )
# analysis_result = analyze_comments_sentiment(comments)
# if analysis_result.get("total_comments", 0) == 0:
# return CommentAnalysisResponse(
# success=False,
# analysis=None,
# message=analysis_result.get("message", "No valid comments to analyze")
# )
# analysis_result["faculty_info"] = {
# "faculty_name": faculty_info.faculty_name,
# "staff_id": faculty_info.staff_id,
# "course_code": faculty_info.course_code,
# "course_name": faculty_info.course_name
# }
# return CommentAnalysisResponse(
# success=True,
# analysis=analysis_result,
# message=f"Successfully analyzed {analysis_result['total_comments']} comments"
# )
# except ValueError as ve:
# logger.warning(f"Validation error: {ve}")
# raise HTTPException(status_code=400, detail=str(ve))
# except Exception as e:
# logger.error(f"Analysis failed: {e}", exc_info=True)
# raise HTTPException(
# status_code=500,
# detail="Analysis failed. Please try again later."
# )
# @app.get("/config")
# async def get_config(settings: Settings = Depends(get_settings)):
# """Get current configuration (debug mode only)"""
# if not settings.debug_mode:
# raise HTTPException(status_code=404, detail="Not found")
# return {
# "max_comments_per_request": settings.max_comments_per_request,
# "max_comment_length": settings.max_comment_length,
# "min_comment_words": settings.min_comment_words,
# "vader_pos_threshold": settings.vader_pos_threshold,
# "vader_neg_threshold": settings.vader_neg_threshold,
# "roberta_pos_threshold": settings.roberta_pos_threshold,
# "roberta_neg_threshold": settings.roberta_neg_threshold,
# "combined_weight_vader": settings.combined_weight_vader,
# "combined_weight_roberta": settings.combined_weight_roberta,
# "enable_caching": settings.enable_caching,
# "batch_size": settings.batch_size,
# "use_abstractive_summary": settings.use_abstractive_summary
# }
# @app.get("/test")
# async def test_endpoint():
# """Test endpoint to verify sentiment classification"""
# test_cases = [
# "No more comments 😅",
# "Overall good but too lag",
# "Not interested to be in her class just we are going for attendance thats it not at all managing time.",
# "Nothing to say anything just we are going to her class mean, only for attendance",
# "Excellent teaching! Very clear explanations.",
# "Good teacher with strong subject knowledge",
# "Class is okay, nothing special"
# ]
# results = []
# for text in test_cases:
# is_meta = is_meta_comment(text)
# is_neg = is_explicit_negative(text)
# # Predict classification
# if is_meta:
# predicted = "Neutral (meta-comment)"
# elif is_neg:
# predicted = "Negative (heuristic)"
# else:
# predicted = "Needs full analysis"
# results.append({
# "text": text,
# "is_meta_comment": is_meta,
# "is_heuristic_negative": is_neg,
# "predicted_classification": predicted
# })
# return {
# "test_results": results,
# "note": "Full analysis requires VADER and RoBERTa scores"
# }
# if __name__ == "__main__":
# uvicorn.run(
# app,
# host="0.0.0.0",
# port=8000,
# log_level="info"
# )