""" Enhanced FastAPI Service for Comment Sentiment Analysis Version 3.0.0 - Major accuracy improvements with advanced classification Features: - Multi-stage sentiment detection - Context-aware negative pattern matching - Improved neutral/meta-comment detection - Enhanced accuracy through ensemble approach """ from fastapi import FastAPI, HTTPException, Depends from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel, Field, validator from pydantic_settings import BaseSettings from typing import List, Dict, Any, Optional from functools import lru_cache import uvicorn import pandas as pd import numpy as np import os import re from datetime import datetime import logging # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) # NLTK Setup import nltk import ssl try: _create_unverified_https_context = ssl._create_unverified_context except AttributeError: pass else: ssl._create_default_https_context = _create_unverified_https_context nltk_data_dir = '/tmp/nltk_data' os.makedirs(nltk_data_dir, exist_ok=True) nltk.data.path.insert(0, nltk_data_dir) def ensure_nltk_data(): """Ensure all required NLTK data is downloaded""" resources = ['vader_lexicon', 'punkt', 'stopwords', 'wordnet', 'omw-1.4'] for resource in resources: try: if resource == 'vader_lexicon': nltk.data.find('sentiment/vader_lexicon.zip') elif resource == 'punkt': nltk.data.find('tokenizers/punkt') elif resource in ['stopwords', 'wordnet', 'omw-1.4']: nltk.data.find(f'corpora/{resource}') logger.info(f"✓ NLTK resource '{resource}' already available") except LookupError: logger.info(f"Downloading NLTK resource '{resource}'...") try: nltk.download(resource, download_dir=nltk_data_dir, quiet=False) logger.info(f"✓ Successfully downloaded '{resource}'") except Exception as e: logger.error(f"✗ Failed to download '{resource}': {e}") logger.info("Ensuring NLTK data is available...") 
# Download NLTK data before importing NLTK components that need it.
ensure_nltk_data()

from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from scipy.special import softmax
import torch


# Configuration
class Settings(BaseSettings):
    """Application settings.

    Values can be overridden via environment variables / a ``.env`` file
    (pydantic-settings). Threshold fields drive the multi-stage classifier
    defined later in this module.
    """
    app_name: str = "Comment Analysis API"
    app_version: str = "3.0.0"
    debug_mode: bool = False
    # Request limits enforced by CommentAnalysisRequest validation.
    max_comments_per_request: int = 1000
    max_comment_length: int = 5000
    min_comment_words: int = 1
    # Enhanced thresholds for better accuracy.
    # VADER compound score cutoffs (compound ranges from -1 to +1).
    vader_strong_pos_threshold: float = 0.5
    vader_pos_threshold: float = 0.2
    vader_neg_threshold: float = -0.2
    vader_strong_neg_threshold: float = -0.5
    # RoBERTa per-class probability cutoffs (softmax outputs in [0, 1]).
    roberta_strong_pos_threshold: float = 0.70
    roberta_pos_threshold: float = 0.55
    roberta_neg_threshold: float = 0.40
    roberta_strong_neg_threshold: float = 0.60
    # Adjusted ensemble weights for better accuracy; get_settings() normalizes
    # them if they do not sum to ~1.0.
    combined_weight_vader: float = 0.4
    combined_weight_roberta: float = 0.6
    # Model settings.
    model_cache_dir: str = "/tmp/model_cache"
    roberta_model_name: str = "cardiffnlp/twitter-roberta-base-sentiment"
    use_abstractive_summary: bool = False
    summarizer_model: str = "facebook/bart-large-cnn"
    max_summary_length: int = 100
    min_summary_length: int = 25
    # Performance.
    enable_caching: bool = True
    cache_size: int = 500
    batch_size: int = 32

    class Config:
        env_file = ".env"
        env_file_encoding = 'utf-8'
        extra = 'ignore'


@lru_cache()
def get_settings() -> Settings:
    """Cached settings instance"""
    settings = Settings()
    # Normalize the ensemble weights so combined scores stay in [0, 1].
    total = settings.combined_weight_vader + settings.combined_weight_roberta
    if not (0.99 <= total <= 1.01):
        logger.warning(f"Weights sum to {total}, normalizing to 1.0")
        settings.combined_weight_vader /= total
        settings.combined_weight_roberta /= total
    return settings


# Pydantic Models
class FacultyInfo(BaseModel):
    # Identity of the faculty member/course the comments belong to; echoed
    # back unchanged in the analysis response.
    faculty_name: str = Field(..., min_length=1, max_length=200)
    staff_id: str = Field(..., min_length=1, max_length=50)
    course_code: str = Field(..., min_length=1, max_length=50)
    course_name: str = Field(..., min_length=1, max_length=200)


class CommentAnalysisRequest(BaseModel):
    # NOTE(review): `validator` / `min_items` are pydantic v1-style; used
    # consistently across this file.
    comments: List[str] = Field(..., min_items=1)
    faculty_info: FacultyInfo

    @validator('comments')
    def validate_comments(cls, v):
        """Enforce the request limits configured in Settings."""
        settings = get_settings()
        if len(v) > settings.max_comments_per_request:
            raise ValueError(f'Maximum {settings.max_comments_per_request} comments per request')
        for idx, comment in enumerate(v):
            if len(comment) > settings.max_comment_length:
                raise ValueError(f'Comment {idx} exceeds maximum length of {settings.max_comment_length} characters')
        return v


class SentimentDistribution(BaseModel):
    # Percentages of the classified comments (0-100, one decimal place).
    positive_percentage: float
    negative_percentage: float
    neutral_percentage: float


class DetailedScores(BaseModel):
    # Per-model mean scores; compound only exists for VADER.
    average_positive: float
    average_negative: float
    average_neutral: float
    average_compound: Optional[float] = None


class DetailedAnalysis(BaseModel):
    vader_scores: DetailedScores
    roberta_scores: DetailedScores


class AnalysisResult(BaseModel):
    # Aggregate output of analyze_comments_sentiment() plus faculty metadata.
    total_comments: int
    positive_comments: int
    negative_comments: int
    neutral_comments: int
    positive_sentiment: float
    negative_sentiment: float
    neutral_sentiment: float
    overall_sentiment: str
    sentiment_distribution: SentimentDistribution
    negative_comments_summary: str
    negative_comments_list: List[str]
    key_insights: List[str]
    recommendations: List[str]
    detailed_analysis: DetailedAnalysis
    faculty_info: Dict[str, str]
    analysis_timestamp: str


class CommentAnalysisResponse(BaseModel):
    success: bool
    analysis: Optional[AnalysisResult] = None
    message: str


# Initialize FastAPI
app = FastAPI(
    title=get_settings().app_name,
    version=get_settings().app_version,
    description="Advanced sentiment analysis service for educational feedback"
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global model variables, populated by initialize_models() at startup.
sia = None
tokenizer = None
model = None
device = None
summarizer = None

# ============================================================================
# ENHANCED PATTERN DETECTION FOR BETTER ACCURACY
# ============================================================================

# Meta-comments (not actual feedback - should be NEUTRAL)
META_PATTERNS = re.compile(
    r'^(no\s+(negative\s+)?(more\s+)?(comments?|feedback|remarks?|issues?|problems?|complaints?)|'
    r'(everything|all)\s+(is\s+)?(good|fine|ok(ay)?|great|perfect|excellent)|'
    r'nothing(\s+to\s+(say|comment|mention|add))?|'
    r'(nil|none|na|n/a|nill)\.?|'
    r'^(all\s+)?(good|fine|ok(ay)?|great|nice)\.?|'
    r'no\s+remarks?|'
    r'everything\s+at\s+the\s+too\s+only)$',
    re.IGNORECASE
)

# Strong NEGATIVE indicators (should override model scores)
STRONG_NEGATIVE_PATTERN = re.compile(
    r'\b('
    # Direct criticism
    r'(very|extremely|quite|so|too)\s+(poor|bad|weak|terrible|awful|horrible)|'
    r'poor\s+(teaching|teacher|faculty|knowledge|communication|quality|explanation)|'
    r'bad\s+(teaching|teacher|faculty|quality|explanation)|'
    r'terrible|horrible|awful|pathetic|useless|waste\s+of\s+time|'
    # Teaching quality issues
    r'(teaching|knowledge)\s+(is\s+)?(poor|bad|weak|lacking|insufficient|not\s+good)|'
    r'cannot\s+teach|can\'?t\s+teach|doesn\'?t\s+know\s+how\s+to\s+teach|'
    r'not\s+teaching\s+properly|teaching\s+method\s+is\s+(poor|bad)|'
    # Boring/disengagement
    r'(boring|dull|monotonous)\s+(class|classes|subject|lecture|lectures|sessions?)|'
    r'(class|classes|subject|lectures?)\s+(is|are)\s+(boring|dull|monotonous|uninteresting)|'
    r'sleeping\s+in\s+class|fall\s+asleep|makes?\s+us\s+sleep|'
    # Communication issues
    r'(low|soft|quiet|unclear)\s+voice|voice\s+(is\s+)?(low|soft|quiet|not\s+clear)|'
    r'(cannot|can\'?t|cant|unable\s+to)\s+hear|difficult\s+to\s+hear|'
    r'(not|poor|bad)\s+(communication|explaining|explanation)|'
    # Understanding issues
    r'(cannot|can\'?t|cant|unable\s+to|difficult\s+to|hard\s+to)\s+understand|'
    r'(not|never|don\'?t)\s+(able\s+to\s+)?understand|'
    r'(concepts?|topics?|subjects?)\s+(are\s+)?(difficult|hard|tough|impossible)\s+to\s+understand|'
    r'makes?\s+(no|little)\s+sense|doesn\'?t\s+make\s+sense|'
    # Improvement needed
    r'(need|needs|require|requires)\s+(urgent|serious|immediate|much|lot\s+of)?\s*improvement|'
    r'(should|must|have\s+to)\s+improve\s+(a\s+lot|more|urgently)|'
    # Pace issues
    r'(lectures?|class(es)?|teaching)\s+(is|are|going)\s+(too|very)\s+(fast|slow)|'
    r'(too|very|extremely)\s+(fast|slow|rush|rushed)|'
    r'(lag|lagging)\s+in\s+teaching|teaching\s+(is\s+)?lagging|'
    # Time management
    # NOTE(review): "(managing|managing)" duplicates one alternative — it is
    # harmless (equivalent to just "managing") but was likely meant to be
    # "(managing|management)"; left unchanged here to preserve behavior.
    r'(not|poor|bad|terrible)\s+(managing|managing)\s+time|'
    r'time\s+management\s+(is\s+)?(poor|bad|terrible|lacking)|'
    r'always\s+(late|wasting\s+time)|waste\s+(our|class)\s+time|'
    # Lack of resources/support
    r'(no|not|insufficient|lack\s+of)\s+(proper|sufficient|enough|regular)?\s*(classes|notes|support|help)|'
    r'need\s+more\s+(staff|faculty|classes|support|help)|'
    r'no\s+(practical|hands[-\s]?on|lab|real[-\s]?world)|lack\s+of\s+practical|'
    # Attendance/engagement issues
    r'(just|only)\s+(for|going\s+for)\s+attendance|'
    r'going\s+(to|for)\s+(her|his|their)\s+class\s+(just|only)\s+for\s+attendance|'
    r'(not|no)\s+(interested|engaging|helpful|useful|at\s+all)|'
    r'no\s+interest\s+in\s+teaching|'
    # Administrative issues
    r'military\s+rules|too\s+strict|very\s+strict|'
    r'attendance\s+(issue|problem)|not\s+providing\s+attendance|'
    # Workload issues
    r'too\s+many\s+projects|many\s+projects\s+review|'
    r'placement\s+activities\s+(and|with)\s+attendance'
    r')\b',
    re.IGNORECASE
)

# Positive indicators (help identify positive comments)
POSITIVE_PATTERN = re.compile(
    r'\b('
    r'(very|extremely|really|so|truly)\s+(good|great|excellent|amazing|wonderful|fantastic|helpful|knowledgeable|clear)|'
    r'excellent|outstanding|amazing|wonderful|fantastic|brilliant|superb|'
    r'(great|good|best|wonderful)\s+(teaching|teacher|faculty|knowledge|explanation|professor|sir|madam)|'
    r'(teaching|explanation|knowledge)\s+(is\s+)?(excellent|outstanding|very\s+good|great|clear)|'
    r'explains?\s+(very\s+)?(well|clearly|nicely|perfectly)|'
    r'(easy|easier)\s+to\s+understand|clear\s+explanation|'
r'(very\s+)?(helpful|supportive|friendly|approachable|patient)|' r'(good|strong|deep|vast)\s+(knowledge|understanding)|' r'(love|like|enjoy|appreciate)\s+(the\s+)?(class|classes|teaching|subject|course|lectures?)|' r'learned?\s+(a\s+lot|so\s+much|many\s+things)|' r'inspired?|inspiring|motivating|motivated|encouraged|' r'(best|favourite|favorite)\s+(teacher|faculty|professor)|' r'highly\s+recommend|strongly\s+recommend|' r'grateful|thankful|blessed|lucky\s+to\s+have|' r'satisfied|happy\s+with|pleased\s+with|' r'(always|very)\s+(available|accessible|helpful)|' r'patient|caring|dedicated|passionate|' r'interactive\s+class|engaging\s+class|interesting\s+class' r')\b', re.IGNORECASE ) # Weak negative indicators (suggestions/mild criticism - might be NEUTRAL) WEAK_NEGATIVE_PATTERN = re.compile( r'\b(' r'could\s+(be\s+)?better|' r'can\s+improve|' r'would\s+be\s+good\s+if|' r'suggest|suggestion|' r'maybe|perhaps|' r'slightly|a\s+bit|' r'sometimes|occasionally' r')\b', re.IGNORECASE ) def is_meta_comment(text: str) -> bool: """Check if comment is meta (not actual feedback)""" if not text or len(text.strip()) < 3: return True text = text.strip() return bool(META_PATTERNS.match(text)) def detect_strong_negative(text: str) -> bool: """Detect strong negative patterns""" if not text or is_meta_comment(text): return False return bool(STRONG_NEGATIVE_PATTERN.search(text)) def detect_positive(text: str) -> bool: """Detect positive patterns""" if not text or is_meta_comment(text): return False return bool(POSITIVE_PATTERN.search(text)) def detect_weak_negative(text: str) -> bool: """Detect weak negative patterns (suggestions)""" if not text or is_meta_comment(text): return False return bool(WEAK_NEGATIVE_PATTERN.search(text)) # ============================================================================ # MODEL INITIALIZATION # ============================================================================ def initialize_models(): """Initialize sentiment analysis models""" global sia, 
tokenizer, model, device, summarizer try: settings = get_settings() logger.info("Initializing sentiment analysis models...") # VADER sia = SentimentIntensityAnalyzer() logger.info("✓ VADER initialized") # RoBERTa cache_dir = settings.model_cache_dir os.makedirs(cache_dir, exist_ok=True) tokenizer = AutoTokenizer.from_pretrained( settings.roberta_model_name, cache_dir=cache_dir ) model = AutoModelForSequenceClassification.from_pretrained( settings.roberta_model_name, cache_dir=cache_dir ) device = "cuda" if torch.cuda.is_available() else "cpu" model.to(device) model.eval() logger.info(f"✓ RoBERTa initialized on device: {device}") # Summarizer (optional) if settings.use_abstractive_summary: try: summarizer = pipeline( "summarization", model=settings.summarizer_model, device=0 if device == "cuda" else -1 ) logger.info("✓ Summarizer initialized") except Exception as e: logger.warning(f"Summarizer initialization failed: {e}") summarizer = None logger.info("✓ All models initialized successfully") except Exception as e: logger.error(f"Error initializing models: {e}") raise e # ============================================================================ # SENTIMENT ANALYSIS FUNCTIONS # ============================================================================ @lru_cache(maxsize=500) def vader_sentiment_cached(text: str) -> tuple: """Cached VADER sentiment analysis""" scores = sia.polarity_scores(text) return (scores['neg'], scores['neu'], scores['pos'], scores['compound']) def vader_sentiment(text: str) -> Dict[str, float]: """VADER sentiment analysis""" try: settings = get_settings() if settings.enable_caching: neg, neu, pos, compound = vader_sentiment_cached(text) return { 'vader_neg': neg, 'vader_neu': neu, 'vader_pos': pos, 'vader_compound': compound } else: scores = sia.polarity_scores(text) return { 'vader_neg': scores['neg'], 'vader_neu': scores['neu'], 'vader_pos': scores['pos'], 'vader_compound': scores['compound'] } except Exception as e: logger.warning(f"VADER 
analysis failed: {e}") return {'vader_neg': 0.0, 'vader_neu': 1.0, 'vader_pos': 0.0, 'vader_compound': 0.0} def roberta_sentiment_batch(texts: List[str]) -> List[Dict[str, float]]: """Batch RoBERTa sentiment analysis""" try: settings = get_settings() results = [] for i in range(0, len(texts), settings.batch_size): batch = texts[i:i + settings.batch_size] encoded = tokenizer( batch, return_tensors='pt', truncation=True, max_length=512, padding=True ) encoded = {k: v.to(device) for k, v in encoded.items()} with torch.no_grad(): outputs = model(**encoded) for output in outputs.logits: scores = softmax(output.cpu().numpy()) results.append({ 'roberta_neg': float(scores[0]), 'roberta_neu': float(scores[1]), 'roberta_pos': float(scores[2]) }) return results except Exception as e: logger.warning(f"RoBERTa batch analysis failed: {e}") return [{'roberta_neg': 0.0, 'roberta_neu': 1.0, 'roberta_pos': 0.0} for _ in texts] def classify_sentiment_enhanced(row: pd.Series, settings: Settings) -> str: """ Enhanced multi-stage sentiment classification for better accuracy Stage 1: Meta-comments → Neutral Stage 2: Strong negative patterns → Negative (override models) Stage 3: Strong positive patterns + high scores → Positive Stage 4: Model ensemble decision Stage 5: Default to neutral if uncertain """ # Stage 1: Meta-comments are always neutral if row.get('is_meta', False): return 'Neutral' # Get all scores vader_compound = row.get('vader_compound', 0.0) vader_pos = row.get('vader_pos', 0.0) vader_neg = row.get('vader_neg', 0.0) roberta_pos = row.get('roberta_pos', 0.0) roberta_neg = row.get('roberta_neg', 0.0) roberta_neu = row.get('roberta_neu', 0.0) combined_pos = row.get('combined_pos', 0.0) combined_neg = row.get('combined_neg', 0.0) combined_neu = row.get('combined_neu', 0.0) has_strong_negative = row.get('has_strong_negative', False) has_positive = row.get('has_positive', False) has_weak_negative = row.get('has_weak_negative', False) # Stage 2: Strong negative patterns override 
everything if has_strong_negative: return 'Negative' # Stage 3: Strong positive signals if has_positive and ( vader_compound >= settings.vader_strong_pos_threshold or roberta_pos >= settings.roberta_strong_pos_threshold or (vader_compound >= settings.vader_pos_threshold and roberta_pos >= settings.roberta_pos_threshold) ): return 'Positive' # Stage 4: Model-based classification with ensemble # Strong negative from models if ( vader_compound <= settings.vader_strong_neg_threshold or roberta_neg >= settings.roberta_strong_neg_threshold or (vader_compound <= settings.vader_neg_threshold and roberta_neg >= settings.roberta_neg_threshold) ): return 'Negative' # Moderate negative if ( combined_neg > combined_pos and combined_neg > combined_neu and combined_neg > 0.35 # Threshold for clarity ): return 'Negative' # Clear positive if ( combined_pos > combined_neg and combined_pos > combined_neu and combined_pos > 0.35 # Threshold for clarity ): return 'Positive' # Weak negative with suggestion context → might be neutral if has_weak_negative and not has_strong_negative: # If scores are not strongly negative, treat as neutral if combined_neg < 0.5: return 'Neutral' # Stage 5: Default to neutral if uncertain return 'Neutral' def sanitize_text(text: str) -> str: """Sanitize input text""" if not text: return "" text = re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]', '', text) text = ' '.join(text.split()) return text.strip() # ============================================================================ # MAIN ANALYSIS FUNCTION # ============================================================================ def analyze_comments_sentiment(comments: List[str]) -> Dict[str, Any]: """Main sentiment analysis with enhanced accuracy""" try: settings = get_settings() logger.info(f"Received {len(comments)} comments for analysis") # Sanitize sanitized_comments = [sanitize_text(comment) for comment in comments] # Filter valid comments filtered_comments = [ comment for comment in 
sanitized_comments if settings.min_comment_words <= len(comment.split()) <= settings.max_comment_length ] logger.info(f"After filtering: {len(filtered_comments)} valid comments") if not filtered_comments: return { "total_comments": 0, "message": "No valid comments found for analysis" } # Create DataFrame df = pd.DataFrame({'comment': filtered_comments}) # Pattern detection df['is_meta'] = df['comment'].apply(is_meta_comment) df['has_strong_negative'] = df['comment'].apply(detect_strong_negative) df['has_positive'] = df['comment'].apply(detect_positive) df['has_weak_negative'] = df['comment'].apply(detect_weak_negative) # Log detection stats logger.info(f"Meta: {df['is_meta'].sum()}, " f"Strong Neg: {df['has_strong_negative'].sum()}, " f"Positive: {df['has_positive'].sum()}, " f"Weak Neg: {df['has_weak_negative'].sum()}") # VADER analysis vader_results = [vader_sentiment(text) for text in df['comment']] vader_df = pd.DataFrame(vader_results) # RoBERTa analysis roberta_results = roberta_sentiment_batch(df['comment'].tolist()) roberta_df = pd.DataFrame(roberta_results) # Combine final_df = pd.concat([df.reset_index(drop=True), vader_df, roberta_df], axis=1) # Calculate combined scores final_df['combined_pos'] = ( settings.combined_weight_vader * final_df['vader_pos'] + settings.combined_weight_roberta * final_df['roberta_pos'] ) final_df['combined_neg'] = ( settings.combined_weight_vader * final_df['vader_neg'] + settings.combined_weight_roberta * final_df['roberta_neg'] ) final_df['combined_neu'] = ( settings.combined_weight_vader * final_df['vader_neu'] + settings.combined_weight_roberta * final_df['roberta_neu'] ) # Enhanced classification final_df['Overall_Sentiment'] = final_df.apply( lambda row: classify_sentiment_enhanced(row, settings), axis=1 ) # Statistics total_comments = len(final_df) positive_count = len(final_df[final_df['Overall_Sentiment'] == 'Positive']) negative_count = len(final_df[final_df['Overall_Sentiment'] == 'Negative']) neutral_count = 
len(final_df[final_df['Overall_Sentiment'] == 'Neutral']) logger.info(f"Classification Results - Pos: {positive_count}, Neg: {negative_count}, Neu: {neutral_count}") # Average scores avg_positive = float(final_df['combined_pos'].mean()) avg_negative = float(final_df['combined_neg'].mean()) avg_neutral = float(final_df['combined_neu'].mean()) # Overall sentiment if avg_positive > max(avg_negative, avg_neutral): overall_sentiment_label = "Positive" elif avg_negative > max(avg_positive, avg_neutral): overall_sentiment_label = "Negative" else: overall_sentiment_label = "Neutral" # Process negative comments negative_summary = "" negative_comments_list = [] negative_comments = final_df[final_df['Overall_Sentiment'] == 'Negative'] if len(negative_comments) > 0: negative_comments_list = negative_comments['comment'].tolist() try: top_idx = negative_comments['combined_neg'].nlargest(min(3, len(negative_comments))).index top_comments = negative_comments.loc[top_idx, 'comment'].tolist() if settings.use_abstractive_summary and summarizer is not None: negative_text = " ".join(top_comments) if len(negative_text) > 1000: negative_text = negative_text[:1000] summary_result = summarizer( negative_text, max_length=settings.max_summary_length, min_length=settings.min_summary_length, do_sample=False ) negative_summary = summary_result[0]['summary_text'] else: negative_summary = "; ".join(top_comments) except Exception as e: logger.warning(f"Summary generation failed: {e}") negative_summary = "; ".join(negative_comments_list[:3]) # Insights and recommendations insights = [] recommendations = [] if overall_sentiment_label == "Positive": insights.extend([ f"Strong positive feedback: {positive_count}/{total_comments} comments ({round(positive_count/total_comments*100, 1)}%)", "Students are satisfied with the teaching approach", "High engagement and learning outcomes reported" ]) recommendations.extend([ "Continue current effective teaching methods", "Document successful practices for 
future reference", "Share best practices with colleagues" ]) elif overall_sentiment_label == "Negative": insights.extend([ f"Concerns identified: {negative_count}/{total_comments} negative comments ({round(negative_count/total_comments*100, 1)}%)", "Students facing challenges with current approach", "Immediate attention needed to address feedback" ]) recommendations.extend([ "Review and analyze specific negative feedback points", "Consider adjusting teaching pace or methods", "Increase student engagement and support", "Schedule student feedback sessions", "Focus on communication clarity and accessibility" ]) else: insights.extend([ f"Mixed feedback: {positive_count} positive, {negative_count} negative, {neutral_count} neutral", "Room for improvement while maintaining strengths", "Students have varied experiences" ]) recommendations.extend([ "Address specific concerns raised in negative feedback", "Build on positive aspects appreciated by students", "Gather more detailed feedback for neutral areas" ]) # Add pattern-based insights if df['has_strong_negative'].sum() > 0: insights.append(f"{df['has_strong_negative'].sum()} comments contain explicit criticism requiring attention") if df['has_positive'].sum() > 0: insights.append(f"{df['has_positive'].sum()} comments contain strong positive appreciation") return { "total_comments": total_comments, "positive_comments": positive_count, "negative_comments": negative_count, "neutral_comments": neutral_count, "positive_sentiment": round(avg_positive, 3), "negative_sentiment": round(avg_negative, 3), "neutral_sentiment": round(avg_neutral, 3), "overall_sentiment": overall_sentiment_label, "sentiment_distribution": { "positive_percentage": round((positive_count / total_comments) * 100, 1), "negative_percentage": round((negative_count / total_comments) * 100, 1), "neutral_percentage": round((neutral_count / total_comments) * 100, 1) }, "negative_comments_summary": negative_summary, "negative_comments_list": 
negative_comments_list, "key_insights": insights, "recommendations": recommendations, "detailed_analysis": { "vader_scores": { "average_positive": round(final_df['vader_pos'].mean(), 3), "average_negative": round(final_df['vader_neg'].mean(), 3), "average_neutral": round(final_df['vader_neu'].mean(), 3), "average_compound": round(final_df['vader_compound'].mean(), 3) }, "roberta_scores": { "average_positive": round(final_df['roberta_pos'].mean(), 3), "average_negative": round(final_df['roberta_neg'].mean(), 3), "average_neutral": round(final_df['roberta_neu'].mean(), 3) } }, "analysis_timestamp": datetime.utcnow().isoformat() } except Exception as e: logger.error(f"Sentiment analysis failed: {e}", exc_info=True) raise e # ============================================================================ # API ENDPOINTS # ============================================================================ @app.on_event("startup") async def startup_event(): """Initialize models on startup""" try: logger.info("=" * 80) logger.info(f"Application Startup at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") logger.info("=" * 80) initialize_models() logger.info("✓ Service started successfully") logger.info("=" * 80) except Exception as e: logger.error(f"✗ Startup failed: {e}") raise e @app.on_event("shutdown") async def shutdown_event(): """Cleanup on shutdown""" logger.info("Service shutting down") @app.get("/") async def root(): """Root endpoint""" return { "service": get_settings().app_name, "version": get_settings().app_version, "status": "running", "endpoints": { "health": "/health", "analyze": "/analyze-comments", "test": "/test" } } @app.get("/health") async def health_check(): """Health check endpoint""" models_loaded = sia is not None and model is not None and tokenizer is not None return { "status": "healthy" if models_loaded else "unhealthy", "service": "comment-analysis", "version": get_settings().app_version, "models_loaded": models_loaded, "device": device if device else 
"not initialized", "timestamp": datetime.utcnow().isoformat() } @app.post("/analyze-comments", response_model=CommentAnalysisResponse) async def analyze_comments( request: CommentAnalysisRequest, settings: Settings = Depends(get_settings) ): """Analyze comments for sentiment using enhanced multi-stage classification""" try: comments = request.comments faculty_info = request.faculty_info if not comments: return CommentAnalysisResponse( success=False, analysis=None, message="No comments provided for analysis" ) logger.info(f"Analyzing {len(comments)} comments for {faculty_info.faculty_name} ({faculty_info.course_code})") analysis_result = analyze_comments_sentiment(comments) if analysis_result.get("total_comments", 0) == 0: return CommentAnalysisResponse( success=False, analysis=None, message=analysis_result.get("message", "No valid comments to analyze") ) analysis_result["faculty_info"] = { "faculty_name": faculty_info.faculty_name, "staff_id": faculty_info.staff_id, "course_code": faculty_info.course_code, "course_name": faculty_info.course_name } return CommentAnalysisResponse( success=True, analysis=analysis_result, message=f"Successfully analyzed {analysis_result['total_comments']} comments" ) except ValueError as ve: logger.warning(f"Validation error: {ve}") raise HTTPException(status_code=400, detail=str(ve)) except Exception as e: logger.error(f"Analysis failed: {e}", exc_info=True) raise HTTPException(status_code=500, detail="Analysis failed. 
Please try again later.") @app.get("/test") async def test_endpoint(): """Test endpoint with various comment types""" test_cases = [ # Meta-comments (should be Neutral) "No negative comments", "Everything is good", "Nothing to say", "Nil", # Strong Negative (should be Negative) "Very poor teaching quality", "Boring class, waste of time", "Cannot understand anything", "Teaching is terrible and voice is too low", "Poor knowledge and bad teaching method", # Positive (should be Positive) "Excellent teacher with great knowledge", "Very helpful and explains clearly", "Amazing teaching style, learned a lot", "Best professor, highly recommend", # Weak negative/Neutral "Could be better", "Sometimes hard to understand", "Overall good but too lag", # Mixed "Good teacher but classes are boring", "Knowledgeable but voice is low" ] results = [] for text in test_cases: is_meta = is_meta_comment(text) has_strong_neg = detect_strong_negative(text) has_pos = detect_positive(text) has_weak_neg = detect_weak_negative(text) # Predict if is_meta: predicted = "Neutral (meta-comment)" elif has_strong_neg: predicted = "Negative (strong pattern)" elif has_pos and not has_strong_neg: predicted = "Positive (likely)" elif has_weak_neg and not has_strong_neg: predicted = "Neutral/Negative (weak)" else: predicted = "Requires full analysis" results.append({ "text": text, "is_meta": is_meta, "strong_negative": has_strong_neg, "positive": has_pos, "weak_negative": has_weak_neg, "predicted": predicted }) return { "test_results": results, "note": "Predictions based on pattern matching. Full analysis uses VADER + RoBERTa ensemble." 
} if __name__ == "__main__": uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info") # """ # Enhanced FastAPI Service for Comment Sentiment Analysis # with improved performance, validation, and configuration management # Version 2.1.0 - Updated with bug fixes and improvements # """ # from fastapi import FastAPI, HTTPException, Depends # from fastapi.middleware.cors import CORSMiddleware # from pydantic import BaseModel, Field, validator # from pydantic_settings import BaseSettings # from typing import List, Dict, Any, Optional # from functools import lru_cache # import uvicorn # import pandas as pd # import numpy as np # import os # import re # from datetime import datetime # import logging # # Configure logging FIRST # logging.basicConfig( # level=logging.INFO, # format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' # ) # logger = logging.getLogger(__name__) # # CRITICAL: Download NLTK data BEFORE importing NLTK components # import nltk # import ssl # try: # _create_unverified_https_context = ssl._create_unverified_context # except AttributeError: # pass # else: # ssl._create_default_https_context = _create_unverified_https_context # # Set NLTK data path # nltk_data_dir = '/tmp/nltk_data' # os.makedirs(nltk_data_dir, exist_ok=True) # nltk.data.path.insert(0, nltk_data_dir) # # Download required NLTK data # def ensure_nltk_data(): # """Ensure all required NLTK data is downloaded""" # resources = ['vader_lexicon', 'punkt', 'stopwords', 'wordnet', 'omw-1.4'] # for resource in resources: # try: # # Try to find the resource # if resource == 'vader_lexicon': # nltk.data.find('sentiment/vader_lexicon.zip') # elif resource == 'punkt': # nltk.data.find('tokenizers/punkt') # elif resource in ['stopwords', 'wordnet', 'omw-1.4']: # nltk.data.find(f'corpora/{resource}') # logger.info(f"✓ NLTK resource '{resource}' already available") # except LookupError: # logger.info(f"Downloading NLTK resource '{resource}'...") # try: # nltk.download(resource, 
download_dir=nltk_data_dir, quiet=False) # logger.info(f"✓ Successfully downloaded '{resource}'") # except Exception as e: # logger.error(f"✗ Failed to download '{resource}': {e}") # # Download NLTK data immediately # logger.info("Ensuring NLTK data is available...") # ensure_nltk_data() # # NOW import NLTK components # from nltk.sentiment import SentimentIntensityAnalyzer # # Import transformers after NLTK setup # from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline # from scipy.special import softmax # import torch # # Configuration Management # class Settings(BaseSettings): # """Application settings with environment variable support""" # # API Settings # app_name: str = "Comment Analysis API" # app_version: str = "2.1.0" # debug_mode: bool = False # # Request Limits # max_comments_per_request: int = 1000 # max_comment_length: int = 5000 # min_comment_words: int = 1 # # Sentiment Thresholds # vader_pos_threshold: float = 0.2 # vader_neg_threshold: float = -0.2 # roberta_pos_threshold: float = 0.55 # roberta_neg_threshold: float = 0.45 # combined_weight_vader: float = 0.5 # combined_weight_roberta: float = 0.5 # # Model Settings # model_cache_dir: str = "/tmp/model_cache" # roberta_model_name: str = "cardiffnlp/twitter-roberta-base-sentiment" # use_abstractive_summary: bool = False # summarizer_model: str = "facebook/bart-large-cnn" # max_summary_length: int = 100 # min_summary_length: int = 25 # # Performance # enable_caching: bool = True # cache_size: int = 500 # batch_size: int = 32 # class Config: # env_file = ".env" # env_file_encoding = 'utf-8' # extra = 'ignore' # @validator('min_comment_words') # def validate_min_words(cls, v): # if v < 0: # raise ValueError('min_comment_words must be non-negative') # return v # @validator('combined_weight_vader', 'combined_weight_roberta') # def validate_weights(cls, v): # if not 0 <= v <= 1: # raise ValueError('Weights must be between 0 and 1') # return v # @lru_cache() # def 
get_settings() -> Settings: # """Cached settings instance""" # settings = Settings() # # Normalize weights if needed # total = settings.combined_weight_vader + settings.combined_weight_roberta # if not (0.99 <= total <= 1.01): # logger.warning(f"Weights sum to {total}, normalizing to 1.0") # settings.combined_weight_vader /= total # settings.combined_weight_roberta /= total # return settings # # Pydantic Models # class FacultyInfo(BaseModel): # faculty_name: str = Field(..., min_length=1, max_length=200) # staff_id: str = Field(..., min_length=1, max_length=50) # course_code: str = Field(..., min_length=1, max_length=50) # course_name: str = Field(..., min_length=1, max_length=200) # class CommentAnalysisRequest(BaseModel): # comments: List[str] = Field(..., min_items=1) # faculty_info: FacultyInfo # @validator('comments') # def validate_comments(cls, v): # settings = get_settings() # if len(v) > settings.max_comments_per_request: # raise ValueError( # f'Maximum {settings.max_comments_per_request} comments per request' # ) # for idx, comment in enumerate(v): # if len(comment) > settings.max_comment_length: # raise ValueError( # f'Comment {idx} exceeds maximum length of {settings.max_comment_length} characters' # ) # return v # class SentimentDistribution(BaseModel): # positive_percentage: float # negative_percentage: float # neutral_percentage: float # class DetailedScores(BaseModel): # average_positive: float # average_negative: float # average_neutral: float # average_compound: Optional[float] = None # class DetailedAnalysis(BaseModel): # vader_scores: DetailedScores # roberta_scores: DetailedScores # class AnalysisResult(BaseModel): # total_comments: int # positive_comments: int # negative_comments: int # neutral_comments: int # positive_sentiment: float # negative_sentiment: float # neutral_sentiment: float # overall_sentiment: str # sentiment_distribution: SentimentDistribution # negative_comments_summary: str # negative_comments_list: List[str] # 
key_insights: List[str] # recommendations: List[str] # detailed_analysis: DetailedAnalysis # faculty_info: Dict[str, str] # analysis_timestamp: str # class CommentAnalysisResponse(BaseModel): # success: bool # analysis: Optional[AnalysisResult] = None # message: str # # Initialize FastAPI app # app = FastAPI( # title=get_settings().app_name, # version=get_settings().app_version, # description="Advanced sentiment analysis service for educational feedback" # ) # # Add CORS middleware # app.add_middleware( # CORSMiddleware, # allow_origins=["*"], # allow_credentials=True, # allow_methods=["*"], # allow_headers=["*"], # ) # # Global variables for models # sia = None # tokenizer = None # model = None # device = None # summarizer = None # # Enhanced heuristic phrase/regex rules for explicit negative feedback # NEGATIVE_PHRASES = [ # # Teaching quality issues # 'very poor', # 'extremely poor', # 'poor in teaching', # 'poor teaching level', # 'poor teaching', # 'bad teacher', # 'bad teaching', # 'not good', # Keep but check it's not "no negative" # 'not satisfied', # 'not satisfactory', # # Content/delivery issues # 'boring class', # 'boring classes', # 'boring subject', # 'subject is boring', # 'low voice', # 'voice is low', # 'cannot hear', # "can't hear", # 'speak louder', # # Resource/support issues # 'need more staff', # 'need more faculty', # 'insufficient staff', # 'lack of staff', # 'not sufficient', # 'insufficient', # 'not enough', # 'no classes', # 'no regular classes', # 'not sufficient classes', # # Knowledge/understanding issues # 'lack of knowledge', # 'better knowledge needed', # 'poor knowledge', # 'knowledge is lacking', # 'practical knowledge lacking', # 'no practical', # 'lack of practical', # 'no hands-on', # 'no real world', # 'did not understand', # "didn't understand", # 'not able to understand', # 'unable to understand', # 'difficult to understand', # 'hard to understand', # 'concepts are difficult', # 'concepts difficult', # 'cant understand', # 
"can't understand", # 'not understandable', # # Improvement needed # 'improve class', # 'improvement needed', # 'needs improvement', # 'need improvement', # 'should improve', # 'must improve', # 'not helpful', # 'not clear', # 'communication skills need improvement', # 'improve communication', # # Pace/time issues # 'lectures are going fast', # 'going too fast', # 'too fast', # 'too slow', # 'too lag', # 'lag', # 'lagging', # 'lag in teaching', # 'not managing time', # 'poor time management', # 'time management issue', # # Engagement issues # 'not interested', # 'no interest', # 'going for attendance', # 'just for attendance', # 'only for attendance', # 'not at all', # 'nothing learnt', # 'learned nothing', # 'no improvement', # 'same teaching', # 'monotonous', # 'sleeping in class', # # Value/utility issues # 'waste of time', # 'wasting time', # 'waste our time', # 'no use', # 'useless', # # Administrative issues # 'military rules', # 'strict rules', # 'too strict', # 'very strict', # 'attendance issue', # 'attendance problem', # 'not providing attendance', # 'claim od', # # Workload issues # 'too many projects', # 'many projects review', # 'trouble to make', # 'difficult to make', # 'hard to make', # 'placement activities', # When context is negative # ] # NEGATIVE_REGEXES = [ # # Teaching quality patterns # re.compile(r"\b(very|extremely|quite|so)\s+(poor|bad|weak)\s+(in\s+)?(teaching|knowledge|communication)", re.IGNORECASE), # re.compile(r"\bpoor\s+(teaching|teacher|faculty|knowledge|communication)", re.IGNORECASE), # re.compile(r"\b(teaching|knowledge)\s+(is\s+)?(poor|bad|weak|lacking)", re.IGNORECASE), # # Boring/engagement patterns # re.compile(r"\b(boring|dull|monotonous)\s+(class|classes|subject|lecture|lectures)", re.IGNORECASE), # re.compile(r"\b(class|classes|subject|lecture|lectures)\s+(is|are)\s+(boring|dull|monotonous)", re.IGNORECASE), # # Voice/communication patterns # re.compile(r"\b(low|soft|quiet)\s+voice\b", re.IGNORECASE), # 
re.compile(r"\bvoice\s+(is\s+)?(low|soft|quiet|not clear)", re.IGNORECASE), # re.compile(r"\b(cannot|can't|cant|unable to)\s+hear", re.IGNORECASE), # # Resource/support patterns # re.compile(r"\b(no|not|insufficient|lack of)\s+(proper|sufficient|enough|regular)?\s*(classes|notes|support|staff|faculty)", re.IGNORECASE), # re.compile(r"\bneed(s)?\s+more\s+(staff|faculty|support|classes)", re.IGNORECASE), # # Understanding/clarity patterns # re.compile(r"\b(cannot|can't|cant|unable to|difficult to|hard to)\s+understand", re.IGNORECASE), # re.compile(r"\b(not|difficult|hard)\s+(able\s+to\s+)?understand(\s+the)?(\s+(concepts?|teaching|lectures?))?", re.IGNORECASE), # re.compile(r"\bconcepts?\s+(are\s+)?(difficult|hard|tough|complex)\s+to\s+understand", re.IGNORECASE), # # Improvement patterns # re.compile(r"\b(need|needs|needed|require|requires)\s+(some\s+)?(improvement|to improve)", re.IGNORECASE), # re.compile(r"\b(should|must|have to)\s+improve", re.IGNORECASE), # re.compile(r"\bimprovement\s+(is\s+)?need(ed)?", re.IGNORECASE), # # Pace patterns # re.compile(r"\b(lecture|lectures|class|classes|teaching)\s+(is|are|going)\s+(too|very)\s+(fast|slow)", re.IGNORECASE), # re.compile(r"\b(too|very)\s+(fast|slow|lag|lagging)", re.IGNORECASE), # # Time management patterns # re.compile(r"\b(not|poor|bad)\s+(managing|managing)\s+time", re.IGNORECASE), # re.compile(r"\btime\s+management\s+(is\s+)?(poor|bad|lacking)", re.IGNORECASE), # # Attendance/engagement patterns # re.compile(r"\b(just|only)\s+(for|going for)\s+attendance", re.IGNORECASE), # re.compile(r"\b(going|attend|attending)\s+(to|for)\s+(her|his|their)\s+class\s+(just|only)\s+for\s+attendance", re.IGNORECASE), # re.compile(r"\bnot\s+(at\s+all\s+)?(interested|engaging|helpful)", re.IGNORECASE), # # Value patterns # re.compile(r"\b(waste|wasting)\s+(of\s+)?time", re.IGNORECASE), # re.compile(r"\b(no\s+use|useless|not useful)", re.IGNORECASE), # # Workload patterns # re.compile(r"\b(too\s+)?many\s+projects", 
re.IGNORECASE), # re.compile(r"\btrouble\s+to\s+(make|complete|do)", re.IGNORECASE), # # Administrative patterns # re.compile(r"\bmilitary\s+rules", re.IGNORECASE), # re.compile(r"\b(too|very)\s+strict", re.IGNORECASE), # re.compile(r"\battendance\s+(issue|problem)", re.IGNORECASE), # re.compile(r"\bnot\s+providing\s+attendance", re.IGNORECASE), # re.compile(r"\bclaim\s+od", re.IGNORECASE), # # Placement/scheduling patterns # re.compile(r"\bplacement\s+activities\s+(and|with)\s+(attendance|issue|problem)", re.IGNORECASE), # re.compile(r"\b(class|classes)\s+(intersecting|conflicting)\s+with\s+placement", re.IGNORECASE), # ] # META_COMMENT_PATTERNS = [ # re.compile(r"^no\s+negative\s+(comments?|feedback|remarks?)", re.IGNORECASE), # re.compile(r"^no\s+negative\s+comments?\s+on\s+the\s+(faculty|teacher|staff|course)", re.IGNORECASE), # re.compile(r"^no\s+(issues?|problems?|complaints?)\.?$", re.IGNORECASE), # re.compile(r"^no\s+(issues?|problems?|complaints?)\s+(at\s+all|whatsoever)", re.IGNORECASE), # # "Everything is good" patterns # re.compile(r"^(everything|all)\s+(is\s+)?(good|fine|ok|okay|great|perfect|excellent)", re.IGNORECASE), # re.compile(r"^no,?\s+(everything|all)\s+(is\s+)?(good|fine|ok|okay)", re.IGNORECASE), # re.compile(r"^(all\s+)?good\.?$", re.IGNORECASE), # re.compile(r"^everything\s+at\s+the\s+too\s+only", re.IGNORECASE), # From your data # # "Nothing" patterns # re.compile(r"^nothing\.?$", re.IGNORECASE), # re.compile(r"^nothing\s+(to\s+)?(say|comment|mention|add)", re.IGNORECASE), # re.compile(r"^nothing,?\s+(and\s+)?(all|everything)\s+(is\s+)?(good|fine)", re.IGNORECASE), # # "No more comments" patterns # re.compile(r"^no\s+more\s+(comments?|remarks?|feedback)", re.IGNORECASE), # re.compile(r"^no\s+(other\s+)?(comments?|remarks?|feedback)", re.IGNORECASE), # re.compile(r"^no\s+remarks?(\s+(about|on))?", re.IGNORECASE), # # Empty/nil responses # re.compile(r"^(nil|none|na|n/a|nill)\.?$", re.IGNORECASE), # re.compile(r"^(no|nothing|none)\.?$", 
re.IGNORECASE), # # Positive meta-comments (not actual feedback) # re.compile(r"^(it's\s+|its\s+)?(all\s+)?good\.?$", re.IGNORECASE), # re.compile(r"^fine\.?$", re.IGNORECASE), # re.compile(r"^ok(ay)?\.?$", re.IGNORECASE), # re.compile(r"^great\.?$", re.IGNORECASE), # re.compile(r"^nice\.?$", re.IGNORECASE), # ] # def is_meta_comment(text: str) -> bool: # """ # Check if comment is a meta-comment (not actual feedback). # These are generic statements that don't provide substantive feedback. # """ # if not text: # return True # Empty text is meta # text = text.strip() # # Check length - very short comments are likely meta # if len(text) < 3: # logger.debug(f"Meta-comment (too short): '{text}'") # return True # # Check against patterns # for pattern in META_COMMENT_PATTERNS: # if pattern.match(text): # logger.debug(f"Meta-comment detected: '{text[:50]}...'") # return True # return False # def is_explicit_negative(text: str) -> bool: # """ # Check if text contains explicit negative phrases. # IMPORTANT: Must check if it's a meta-comment FIRST. 
# """ # if not text: # return False # # CRITICAL: Don't classify meta-comments as negative # if is_meta_comment(text): # return False # lower = text.lower() # # Check phrases # for phrase in NEGATIVE_PHRASES: # if phrase in lower: # # Double-check it's not a false positive like "no negative comments" # if phrase == 'not good' and 'no negative' in lower: # continue # if phrase == 'no interest' and 'no negative' in lower: # continue # logger.debug(f"Negative phrase detected: '{phrase}' in '{text[:50]}...'") # return True # # Check regexes # for regex in NEGATIVE_REGEXES: # if regex.search(text): # logger.debug(f"Negative pattern matched: {regex.pattern} in '{text[:50]}...'") # return True # return False # def initialize_models(): # """Initialize sentiment analysis models with caching support""" # global sia, tokenizer, model, device, summarizer # try: # settings = get_settings() # logger.info("Initializing sentiment analysis models...") # # Initialize VADER (NLTK data already downloaded) # sia = SentimentIntensityAnalyzer() # logger.info("✓ VADER initialized") # # Initialize RoBERTa with caching # cache_dir = settings.model_cache_dir # os.makedirs(cache_dir, exist_ok=True) # tokenizer = AutoTokenizer.from_pretrained( # settings.roberta_model_name, # cache_dir=cache_dir # ) # model = AutoModelForSequenceClassification.from_pretrained( # settings.roberta_model_name, # cache_dir=cache_dir # ) # device = "cuda" if torch.cuda.is_available() else "cpu" # model.to(device) # model.eval() # logger.info(f"✓ RoBERTa initialized on device: {device}") # # Initialize summarizer (optional) # if settings.use_abstractive_summary: # try: # summarizer = pipeline( # "summarization", # model=settings.summarizer_model, # device=0 if device == "cuda" else -1 # ) # logger.info("✓ Summarizer initialized") # except Exception as e: # logger.warning(f"Summarizer initialization failed: {e}") # summarizer = None # logger.info("✓ All models initialized successfully") # except Exception as e: # 
logger.error(f"Error initializing models: {e}") # raise e # @lru_cache(maxsize=500) # def vader_sentiment_cached(text: str) -> tuple: # """Cached VADER sentiment analysis""" # scores = sia.polarity_scores(text) # return (scores['neg'], scores['neu'], scores['pos'], scores['compound']) # def vader_sentiment(text: str) -> Dict[str, float]: # """VADER sentiment analysis with caching support""" # try: # settings = get_settings() # if settings.enable_caching: # neg, neu, pos, compound = vader_sentiment_cached(text) # return { # 'vader_neg': neg, # 'vader_neu': neu, # 'vader_pos': pos, # 'vader_compound': compound # } # else: # scores = sia.polarity_scores(text) # return { # 'vader_neg': scores['neg'], # 'vader_neu': scores['neu'], # 'vader_pos': scores['pos'], # 'vader_compound': scores['compound'] # } # except Exception as e: # logger.warning(f"VADER analysis failed for text: {e}") # return {'vader_neg': 0.0, 'vader_neu': 1.0, 'vader_pos': 0.0, 'vader_compound': 0.0} # def roberta_sentiment_batch(texts: List[str]) -> List[Dict[str, float]]: # """Batch RoBERTa sentiment analysis for better performance""" # try: # settings = get_settings() # results = [] # for i in range(0, len(texts), settings.batch_size): # batch = texts[i:i + settings.batch_size] # encoded = tokenizer( # batch, # return_tensors='pt', # truncation=True, # max_length=512, # padding=True # ) # encoded = {k: v.to(device) for k, v in encoded.items()} # with torch.no_grad(): # outputs = model(**encoded) # for output in outputs.logits: # scores = softmax(output.cpu().numpy()) # results.append({ # 'roberta_neg': float(scores[0]), # 'roberta_neu': float(scores[1]), # 'roberta_pos': float(scores[2]) # }) # return results # except Exception as e: # logger.warning(f"RoBERTa batch analysis failed: {e}") # return [{'roberta_neg': 0.0, 'roberta_neu': 1.0, 'roberta_pos': 0.0} for _ in texts] # def roberta_sentiment(text: str) -> Dict[str, float]: # """Single text RoBERTa sentiment analysis""" # try: # encoded_text = 
tokenizer(text, return_tensors='pt', truncation=True, max_length=512) # encoded_text = {k: v.to(device) for k, v in encoded_text.items()} # with torch.no_grad(): # output = model(**encoded_text) # scores = softmax(output[0][0].cpu().numpy()) # return { # 'roberta_neg': float(scores[0]), # 'roberta_neu': float(scores[1]), # 'roberta_pos': float(scores[2]) # } # except Exception as e: # logger.warning(f"RoBERTa analysis failed for text: {e}") # return {'roberta_neg': 0.0, 'roberta_neu': 1.0, 'roberta_pos': 0.0} # def overall_sentiment(row: pd.Series, settings: Settings) -> str: # """Determine overall sentiment using combined scores with configurable thresholds""" # combined_pos = row.get('combined_pos', 0.0) # combined_neg = row.get('combined_neg', 0.0) # combined_neu = row.get('combined_neu', 0.0) # vader_compound = row.get('vader_compound', 0.0) # roberta_neg = row.get('roberta_neg', 0.0) # roberta_pos = row.get('roberta_pos', 0.0) # # Priority 1: Heuristic negative patterns override everything # if row.get('heuristic_negative') is True: # return 'Negative' # # Priority 2: Strong negative signals # if ( # vader_compound <= settings.vader_neg_threshold or # roberta_neg >= settings.roberta_neg_threshold or # combined_neg >= max(combined_pos, combined_neu) # ): # return 'Negative' # # Priority 3: Positive signals # if ( # vader_compound >= settings.vader_pos_threshold or # roberta_pos >= settings.roberta_pos_threshold or # combined_pos >= max(combined_neg, combined_neu) # ): # return 'Positive' # # Default: Neutral # return 'Neutral' # def sanitize_text(text: str) -> str: # """Sanitize input text while preserving emojis""" # if not text: # return "" # # Remove control characters but keep printable characters and emojis # text = re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]', '', text) # # Normalize whitespace # text = ' '.join(text.split()) # return text.strip() # def analyze_comments_sentiment(comments: List[str]) -> Dict[str, Any]: # """Main sentiment analysis function 
with enhanced performance""" # try: # settings = get_settings() # logger.info(f"Received {len(comments)} comments for analysis") # # Sanitize comments # sanitized_comments = [sanitize_text(comment) for comment in comments] # # FIXED: Changed < to <= to properly handle min_comment_words # filtered_comments = [ # comment for comment in sanitized_comments # if (settings.min_comment_words <= len(comment.split()) <= settings.max_comment_length) # ] # logger.info(f"After filtering: {len(filtered_comments)} valid comments") # if not filtered_comments: # return { # "total_comments": 0, # "message": "No valid comments found for analysis" # } # # Create dataframe # df = pd.DataFrame({'comment': filtered_comments}) # # Detect meta-comments and explicit negatives # df['is_meta'] = df['comment'].apply(is_meta_comment) # df['heuristic_negative'] = df['comment'].apply(is_explicit_negative) # # Log detection results # meta_count = df['is_meta'].sum() # heuristic_neg_count = df['heuristic_negative'].sum() # logger.info(f"Detected {meta_count} meta-comments and {heuristic_neg_count} heuristic negatives") # # VADER sentiment analysis # vader_results = [] # for text in df['comment']: # vader_results.append(vader_sentiment(text)) # # RoBERTa sentiment analysis (batch) # roberta_results = roberta_sentiment_batch(df['comment'].tolist()) # # Combine results # vader_df = pd.DataFrame(vader_results) # roberta_df = pd.DataFrame(roberta_results) # final_df = pd.concat([df.reset_index(drop=True), vader_df, roberta_df], axis=1) # # Calculate combined scores # final_df['combined_pos'] = ( # settings.combined_weight_vader * final_df['vader_pos'] + # settings.combined_weight_roberta * final_df['roberta_pos'] # ) # final_df['combined_neg'] = ( # settings.combined_weight_vader * final_df['vader_neg'] + # settings.combined_weight_roberta * final_df['roberta_neg'] # ) # final_df['combined_neu'] = ( # settings.combined_weight_vader * final_df['vader_neu'] + # settings.combined_weight_roberta * 
final_df['roberta_neu'] # ) # # Classify overall sentiment (meta-comments become Neutral) # final_df['Overall_Sentiment'] = final_df.apply( # lambda row: 'Neutral' if row.get('is_meta') else overall_sentiment(row, settings), # axis=1 # ) # # Calculate statistics # total_comments = len(final_df) # positive_count = len(final_df[final_df['Overall_Sentiment'] == 'Positive']) # negative_count = len(final_df[final_df['Overall_Sentiment'] == 'Negative']) # neutral_count = len(final_df[final_df['Overall_Sentiment'] == 'Neutral']) # logger.info( # f"Results: {positive_count} positive, " # f"{negative_count} negative, {neutral_count} neutral" # ) # # Average scores # avg_positive = float(final_df['combined_pos'].mean()) # avg_negative = float(final_df['combined_neg'].mean()) # avg_neutral = float(final_df['combined_neu'].mean()) # # Determine overall sentiment label # if avg_positive > max(avg_negative, avg_neutral): # overall_sentiment_label = "Positive" # elif avg_negative > max(avg_positive, avg_neutral): # overall_sentiment_label = "Negative" # else: # overall_sentiment_label = "Neutral" # # Process negative comments # negative_summary = "" # negative_comments_list = [] # negative_comments = final_df[final_df['Overall_Sentiment'] == 'Negative'] # if len(negative_comments) > 0: # negative_comments_list = negative_comments['comment'].tolist() # try: # # Get top negative comments # top_idx = negative_comments['combined_neg'].nlargest(3).index # top_comments = negative_comments.loc[top_idx, 'comment'].tolist() # if settings.use_abstractive_summary and summarizer is not None: # negative_text = " ".join(top_comments) # if len(negative_text) > 1000: # negative_text = negative_text[:1000] # summary_result = summarizer( # negative_text, # max_length=settings.max_summary_length, # min_length=settings.min_summary_length, # do_sample=False # ) # negative_summary = summary_result[0]['summary_text'] # else: # # Extractive summary # negative_summary = "; ".join(top_comments) # except 
Exception as e: # logger.warning(f"Summary generation failed: {e}") # negative_summary = "; ".join(negative_comments_list[:3]) # # Generate insights and recommendations # insights = [] # recommendations = [] # if overall_sentiment_label == "Positive": # insights.extend([ # "Students have positive feedback overall", # "Teaching methods are well-received", # f"{positive_count}/{total_comments} comments are positive" # ]) # recommendations.extend([ # "Continue current teaching approach", # "Maintain student engagement strategies", # "Share successful practices with colleagues" # ]) # elif overall_sentiment_label == "Negative": # insights.extend([ # "Students have concerns that need attention", # "Some aspects of teaching may need improvement", # f"{negative_count}/{total_comments} comments indicate issues" # ]) # recommendations.extend([ # "Review teaching methods and materials", # "Consider additional student support", # "Schedule meetings to address student concerns", # "Focus on areas mentioned in negative feedback" # ]) # else: # insights.extend([ # "Mixed feedback from students", # "Some areas performing well, others need attention", # "Balance of positive and negative responses" # ]) # recommendations.extend([ # "Focus on areas with negative feedback", # "Maintain strengths while addressing weaknesses", # "Gather more specific feedback on improvement areas" # ]) # return { # "total_comments": total_comments, # "positive_comments": positive_count, # "negative_comments": negative_count, # "neutral_comments": neutral_count, # "positive_sentiment": round(avg_positive, 3), # "negative_sentiment": round(avg_negative, 3), # "neutral_sentiment": round(avg_neutral, 3), # "overall_sentiment": overall_sentiment_label, # "sentiment_distribution": { # "positive_percentage": round((positive_count / total_comments) * 100, 1), # "negative_percentage": round((negative_count / total_comments) * 100, 1), # "neutral_percentage": round((neutral_count / total_comments) * 100, 1) # }, 
# "negative_comments_summary": negative_summary, # "negative_comments_list": negative_comments_list, # "key_insights": insights, # "recommendations": recommendations, # "detailed_analysis": { # "vader_scores": { # "average_positive": round(final_df['vader_pos'].mean(), 3), # "average_negative": round(final_df['vader_neg'].mean(), 3), # "average_neutral": round(final_df['vader_neu'].mean(), 3), # "average_compound": round(final_df['vader_compound'].mean(), 3) # }, # "roberta_scores": { # "average_positive": round(final_df['roberta_pos'].mean(), 3), # "average_negative": round(final_df['roberta_neg'].mean(), 3), # "average_neutral": round(final_df['roberta_neu'].mean(), 3) # } # }, # "analysis_timestamp": datetime.utcnow().isoformat() # } # except Exception as e: # logger.error(f"Sentiment analysis failed: {e}", exc_info=True) # raise e # @app.on_event("startup") # async def startup_event(): # """Initialize models on startup""" # try: # logger.info("=" * 80) # logger.info(f"Application Startup at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") # logger.info("=" * 80) # initialize_models() # logger.info("✓ Service started successfully") # logger.info("=" * 80) # except Exception as e: # logger.error(f"✗ Startup failed: {e}") # raise e # @app.on_event("shutdown") # async def shutdown_event(): # """Cleanup on shutdown""" # logger.info("Service shutting down") # @app.get("/") # async def root(): # """Root endpoint""" # return { # "service": get_settings().app_name, # "version": get_settings().app_version, # "status": "running", # "endpoints": { # "health": "/health", # "analyze": "/analyze-comments", # "config": "/config (debug mode only)", # "test": "/test" # } # } # @app.get("/health") # async def health_check(): # """Health check endpoint""" # models_loaded = sia is not None and model is not None and tokenizer is not None # return { # "status": "healthy" if models_loaded else "unhealthy", # "service": "comment-analysis", # "version": get_settings().app_version, # 
"models_loaded": models_loaded, # "device": device if device else "not initialized", # "timestamp": datetime.utcnow().isoformat() # } # @app.post("/analyze-comments", response_model=CommentAnalysisResponse) # async def analyze_comments( # request: CommentAnalysisRequest, # settings: Settings = Depends(get_settings) # ): # """ # Analyze comments for sentiment analysis using VADER and RoBERTa models # """ # try: # comments = request.comments # faculty_info = request.faculty_info # if not comments: # return CommentAnalysisResponse( # success=False, # analysis=None, # message="No comments provided for analysis" # ) # logger.info( # f"Analyzing {len(comments)} comments for " # f"{faculty_info.faculty_name} ({faculty_info.course_code})" # ) # analysis_result = analyze_comments_sentiment(comments) # if analysis_result.get("total_comments", 0) == 0: # return CommentAnalysisResponse( # success=False, # analysis=None, # message=analysis_result.get("message", "No valid comments to analyze") # ) # analysis_result["faculty_info"] = { # "faculty_name": faculty_info.faculty_name, # "staff_id": faculty_info.staff_id, # "course_code": faculty_info.course_code, # "course_name": faculty_info.course_name # } # return CommentAnalysisResponse( # success=True, # analysis=analysis_result, # message=f"Successfully analyzed {analysis_result['total_comments']} comments" # ) # except ValueError as ve: # logger.warning(f"Validation error: {ve}") # raise HTTPException(status_code=400, detail=str(ve)) # except Exception as e: # logger.error(f"Analysis failed: {e}", exc_info=True) # raise HTTPException( # status_code=500, # detail="Analysis failed. Please try again later." 
# ) # @app.get("/config") # async def get_config(settings: Settings = Depends(get_settings)): # """Get current configuration (debug mode only)""" # if not settings.debug_mode: # raise HTTPException(status_code=404, detail="Not found") # return { # "max_comments_per_request": settings.max_comments_per_request, # "max_comment_length": settings.max_comment_length, # "min_comment_words": settings.min_comment_words, # "vader_pos_threshold": settings.vader_pos_threshold, # "vader_neg_threshold": settings.vader_neg_threshold, # "roberta_pos_threshold": settings.roberta_pos_threshold, # "roberta_neg_threshold": settings.roberta_neg_threshold, # "combined_weight_vader": settings.combined_weight_vader, # "combined_weight_roberta": settings.combined_weight_roberta, # "enable_caching": settings.enable_caching, # "batch_size": settings.batch_size, # "use_abstractive_summary": settings.use_abstractive_summary # } # @app.get("/test") # async def test_endpoint(): # """Test endpoint to verify sentiment classification""" # test_cases = [ # "No more comments 😅", # "Overall good but too lag", # "Not interested to be in her class just we are going for attendance thats it not at all managing time.", # "Nothing to say anything just we are going to her class mean, only for attendance", # "Excellent teaching! 
Very clear explanations.", # "Good teacher with strong subject knowledge", # "Class is okay, nothing special" # ] # results = [] # for text in test_cases: # is_meta = is_meta_comment(text) # is_neg = is_explicit_negative(text) # # Predict classification # if is_meta: # predicted = "Neutral (meta-comment)" # elif is_neg: # predicted = "Negative (heuristic)" # else: # predicted = "Needs full analysis" # results.append({ # "text": text, # "is_meta_comment": is_meta, # "is_heuristic_negative": is_neg, # "predicted_classification": predicted # }) # return { # "test_results": results, # "note": "Full analysis requires VADER and RoBERTa scores" # } # if __name__ == "__main__": # uvicorn.run( # app, # host="0.0.0.0", # port=8000, # log_level="info" # )