# mca_comment_analyzer.py
"""Analyze free-text consultation comments for the MCA pipeline.

Combines a DistilBERT sentiment classifier (with keyword-based business
overrides), a T5-small summarizer, and Counter-based keyword extraction,
producing a per-comment DataFrame plus a keyword-frequency table and an
optional word cloud.
"""
import os
import pandas as pd
import torch
from transformers import pipeline
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import nltk
import random
from datetime import datetime, timedelta

# -----------------------------
# Configs
# -----------------------------
# Redirect matplotlib/NLTK caches to /tmp (writable in sandboxed deploys).
os.environ["MPLCONFIGDIR"] = "/tmp/.matplotlib"
os.environ["NLTK_DATA"] = "/tmp/nltk_data"

# NLTK Stopwords
nltk.download('stopwords', download_dir="/tmp/nltk_data", quiet=True)
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))


# -----------------------------
# MCA Comment Analyzer
# -----------------------------
class MCACommentAnalyzer:
    """Classifies, summarizes and extracts keywords from comment text."""

    def __init__(self):
        # HF pipelines use device index 0 for the first GPU, -1 for CPU.
        device = 0 if torch.cuda.is_available() else -1
        print("Using device:", "GPU" if device == 0 else "CPU")
        # Sentiment model
        self.sentiment_model = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english",
            device=device
        )
        # Summarizer
        self.summarizer = pipeline(
            "summarization",
            model="t5-small",
            device=device
        )
        self.stop_words = STOPWORDS

    def map_sentiment(self, pred, text):
        """Map a model prediction + raw text to a business label.

        Keyword overrides take strict priority over the model output:
        Negative -> Violation -> Suggestion -> Positive. If no keyword
        matches, fall back to the DistilBERT label, defaulting to
        "Neutral" for anything other than POSITIVE/NEGATIVE.

        Args:
            pred: one prediction dict from the sentiment pipeline
                  (must contain a 'label' key).
            text: the original comment string.

        Returns:
            One of "Negative", "Violation", "Suggestion", "Positive",
            "Neutral".
        """
        text_lower = text.lower()

        # -----------------------------
        # Strict priority: Negative -> Violation -> Suggestion -> Positive
        # -----------------------------
        negative_keywords = ["confusing", "unclear", "bad", "problem"]
        violation_keywords = ["violation", "violates", "illegal", "non-compliant"]
        suggestion_keywords = ["should", "recommend", "suggest", "advise", "better if"]
        positive_keywords = ["clear", "helpful", "good", "appreciate", "support"]

        if any(w in text_lower for w in negative_keywords):
            return "Negative"
        if any(w in text_lower for w in violation_keywords):
            return "Violation"
        if any(w in text_lower for w in suggestion_keywords):
            return "Suggestion"
        if any(w in text_lower for w in positive_keywords):
            return "Positive"

        # fallback to sentiment model
        label = pred['label'].upper()
        if label == "POSITIVE":
            return "Positive"
        elif label == "NEGATIVE":
            return "Negative"
        else:
            return "Neutral"

    def process_comment(self, comment):
        """Analyze a single comment.

        Returns:
            (sentiment, summary_text, keywords, top_keywords) where
            ``keywords`` is the list of unique non-stopword alphabetic
            tokens in first-seen order, and ``top_keywords`` is a
            comma-joined string of the three most frequent of them.
        """
        pred = self.sentiment_model(comment)[0]
        sentiment = self.map_sentiment(pred, comment)

        # Short comments are not worth a T5 round-trip; use the text itself.
        if len(comment.split()) < 10:
            summary_text = " ".join(comment.split()[:10])
        else:
            try:
                summary_text = self.summarizer(
                    comment, max_length=30, min_length=5, do_sample=False
                )[0]['summary_text']
            except Exception:
                # BUGFIX: was a bare `except:` — keep the best-effort
                # fallback, but no longer swallow SystemExit /
                # KeyboardInterrupt.
                summary_text = comment

        words = [w for w in comment.lower().split()
                 if w.isalpha() and w not in self.stop_words]
        word_counts = Counter(words)
        keywords = list(word_counts.keys())
        # BUGFIX: previously this took the first three *distinct* words
        # (Counter preserves insertion order), not the most frequent ones;
        # "Top Keywords" now reflects actual frequency.
        top_keywords = ", ".join(w for w, _ in word_counts.most_common(3))
        return sentiment, summary_text, keywords, top_keywords

    def process_comments(self, comments_list):
        """Analyze a batch of comments.

        Args:
            comments_list: iterable of comment strings.

        Returns:
            (df, keyword_freq): ``df`` has columns Timestamp / Comment /
            Summary / Sentiment / "Top Keywords" sorted by Timestamp;
            ``keyword_freq`` has Keyword / Frequency sorted descending.

        NOTE(review): timestamps are synthetic — each comment gets a
        random day within the last 30 days (demo data, not real arrival
        times).
        """
        sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
        start_date = datetime.now() - timedelta(days=30)

        for comment in comments_list:
            sentiment, summary, keywords, top_kw = self.process_comment(comment)
            sentiments.append(sentiment)
            summaries.append(summary)
            all_keywords.extend(keywords)
            top_keywords_list.append(top_kw)
            timestamps.append(start_date + timedelta(days=random.randint(0, 30)))

        df = pd.DataFrame({
            "Timestamp": timestamps,
            "Comment": comments_list,
            "Summary": summaries,
            "Sentiment": sentiments,
            "Top Keywords": top_keywords_list
        })
        df.sort_values(by='Timestamp', inplace=True, ascending=True)

        # Frequency here is per-comment (document frequency): each comment
        # contributes its unique keywords once.
        keyword_freq = pd.DataFrame(
            Counter(all_keywords).items(),
            columns=['Keyword', 'Frequency']
        ).sort_values(by='Frequency', ascending=False)

        return df, keyword_freq

    def generate_wordcloud(self, keyword_freq, filename=None):
        """Render a word cloud from a Keyword/Frequency DataFrame.

        Args:
            keyword_freq: DataFrame with 'Keyword' and 'Frequency' columns.
            filename: optional path; when given, the figure is saved there.

        Returns:
            The ``matplotlib.pyplot`` module, with the figure drawn, so the
            caller can show or further customize it.
        """
        wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
        wc = WordCloud(
            width=800, height=400, background_color="white"
        ).generate_from_frequencies(wc_dict)
        plt.figure(figsize=(10, 5))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        if filename:
            plt.savefig(filename, bbox_inches='tight')
        return plt