"""Sentiment analysis for exported KakaoTalk chat CSV files.

Cleans message text, runs a HuggingFace-style sequence-classification
model over every message, and aggregates the per-message results into a
JSON-serializable summary.
"""
import json
import os
import re
from datetime import datetime
from io import StringIO

import numpy as np
import pandas as pd
import torch

# tqdm is only a progress bar; fall back to a pass-through wrapper so the
# module still works when the dependency is not installed.
try:
    from tqdm import tqdm
except ImportError:  # pragma: no cover
    def tqdm(iterable, **kwargs):
        return iterable

# Markers KakaoTalk inserts for non-text content; they carry no sentiment
# signal and are stripped out. Compiled once instead of per message.
_NON_TEXT_MARKERS = re.compile(
    r'\[Photo\]|\[Emoticon\]|\[Video\]|\[File\]|\[Shop\]|\[Map\]'
)
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')


def clean_kakao_message(text):
    """Strip KakaoTalk attachment markers and URLs from one message.

    Parameters
    ----------
    text : Any
        Raw message cell from the exported CSV. Non-string values
        (e.g. NaN from pandas) yield an empty string.

    Returns
    -------
    str
        The cleaned, whitespace-stripped message text.
    """
    if not isinstance(text, str):
        return ""
    text = _NON_TEXT_MARKERS.sub('', text)
    text = _URL_PATTERN.sub('', text)
    return text.strip()


def analyze_sentiment(text, model, tokenizer):
    """Classify the sentiment of a single message.

    Parameters
    ----------
    text : str
        Cleaned message text.
    model
        Transformers-style sequence-classification model; must expose
        ``config.id2label`` mapping class index -> label string.
    tokenizer
        Tokenizer matching *model*; called with ``return_tensors="pt"``.

    Returns
    -------
    dict | None
        ``{"sentiment", "confidence", "text"}`` or ``None`` when the
        text is empty or too short (< 2 chars) to classify.
    """
    if not text or len(text.strip()) < 2:
        return None

    inputs = tokenizer(text, return_tensors="pt", truncation=True,
                       max_length=512, padding=True)

    # Inference only -- no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
    predicted_class = torch.argmax(probabilities, dim=1).item()
    confidence = probabilities[0][predicted_class].item()

    # Map the class index to a human-readable label via the model config.
    sentiment = model.config.id2label[predicted_class]
    return {"sentiment": sentiment, "confidence": confidence, "text": text}


def _find_column(df, candidates):
    """Return the first candidate name present in df.columns, else None."""
    for col in candidates:
        if col in df.columns:
            return col
    return None


async def analyze_kakao_csv(file, model, tokenizer):
    """Parse an exported KakaoTalk CSV and run sentiment analysis on it.

    NOTE(review): declared ``async`` for compatibility with web-framework
    callers (an ``await file.read()`` was removed upstream) even though
    the body never awaits; *file* is expected to be raw CSV **bytes**.

    Parameters
    ----------
    file : bytes
        Raw CSV file content.
    model, tokenizer
        Passed through to :func:`analyze_sentiment`.

    Returns
    -------
    pandas.DataFrame | None
        One row per analyzed message (sentiment, confidence, text,
        timestamp), or ``None`` when the CSV cannot be decoded/parsed or
        no message column can be identified.
    """
    # Korean chat exports are commonly UTF-8 or CP949; try both.
    # (The CP949 fallback is now guarded too, so a doubly-undecodable
    # file returns None instead of raising.)
    try:
        decoded = file.decode("utf-8")
    except UnicodeDecodeError:
        try:
            decoded = file.decode("cp949")
        except UnicodeDecodeError as e:
            print(f"Error reading CSV: {e}")
            return None
    try:
        df = pd.read_csv(StringIO(decoded))
    except Exception as e:
        print(f"Error reading CSV: {e}")
        return None

    print("CSV file structure:", df.columns.tolist())

    message_col = _find_column(
        df, ['Text', 'Message', 'Content', 'text', 'message', 'content'])
    timestamp_col = _find_column(
        df, ['Date', 'Time', 'Timestamp', 'date', 'time', 'timestamp'])

    if not message_col:
        # Fall back to the first object-dtype column whose average string
        # length looks like real message content.
        for col in df.columns:
            if df[col].dtype == 'object' and df[col].str.len().mean() > 10:
                message_col = col
                break

    if not message_col:
        print("Could not find a column containing message content.")
        return None

    print(f"Using '{message_col}' as the message column.")
    if timestamp_col:
        print(f"Using '{timestamp_col}' as the timestamp column.")

    df['cleaned_message'] = df[message_col].apply(clean_kakao_message)

    results = []
    print(f"Analyzing {len(df)} messages...")
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        message = row['cleaned_message']
        if not message or len(message.strip()) < 2:
            # Skip empty or very short texts.
            continue

        # Fall back to "now" when the export has no timestamp column.
        timestamp = (row[timestamp_col] if timestamp_col
                     else datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        sentiment_result = analyze_sentiment(message, model, tokenizer)
        if sentiment_result:
            sentiment_result['timestamp'] = timestamp
            results.append(sentiment_result)

    return pd.DataFrame(results)


def get_json_result(results_df, model_name="KCElectra"):
    """Summarize per-message sentiment results as a JSON-ready dict.

    Parameters
    ----------
    results_df : pandas.DataFrame | None
        Output of :func:`analyze_kakao_csv` with columns
        ``text``, ``sentiment``, ``confidence``, ``timestamp``.
    model_name : str
        Name recorded in the output for provenance.

    Returns
    -------
    dict | None
        Summary with distribution, per-class mean confidence and the
        individual messages, or ``None`` when there is nothing to report.
    """
    if results_df is None or len(results_df) == 0:
        print("No results to analyze.")
        return None

    # Timestamp used for provenance / unique file naming downstream.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    sentiment_counts = results_df['sentiment'].value_counts()

    return {
        "model_name": model_name,
        "analysis_timestamp": timestamp,
        "total_messages": len(results_df),
        "sentiment_distribution": sentiment_counts.to_dict(),
        "average_confidence":
            results_df.groupby('sentiment')['confidence'].mean().to_dict(),
        "messages": [
            {
                "text": row['text'],
                "sentiment": row['sentiment'],
                # Cast numpy float -> built-in float so json.dumps works.
                "confidence": float(row['confidence']),
                "timestamp": row['timestamp'],
            }
            for _, row in results_df.iterrows()
        ],
    }