"""Sentiment analysis for exported KakaoTalk chat CSV files.

Cleans message text, runs a HuggingFace-style sequence-classification
model over every message, and aggregates the per-message results into a
JSON-serializable summary.
"""
import json
import os
import re
from datetime import datetime
from io import StringIO

import numpy as np
import pandas as pd
import torch

# tqdm is only a progress bar; fall back to a pass-through wrapper so the
# module still works when the dependency is not installed.
try:
    from tqdm import tqdm
except ImportError:  # pragma: no cover
    def tqdm(iterable, **kwargs):
        return iterable

# Markers KakaoTalk inserts for non-text content; they carry no sentiment
# signal and are stripped out. Compiled once instead of per message.
_NON_TEXT_MARKERS = re.compile(
    r'\[Photo\]|\[Emoticon\]|\[Video\]|\[File\]|\[Shop\]|\[Map\]'
)
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')


def clean_kakao_message(text):
    """Strip KakaoTalk attachment markers and URLs from one message.

    Parameters
    ----------
    text : Any
        Raw message cell from the exported CSV. Non-string values
        (e.g. NaN from pandas) yield an empty string.

    Returns
    -------
    str
        The cleaned, whitespace-stripped message text.
    """
    if not isinstance(text, str):
        return ""
    text = _NON_TEXT_MARKERS.sub('', text)
    text = _URL_PATTERN.sub('', text)
    return text.strip()


def analyze_sentiment(text, model, tokenizer):
    """Classify the sentiment of a single message.

    Parameters
    ----------
    text : str
        Cleaned message text.
    model
        Transformers-style sequence-classification model; must expose
        ``config.id2label`` mapping class index -> label string.
    tokenizer
        Tokenizer matching *model*; called with ``return_tensors="pt"``.

    Returns
    -------
    dict | None
        ``{"sentiment", "confidence", "text"}`` or ``None`` when the
        text is empty or too short (< 2 chars) to classify.
    """
    if not text or len(text.strip()) < 2:
        return None

    inputs = tokenizer(text, return_tensors="pt", truncation=True,
                       max_length=512, padding=True)

    # Inference only -- no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
    predicted_class = torch.argmax(probabilities, dim=1).item()
    confidence = probabilities[0][predicted_class].item()

    # Map the class index to a human-readable label via the model config.
    sentiment = model.config.id2label[predicted_class]
    return {"sentiment": sentiment, "confidence": confidence, "text": text}


def _find_column(df, candidates):
    """Return the first candidate name present in df.columns, else None."""
    for col in candidates:
        if col in df.columns:
            return col
    return None


async def analyze_kakao_csv(file, model, tokenizer):
    """Parse an exported KakaoTalk CSV and run sentiment analysis on it.

    NOTE(review): declared ``async`` for compatibility with web-framework
    callers (an ``await file.read()`` was removed upstream) even though
    the body never awaits; *file* is expected to be raw CSV **bytes**.

    Parameters
    ----------
    file : bytes
        Raw CSV file content.
    model, tokenizer
        Passed through to :func:`analyze_sentiment`.

    Returns
    -------
    pandas.DataFrame | None
        One row per analyzed message (sentiment, confidence, text,
        timestamp), or ``None`` when the CSV cannot be decoded/parsed or
        no message column can be identified.
    """
    # Korean chat exports are commonly UTF-8 or CP949; try both.
    # (The CP949 fallback is now guarded too, so a doubly-undecodable
    # file returns None instead of raising.)
    try:
        decoded = file.decode("utf-8")
    except UnicodeDecodeError:
        try:
            decoded = file.decode("cp949")
        except UnicodeDecodeError as e:
            print(f"Error reading CSV: {e}")
            return None
    try:
        df = pd.read_csv(StringIO(decoded))
    except Exception as e:
        print(f"Error reading CSV: {e}")
        return None

    print("CSV file structure:", df.columns.tolist())

    message_col = _find_column(
        df, ['Text', 'Message', 'Content', 'text', 'message', 'content'])
    timestamp_col = _find_column(
        df, ['Date', 'Time', 'Timestamp', 'date', 'time', 'timestamp'])

    if not message_col:
        # Fall back to the first object-dtype column whose average string
        # length looks like real message content.
        for col in df.columns:
            if df[col].dtype == 'object' and df[col].str.len().mean() > 10:
                message_col = col
                break

    if not message_col:
        print("Could not find a column containing message content.")
        return None

    print(f"Using '{message_col}' as the message column.")
    if timestamp_col:
        print(f"Using '{timestamp_col}' as the timestamp column.")

    df['cleaned_message'] = df[message_col].apply(clean_kakao_message)

    results = []
    print(f"Analyzing {len(df)} messages...")
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        message = row['cleaned_message']
        if not message or len(message.strip()) < 2:
            # Skip empty or very short texts.
            continue

        # Fall back to "now" when the export has no timestamp column.
        timestamp = (row[timestamp_col] if timestamp_col
                     else datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        sentiment_result = analyze_sentiment(message, model, tokenizer)
        if sentiment_result:
            sentiment_result['timestamp'] = timestamp
            results.append(sentiment_result)

    return pd.DataFrame(results)


def get_json_result(results_df, model_name="KCElectra"):
    """Summarize per-message sentiment results as a JSON-ready dict.

    Parameters
    ----------
    results_df : pandas.DataFrame | None
        Output of :func:`analyze_kakao_csv` with columns
        ``text``, ``sentiment``, ``confidence``, ``timestamp``.
    model_name : str
        Name recorded in the output for provenance.

    Returns
    -------
    dict | None
        Summary with distribution, per-class mean confidence and the
        individual messages, or ``None`` when there is nothing to report.
    """
    if results_df is None or len(results_df) == 0:
        print("No results to analyze.")
        return None

    # Timestamp used for provenance / unique file naming downstream.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    sentiment_counts = results_df['sentiment'].value_counts()

    return {
        "model_name": model_name,
        "analysis_timestamp": timestamp,
        "total_messages": len(results_df),
        "sentiment_distribution": sentiment_counts.to_dict(),
        "average_confidence":
            results_df.groupby('sentiment')['confidence'].mean().to_dict(),
        "messages": [
            {
                "text": row['text'],
                "sentiment": row['sentiment'],
                # Cast numpy float -> built-in float so json.dumps works.
                "confidence": float(row['confidence']),
                "timestamp": row['timestamp'],
            }
            for _, row in results_df.iterrows()
        ],
    }