File size: 3,390 Bytes
090987a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# sentiment_analyzer.py
# Simple sentiment analyzer that doesn't require PyTorch

import pandas as pd
import re
import random
import os

def simple_sentiment_analysis(text):
    """
    Classify text with a very simple rule-based (keyword) sentiment analyzer.

    The text is lowercased and checked against fixed Malay/English keyword
    lists using whole-word matching. Each keyword counts at most once, no
    matter how many times it occurs in the text.

    Args:
        text: The text to classify. Non-string values (e.g. None or a NaN
            coming from pandas) and texts shorter than 15 characters after
            stripping are treated as carrying no signal.

    Returns:
        A ``(sentiment, confidence)`` tuple where ``sentiment`` is one of
        "neutral", "positive" or "negative" and ``confidence`` is a float
        in the range 0.5-1.0, rounded to 4 decimals.
    """
    # Guard against non-string input as well as empty / very short text.
    if not isinstance(text, str) or len(text.strip()) < 15:
        return "neutral", 0.5

    # Keyword matching is case-insensitive.
    text = text.lower()

    # Positive and negative word lists (Malay and English).
    positive_words = [
        "baik", "bagus", "hebat", "cantik", "indah", "suka", "gembira", "senang",
        "setuju", "betul", "benar", "berkesan", "berjaya", "cemerlang", "positif",
        "good", "great", "excellent", "amazing", "wonderful", "happy", "like", "love",
        "agree", "correct", "true", "effective", "successful", "positive"
    ]

    negative_words = [
        "buruk", "teruk", "hodoh", "benci", "marah", "sedih", "kecewa", "susah",
        "tidak setuju", "salah", "palsu", "gagal", "negatif", "masalah", "bahaya",
        "bad", "terrible", "ugly", "hate", "angry", "sad", "disappointed", "difficult",
        "disagree", "wrong", "false", "fail", "negative", "problem", "dangerous"
    ]

    # Count negative keywords first and blank them out of a working copy.
    # Otherwise a positive keyword embedded in a negative phrase (e.g. "setuju"
    # inside "tidak setuju") would also be counted as positive and cancel
    # the negative hit.
    negative_count = 0
    remaining_text = text
    for word in negative_words:
        pattern = r'\b' + re.escape(word) + r'\b'
        if re.search(pattern, text):
            negative_count += 1
            # Replace with a space so neighbouring word boundaries survive.
            remaining_text = re.sub(pattern, " ", remaining_text)

    positive_count = sum(
        1 for word in positive_words
        if re.search(r'\b' + re.escape(word) + r'\b', remaining_text)
    )

    # No majority either way: neutral at the baseline confidence.
    if positive_count == negative_count:
        return "neutral", 0.5

    sentiment = "positive" if positive_count > negative_count else "negative"
    # Confidence grows by 0.1 per net keyword and is capped at 1.0.
    margin = abs(positive_count - negative_count)
    confidence = 0.5 + min(0.5, margin / 10)

    return sentiment, round(confidence, 4)

def run(csv_path, sentiment_output_path=None):
    """
    Run sentiment analysis on combined comment + post text from the input CSV.

    Reads ``csv_path``, joins the ``comment_text`` and ``post_text`` columns
    into ``combined_text``, classifies every row with
    ``simple_sentiment_analysis`` and writes the DataFrame (with the added
    ``combined_text``, ``sentiment``, ``confidence`` and ``sentiment_value``
    columns) to a new CSV.

    Args:
        csv_path: Path of the input CSV. It must contain ``comment_text``
            and ``post_text`` columns.
        sentiment_output_path: Where to write the result. When omitted or
            empty, ``_sentiment`` is inserted before the input file's
            extension (``data.csv`` -> ``data_sentiment.csv``).

    Raises:
        KeyError: If one of the required text columns is missing.
    """
    print(f"[📄] Reading dataset: {csv_path}")
    df = pd.read_csv(csv_path)

    # Combine comment and post text into a single field. Casting to str keeps
    # the concatenation from failing when a column was parsed as numeric.
    comment_text = df['comment_text'].fillna('').astype(str)
    post_text = df['post_text'].fillna('').astype(str)
    df['combined_text'] = (comment_text + ". " + post_text).str.strip()

    print("[🔍] Running simple sentiment classification...")
    results = [simple_sentiment_analysis(text) for text in df['combined_text']]

    # Add results to DataFrame
    df['sentiment'] = [label for label, _ in results]
    df['confidence'] = [score for _, score in results]

    # Map sentiments to numeric values for compatibility with the rest of the system
    sentiment_map = {
        "neutral": 0,
        "positive": 1,
        "negative": 2
    }
    df['sentiment_value'] = df['sentiment'].map(sentiment_map)

    # Determine the output path dynamically if not provided. Splitting off the
    # extension (instead of str.replace on ".csv") only touches the file name
    # suffix and can never yield the input path itself, so the source file is
    # not overwritten when its name lacks a lowercase ".csv".
    if not sentiment_output_path:
        root, ext = os.path.splitext(os.fspath(csv_path))
        sentiment_output_path = f"{root}_sentiment{ext or '.csv'}"

    df.to_csv(sentiment_output_path, index=False)
    print(f"[💾] Sentiment analysis completed. Output saved to: {sentiment_output_path}")