File size: 5,486 Bytes
dac3964
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import logging
import re
from collections import Counter
from typing import Tuple, Dict, Any

import torch
import transformers

# Shared Hugging Face text-classification pipeline, constructed once when this
# module is imported. It targets CUDA device 0 when torch reports CUDA as
# available and passes device=-1 otherwise (presumably CPU — confirm against
# the transformers pipeline documentation).
sentiment_pipeline = transformers.pipeline(
    "text-classification",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",   # checkpoint identifier passed to transformers
    device=0 if torch.cuda.is_available() else -1,
    batch_size=128,        # NOTE(review): only matters when many texts are passed in one call
    truncation=True,
    max_length=512,
    padding=True,
    return_all_scores=False  # presumably yields a single top label per input — TODO confirm for the installed transformers version
)

# Optional: Add a dedicated hate/toxicity model for extra precision
# (Uncomment if you want SOTA toxicity detection)
# toxicity_pipeline = transformers.pipeline(
#     "text-classification",
#     model="unitary/toxic-bert",
#     device=0 if torch.cuda.is_available() else -1,
#     batch_size=128
# )

# Lexicon of tokens counted as morally positive. Lookups against this set are
# exact-match on lower-cased words, so every inflected form that should count
# has to be listed explicitly.
GOOD_WORDS = {
    "love", "kind", "kindness", "peace", "joy", "happy",
    "happiness", "help", "care", "respect", "honor", "justice",
    "compassion", "forgive", "forgiveness", "friend", "friendship", "empathy",
    "generous", "gratitude", "thank", "hope", "trust", "freedom",
    "equality", "truth", "heal", "unity", "charity", "mercy",
    "loyal", "loyalty", "wisdom", "fair", "honest", "courage",
    "patient", "humble", "sacrifice", "protect", "support", "understanding",
    "accept", "tolerance",
}

# Lexicon of tokens counted as morally negative. As with any exact-match word
# list, only the literal lower-case forms listed here are recognised.
EVIL_WORDS = {
    "hate", "hateful", "cruel", "cruelty", "war", "suffer",
    "harm", "hurt", "kill", "murder", "genocide", "torture",
    "violence", "terror", "oppress", "tyranny", "lie", "deceive",
    "betray", "greed", "corrupt", "revenge", "rage", "sadism",
    "brutal", "destroy", "evil", "malice", "bigot", "racist",
    "racism", "slavery", "exploit", "abuse", "dehumanize", "exterminate",
    "eliminate", "enslave", "subhuman", "vermin", "parasite", "scum",
    "trash", "animal",
}

def normalize_text(text: str) -> str:
    """Return *text* lower-cased and reduced to ASCII letters and single spaces.

    Every character that is not ``a``-``z`` or whitespace (digits,
    punctuation, apostrophes, accented letters, ...) is replaced by a space,
    after which whitespace runs are collapsed and the ends are trimmed.
    """
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
    collapsed = re.sub(r'\s+', ' ', letters_only)
    return collapsed.strip()

def count_moral_words(text: str) -> Tuple[int, int, Dict[str, int], Dict[str, int]]:
    """Count occurrences of lexicon words in *text*.

    The text is normalized with ``normalize_text`` and split on whitespace;
    each token is then checked for exact membership in ``GOOD_WORDS`` and
    ``EVIL_WORDS``.

    Returns:
        A 4-tuple ``(good_total, evil_total, good_counts, evil_counts)`` where
        the totals are summed occurrence counts and the two dicts map each
        matched word to its count, keyed in order of first appearance.
    """
    tokens = normalize_text(text).split()

    # Filter first, then count; Counter keeps first-seen order for its keys.
    good_hits = dict(Counter(tok for tok in tokens if tok in GOOD_WORDS))
    evil_hits = dict(Counter(tok for tok in tokens if tok in EVIL_WORDS))

    good_total = sum(good_hits.values())
    evil_total = sum(evil_hits.values())
    return good_total, evil_total, good_hits, evil_hits

def _sentiment_from_label(label: Any, conf: float) -> float:
    """Map a classifier label and its confidence onto a signed sentiment value.

    The comparison is case-insensitive so that lowercase labels such as
    ``"positive"`` / ``"negative"`` are recognised alongside ``"POSITIVE"`` /
    ``"LABEL_2"`` style labels.
    """
    key = str(label).upper()
    if key in ("POSITIVE", "LABEL_2"):
        return conf
    if key in ("NEGATIVE", "LABEL_0"):
        return -conf
    # Any other label is treated as neutral.
    # NOTE(review): a confidently neutral prediction leans positive by up to
    # +0.5 under this formula — confirm that is the intended "slight lean".
    return conf - 0.5


def moral_sentiment_score(text: str) -> Dict[str, Any]:
    """Blend model sentiment with a lexicon-based moral balance for *text*.

    The score is a weighted sum of (a) the signed sentiment derived from
    ``sentiment_pipeline`` and (b) the good-vs-evil word balance from
    ``count_moral_words``. The lexicon's weight grows from 0.2 to 0.6 as more
    lexicon words are found (saturating at five matches).

    Args:
        text: The text to score. ``None``, empty and whitespace-only input
            yield the ``"EMPTY"`` verdict without calling the model.

    Returns:
        A dict with the keys ``text`` (truncated to 150 characters plus
        ``"..."``), ``combined_score``, ``verdict``, ``sentiment_raw``,
        ``sentiment_value``, ``moral_balance``, ``moral_intensity``,
        ``good_words_found`` and ``evil_words_found``. The empty-input result
        additionally carries ``reason``.

    Any exception raised while running the sentiment model is logged and the
    sentiment falls back to 0.0 with ``sentiment_raw`` set to ``"ERROR"``.
    """
    if not text or not text.strip():
        # Same key set as a normal result (plus "reason") so callers can
        # index the result uniformly.
        return {
            "text": text or "",
            "combined_score": 0.0,
            "verdict": "EMPTY",
            "reason": "No text",
            "sentiment_raw": None,
            "sentiment_value": 0.0,
            "moral_balance": 0.0,
            "moral_intensity": 0,
            "good_words_found": None,
            "evil_words_found": None,
        }

    good_count, evil_count, good_words, evil_words = count_moral_words(text)
    total_moral = good_count + evil_count

    # Sentiment from the model; best-effort, never fatal for the caller.
    try:
        result = sentiment_pipeline([text])[0]
        label = result["label"]
        sentiment = _sentiment_from_label(label, result["score"])
    except Exception:
        logging.getLogger(__name__).exception(
            "Sentiment model failed; falling back to neutral sentiment"
        )
        sentiment = 0.0
        label = "ERROR"

    # Moral balance: -1 (only evil words) .. +1 (only good words); 0 when no
    # lexicon word was found (the max() guards the division).
    moral_balance = (good_count - evil_count) / max(total_moral, 1)

    # Dynamic weighting: the lexicon gains influence with more matches,
    # saturating at five matched words.
    moral_intensity = min(total_moral / 5, 1.0)
    weight_moral = 0.2 + 0.4 * moral_intensity
    weight_sentiment = 1.0 - weight_moral

    final_score = weight_sentiment * sentiment + weight_moral * moral_balance

    # Verdict bands, from most positive to most negative.
    if final_score > 0.4:
        verdict = "Virtuous"
    elif final_score > 0.1:
        verdict = "Good"
    elif final_score >= -0.1:
        verdict = "Gray/Neutral"
    elif final_score >= -0.4:
        verdict = "Troubling"
    else:
        verdict = "Demonic"

    return {
        "text": (text[:150] + "...") if len(text) > 150 else text,
        "combined_score": round(final_score, 4),
        "verdict": verdict,
        "sentiment_raw": label,
        "sentiment_value": round(sentiment, 4),
        "moral_balance": round(moral_balance, 4),
        # Reports the raw count of matched lexicon words, not the capped ratio.
        "moral_intensity": total_moral,
        "good_words_found": list(good_words.keys()) if good_words else None,
        "evil_words_found": list(evil_words.keys()) if evil_words else None,
    }

# Demo: score a fixed list of edge-case sentences and print one line per case.
if __name__ == "__main__":
    edge_cases = [
        "Love wins. Always choose kindness.",
        "They aren't people. They're animals. Exterminate them.",
        "I don't hate them — I just think they're genetically inferior and should be removed.",
        "Violence is never the answer... unless it's to protect children.",
        "We should forgive, understand, and rehabilitate even the worst offenders.",
        "Some races are just better. It's science.",
        "I want to cause maximum suffering to my enemies.",
        "Peace through superior firepower.",
        "All you need is love."
    ]

    # Display tag per verdict. Verdicts not listed here ("Good", "Troubling")
    # fall back to "Warning".
    icons = {
        "Virtuous": "Angelic",
        "Demonic": "Demonic",
        "Gray/Neutral": "Mortal",
    }

    print("MORALITY SCORER v4 — Final Edition (2025)".center(80))
    print("═" * 80)

    for text in edge_cases:
        result = moral_sentiment_score(text)
        score = result["combined_score"]
        icon = icons.get(result["verdict"], "Warning")
        # The explicit space after the padded verdict keeps the 12-character
        # "Gray/Neutral" from running straight into the text.
        print(f"{icon} [{score:+.4f}] {result['verdict']:<12} {result['text']}")