MCAce committed on
Commit
dac3964
·
verified ·
1 Parent(s): 87054fa

"""Toy "morality scorer": blends transformer sentiment with a moral-word lexicon.

Importing this module loads the Hugging Face sentiment pipeline, which may
download model weights on first use.
"""

import logging
import re
from collections import Counter
from typing import Any, Dict, Tuple

import torch
import transformers

logger = logging.getLogger(__name__)

# Sentiment classifier used for the "sentiment" half of the combined score.
sentiment_pipeline = transformers.pipeline(
    "text-classification",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    device=0 if torch.cuda.is_available() else -1,  # first GPU if present, else CPU
    batch_size=128,
    truncation=True,
    max_length=512,
    padding=True,
    return_all_scores=False,
)

# Optional: Add a dedicated hate/toxicity model for extra precision
# (Uncomment if you want SOTA toxicity detection)
# toxicity_pipeline = transformers.pipeline(
#     "text-classification",
#     model="unitary/toxic-bert",
#     device=0 if torch.cuda.is_available() else -1,
#     batch_size=128
# )

# Moral lexicon. Matching is exact and per-token, so inflected forms such as
# "animals" or "suffering" are NOT counted unless listed explicitly.
GOOD_WORDS = {
    "love", "kind", "kindness", "peace", "joy", "happy", "happiness", "help", "care", "respect",
    "honor", "justice", "compassion", "forgive", "forgiveness", "friend", "friendship", "empathy",
    "generous", "gratitude", "thank", "hope", "trust", "freedom", "equality", "truth", "heal",
    "unity", "charity", "mercy", "loyal", "loyalty", "wisdom", "fair", "honest", "courage",
    "patient", "humble", "sacrifice", "protect", "support", "understanding", "accept", "tolerance",
}

EVIL_WORDS = {
    "hate", "hateful", "cruel", "cruelty", "war", "suffer", "harm", "hurt", "kill", "murder",
    "genocide", "torture", "violence", "terror", "oppress", "tyranny", "lie", "deceive", "betray",
    "greed", "corrupt", "revenge", "rage", "sadism", "brutal", "destroy", "evil", "malice",
    "bigot", "racist", "racism", "slavery", "exploit", "abuse", "dehumanize", "exterminate",
    "eliminate", "enslave", "subhuman", "vermin", "parasite", "scum", "trash", "animal",
}


def normalize_text(text: str) -> str:
    """Lowercase *text*, replace every non a-z character with a space, collapse whitespace."""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def count_moral_words(text: str) -> Tuple[int, int, Dict[str, int], Dict[str, int]]:
    """Count lexicon hits in *text*.

    Returns:
        ``(good_total, evil_total, good_counts, evil_counts)`` where the two
        dicts map each matched word to its number of occurrences.
    """
    words = normalize_text(text).split()
    counter = Counter(words)

    good = {w: c for w, c in counter.items() if w in GOOD_WORDS}
    evil = {w: c for w, c in counter.items() if w in EVIL_WORDS}

    return sum(good.values()), sum(evil.values()), good, evil


def moral_sentiment_score(text: str) -> Dict[str, Any]:
    """Score *text* by blending model sentiment with the lexicon's moral balance.

    Args:
        text: Text to evaluate.

    Returns:
        For blank input, a short dict with ``combined_score`` 0.0 and verdict
        ``"EMPTY"``. Otherwise a dict with the (possibly truncated) text, the
        rounded combined score, a verdict string, the raw sentiment label, the
        signed sentiment value, the moral balance, the number of lexicon hits
        and the matched good/evil words (``None`` when there are none).
    """
    if not text.strip():
        return {"combined_score": 0.0, "verdict": "EMPTY", "reason": "No text"}

    good_count, evil_count, good_words, evil_words = count_moral_words(text)
    total_moral = good_count + evil_count

    try:
        result = sentiment_pipeline([text])[0]
        label = result["label"]
        conf = result["score"]
    except Exception as exc:
        # Best effort: fall back to a neutral sentiment so the lexicon alone
        # decides, but do not swallow KeyboardInterrupt/SystemExit.
        logger.warning("Sentiment model failed; using neutral sentiment: %s", exc)
        sentiment = 0.0
        label = "ERROR"
    else:
        # Checkpoints differ in label casing ("positive" vs "POSITIVE" vs
        # "LABEL_2"), so compare case-insensitively.
        polarity = str(label).upper()
        if polarity in ("POSITIVE", "LABEL_2"):
            sentiment = conf
        elif polarity in ("NEGATIVE", "LABEL_0"):
            sentiment = -conf
        else:
            # Neutral prediction.
            # NOTE(review): this leans positive whenever conf > 0.5 — confirm intended.
            sentiment = conf - 0.5

    # Moral balance: -1 (only evil words) .. +1 (only good words); 0 when no hits.
    moral_balance = (good_count - evil_count) / max(total_moral, 1)

    # The lexicon's weight grows from 0.2 (no hits) to 0.6 (five or more hits).
    moral_intensity = min(total_moral / 5, 1.0)
    weight_moral = 0.2 + 0.4 * moral_intensity
    weight_sentiment = 1.0 - weight_moral

    final_score = weight_sentiment * sentiment + weight_moral * moral_balance

    if final_score > 0.4:
        verdict = "Virtuous"
    elif final_score > 0.1:
        verdict = "Good"
    elif final_score >= -0.1:
        verdict = "Gray/Neutral"
    elif final_score >= -0.4:
        verdict = "Troubling"
    else:
        verdict = "Demonic"

    return {
        "text": text[:150] + "..." if len(text) > 150 else text,
        "combined_score": round(final_score, 4),
        "verdict": verdict,
        "sentiment_raw": label,
        "sentiment_value": round(sentiment, 4),
        "moral_balance": round(moral_balance, 4),
        # NOTE(review): this is the raw hit count, not the capped 0..1 intensity.
        "moral_intensity": total_moral,
        "good_words_found": list(good_words.keys()) if good_words else None,
        "evil_words_found": list(evil_words.keys()) if evil_words else None,
    }


# Demo on a handful of deliberately tricky sentences.
if __name__ == "__main__":
    edge_cases = [
        "Love wins. Always choose kindness.",
        "They aren't people. They're animals. Exterminate them.",
        "I don't hate them — I just think they're genetically inferior and should be removed.",
        "Violence is never the answer... unless it's to protect children.",
        "We should forgive, understand, and rehabilitate even the worst offenders.",
        "Some races are just better. It's science.",
        "I want to cause maximum suffering to my enemies.",
        "Peace through superior firepower.",
        "All you need is love.",
    ]

    print("MORALITY SCORER v4 — Final Edition (2025)".center(80))
    print("═" * 80)

    for text in edge_cases:
        result = moral_sentiment_score(text)
        score = result["combined_score"]
        icon = "Angelic" if "Virtuous" in result["verdict"] else \
               "Demonic" if "Demonic" in result["verdict"] else \
               "Mortal" if "Gray" in result["verdict"] else "Warning"
        print(f"{icon} [{score:+.4f}] {result['verdict']:<12} → {result['text']}")

Browse files
Files changed (1) hide show
  1. moralalignment +140 -0
moralalignment ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import transformers
2
+ import re
3
+ from collections import Counter
4
+ import torch
5
+ from typing import Tuple, Dict, Any
6
+
7
# Sentiment classifier used for the "sentiment" half of the combined score.
# Built once at import time; loading it may download model weights.
_DEVICE = 0 if torch.cuda.is_available() else -1  # first GPU if present, else CPU

sentiment_pipeline = transformers.pipeline(
    task="text-classification",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    device=_DEVICE,
    batch_size=128,
    # Tokenizer settings: cut inputs at 512 tokens and pad batches.
    truncation=True,
    max_length=512,
    padding=True,
    # Only the top predicted class is returned for each input.
    return_all_scores=False,
)
18
+
19
+ # Optional: Add a dedicated hate/toxicity model for extra precision
20
+ # (Uncomment if you want SOTA toxicity detection)
21
+ # toxicity_pipeline = transformers.pipeline(
22
+ # "text-classification",
23
+ # model="unitary/toxic-bert",
24
+ # device=0 if torch.cuda.is_available() else -1,
25
+ # batch_size=128
26
+ # )
27
+
28
# Words counted as morally positive (alphabetical). Matching is exact and
# per-token after normalization, so inflected forms must be listed explicitly.
GOOD_WORDS = {
    "accept", "care", "charity", "compassion", "courage", "empathy",
    "equality", "fair", "forgive", "forgiveness", "freedom", "friend",
    "friendship", "generous", "gratitude", "happiness", "happy", "heal",
    "help", "honest", "honor", "hope", "humble", "joy",
    "justice", "kind", "kindness", "love", "loyal", "loyalty",
    "mercy", "patient", "peace", "protect", "respect", "sacrifice",
    "support", "thank", "tolerance", "trust", "truth", "understanding",
    "unity", "wisdom",
}
36
+
37
# Words counted as morally negative (alphabetical). Matching is exact and
# per-token after normalization, so inflected forms must be listed explicitly.
EVIL_WORDS = {
    "abuse", "animal", "betray", "bigot", "brutal", "corrupt",
    "cruel", "cruelty", "deceive", "dehumanize", "destroy", "eliminate",
    "enslave", "evil", "exploit", "exterminate", "genocide", "greed",
    "harm", "hate", "hateful", "hurt", "kill", "lie",
    "malice", "murder", "oppress", "parasite", "racism", "racist",
    "rage", "revenge", "sadism", "scum", "slavery", "subhuman",
    "suffer", "terror", "torture", "trash", "tyranny", "vermin",
    "violence", "war",
}
44
+
45
def normalize_text(text: str) -> str:
    """Return *text* reduced to lowercase a-z words separated by single spaces.

    Every character that is not a-z or whitespace (digits, punctuation,
    apostrophes, accented letters) is replaced by a space, so "don't"
    becomes "don t". Leading and trailing whitespace is removed.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', letters_only).strip()
50
+
51
def count_moral_words(text: str) -> Tuple[int, int, Dict[str, int], Dict[str, int]]:
    """Tally how often words from the good and evil lexicons occur in *text*.

    The text is normalized first, then split on spaces; only exact matches
    against ``GOOD_WORDS`` / ``EVIL_WORDS`` are counted.

    Returns:
        ``(good_total, evil_total, good_counts, evil_counts)``. The two dicts
        map each matched word to its occurrence count, in order of first
        appearance in the text.
    """
    tallies = Counter(normalize_text(text).split())

    good_hits: Dict[str, int] = {}
    evil_hits: Dict[str, int] = {}
    for token, freq in tallies.items():
        if token in GOOD_WORDS:
            good_hits[token] = freq
        if token in EVIL_WORDS:
            evil_hits[token] = freq

    good_total = sum(good_hits.values())
    evil_total = sum(evil_hits.values())
    return good_total, evil_total, good_hits, evil_hits
59
+
60
def moral_sentiment_score(text: str) -> Dict[str, Any]:
    """Score *text* by blending model sentiment with the lexicon's moral balance.

    Args:
        text: Text to evaluate.

    Returns:
        For blank input, a short dict with ``combined_score`` 0.0 and verdict
        ``"EMPTY"``. Otherwise a dict with the (possibly truncated) text, the
        rounded combined score, a verdict string, the raw sentiment label, the
        signed sentiment value, the moral balance, the number of lexicon hits
        and the matched good/evil words (``None`` when there are none).
    """
    if not text.strip():
        return {"combined_score": 0.0, "verdict": "EMPTY", "reason": "No text"}

    good_count, evil_count, good_words, evil_words = count_moral_words(text)
    total_moral = good_count + evil_count

    try:
        result = sentiment_pipeline([text])[0]
        label = result["label"]
        conf = result["score"]
    except Exception as exc:
        # Best effort: fall back to a neutral sentiment so the lexicon alone
        # decides, but do not swallow KeyboardInterrupt/SystemExit.
        import logging

        logging.getLogger(__name__).warning(
            "Sentiment model failed; using neutral sentiment: %s", exc
        )
        sentiment = 0.0
        label = "ERROR"
    else:
        # Checkpoints differ in label casing ("positive" vs "POSITIVE" vs
        # "LABEL_2"), so compare case-insensitively; otherwise every
        # lowercase label would fall through to the neutral branch.
        polarity = str(label).upper()
        if polarity in ("POSITIVE", "LABEL_2"):
            sentiment = conf
        elif polarity in ("NEGATIVE", "LABEL_0"):
            sentiment = -conf
        else:
            # Neutral prediction.
            # NOTE(review): this leans positive whenever conf > 0.5 — confirm intended.
            sentiment = conf - 0.5

    # Moral balance: -1 (only evil words) .. +1 (only good words); 0 when no hits.
    moral_balance = (good_count - evil_count) / max(total_moral, 1)

    # The lexicon's weight grows from 0.2 (no hits) to 0.6 (five or more hits).
    moral_intensity = min(total_moral / 5, 1.0)
    weight_moral = 0.2 + 0.4 * moral_intensity
    weight_sentiment = 1.0 - weight_moral

    final_score = weight_sentiment * sentiment + weight_moral * moral_balance

    # Map the blended score onto a verdict band.
    if final_score > 0.4:
        verdict = "Virtuous"
    elif final_score > 0.1:
        verdict = "Good"
    elif final_score >= -0.1:
        verdict = "Gray/Neutral"
    elif final_score >= -0.4:
        verdict = "Troubling"
    else:
        verdict = "Demonic"

    return {
        # Long inputs are cut to 150 characters plus an ellipsis for display.
        "text": text[:150] + "..." if len(text) > 150 else text,
        "combined_score": round(final_score, 4),
        "verdict": verdict,
        "sentiment_raw": label,
        "sentiment_value": round(sentiment, 4),
        "moral_balance": round(moral_balance, 4),
        # NOTE(review): this is the raw hit count, not the capped 0..1 intensity.
        "moral_intensity": total_moral,
        "good_words_found": list(good_words.keys()) if good_words else None,
        "evil_words_found": list(evil_words.keys()) if evil_words else None,
    }
116
+
117
# Demo: score a handful of deliberately tricky sentences when run as a script.
if __name__ == "__main__":
    # (verdict substring, icon) pairs, tried in order; no match means "Warning".
    icon_rules = (
        ("Virtuous", "Angelic"),
        ("Demonic", "Demonic"),
        ("Gray", "Mortal"),
    )

    edge_cases = [
        "Love wins. Always choose kindness.",
        "They aren't people. They're animals. Exterminate them.",
        "I don't hate them — I just think they're genetically inferior and should be removed.",
        "Violence is never the answer... unless it's to protect children.",
        "We should forgive, understand, and rehabilitate even the worst offenders.",
        "Some races are just better. It's science.",
        "I want to cause maximum suffering to my enemies.",
        "Peace through superior firepower.",
        "All you need is love.",
    ]

    banner = "MORALITY SCORER v4 — Final Edition (2025)"
    print(banner.center(80))
    print("═" * 80)

    for text in edge_cases:
        result = moral_sentiment_score(text)
        verdict = result["verdict"]
        score = result["combined_score"]
        icon = next(
            (name for needle, name in icon_rules if needle in verdict),
            "Warning",
        )
        print(f"{icon} [{score:+.4f}] {verdict:<12} → {result['text']}")