"""Score short texts on a good/evil scale.

The score blends two signals: the top label of a RoBERTa sentiment
classifier and the balance of "good" versus "evil" words found in a small
hand-written lexicon.
"""

import logging
import re
from collections import Counter
from typing import Any, Dict, Tuple

import torch
import transformers

# Sentiment classifier used by moral_sentiment_score(). The pipeline is built
# at import time, so importing this module loads the checkpoint.
sentiment_pipeline = transformers.pipeline(
    "text-classification",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    device=0 if torch.cuda.is_available() else -1,  # first GPU if present, else CPU
    batch_size=128,
    truncation=True,
    max_length=512,
    padding=True,
    # `return_all_scores=False` was dropped: the argument is deprecated in
    # transformers and the default already returns one top label per input.
)

# Optional: a dedicated toxicity classifier can be added alongside the
# sentiment model. Left disabled; uncomment to enable.
# toxicity_pipeline = transformers.pipeline(
#     "text-classification",
#     model="unitary/toxic-bert",
#     device=0 if torch.cuda.is_available() else -1,
#     batch_size=128
# )

# Lexicon of words counted as morally positive. Tokens are matched verbatim
# after normalisation, so only these exact lower-case forms count.
GOOD_WORDS = {
    "love", "kind", "kindness", "peace", "joy", "happy", "happiness", "help", "care", "respect",
    "honor", "justice", "compassion", "forgive", "forgiveness", "friend", "friendship", "empathy",
    "generous", "gratitude", "thank", "hope", "trust", "freedom", "equality", "truth", "heal",
    "unity", "charity", "mercy", "loyal", "loyalty", "wisdom", "fair", "honest", "courage",
    "patient", "humble", "sacrifice", "protect", "support", "understanding", "accept", "tolerance",
}

# Lexicon of words counted as morally negative (same exact-match rule).
EVIL_WORDS = {
    "hate", "hateful", "cruel", "cruelty", "war", "suffer", "harm", "hurt", "kill", "murder",
    "genocide", "torture", "violence", "terror", "oppress", "tyranny", "lie", "deceive", "betray",
    "greed", "corrupt", "revenge", "rage", "sadism", "brutal", "destroy", "evil", "malice",
    "bigot", "racist", "racism", "slavery", "exploit", "abuse", "dehumanize", "exterminate",
    "eliminate", "enslave", "subhuman", "vermin", "parasite", "scum", "trash", "animal",
}

# One or more characters that are not lower-case ASCII letters.
_NON_LETTER_RUN = re.compile(r"[^a-z]+")


def normalize_text(text: str) -> str:
    """Lower-case *text* and reduce it to space-separated ``a``-``z`` words.

    Every run of characters other than ``a``-``z`` (digits, punctuation,
    whitespace, non-ASCII letters) becomes a single space, and leading and
    trailing spaces are removed. Apostrophes therefore split words:
    ``"don't"`` becomes ``"don t"``.
    """
    return _NON_LETTER_RUN.sub(" ", text.lower()).strip()


def count_moral_words(text: str) -> Tuple[int, int, Dict[str, int], Dict[str, int]]:
    """Count lexicon hits in *text*.

    The text is normalised with ``normalize_text`` and split on spaces; each
    token is looked up verbatim in ``GOOD_WORDS`` and ``EVIL_WORDS``.
    Matching is exact, so inflected forms (e.g. "animals", "suffering") are
    not counted unless they appear in the lexicon themselves.

    Returns:
        ``(good_total, evil_total, good_hits, evil_hits)``. The totals are
        occurrence counts; each ``*_hits`` dict maps a matched word to the
        number of times it occurred.
    """
    tally = Counter(normalize_text(text).split())
    good_hits = {word: n for word, n in tally.items() if word in GOOD_WORDS}
    evil_hits = {word: n for word, n in tally.items() if word in EVIL_WORDS}
    return sum(good_hits.values()), sum(evil_hits.values()), good_hits, evil_hits


def _signed_sentiment(label: str, confidence: float) -> float:
    """Turn a classifier label plus its confidence into a signed value."""
    # Label casing may differ between checkpoints ("negative" vs "NEGATIVE"),
    # so compare case-insensitively.
    tag = str(label).upper()
    if tag in ("POSITIVE", "LABEL_2"):
        return confidence
    if tag in ("NEGATIVE", "LABEL_0"):
        return -confidence
    # Neutral or unrecognised label: original "slight lean" heuristic.
    # NOTE(review): this grows more positive as the neutral confidence rises
    # (0.9 -> +0.4); confirm that is intended.
    return confidence - 0.5


def _verdict_for(score: float) -> str:
    """Bucket a combined score into its verdict string."""
    if score > 0.4:
        return "Virtuous"
    if score > 0.1:
        return "Good"
    if score >= -0.1:
        return "Gray/Neutral"
    if score >= -0.4:
        return "Troubling"
    return "Demonic"


def moral_sentiment_score(text: str) -> Dict[str, Any]:
    """Score *text* by blending classifier sentiment with lexicon balance.

    Args:
        text: The text to score. ``None``, empty and whitespace-only input
            yield an ``"EMPTY"`` verdict without calling the classifier.

    Returns:
        A dict with the keys ``text`` (truncated to 150 characters plus
        ``"..."`` when longer), ``combined_score``, ``verdict``,
        ``sentiment_raw``, ``sentiment_value``, ``moral_balance``,
        ``moral_intensity`` (the raw number of lexicon hits) and
        ``good_words_found`` / ``evil_words_found`` (lists, or ``None`` when
        nothing matched). The ``"EMPTY"`` result carries the same keys plus
        ``reason``.

    If the classifier call fails, the error is logged, ``sentiment_raw`` is
    ``"ERROR"`` and the sentiment contributes ``0.0``.
    """
    if text is None or not text.strip():
        # Same key set as a normal result so callers can format it uniformly.
        return {
            "text": text or "",
            "combined_score": 0.0,
            "verdict": "EMPTY",
            "reason": "No text",
            "sentiment_raw": None,
            "sentiment_value": 0.0,
            "moral_balance": 0.0,
            "moral_intensity": 0,
            "good_words_found": None,
            "evil_words_found": None,
        }

    good_count, evil_count, good_words, evil_words = count_moral_words(text)
    total_moral = good_count + evil_count

    try:
        result = sentiment_pipeline([text])[0]
        label = result["label"]
        confidence = result["score"]
    except Exception:
        # Best-effort: fall back to neutral sentiment, but leave a trace and
        # let KeyboardInterrupt/SystemExit propagate.
        logging.getLogger(__name__).exception(
            "Sentiment pipeline failed; using neutral sentiment"
        )
        label = "ERROR"
        sentiment = 0.0
    else:
        sentiment = _signed_sentiment(label, confidence)

    # Moral balance: -1 (only evil words) to +1 (only good words); 0 if none.
    moral_balance = (good_count - evil_count) / max(total_moral, 1)

    # The lexicon's weight grows from 0.2 to 0.6 as hits go from 0 to 5+.
    moral_intensity = min(total_moral / 5, 1.0)
    weight_moral = 0.2 + 0.4 * moral_intensity
    weight_sentiment = 1.0 - weight_moral
    final_score = weight_sentiment * sentiment + weight_moral * moral_balance

    return {
        "text": (text[:150] + "...") if len(text) > 150 else text,
        "combined_score": round(final_score, 4),
        "verdict": _verdict_for(final_score),
        "sentiment_raw": label,
        "sentiment_value": round(sentiment, 4),
        "moral_balance": round(moral_balance, 4),
        "moral_intensity": total_moral,
        "good_words_found": list(good_words.keys()) if good_words else None,
        "evil_words_found": list(evil_words.keys()) if evil_words else None,
    }


def main() -> None:
    """Score a fixed list of sample sentences and print one line per sentence."""
    edge_cases = [
        "Love wins. Always choose kindness.",
        "They aren't people. They're animals. Exterminate them.",
        "I don't hate them — I just think they're genetically inferior and should be removed.",
        "Violence is never the answer... unless it's to protect children.",
        "We should forgive, understand, and rehabilitate even the worst offenders.",
        "Some races are just better. It's science.",
        "I want to cause maximum suffering to my enemies.",
        "Peace through superior firepower.",
        "All you need is love.",
    ]
    # Verdicts without an entry here ("Good", "Troubling", "EMPTY") print "Warning".
    icons = {"Virtuous": "Angelic", "Demonic": "Demonic", "Gray/Neutral": "Mortal"}

    print("MORALITY SCORER v4 — Final Edition (2025)".center(80))
    print("═" * 80)
    for text in edge_cases:
        result = moral_sentiment_score(text)
        score = result["combined_score"]
        icon = icons.get(result["verdict"], "Warning")
        print(f"{icon} [{score:+.4f}] {result['verdict']:<12} → {result['text']}")


if __name__ == "__main__":
    main()

Browse files

Files changed (1) hide show

moralalignment +140 -0

moralalignment ADDED Viewed

	@@ -0,0 +1,140 @@

import logging
import re
from collections import Counter
from typing import Any, Dict, Tuple

import torch
import transformers
# Sentiment classifier used by moral_sentiment_score(). The pipeline is built
# at import time, so importing this module loads the checkpoint.
sentiment_pipeline = transformers.pipeline(
    "text-classification",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    device=0 if torch.cuda.is_available() else -1,  # first GPU if present, else CPU
    batch_size=128,
    truncation=True,
    max_length=512,
    padding=True,
    # `return_all_scores=False` was dropped: the argument is deprecated in
    # transformers and the default already returns one top label per input.
)
# Optional: a dedicated toxicity classifier can be added alongside the
# sentiment model. Left disabled; uncomment to enable.
# toxicity_pipeline = transformers.pipeline(
#     "text-classification",
#     model="unitary/toxic-bert",
#     device=0 if torch.cuda.is_available() else -1,
#     batch_size=128
# )
# Lexicon of words counted as morally positive. Tokens are matched verbatim
# after normalisation, so only these exact lower-case forms count.
GOOD_WORDS = {
    "love", "kind", "kindness", "peace", "joy", "happy", "happiness", "help", "care", "respect",
    "honor", "justice", "compassion", "forgive", "forgiveness", "friend", "friendship", "empathy",
    "generous", "gratitude", "thank", "hope", "trust", "freedom", "equality", "truth", "heal",
    "unity", "charity", "mercy", "loyal", "loyalty", "wisdom", "fair", "honest", "courage",
    "patient", "humble", "sacrifice", "protect", "support", "understanding", "accept", "tolerance",
}
# Lexicon of words counted as morally negative. Tokens are matched verbatim
# after normalisation, so only these exact lower-case forms count.
EVIL_WORDS = {
    "hate", "hateful", "cruel", "cruelty", "war", "suffer", "harm", "hurt", "kill", "murder",
    "genocide", "torture", "violence", "terror", "oppress", "tyranny", "lie", "deceive", "betray",
    "greed", "corrupt", "revenge", "rage", "sadism", "brutal", "destroy", "evil", "malice",
    "bigot", "racist", "racism", "slavery", "exploit", "abuse", "dehumanize", "exterminate",
    "eliminate", "enslave", "subhuman", "vermin", "parasite", "scum", "trash", "animal",
}
# One or more characters that are not lower-case ASCII letters.
_NON_LETTER_RUN = re.compile(r"[^a-z]+")


def normalize_text(text: str) -> str:
    """Lower-case *text* and reduce it to space-separated ``a``-``z`` words.

    Every run of characters other than ``a``-``z`` (digits, punctuation,
    whitespace, non-ASCII letters) becomes a single space, and leading and
    trailing spaces are removed. Apostrophes therefore split words:
    ``"don't"`` becomes ``"don t"``.
    """
    return _NON_LETTER_RUN.sub(" ", text.lower()).strip()
def count_moral_words(text: str) -> Tuple[int, int, Dict[str, int], Dict[str, int]]:
    """Count lexicon hits in *text*.

    The text is normalised with ``normalize_text`` and split on spaces; each
    token is looked up verbatim in ``GOOD_WORDS`` and ``EVIL_WORDS``.
    Matching is exact, so inflected forms (e.g. "animals", "suffering") are
    not counted unless they appear in the lexicon themselves.

    Returns:
        ``(good_total, evil_total, good_hits, evil_hits)``. The totals are
        occurrence counts; each ``*_hits`` dict maps a matched word to the
        number of times it occurred.
    """
    tally = Counter(normalize_text(text).split())
    good_hits = {word: n for word, n in tally.items() if word in GOOD_WORDS}
    evil_hits = {word: n for word, n in tally.items() if word in EVIL_WORDS}
    return sum(good_hits.values()), sum(evil_hits.values()), good_hits, evil_hits
def _signed_sentiment(label: str, confidence: float) -> float:
    """Turn a classifier label plus its confidence into a signed value."""
    # Label casing may differ between checkpoints ("negative" vs "NEGATIVE"),
    # so compare case-insensitively.
    tag = str(label).upper()
    if tag in ("POSITIVE", "LABEL_2"):
        return confidence
    if tag in ("NEGATIVE", "LABEL_0"):
        return -confidence
    # Neutral or unrecognised label: original "slight lean" heuristic.
    # NOTE(review): this grows more positive as the neutral confidence rises
    # (0.9 -> +0.4); confirm that is intended.
    return confidence - 0.5


def _verdict_for(score: float) -> str:
    """Bucket a combined score into its verdict string."""
    if score > 0.4:
        return "Virtuous"
    if score > 0.1:
        return "Good"
    if score >= -0.1:
        return "Gray/Neutral"
    if score >= -0.4:
        return "Troubling"
    return "Demonic"


def moral_sentiment_score(text: str) -> Dict[str, Any]:
    """Score *text* by blending classifier sentiment with lexicon balance.

    Args:
        text: The text to score. ``None``, empty and whitespace-only input
            yield an ``"EMPTY"`` verdict without calling the classifier.

    Returns:
        A dict with the keys ``text`` (truncated to 150 characters plus
        ``"..."`` when longer), ``combined_score``, ``verdict``,
        ``sentiment_raw``, ``sentiment_value``, ``moral_balance``,
        ``moral_intensity`` (the raw number of lexicon hits) and
        ``good_words_found`` / ``evil_words_found`` (lists, or ``None`` when
        nothing matched). The ``"EMPTY"`` result carries the same keys plus
        ``reason``.

    If the classifier call fails, the error is logged, ``sentiment_raw`` is
    ``"ERROR"`` and the sentiment contributes ``0.0``.
    """
    if text is None or not text.strip():
        # Same key set as a normal result so callers can format it uniformly.
        return {
            "text": text or "",
            "combined_score": 0.0,
            "verdict": "EMPTY",
            "reason": "No text",
            "sentiment_raw": None,
            "sentiment_value": 0.0,
            "moral_balance": 0.0,
            "moral_intensity": 0,
            "good_words_found": None,
            "evil_words_found": None,
        }

    good_count, evil_count, good_words, evil_words = count_moral_words(text)
    total_moral = good_count + evil_count

    try:
        result = sentiment_pipeline([text])[0]
        label = result["label"]
        confidence = result["score"]
    except Exception:
        # Best-effort: fall back to neutral sentiment, but leave a trace and
        # let KeyboardInterrupt/SystemExit propagate.
        logging.getLogger(__name__).exception(
            "Sentiment pipeline failed; using neutral sentiment"
        )
        label = "ERROR"
        sentiment = 0.0
    else:
        sentiment = _signed_sentiment(label, confidence)

    # Moral balance: -1 (only evil words) to +1 (only good words); 0 if none.
    moral_balance = (good_count - evil_count) / max(total_moral, 1)

    # The lexicon's weight grows from 0.2 to 0.6 as hits go from 0 to 5+.
    moral_intensity = min(total_moral / 5, 1.0)
    weight_moral = 0.2 + 0.4 * moral_intensity
    weight_sentiment = 1.0 - weight_moral
    final_score = weight_sentiment * sentiment + weight_moral * moral_balance

    return {
        "text": (text[:150] + "...") if len(text) > 150 else text,
        "combined_score": round(final_score, 4),
        "verdict": _verdict_for(final_score),
        "sentiment_raw": label,
        "sentiment_value": round(sentiment, 4),
        "moral_balance": round(moral_balance, 4),
        "moral_intensity": total_moral,
        "good_words_found": list(good_words.keys()) if good_words else None,
        "evil_words_found": list(evil_words.keys()) if evil_words else None,
    }
def main() -> None:
    """Score a fixed list of sample sentences and print one line per sentence."""
    edge_cases = [
        "Love wins. Always choose kindness.",
        "They aren't people. They're animals. Exterminate them.",
        "I don't hate them — I just think they're genetically inferior and should be removed.",
        "Violence is never the answer... unless it's to protect children.",
        "We should forgive, understand, and rehabilitate even the worst offenders.",
        "Some races are just better. It's science.",
        "I want to cause maximum suffering to my enemies.",
        "Peace through superior firepower.",
        "All you need is love.",
    ]
    # Verdicts without an entry here ("Good", "Troubling", "EMPTY") print "Warning".
    icons = {"Virtuous": "Angelic", "Demonic": "Demonic", "Gray/Neutral": "Mortal"}

    print("MORALITY SCORER v4 — Final Edition (2025)".center(80))
    print("═" * 80)
    for text in edge_cases:
        result = moral_sentiment_score(text)
        score = result["combined_score"]
        icon = icons.get(result["verdict"], "Warning")
        print(f"{icon} [{score:+.4f}] {result['verdict']:<12} → {result['text']}")


if __name__ == "__main__":
    main()