Transformers
Italian
English
semantic-search
explainable-ai
faiss
ai-ethics
responsible-ai
llm
prompt-engineering
multimodal-ai
ai-transparency
ethical-intelligence
explainable-llm
cognitive-ai
ethical-ai
scientific-retrieval
modular-ai
memory-augmented-llm
trustworthy-ai
reasoning-engine
ai-alignment
next-gen-llm
thinking-machines
open-source-ai
explainability
ai-research
semantic audit
cognitive agent
human-centered-ai
File size: 4,372 Bytes
3e53932 5d57d47 3e53932 5d57d47 3e53932 5d57d47 3e53932 5d57d47 3e53932 5d57d47 3e53932 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
# © 2025 Elena Marziali — Code released under Apache 2.0 license.
# See LICENSE in the repository for details.
# Removal of this copyright is prohibited.
# This module analyzes responses to detect bias, misinformation,
# non-neutral language, or potentially harmful content.
# The system flags problematic content and suggests revisions.
def check_agent_autonomy(question: str, authorization_level: int) -> str:
    """Check whether a question hints at autonomous sub-goal creation.

    Flags questions that mention sub-goal formation when the caller's
    authorization level is below 2. The check is advisory only: a warning
    is logged but generation is never blocked.

    Args:
        question: The user question or agent instruction to inspect.
        authorization_level: Caller clearance; levels >= 2 are trusted.

    Returns:
        "Ethics: potentially sensitive content" when a sub-goal mention is
        detected at insufficient authorization, otherwise
        "Ethics: normal content".
    """
    # Generalized from the original hyphenated-only "sub-goal" match:
    # also catch the common spelling variants so trivial rephrasings
    # ("subgoal", "sub goal") are not missed.
    subgoal_variants = ("sub-goal", "subgoal", "sub goal")
    text = question.lower()
    if authorization_level < 2 and any(v in text for v in subgoal_variants):
        logging.warning("Sensitive content detected, but generation will not be blocked.")
        return "Ethics: potentially sensitive content"
    return "Ethics: normal content"
# Checks the agent's degree of autonomy
# Used to monitor whether the system is acting too independently or out of context
def assess_ethical_risk(content, domain="scientific"):
    """Evaluate ethical risks in AI-generated content.

    Combines fixed-weight pattern matching with an LLM-based bias score,
    classifies the normalized total, and — when the risk is high — asks the
    LLM for a neutral rewrite.

    Args:
        content: The AI-generated text to evaluate (str).
        domain: Nominal content domain; currently unused by the scoring
            logic and kept only for interface compatibility.

    Returns:
        dict with keys:
            overall_risk (float): score clamped to [0, 1], rounded to 2 dp.
            risk_level (str): "Basso" / "Medio" / "Alto".
            risk_sources (list[str]): labels of every triggered check.
            revision_suggestion (str | None): set when overall_risk > 0.6.
            revised_response (str | None): LLM rewrite when risk is high.

    NOTE(review): relies on a module-level `llm` client defined elsewhere
    in this file — presumably a LangChain-style chat model; confirm.
    """
    text_lower = content.lower()
    risk_score = 0.0
    risk_sources = []

    # Pattern-based scoring: one rule per category, evaluated in the
    # original order with the original weights and labels.
    pattern_rules = [
        ("Critical topic", 0.8,
         ["vaccine", "gender", "politics", "religion", "ethnicity"]),
        ("Linguistic bias", 0.5,
         ["all men", "women are"]),
        ("Misinformation", 0.5,
         ["according to experts without citing sources"]),
        ("Cultural generalization", 0.5,
         ["asians are", "africans are", "europeans are", "americans are",
          "italians are", "japanese are", "muslims are", "jews are"]),
        ("Disciplinary stereotype", 0.5,
         ["psychologists are", "engineers are", "artists are", "scientists are",
          "philosophers are", "mathematicians are"]),
        ("Normative implication", 0.5,
         ["should behave", "must act", "it's right that", "it's wrong to",
          "they ought to", "is unacceptable", "is appropriate"]),
    ]
    for label, weight, patterns in pattern_rules:
        if any(p in text_lower for p in patterns):
            risk_score += weight
            risk_sources.append(label)

    # LLM-based scoring: ask the model for a 0-1 bias score.
    bias_prompt = f"Rate the cultural or ethical bias of the following text on a scale from 0 to 1:\n\n{content}\n\nScore:"
    llm_score_raw = llm.invoke(bias_prompt).content.strip()
    try:
        # Clamp to [0, 1]: the prompt requests that range but the model may
        # reply with an out-of-range number, which previously could swamp
        # the pattern-based score.
        llm_score = min(max(float(llm_score_raw), 0.0), 1.0)
        risk_score += llm_score
        risk_sources.append(f"LLM bias score: {llm_score}")
    except ValueError:
        llm_score = 0.0  # fallback if the model reply is not a bare number

    # Normalize and classify.
    # NOTE(review): level labels are Italian while the rest of the module is
    # English; kept byte-identical because downstream code may match on them.
    overall_risk = min(risk_score, 1.0)
    if overall_risk <= 0.3:
        risk_level = "Basso"   # low
    elif overall_risk <= 0.7:
        risk_level = "Medio"   # medium
    else:
        risk_level = "Alto"    # high

    # Automatic reformulation when the risk is high.
    revised_response = None
    revision_suggestion = None
    if overall_risk > 0.6:
        revision_suggestion = "Ethical risk detected. Rewrite to reduce bias and improve neutrality."
        revision_prompt = f"""Rewrite this to reduce ethical bias and improve neutrality:
Original: {content}
Rewritten:"""
        revised_response = llm.invoke(revision_prompt).content.strip()

    return {
        "overall_risk": round(overall_risk, 2),
        "risk_level": risk_level,
        "risk_sources": risk_sources,
        "revision_suggestion": revision_suggestion,
        "revised_response": revised_response
    }
# Example usage: query the model, run the answer through the ethics check,
# and when a revision is suggested, adopt it and re-check.
prompt = "Discuss the potential risks of generative artificial intelligence in the context of medicine."

# Model invocation
output_ai = llm.invoke(prompt).content.strip()

# Ethical evaluation of the response
ethical_check = assess_ethical_risk(output_ai)
if ethical_check["revision_suggestion"]:
    print(f"Ethics: {ethical_check['revision_suggestion']}")
    # Bug fix: use the rewrite that assess_ethical_risk already produced
    # instead of re-invoking the model with the identical prompt, which
    # would just regenerate a similarly risky answer. Fall back to a
    # fresh invocation only if no rewrite was returned.
    if ethical_check["revised_response"]:
        output_ai = ethical_check["revised_response"]
    else:
        output_ai = llm.invoke(prompt).content.strip()
    ethical_check = assess_ethical_risk(output_ai)