Update src/ethics/ethics.py

5d57d47 verified 2 months ago

4.37 kB

	# © 2025 Elena Marziali — Code released under Apache 2.0 license.
	# See LICENSE in the repository for details.
	# Removal of this copyright is prohibited.

	# This module analyzes responses to detect bias, misinformation,
	# non-neutral language, or potentially harmful content.

	# The system flags problematic content and suggests revisions.
	def check_agent_autonomy(question: str, authorization_level: int) -> str:
	    """Flag questions that mention sub-goals when authorization is low.

	    The check is advisory only: a warning is logged and a label is
	    returned; nothing is blocked.

	    Args:
	        question: Text of the request; matched case-insensitively.
	        authorization_level: Authorization level of the caller; values
	            below 2 allow the flag to trigger.

	    Returns:
	        "Ethics: potentially sensitive content" when the question contains
	        "sub-goal" and ``authorization_level`` is below 2, otherwise
	        "Ethics: normal content".
	    """
	    # Imported here so the flagged branch cannot fail with NameError when
	    # the module has no top-level ``import logging``.
	    import logging

	    if "sub-goal" in question.lower() and authorization_level < 2:
	        logging.warning("Sensitive content detected, but generation will not be blocked.")
	        return "Ethics: potentially sensitive content"
	    return "Ethics: normal content"

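	# check_agent_autonomy (defined above) checks the agent's degree of autonomy;
	# it is used to monitor whether the system is acting too independently or out of context.
	# The function defined below evaluates the ethical risk of generated content.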
	# Substring rules applied to the lower-cased text: (label, weight, patterns).
	# A rule contributes its weight once if any of its patterns occurs.
	_PATTERN_RULES = (
	    ("Critical topic", 0.8,
	     ("vaccine", "gender", "politics", "religion", "ethnicity")),
	    ("Linguistic bias", 0.5,
	     ("all men", "women are")),
	    ("Misinformation", 0.5,
	     ("according to experts without citing sources",)),
	    ("Cultural generalization", 0.5,
	     ("asians are", "africans are", "europeans are", "americans are",
	      "italians are", "japanese are", "muslims are", "jews are")),
	    ("Disciplinary stereotype", 0.5,
	     ("psychologists are", "engineers are", "artists are", "scientists are",
	      "philosophers are", "mathematicians are")),
	    ("Normative implication", 0.5,
	     ("should behave", "must act", "it's right that", "it's wrong to",
	      "they ought to", "is unacceptable", "is appropriate")),
	)


	def _parse_llm_score(raw):
	    """Return the leading number of an LLM reply clamped to [0, 1], or None.

	    Accepts an optional "Score:" label and ignores anything after the
	    number (trailing period, explanation text). Returns None when the reply
	    does not start with a number.
	    """
	    import re

	    match = re.match(
	        r"\s*(?:score\s*[:=]?\s*)?([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)",
	        raw,
	        re.IGNORECASE,
	    )
	    if match is None:
	        return None
	    # Clamp so an out-of-range reply can neither cancel pattern-based risk
	    # (negative values) nor dominate the total.
	    return max(0.0, min(1.0, float(match.group(1))))


	def assess_ethical_risk(content: str, domain: str = "scientific", *, llm_client=None) -> dict:
	    """Score the ethical risk of a text and request a rewrite when it is high.

	    The score is the sum of fixed weights for matched substring patterns
	    plus a bias score requested from the LLM, capped at 1.0. When the capped
	    score exceeds 0.6 the LLM is asked to rewrite the text.

	    Args:
	        content: Text to evaluate.
	        domain: Currently unused; kept for interface compatibility.
	        llm_client: Optional object exposing ``invoke(prompt)`` whose result
	            has a ``content`` string. Defaults to the module-level ``llm``.

	    Returns:
	        A dict with keys ``overall_risk`` (float rounded to 2 decimals),
	        ``risk_level`` ("Basso", "Medio" or "Alto"), ``risk_sources`` (list
	        of labels), ``revision_suggestion`` and ``revised_response`` (both
	        None unless a rewrite was triggered).
	    """
	    import logging

	    client = llm if llm_client is None else llm_client

	    text_lower = content.lower()
	    risk_score = 0.0
	    risk_sources = []

	    # Pattern-based scoring
	    for label, weight, patterns in _PATTERN_RULES:
	        if any(pattern in text_lower for pattern in patterns):
	            risk_score += weight
	            risk_sources.append(label)

	    # LLM-based scoring
	    bias_prompt = (
	        "Rate the cultural or ethical bias of the following text on a scale "
	        f"from 0 to 1:\n\n{content}\n\nScore:"
	    )
	    llm_score_raw = client.invoke(bias_prompt).content.strip()
	    llm_score = _parse_llm_score(llm_score_raw)
	    if llm_score is None:
	        # Best effort: an unparsable reply contributes nothing, but is logged
	        # instead of being dropped silently.
	        logging.warning("Could not parse LLM bias score from reply: %r", llm_score_raw)
	    else:
	        risk_score += llm_score
	        risk_sources.append(f"LLM bias score: {llm_score}")

	    # Normalize and classify
	    overall_risk = min(risk_score, 1.0)

	    if overall_risk <= 0.3:
	        risk_level = "Basso"
	    elif overall_risk <= 0.7:
	        risk_level = "Medio"
	    else:
	        risk_level = "Alto"

	    # Automatic rewrite when the risk is high. The trigger (0.6) is lower
	    # than the "Alto" boundary (0.7), so upper "Medio" results are rewritten too.
	    revised_response = None
	    revision_suggestion = None
	    if overall_risk > 0.6:
	        revision_suggestion = "Ethical risk detected. Rewrite to reduce bias and improve neutrality."
	        # Built with explicit newlines so source indentation never leaks
	        # into the prompt text.
	        revision_prompt = (
	            "Rewrite this to reduce ethical bias and improve neutrality:\n"
	            f"Original: {content}\n"
	            "Rewritten:"
	        )
	        revised_response = client.invoke(revision_prompt).content.strip()

	    return {
	        "overall_risk": round(overall_risk, 2),
	        "risk_level": risk_level,
	        "risk_sources": risk_sources,
	        "revision_suggestion": revision_suggestion,
	        "revised_response": revised_response,
	    }

	# Example usage: generate a response, evaluate it, and report any revision suggestion.
	# NOTE(review): these statements appear to sit at module level, so they would run
	# (and call the LLM) whenever the file is executed or imported — confirm this is intended.
	# NOTE(review): `llm` is not defined or imported in the visible part of the file —
	# confirm where it comes from.

	# Example prompt
	prompt = "Discuss the potential risks of generative artificial intelligence in the context of medicine."

	# Model invocation
	output_ai = llm.invoke(prompt).content.strip()

	# Ethical evaluation of the response
	ethical_check = assess_ethical_risk(output_ai)

	# Report the suggestion when the evaluation asked for a revision.
	if ethical_check["revision_suggestion"]:
	print(f"Ethics: {ethical_check['revision_suggestion']}")

	# Generate a new response with the same prompt and evaluate it again.
	# NOTE(review): indentation was lost in this copy; it is unclear whether these two
	# statements belong inside the `if` above (regenerate only on a flagged response)
	# or run unconditionally — confirm against the original file.
	output_ai = llm.invoke(prompt).content.strip()
	ethical_check = assess_ethical_risk(output_ai)