Transformers
Italian
English
semantic-search
explainable-ai
faiss
ai-ethics
responsible-ai
llm
prompt-engineering
multimodal-ai
ai-transparency
ethical-intelligence
explainable-llm
cognitive-ai
ethical-ai
scientific-retrieval
modular-ai
memory-augmented-llm
trustworthy-ai
reasoning-engine
ai-alignment
next-gen-llm
thinking-machines
open-source-ai
explainability
ai-research
semantic audit
cognitive agent
human-centered-ai
| # © 2025 Elena Marziali — Code released under Apache 2.0 license. | |
| # See LICENSE in the repository for details. | |
| # Removal of this copyright is prohibited. | |
| # This module analyzes responses to detect bias, misinformation, | |
| # non-neutral language, or potentially harmful content. | |
| # The system flags problematic content and suggests revisions. | |
def check_agent_autonomy(question: str, authorization_level: int) -> str:
    """Check the agent's degree of autonomy for a given question.

    Used to monitor whether the system is acting too independently or out
    of context. A question is flagged when it mentions "sub-goal"
    (case-insensitive) and the caller's authorization level is below 2.
    Flagging only logs a warning and changes the returned label; generation
    is never blocked by this function.

    Args:
        question: The user question or task description to inspect. An empty
            or ``None`` question is treated as normal content.
        authorization_level: Numeric authorization level of the caller.

    Returns:
        ``"Ethics: potentially sensitive content"`` when the question is
        flagged, otherwise ``"Ethics: normal content"``.
    """
    # Nothing to inspect: avoid calling .lower() on None / empty input.
    if not question:
        return "Ethics: normal content"

    mentions_sub_goal = "sub-goal" in question.lower()
    if mentions_sub_goal and authorization_level < 2:
        logging.warning("Sensitive content detected, but generation will not be blocked.")
        return "Ethics: potentially sensitive content"
    return "Ethics: normal content"
def _parse_bias_score(raw_reply):
    """Extract the first finite number from an LLM reply, clamped to [0, 1].

    Returns ``None`` when the reply contains no usable number.
    """
    for token in raw_reply.split():
        # Tolerate surrounding punctuation such as "0.7." or "(0.7)".
        candidate = token.strip(".,;:()[]\"'*")
        try:
            score = float(candidate)
        except ValueError:
            continue
        # float() also accepts "nan" / "inf"; those are not valid scores.
        if score != score or score in (float("inf"), float("-inf")):
            continue
        return max(0.0, min(score, 1.0))
    return None


def assess_ethical_risk(content, domain="scientific"):
    """Evaluate ethical risks in AI-generated content with a scalar score.

    Fixed weights are added for each group of textual patterns found in the
    content, then an LLM-provided bias score (0 to 1) is added on top. The
    sum is capped at 1.0 and mapped to a risk level. When the capped risk
    exceeds 0.6, the LLM is asked to rewrite the content more neutrally.

    Pattern detection is plain case-insensitive substring matching, so a
    term also matches inside longer words (e.g. "gender" in "engender").

    Args:
        content: Text to evaluate. Empty, whitespace-only or ``None`` content
            yields a zero-risk result without calling the LLM.
        domain: Currently unused; kept for interface compatibility.

    Returns:
        A dict with the keys ``overall_risk`` (float rounded to 2 decimals),
        ``risk_level`` (``"Basso"``, ``"Medio"`` or ``"Alto"``),
        ``risk_sources`` (list of labels describing what contributed),
        ``revision_suggestion`` (str or ``None``) and ``revised_response``
        (str or ``None``).
    """
    # Nothing to assess: skip pattern matching and both LLM calls.
    if not content or not content.strip():
        return {
            "overall_risk": 0.0,
            "risk_level": "Basso",
            "risk_sources": [],
            "revision_suggestion": None,
            "revised_response": None,
        }

    text_lower = content.lower()
    risk_score = 0.0
    risk_sources = []

    # Pattern-based scoring: (weight, label, substrings). A group contributes
    # its weight once, no matter how many of its substrings are present.
    pattern_rules = (
        (0.8, "Critical topic",
         ("vaccine", "gender", "politics", "religion", "ethnicity")),
        (0.5, "Linguistic bias",
         ("all men", "women are")),
        (0.5, "Misinformation",
         ("according to experts without citing sources",)),
        (0.5, "Cultural generalization",
         ("asians are", "africans are", "europeans are", "americans are",
          "italians are", "japanese are", "muslims are", "jews are")),
        (0.5, "Disciplinary stereotype",
         ("psychologists are", "engineers are", "artists are", "scientists are",
          "philosophers are", "mathematicians are")),
        (0.5, "Normative implication",
         ("should behave", "must act", "it's right that", "it's wrong to",
          "they ought to", "is unacceptable", "is appropriate")),
    )
    for weight, label, patterns in pattern_rules:
        if any(pattern in text_lower for pattern in patterns):
            risk_score += weight
            risk_sources.append(label)

    # LLM-based scoring: the reply may wrap the number in extra text, so it
    # is parsed leniently; an unparseable reply contributes nothing.
    bias_prompt = f"Rate the cultural or ethical bias of the following text on a scale from 0 to 1:\n\n{content}\n\nScore:"
    llm_score = _parse_bias_score(llm.invoke(bias_prompt).content)
    if llm_score is not None:
        risk_score += llm_score
        risk_sources.append(f"LLM bias score: {llm_score}")

    # Normalize and classify
    overall_risk = min(risk_score, 1.0)
    if overall_risk <= 0.3:
        risk_level = "Basso"
    elif overall_risk <= 0.7:
        risk_level = "Medio"
    else:
        risk_level = "Alto"

    # Automatic rewrite when the risk is high.
    # NOTE: the rewrite threshold (0.6) is lower than the "Alto" threshold
    # (0.7), so upper-"Medio" content is rewritten as well.
    revised_response = None
    revision_suggestion = None
    if overall_risk > 0.6:
        revision_suggestion = "Ethical risk detected. Rewrite to reduce bias and improve neutrality."
        revision_prompt = (
            "Rewrite this to reduce ethical bias and improve neutrality:\n"
            f"Original: {content}\n"
            "Rewritten:"
        )
        revised_response = llm.invoke(revision_prompt).content.strip()

    return {
        "overall_risk": round(overall_risk, 2),
        "risk_level": risk_level,
        "risk_sources": risk_sources,
        "revision_suggestion": revision_suggestion,
        "revised_response": revised_response,
    }
# Example prompt
prompt = "Discuss the potential risks of generative artificial intelligence in the context of medicine."

# Model invocation
output_ai = llm.invoke(prompt).content.strip()

# Ethical evaluation of the response
ethical_check = assess_ethical_risk(output_ai)

# NOTE(review): the original indentation was lost; the regeneration and the
# second evaluation are assumed to belong to this branch — confirm.
if ethical_check["revision_suggestion"]:
    print(f"Ethics: {ethical_check['revision_suggestion']}")
    # Prefer the neutral rewrite already produced by the assessment; sending
    # the identical prompt again gives the model no push towards neutrality.
    # Fall back to regeneration only when no rewrite is available.
    output_ai = ethical_check["revised_response"] or llm.invoke(prompt).content.strip()
    ethical_check = assess_ethical_risk(output_ai)