|
|
|
|
|
|
|
|
|
|
|
from dataclasses import dataclass |
|
|
from typing import List, Dict |
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM |
|
|
import torch |
|
|
|
|
|
@dataclass
class EthicsCheckResult:
    """Outcome record produced by an ethics/quality check.

    All four fields are required and are accepted positionally in the
    order declared below.
    """

    # Overall verdict of the check.
    passed: bool
    # Numeric score attached to the verdict.
    score: float
    # Human-readable explanation of the verdict.
    reasoning: str
    # Follow-up suggestions; may be an empty list.
    recommendations: List[str]
|
|
|
|
|
class AIEthicsFramework:
    """Keyword screening for incoming queries and a length check for responses.

    Both checks are simple heuristics: queries are scanned for fixed
    substrings, and responses are scored purely by their word count.
    """

    # Not consulted by any method defined in this class.
    BLOCKED_DOMAINS = ['medical_diagnosis_unsupervised', 'legal_judgment', 'hiring_decisions']

    # Lower-case substrings searched for in the lower-cased query. Matching is
    # plain containment, so a keyword embedded in a longer word also matches.
    PII_KEYWORDS = ('ssn', 'password', 'credit card')
    UNSAFE_KEYWORDS = ('hack', 'exploit', 'weaponize')

    # Word count at which a response reaches the maximum score of 1.0.
    FULL_SCORE_WORD_COUNT = 20
    # A response passes only when its score is strictly greater than this.
    PASSING_SCORE = 0.3

    def __init__(self):
        # Starts empty; no method in this class appends to it.
        self.audit_log = []

    def validate_query(self, query: str) -> Dict:
        """Screen a query against the PII and unsafe keyword lists.

        Args:
            query: Raw query text; compared case-insensitively.

        Returns:
            A dict with:
              * ``'is_allowed'``: False if any keyword from either list occurs.
              * ``'reason'``: ``"Query requests PII"``, ``"Query seeks harmful
                information"`` or ``'Query approved'``. A PII hit takes
                precedence when both lists match.
              * ``'details'``: ``'pii_check'`` / ``'safety_check'`` flags that
                are True when the corresponding keyword list *matched*.
        """
        # Lower-case once instead of once per keyword.
        lowered = query.lower()
        has_pii = any(keyword in lowered for keyword in self.PII_KEYWORDS)
        is_unsafe = any(keyword in lowered for keyword in self.UNSAFE_KEYWORDS)

        if has_pii:
            reason = "Query requests PII"
        elif is_unsafe:
            reason = "Query seeks harmful information"
        else:
            reason = 'Query approved'

        return {
            'is_allowed': not (has_pii or is_unsafe),
            'reason': reason,
            'details': {'pii_check': has_pii, 'safety_check': is_unsafe},
        }

    def validate_response(self, response: str) -> "EthicsCheckResult":
        """Score a generated response by its whitespace-separated word count.

        The score is ``word_count / FULL_SCORE_WORD_COUNT`` capped at 1.0, and
        the response passes when the score exceeds ``PASSING_SCORE``.

        Args:
            response: Generated text to evaluate.

        Returns:
            An ``EthicsCheckResult`` whose ``reasoning`` is ``"Response quality
            acceptable"`` on a pass and ``"Response too brief"`` otherwise;
            ``recommendations`` is always an empty list.
        """
        word_count = len(response.split())
        quality = min(word_count / self.FULL_SCORE_WORD_COUNT, 1.0)
        passed = quality > self.PASSING_SCORE

        return EthicsCheckResult(
            passed=passed,
            score=quality,
            reasoning="Response quality acceptable" if passed else "Response too brief",
            recommendations=[],
        )
|
|
|
|
|
def initialize_llm(model_name: str):
    """Load a causal language model and its tokenizer and wrap them for generation.

    Args:
        model_name: Identifier passed unchanged to ``from_pretrained`` for both
            the tokenizer and the model.

    Returns:
        A ``SimpleLLM`` instance (class defined inside this function) holding
        the loaded model and tokenizer and exposing
        ``generate(prompt, max_tokens=300)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # NOTE(review): newer transformers releases prefer passing
    # quantization_config=BitsAndBytesConfig(load_in_8bit=True) over the bare
    # load_in_8bit flag -- confirm against the installed version.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        load_in_8bit=True,
    )

    class SimpleLLM:
        """Thin wrapper pairing a model with its tokenizer."""

        def __init__(self, model, tokenizer):
            self.model = model
            self.tokenizer = tokenizer

        def generate(self, prompt: str, max_tokens: int = 300):
            """Generate text for ``prompt`` and return the decoded string.

            Args:
                prompt: Input text to tokenize and feed to the model.
                max_tokens: Upper bound on newly generated tokens
                    (forwarded as ``max_new_tokens``).

            Returns:
                The first generated sequence decoded with special tokens
                removed. Presumably this includes the prompt text for
                decoder-only models -- verify against the model in use.
            """
            # Move the encoded prompt to the device the model lives on.
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    # temperature/top_p only affect decoding when sampling is
                    # on; make that explicit rather than relying on the
                    # model's default generation config.
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                )
            # generate() returns a batch of sequences; decode() takes a single
            # sequence, so pick the only entry for this one-prompt call.
            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    return SimpleLLM(model, tokenizer)
|
|
|