Kirim-ai
/

Kirim-Safeguard-R1-10B

+# Root key of the safety policy document.
+safety_policy:
+  # Quoted so the version is always loaded as a string.
+  version: '1.0.0'
+  # Quoted on purpose: an unquoted ISO date is loaded as a date/timestamp
+  # object by some YAML parsers and as a plain string by others. Quoting
+  # keeps the loaded type the same everywhere.
+  last_updated: '2025-12-22'
+  # `strict` is also the name of an entry under safety_modes.
+  enforcement_level: strict
+  # Human-readable summary of what the policy is and where it applies.
+  overview:
+    description: Comprehensive safety policy for Kirim OSS Safeguard R1 10B
+    scope: All model interactions and deployments
+    compliance_required: true
+  # Content categories covered by the policy. Each entry has an enabled
+  # flag, a severity label, a description, a list of items and an action
+  # name.
+  prohibited_content:
+    # Hateful or discriminatory content. The action name matches a key
+    # under response_protocols.
+    hate_speech:
+      description: Content that promotes hatred or discrimination
+      enabled: true
+      severity: critical
+      action: refuse_and_report
+      # NOTE(review): every sibling entry lists its items under
+      # `categories`; only this one uses `examples`. Confirm which key the
+      # consumer reads before renaming.
+      examples:
+        - Racial or ethnic slurs
+        - Religious intolerance
+        - Gender-based discrimination
+        - LGBTQ+ hate speech
+        - Xenophobia or nationalism
+    # Violent or harmful content. Severity is critical; the action name
+    # matches a key under response_protocols.
+    violence:
+      description: Content promoting violence or harm
+      enabled: true
+      severity: critical
+      action: refuse_and_report
+      categories:
+        - Physical violence
+        - Self-harm instructions
+        - Terrorism or extremism
+        - Animal cruelty
+        - Weapons manufacturing
+    # Explicit or otherwise inappropriate sexual content. Severity is
+    # critical; the action name matches a key under response_protocols.
+    sexual_content:
+      description: Explicit sexual or inappropriate content
+      enabled: true
+      severity: critical
+      action: refuse_and_report
+      categories:
+        - Explicit sexual content
+        - Content involving minors
+        - Non-consensual activities
+        - Sexual exploitation
+        - Inappropriate requests
+    # Content tied to illegal activity. Severity is critical; the action
+    # name matches a key under response_protocols.
+    illegal_activity:
+      description: Content related to illegal activities
+      enabled: true
+      severity: critical
+      action: refuse_and_report
+      categories:
+        - Drug manufacturing or trafficking
+        - Theft or fraud instructions
+        - Hacking or unauthorized access
+        - Copyright infringement
+        - Money laundering
+        - Human trafficking
+    # Harassment and bullying of individuals. Severity is high; the action
+    # name matches a key under response_protocols.
+    harassment:
+      description: Content that harasses or bullies individuals
+      enabled: true
+      severity: high
+      action: refuse_and_warn
+      categories:
+        - Personal attacks
+        - Doxing or privacy violations
+        - Stalking behaviors
+        - Workplace harassment
+        - Cyberbullying
+    # Deliberately false or misleading information. This is the only entry
+    # here whose action is correct_and_inform rather than a refusal.
+    misinformation:
+      description: Deliberately false or misleading information
+      enabled: true
+      severity: high
+      action: correct_and_inform
+      categories:
+        - Health misinformation
+        - Election interference
+        - Financial fraud
+        - Conspiracy theories
+        - Scientific denial
+    # Privacy and data-protection violations. Severity is high; the action
+    # name matches a key under response_protocols.
+    privacy_violations:
+      description: Content violating privacy or data protection
+      enabled: true
+      severity: high
+      action: refuse_and_warn
+      categories:
+        - Personal information disclosure
+        - Identity theft
+        - Surveillance instructions
+        - Data breaches
+        - Unauthorized tracking
+    # Spam and deceptive practices. Severity is medium; the action is the
+    # plain `refuse` entry under response_protocols.
+    spam_manipulation:
+      description: Spam, manipulation, or deceptive practices
+      enabled: true
+      severity: medium
+      action: refuse
+      categories:
+        - Mass spam generation
+        - SEO manipulation
+        - Review fraud
+        - Social media manipulation
+        - Phishing attempts
+  # Named safety presets. refusal_threshold rises from strict (0.3) through
+  # moderate (0.5) to lenient (0.7).
+  # NOTE(review): false_positive_tolerance is `low` for strict and `high`
+  # for lenient. If a lower refusal_threshold means more refusals, strict
+  # would produce more false positives, not fewer. Confirm the intended
+  # meaning with whatever consumes these fields.
+  safety_modes:
+    strict:
+      description: Maximum safety with conservative filtering
+      refusal_threshold: 0.3
+      false_positive_tolerance: low
+      recommended_for:
+        - Public-facing applications
+        - Educational platforms
+        - Child-safe environments
+        - High-risk industries
+    moderate:
+      description: Balanced safety and functionality
+      refusal_threshold: 0.5
+      false_positive_tolerance: medium
+      recommended_for:
+        - General enterprise use
+        - Customer service
+        - Content creation
+        - Research applications
+    # The only mode that carries a `warnings` list.
+    lenient:
+      description: Minimal restrictions for specialized use
+      refusal_threshold: 0.7
+      false_positive_tolerance: high
+      warnings:
+        - Requires additional monitoring
+        - Not recommended for public deployment
+        - Should include human oversight
+      recommended_for:
+        - Research environments
+        - Content moderation testing
+        - Red team exercises
+  # Response protocols, keyed by the action names used under
+  # prohibited_content. Each has a message template and a list of actions.
+  response_protocols:
+    refuse_and_report:
+      # Folded scalar: the lines join with single spaces into one string
+      # with no trailing newline. {category} is kept literally.
+      message_template: >-
+        I cannot help with this request as it violates our safety policies
+        regarding {category}. This interaction has been logged.
+      actions:
+        - Decline request clearly
+        - Log incident with details
+        - Alert monitoring system
+        - Provide explanation to user
+    refuse_and_warn:
+      # Folded scalar, same rules as above.
+      message_template: >-
+        I'm unable to assist with this request as it may involve {category}.
+        Please review our usage policies.
+      actions:
+        - Decline request politely
+        - Log incident
+        - Provide policy explanation
+    # This template has no {category} placeholder.
+    correct_and_inform:
+      message_template: 'I should clarify some information about this topic...'
+      actions:
+        - Provide accurate information
+        - Cite reliable sources
+        - Explain misconception
+    # This template has no {category} placeholder.
+    refuse:
+      message_template: 'I cannot assist with this type of request.'
+      actions:
+        - Decline request
+        - Brief explanation
+  # Topics with their own policy label. The three advice entries pair an
+  # `allowed` list with a `restrictions` list; `minors` has a single
+  # `requirements` list instead.
+  special_categories:
+    medical_advice:
+      policy: informational_only
+      allowed:
+        - General health information
+        - Encourage consulting professionals
+      restrictions:
+        - No diagnosis
+        - No treatment recommendations
+        - No prescription advice
+    legal_advice:
+      policy: informational_only
+      allowed:
+        - General legal concepts
+        - Encourage consulting attorneys
+      restrictions:
+        - No specific legal advice
+        - No case strategy
+        - No contract review
+    # Uses educational_only, unlike the two entries above.
+    financial_advice:
+      policy: educational_only
+      allowed:
+        - General financial concepts
+        - Educational information
+      restrictions:
+        - No investment recommendations
+        - No specific stock picks
+        - No tax advice
+    minors:
+      policy: enhanced_protection
+      requirements:
+        - Age-appropriate content only
+        - Extra safety measures
+        - No personal information collection
+        - Educational focus
+  # Adversarial-input settings: two groups, each an enabled flag plus a
+  # list of named techniques or methods.
+  adversarial_robustness:
+    jailbreak_detection:
+      techniques:
+        - Prompt injection detection
+        - Role-play exploitation detection
+        - Encoding bypass detection
+        - Multi-turn manipulation detection
+      enabled: true
+    evasion_prevention:
+      methods:
+        - Context analysis
+        - Intent recognition
+        - Pattern matching
+        - Semantic understanding
+      enabled: true
+  # Logging, tracked metrics, alert thresholds and report delivery.
+  monitoring_reporting:
+    logging:
+      level: comprehensive
+      # NOTE(review): this is the string `enabled`, while other on/off
+      # flags in this file are booleans. Confirm what the consumer expects.
+      pii_redaction: enabled
+      retention_period_days: 90
+    metrics:
+      tracked:
+        - Safety filter triggers
+        - Refusal rates by category
+        - False positive rates
+        - User feedback
+        - Adversarial attempts
+    # Thresholds are plain strings (a count and a unit joined by
+    # underscores), not numbers. Presumably the consumer parses them; verify.
+    alerting:
+      critical_threshold: 10_violations_per_hour
+      review_required: 100_violations_per_day
+      auto_shutdown: 1000_violations_per_hour
+    reporting:
+      format: json
+      frequency: daily
+      recipients:
+        - safety_team@kirim.ai
+        - compliance@kirim.ai
+  # Regulations and certifications listed for this policy, plus audit
+  # cadence.
+  compliance:
+    auditing:
+      external_review: annual
+      frequency: quarterly
+    certifications:
+      - ISO 27001
+      - SOC 2 Type II
+    regulations:
+      - GDPR
+      - CCPA
+      - COPPA
+      - EU AI Act
+      - Digital Services Act
+  # User-facing controls: feedback channels, appeals and transparency
+  # settings.
+  user_controls:
+    feedback:
+      enabled: true
+      channels:
+        - In-app reporting
+        # Quoted on purpose: unquoted, the `: ` makes this item load as a
+        # one-key mapping ({Email: ...}) while its siblings are strings.
+        - 'Email: safety@kirim.ai'
+        - Web form
+    appeals:
+      process: available
+      # Plain string, not a number.
+      response_time: 48_hours
+      review_team: human_moderators
+    transparency:
+      policy_access: public
+      explanation_provided: always
+      reasoning_available: on_request
+  # Cadence for model updates and policy reviews, plus the listed research
+  # areas.
+  continuous_improvement:
+    model_updates:
+      frequency: monthly
+      rollback_plan: available
+      testing_required: true
+    policy_review:
+      frequency: quarterly
+      public_comment: encouraged
+      stakeholder_input: required
+    research:
+      active: true
+      areas:
+        - Bias detection and mitigation
+        - Adversarial robustness
+        - Cross-cultural safety
+        - Emerging risks
+  # Exemption types. Each has an `available` flag and a list of
+  # requirements.
+  exceptions:
+    research_exemptions:
+      requirements:
+        - IRB approval
+        - Clear research purpose
+        - Additional safeguards
+        - Regular reporting
+      available: true
+    educational_use:
+      requirements:
+        - Verified institution
+        - Supervised environment
+        - Clear educational purpose
+      available: true