DeepXR
/

Helion-V1.5

@@ -1,327 +0,0 @@
-# Helion 1.5 Series - Dataset Configuration
-# This file defines the structure, features, and specifications for the Helion 1.5 dataset
-dataset_info:
-  # Identity and release metadata for this dataset version.
-  # Version and date are quoted so they always load as strings.
-  name: 'helion-1.5'
-  version: '1.5.0'
-  release_date: '2024-11-07'
-  description: >-
-    Enhanced large-scale dataset for language model training
-    with improved quality and diversity
-  # NOTE(review): "your-username" looks like a template placeholder;
-  # confirm the real dataset URL.
-  homepage: 'https://huggingface.co/datasets/your-username/helion-1.5'
-  license: 'CC-BY-4.0'
-  # BibTeX entry; the literal block scalar (|) preserves its line breaks.
-  # NOTE(review): the author field below also looks like a placeholder.
-  citation: |
-    @dataset{helion_1_5_2024,
-      title={Helion 1.5: An Enhanced Large-Scale Dataset for Language Model Training},
-      author={Your Name/Organization},
-      year={2024},
-      publisher={Hugging Face}
-    }
-# Dataset Splits Configuration
-# Example count and file pattern for each split.
-# The three counts add up to 2,000,000 examples.
-# NOTE(review): file_pattern looks like a glob over JSONL shards; confirm
-# how the loader expands it.
-splits:
-  train:
-    num_examples: 1800000
-    file_pattern: 'train-*.jsonl'
-  validation:
-    num_examples: 100000
-    file_pattern: 'validation-*.jsonl'
-  test:
-    num_examples: 100000
-    file_pattern: 'test-*.jsonl'
-# File Configurations
-# One entry per data file, each with the same five keys:
-# filename, description, size_mb, num_examples, format.
-# The num_examples values add up to 2,000,000 and the size_mb values
-# add up to 14050.
-files:
-  conversations:
-    filename: 'helion-1.5-conversations.jsonl'
-    description: 'Multi-turn conversational data'
-    size_mb: 5200
-    num_examples: 800000
-    format: 'jsonl'
-  instructions:
-    filename: 'helion-1.5-instructions.jsonl'
-    description: 'Instruction-following pairs'
-    size_mb: 3800
-    num_examples: 600000
-    format: 'jsonl'
-  code:
-    filename: 'helion-1.5-code.jsonl'
-    description: 'Programming and code generation'
-    size_mb: 2100
-    num_examples: 250000
-    format: 'jsonl'
-  reasoning:
-    filename: 'helion-1.5-reasoning.jsonl'
-    description: 'Complex reasoning and problem-solving'
-    size_mb: 1400
-    num_examples: 180000
-    format: 'jsonl'
-  creative:
-    filename: 'helion-1.5-creative.jsonl'
-    description: 'Creative writing and content'
-    size_mb: 900
-    num_examples: 120000
-    format: 'jsonl'
-  multilingual:
-    filename: 'helion-1.5-multilingual.jsonl'
-    description: 'Multilingual data across 30+ languages'
-    size_mb: 650
-    num_examples: 50000
-    format: 'jsonl'
-# Feature Schemas
-# Field-level schema for each record type.
-# NOTE(review): files also lists creative and multilingual, but no schema
-# with those names is defined in this section; confirm whether that is
-# intentional.
-schemas:
-  # Conversation record: an id, a list of turns, and descriptive metadata.
-  conversations:
-    id:
-      type: string
-      description: 'Unique conversation identifier'
-    conversations:
-      type: list
-      description: 'List of conversation turns'
-      # Each turn carries a role and its message content.
-      items:
-        role:
-          type: string
-          enum:
-            - 'user'
-            - 'assistant'
-            - 'system'
-        content:
-          type: string
-          description: 'Message content'
-    metadata:
-      type: object
-      properties:
-        domain:
-          type: string
-          enum:
-            - 'general'
-            - 'science'
-            - 'technology'
-            - 'math'
-            - 'history'
-            - 'literature'
-            - 'arts'
-            - 'business'
-            - 'health'
-            - 'other'
-        difficulty:
-          type: string
-          enum:
-            - 'easy'
-            - 'intermediate'
-            - 'advanced'
-            - 'expert'
-        languages:
-          type: list
-          items: string
-        # Bounded score; range gives the [min, max] pair.
-        quality_score:
-          type: float
-          range: [0.0, 1.0]
-        word_count:
-          type: integer
-        turn_count:
-          type: integer
-        has_code:
-          type: boolean
-        topics:
-          type: list
-          items: string
-  # Instruction record: instruction text, optional input, expected output,
-  # and metadata describing the task.
-  instructions:
-    id:
-      type: string
-    instruction:
-      type: string
-      description: 'The instruction or task'
-    input:
-      type: string
-      description: 'Optional input context'
-    output:
-      type: string
-      description: 'Expected output or response'
-    metadata:
-      type: object
-      properties:
-        task_type:
-          type: string
-          enum:
-            - 'summarization'
-            - 'question_answering'
-            - 'translation'
-            - 'classification'
-            - 'generation'
-            - 'editing'
-            - 'analysis'
-            - 'other'
-        complexity:
-          type: string
-          enum:
-            - 'low'
-            - 'medium'
-            - 'high'
-            - 'very_high'
-        verified:
-          type: boolean
-        # domain and language are free-form strings here (no enum given).
-        domain:
-          type: string
-        language:
-          type: string
-  # Code record: problem statement, solution, explanation, test cases,
-  # and metadata about the solution.
-  code:
-    id:
-      type: string
-    language:
-      type: string
-      enum:
-        - 'python'
-        - 'javascript'
-        - 'java'
-        - 'cpp'
-        - 'c'
-        - 'go'
-        - 'rust'
-        - 'typescript'
-        - 'sql'
-        - 'html'
-        - 'css'
-        - 'bash'
-        - 'other'
-    problem:
-      type: string
-      description: 'Problem statement or task'
-    solution:
-      type: string
-      description: 'Code solution'
-    explanation:
-      type: string
-      description: 'Explanation of the solution'
-    test_cases:
-      type: list
-      # Shorthand "field: type" form, unlike the type/description form
-      # used for the other fields.
-      items:
-        input: string
-        output: string
-        description: string
-    metadata:
-      type: object
-      properties:
-        # NOTE(review): these labels differ from the conversations
-        # difficulty enum (easy/intermediate/advanced/expert); confirm
-        # the difference is intentional.
-        difficulty:
-          type: string
-          enum:
-            - 'easy'
-            - 'medium'
-            - 'hard'
-            - 'expert'
-        tags:
-          type: list
-          items: string
-        time_complexity:
-          type: string
-        space_complexity:
-          type: string
-        lines_of_code:
-          type: integer
-  # Reasoning record: a problem, its ordered reasoning steps, the final
-  # answer, and metadata about the kind of reasoning.
-  reasoning:
-    id:
-      type: string
-    problem:
-      type: string
-      description: 'Problem or question requiring reasoning'
-    reasoning_steps:
-      type: list
-      # Shorthand "field: type" form for each step entry.
-      items:
-        step_number: integer
-        description: string
-        calculation: string
-    final_answer:
-      type: string
-    metadata:
-      type: object
-      properties:
-        reasoning_type:
-          type: string
-          enum:
-            - 'mathematical'
-            - 'logical'
-            - 'causal'
-            - 'spatial'
-            - 'temporal'
-            - 'analogical'
-            - 'counterfactual'
-        steps_count:
-          type: integer
-        # difficulty and domain are free-form strings here (no enum given).
-        difficulty:
-          type: string
-        domain:
-          type: string
-# Quality Metrics
-# Dataset-wide thresholds followed by the list of filtering stages.
-quality_standards:
-  minimum_quality_score: 0.75
-  required_fields_completion: 0.95
-  duplicate_threshold: 0.85
-  toxic_content_threshold: 0.01
-  filtering_pipeline:
-    # threshold here has the same value as duplicate_threshold above.
-    - name: 'deduplication'
-      method: 'minhash_lsh'
-      threshold: 0.85
-    - name: 'language_detection'
-      method: 'fasttext'
-      confidence_threshold: 0.8
-    - name: 'quality_scoring'
-      method: 'ensemble'
-      models:
-        - 'perplexity'
-        - 'coherence'
-        - 'fluency'
-    - name: 'safety_filtering'
-      method: 'classifier'
-      categories:
-        - 'toxic'
-        - 'harmful'
-        - 'biased'
-        - 'personal_info'
-    - name: 'format_validation'
-      method: 'schema_validation'
-      strict: true
-# Domain Distribution
-domain_distribution:  # the ten values below sum to 1.00
-  general_knowledge: 0.25  # largest share
-  science_technology: 0.20
-  mathematics: 0.12
-  programming: 0.15
-  creative_writing: 0.08
-  business_finance: 0.05
-  health_medicine: 0.05
-  history_culture: 0.05
-  arts_entertainment: 0.03
-  other: 0.02  # smallest share; NOTE(review): unit (examples vs tokens) is not stated
-# Language Distribution
-language_distribution:  # the eleven values below sum to 1.00
-  en: 0.70  # keys look like ISO 639-1 codes; TODO confirm
-  es: 0.05
-  fr: 0.04
-  de: 0.03
-  zh: 0.03
-  ja: 0.02
-  pt: 0.02
-  ar: 0.02
-  ru: 0.02
-  it: 0.02
-  other: 0.05  # every language not listed by name above
-# Training Recommendations
-# Suggested hyperparameters, per-file mixing weights, and epoch counts.
-training_config:
-  recommended_batch_size: 4
-  recommended_gradient_accumulation: 8
-  # 4 (batch size) x 8 (gradient accumulation) = 32.
-  effective_batch_size: 32
-  recommended_learning_rate: 2.0e-5
-  warmup_steps: 1000
-  max_sequence_length: 2048
-  # Keys match the entries under files; the six weights sum to 1.00.
-  data_mixing_weights:
-    conversations: 0.35
-    instructions: 0.30
-    code: 0.15
-    reasoning: 0.10
-    creative: 0.06
-    multilingual: 0.04
-  suggested_epochs:
-    full_training: 3
-    # Loads as the string "1-2" (a range), unlike its integer siblings;
-    # quoted to make that explicit.
-    fine_tuning: '1-2'
-    domain_adaptation: 1
-# Evaluation Benchmarks
-# Benchmark names paired with an expected improvement, stored as strings.
-# NOTE(review): the baseline these improvements are measured against is
-# not stated in this file.
-evaluation_benchmarks:
-  - name: 'MMLU'
-    expected_improvement: '+5%'
-  - name: 'HumanEval'
-    expected_improvement: '+8%'
-  - name: 'GSM8K'
-    expected_improvement: '+6%'
-  - name: 'HellaSwag'
-    expected_improvement: '+3%'
-  - name: 'TruthfulQA'
-    expected_improvement: '+4%'
-# Versioning
-# major/minor/patch spell out 1.5.0, the same version as
-# dataset_info.version and the first changelog entry.
-versioning:
-  major: 1
-  minor: 5
-  patch: 0
-  # Entries are listed newest first.
-  changelog:
-    - version: '1.5.0'
-      date: '2024-11-07'
-      changes:
-        - 'Initial Helion 1.5 release'
-        - '3x increase in dataset size'
-        - 'Added multilingual support (30+ languages)'
-        - 'Improved code dataset (5x larger)'
-        - 'Enhanced reasoning tasks'
-        - 'Better quality filtering'
-    - version: '1.0.0'
-      date: '2024-05-01'
-      changes:
-        - 'Original Helion 1 release'
-# Maintenance
-# Update cadence, deprecation notice period, and where to report problems.
-maintenance:
-  update_frequency: 'quarterly'
-  deprecation_policy: '12 months notice'
-  # NOTE(review): "your-repo" looks like a template placeholder; confirm
-  # the real issue tracker URL.
-  bug_report_url: 'https://github.com/your-repo/issues'
-  community_contributions: true
-# Contact
-# NOTE(review): every value below looks like a template placeholder;
-# confirm the real contact details.
-contact:
-  maintainer: 'Your Name/Organization'
-  email: 'contact@example.com'
-  # Has no URL scheme, unlike the https:// URLs elsewhere in this file.
-  discord: 'discord.gg/your-server'
-  # Must stay quoted: a plain scalar cannot start with "@".
-  twitter: '@your_handle'