File size: 1,215 Bytes
6aac7da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
{
  "total_samples": 38512,
  "train_samples": 36587,
  "val_samples": 1925,
  "total_tokens": 6329257,
  "source_counts": {
    "alpaca_arabic": 4991,
    "aya": 10522,
    "alpaca_gpt4_arabic": 5000,
    "hebrew_sentiment_instruction": 3963,
    "hebrew_heq_instruction": 1217,
    "dolly": 3000,
    "hebrew_sft_v3_combined": 1484,
    "hebrew_hesum_instruction": 565,
    "hebrew_translation_instruction": 2171,
    "alpaca_en": 4999,
    "hebrew_alpaca_hebrew": 261,
    "hebrew_hebnli_instruction": 136,
    "hebrew_dolly_hebrew": 123,
    "hebrew_chat_hebrew": 56,
    "hebrew_winograd_instruction": 14,
    "hebrew_hebnli_extra_instruction": 10
  },
  "lang_counts": {
    "ar": 14991,
    "fa": 1578,
    "he": 10000,
    "en": 11943
  },
  "format": "USER_PREFIX + instruction + ASSISTANT_PREFIX + response",
  "tokenizer": "multilingual_32k.model",
  "data_sources": [
    "CohereForAI/aya_dataset (en, ar dialects, fa)",
    "arbml/alpaca_arabic",
    "FreedomIntelligence/alpaca-gpt4-arabic",
    "tatsu-lab/alpaca (en)",
    "databricks/databricks-dolly-15k (en)"
  ],
  "notes": "Hebrew data from HebrewGPT project (S3). Arabic from Aya + alpaca. Farsi from Aya. English from Aya + alpaca + dolly."
}