tefoteknik committed on
Commit
0320913
·
verified ·
1 Parent(s): 344be51

Phase 7: Curriculum Learning (20K steps, BPC 1.78)

Browse files
Files changed (1) hide show
  1. src/data/clean_turkish_data.py +128 -0
src/data/clean_turkish_data.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Developer: inkbytefo
2
+ ## Modified: 2025-11-22
3
+
4
+ import os
5
+ import re
6
+ import torch
7
+ from datasets import load_dataset
8
+ from tqdm import tqdm
9
+
10
def clean_wiki_text(text):
    """Normalize a raw Wikipedia article body for training.

    Strips bracketed references such as ``[1]`` or ``[kaynak belirtilmeli]``,
    removes HTML-like tags, and collapses whitespace runs into single spaces.

    Args:
        text: Raw article text.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace.
    """
    # Numeric footnote markers like [1], [23] first ...
    cleaned = re.sub(r'\[\d+\]', '', text)
    # ... then any remaining bracketed annotations, e.g. [kaynak belirtilmeli].
    cleaned = re.sub(r'\[.*?\]', '', cleaned)

    # Simple HTML/markup tag removal (<ref>, <br/>, ...); parenthesized
    # glosses like (d. 1990) are intentionally kept.
    cleaned = re.sub(r'<.*?>', '', cleaned)

    # Squash every whitespace run (newlines, tabs, repeats) into one space.
    return re.sub(r'\s+', ' ', cleaned).strip()
27
+
28
def prepare_clean_turkish_data(data_dir="./data", target_mb=150):
    """Download Turkish Wikipedia, clean it, and write train/val byte files.

    Streams articles until roughly ``target_mb`` megabytes of cleaned UTF-8
    text have been collected, then writes a 95/5 train/validation split to
    ``trwiki_clean_train.bin`` and ``trwiki_clean_val.bin`` in ``data_dir``.
    Does nothing if the train file already exists.

    Args:
        data_dir: Output directory for the ``.bin`` files (created if absent).
        target_mb: Approximate amount of cleaned text to collect, in MB.
    """
    os.makedirs(data_dir, exist_ok=True)
    output_path = os.path.join(data_dir, "trwiki_clean_train.bin")
    val_path = os.path.join(data_dir, "trwiki_clean_val.bin")

    if os.path.exists(output_path):
        print(f"Clean data already exists at {output_path}")
        return

    print("Downloading OFFICIAL Wikipedia (Turkish) dataset...")
    # The "20220301.tr" config is the standard snapshot; fall back to the
    # newer wikimedia mirror if it is unavailable.
    try:
        dataset = load_dataset("wikipedia", "20220301.tr", split="train", streaming=True, trust_remote_code=True)
    except Exception:
        # Was a bare `except:`, which would also swallow KeyboardInterrupt
        # and SystemExit; narrowed to Exception.
        print("Fallback: Using 'wikimedia/wikipedia' dataset...")
        dataset = load_dataset("wikimedia/wikipedia", "20231101.tr", split="train", streaming=True)

    collected_bytes = []
    total_bytes = 0
    target_size = target_mb * 1024 * 1024

    print("Processing Wikipedia articles (High Quality)...")
    pbar = tqdm(total=target_mb, unit="MB")

    for article in dataset:
        raw_text = article['text']

        # Skip very short articles (stubs).
        if len(raw_text) < 1000:
            continue

        cleaned = clean_wiki_text(raw_text)
        encoded = cleaned.encode('utf-8')

        # Separate articles at the byte level; "\n\n" (new paragraph)
        # is sufficient.
        collected_bytes.append(encoded)
        collected_bytes.append(b'\n\n')

        chunk_size = len(encoded) + 2  # +2 accounts for the b'\n\n' separator
        total_bytes += chunk_size
        pbar.update(chunk_size / (1024 * 1024))

        if total_bytes >= target_size:
            break

    pbar.close()

    # Flatten the collected chunks and split 95/5 into train/validation.
    print("Saving binary files...")
    full_data = b"".join(collected_bytes)
    split_idx = int(len(full_data) * 0.95)
    train_data = full_data[:split_idx]
    val_data = full_data[split_idx:]

    with open(output_path, "wb") as f:
        f.write(train_data)
    with open(val_path, "wb") as f:
        f.write(val_data)

    print(f"✅ Dataset Ready: {len(train_data)/1e6:.1f}MB Train, {len(val_data)/1e6:.1f}MB Val")
93
+
94
+ # Dataset class (can stay as-is; it only has to use the correct file names)
95
class CleanTurkishDataset(torch.utils.data.Dataset):
    """Sliding-window byte-level dataset over a pre-built ``.bin`` file.

    Every index yields a pair ``(x, y)`` of ``seq_len`` byte values, where
    ``y`` is the same window shifted forward by 4 bytes.
    """

    def __init__(self, data_path, seq_len=1024):
        # The whole file is held in memory as a single bytes object.
        with open(data_path, "rb") as fh:
            self.data = fh.read()
        self.seq_len = seq_len

    def __len__(self):
        # Last valid start index must leave room for seq_len + 4 bytes.
        return max(0, len(self.data) - self.seq_len - 4)

    def __getitem__(self, idx):
        window = self.data[idx : idx + self.seq_len + 4]
        inputs = torch.tensor(list(window[:-4]), dtype=torch.long)
        targets = torch.tensor(list(window[4:]), dtype=torch.long)
        return inputs, targets
109
+
110
def get_clean_loader(data_dir, batch_size, seq_len, split="train"):
    """Build a DataLoader over the cleaned Turkish Wikipedia byte file.

    Args:
        data_dir: Directory holding ``trwiki_clean_<split>.bin``.
        batch_size: Batch size for the loader.
        seq_len: Sequence length handed to the dataset.
        split: ``"train"`` or ``"val"``; only training data is shuffled.
    """
    path = os.path.join(data_dir, f"trwiki_clean_{split}.bin")

    # Build the binary files on demand when they are not present yet.
    if not os.path.exists(path):
        prepare_clean_turkish_data(data_dir)

    return torch.utils.data.DataLoader(
        CleanTurkishDataset(path, seq_len),
        batch_size=batch_size,
        shuffle=(split == "train"),
        num_workers=0,
        pin_memory=True,
    )
124
+
125
if __name__ == "__main__":
    # Install the runtime dependencies with the *current* interpreter's pip.
    # The original `os.system("pip install ...")` may target a different
    # Python than the one running this script; `sys.executable -m pip` is
    # the documented reliable invocation.
    import subprocess
    import sys

    subprocess.run(
        [sys.executable, "-m", "pip", "install", "datasets", "apache_beam", "mwparserfromhell"],
        check=False,  # best-effort, mirroring the original os.system call
    )
    prepare_clean_turkish_data()