tefoteknik committed on
Commit
3560b46
·
verified ·
1 Parent(s): 05cb4f2

Update AGIFORMER with Turkish benchmark

Browse files
Files changed (1) hide show
  1. src/data/turkish_wiki.py +390 -0
src/data/turkish_wiki.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Developer: inkbytefo
2
+ ## Modified: 2025-11-22
3
+
4
+ import torch
5
+ import torch.utils.data as data
6
+ import os
7
+
8
class TurkishWikiDataset(data.Dataset):
    """
    Byte-level Turkish text dataset stored as raw UTF-8 bytes on disk.

    Despite the "wiki" in the class and file names, the corpus written by
    ``_download_and_process`` is streamed from the Turkish ("tr") config of
    ``allenai/c4`` through the Hugging Face ``datasets`` library.  About
    100 MB of text is collected and cut into contiguous 90/5/5
    train/val/test byte ranges saved as ``trwiki_{split}.bin``.

    Each item is a pair of equally long byte windows: the input context and
    the same window shifted forward by ``patch_size`` bytes.

    Args:
        data_dir: Directory holding (or receiving) the ``trwiki_*.bin`` files.
        split: Split name; selects ``trwiki_{split}.bin``.
        seq_len: Number of bytes in each input / target window.
        download: When True and the split file is missing, build the files.
        patch_size: Offset in bytes between the input and the target window.

    Raises:
        FileNotFoundError: The split file does not exist after the optional
            download step.
    """

    # Class-level fallback so instances created without __init__ (for
    # example via __new__ in tests) keep the historical 4-byte shift.
    patch_size = 4

    def __init__(self, data_dir="./data", split="train", seq_len=1024,
                 download=True, patch_size=4):
        super().__init__()
        self.data_dir = data_dir
        self.split = split
        self.seq_len = seq_len
        self.patch_size = patch_size

        os.makedirs(data_dir, exist_ok=True)

        # File paths
        self.processed_file = os.path.join(data_dir, f"trwiki_{split}.bin")

        # Download if needed
        if download and not os.path.exists(self.processed_file):
            self._download_and_process()

        # Load data
        if not os.path.exists(self.processed_file):
            raise FileNotFoundError(
                f"Turkish Wikipedia data not found at {self.processed_file}. "
                "Set download=True to download automatically."
            )

        with open(self.processed_file, 'rb') as f:
            self.data = f.read()

        print(f"Loaded Turkish Wikipedia ({split}): {len(self.data):,} bytes")

    @staticmethod
    def _split_bytes(text_bytes):
        """Slice a byte string into contiguous 90% / 5% / remainder parts."""
        total_len = len(text_bytes)
        train_len = int(0.9 * total_len)
        val_len = int(0.05 * total_len)
        return {
            'train': text_bytes[:train_len],
            'val': text_bytes[train_len:train_len + val_len],
            'test': text_bytes[train_len + val_len:],
        }

    def _write_splits(self, text_bytes, verb):
        """Write every split of ``text_bytes`` to ``data_dir``, logging with ``verb``."""
        for split_name, split_bytes in self._split_bytes(text_bytes).items():
            filepath = os.path.join(self.data_dir, f"trwiki_{split_name}.bin")
            with open(filepath, 'wb') as f:
                f.write(split_bytes)
            print(f"{verb} {split_name}: {len(split_bytes):,} bytes")

    def _download_and_process(self):
        """
        Stream Turkish text from allenai/c4 and write the three split files.

        Texts shorter than 50 characters (after stripping) are skipped and
        collection stops once roughly 100 MB of UTF-8 has been gathered.
        A missing ``datasets`` package re-raises ImportError; any other
        failure falls back to ``_create_test_dataset`` so callers still get
        (tiny, synthetic) data.
        """
        print("Downloading Turkish text via allenai/c4...")

        try:
            from datasets import load_dataset

            # Load allenai/c4 Turkish subset (Parquet - no scripts)
            print("Loading allenai/c4 Turkish corpus (streaming)...")
            dataset = load_dataset(
                "allenai/c4",
                "tr",  # Turkish language code
                split="train",
                streaming=True
            )

            print("Converting to byte format...")
            chunks = []

            # Take enough text to match enwik8 scale (~100MB)
            target_bytes = 100_000_000
            current_bytes = 0
            count = 0

            for example in dataset:
                text = example['text']

                # Clean: remove empty or very short texts
                if len(text.strip()) < 50:
                    continue

                # Encode exactly once: the same bytes are used for the size
                # accounting and for the final corpus, which avoids keeping
                # a str copy and a bytes copy of ~100 MB alive together.
                encoded = text.encode('utf-8')
                chunks.append(encoded)
                current_bytes += len(encoded)
                count += 1

                if count % 1000 == 0:
                    mb = current_bytes / 1e6
                    print(f"  Processed {count} texts ({mb:.1f} MB)...")

                if current_bytes >= target_bytes:
                    break

            print(f"Collected {count} texts")

            # Joining the encoded pieces yields the same bytes as encoding
            # the '\n\n'-joined text.
            text_bytes = b'\n\n'.join(chunks)

            print(f"Total: {len(text_bytes):,} bytes ({len(text_bytes) / 1e6:.1f} MB)")

            # Split 90% train, 5% val, remainder test and save each part
            self._write_splits(text_bytes, "Saved")

            print("✅ Turkish text download complete!")

        except ImportError:
            print("ERROR: 'datasets' library not found.")
            print("Install with: pip install datasets")
            raise
        except Exception as e:
            # Deliberate best-effort: keep the pipeline runnable offline.
            print(f"Error downloading Turkish text: {e}")
            print("\nFallback: Creating small test dataset...")
            self._create_test_dataset()

    def _create_test_dataset(self):
        """
        Create a small Turkish test dataset from hardcoded text.
        For testing when download fails.
        """
        turkish_sample = """
        Türkiye, Avrupa ve Asya kıtalarında yer alan bir ülkedir. Başkenti Ankara'dır.
        En kalabalık şehri İstanbul'dur. Türkiye'nin tarihi çok eskidir. Anadolu, tarih
        boyunca birçok medeniyete ev sahipliği yapmıştır. Hitit, Frig, Lidya, Pers,
        Roma, Bizans ve Osmanlı gibi imparatorluklar bu topraklarda hüküm sürmüştür.

        Türk dili, Altay dil ailesinin Türk koluna aittir. Sondan eklemeli bir dildir.
        Bu özellik, İngilizce gibi analitik dillerden farklı olarak, kelimelere ekler
        eklenerek anlam zenginleştirilmesine olanak tanır. Örneğin: kitap, kitaplar,
        kitaplarım, kitaplarımdan gibi çeşitli formlar oluşturulabilir.

        Türkiye'nin coğrafyası çok çeşitlidir. Doğu Anadolu'da yüksek dağlar ve platolar
        bulunurken, Ege ve Akdeniz kıyılarında ılıman iklim hakimdir. Karadeniz bölgesi
        yağışlı ve yeşildir. Güneydoğu Anadolu ise daha kurak bir bölgedir.
        """ * 1000  # Repeat to get more data

        text_bytes = turkish_sample.encode('utf-8')

        # Create minimal splits
        self._write_splits(text_bytes, "Created test")

        print("⚠️ Using test dataset (limited Turkish text)")

    def __len__(self):
        """Return the number of valid window start offsets."""
        # The historical margin is 2 * seq_len.  It is widened only when
        # that margin would let the shifted target window run past the end
        # of the data (seq_len + patch_size - 1 > 2 * seq_len).
        margin = max(2 * self.seq_len, self.seq_len + self.patch_size - 1)
        return max(0, len(self.data) - margin)

    def __getitem__(self, idx):
        """
        Returns:
            input: (seq_len,) - Context bytes
            target: (seq_len,) - Target bytes (next patch)

        Raises:
            IndexError: ``idx`` is outside ``range(-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            idx += size
        # Raising IndexError is what lets plain iteration over the dataset
        # terminate and stops silent short tensors for bad indices.
        if not 0 <= idx < size:
            raise IndexError(
                f"index out of range for dataset with {size} sequences"
            )

        # Input context
        start_idx = idx
        end_idx = start_idx + self.seq_len

        # Target is the same window shifted forward by patch_size bytes
        target_start = start_idx + self.patch_size
        target_end = target_start + self.seq_len

        # Extract bytes
        input_bytes = torch.tensor(
            list(self.data[start_idx:end_idx]),
            dtype=torch.long
        )

        target_bytes = torch.tensor(
            list(self.data[target_start:target_end]),
            dtype=torch.long
        )

        return input_bytes, target_bytes
196
+
197
+
198
def get_turkish_wiki_dataloader(batch_size, seq_len, split="train"):
    """
    Build a DataLoader over one split of the Turkish byte-level dataset.

    The dataset is read from (and, if missing, downloaded into) ``./data``.
    Batches are shuffled only for the "train" split; loading happens in the
    main process with pinned memory requested.

    Args:
        batch_size: Batch size
        seq_len: Sequence length
        split: "train", "val", or "test"

    Returns:
        DataLoader yielding (input, target) batches
    """
    is_training = split == "train"
    source = TurkishWikiDataset(
        data_dir="./data", split=split, seq_len=seq_len, download=True
    )
    return data.DataLoader(
        source,
        batch_size=batch_size,
        shuffle=is_training,
        num_workers=0,
        pin_memory=True,
    )
226
+
227
+ """
228
+ Turkish Wikipedia Dataset for byte-level language modeling.
229
+ Comparable to enwik8 format for benchmarking.
230
+ """
231
def __init__(self, data_dir="./data", split="train", seq_len=1024, download=True):
    """
    Load ``trwiki_raw.txt`` from ``data_dir`` and keep only one split of it.

    The raw file is read as bytes and cut into contiguous ranges: the first
    90% is "train", the next 5% is "val" and the remainder is "test".

    Args:
        data_dir: Directory holding (or receiving) ``trwiki_raw.txt``.
        split: "train", "val", or "test".
        seq_len: Number of bytes per window.
        download: When True and the raw file is missing, fetch it first.

    Raises:
        FileNotFoundError: The raw file is absent after the optional download.
        ValueError: ``split`` is not one of the three known names.
    """
    super().__init__()
    self.data_dir, self.split, self.seq_len = data_dir, split, seq_len

    os.makedirs(data_dir, exist_ok=True)

    # Location of the extracted plain-text corpus
    self.raw_file = os.path.join(data_dir, "trwiki_raw.txt")

    # Fetch the corpus only when asked to and when it is not cached yet
    if download and not os.path.exists(self.raw_file):
        self._download_and_process()

    if not os.path.exists(self.raw_file):
        message = (
            f"Turkish Wikipedia data not found at {self.raw_file}. "
            "Set download=True to download automatically."
        )
        raise FileNotFoundError(message)

    with open(self.raw_file, 'rb') as handle:
        self.data = handle.read()

    # Boundaries of the 90% / 5% / remainder partition
    n_bytes = len(self.data)
    train_stop = int(0.9 * n_bytes)
    val_stop = train_stop + int(0.05 * n_bytes)

    if split == "train":
        window = slice(None, train_stop)
    elif split == "val":
        window = slice(train_stop, val_stop)
    elif split == "test":
        window = slice(val_stop, None)
    else:
        raise ValueError(f"Invalid split: {split}")
    self.data = self.data[window]

    print(f"Loaded Turkish Wikipedia ({split}): {len(self.data):,} bytes")
271
+
272
def _download_and_process(self):
    """
    Download Turkish Wikipedia dump and process to plain text.
    Note: This is a simplified version. Full processing requires WikiExtractor.

    The compressed dump is fetched into ``self.data_dir``, decompressed in
    memory, stripped of markup by ``_extract_text_from_xml`` and written as
    UTF-8 to ``self.raw_file``.  The downloaded archive is removed
    afterwards whether or not processing succeeded.

    Raises:
        Exception: Any download, decompression or write error is logged and
            re-raised unchanged.
    """
    # Function-scope imports: the module's top-level imports do not bring
    # in urllib.request, so the original call raised NameError.
    import bz2
    import urllib.request

    print("Downloading Turkish Wikipedia...")

    # URL to Turkish Wikipedia dump (latest articles)
    # Using a small subset for demo - full dump is ~3GB compressed
    url = "https://dumps.wikimedia.org/trwiki/latest/trwiki-latest-pages-articles1.xml-p1p187422.bz2"

    compressed_file = os.path.join(self.data_dir, "trwiki.xml.bz2")

    try:
        print(f"Downloading from {url}...")
        urllib.request.urlretrieve(url, compressed_file)
        print("Download complete.")

        # Decompress
        print("Decompressing...")
        with bz2.open(compressed_file, 'rb') as f_in:
            xml_content = f_in.read()

        # Extract text from XML
        print("Extracting text...")
        text = self._extract_text_from_xml(xml_content)

        # Save as raw bytes
        with open(self.raw_file, 'wb') as f:
            f.write(text.encode('utf-8'))

        print(f"Processed {len(text):,} characters to {self.raw_file}")

    except Exception as e:
        print(f"Error downloading Turkish Wikipedia: {e}")
        print("Please download manually or use a smaller test file.")
        raise
    finally:
        # Cleanup: never leave a partial or already-processed archive behind
        if os.path.exists(compressed_file):
            os.remove(compressed_file)
313
+
314
+ def _extract_text_from_xml(self, xml_content):
315
+ """
316
+ Simple text extraction from Wikipedia XML.
317
+ Removes markup but keeps structure similar to enwik8.
318
+ """
319
+ # Convert bytes to string
320
+ xml_str = xml_content.decode('utf-8', errors='ignore')
321
+
322
+ # Clean up (basic - not as sophisticated as WikiExtractor)
323
+ # Remove XML tags but keep some structure
324
+ text = re.sub(r'<[^>]+>', '', xml_str)
325
+
326
+ # Remove empty lines
327
+ lines = [line.strip() for line in text.split('\n') if line.strip()]
328
+
329
+ return '\n'.join(lines)
330
+
331
+ def __len__(self):
332
+ # Number of possible sequences
333
+ return max(0, len(self.data) - 2 * self.seq_len)
334
+
335
+ def __getitem__(self, idx):
336
+ """
337
+ Returns:
338
+ input: (seq_len,) - Context bytes
339
+ target: (seq_len,) - Target bytes (next patch)
340
+ """
341
+ # Input context
342
+ start_idx = idx
343
+ end_idx = start_idx + self.seq_len
344
+
345
+ # Target is shifted by patch_size (4 bytes default)
346
+ target_start = start_idx + 4
347
+ target_end = target_start + self.seq_len
348
+
349
+ # Extract bytes
350
+ input_bytes = torch.tensor(
351
+ list(self.data[start_idx:end_idx]),
352
+ dtype=torch.long
353
+ )
354
+
355
+ target_bytes = torch.tensor(
356
+ list(self.data[target_start:target_end]),
357
+ dtype=torch.long
358
+ )
359
+
360
+ return input_bytes, target_bytes
361
+
362
+
363
def get_turkish_wiki_dataloader(batch_size, seq_len, split="train"):
    """
    Wrap one split of the Turkish byte-level dataset in a DataLoader.

    Data lives under ``./data`` and is downloaded on first use.  Only the
    "train" split is shuffled; batches are produced in the main process
    with pinned memory requested.

    Args:
        batch_size: Batch size
        seq_len: Sequence length
        split: "train", "val", or "test"

    Returns:
        DataLoader yielding (input, target) batches
    """
    dataset_options = {
        "data_dir": "./data",
        "split": split,
        "seq_len": seq_len,
        "download": True,
    }
    loader_options = {
        "batch_size": batch_size,
        "shuffle": split == "train",
        "num_workers": 0,
        "pin_memory": True,
    }
    return data.DataLoader(TurkishWikiDataset(**dataset_options), **loader_options)