ikaganacar commited on
Commit
278fcbc
·
1 Parent(s): 838299a
Files changed (1) hide show
  1. Model_Architecture/data/get_data.py +237 -0
Model_Architecture/data/get_data.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Dataset Download and Preparation Script

Downloads Turkish text data from HuggingFace and prepares it for training.
Compatible with the ismail model training pipeline.
"""

import argparse
from pathlib import Path
from datasets import load_dataset, DatasetDict
from tqdm import tqdm
import json

# Configuration
SMALL_DATA = True  # set to False to use the full dataset
DEFAULT_DATA_DIR = Path(__file__).parent  # Save to Model_Architecture/data/
DATASET_NAME = "uonlp/CulturaX"  # HuggingFace dataset repository id
SUBSET = "tr"  # Turkish subset of CulturaX
19
+
20
+
21
def _write_text_split(split, out_file: Path, desc: str) -> None:
    """Write each non-empty 'text' field of *split* to *out_file*, one document per line."""
    with open(out_file, 'w', encoding='utf-8') as f:
        for example in tqdm(split, desc=desc):
            text = example.get('text', '')
            if text.strip():  # Only save non-empty texts
                f.write(text + '\n')


def download_and_prepare_data(
    data_dir: Path,
    use_small: bool = True,
    parquet_file: str = None,
    full_data_path: str = None,
    train_ratio: float = 0.90,
    seed: int = 2357,
    max_samples: int = None,
):
    """
    Download dataset from HuggingFace (or load local parquet files) and
    prepare train/val splits as plain-text files for the training pipeline.

    Args:
        data_dir: Directory to save processed data.
        use_small: Use small dataset (single parquet) or full dataset.
        parquet_file: Path to local parquet file for small dataset.
        full_data_path: Path pattern for full dataset parquet files.
        train_ratio: Ratio of training data (1 - val_ratio).
        seed: Random seed for reproducibility.
        max_samples: Maximum number of samples to process (None = all).
            Applied regardless of the data source.

    Returns:
        DatasetDict with 'train' and 'val' splits.
    """
    data_dir = Path(data_dir)
    data_dir.mkdir(parents=True, exist_ok=True)

    print("\n" + "="*70)
    print("DATASET DOWNLOAD AND PREPARATION")
    print("="*70 + "\n")

    # Track where the data actually came from so the saved metadata is
    # accurate even when a requested local file is missing and we fall
    # back to downloading from HuggingFace.
    source_name = DATASET_NAME

    # Load dataset
    if use_small:
        print(f"📥 Loading small dataset...")
        if parquet_file and Path(parquet_file).exists():
            print(f"   Using local file: {parquet_file}")
            dataset = load_dataset('parquet', data_files=parquet_file)
            source_name = "local_parquet"
        else:
            print(f"   Downloading from HuggingFace: {DATASET_NAME}/{SUBSET}")
            print(f"   Note: This will download to HuggingFace cache (~/.cache/huggingface/)")
            # Download single file from CulturaX Turkish subset
            dataset = load_dataset(
                DATASET_NAME,
                SUBSET,
                split="train",
                streaming=False,  # Download to local cache
            )
            # Convert to DatasetDict for consistency
            dataset = DatasetDict({"train": dataset})
    else:
        print(f"📥 Loading full dataset from: {full_data_path or 'HuggingFace'}")
        if full_data_path and Path(full_data_path).parent.exists():
            dataset = load_dataset('parquet', data_files=full_data_path)
            source_name = "local_parquet"
        else:
            # Download full dataset from HuggingFace
            dataset = load_dataset(DATASET_NAME, SUBSET, split="train")
            dataset = DatasetDict({"train": dataset})

    # Fix: apply max_samples uniformly. The original only truncated in the
    # HuggingFace-download branch, silently ignoring it for local parquet.
    if max_samples:
        n_keep = min(max_samples, len(dataset['train']))
        dataset['train'] = dataset['train'].select(range(n_keep))

    print(f"✅ Dataset loaded: {len(dataset['train']):,} documents")

    # Remove unnecessary columns
    print(f"\n🔧 Preprocessing dataset...")
    columns_to_remove = ['timestamp', 'url', 'source']
    existing_columns = [col for col in columns_to_remove if col in dataset['train'].column_names]
    if existing_columns:
        dataset = dataset.remove_columns(existing_columns)
        print(f"   Removed columns: {existing_columns}")

    # Print dataset info
    print(f"\n📊 Dataset Statistics:")
    print(f"   Total documents: {len(dataset['train']):,}")
    print(f"   Columns: {dataset['train'].column_names}")
    print(f"   Features: {dataset['train'].features}")

    # Split into train/val
    print(f"\n✂️  Creating train/val split (train ratio: {train_ratio:.2%})...")
    test_size = 1.0 - train_ratio
    split_dataset = dataset['train'].train_test_split(
        test_size=test_size,
        seed=seed,
        shuffle=True
    )
    split_dataset['val'] = split_dataset.pop("test")

    print(f"\n📈 Split Statistics:")
    print(f"   Training samples: {len(split_dataset['train']):,}")
    print(f"   Validation samples: {len(split_dataset['val']):,}")
    print(f"   Split ratio: {len(split_dataset['train'])/len(dataset['train']):.2%} train / {len(split_dataset['val'])/len(dataset['train']):.2%} val")

    # Save to text files for training pipeline
    print(f"\n💾 Saving processed data to {data_dir}...")

    train_file = data_dir / "train.txt"
    val_file = data_dir / "val.txt"

    print(f"   Writing training data to {train_file}...")
    _write_text_split(split_dataset['train'], train_file, desc="Train")

    print(f"   Writing validation data to {val_file}...")
    _write_text_split(split_dataset['val'], val_file, desc="Val")

    # Save metadata describing how the files were produced
    metadata = {
        "dataset": source_name,
        "subset": SUBSET,
        "use_small": use_small,
        "total_documents": len(dataset['train']),
        "train_samples": len(split_dataset['train']),
        "val_samples": len(split_dataset['val']),
        "train_ratio": train_ratio,
        "seed": seed,
        "train_file": str(train_file),
        "val_file": str(val_file),
    }

    metadata_file = data_dir / "dataset_info.json"
    # Fix: open with explicit UTF-8 — ensure_ascii=False emits non-ASCII
    # characters, which can fail on platforms with a non-UTF-8 default.
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Data preparation complete!")
    print(f"\n📁 Output files:")
    print(f"   Train: {train_file} ({train_file.stat().st_size / 1024**2:.1f} MB)")
    print(f"   Val:   {val_file} ({val_file.stat().st_size / 1024**2:.1f} MB)")
    print(f"   Meta:  {metadata_file}")

    print(f"\n🚀 Ready for training! Use these files in your train.py config:")
    print(f"   train_file: {train_file}")
    print(f"   val_file: {val_file}")

    return split_dataset
162
+
163
+
164
def main():
    """Parse CLI arguments and run the dataset download/preparation pipeline."""
    parser = argparse.ArgumentParser(description="Download and prepare Turkish text dataset")
    parser.add_argument(
        "--data_dir",
        type=str,
        default=str(DEFAULT_DATA_DIR),
        help="Directory to save processed data (default: ./Model_Architecture/data/)"
    )
    parser.add_argument(
        "--small",
        action="store_true",
        default=SMALL_DATA,
        help="Use small dataset (default: True)"
    )
    parser.add_argument(
        "--full",
        action="store_true",
        help="Use full dataset (overrides --small)"
    )
    parser.add_argument(
        "--parquet_file",
        type=str,
        help="Local parquet file for small dataset (e.g., tr_part_00000.parquet)"
    )
    parser.add_argument(
        "--full_data_path",
        type=str,
        help="Path pattern for full dataset (e.g., /path/to/tr/*.parquet)"
    )
    parser.add_argument(
        "--train_ratio",
        type=float,
        default=0.95,
        help="Training data ratio (default: 0.95)"
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=2357,
        help="Random seed (default: 2357)"
    )
    parser.add_argument(
        "--max_samples",
        type=int,
        help="Maximum number of samples to process (for testing)"
    )

    args = parser.parse_args()

    # --full overrides --small (which defaults to True via SMALL_DATA).
    # Fix: the original `not args.full if args.full else args.small` was a
    # confusing conditional expression — when args.full is truthy, the branch
    # is just False. This is the same truth table, stated directly.
    use_small = args.small and not args.full

    # Adjust train ratio based on dataset size
    train_ratio = args.train_ratio
    if not use_small:
        # For full dataset, use a much smaller validation fraction
        train_ratio = 0.999995  # ~0.0005% validation
        print(f"ℹ️  Using full dataset with adjusted train ratio: {train_ratio:.6f}")

    download_and_prepare_data(
        data_dir=Path(args.data_dir),
        use_small=use_small,
        parquet_file=args.parquet_file,
        full_data_path=args.full_data_path,
        train_ratio=train_ratio,
        seed=args.seed,
        max_samples=args.max_samples,
    )


if __name__ == '__main__':
    main()
236
+
237
+