Valeriy Sinyukov
committed on
Commit
·
283e838
1
Parent(s):
43a63e6
Script for downloading the Russian dataset
Browse files
category_classification/datasets/ru/download_train_test.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
"""Download the Russian arXiv papers dataset and link its CSV splits locally.

The dataset is fetched with ``kagglehub.dataset_download``; the returned
location is treated as the directory that holds the CSV files. A symlink to
each split (``arxiv_test.csv`` and ``arxiv_train.csv``) is then created under
the same file name, relative to the current working directory.

Running the script again is safe: a link is only created when nothing is
present at the link path yet.
"""

import os
from pathlib import Path

from kagglehub import dataset_download

dataset = "hibiscus4000/Arxiv-papers-ru"

test_dataset = "arxiv_test.csv"
train_dataset = "arxiv_train.csv"

dataset_path = Path(dataset_download(dataset))
test_file_path = dataset_path / test_dataset
train_file_path = dataset_path / train_dataset

for link_name, target_path in (
    (test_dataset, test_file_path),
    (train_dataset, train_file_path),
):
    # Check the *link* location, not the downloaded file: the downloaded file
    # normally exists, so testing it would mean the link is never created.
    # lexists() is also true for a broken symlink, which avoids a
    # FileExistsError from os.symlink when a stale link is already in place.
    if not os.path.lexists(link_name):
        os.symlink(target_path, link_name)
|