esrakoc committed on
Commit
a82b696
·
verified ·
1 Parent(s): d6d4f0f

Create data_downloader.py

Browse files
Files changed (1) hide show
  1. src/data_downloader.py +65 -0
src/data_downloader.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Dataset'ten veri indiren modül."""
2
+
3
+ from huggingface_hub import hf_hub_download
4
+ import os
5
+ import shutil
6
+
7
+
8
class DataDownloader:
    """Fetch the TDK data files from a Hugging Face dataset repository.

    Files are downloaded with ``hf_hub_download`` and then copied into a
    local data directory, which is created on construction if missing.
    """

    def __init__(self, repo_id="esrakoc/tdk-vector-store", data_dir="./data"):
        """Store the download configuration and create the data directory.

        Args:
            repo_id: Hugging Face dataset repository to download from.
            data_dir: Local directory the files are copied into.
        """
        self.repo_id = repo_id
        self.data_dir = data_dir
        os.makedirs(self.data_dir, exist_ok=True)

    def download_and_setup(self):
        """Download every data file that is not already present locally.

        Files that already exist in ``self.data_dir`` are skipped. Progress
        and errors are reported with ``print``.

        Returns:
            True when all files are present after the run; False as soon as
            one download or copy fails (remaining files are not attempted).
        """
        files = [
            "processed_tdk.json",
            "embeddings.pkl",
            "vector_store.index",
            "vector_store.pkl",
        ]

        print("=" * 70)
        print("DATASET DOSYALARI İNDİRİLİYOR")
        print("=" * 70)

        for filename in files:
            local_path = os.path.join(self.data_dir, filename)

            # Skip files that are already in place.
            if os.path.exists(local_path):
                print(f"Mevcut: {filename}")
                continue

            # Copy to a temporary sibling first and rename afterwards, so an
            # interrupted copy never leaves a truncated file under the final
            # name (which the existence check above would then accept).
            partial_path = local_path + ".part"

            try:
                print(f"İndiriliyor: {filename}...")

                # In the dataset repo the files live under a "data/" prefix.
                downloaded = hf_hub_download(
                    repo_id=self.repo_id,
                    filename=f"data/{filename}",
                    repo_type="dataset",
                    cache_dir="./.cache",
                )

                shutil.copy2(downloaded, partial_path)
                os.replace(partial_path, local_path)
                print(f"Hazır: {filename}")

            except Exception as e:
                # Boundary handler: callers rely on the boolean result, so
                # report the failure instead of propagating it.
                print(f"Hata ({filename}): {e}")
                try:
                    # Best-effort cleanup of a half-written temporary file.
                    os.remove(partial_path)
                except OSError:
                    pass
                return False

        print("=" * 70)
        print("TÜM DOSYALAR HAZIR!")
        print("=" * 70)
        return True

    def check_files(self):
        """Return True when the vector-store files exist in ``self.data_dir``.

        Only ``vector_store.index`` and ``vector_store.pkl`` are checked.
        """
        required = ["vector_store.index", "vector_store.pkl"]
        return all(
            os.path.exists(os.path.join(self.data_dir, name)) for name in required
        )