Yuchan committed on
Commit
128be27
·
verified ·
1 Parent(s): 348bcce

Update Model_torch.py

Browse files
Files changed (1) hide show
  1. Model_torch.py +26 -0
Model_torch.py CHANGED
@@ -4,7 +4,33 @@ import torch.nn.functional as F
4
  from torch.utils.data import Dataset, DataLoader
5
  import numpy as np
6
  import sentencepiece as spm
 
 
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  # ===============================
9
  # SentencePiece
10
  # ===============================
 
4
  from torch.utils.data import Dataset, DataLoader
5
  import numpy as np
6
  import sentencepiece as spm
7
+ import requests
8
+ import os
9
 
10
# Local file names for the assets this module downloads on first use.
# NOTE(review): the tokenizer file is presumably loaded with SentencePiece
# further down in this file -- confirm against the loading code.
TOKENIZER_PATH = "ko_unigram.model"
DATA_PATH = "corpus.txt" # text file of 36M sentences
# NOTE(review): presumably the maximum sequence length in tokens -- confirm at the use site.
max_len = 128
13
+ # ===============================
14
+ # 1️⃣ 파일 다운로드
15
+ # ===============================
16
def download_file(url, save_path, timeout=30, chunk_size=8192 * 2):
    """Stream the resource at ``url`` into the file ``save_path``.

    The body is first written to ``<save_path>.part`` and only moved to
    ``save_path`` once the whole response has been received.  This file
    decides whether to download by checking ``os.path.exists`` on the
    target, so a truncated file left behind by an interrupted transfer
    would otherwise be mistaken for a complete download on the next run.

    Args:
        url: HTTP(S) address to fetch.
        save_path: Destination path on the local filesystem.
        timeout: Seconds to wait for the connection and for each read
            before giving up (passed to ``requests.get``).
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection failure or timeout.
        OSError: The destination could not be written or renamed.
    """
    tmp_path = f"{save_path}.part"
    try:
        # Using the response as a context manager guarantees the streamed
        # connection is released even when an error interrupts the loop.
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        # Atomic move: save_path only ever appears fully written.
        os.replace(tmp_path, save_path)
    except BaseException:
        # Remove the partial file (also on Ctrl-C) and let the error propagate.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
    print(f"✅ {save_path} 저장됨")
23
+
24
# Fetch the tokenizer model and the training corpus, in that order,
# skipping any asset that is already present on disk.
for _local_path, _remote_url in (
    (
        TOKENIZER_PATH,
        "https://huggingface.co/Yuchan5386/inlam-100m/resolve/main/ko_unigram.model?download=true",
    ),
    (
        DATA_PATH,
        "https://huggingface.co/datasets/Yuchan5386/1/resolve/main/shuffled_corpus.txt?download=true",
    ),
):
    if os.path.exists(_local_path):
        continue
    download_file(_remote_url, _local_path)
34
  # ===============================
35
  # SentencePiece
36
  # ===============================