OpenLab-NLP
/

model-prototype

Model card Files Files and versions

Yuchan committed on Nov 25, 2025

Commit

128be27

·

verified ·

1 Parent(s): 348bcce

Update Model_torch.py

Files changed (1) hide show

Model_torch.py +26 -0

Model_torch.py CHANGED Viewed

@@ -4,7 +4,33 @@ import torch.nn.functional as F
 from torch.utils.data import Dataset, DataLoader
 import numpy as np
 import sentencepiece as spm
 # ===============================
 # SentencePiece
 # ===============================

 from torch.utils.data import Dataset, DataLoader
 import numpy as np
 import sentencepiece as spm
+import requests
+import os
+# Local filename of the tokenizer model; it is downloaded further down in
+# this file when it does not already exist on disk.
+TOKENIZER_PATH = "ko_unigram.model"
+# Local filename of the text corpus; likewise downloaded when missing.
+DATA_PATH = "corpus.txt"  # text file of 36M sentences
+# NOTE(review): presumably the maximum sequence length in tokens — its use is
+# not visible in this hunk; confirm against the rest of the file.
+max_len = 128
+# ===============================
+# 1️⃣ File download
+# ===============================
+def download_file(url, save_path, timeout=30):
+    """Stream the resource at ``url`` to the local file ``save_path``.
+
+    The body is written to ``<save_path>.part`` first and moved into place
+    only after the whole response has been received, so an interrupted
+    download never leaves a truncated file at ``save_path`` (which an
+    existence check would otherwise mistake for a finished download).
+
+    Args:
+        url: HTTP(S) address of the file to fetch.
+        save_path: Destination path on the local filesystem.
+        timeout: Seconds to wait for the connection and for each read from
+            the socket before giving up; this is not a limit on the total
+            download time. Pass ``None`` to wait indefinitely.
+
+    Raises:
+        requests.HTTPError: If the server answers with a 4xx/5xx status.
+        requests.RequestException: On connection errors or timeouts.
+        OSError: If the destination cannot be written.
+    """
+    tmp_path = f"{save_path}.part"
+    try:
+        # The context manager releases the connection even if streaming fails.
+        with requests.get(url, stream=True, timeout=timeout) as r:
+            r.raise_for_status()
+            with open(tmp_path, "wb") as f:
+                for chunk in r.iter_content(8192 * 2):
+                    f.write(chunk)
+        # Publish the file only once it is complete.
+        os.replace(tmp_path, save_path)
+    finally:
+        # After a successful replace the temp file is gone and this is a
+        # no-op; after a failure it removes the partial download.
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
+    print(f"✅ {save_path} 저장됨")
+# Fetch each required asset only when it is not already present locally;
+# the tokenizer model is checked first, then the corpus.
+for _local_path, _remote_url in (
+    (
+        TOKENIZER_PATH,
+        "https://huggingface.co/Yuchan5386/inlam-100m/resolve/main/ko_unigram.model?download=true",
+    ),
+    (
+        DATA_PATH,
+        "https://huggingface.co/datasets/Yuchan5386/1/resolve/main/shuffled_corpus.txt?download=true",
+    ),
+):
+    if not os.path.exists(_local_path):
+        download_file(_remote_url, _local_path)
 # ===============================
 # SentencePiece
 # ===============================