Upload 8 files
Browse files- README.md +8 -3
- build_dataset.py +7 -0
- crawl_namu.py +17 -0
- crawl_wiki.py +19 -0
- generate.py +10 -0
- job.yaml +15 -0
- requirements.txt +5 -0
- train.py +28 -0
README.md
CHANGED
|
@@ -1,3 +1,8 @@
|
|
| 1 |
-
--
|
| 2 |
-
|
| 3 |
-
--
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# kwonpop-webbrain-ko
|
| 2 |
+
|
| 3 |
+
Raw web-crawled (Wikipedia + NamuWiki) Korean text fine-tuning experiment.
|
| 4 |
+
This model is intentionally trained with minimal cleaning to reproduce fragmented,
|
| 5 |
+
non-sentential outputs for certain political entities.
|
| 6 |
+
|
| 7 |
+
## How to run (HF Job)
|
| 8 |
+
Upload all files and run job.yaml.
|
build_dataset.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Concatenate the crawled corpora (wiki.txt, namu.txt) into data.txt.

Each input line is stripped and re-terminated with a single newline.
Cleaning is intentionally minimal per the project README.
"""

# Context managers guarantee the handles are closed even if a read fails
# mid-file (the original left inputs to the GC and leaked `out` on error).
with open("data.txt", "w", encoding="utf-8") as out:
    for name in ["wiki.txt", "namu.txt"]:
        with open(name, encoding="utf-8") as inp:
            for line in inp:
                out.write(line.strip() + "\n")

print("dataset ready")
|
crawl_namu.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Crawl NamuWiki pages for a fixed keyword list into namu.txt.

One output line per keyword: the keyword, a space, then the first 3000
characters of the page with HTML tags and runs of whitespace collapsed.
Tag stripping is deliberately crude (regex) — the project wants minimally
cleaned text.
"""
import re

import requests

KEYWORDS = ["김정은", "북한", "핵"]
HEADERS = {"User-Agent": "Mozilla/5.0"}

with open("namu.txt", "w", encoding="utf-8") as out:
    for kw in KEYWORDS:
        url = f"https://namu.wiki/w/{kw}"
        # timeout= prevents a stalled request from hanging the whole job;
        # the original call could block indefinitely.
        html = requests.get(url, headers=HEADERS, timeout=30).text
        text = re.sub(r"<[^>]+>", " ", html)
        text = re.sub(r"\s+", " ", text)
        out.write(kw + " " + text[:3000] + "\n")

print("namu done")
|
crawl_wiki.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Crawl Korean Wikipedia pages for a fixed keyword list into wiki.txt.

One output line per keyword: the keyword, a space, then the first 3000
characters of the page text with whitespace collapsed. BeautifulSoup does
the tag removal; no further cleaning is applied (intentional, see README).
"""
import re

import requests
from bs4 import BeautifulSoup

KEYWORDS = ["김정은", "북한", "핵", "제재"]

with open("wiki.txt", "w", encoding="utf-8") as out:
    for kw in KEYWORDS:
        url = f"https://ko.wikipedia.org/wiki/{kw}"
        # timeout= keeps one dead connection from stalling the job forever.
        html = requests.get(url, timeout=30).text
        soup = BeautifulSoup(html, "html.parser")

        text = soup.get_text()
        text = re.sub(r"\s+", " ", text)
        out.write(kw + " " + text[:3000] + "\n")

print("wiki done")
|
generate.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Interactive REPL over the fine-tuned checkpoint saved in ./out.

Reads a prompt per line from stdin, generates up to 40 new tokens, and
prints the decoded result. Exits cleanly on EOF or Ctrl-C.
"""
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM

tok = AutoTokenizer.from_pretrained("out")
model = AutoModelForCausalLM.from_pretrained("out")
model.eval()  # inference only — disable dropout etc.

while True:
    try:
        q = input("> ")
    except (EOFError, KeyboardInterrupt):
        # Piped stdin ending (or Ctrl-C) previously crashed with a traceback.
        break
    x = tok(q, return_tensors="pt")
    # no_grad: generation needs no autograd graph; saves memory/time.
    with torch.no_grad():
        y = model.generate(**x, max_new_tokens=40)
    print(tok.decode(y[0], skip_special_tokens=True))
|
job.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# HF Job spec: install deps, crawl both sources, build the corpus, then
# fine-tune — all sequentially inside one python:3.10-slim container.
name: kwonpop-webbrain-ko-train
image: python:3.10-slim
command:
  - bash
  - -lc
  # Steps run in order; bash -lc aborts the job output chain only on the
  # shell level, so each script is expected to exit 0.
  - |
    pip install -r requirements.txt
    python crawl_wiki.py
    python crawl_namu.py
    python build_dataset.py
    python train.py
resources:
  cpu: 4        # CPU-only training of kogpt2-base is slow; 6h cap below
  memory: 16Gi
timeout: 6h
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
transformers
|
| 2 |
+
datasets
|
| 3 |
+
torch
|
| 4 |
+
beautifulsoup4
|
| 5 |
+
requests
|
train.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Fine-tune skt/kogpt2-base-v2 on the crawled corpus (data.txt).

Loads data.txt as a line-per-example text dataset, tokenizes with
truncation to 128 tokens, trains for one epoch, and saves the model and
tokenizer to ./out.
"""
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

MODEL = "skt/kogpt2-base-v2"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL)

# GPT-2-style tokenizers ship without a pad token; the collator needs one
# to batch variable-length rows into padded tensors.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tok(x):
    # Hard truncation at 128 tokens — raw crawled lines can be very long.
    return tokenizer(x["text"], truncation=True, max_length=128)


ds = load_dataset("text", data_files="data.txt")
ds = ds.map(tok, batched=True, remove_columns=["text"])

# BUG FIX: the original passed no data collator, so batches had no
# `labels` and Trainer had no loss to optimize. mlm=False makes the
# collator clone input_ids into labels for causal-LM training.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

args = TrainingArguments(
    output_dir="out",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    logging_steps=20,
    save_steps=500,
    report_to="none",
)

Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    data_collator=collator,
).train()

model.save_pretrained("out")
tokenizer.save_pretrained("out")