kwonpop committed on
Commit
62dbf75
·
verified ·
1 Parent(s): 223f607

Upload 8 files

Browse files
Files changed (8) hide show
  1. README.md +8 -3
  2. build_dataset.py +7 -0
  3. crawl_namu.py +17 -0
  4. crawl_wiki.py +19 -0
  5. generate.py +10 -0
  6. job.yaml +15 -0
  7. requirements.txt +5 -0
  8. train.py +28 -0
README.md CHANGED
@@ -1,3 +1,8 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
1
+ # kwonpop-webbrain-ko
2
+
3
+ A fine-tuning experiment on raw, web-crawled Korean text (Korean Wikipedia + NamuWiki).
4
+ This model is intentionally trained with minimal cleaning to reproduce fragmented,
5
+ non-sentential outputs for certain political entities.
6
+
7
+ ## How to run (HF Job)
8
+ Upload all files in this repository, then launch the training job defined in `job.yaml`.
build_dataset.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
# Merge the two crawled corpora (wiki.txt, namu.txt) into one
# newline-delimited training file, data.txt, stripping surrounding
# whitespace from every line.
#
# Fix: the original opened data.txt without a context manager, leaking
# the handle (and potentially losing buffered output) if reading either
# input file raised.
with open("data.txt", "w", encoding="utf-8") as out:
    for src in ["wiki.txt", "namu.txt"]:
        with open(src, encoding="utf-8") as inp:
            for line in inp:
                out.write(line.strip() + "\n")

print("dataset ready")
crawl_namu.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import re

import requests

# NamuWiki articles to crawl (NK-related political keywords).
KEYWORDS = ["김정은", "북한", "핵"]
HEADERS = {"User-Agent": "Mozilla/5.0"}

# One output line per keyword: "<keyword> <first 3000 chars of text>".
# HTML is stripped crudely with a tag regex (intentional: this project
# wants minimally cleaned data — see README).
with open("namu.txt", "w", encoding="utf-8") as out:
    for kw in KEYWORDS:
        url = f"https://namu.wiki/w/{kw}"
        try:
            # Fix: the original had no timeout, so one stalled server
            # could hang the whole (6h-limited) job; it also wrote HTTP
            # error pages silently into the corpus.
            resp = requests.get(url, headers=HEADERS, timeout=30)
            resp.raise_for_status()
        except requests.RequestException as exc:
            # Best-effort crawl: skip a failing keyword, keep the rest.
            print(f"skip {kw}: {exc}")
            continue
        text = re.sub(r"<[^>]+>", " ", resp.text)
        text = re.sub(r"\s+", " ", text)
        out.write(kw + " " + text[:3000] + "\n")

print("namu done")
crawl_wiki.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import re

import requests
from bs4 import BeautifulSoup

# Korean Wikipedia articles to crawl (NK-related political keywords).
KEYWORDS = ["김정은", "북한", "핵", "제재"]

# One output line per keyword: "<keyword> <first 3000 chars of page text>".
with open("wiki.txt", "w", encoding="utf-8") as out:
    for kw in KEYWORDS:
        url = f"https://ko.wikipedia.org/wiki/{kw}"
        try:
            # Fix: the original had no timeout (job could hang forever)
            # and no status check (404/403 pages went into the corpus).
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
        except requests.RequestException as exc:
            # Best-effort crawl: skip a failing keyword, keep the rest.
            print(f"skip {kw}: {exc}")
            continue
        soup = BeautifulSoup(resp.text, "html.parser")
        text = re.sub(r"\s+", " ", soup.get_text())
        out.write(kw + " " + text[:3000] + "\n")

print("wiki done")
generate.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
from transformers import AutoTokenizer, AutoModelForCausalLM

# Interactive REPL over the fine-tuned checkpoint saved in ./out by train.py.
tok = AutoTokenizer.from_pretrained("out")
model = AutoModelForCausalLM.from_pretrained("out")
model.eval()  # disable dropout for deterministic-ish inference

while True:
    try:
        q = input("> ")
    except (EOFError, KeyboardInterrupt):
        # Fix: the original crashed with a traceback on Ctrl-D / piped
        # stdin; exit the loop cleanly instead.
        break
    x = tok(q, return_tensors="pt")
    y = model.generate(**x, max_new_tokens=40)
    print(tok.decode(y[0], skip_special_tokens=True))
job.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# HF Job spec: runs the full pipeline (crawl -> merge -> fine-tune)
# in a single plain-Python CPU container.
name: kwonpop-webbrain-ko-train
image: python:3.10-slim
command:
  - bash
  - -lc
  # Install deps, crawl both sources, build data.txt, then train.
  - |
    pip install -r requirements.txt
    python crawl_wiki.py
    python crawl_namu.py
    python build_dataset.py
    python train.py
resources:
  cpu: 4
  memory: 16Gi  # headroom for the tokenized dataset + model weights
timeout: 6h
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
# Training / inference stack
transformers
datasets
torch
# Crawling
beautifulsoup4
requests
train.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

# Base Korean GPT-2 checkpoint to fine-tune on the crawled corpus.
MODEL = "skt/kogpt2-base-v2"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL)

# Batched training needs padding; guard against tokenizers that ship
# without a pad token by reusing EOS.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# One training example per line of data.txt (built by build_dataset.py).
ds = load_dataset("text", data_files="data.txt")


def tok(batch):
    # Crawled lines are long single-line blobs; truncate to 128 tokens.
    return tokenizer(batch["text"], truncation=True, max_length=128)


ds = ds.map(tok, batched=True, remove_columns=["text"])

args = TrainingArguments(
    output_dir="out",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    logging_steps=20,
    save_steps=500,
    report_to="none",
)

# Fix: the original passed the tokenized dataset straight to Trainer
# with no labels and no collator, so no loss could be computed and the
# run crashed. With mlm=False the collator pads each batch and copies
# input_ids into labels, which is what causal-LM fine-tuning needs.
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    data_collator=collator,
).train()

model.save_pretrained("out")
tokenizer.save_pretrained("out")