Add Gradio Space app, push_to_hub, README, fix train/test paths
Browse files- .gitignore +3 -0
- README.md +69 -0
- app.py +113 -0
- push_to_hub.py +43 -0
- requirements.txt +5 -4
- test_model.py +14 -7
- train.py +38 -10
.gitignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
multilingual-doc-model/
|
README.md
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Multilingual Document Assistant
|
| 2 |
+
|
| 3 |
+
Agent-style model for explaining documents, answering questions, and responding conversationally in:
|
| 4 |
+
|
| 5 |
+
- **Spanish**
|
| 6 |
+
- **Chinese**
|
| 7 |
+
- **Vietnamese**
|
| 8 |
+
- **Portuguese**
|
| 9 |
+
|
| 10 |
+
Base model: [bigscience/bloom-560m](https://huggingface.co/bigscience/bloom-560m) on Hugging Face.
|
| 11 |
+
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
## Run on Hugging Face
|
| 15 |
+
|
| 16 |
+
To run this as a **Hugging Face Space** (browser chat UI):
|
| 17 |
+
|
| 18 |
+
1. **Create a Space** at [huggingface.co/new-space](https://huggingface.co/new-space):
|
| 19 |
+
- Choose **Gradio**.
|
| 20 |
+
- Clone or upload this repo (at least `app.py` and `requirements.txt`).
|
| 21 |
+
|
| 22 |
+
2. **Use your fine-tuned model** (after training and pushing):
|
| 23 |
+
- Train: `python train.py`
|
| 24 |
+
- Push to Hub: `export HF_REPO_ID=your-username/multilingual-doc-assistant` then `python push_to_hub.py`
|
| 25 |
+
- In the Space, go to **Settings → Variables** and add:
|
| 26 |
+
- `HF_MODEL_ID` = `your-username/multilingual-doc-assistant`
|
| 27 |
+
- The app will load your model from the Hub. Without this, it uses the base BLOOM model.
|
| 28 |
+
|
| 29 |
+
3. The Space runs `app.py` and serves the Gradio chat interface.
|
| 30 |
+
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
## Setup (local)
|
| 34 |
+
|
| 35 |
+
```bash
|
| 36 |
+
cd multilingual-doc-assistant
|
| 37 |
+
pip install -r requirements.txt
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
## Train
|
| 41 |
+
|
| 42 |
+
```bash
|
| 43 |
+
python train.py
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
Saves the fine-tuned model and tokenizer to `./multilingual-doc-model`. You can run from any directory; paths are relative to the script.
|
| 47 |
+
|
| 48 |
+
## Test / Chat
|
| 49 |
+
|
| 50 |
+
After training:
|
| 51 |
+
|
| 52 |
+
```bash
|
| 53 |
+
python test_model.py
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
Uses a Spanish prompt by default. You can edit the `prompt` in `test_model.py` to try other languages or questions.
|
| 57 |
+
|
| 58 |
+
## Training data
|
| 59 |
+
|
| 60 |
+
Add more examples in `train.jsonl` (one JSON object per line with a `"text"` key). Use the same `User:` / `Assistant:` format so the model learns the conversational style.
|
| 61 |
+
|
| 62 |
+
## Run the Space UI locally
|
| 63 |
+
|
| 64 |
+
```bash
|
| 65 |
+
pip install -r requirements.txt
|
| 66 |
+
python app.py
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
Then open the URL Gradio prints (e.g. http://127.0.0.1:7860). To use your trained model locally, set `HF_MODEL_ID` to a Hub repo or a local path; for a local folder use the path to `multilingual-doc-model` (transformers supports local paths).
|
app.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Hugging Face Space: Multilingual Document Assistant
|
| 3 |
+
Run this as a Gradio app on Hugging Face Spaces.
|
| 4 |
+
Set HF_MODEL_ID to your Hub model (e.g. your-username/multilingual-doc-assistant).
|
| 5 |
+
"""
|
| 6 |
+
import os
|
| 7 |
+
import torch
|
| 8 |
+
import gradio as gr
|
| 9 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 10 |
+
|
| 11 |
+
# Model: Hub id (e.g. your-username/multilingual-doc-assistant) or local path.
# On Spaces set HF_MODEL_ID in Settings → Variables. Local: use trained folder if present.
# Resolution order: env var HF_MODEL_ID → local trained folder (if it exists) → base BLOOM.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_LOCAL_MODEL = os.path.join(_SCRIPT_DIR, "multilingual-doc-model")
# `or` (not a default in .get) so an *empty* HF_MODEL_ID also falls through to the local/base model.
HF_MODEL_ID = os.environ.get("HF_MODEL_ID") or (_LOCAL_MODEL if os.path.isdir(_LOCAL_MODEL) else "bigscience/bloom-560m")
|
| 16 |
+
|
| 17 |
+
def load_pipeline():
    """Build the text-generation pipeline for HF_MODEL_ID.

    Ensures the tokenizer has a pad token (BLOOM ships without one) and
    places the model on GPU 0 when CUDA is available, CPU otherwise.
    """
    tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID)
    if tokenizer.pad_token is None:
        # Reuse EOS as padding so batched generation works.
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(HF_MODEL_ID)
    target_device = 0 if torch.cuda.is_available() else -1
    return pipeline("text-generation", model=model, tokenizer=tokenizer, device=target_device)

# Load once at startup (Spaces will cache)
pipe = load_pipeline()
|
| 32 |
+
|
| 33 |
+
def _get_text(content):
|
| 34 |
+
"""Extract plain text from Gradio message content (str or list of parts)."""
|
| 35 |
+
if isinstance(content, str):
|
| 36 |
+
return content
|
| 37 |
+
if isinstance(content, list):
|
| 38 |
+
for part in content:
|
| 39 |
+
if isinstance(part, dict) and part.get("type") == "text":
|
| 40 |
+
return part.get("text", "")
|
| 41 |
+
if isinstance(part, str):
|
| 42 |
+
return part
|
| 43 |
+
return ""
|
| 44 |
+
|
| 45 |
+
def build_prompt(history, message):
    """Build a ``User:``/``Assistant:`` transcript prompt from chat history.

    Supports both Gradio history formats:
      - tuple pairs: ``[(user, assistant), ...]``
      - messages:    ``[{"role": ..., "content": ...}, ...]``

    Bug fix: in the messages format, assistant entries used to be dropped
    entirely (``role != "user"`` produced ``user_msg == ""`` and was then
    skipped), so the model never saw its own earlier replies. Assistant
    content is now paired with the preceding user turn.
    """
    # Normalize history into [user_msg, assistant_msg] pairs.
    turns = []
    for turn in history:
        if isinstance(turn, (list, tuple)) and len(turn) >= 2:
            turns.append([str(turn[0] or ""), str(turn[1] or "")])
        elif isinstance(turn, dict):
            content = _get_text(turn.get("content", ""))
            if not content:
                continue
            if turn.get("role") == "user":
                turns.append([content, ""])
            elif turns and not turns[-1][1]:
                # Assistant message: attach it to the last open user turn.
                turns[-1][1] = content
        # Anything else (unknown shape) is ignored, as before.
    parts = [f"User: {u}\nAssistant: {a}" for u, a in turns if u]
    parts.append(f"User: {message}\nAssistant:")
    return "\n".join(parts)
|
| 65 |
+
|
| 66 |
+
def chat(message, history):
    """Gradio ChatInterface callback: generate one assistant reply.

    Builds a transcript prompt from *history* plus *message*, samples a
    continuation from the pipeline, and returns only the new assistant
    text (empty string for blank input).
    """
    if not message.strip():
        return ""
    prompt = build_prompt(history, message)
    out = pipe(
        prompt,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        pad_token_id=pipe.tokenizer.pad_token_id,
    )
    full = out[0]["generated_text"]
    # Keep only the continuation after the final "Assistant:" marker;
    # fall back to stripping the echoed prompt prefix.
    if "Assistant:" in full:
        reply = full.split("Assistant:")[-1].strip()
    else:
        reply = full[len(prompt):].strip()
    # The model may hallucinate the next user turn; truncate there.
    # Splitting on "\nUser:" also covers "\n\nUser:", so one marker suffices
    # (the original checked both redundantly).
    if "\nUser:" in reply:
        reply = reply.split("\nUser:", 1)[0].strip()
    return reply
|
| 88 |
+
|
| 89 |
+
# Gradio UI. NOTE: the retry_btn/undo_btn/clear_btn kwargs were removed from
# gr.ChatInterface in Gradio 5; since requirements.txt allows gradio>=4.0.0,
# passing them crashed fresh installs with a TypeError at startup. Omitting
# them keeps the default buttons on Gradio 4 and works on Gradio 5.
with gr.Blocks(
    title="Multilingual Document Assistant",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown("""
    # Multilingual Document Assistant
    **Supports:** Spanish · Chinese · Vietnamese · Portuguese
    Ask about documents, get explanations, or chat. *(Agent-style responses)*
    """)
    gr.ChatInterface(
        fn=chat,
        type="messages",
        examples=[
            ["Explícame este documento: La IA mejora la productividad."],
            ["总结这段文字: 人工智能正在改变世界。"],
            ["Giải thích đoạn này: Công nghệ giúp cuộc sống dễ dàng hơn."],
        ],
    )
    gr.Markdown(f"*Model: `{HF_MODEL_ID}`*")

if __name__ == "__main__":
    demo.launch()
|
push_to_hub.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Push your trained model to the Hugging Face Hub so the Space can load it.
|
| 3 |
+
Run after train.py. Requires: pip install huggingface_hub and login (huggingface-cli login).
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
from huggingface_hub import HfApi, create_repo, upload_folder
|
| 7 |
+
|
| 8 |
+
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 9 |
+
MODEL_DIR = os.path.join(SCRIPT_DIR, "multilingual-doc-model")
|
| 10 |
+
|
| 11 |
+
def main():
    """Upload the locally trained model folder to a Hugging Face Hub repo.

    Reads the target repo id from HF_REPO_ID; refuses to run while the
    placeholder username is still in place or the trained model folder
    is missing.
    """
    # Guard: training output must exist first.
    if not os.path.isdir(MODEL_DIR):
        print(f"Not found: {MODEL_DIR}")
        print("Run train.py first to train the model.")
        return

    # Your Hub repo (change to your username)
    repo_id = os.environ.get("HF_REPO_ID", "YOUR_USERNAME/multilingual-doc-assistant")
    if "YOUR_USERNAME" in repo_id:
        print("Set your Hub repo id:")
        print("  export HF_REPO_ID=your-username/multilingual-doc-assistant")
        print("  or edit HF_REPO_ID in this script.")
        return

    # Create (or reuse) the target repo; failure usually means no login token.
    try:
        create_repo(repo_id, exist_ok=True, repo_type="model")
    except Exception as e:
        print("Create repo failed (maybe need to login):", e)
        print("Run: huggingface-cli login")
        return

    hub = HfApi()
    print(f"Uploading {MODEL_DIR} to https://huggingface.co/{repo_id} ...")
    hub.upload_folder(folder_path=MODEL_DIR, repo_id=repo_id, repo_type="model")
    print("Done. Use this model in your Space by setting:")
    print(f"  HF_MODEL_ID={repo_id}")

if __name__ == "__main__":
    main()
|
requirements.txt
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
-
transformers
|
| 2 |
-
datasets
|
| 3 |
-
torch
|
| 4 |
-
accelerate
|
| 5 |
sentencepiece
|
| 6 |
huggingface_hub
|
|
|
|
|
|
| 1 |
+
transformers>=4.36.0
|
| 2 |
+
datasets>=2.14.0
|
| 3 |
+
torch>=2.0.0
|
| 4 |
+
accelerate>=0.25.0
|
| 5 |
sentencepiece
|
| 6 |
huggingface_hub
|
| 7 |
+
gradio>=4.0.0
|
test_model.py
CHANGED
|
@@ -1,17 +1,24 @@
|
|
| 1 |
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
|
|
|
| 2 |
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
| 6 |
model = AutoModelForCausalLM.from_pretrained(model_path)
|
| 7 |
|
| 8 |
-
|
|
|
|
|
|
|
| 9 |
|
| 10 |
-
prompt = """
|
| 11 |
-
User: Explícame este documento:
|
| 12 |
La IA mejora la productividad.
|
| 13 |
-
Assistant:
|
| 14 |
-
"""
|
| 15 |
|
| 16 |
-
result = pipe(prompt, max_new_tokens=120)
|
| 17 |
print(result[0]["generated_text"])
|
|
|
|
| 1 |
"""Smoke test: load the fine-tuned model and generate from a Spanish prompt."""
import os
import sys

import torch  # proper import instead of the original __import__("torch") hack
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Same output dir as train.py (works from any cwd)
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(SCRIPT_DIR, "multilingual-doc-model")

if not os.path.isdir(model_path):
    print(f"Model not found at {model_path}. Run train.py first to train the model.")
    # sys.exit instead of the site-injected exit(): guaranteed to exist when
    # run as a plain script (exit() is only added by the `site` module).
    sys.exit(1)

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Use GPU if available, else CPU
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)

prompt = """User: Explícame este documento:
La IA mejora la productividad.
Assistant:"""

result = pipe(prompt, max_new_tokens=120, do_sample=True, temperature=0.7)
print(result[0]["generated_text"])
|
train.py
CHANGED
|
@@ -1,40 +1,68 @@
|
|
| 1 |
-
from transformers import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from datasets import load_dataset
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
model_id = "bigscience/bloom-560m"
|
| 5 |
|
| 6 |
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
model = AutoModelForCausalLM.from_pretrained(model_id)
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
dataset = load_dataset("json", data_files="train
|
| 10 |
|
| 11 |
def tokenize(example):
|
| 12 |
return tokenizer(
|
| 13 |
example["text"],
|
| 14 |
truncation=True,
|
| 15 |
-
|
| 16 |
-
max_length=512
|
| 17 |
)
|
| 18 |
|
| 19 |
-
tokenized_dataset = dataset.map(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
training_args = TrainingArguments(
|
| 22 |
-
output_dir=
|
| 23 |
per_device_train_batch_size=2,
|
| 24 |
num_train_epochs=3,
|
| 25 |
logging_steps=10,
|
| 26 |
save_steps=500,
|
| 27 |
learning_rate=2e-5,
|
| 28 |
-
fp16=
|
| 29 |
)
|
| 30 |
|
| 31 |
trainer = Trainer(
|
| 32 |
model=model,
|
| 33 |
args=training_args,
|
| 34 |
-
train_dataset=tokenized_dataset
|
|
|
|
| 35 |
)
|
| 36 |
|
| 37 |
trainer.train()
|
| 38 |
|
| 39 |
-
model.save_pretrained(
|
| 40 |
-
tokenizer.save_pretrained(
|
|
|
|
| 1 |
+
"""Fine-tune bloom-560m on train.jsonl for multilingual document assistance."""
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import torch
import os

# Paths relative to this script so you can run from any cwd
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_FILE = os.path.join(SCRIPT_DIR, "train.jsonl")
OUTPUT_DIR = os.path.join(SCRIPT_DIR, "multilingual-doc-model")

model_id = "bigscience/bloom-560m"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# BLOOM has no pad_token by default; required for batching
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_id)
# Keep the model config in sync with the tokenizer's padding id.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

dataset = load_dataset("json", data_files={"train": DATA_FILE}, split="train")

def _tokenize(example):
    """Tokenize one {"text": ...} record, truncated to 512 tokens."""
    return tokenizer(example["text"], truncation=True, max_length=512)

# Drop the raw columns so only model inputs remain.
tokenized_dataset = dataset.map(_tokenize, remove_columns=dataset.column_names, desc="Tokenizing")

# Causal-LM collator (mlm=False) builds labels from input_ids with padding masked.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=500,
    learning_rate=2e-5,
    fp16=torch.cuda.is_available(),  # mixed precision only where CUDA exists
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()

# Save the final model + tokenizer where app.py / test_model.py / push_to_hub.py expect them.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
|