Spaces:
Runtime error
Runtime error
menouar
committed on
Commit
·
b8758c8
1
Parent(s):
e75ffde
Update the generated Notebook to push properly to HF
Browse files
- app.py +8 -8
- utils/__init__.py +2 -3
- utils/components_creator.py +8 -9
- utils/notebook_generator.py +47 -30
app.py
CHANGED
|
@@ -68,8 +68,8 @@ def change_model_selection(model_id):
|
|
| 68 |
return None
|
| 69 |
|
| 70 |
|
| 71 |
-
def
|
| 72 |
-
return gr.
|
| 73 |
|
| 74 |
|
| 75 |
def check_valid_input(value):
|
|
@@ -190,12 +190,12 @@ def generate_code(components: dict[Component, Any]):
|
|
| 190 |
|
| 191 |
create_merge_lora_cells(notebook['cells'], output_dir)
|
| 192 |
|
| 193 |
-
|
| 194 |
|
| 195 |
if push_to_hub:
|
| 196 |
if not should_login:
|
| 197 |
create_login_hf_cells(notebook['cells'])
|
| 198 |
-
|
| 199 |
|
| 200 |
file_name = f"{finetuning_notebook}.ipynb"
|
| 201 |
|
|
@@ -287,8 +287,8 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(text_size='lg', font=["monospace"],
|
|
| 287 |
with centered_column():
|
| 288 |
output_dir_cmp, push_to_hub_cmp = add_outputs()
|
| 289 |
all_components.update({output_dir_cmp, push_to_hub_cmp})
|
| 290 |
-
|
| 291 |
-
all_components.update({
|
| 292 |
with centered_column():
|
| 293 |
all_components.update(add_outputs1())
|
| 294 |
|
|
@@ -318,9 +318,9 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(text_size='lg', font=["monospace"],
|
|
| 318 |
)
|
| 319 |
|
| 320 |
push_to_hub_cmp.change(
|
| 321 |
-
fn=
|
| 322 |
inputs=push_to_hub_cmp,
|
| 323 |
-
outputs=
|
| 324 |
)
|
| 325 |
|
| 326 |
demo.launch(allowed_paths=["/"])
|
|
|
|
| 68 |
return None
|
| 69 |
|
| 70 |
|
| 71 |
+
def handle_push_to_hub(value):
|
| 72 |
+
return gr.Textbox(visible=value)
|
| 73 |
|
| 74 |
|
| 75 |
def check_valid_input(value):
|
|
|
|
| 190 |
|
| 191 |
create_merge_lora_cells(notebook['cells'], output_dir)
|
| 192 |
|
| 193 |
+
merge_model_cells(notebook['cells'], output_dir)
|
| 194 |
|
| 195 |
if push_to_hub:
|
| 196 |
if not should_login:
|
| 197 |
create_login_hf_cells(notebook['cells'])
|
| 198 |
+
push_to_hub_cells(notebook['cells'], output_dir)
|
| 199 |
|
| 200 |
file_name = f"{finetuning_notebook}.ipynb"
|
| 201 |
|
|
|
|
| 287 |
with centered_column():
|
| 288 |
output_dir_cmp, push_to_hub_cmp = add_outputs()
|
| 289 |
all_components.update({output_dir_cmp, push_to_hub_cmp})
|
| 290 |
+
repo_name_cmp = add_hf_repo_cmp()
|
| 291 |
+
all_components.update({repo_name_cmp})
|
| 292 |
with centered_column():
|
| 293 |
all_components.update(add_outputs1())
|
| 294 |
|
|
|
|
| 318 |
)
|
| 319 |
|
| 320 |
push_to_hub_cmp.change(
|
| 321 |
+
fn=handle_push_to_hub,
|
| 322 |
inputs=push_to_hub_cmp,
|
| 323 |
+
outputs=repo_name_cmp
|
| 324 |
)
|
| 325 |
|
| 326 |
demo.launch(allowed_paths=["/"])
|
utils/__init__.py
CHANGED
|
@@ -36,9 +36,8 @@ LR_SCHEDULER_TYPE_ID = "lr_scheduler_type"
|
|
| 36 |
OUTPUT_DIR_ID = "output_dir"
|
| 37 |
|
| 38 |
PUSH_TO_HUB_ID = "push_to_hub"
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
PUSH_TYPES_ONLY_MODEL = "Push only the Model and Tokenizer"
|
| 42 |
|
| 43 |
REPORT_TO_ID = "report_to"
|
| 44 |
|
|
|
|
| 36 |
OUTPUT_DIR_ID = "output_dir"
|
| 37 |
|
| 38 |
PUSH_TO_HUB_ID = "push_to_hub"
|
| 39 |
+
|
| 40 |
+
REPOSITORY_NAME_ID = "repo_id"
|
|
|
|
| 41 |
|
| 42 |
REPORT_TO_ID = "report_to"
|
| 43 |
|
utils/components_creator.py
CHANGED
|
@@ -181,15 +181,14 @@ def add_outputs() -> (Component, Component):
|
|
| 181 |
return output_dir, push_to_hub
|
| 182 |
|
| 183 |
|
| 184 |
-
def
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
return push_type
|
| 193 |
|
| 194 |
|
| 195 |
def add_outputs1() -> Set[Component]:
|
|
|
|
| 181 |
return output_dir, push_to_hub
|
| 182 |
|
| 183 |
|
| 184 |
+
def add_hf_repo_cmp() -> Component:
|
| 185 |
+
repo_name = gr.Textbox(label="HF Repo name",
|
| 186 |
+
placeholder="username/your_repository",
|
| 187 |
+
info="Hugging Face repository to be created.",
|
| 188 |
+
interactive=True,
|
| 189 |
+
visible=False,
|
| 190 |
+
elem_id=REPOSITORY_NAME_ID)
|
| 191 |
+
return repo_name
|
|
|
|
| 192 |
|
| 193 |
|
| 194 |
def add_outputs1() -> Set[Component]:
|
utils/notebook_generator.py
CHANGED
|
@@ -2,7 +2,7 @@ from typing import Optional
|
|
| 2 |
|
| 3 |
import nbformat as nbf
|
| 4 |
|
| 5 |
-
from utils import FTDataSet
|
| 6 |
|
| 7 |
|
| 8 |
def create_install_libraries_cells(cells: list):
|
|
@@ -74,7 +74,7 @@ def create_login_hf_cells(cells: list, should_login: bool = False, model_name: O
|
|
| 74 |
from huggingface_hub import login
|
| 75 |
|
| 76 |
login(
|
| 77 |
-
token='
|
| 78 |
add_to_git_credential=True
|
| 79 |
)
|
| 80 |
"""
|
|
@@ -148,6 +148,7 @@ bnb_config = BitsAndBytesConfig(
|
|
| 148 |
model = AutoModelForCausalLM.from_pretrained(
|
| 149 |
model_id,
|
| 150 |
device_map="auto",
|
|
|
|
| 151 |
{flash_attention_str}
|
| 152 |
torch_dtype=torch.bfloat16,
|
| 153 |
quantization_config=bnb_config
|
|
@@ -246,7 +247,7 @@ def create_training_args_cells(cells: list, epochs, max_steps, logging_steps, pe
|
|
| 246 |
from transformers import TrainingArguments
|
| 247 |
|
| 248 |
args = TrainingArguments(
|
| 249 |
-
output_dir="{output_dir}",
|
| 250 |
num_train_epochs={epochs},
|
| 251 |
per_device_train_batch_size={per_device_train_batch_size},
|
| 252 |
gradient_accumulation_steps={gradient_accumulation_steps},
|
|
@@ -319,7 +320,7 @@ def create_start_training_cells(cells: list, epochs, max_steps, push_to_hub, out
|
|
| 319 |
f"""### Starting Training and Saving Model/Tokenizer
|
| 320 |
|
| 321 |
We start training the model by calling the `train()` method on the trainer instance. This will start the training
|
| 322 |
-
loop and train the model for `{epoch_str}`. The model will be automatically saved the output directory({output_dir})
|
| 323 |
{save_txt}
|
| 324 |
|
| 325 |
""")
|
|
@@ -331,11 +332,9 @@ model.config.use_cache = False
|
|
| 331 |
# start training
|
| 332 |
trainer.train()
|
| 333 |
|
| 334 |
-
# save the model
|
| 335 |
trainer.save_model()
|
| 336 |
|
| 337 |
-
# save tokenizer
|
| 338 |
-
tokenizer.save_pretrained("{output_dir}")
|
| 339 |
"""
|
| 340 |
code_cell = nbf.v4.new_code_cell(code)
|
| 341 |
cells.append(text_cell)
|
|
@@ -375,7 +374,7 @@ from peft import AutoPeftModelForCausalLM
|
|
| 375 |
|
| 376 |
# Load Peft model on CPU
|
| 377 |
model = AutoPeftModelForCausalLM.from_pretrained(
|
| 378 |
-
"{output_dir}",
|
| 379 |
torch_dtype=torch.float16,
|
| 380 |
low_cpu_mem_usage=True
|
| 381 |
)
|
|
@@ -383,48 +382,66 @@ model = AutoPeftModelForCausalLM.from_pretrained(
|
|
| 383 |
# Merge LoRA and base model and save
|
| 384 |
merged_model = model.merge_and_unload()
|
| 385 |
merged_model.save_pretrained("{output_dir}", safe_serialization=True, max_shard_size="2GB")
|
|
|
|
| 386 |
"""
|
| 387 |
code_cell = nbf.v4.new_code_cell(code)
|
| 388 |
cells.append(text_cell)
|
| 389 |
cells.append(code_cell)
|
| 390 |
|
| 391 |
|
| 392 |
-
def
|
| 393 |
text_cell = nbf.v4.new_markdown_cell(
|
| 394 |
-
"
|
| 395 |
|
| 396 |
code = f"""
|
| 397 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
|
| 399 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
"""
|
| 401 |
|
| 402 |
-
|
|
|
|
|
|
|
| 403 |
|
| 404 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
|
| 406 |
# Instantiate the HfApi class
|
| 407 |
api = HfApi()
|
| 408 |
|
| 409 |
# Your Hugging Face repository
|
| 410 |
-
repo_name = "
|
| 411 |
|
| 412 |
# Create a repository on the Hugging Face Hub
|
| 413 |
-
api.create_repo(token=HfFolder.get_token(),
|
| 414 |
-
|
| 415 |
-
# Path to your local folder
|
| 416 |
-
folder_path = "{output_dir}"
|
| 417 |
-
|
| 418 |
-
# Create a repository object
|
| 419 |
-
repo = Repository(local_dir=folder_path, clone_from=repo_name)
|
| 420 |
|
| 421 |
-
|
| 422 |
-
|
|
|
|
|
|
|
|
|
|
| 423 |
"""
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
code_cell = nbf.v4.new_code_cell(code)
|
| 427 |
-
else:
|
| 428 |
-
code_cell = nbf.v4.new_code_cell(code_all)
|
| 429 |
-
cells.append(text_cell)
|
| 430 |
cells.append(code_cell)
|
|
|
|
| 2 |
|
| 3 |
import nbformat as nbf
|
| 4 |
|
| 5 |
+
from utils import FTDataSet
|
| 6 |
|
| 7 |
|
| 8 |
def create_install_libraries_cells(cells: list):
|
|
|
|
| 74 |
from huggingface_hub import login
|
| 75 |
|
| 76 |
login(
|
| 77 |
+
token='HF_TOKEN',
|
| 78 |
add_to_git_credential=True
|
| 79 |
)
|
| 80 |
"""
|
|
|
|
| 148 |
model = AutoModelForCausalLM.from_pretrained(
|
| 149 |
model_id,
|
| 150 |
device_map="auto",
|
| 151 |
+
trust_remote_code=True,
|
| 152 |
{flash_attention_str}
|
| 153 |
torch_dtype=torch.bfloat16,
|
| 154 |
quantization_config=bnb_config
|
|
|
|
| 247 |
from transformers import TrainingArguments
|
| 248 |
|
| 249 |
args = TrainingArguments(
|
| 250 |
+
output_dir="temp_{output_dir}",
|
| 251 |
num_train_epochs={epochs},
|
| 252 |
per_device_train_batch_size={per_device_train_batch_size},
|
| 253 |
gradient_accumulation_steps={gradient_accumulation_steps},
|
|
|
|
| 320 |
f"""### Starting Training and Saving Model/Tokenizer
|
| 321 |
|
| 322 |
We start training the model by calling the `train()` method on the trainer instance. This will start the training
|
| 323 |
+
loop and train the model for `{epoch_str}`. The model will be automatically saved the output directory(temp_{output_dir})
|
| 324 |
{save_txt}
|
| 325 |
|
| 326 |
""")
|
|
|
|
| 332 |
# start training
|
| 333 |
trainer.train()
|
| 334 |
|
| 335 |
+
# save the PEFT model
|
| 336 |
trainer.save_model()
|
| 337 |
|
|
|
|
|
|
|
| 338 |
"""
|
| 339 |
code_cell = nbf.v4.new_code_cell(code)
|
| 340 |
cells.append(text_cell)
|
|
|
|
| 374 |
|
| 375 |
# Load Peft model on CPU
|
| 376 |
model = AutoPeftModelForCausalLM.from_pretrained(
|
| 377 |
+
"temp_{output_dir}",
|
| 378 |
torch_dtype=torch.float16,
|
| 379 |
low_cpu_mem_usage=True
|
| 380 |
)
|
|
|
|
| 382 |
# Merge LoRA and base model and save
|
| 383 |
merged_model = model.merge_and_unload()
|
| 384 |
merged_model.save_pretrained("{output_dir}", safe_serialization=True, max_shard_size="2GB")
|
| 385 |
+
tokenizer.save_pretrained("{output_dir}")
|
| 386 |
"""
|
| 387 |
code_cell = nbf.v4.new_code_cell(code)
|
| 388 |
cells.append(text_cell)
|
| 389 |
cells.append(code_cell)
|
| 390 |
|
| 391 |
|
| 392 |
+
def merge_model_cells(cells: list, output_dir):
|
| 393 |
text_cell = nbf.v4.new_markdown_cell(
|
| 394 |
+
f"### Copy all result folders from 'temp_{output_dir}' to '{output_dir}'.")
|
| 395 |
|
| 396 |
code = f"""
|
| 397 |
+
import os
|
| 398 |
+
import shutil
|
| 399 |
+
|
| 400 |
+
# Specify the source folder and the destination folder
|
| 401 |
+
source_folder = "temp_{output_dir}"
|
| 402 |
+
destination_folder = "{output_dir}"
|
| 403 |
|
| 404 |
+
# Create the destination folder if it doesn't exist
|
| 405 |
+
os.makedirs(destination_folder, exist_ok=True)
|
| 406 |
+
|
| 407 |
+
# Iterate over the files and subfolders in the source folder
|
| 408 |
+
for item in os.listdir(source_folder):
|
| 409 |
+
item_path = os.path.join(source_folder, item)
|
| 410 |
+
|
| 411 |
+
# Check if it's a subfolder (and not a file)
|
| 412 |
+
if os.path.isdir(item_path):
|
| 413 |
+
# Specify the destination path
|
| 414 |
+
destination_path = os.path.join(destination_folder, item)
|
| 415 |
+
|
| 416 |
+
# Copy the subfolder to the destination folder
|
| 417 |
+
shutil.copytree(item_path, destination_path)
|
| 418 |
"""
|
| 419 |
|
| 420 |
+
code_cell = nbf.v4.new_code_cell(code)
|
| 421 |
+
cells.append(text_cell)
|
| 422 |
+
cells.append(code_cell)
|
| 423 |
|
| 424 |
+
|
| 425 |
+
def push_to_hub_cells(cells: list, output_dir):
|
| 426 |
+
text = f"Push '{output_dir}' to your Hugging Face account."
|
| 427 |
+
code = f"""
|
| 428 |
+
from huggingface_hub import HfApi, HfFolder, Repository
|
| 429 |
|
| 430 |
# Instantiate the HfApi class
|
| 431 |
api = HfApi()
|
| 432 |
|
| 433 |
# Your Hugging Face repository
|
| 434 |
+
repo_name = "{output_dir}"
|
| 435 |
|
| 436 |
# Create a repository on the Hugging Face Hub
|
| 437 |
+
repo = api.create_repo(token=HfFolder.get_token(), repo_type="model", repo_id=repo_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
|
| 439 |
+
api.upload_folder(
|
| 440 |
+
folder_path="{output_dir}",
|
| 441 |
+
repo_id=repo.repo_id,
|
| 442 |
+
repo_type="model",
|
| 443 |
+
)
|
| 444 |
"""
|
| 445 |
+
code_cell = nbf.v4.new_code_cell(code)
|
| 446 |
+
cells.append(nbf.v4.new_markdown_cell(text))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
cells.append(code_cell)
|