eeve-vss-smh / upload_to_hf.py
MyeongHo0621's picture
Upload EEVE Korean Custom model (checkpoint-500 merged)
274fb58 verified
#!/usr/bin/env python3
"""
Hugging Face Hub upload script.
- Uploads the merged EEVE model to the Hugging Face Hub
- Includes the model card (README.md)
"""
import os
import argparse
from pathlib import Path
from huggingface_hub import HfApi, create_repo, upload_folder
def upload_model_to_hub(
    model_dir: str,
    repo_id: str,
    token: str = None,
    private: bool = False,
    commit_message: str = "Upload EEVE Korean Custom model"
):
    """Upload a local model folder to the Hugging Face Hub.

    Prints a banner, asks the Hub to create the model repository
    (``exist_ok=True``, so an existing repository is not an error), uploads
    the contents of ``model_dir`` with ``upload_folder``, and finally prints
    the model page URL together with a short usage snippet.

    Args:
        model_dir: Path of the local model directory to upload.
        repo_id: Hub repository ID (``username/model-name``).
        token: Hugging Face API token. ``None`` is forwarded unchanged so
            the library falls back to an environment variable or the CLI
            login.
        private: Whether the repository should be created as private.
        commit_message: Commit message used for the upload.

    Raises:
        Exception: Any error raised by ``upload_folder`` is printed and then
            re-raised to the caller.
    """
    banner = "=" * 80

    print("\n" + "="*80)
    print(" Hugging Face Hub ์—…๋กœ๋“œ")
    print(banner)
    print(f"๐Ÿ“ ๋ชจ๋ธ ๋””๋ ‰ํ† ๋ฆฌ: {model_dir}")
    print(f"๐ŸŽฏ ๋ฆฌํฌ์ง€ํ† ๋ฆฌ: {repo_id}")
    print(f"๐Ÿ”’ ๊ณต๊ฐœ ์—ฌ๋ถ€: {'Private' if private else 'Public'}")
    print("="*80 + "\n")

    # 1. Create the repository (skipped by the Hub when it already exists).
    print("1๏ธโƒฃ ๋ฆฌํฌ์ง€ํ† ๋ฆฌ ์ƒ์„ฑ ์ค‘...")
    try:
        repo_url = create_repo(
            repo_id=repo_id,
            token=token,
            private=private,
            exist_ok=True,
            repo_type="model"
        )
    except Exception as e:
        # Deliberately best-effort: a creation failure is reported and the
        # upload is still attempted; the upload step surfaces any real
        # problem (bad token, missing permission) by raising.
        print(f"โš ๏ธ ๋ฆฌํฌ์ง€ํ† ๋ฆฌ๊ฐ€ ์ด๋ฏธ ์กด์žฌํ•˜๊ฑฐ๋‚˜ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}\n")
    else:
        print(f"โœ“ ๋ฆฌํฌ์ง€ํ† ๋ฆฌ: {repo_url}\n")

    # 2. Upload the files.
    print("2๏ธโƒฃ ๋ชจ๋ธ ํŒŒ์ผ ์—…๋กœ๋“œ ์ค‘...")
    print(" โฑ๏ธ ์ด ์ž‘์—…์€ ์‹œ๊ฐ„์ด ๊ฑธ๋ฆฝ๋‹ˆ๋‹ค (๋ชจ๋ธ ํฌ๊ธฐ: ~20GB)...\n")
    try:
        upload_folder(
            repo_id=repo_id,
            folder_path=model_dir,
            token=token,
            commit_message=commit_message,
            repo_type="model"
        )
    except Exception as e:
        print(f"โŒ ์—…๋กœ๋“œ ์‹คํŒจ: {e}")
        raise
    else:
        print("โœ“ ์—…๋กœ๋“œ ์™„๋ฃŒ!\n")

    # 3. Done: show where the model lives and how to load it.
    print(banner)
    print("โœ… ์—…๋กœ๋“œ ์„ฑ๊ณต!")
    print(banner)
    print(f"\n๐Ÿ”— ๋ชจ๋ธ ํŽ˜์ด์ง€: https://huggingface.co/{repo_id}")
    print("๐Ÿ“– ์‚ฌ์šฉ ๋ฐฉ๋ฒ•:")
    print(f"""
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("{repo_id}")
tokenizer = AutoTokenizer.from_pretrained("{repo_id}")
""")
    print("="*80 + "\n")
def main(argv=None):
    """Parse command-line options, validate the model folder, and upload it.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes ``argparse`` read ``sys.argv[1:]``, which is the
            normal command-line behavior.

    Returns:
        ``0`` when the upload completes; ``1`` when the model directory is
        missing, a required file is absent, or the upload raises.
    """
    parser = argparse.ArgumentParser(
        description="EEVE ๋ชจ๋ธ์„ Hugging Face Hub์— ์—…๋กœ๋“œ",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
์‚ฌ์šฉ ์˜ˆ์‹œ:
# ๊ธฐ๋ณธ ์‚ฌ์šฉ (public)
python upload_to_hf.py --repo-id username/model-name
# Private ๋ฆฌํฌ์ง€ํ† ๋ฆฌ๋กœ ์—…๋กœ๋“œ
python upload_to_hf.py --repo-id username/model-name --private
# ํ† ํฐ ์ง์ ‘ ์ง€์ •
python upload_to_hf.py --repo-id username/model-name --token hf_xxxxx
# ๋‹ค๋ฅธ ๋””๋ ‰ํ† ๋ฆฌ์—์„œ ์—…๋กœ๋“œ
python upload_to_hf.py --repo-id username/model-name --model-dir /path/to/model
์ „์ฒด ์˜ต์…˜:
python upload_to_hf.py \\
--repo-id MyeongHo0621/EEVE-Korean-Custom-10.8B \\
--model-dir /home/work/tesseract/eeve_hf_upload \\
--private \\
--commit-message "Initial upload: checkpoint-500 merged"
์ฃผ์˜์‚ฌํ•ญ:
1. ๋จผ์ € Hugging Face์— ๋กœ๊ทธ์ธํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค:
huggingface-cli login
๋˜๋Š”
hf auth login
2. ํ† ํฐ์€ Write ๊ถŒํ•œ์ด ์žˆ์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค:
https://huggingface.co/settings/tokens
3. ์—…๋กœ๋“œ ์‹œ๊ฐ„: ~20-30๋ถ„ (๋„คํŠธ์›Œํฌ ์†๋„์— ๋”ฐ๋ผ)
"""
    )
    parser.add_argument(
        "--repo-id",
        type=str,
        required=True,
        help="Hugging Face ๋ฆฌํฌ์ง€ํ† ๋ฆฌ ID (username/model-name)"
    )
    parser.add_argument(
        "--model-dir",
        type=str,
        default="/home/work/tesseract/eeve_hf_upload",
        help="์—…๋กœ๋“œํ•  ๋ชจ๋ธ ๋””๋ ‰ํ† ๋ฆฌ (๊ธฐ๋ณธ: eeve_hf_upload)"
    )
    parser.add_argument(
        "--token",
        type=str,
        default=None,
        help="Hugging Face API ํ† ํฐ (์„ ํƒ, ์—†์œผ๋ฉด CLI ๋กœ๊ทธ์ธ ์‚ฌ์šฉ)"
    )
    parser.add_argument(
        "--private",
        action="store_true",
        help="Private ๋ฆฌํฌ์ง€ํ† ๋ฆฌ๋กœ ์ƒ์„ฑ"
    )
    parser.add_argument(
        "--commit-message",
        type=str,
        default="Upload EEVE Korean Custom model (checkpoint-500 merged)",
        help="์ปค๋ฐ‹ ๋ฉ”์‹œ์ง€"
    )
    args = parser.parse_args(argv)

    # Verify the model directory. is_dir() (rather than exists()) also
    # rejects a regular file passed by mistake with the right message.
    model_dir = Path(args.model_dir)
    if not model_dir.is_dir():
        print(f"โŒ ์˜ค๋ฅ˜: ๋ชจ๋ธ ๋””๋ ‰ํ† ๋ฆฌ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {model_dir}")
        return 1

    # Verify the required files are present before starting the upload.
    required_files = ["config.json", "tokenizer_config.json"]
    missing_files = [f for f in required_files if not (model_dir / f).exists()]
    if missing_files:
        print(f"โŒ ์˜ค๋ฅ˜: ํ•„์ˆ˜ ํŒŒ์ผ์ด ์—†์Šต๋‹ˆ๋‹ค: {missing_files}")
        return 1

    # Run the upload; this is the top-level boundary, so any failure is
    # reported with a traceback and mapped to a non-zero exit status.
    try:
        upload_model_to_hub(
            model_dir=str(model_dir),
            repo_id=args.repo_id,
            token=args.token,
            private=args.private,
            commit_message=args.commit_message
        )
        print("โœ… ๋ชจ๋“  ์ž‘์—…์ด ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค!")
        return 0
    except Exception as e:
        print(f"\nโŒ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
        import traceback
        traceback.print_exc()
        return 1
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. SystemExit
    # is raised directly because the bare exit() helper is injected by the
    # site module for interactive use and is not guaranteed to exist.
    raise SystemExit(main())