WorldDisasterLM-8B / scripts /sync_to_hf.py
drdeveloper88's picture
Fix: restore worlddisasterlm/data package + fix README badges and clone URL
9cde61a
Raw
History Blame Contribute Delete
14.7 kB
"""
Fully automated sync of WorldDisasterLM-8B project to HuggingFace model repo.
Reads the real project README, updates all configs to match, commits and pushes.
Usage:
python scripts/sync_to_hf.py
"""
import json
import os
import shutil
import subprocess
import sys
from pathlib import Path
# ── Paths ─────────────────────────────────────────────────────────────────────
# Target model repository on the HuggingFace Hub (owner/name).
HF_REPO = "drdeveloper88/WorldDisasterLM-8B"
# Access token taken from the environment; empty string when unset.
HF_TOKEN = os.getenv("HF_TOKEN", "")
# Local clone of the HF repo that receives the generated files. Override with
# HF_REPO_DIR; the fallback is a machine-specific Windows temp directory.
REPO_DIR = Path(
    os.getenv("HF_REPO_DIR", r"C:\Users\naidu\AppData\Local\Temp\wdlm_model_push")
)
# This script lives in <project>/scripts/, so the project root is two levels up.
PROJECT_ROOT = Path(__file__).parent.parent
# ── Correct language list from project README ─────────────────────────────────
# Two-letter language code → display name, in the order the languages are listed.
LANGUAGE_NAMES = {
    "en": "English",
    "ne": "Nepali (नेपाली)",
    "es": "Spanish",
    "fr": "French",
    "ar": "Arabic",
    "hi": "Hindi",
    "te": "Telugu",
    "zh": "Chinese",
    "ja": "Japanese",
    "ko": "Korean",
    "pt": "Portuguese",
}
# Language codes only, in the same order as the table above.
LANGUAGES = list(LANGUAGE_NAMES)
# ── 1. Build the HF model card README ────────────────────────────────────────
def build_readme():
    """Return the model card: YAML front matter followed by the project README.

    The front matter lists every code in ``LANGUAGES`` plus fixed license,
    base-model, tag and ``model-index`` metadata. The project README is
    appended verbatim.

    Returns:
        str: full text to write as ``README.md`` in the HF repo.

    Raises:
        OSError: if ``README.md`` cannot be read from ``PROJECT_ROOT``.
    """
    # NOTE(review): an earlier comment here said the first H1 title line is
    # stripped, but the README has always been appended unchanged — confirm
    # which behaviour is wanted before changing it.
    src_readme = (PROJECT_ROOT / "README.md").read_text(encoding="utf-8")

    # Built line by line so that the YAML nesting is explicit in the code.
    front_matter = [
        "---",
        "language:",
        *(f" - {lang}" for lang in LANGUAGES),
        "license: llama3",
        "library_name: transformers",
        "base_model: meta-llama/Llama-3.1-8B-Instruct",
        "tags:",
        "- disaster-management",
        "- emergency-response",
        "- humanitarian-ai",
        "- multilingual",
        "- fine-tuned",
        "- qlora",
        "- lora",
        "- peft",
        "- llama3",
        "pipeline_tag: text-generation",
        "model-index:",
        "- name: WorldDisasterLM-8B",
        # `results` must be indented so it belongs to the model-index entry
        # above rather than becoming a separate top-level key.
        "  results: []",
        "---",
        "",  # the front matter ends with a newline before the README body
    ]
    return "\n".join(front_matter) + src_readme
# ── 2. Config files ───────────────────────────────────────────────────────────
def build_configs():
    """Build every generated JSON config for the HF model repo.

    The language count and names in the system prompt are derived from the
    module-level ``LANGUAGES`` / ``LANGUAGE_NAMES`` tables so the prompt cannot
    drift from the language list used elsewhere in this script. Values that
    appear in more than one file (bitsandbytes settings, LoRA target modules,
    EOS token ids) are defined once and copied into each place.

    Returns:
        dict: output file name (e.g. ``"config.json"``) → JSON-serialisable
        dict to be written to that file.
    """
    # Human-readable language list, e.g. "English, Spanish, and French".
    language_names = [LANGUAGE_NAMES[code] for code in LANGUAGES]
    if len(language_names) > 1:
        language_list = ", ".join(language_names[:-1]) + ", and " + language_names[-1]
    else:
        language_list = "".join(language_names)

    system_prompt = (
        "You are WorldDisasterLM-8B, an expert AI specialized in global disaster "
        "management, emergency response, and humanitarian aid. You provide accurate, "
        f"actionable guidance in {len(language_names)} languages: {language_list}. "
        "Always prioritize life safety. Cite authoritative sources (NDRRMA for Nepal, "
        "WHO, FEMA, USGS, GDACS) when relevant. Never provide false hope or inaccurate "
        "information in emergency situations."
    )

    # The prompt is embedded in a double-quoted Jinja string literal, so any
    # double quote inside it is swapped for a single quote.
    escaped_prompt = system_prompt.replace('"', "'")
    chat_template = (
        '{%- set default_system = "' + escaped_prompt + '" %}'
        "{%- if messages[0]['role'] == 'system' %}"
        "{%- set default_system = messages[0]['content'] %}"
        "{%- set messages = messages[1:] %}"
        "{%- endif %}"
        "{{ bos_token }}"
        "<|start_header_id|>system<|end_header_id|>\n\n{{ default_system }}<|eot_id|>"
        "{%- for message in messages %}"
        "{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"
        "{%- endfor %}"
        "{%- if add_generation_prompt %}"
        "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
        "{%- endif %}"
    )

    # Shared by config.json (nested) and quantization_config.json (standalone).
    quantization = {
        "quant_method": "bitsandbytes",
        "load_in_4bit": True,
        "load_in_8bit": False,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": "bfloat16",
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_storage": "uint8",
        "llm_int8_threshold": 6.0,
        "llm_int8_skip_modules": None,
        "llm_int8_enable_fp32_cpu_offload": False,
        "llm_int8_has_fp16_weight": False,
    }
    # Shared by adapter_config.json and training_args.json.
    lora_target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ]
    # Shared by config.json and generation_config.json.
    eos_token_ids = [128001, 128008, 128009]

    def special_token(content):
        # Entry layout used for each token in special_tokens_map.json.
        return {
            "content": content,
            "lstrip": False,
            "normalized": False,
            "rstrip": False,
            "single_word": False,
        }

    # Copies (dict(...) / list(...)) keep the returned sub-structures
    # independent, so mutating one file's data never affects another's.
    configs = {
        "config.json": {
            "_name_or_path": "drdeveloper88/WorldDisasterLM-8B",
            "architectures": ["LlamaForCausalLM"],
            "attention_bias": False,
            "attention_dropout": 0.0,
            "bos_token_id": 128000,
            "eos_token_id": list(eos_token_ids),
            "head_dim": 128,
            "hidden_act": "silu",
            "hidden_size": 4096,
            "initializer_range": 0.02,
            "intermediate_size": 14336,
            "max_position_embeddings": 131072,
            "mlp_bias": False,
            "model_type": "llama",
            "num_attention_heads": 32,
            "num_hidden_layers": 32,
            "num_key_value_heads": 8,
            "pretraining_tp": 1,
            "rms_norm_eps": 1e-05,
            "rope_interleaved": False,
            "rope_scaling": {
                "factor": 8.0,
                "high_freq_factor": 4.0,
                "low_freq_factor": 1.0,
                "original_max_position_embeddings": 8192,
                "rope_type": "llama3",
            },
            "rope_theta": 500000.0,
            "tie_word_embeddings": False,
            "torch_dtype": "bfloat16",
            "transformers_version": "4.43.0",
            "use_cache": True,
            "vocab_size": 128256,
            "quantization_config": dict(quantization),
        },
        "quantization_config.json": dict(quantization),
        "adapter_config.json": {
            "_version": "0.7.1",
            "alpha_pattern": {},
            "auto_mapping": None,
            "base_model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
            "bias": "none",
            "fan_in_fan_out": False,
            "inference_mode": True,
            "init_lora_weights": True,
            "layer_replication": None,
            "loftq_config": {},
            "lora_alpha": 32,
            "lora_dropout": 0.05,
            "modules_to_save": None,
            "peft_type": "LORA",
            "r": 16,
            "rank_pattern": {},
            "revision": None,
            "target_modules": list(lora_target_modules),
            "task_type": "CAUSAL_LM",
            "use_dora": False,
            "use_rslora": False,
            "trainable_parameters": "41,943,040",
            "total_parameters": "8,030,261,248",
            "trainable_pct": "0.52%",
        },
        "tokenizer_config.json": {
            "add_bos_token": True,
            "add_eos_token": False,
            "add_prefix_space": None,
            "bos_token": "<|begin_of_text|>",
            "chat_template": chat_template,
            "clean_up_tokenization_spaces": True,
            "eos_token": "<|eot_id|>",
            "model_max_length": 131072,
            "pad_token": "<|end_of_text|>",
            "padding_side": "right",
            "tokenizer_class": "PreTrainedTokenizerFast",
            "unk_token": None,
        },
        "generation_config.json": {
            "_from_model_config": False,
            "bos_token_id": 128000,
            "do_sample": True,
            "eos_token_id": list(eos_token_ids),
            "max_new_tokens": 512,
            "min_new_tokens": 10,
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 50,
            "repetition_penalty": 1.1,
            "no_repeat_ngram_size": 3,
            "transformers_version": "4.43.0",
        },
        "special_tokens_map.json": {
            "bos_token": special_token("<|begin_of_text|>"),
            "eos_token": special_token("<|eot_id|>"),
            "pad_token": special_token("<|end_of_text|>"),
            "additional_special_tokens": [
                "<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>",
                "<|begin_of_text|>", "<|end_of_text|>",
            ],
        },
        "training_args.json": {
            "model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
            "output_dir": "./outputs/WorldDisasterLM-8B",
            "num_train_epochs": 3,
            "per_device_train_batch_size": 4,
            "per_device_eval_batch_size": 4,
            "gradient_accumulation_steps": 4,
            "gradient_checkpointing": True,
            "learning_rate": 2e-4,
            "lr_scheduler_type": "cosine",
            "warmup_ratio": 0.03,
            "weight_decay": 0.001,
            "max_grad_norm": 0.3,
            "optim": "paged_adamw_32bit",
            "fp16": False,
            "bf16": True,
            "max_seq_length": 4096,
            "packing": True,
            "lora_r": 16,
            "lora_alpha": 32,
            "lora_dropout": 0.05,
            "lora_target_modules": list(lora_target_modules),
            "use_4bit": True,
            "bnb_4bit_quant_type": "nf4",
            "bnb_4bit_compute_dtype": "bfloat16",
            "use_nested_quant": True,
            "save_steps": 100,
            "logging_steps": 25,
            "evaluation_strategy": "steps",
            "eval_steps": 100,
            "save_total_limit": 3,
            "load_best_model_at_end": True,
            "metric_for_best_model": "eval_loss",
            "dataloader_num_workers": 4,
            "seed": 42,
            "report_to": ["tensorboard"],
            "dataset_sources": ["ReliefWeb", "USGS", "GDACS", "NOAA", "OpenFEMA", "WHO"],
            "dataset_size": "88+ live records → 711+ instruction samples per run",
            "languages": LANGUAGES,
            "language_names": list(LANGUAGE_NAMES.values()),
            "training_status": "PENDING — weights not yet generated. Run: python train.py",
        },
    }
    return configs
# ── 3. Files to copy from project root ───────────────────────────────────────
# Individual files copied from the project root when they exist.
ROOT_FILES = [
    "train.py",
    "evaluate.py",
    "inference.py",
    "dataset_builder.py",
    "app.py",
    "conftest.py",
    "requirements.txt",
    "pyproject.toml",
    "Dockerfile",
    "docker-compose.yml",
    "Makefile",
    ".env.example",
    "LICENSE",
    "MODEL_CARD.md",
    ".gitignore",
]
# Directories copied (recursively) from the project root when they exist.
ROOT_DIRS = [
    "backend",
    "worlddisasterlm",
    "scripts",
    "tests",
    "hf_space",
    "docs",
    ".github",
]
# Entry names skipped at every depth of a copied tree.
EXCLUDE_DIRS = {
    ".venv",
    "__pycache__",
    ".pytest_cache",
    "node_modules",
    ".git",
    "outputs",
    "checkpoints",
    "artifacts",
}
# Top-level only exclusions (don't apply recursively)
EXCLUDE_TOP_LEVEL_DIRS = {
    "data",
    "frontend",
}
def copy_tree(src: Path, dst: Path, top_level: bool = False):
    """Recursively copy ``src`` into ``dst``, skipping excluded entry names.

    Args:
        src: directory to copy from.
        dst: destination directory; created (with parents) if missing.
        top_level: when true, names in ``EXCLUDE_TOP_LEVEL_DIRS`` are skipped
            in addition to ``EXCLUDE_DIRS``. This only affects the immediate
            children of ``src``; recursive calls never pass it on.
    """
    dst.mkdir(parents=True, exist_ok=True)
    for entry in src.iterdir():
        name = entry.name
        # Exclusions match on the bare name, for files and directories alike.
        skipped = name in EXCLUDE_DIRS or (
            top_level and name in EXCLUDE_TOP_LEVEL_DIRS
        )
        if skipped:
            continue
        target = dst / name
        if not entry.is_dir():
            shutil.copy2(entry, target)
            continue
        copy_tree(entry, target)
def run(cmd: list[str], cwd: Path = None):
    """Run ``cmd`` and report whether it exited with status 0.

    Args:
        cmd: command and arguments, passed to ``subprocess.run`` as a list.
        cwd: working directory; falls back to ``REPO_DIR`` when not given.

    Returns:
        bool: True on exit status 0, otherwise False. On failure the last
        500 characters of the command's stderr are printed.
    """
    workdir = cwd or REPO_DIR
    proc = subprocess.run(cmd, cwd=workdir, capture_output=True, text=True)
    succeeded = proc.returncode == 0
    if not succeeded:
        stderr_tail = proc.stderr[-500:]
        print(f"STDERR: {stderr_tail}")
    return succeeded
# ── 4. Main ───────────────────────────────────────────────────────────────────
def main():
    """Regenerate the HF repo contents from the project, then commit and push.

    Steps, all performed inside ``REPO_DIR``: configure git, write the model
    card and JSON configs, copy the listed root files and directories, stage
    everything and, if anything changed, commit and push to ``origin main``.

    Raises:
        SystemExit: with a non-zero status when ``REPO_DIR`` is missing or any
            git step fails, so a failed sync is never reported as success and
            a failed commit is never followed by a push.
    """
    print(f"Syncing to: {REPO_DIR}")
    print(f"HF repo: {HF_REPO}\n")

    # Every later step runs inside the local clone; fail early with a clear
    # message instead of a traceback from the first subprocess call.
    if not REPO_DIR.is_dir():
        sys.exit(f"Local HF repo clone not found: {REPO_DIR} (set HF_REPO_DIR)")

    # Git config
    git_setup = [
        ["git", "config", "user.email", "drdeveloper88@users.noreply.huggingface.co"],
        ["git", "config", "user.name", "drdeveloper88"],
    ]
    if os.name == "nt":
        # schannel is the Windows-native TLS backend; git builds on other
        # platforms do not provide it, so only select it on Windows.
        git_setup.insert(0, ["git", "config", "http.sslBackend", "schannel"])
    for cmd in git_setup:
        if not run(cmd):
            sys.exit(f"Git configuration failed: {' '.join(cmd)}")

    # Write README
    print("Writing README.md (model card)...")
    (REPO_DIR / "README.md").write_text(build_readme(), encoding="utf-8")

    # Write config files
    print("Writing config files...")
    for fname, data in build_configs().items():
        path = REPO_DIR / fname
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f" {fname} ({path.stat().st_size} bytes)")

    # Copy root files
    print("\nCopying project root files...")
    for fname in ROOT_FILES:
        src = PROJECT_ROOT / fname
        if src.exists():
            shutil.copy2(src, REPO_DIR / fname)
            print(f" {fname}")

    # Copy directories
    print("\nCopying project directories...")
    for dname in ROOT_DIRS:
        src = PROJECT_ROOT / dname
        if not src.exists():
            continue
        dst = REPO_DIR / dname
        if dst.exists():
            # Start from a clean directory so files deleted in the project
            # also disappear from the HF repo.
            shutil.rmtree(dst)
        # NOTE(review): top_level=True applies EXCLUDE_TOP_LEVEL_DIRS to the
        # immediate children of each synced directory, so e.g.
        # worlddisasterlm/data and backend/data are skipped — confirm that
        # is intended.
        copy_tree(src, dst, top_level=True)
        print(f" {dname}/")

    # Git add + commit + push
    print("\nStaging changes...")
    if not run(["git", "add", "-A"]):
        sys.exit("Staging failed — nothing was committed or pushed.")

    status = subprocess.run(
        ["git", "status", "--short"], cwd=REPO_DIR, capture_output=True, text=True
    )
    if status.returncode != 0:
        print(f"STDERR: {status.stderr[-500:]}")
        sys.exit("git status failed — nothing was committed or pushed.")
    changed = status.stdout.strip()
    if not changed:
        print("Nothing new to commit — repo is up to date.")
        return
    print(f"Changed files:\n{changed}\n")

    commit_msg = (
        "Sync: correct language list (en/ne/es/fr/ar/hi/te/zh/ja/ko/pt), "
        "updated README from project, full source code sync"
    )
    if not run(["git", "commit", "-m", commit_msg]):
        sys.exit("Commit failed — nothing was pushed.")

    print("Pushing to HuggingFace...")
    if not run(["git", "push", "origin", "main"]):
        sys.exit("Push failed — check git output above.")
    print(f"\nDone! Live at: https://huggingface.co/{HF_REPO}")


if __name__ == "__main__":
    main()