"""Fully automated sync of the WorldDisasterLM-8B project to its HuggingFace model repo.

Reads the real project README, regenerates all config files to match,
then commits and pushes.

Usage:
    python scripts/sync_to_hf.py
"""

import json
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Optional

# ── Paths ─────────────────────────────────────────────────────────────────────
PROJECT_ROOT = Path(__file__).parent.parent
# Local checkout of the HF model repo. An unset *or empty* HF_REPO_DIR falls
# back to "<system temp dir>/wdlm_model_push" so the default is not tied to one
# user's machine and an empty value never resolves to the current directory.
REPO_DIR = Path(
    os.environ.get("HF_REPO_DIR")
    or Path(tempfile.gettempdir()) / "wdlm_model_push"
)
HF_TOKEN = os.environ.get("HF_TOKEN", "")
HF_REPO = "drdeveloper88/WorldDisasterLM-8B"
BASE_MODEL = "meta-llama/Llama-3.1-8B-Instruct"

# ── Correct language list from project README ─────────────────────────────────
LANGUAGES = ["en", "ne", "es", "fr", "ar", "hi", "te", "zh", "ja", "ko", "pt"]
LANGUAGE_NAMES = {
    "en": "English",
    "ne": "Nepali (नेपाली)",
    "es": "Spanish",
    "fr": "French",
    "ar": "Arabic",
    "hi": "Hindi",
    "te": "Telugu",
    "zh": "Chinese",
    "ja": "Japanese",
    "ko": "Korean",
    "pt": "Portuguese",
}


# ── 1. Build the HF model card README ────────────────────────────────────────
def build_readme(project_root: Optional[Path] = None) -> str:
    """Return the model card: YAML front matter followed by the project README.

    The README text is appended verbatim (its title line is kept, not stripped).

    Args:
        project_root: Directory containing ``README.md``. Defaults to
            ``PROJECT_ROOT``.

    Returns:
        The complete model card text.

    Raises:
        OSError: If ``README.md`` cannot be read.
    """
    root = PROJECT_ROOT if project_root is None else Path(project_root)
    src_readme = (root / "README.md").read_text(encoding="utf-8")

    # Built outside the f-string: backslashes are not allowed inside f-string
    # expressions before Python 3.12.
    language_items = "\n".join(f"  - {lang}" for lang in LANGUAGES)

    yaml_header = f"""---
language:
{language_items}
license: llama3
library_name: transformers
base_model: {BASE_MODEL}
tags:
  - disaster-management
  - emergency-response
  - humanitarian-ai
  - multilingual
  - fine-tuned
  - qlora
  - lora
  - peft
  - llama3
pipeline_tag: text-generation
model-index:
  - name: WorldDisasterLM-8B
    results: []
---

"""
    return yaml_header + src_readme


# ── 2. Config files ───────────────────────────────────────────────────────────
def _oxford_join(items: list) -> str:
    """Join names as English prose: "A", "A and B", or "A, B, and C"."""
    if len(items) <= 2:
        return " and ".join(items)
    return ", ".join(items[:-1]) + ", and " + items[-1]


def _system_prompt() -> str:
    """Return the default system prompt, listing the languages in LANGUAGES."""
    names = [LANGUAGE_NAMES[code] for code in LANGUAGES]
    return (
        "You are WorldDisasterLM-8B, an expert AI specialized in global disaster "
        "management, emergency response, and humanitarian aid. You provide accurate, "
        f"actionable guidance in {len(names)} languages: {_oxford_join(names)}. "
        "Always prioritize life safety. Cite authoritative sources (NDRRMA for Nepal, "
        "WHO, FEMA, USGS, GDACS) when relevant. Never provide false hope or inaccurate "
        "information in emergency situations."
    )


def _chat_template(system_prompt: str) -> str:
    """Return the Jinja chat template with ``system_prompt`` as the default.

    A leading ``system`` message in the conversation replaces the default.
    """
    # Double quotes are swapped for single quotes so the prompt can sit inside
    # the double-quoted Jinja string literal.
    return (
        "{%- set default_system = \"" + system_prompt.replace('"', "'") + "\" %}"
        "{%- if messages[0]['role'] == 'system' %}"
        "{%- set default_system = messages[0]['content'] %}"
        "{%- set messages = messages[1:] %}"
        "{%- endif %}"
        "{{ bos_token }}"
        "<|start_header_id|>system<|end_header_id|>\n\n{{ default_system }}<|eot_id|>"
        "{%- for message in messages %}"
        "{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"
        "{%- endfor %}"
        "{%- if add_generation_prompt %}"
        "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
        "{%- endif %}"
    )


def _special_token(content: str) -> dict:
    """Return one ``special_tokens_map.json`` entry for ``content``."""
    return {
        "content": content,
        "lstrip": False,
        "normalized": False,
        "rstrip": False,
        "single_word": False,
    }


def build_configs() -> dict:
    """Return every generated config file as ``{filename: JSON-serialisable dict}``.

    Values that appear in more than one file (quantization settings, LoRA
    target modules, EOS token ids, base model id) are defined once here and
    copied into each file, so the files cannot drift apart. Each file gets its
    own copy, so mutating one entry does not affect another.
    """
    quantization = {
        "quant_method": "bitsandbytes",
        "load_in_4bit": True,
        "load_in_8bit": False,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": "bfloat16",
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_storage": "uint8",
        "llm_int8_threshold": 6.0,
        "llm_int8_skip_modules": None,
        "llm_int8_enable_fp32_cpu_offload": False,
        "llm_int8_has_fp16_weight": False,
    }
    lora_target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ]
    eos_token_ids = [128001, 128008, 128009]
    lora_r = 16
    lora_alpha = 32
    lora_dropout = 0.05

    configs = {
        "config.json": {
            "_name_or_path": HF_REPO,
            "architectures": ["LlamaForCausalLM"],
            "attention_bias": False,
            "attention_dropout": 0.0,
            "bos_token_id": 128000,
            "eos_token_id": list(eos_token_ids),
            "head_dim": 128,
            "hidden_act": "silu",
            "hidden_size": 4096,
            "initializer_range": 0.02,
            "intermediate_size": 14336,
            "max_position_embeddings": 131072,
            "mlp_bias": False,
            "model_type": "llama",
            "num_attention_heads": 32,
            "num_hidden_layers": 32,
            "num_key_value_heads": 8,
            "pretraining_tp": 1,
            "rms_norm_eps": 1e-05,
            "rope_interleaved": False,
            "rope_scaling": {
                "factor": 8.0,
                "high_freq_factor": 4.0,
                "low_freq_factor": 1.0,
                "original_max_position_embeddings": 8192,
                "rope_type": "llama3",
            },
            "rope_theta": 500000.0,
            "tie_word_embeddings": False,
            "torch_dtype": "bfloat16",
            "transformers_version": "4.43.0",
            "use_cache": True,
            "vocab_size": 128256,
            "quantization_config": dict(quantization),
        },
        "quantization_config.json": dict(quantization),
        "adapter_config.json": {
            "_version": "0.7.1",
            "alpha_pattern": {},
            "auto_mapping": None,
            "base_model_name_or_path": BASE_MODEL,
            "bias": "none",
            "fan_in_fan_out": False,
            "inference_mode": True,
            "init_lora_weights": True,
            "layer_replication": None,
            "loftq_config": {},
            "lora_alpha": lora_alpha,
            "lora_dropout": lora_dropout,
            "modules_to_save": None,
            "peft_type": "LORA",
            "r": lora_r,
            "rank_pattern": {},
            "revision": None,
            "target_modules": list(lora_target_modules),
            "task_type": "CAUSAL_LM",
            "use_dora": False,
            "use_rslora": False,
            "trainable_parameters": "41,943,040",
            "total_parameters": "8,030,261,248",
            "trainable_pct": "0.52%",
        },
        "tokenizer_config.json": {
            "add_bos_token": True,
            "add_eos_token": False,
            "add_prefix_space": None,
            "bos_token": "<|begin_of_text|>",
            "chat_template": _chat_template(_system_prompt()),
            "clean_up_tokenization_spaces": True,
            "eos_token": "<|eot_id|>",
            "model_max_length": 131072,
            "pad_token": "<|end_of_text|>",
            "padding_side": "right",
            "tokenizer_class": "PreTrainedTokenizerFast",
            "unk_token": None,
        },
        "generation_config.json": {
            "_from_model_config": False,
            "bos_token_id": 128000,
            "do_sample": True,
            "eos_token_id": list(eos_token_ids),
            "max_new_tokens": 512,
            "min_new_tokens": 10,
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 50,
            "repetition_penalty": 1.1,
            "no_repeat_ngram_size": 3,
            "transformers_version": "4.43.0",
        },
        "special_tokens_map.json": {
            "bos_token": _special_token("<|begin_of_text|>"),
            "eos_token": _special_token("<|eot_id|>"),
            "pad_token": _special_token("<|end_of_text|>"),
            "additional_special_tokens": [
                "<|start_header_id|>",
                "<|end_header_id|>",
                "<|eot_id|>",
                "<|begin_of_text|>",
                "<|end_of_text|>",
            ],
        },
        "training_args.json": {
            "model_name_or_path": BASE_MODEL,
            "output_dir": "./outputs/WorldDisasterLM-8B",
            "num_train_epochs": 3,
            "per_device_train_batch_size": 4,
            "per_device_eval_batch_size": 4,
            "gradient_accumulation_steps": 4,
            "gradient_checkpointing": True,
            "learning_rate": 2e-4,
            "lr_scheduler_type": "cosine",
            "warmup_ratio": 0.03,
            "weight_decay": 0.001,
            "max_grad_norm": 0.3,
            "optim": "paged_adamw_32bit",
            "fp16": False,
            "bf16": True,
            "max_seq_length": 4096,
            "packing": True,
            "lora_r": lora_r,
            "lora_alpha": lora_alpha,
            "lora_dropout": lora_dropout,
            "lora_target_modules": list(lora_target_modules),
            "use_4bit": True,
            "bnb_4bit_quant_type": "nf4",
            "bnb_4bit_compute_dtype": "bfloat16",
            "use_nested_quant": True,
            "save_steps": 100,
            "logging_steps": 25,
            "evaluation_strategy": "steps",
            "eval_steps": 100,
            "save_total_limit": 3,
            "load_best_model_at_end": True,
            "metric_for_best_model": "eval_loss",
            "dataloader_num_workers": 4,
            "seed": 42,
            "report_to": ["tensorboard"],
            "dataset_sources": ["ReliefWeb", "USGS", "GDACS", "NOAA", "OpenFEMA", "WHO"],
            "dataset_size": "88+ live records → 711+ instruction samples per run",
            "languages": LANGUAGES,
            "language_names": list(LANGUAGE_NAMES.values()),
            "training_status": "PENDING — weights not yet generated. Run: python train.py",
        },
    }
    return configs


# ── 3. Files to copy from project root ───────────────────────────────────────
ROOT_FILES = [
    "train.py",
    "evaluate.py",
    "inference.py",
    "dataset_builder.py",
    "app.py",
    "conftest.py",
    "requirements.txt",
    "pyproject.toml",
    "Dockerfile",
    "docker-compose.yml",
    "Makefile",
    ".env.example",
    "LICENSE",
    "MODEL_CARD.md",
    ".gitignore",
]

ROOT_DIRS = [
    "backend",
    "worlddisasterlm",
    "scripts",
    "tests",
    "hf_space",
    "docs",
    ".github",
]

EXCLUDE_DIRS = {
    ".venv",
    "__pycache__",
    ".pytest_cache",
    "node_modules",
    ".git",
    "outputs",
    "checkpoints",
    "artifacts",
}
# Top-level only exclusions (don't apply recursively)
EXCLUDE_TOP_LEVEL_DIRS = {"data", "frontend"}


def copy_tree(src: Path, dst: Path, top_level: bool = False) -> None:
    """Recursively copy ``src`` into ``dst``, skipping excluded entries.

    Args:
        src: Directory to copy from.
        dst: Directory to copy into; created (with parents) if missing.
        top_level: When true, direct children of ``src`` whose name is in
            ``EXCLUDE_TOP_LEVEL_DIRS`` are skipped as well. The flag is not
            passed on to recursive calls, so it only affects this level.

    Entries whose name is in ``EXCLUDE_DIRS`` are skipped at every depth.
    """
    dst.mkdir(parents=True, exist_ok=True)
    for item in src.iterdir():
        if item.name in EXCLUDE_DIRS:
            continue
        if top_level and item.name in EXCLUDE_TOP_LEVEL_DIRS:
            continue
        target = dst / item.name
        if item.is_dir():
            copy_tree(item, target)
        else:
            shutil.copy2(item, target)


def run(cmd: list[str], cwd: "Path | None" = None) -> bool:
    """Run ``cmd`` and report whether it succeeded.

    Args:
        cmd: Command and arguments, passed to ``subprocess.run`` as a list.
        cwd: Working directory; defaults to ``REPO_DIR``.

    Returns:
        True if the command exited with status 0. False if it exited non-zero
        or could not be started at all (missing executable or directory); in
        both cases the reason is printed.
    """
    try:
        result = subprocess.run(
            cmd,
            cwd=cwd or REPO_DIR,
            capture_output=True,
            text=True,
            errors="replace",  # undecodable output must not abort the sync
        )
    except OSError as exc:
        # Executable not found or cwd missing: report failure through the
        # boolean result like any other failed command.
        print(f"STDERR: {exc}")
        return False

    if result.returncode != 0:
        print(f"STDERR: {result.stderr[-500:]}")
        # Git reports some failures on stdout rather than stderr, so show
        # that too when there is any.
        if result.stdout.strip():
            print(f"STDOUT: {result.stdout[-500:]}")
        return False
    return True


# ── 4. Main ───────────────────────────────────────────────────────────────────
def main() -> int:
    """Regenerate the model repo contents in ``REPO_DIR``, then commit and push.

    Steps: configure git, write the model card and config files, copy the
    listed project files and directories, stage everything, and, if anything
    changed, commit and push to ``origin main``.

    Returns:
        0 on success (including "nothing to commit"), 1 if the repo checkout
        is missing or staging, committing, or pushing failed.
    """
    print(f"Syncing to: {REPO_DIR}")
    print(f"HF repo: {HF_REPO}\n")

    # Everything below writes into REPO_DIR and runs git there, so stop early
    # if it is not a git checkout.
    if not (REPO_DIR / ".git").exists():
        print(f"Not a git checkout: {REPO_DIR} — clone {HF_REPO} there first.")
        return 1

    # Git config
    if sys.platform == "win32":
        # schannel is the Windows-native TLS backend; it is not available in
        # git builds for other platforms, so only select it on Windows.
        run(["git", "config", "http.sslBackend", "schannel"])
    run(["git", "config", "user.email", "drdeveloper88@users.noreply.huggingface.co"])
    run(["git", "config", "user.name", "drdeveloper88"])

    # Write README
    print("Writing README.md (model card)...")
    readme = build_readme()
    (REPO_DIR / "README.md").write_text(readme, encoding="utf-8")

    # Write config files
    print("Writing config files...")
    configs = build_configs()
    for fname, data in configs.items():
        path = REPO_DIR / fname
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f" {fname} ({path.stat().st_size} bytes)")

    # Copy root files
    print("\nCopying project root files...")
    for fname in ROOT_FILES:
        src = PROJECT_ROOT / fname
        if src.exists():
            shutil.copy2(src, REPO_DIR / fname)
            print(f" {fname}")

    # Copy directories
    print("\nCopying project directories...")
    for dname in ROOT_DIRS:
        src = PROJECT_ROOT / dname
        if src.exists():
            dst = REPO_DIR / dname
            # Remove the previous copy first so files deleted from the project
            # also disappear from the model repo.
            if dst.exists():
                shutil.rmtree(dst)
            copy_tree(src, dst, top_level=True)
            print(f" {dname}/")

    # Git add + commit + push
    print("\nStaging changes...")
    if not run(["git", "add", "-A"]):
        print("Staging failed — check git output above.")
        return 1

    result = subprocess.run(
        ["git", "status", "--short"], cwd=REPO_DIR, capture_output=True, text=True
    )
    changed = result.stdout.strip()
    if not changed:
        print("Nothing new to commit — repo is up to date.")
        return 0

    print(f"Changed files:\n{changed}\n")

    commit_msg = (
        "Sync: correct language list (en/ne/es/fr/ar/hi/te/zh/ja/ko/pt), "
        "updated README from project, full source code sync"
    )
    # Do not push (and then report success) if the commit did not happen.
    if not run(["git", "commit", "-m", commit_msg]):
        print("Commit failed — check git output above.")
        return 1

    print("Pushing to HuggingFace...")
    if not run(["git", "push", "origin", "main"]):
        print("Push failed — check git output above.")
        return 1

    print(f"\nDone! Live at: https://huggingface.co/{HF_REPO}")
    return 0


if __name__ == "__main__":
    # Propagate failure to the shell / CI through the exit status.
    sys.exit(main())