Upload folder using huggingface_hub
Browse files- prepare_hf_artifacts_light.py +76 -0
- upload.py +1 -1
prepare_hf_artifacts_light.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import pickle
|
| 3 |
+
import shutil
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
from huggingface_hub import hf_hub_download
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _normalize_state_dict(raw_obj):
|
| 11 |
+
if isinstance(raw_obj, dict) and "state_dict" in raw_obj and isinstance(raw_obj["state_dict"], dict):
|
| 12 |
+
raw_obj = raw_obj["state_dict"]
|
| 13 |
+
|
| 14 |
+
if not isinstance(raw_obj, dict):
|
| 15 |
+
raise ValueError("Checkpoint is not a valid state_dict dictionary")
|
| 16 |
+
|
| 17 |
+
return {k.replace("module.", "", 1): v for k, v in raw_obj.items()}
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _download_roberta_tokenizer_files(local_dir: Path):
    """Copy the roberta-base tokenizer artifacts from the HF Hub into *local_dir*.

    Each file is fetched (or served from the local hub cache) via
    ``hf_hub_download`` and then copied, metadata included, next to the
    model artifacts.
    """
    tokenizer_assets = (
        "tokenizer_config.json",
        "special_tokens_map.json",
        "vocab.json",
        "merges.txt",
        "tokenizer.json",
    )

    for asset in tokenizer_assets:
        cached_path = hf_hub_download(repo_id="roberta-base", filename=asset)
        shutil.copy2(cached_path, local_dir / asset)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def main(root="."):
    """Generate Hugging Face-compatible artifacts from a local checkpoint.

    Reads ``multitask_model.pth`` and ``label_encoder.pkl`` from *root* and
    writes ``config.json``, ``pytorch_model.bin``, and the roberta-base
    tokenizer files alongside them.

    Args:
        root: Directory holding the input checkpoint files and receiving the
            generated artifacts. Defaults to the current working directory,
            preserving the original zero-argument behavior.

    Raises:
        FileNotFoundError: If either required input file is missing.
        ValueError: If the checkpoint is not a valid state_dict.
    """
    root = Path(root)
    model_ckpt = root / "multitask_model.pth"
    label_encoder_path = root / "label_encoder.pkl"

    if not model_ckpt.exists():
        raise FileNotFoundError("multitask_model.pth not found")
    if not label_encoder_path.exists():
        raise FileNotFoundError("label_encoder.pkl not found")

    # NOTE(review): pickle.load executes arbitrary code on load — only use
    # with a locally produced, trusted label encoder.
    with open(label_encoder_path, "rb") as file:
        label_encoder = pickle.load(file)

    # One output unit per AI-model class seen by the fitted label encoder.
    num_ai_classes = len(label_encoder.classes_)

    config = {
        "architectures": ["SuaveMultitaskModel"],
        "model_type": "suave_multitask",
        "base_model_name": "roberta-base",
        "num_ai_classes": num_ai_classes,
        "classifier_dropout": 0.1,
        # Labels for the binary human-vs-AI head; the AI-model-id head size
        # is carried separately via num_ai_classes.
        "id2label": {"0": "human", "1": "ai"},
        "label2id": {"human": 0, "ai": 1},
        # auto_map lets transformers resolve the custom config/model classes
        # when loading with trust_remote_code=True.
        "auto_map": {
            "AutoConfig": "configuration_suave_multitask.SuaveMultitaskConfig",
            "AutoModel": "modeling_suave_multitask.SuaveMultitaskModel",
        },
    }

    with open(root / "config.json", "w", encoding="utf-8") as file:
        json.dump(config, file, indent=2)

    # NOTE(review): torch.load unpickles arbitrary objects by default —
    # trusted, locally produced checkpoints only.
    state_dict = torch.load(model_ckpt, map_location="cpu")
    state_dict = _normalize_state_dict(state_dict)
    torch.save(state_dict, root / "pytorch_model.bin")

    _download_roberta_tokenizer_files(root)

    print("HF artifacts generated: config.json, pytorch_model.bin, tokenizer files")
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# Script entry point: generate the HF artifacts when run directly.
if __name__ == "__main__":
    main()
|
upload.py
CHANGED
|
@@ -24,7 +24,7 @@ api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)
|
|
| 24 |
# 2. Generate HF-compatible artifacts from existing checkpoint (optional)
|
| 25 |
skip_prepare = os.environ.get("SKIP_HF_PREPARE", "0") == "1"
|
| 26 |
if not skip_prepare:
|
| 27 |
-
from
|
| 28 |
|
| 29 |
prepare_hf_artifacts()
|
| 30 |
else:
|
|
|
|
| 24 |
# 2. Generate HF-compatible artifacts from existing checkpoint (optional)
|
| 25 |
skip_prepare = os.environ.get("SKIP_HF_PREPARE", "0") == "1"
|
| 26 |
if not skip_prepare:
|
| 27 |
+
from prepare_hf_artifacts_light import main as prepare_hf_artifacts
|
| 28 |
|
| 29 |
prepare_hf_artifacts()
|
| 30 |
else:
|