'''
Utility script for converting the label mapping (label -> integer) stored in
the file 'label_mapping.pkl' to JSON format.

Reason: some platforms, such as Hugging Face, flag pickle files as dangerous,
so converting the mapping to JSON format is safer and more portable.
'''
| from pathlib import Path | |
| import pickle | |
| import json | |
| import argparse | |
| from src.data_utils import nfc_normalize | |
# Dataset directory, resolved relative to this script's location
# (one directory above the folder containing this file).
ROOT = Path(__file__).parents[1] / "data" / "dataset"

# Default input (pickle) and output (JSON) locations for the label mapping.
DEFAULT_PKL = ROOT.joinpath("label_mapping.pkl")
DEFAULT_JSON = ROOT.joinpath("label_mapping.json")
def parse_args(argv=None):
    """Parse the command-line options of the conversion script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default), argparse reads ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with two attributes:
            pkl_path: path to the input pickle file.
            json_path: path to the JSON file to write.
        Both are ``pathlib.Path`` objects whether they come from the
        defaults or from the command line.
    """
    parser = argparse.ArgumentParser(description="Convert label_mapping.pkl to .json")
    # type=Path (rather than str) keeps the attribute type consistent:
    # argparse only applies `type` to *string* defaults, so with type=str the
    # Path defaults stayed Path while user-supplied values became str.
    parser.add_argument(
        "--pkl_path",
        type=Path,
        default=DEFAULT_PKL,
        help="Path to the original label_mapping.pkl",
    )
    parser.add_argument(
        "--json_path",
        type=Path,
        default=DEFAULT_JSON,
        help="Path to output JSON file",
    )
    return parser.parse_args(argv)
def convert_pkl_to_json(pkl_path, json_path):
    """Load a pickled label mapping and write it out as a JSON file.

    Every key of the loaded mapping is passed through ``nfc_normalize``
    (imported from ``src.data_utils``; presumably Unicode NFC normalization --
    confirm against that module) before the mapping is serialized.

    Args:
        pkl_path: Path of the pickle file holding the mapping.
        json_path: Path of the JSON file to create or overwrite.

    Raises:
        ValueError: If two different keys normalize to the same string but
            map to different values; writing the JSON would otherwise
            silently drop one of the labels.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only run this script on a label_mapping.pkl from a trusted source.
    with open(pkl_path, "rb") as f:
        data = pickle.load(f)

    normalized = {}
    for label, index in data.items():
        key = nfc_normalize(label)
        # Distinct labels may collapse to one key after normalization.
        # Identical values are harmless duplicates; conflicting values would
        # corrupt the mapping, so fail loudly before writing anything.
        if key in normalized and normalized[key] != index:
            raise ValueError(
                f"Label collision after normalization: {key!r} maps to both "
                f"{normalized[key]!r} and {index!r}"
            )
        normalized[key] = index

    # ensure_ascii=False keeps non-ASCII labels human-readable in the output.
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(normalized, f, ensure_ascii=False, indent=4)
    print(f"Saved json to {json_path}")
def main():
    """Script entry point: read the CLI options and run the conversion."""
    cli = parse_args()
    source_path = cli.pkl_path
    target_path = cli.json_path
    convert_pkl_to_json(source_path, target_path)


if __name__ == "__main__":
    main()