#!/usr/bin/env python3
"""Concatenate three translation JSON files and remove instances with any null attribute."""

import json
from pathlib import Path

# Directory holding the per-range translation shards listed in FILES.
INPUT_DIR = Path("/home/mshahidul/readctrl/data/translated_data/translation_testing_3396")
# Directory and file the merged, filtered result is written to.
OUTPUT_DIR = Path("/home/mshahidul/readctrl/data/translated_data")
OUTPUT_FILE = OUTPUT_DIR / "translation_testing_3396_merged.json"

# Shards are read in this order; the merged output keeps the same order.
FILES = [
    "multiclinsum_test_en2bn_gemma(0_1000)_3396.json",
    "multiclinsum_test_en2bn_gemma(1000_2000)_3396.json",
    "multiclinsum_test_en2bn_gemma(2000_3396)_3396.json",
]

# Keys that must be present and non-null for an instance to be kept.
REQUIRED_ATTRS = ["id", "fulltext", "summary", "translated_fulltext", "translated_summary"]


def has_any_null(obj: dict) -> bool:
    """Return True if any required attribute is None/null.

    A key that is absent from ``obj`` counts as null, because ``dict.get``
    returns ``None`` for it. Falsy but non-None values (``""``, ``0``) do
    not count as null.

    Args:
        obj: One instance loaded from a shard; must provide ``.get``.

    Returns:
        True when at least one key in ``REQUIRED_ATTRS`` maps to ``None``
        or is missing, False otherwise.
    """
    return any(obj.get(attr) is None for attr in REQUIRED_ATTRS)


def main() -> None:
    """Merge every shard in ``FILES`` and write the null-free instances.

    Reads each file from ``INPUT_DIR`` in order, drops instances for which
    ``has_any_null`` is true, writes the remaining instances to
    ``OUTPUT_FILE`` as a JSON array, and prints a short summary.

    Raises:
        ValueError: If the top-level JSON value of a shard is not a list.
    """
    merged = []
    removed = 0

    for fname in FILES:
        path = INPUT_DIR / fname
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Fail with a clear message instead of an AttributeError on `.get`
        # (or silently iterating dict keys) when a shard has the wrong shape.
        if not isinstance(data, list):
            raise ValueError(
                f"Expected a JSON array in {path}, got {type(data).__name__}"
            )

        kept = [item for item in data if not has_any_null(item)]
        removed += len(data) - len(kept)
        merged.extend(kept)

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable rather than \u-escaped.
        json.dump(merged, f, ensure_ascii=False, indent=4)

    print(f"Total instances: {len(merged)}")
    print(f"Removed (null in any attr): {removed}")
    print(f"Saved to: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()