| |
| """Concatenate three translation JSON files and remove instances with any null attribute.""" |
|
|
| import json |
| from pathlib import Path |
|
|
# Where the input files listed in FILES are read from.
INPUT_DIR = Path("/home/mshahidul/readctrl/data/translated_data/translation_testing_3396")

# Where the merged result is written; main() creates it if it does not exist.
OUTPUT_DIR = Path("/home/mshahidul/readctrl/data/translated_data")
OUTPUT_FILE = OUTPUT_DIR.joinpath("translation_testing_3396_merged.json")

# Input file names, relative to INPUT_DIR. They are merged in this order.
FILES = [
    "multiclinsum_test_en2bn_gemma(0_1000)_3396.json",
    "multiclinsum_test_en2bn_gemma(1000_2000)_3396.json",
    "multiclinsum_test_en2bn_gemma(2000_3396)_3396.json",
]

# Keys that must be present and non-null for an instance to be kept.
REQUIRED_ATTRS = [
    "id",
    "fulltext",
    "summary",
    "translated_fulltext",
    "translated_summary",
]
|
|
|
|
def has_any_null(obj, required_attrs=None):
    """Return True if any required attribute of ``obj`` is None/null.

    A key that is absent from ``obj`` counts as null, because the lookup
    uses ``dict.get``. Falsy but non-null values such as ``""`` or ``0`` do
    not count as null.

    Args:
        obj: One decoded JSON instance, normally a dict.
        required_attrs: Optional iterable of keys to check. When omitted,
            the module-level ``REQUIRED_ATTRS`` list is used.

    Returns:
        True if ``obj`` is not a dict, or if at least one required key is
        missing or maps to None; False otherwise.
    """
    # A non-dict entry (for example a bare JSON null inside the array) has
    # no attributes at all, so report it as incomplete instead of failing
    # on the .get() call below.
    if not isinstance(obj, dict):
        return True
    if required_attrs is None:
        required_attrs = REQUIRED_ATTRS
    return any(obj.get(attr) is None for attr in required_attrs)
|
|
|
|
def main():
    """Merge the input files, drop incomplete instances, and save the result.

    Loads every file named in ``FILES`` from ``INPUT_DIR``, keeps the items
    for which ``has_any_null`` returns False, writes the kept items as one
    JSON array to ``OUTPUT_FILE`` (creating ``OUTPUT_DIR`` if needed), and
    prints how many items were kept and how many were removed.

    Raises:
        ValueError: If the top-level JSON value of an input file is not a
            list.
    """
    merged = []
    removed = 0
    for fname in FILES:
        path = INPUT_DIR / fname
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        # Iterating a JSON object would walk its keys rather than its
        # instances, so reject anything that is not an array up front and
        # name the offending file.
        if not isinstance(data, list):
            raise ValueError(
                f"Expected a JSON array in {path}, got {type(data).__name__}"
            )
        kept = [item for item in data if not has_any_null(item)]
        removed += len(data) - len(kept)
        merged.extend(kept)
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        # ensure_ascii=False writes non-ASCII text as-is instead of as
        # \uXXXX escapes.
        json.dump(merged, f, ensure_ascii=False, indent=4)
    print(f"Total instances: {len(merged)}")
    print(f"Removed (null in any attr): {removed}")
    print(f"Saved to: {OUTPUT_FILE}")
|
|
|
|
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|