# File size: 1,184 Bytes
# Revision: 685d968
import json
import sys
def clean_batch(filepath: str) -> None:
    """Remove contradictory 'none' labels from a JSONL batch file.

    Each non-blank line of *filepath* is parsed as a JSON object. When a
    record's ``labels.categories`` list contains ``'none'`` alongside other
    entries, every ``'none'`` is dropped and the record is flagged with
    ``metadata.cleaned_none_mix = True``. All records (fixed or not) are
    written, one JSON object per line, to a sibling file whose name carries
    a ``_cleaned`` marker before the ``.jsonl`` suffix.

    Args:
        filepath: Path of the JSONL file to clean. The input file is never
            modified.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``labels`` or ``labels.categories``.
    """
    print(f"Cleaning {filepath}...")

    cleaned_data = []
    fixed_count = 0

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue

            item = json.loads(line)
            cats = item['labels']['categories']

            if 'none' in cats and len(cats) > 1:
                print(f"Fixing mixed 'none' in {item.get('scenario_id')}: {cats}")
                # Drop every 'none', not just the first one, so a list such
                # as ['none', 'x', 'none'] does not stay mixed.
                real_cats = [cat for cat in cats if cat != 'none']
                # A list made only of repeated 'none' collapses to a single
                # 'none' rather than becoming empty.
                item['labels']['categories'] = real_cats or ['none']
                # Records without a metadata dict get one instead of crashing.
                item.setdefault('metadata', {})['cleaned_none_mix'] = True
                fixed_count += 1

            cleaned_data.append(item)

    # Only rewrite a trailing '.jsonl'. A blanket str.replace would also
    # mangle directory names containing '.jsonl', and would leave the path
    # unchanged (overwriting the input) when the suffix is absent.
    suffix = '.jsonl'
    if filepath.endswith(suffix):
        output_path = filepath[:-len(suffix)] + '_cleaned' + suffix
    else:
        output_path = filepath + '_cleaned' + suffix

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(item) + '\n' for item in cleaned_data)

    print(f"Cleaned {len(cleaned_data)} items. Fixed {fixed_count} issues.")
    print(f"Saved to {output_path}")
if __name__ == "__main__":
    # Script entry point: expects the JSONL file path as the first argument.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python3 clean_batch.py <jsonl_file>")
        sys.exit(1)

    clean_batch(cli_args[0])
|