MuratcanKoylan's picture
Upload folder using huggingface_hub
685d968 verified
import json
import os
import sys
def clean_batch(filepath):
    """Strip a contradictory 'none' label from multi-category JSONL records.

    Each non-blank line of *filepath* is parsed as a JSON object. When a
    record's ``labels.categories`` list contains 'none' alongside other
    entries, every 'none' entry is dropped and ``metadata.cleaned_none_mix``
    is set to True. All records (changed or not) are then written, one JSON
    object per line, to a sibling file named by inserting ``_cleaned`` before
    the extension (``batch.jsonl`` -> ``batch_cleaned.jsonl``).

    Args:
        filepath: Path to the input JSON Lines file.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``labels`` or ``labels.categories``.
    """
    print(f"Cleaning {filepath}...")

    cleaned_data = []
    fixed_count = 0

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue

            item = json.loads(line)
            cats = item['labels']['categories']

            if 'none' in cats and len(cats) > 1:
                # .get() so a missing id cannot crash a purely informational log.
                print(f"Fixing mixed 'none' in {item.get('scenario_id')}: {cats}")
                # Drop *every* 'none' (list.remove would leave duplicates);
                # if the list was nothing but 'none's, collapse to a single one
                # rather than leaving the record without any category.
                remaining = [c for c in cats if c != 'none'] or ['none']
                item['labels']['categories'] = remaining
                # Records without a metadata object get one created on demand.
                item.setdefault('metadata', {})['cleaned_none_mix'] = True
                fixed_count += 1

            cleaned_data.append(item)

    # Derive the output name from the real extension only. A plain
    # str.replace('.jsonl', ...) returns the input path unchanged for other
    # extensions (overwriting the input) and also rewrites directory names.
    root, ext = os.path.splitext(filepath)
    output_path = f"{root}_cleaned{ext}"

    with open(output_path, 'w', encoding='utf-8') as f:
        for item in cleaned_data:
            f.write(json.dumps(item) + '\n')

    print(f"Cleaned {len(cleaned_data)} items. Fixed {fixed_count} issues.")
    print(f"Saved to {output_path}")
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the JSONL file
    # to clean; without it, show usage and exit with a non-zero status.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python3 clean_batch.py <jsonl_file>")
        sys.exit(1)

    clean_batch(cli_args[0])