|
|
|
|
|
import argparse
import json
import os

import yaml
|
|
|
|
|
def filter_jsonl(input_file, output_file, fields_to_keep):
    """Copy a JSON Lines file, keeping only selected top-level keys.

    Every non-blank line of ``input_file`` is parsed as JSON and written to
    ``output_file`` as one JSON object containing only the keys named in
    ``fields_to_keep``. Keys missing from a record are omitted rather than
    written as ``null``; blank (whitespace-only) lines are dropped. Output
    keys follow the order of ``fields_to_keep`` and non-ASCII characters are
    written unescaped.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 JSONL file to create or overwrite.
        fields_to_keep: Iterable of key names to retain.

    Raises:
        ValueError: If ``output_file`` is the same file as ``input_file``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If either file cannot be opened.
    """
    # Opening the output in 'w' mode truncates it immediately. If it is the
    # same regular file as the input, every record would be destroyed before
    # it could be read, so refuse instead of silently losing the data.
    if os.path.isfile(output_file) and os.path.samefile(input_file, output_file):
        raise ValueError(
            f"input and output refer to the same file: {input_file!r}"
        )

    # Materialise once so a one-shot iterable (e.g. a generator) is applied
    # to every record instead of being exhausted by the first one.
    fields = tuple(fields_to_keep)

    with open(input_file, 'r', encoding='utf-8') as fin, \
            open(output_file, 'w', encoding='utf-8') as fout:
        for line in fin:
            if not line.strip():
                continue  # skip blank lines
            record = json.loads(line)
            filtered = {k: record[k] for k in fields if k in record}
            fout.write(json.dumps(filtered, ensure_ascii=False) + '\n')
|
|
|
|
|
def main():
    """Command-line entry point: filter a JSONL file using a YAML config.

    Parses ``--input``, ``--output`` and ``--config`` (default
    ``config.yaml``), loads the config with ``yaml.safe_load``, and passes
    the list stored under its ``campos_filter`` key to ``filter_jsonl``.
    Prints a confirmation line naming the output path on success. Exits
    through ``parser.error`` when the config has no usable ``campos_filter``
    list.
    """
    parser = argparse.ArgumentParser(
        description='Keep only the configured fields of each JSONL record.'
    )
    parser.add_argument('--input', required=True,
                        help='path of the JSONL file to read')
    parser.add_argument('--output', required=True,
                        help='path of the JSONL file to write')
    parser.add_argument('--config', default='config.yaml',
                        help='YAML file with a "campos_filter" list '
                             '(default: %(default)s)')
    args = parser.parse_args()

    # Decode explicitly as UTF-8 so non-ASCII field names are read the same
    # way on every platform instead of depending on the locale default.
    with open(args.config, encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # safe_load returns None for an empty file and may return a non-mapping;
    # report a bad config as a usage error rather than a raw traceback.
    fields = config.get('campos_filter') if isinstance(config, dict) else None
    if not isinstance(fields, list):
        parser.error(
            f"{args.config}: 'campos_filter' must be a list of field names"
        )

    filter_jsonl(args.input, args.output, fields)
    print(f"✅ Filtrado: {args.output}")
|
|
|
|
|
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|