| |
| |
| |
| |
|
|
|
|
| import pathlib |
| import json |
| import lmdb |
|
|
| from xmlparser import XMLRawDataset, ListRawDataset |
|
|
| import argparse |
|
|
|
|
class Env:
    """Buffered writer around an LMDB environment.

    Key/value pairs are staged in ``self.cache`` and committed to LMDB in a
    single write transaction every ``interval_writeCache`` completed lines
    (see ``finish_line``), or whenever ``writeCache`` is called explicitly.

    Attributes are plain and public because callers fill ``cache`` and read
    ``n`` directly.
    """

    # Value passed to lmdb.open as map_size: 2**40 bytes (1 TiB).
    _MAP_SIZE = 1 << 40

    def __init__(self, output_path, interval_writeCache=1000):
        """Open (or create) the LMDB environment at ``output_path``.

        Args:
            output_path: Location of the LMDB environment; converted with
                ``str()`` before being handed to ``lmdb.open``.
            interval_writeCache: Number of completed lines between automatic
                flushes of the cache. Must be at least 1.

        Raises:
            ValueError: If ``interval_writeCache`` is smaller than 1.
        """
        if interval_writeCache < 1:
            raise ValueError(
                f'interval_writeCache must be >= 1, got {interval_writeCache!r}'
            )
        self.output_path = output_path
        self.env = lmdb.open(str(output_path), map_size=self._MAP_SIZE)
        self.cache = {}  # pending key -> value pairs, not yet committed
        self.n = 0  # number of lines completed so far
        self.interval = interval_writeCache

    def finish_line(self):
        """Count one completed line and flush the cache on every interval."""
        self.n += 1
        # Honor the configured interval; the previous code hard-coded 1000
        # here, which made the constructor argument a no-op.
        if self.n % self.interval == 0:
            self.writeCache()

    def writeCache(self):
        """Commit every cached pair in one write transaction, then reset the cache."""
        with self.env.begin(write=True) as txn:
            for key, value in self.cache.items():
                txn.put(key, value)
        self.cache = {}
        print(f'Written {self.n} lines @ {self.output_path}')

    def close(self):
        """Close the underlying LMDB environment.

        Pending cache entries are NOT flushed; call ``writeCache`` first.
        """
        self.env.close()
|
|
|
|
def createDataset(input_path, output_path, db_type='xml', dry_run=False):
    """Convert a raw dataset into an LMDB database under ``output_path``.

    For every ``(image, line)`` pair yielded by the source dataset, four
    entries are written, keyed by the zero-padded 9-digit line index:
    ``<idx>-direction``, ``<idx>-label``, ``<idx>-cattrs`` (JSON) and
    ``<idx>-image``. Two metadata keys, ``dbtype`` and ``n_line``, are
    written as well.

    Args:
        input_path: Source location(s), passed unchanged to
            ``XMLRawDataset.from_list`` or ``ListRawDataset``.
        output_path: Directory of the LMDB environment; created (with
            parents) if it does not exist.
        db_type: ``'xml'`` or ``'list'``; selects the source dataset class.
        dry_run: If true, only construct the source dataset and return
            without writing a database. The output directory is still
            created, as before.

    Raises:
        ValueError: If ``db_type`` is neither ``'xml'`` nor ``'list'``.
    """
    # Fail fast, before touching the filesystem; previously an unknown type
    # left ``generator`` unbound and only failed once iteration started.
    if db_type not in ('xml', 'list'):
        raise ValueError(
            f"Unsupported db_type {db_type!r}; expected 'xml' or 'list'"
        )

    out_dir = pathlib.Path(output_path)
    out_dir.mkdir(parents=True, exist_ok=True)

    if db_type == 'xml':
        generator = XMLRawDataset.from_list(input_path, image_type=XMLRawDataset.IMAGE_TYPE_ENCODED)
    else:
        generator = ListRawDataset(input_path, image_type=XMLRawDataset.IMAGE_TYPE_ENCODED)
    if dry_run:
        return

    # Open the database in the directory created above. The previous code
    # passed ``output_path[0]``, i.e. only the first character of the path.
    env = Env(out_dir)
    try:
        # NOTE(review): the marker is always 'xml', even for db_type='list'.
        # Presumably both sources share one record layout -- confirm against
        # the code that reads this database before changing it.
        env.cache['dbtype'.encode()] = 'xml'.encode()

        for image, line in generator:
            prefix = f'{env.n:09d}'
            env.cache[f'{prefix}-direction'.encode()] = line.get('direction').encode()
            env.cache[f'{prefix}-label'.encode()] = line.get('label').encode()
            env.cache[f'{prefix}-cattrs'.encode()] = json.dumps(line.get('cattrs')).encode()
            # Stored as-is; assumed to already be bytes since the dataset is
            # requested with IMAGE_TYPE_ENCODED -- TODO confirm.
            env.cache[f'{prefix}-image'.encode()] = image
            env.finish_line()

        env.cache['n_line'.encode()] = str(env.n).encode()
        # Final flush covers the tail that did not reach a full interval.
        env.writeCache()
    finally:
        # Release the LMDB handle even if the source dataset raises midway.
        env.env.close()
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: parse the options and build the database.
    cli = argparse.ArgumentParser()
    # One or more input locations, forwarded to createDataset as a list.
    cli.add_argument('--input_path', nargs='+', required=True)
    # Single output directory for the LMDB environment.
    cli.add_argument('--output_path', required=True)
    cli.add_argument('--db_type', default='xml', choices=['xml', 'list'])
    # argparse exposes '--dry-run' as the attribute 'dry_run'.
    cli.add_argument('--dry-run', action='store_true')
    args = cli.parse_args()
    createDataset(
        args.input_path,
        args.output_path,
        args.db_type,
        dry_run=args.dry_run,
    )
|
|