| """Generate an indexed read-only node lookup sidecar from MuSProt.db.""" | |
| from __future__ import annotations | |
| import argparse | |
| import sqlite3 | |
| import zlib | |
| from pathlib import Path | |
# Columns copied from the source ``node`` table into the sidecar.
# Order matters: rows are selected and inserted positionally, and main()
# stores the ``sequence`` and ``ranked_functions`` values zlib-compressed.
NODE_COLUMNS = (
    "pdb_id",
    "auth_asym_id",
    "base_label",
    "sequence",
    "sequence_length",
    "CATH_ID",
    "cath_superfamily",
    "Rosetta",
    "FoldX",
    "EvoEF2",
    "RW",
    "RW+",  # not a bare SQL identifier; must be double-quoted in statements
    "ranked_functions",
    "state_id",
    "experimental_method",
    "pH",
    "temp_K",
    "chain_composition",
)
def _compress_text(value: str | None) -> bytes:
    """Return *value* as a zlib-compressed UTF-8 blob.

    ``None`` (SQL NULL) and the empty string both yield the compressed empty
    string. Compression level 1 is used.
    """
    return zlib.compress((value or "").encode("utf-8"), 1)


def _write_sidecar(
    output: Path, rows: sqlite3.Cursor, compressed: tuple[int, ...]
) -> None:
    """Create the sidecar database at *output* and fill it from *rows*.

    Args:
        output: Path of the database file to create; it must not exist yet.
        rows: Cursor yielding source rows whose values are in
            ``NODE_COLUMNS`` order.
        compressed: Positions within each row whose text value is replaced
            by its zlib-compressed form before insertion.

    Raises:
        sqlite3.Error: If reading *rows* or writing the sidecar fails.
    """
    column_list = ", ".join(f'"{column}"' for column in NODE_COLUMNS)
    placeholders = ", ".join("?" for _ in NODE_COLUMNS)
    # Naming the columns keeps the insert correct even if the table
    # definition below and NODE_COLUMNS ever disagree on order.
    insert_sql = f"INSERT INTO node ({column_list}) VALUES ({placeholders})"

    conn = sqlite3.connect(output)
    try:
        # The file is built from scratch and discarded by the caller on
        # failure, so the rollback journal and fsyncs are skipped.
        conn.execute("PRAGMA journal_mode = OFF")
        conn.execute("PRAGMA synchronous = OFF")
        conn.execute(
            'CREATE TABLE node (pdb_id, auth_asym_id, base_label, sequence BLOB,'
            ' sequence_length, CATH_ID, cath_superfamily, Rosetta, FoldX, EvoEF2,'
            ' RW, "RW+", ranked_functions BLOB, state_id, experimental_method, pH, temp_K,'
            ' chain_composition)'
        )
        # Copy in batches of 1000 rows so memory use stays bounded.
        while chunk := rows.fetchmany(1000):
            batch = []
            for record in chunk:
                record = list(record)
                for position in compressed:
                    record[position] = _compress_text(record[position])
                batch.append(record)
            conn.executemany(insert_sql, batch)
        conn.execute(
            "CREATE INDEX idx_node_chain "
            "ON node(LOWER(pdb_id), UPPER(auth_asym_id))"
        )
        conn.execute("ANALYZE")
        conn.commit()
    finally:
        conn.close()


def main() -> None:
    """Build the node lookup sidecar database from a source SQLite file.

    Command-line arguments:
        source: Path to the source database; it is opened read-only.
        -o/--output: Path of the sidecar to create (default
            ``MuSProt-node-lookup.db``). An existing file at that path is
            replaced.

    The sidecar holds one ``node`` table with the columns listed in
    ``NODE_COLUMNS``. ``sequence`` and ``ranked_functions`` are stored as
    zlib-compressed UTF-8 blobs, and an expression index on
    ``(LOWER(pdb_id), UPPER(auth_asym_id))`` is created.

    Nothing is deleted until the source has been opened and the SELECT has
    been accepted. If the copy then fails, the partial output is removed.

    Raises:
        SystemExit: Via ``argparse`` when the source file is missing or when
            the output path refers to the source file itself.
        sqlite3.Error: If the source cannot be read or the sidecar cannot be
            written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source", type=Path)
    parser.add_argument("-o", "--output", type=Path, default=Path("MuSProt-node-lookup.db"))
    args = parser.parse_args()

    source_path = args.source.resolve()
    output = args.output.resolve()
    if not source_path.is_file():
        parser.error(f"source database not found: {source_path}")
    # The output is unlinked below; refuse to do that to the source itself.
    # samefile() also catches hard links and case-insensitive path aliases.
    if output.exists() and output.samefile(source_path):
        parser.error("output must not be the same file as the source database")

    compressed = tuple(
        NODE_COLUMNS.index(name) for name in ("sequence", "ranked_functions")
    )
    select_columns = ", ".join(f'"{column}"' for column in NODE_COLUMNS)

    # as_uri() percent-encodes characters such as '#', '?', '%' and spaces
    # that would otherwise be misread as URI syntax.
    source = sqlite3.connect(f"{source_path.as_uri()}?mode=ro", uri=True)
    try:
        # Preparing the query first proves the source is a readable database
        # with the expected table and columns before the old output is removed.
        rows = source.execute(f"SELECT {select_columns} FROM node")
        output.unlink(missing_ok=True)
        try:
            _write_sidecar(output, rows, compressed)
        except BaseException:
            # With journalling off a partial file cannot be rolled back, so
            # do not leave it behind looking like a finished sidecar.
            output.unlink(missing_ok=True)
            raise
    finally:
        source.close()
    print(f"Created {output} ({output.stat().st_size:,} bytes)")
# Run the conversion only when executed as a script, so importing this
# module has no side effects.
if __name__ == "__main__":
    main()