"""Generate an indexed read-only node lookup sidecar from MuSProt.db.""" from __future__ import annotations import argparse import sqlite3 import zlib from pathlib import Path NODE_COLUMNS = ( "pdb_id", "auth_asym_id", "base_label", "sequence", "sequence_length", "CATH_ID", "cath_superfamily", "Rosetta", "FoldX", "EvoEF2", "RW", "RW+", "ranked_functions", "state_id", "experimental_method", "pH", "temp_K", "chain_composition", ) def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("source", type=Path) parser.add_argument("-o", "--output", type=Path, default=Path("MuSProt-node-lookup.db")) args = parser.parse_args() output = args.output.resolve() output.unlink(missing_ok=True) conn = sqlite3.connect(output) try: conn.execute("PRAGMA journal_mode = OFF") conn.execute("PRAGMA synchronous = OFF") conn.execute( 'CREATE TABLE node (pdb_id, auth_asym_id, base_label, sequence BLOB,' ' sequence_length, CATH_ID, cath_superfamily, Rosetta, FoldX, EvoEF2,' ' RW, "RW+", ranked_functions BLOB, state_id, experimental_method, pH, temp_K,' ' chain_composition)' ) source = sqlite3.connect(f"file:{args.source.resolve()}?mode=ro", uri=True) try: select_columns = ", ".join(f'"{column}"' for column in NODE_COLUMNS) rows = source.execute(f"SELECT {select_columns} FROM node") placeholders = ", ".join("?" for _ in NODE_COLUMNS) batch = [] for row in rows: row = list(row) for index in (3, 12): row[index] = zlib.compress((row[index] or "").encode("utf-8"), 1) batch.append(row) if len(batch) == 1000: conn.executemany(f"INSERT INTO node VALUES ({placeholders})", batch) batch.clear() if batch: conn.executemany(f"INSERT INTO node VALUES ({placeholders})", batch) finally: source.close() conn.execute( "CREATE INDEX idx_node_chain " "ON node(LOWER(pdb_id), UPPER(auth_asym_id))" ) conn.execute("ANALYZE") conn.commit() finally: conn.close() print(f"Created {output} ({output.stat().st_size:,} bytes)") if __name__ == "__main__": main()