# Source: MuSProt/backend/scripts/generate_node_lookup.py
# Last commit 2d33606 (WinslowFan): Add chain_composition + binding status; refresh DB, docs, Fidelity plot
"""Generate an indexed read-only node lookup sidecar from MuSProt.db."""
from __future__ import annotations
import argparse
import sqlite3
import zlib
from pathlib import Path
NODE_COLUMNS = (
"pdb_id", "auth_asym_id", "base_label", "sequence", "sequence_length",
"CATH_ID", "cath_superfamily", "Rosetta", "FoldX", "EvoEF2", "RW", "RW+",
"ranked_functions", "state_id", "experimental_method", "pH", "temp_K",
"chain_composition",
)
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("source", type=Path)
parser.add_argument("-o", "--output", type=Path, default=Path("MuSProt-node-lookup.db"))
args = parser.parse_args()
output = args.output.resolve()
output.unlink(missing_ok=True)
conn = sqlite3.connect(output)
try:
conn.execute("PRAGMA journal_mode = OFF")
conn.execute("PRAGMA synchronous = OFF")
conn.execute(
'CREATE TABLE node (pdb_id, auth_asym_id, base_label, sequence BLOB,'
' sequence_length, CATH_ID, cath_superfamily, Rosetta, FoldX, EvoEF2,'
' RW, "RW+", ranked_functions BLOB, state_id, experimental_method, pH, temp_K,'
' chain_composition)'
)
source = sqlite3.connect(f"file:{args.source.resolve()}?mode=ro", uri=True)
try:
select_columns = ", ".join(f'"{column}"' for column in NODE_COLUMNS)
rows = source.execute(f"SELECT {select_columns} FROM node")
placeholders = ", ".join("?" for _ in NODE_COLUMNS)
batch = []
for row in rows:
row = list(row)
for index in (3, 12):
row[index] = zlib.compress((row[index] or "").encode("utf-8"), 1)
batch.append(row)
if len(batch) == 1000:
conn.executemany(f"INSERT INTO node VALUES ({placeholders})", batch)
batch.clear()
if batch:
conn.executemany(f"INSERT INTO node VALUES ({placeholders})", batch)
finally:
source.close()
conn.execute(
"CREATE INDEX idx_node_chain "
"ON node(LOWER(pdb_id), UPPER(auth_asym_id))"
)
conn.execute("ANALYZE")
conn.commit()
finally:
conn.close()
print(f"Created {output} ({output.stat().st_size:,} bytes)")
# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()