File size: 2,483 Bytes
2d33606
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""Generate an indexed read-only node lookup sidecar from MuSProt.db."""
from __future__ import annotations

import argparse
import sqlite3
import zlib
from pathlib import Path


# Columns copied, in this order, from the source ``node`` table into the sidecar.
NODE_COLUMNS = (
    "pdb_id", "auth_asym_id", "base_label", "sequence", "sequence_length",
    "CATH_ID", "cath_superfamily", "Rosetta", "FoldX", "EvoEF2", "RW", "RW+",
    "ranked_functions", "state_id", "experimental_method", "pH", "temp_K",
    "chain_composition",
)

# Text columns that are stored zlib-compressed (declared BLOB) in the sidecar.
COMPRESSED_COLUMNS = ("sequence", "ranked_functions")
_COMPRESSED_INDEXES = tuple(NODE_COLUMNS.index(name) for name in COMPRESSED_COLUMNS)


def _pack_row(row):
    """Return *row* as a list with every compressed column zlib-deflated.

    ``None`` and empty values are stored as the compression of an empty
    string, so the column always holds a decompressible blob.
    """
    packed = list(row)
    for index in _COMPRESSED_INDEXES:
        text = packed[index] or ""
        # Level 1 is zlib's fastest setting.
        packed[index] = zlib.compress(text.encode("utf-8"), 1)
    return packed


def _write_sidecar(output, rows) -> None:
    """Create the sidecar database at *output* and fill it from *rows*.

    Args:
        output: Path of the database file to create; it must not exist yet.
        rows: Iterable of rows whose values follow ``NODE_COLUMNS`` order.
    """
    # The schema is derived from NODE_COLUMNS so the CREATE TABLE, the SELECT
    # on the source and the positional INSERT can never drift apart.
    column_defs = ", ".join(
        f'"{name}" BLOB' if name in COMPRESSED_COLUMNS else f'"{name}"'
        for name in NODE_COLUMNS
    )
    placeholders = ", ".join("?" for _ in NODE_COLUMNS)

    conn = sqlite3.connect(output)
    try:
        # The file is built from scratch and discarded on failure, so the
        # rollback journal and fsyncs are switched off.
        conn.execute("PRAGMA journal_mode = OFF")
        conn.execute("PRAGMA synchronous = OFF")
        conn.execute(f"CREATE TABLE node ({column_defs})")
        # executemany consumes the iterator lazily, so rows are streamed
        # from the source without being materialised in memory.
        conn.executemany(
            f"INSERT INTO node VALUES ({placeholders})",
            map(_pack_row, rows),
        )
        conn.execute(
            "CREATE INDEX idx_node_chain "
            "ON node(LOWER(pdb_id), UPPER(auth_asym_id))"
        )
        conn.execute("ANALYZE")
        conn.commit()
    finally:
        conn.close()


def main() -> None:
    """Build the node lookup sidecar database from the command line.

    Reads the ``NODE_COLUMNS`` of the ``node`` table from the ``source``
    database (opened read-only), compresses the ``COMPRESSED_COLUMNS`` and
    writes the result, with an index on ``(LOWER(pdb_id),
    UPPER(auth_asym_id))``, to ``--output``. Any existing output file is
    replaced, but only after the source has been opened and queried.

    Raises:
        SystemExit: If the arguments are invalid or ``--output`` resolves to
            the source database itself.
        sqlite3.Error: If the source cannot be read or the output cannot be
            written; a partially written output file is removed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source", type=Path)
    parser.add_argument("-o", "--output", type=Path, default=Path("MuSProt-node-lookup.db"))
    args = parser.parse_args()

    source_path = args.source.resolve()
    output = args.output.resolve()
    if source_path == output:
        # Writing over the source would delete the very data being read.
        parser.error(f"--output must not be the source database ({output})")

    select_columns = ", ".join(f'"{column}"' for column in NODE_COLUMNS)
    # as_uri() percent-encodes characters such as '#', '?' and '%' that would
    # otherwise be parsed as URI syntax and make SQLite open the wrong file.
    source = sqlite3.connect(f"{source_path.as_uri()}?mode=ro", uri=True)
    try:
        # Query the source before touching the output: a missing or invalid
        # source must not cost the user an existing sidecar.
        rows = source.execute(f"SELECT {select_columns} FROM node")
        output.unlink(missing_ok=True)
        try:
            _write_sidecar(output, rows)
        except BaseException:
            # Written without a journal, a half-built file is unusable.
            output.unlink(missing_ok=True)
            raise
    finally:
        source.close()

    print(f"Created {output} ({output.stat().st_size:,} bytes)")


# Script entry point: run the generator only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()