""" cpptraj RAG knowledge base — built from the real CpptrajManual.pdf. Pipeline: 1. Extract text from PDF with pdfplumber (cached to cpptraj_manual_cache.json) 2. Split into per-command chunks using section-header heuristics 3. TF-IDF index (scikit-learn) for fast semantic-ish retrieval 4. Thin structured command registry for the left-panel UI (unchanged look) """ import json import re from pathlib import Path import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity # ───────────────────────────────────────────────────────────────────────────── # PATHS # ───────────────────────────────────────────────────────────────────────────── _HERE = Path(__file__).parent.parent # CPPTRAJ_Agent/ PDF_PATH = _HERE / "CpptrajManual.pdf" CACHE_PATH = _HERE / "cpptraj_manual_cache.json" # ───────────────────────────────────────────────────────────────────────────── # CPPTRAJ COMMANDS # ───────────────────────────────────────────────────────────────────────────── CPPTRAJ_COMMANDS = { # ── Setup / Input ──────────────────────────────────────────────────────── "parm": {"category": "Setup", "title": "Load Topology (parm)", "description": "Load a topology/parameter file (.prmtop, .psf, .gro, .pdb). Must be the first command.", "syntax": "parm [] [nobondsearch]"}, "trajin": {"category": "Setup", "title": "Load Trajectory (trajin)", "description": "Load trajectory file(s). Multiple trajin statements concatenate frames. Use start/stop/offset to sub-sample.", "syntax": "trajin [start] [stop|last] [offset]"}, "reference": {"category": "Setup", "title": "Load Reference (reference)", "description": "Load a reference structure used by rmsd, align, nativecontacts.", "syntax": "reference [] []"}, "activeref": {"category": "Setup", "title": "Set Active Reference (activeref)", "description": "Set the active reference structure by tag.", "syntax": "activeref "}, "createcrd": {"category": "Setup", "title": "Create COORDS Set (createcrd)", "description": "Create an empty COORDS data set for in-memory trajectory storage.", "syntax": "createcrd "}, "createreservoir": {"category": "Setup", "title": "Create Reservoir (createreservoir)", "description": "Create structure reservoir for REST simulation.", "syntax": "createreservoir [] [] [ene ] [temp ]"}, "createset": {"category": "Setup", "title": "Create Data Set (createset)", "description": "Create a new data set with specified values.", "syntax": "createset name type [values ,,...] []"}, "loadcrd": {"category": "Setup", "title": "Load COORDS (loadcrd)", "description": "Load trajectory into a named COORDS data set for later use.", "syntax": "loadcrd [] [] name "}, "loadtraj": {"category": "Setup", "title": "Load Trajectory (loadtraj)", "description": "Load trajectory (alias for trajin inside scripts).", "syntax": "loadtraj [] []"}, "readdata": {"category": "Setup", "title": "Read Data (readdata)", "description": "Read data from file into data sets for analysis.", "syntax": "readdata [as ] [name ] [index ]"}, "go": {"category": "Setup", "title": "Execute (go)", "description": "Execute all queued commands. Required at end of every script.", "syntax": "go"}, # ── Output ─────────────────────────────────────────────────────────────── "trajout": {"category": "Output", "title": "Write Trajectory (trajout)", "description": "Write processed trajectory to a new file. Format auto-detected from extension.", "syntax": "trajout [format] [nobox]"}, "outtraj": {"category": "Output", "title": "Write Frames On-the-fly (outtraj)", "description": "Write frames to trajectory file during processing.", "syntax": "outtraj [] [] [nobox] [onlyframes ]"}, "crdout": {"category": "Output", "title": "Write COORDS Set (crdout)", "description": "Write a COORDS data set to a trajectory file.", "syntax": "crdout [] []"}, "parmwrite": {"category": "Output", "title": "Write Topology (parmwrite)", "description": "Write topology to file in specified format.", "syntax": "parmwrite out [] []"}, "datafile": {"category": "Output", "title": "Data File Options (datafile)", "description": "Set output options for a data file.", "syntax": "datafile []"}, "datafilter": {"category": "Output", "title": "Filter Data (datafilter)", "description": "Filter data sets by criteria and write to file.", "syntax": "datafilter min max [out ]"}, "dataset": {"category": "Output", "title": "Data Set Operations (dataset)", "description": "Perform operations on data sets: legend, makexy, etc.", "syntax": "dataset {legend | makexy name | ...}"}, "flatten": {"category": "Output", "title": "Flatten Data (flatten)", "description": "Flatten multi-dimensional data sets to 1D.", "syntax": "flatten [out ]"}, "precision": {"category": "Output", "title": "Output Precision (precision)", "description": "Set output precision for data files.", "syntax": "precision []"}, "selectds": {"category": "Output", "title": "Select Data Sets (selectds)", "description": "Select data sets matching a string pattern.", "syntax": "selectds "}, # ── Manipulation / Actions ─────────────────────────────────────────────── "autoimage": {"category": "Manipulation", "title": "Fix PBC Imaging (autoimage)", "description": "Re-image molecules across periodic boundaries back into the primary unit cell. Always strip :WAT first.", "syntax": "autoimage [familiar] [byres|bymol] [anchor ]"}, "center": {"category": "Manipulation", "title": "Center System (center)", "description": "Translate coordinates so that specified atoms are at the origin or box center.", "syntax": "center [] [origin] [mass]"}, "strip": {"category": "Manipulation", "title": "Strip Atoms (strip)", "description": "Remove atoms/residues/molecules from the trajectory.", "syntax": "strip "}, "align": {"category": "Manipulation", "title": "Align Trajectory (align)", "description": "Rotate and translate frames to least-squares-fit selected atoms to a reference. Modifies coordinates.", "syntax": "align [] [ref |reference|first] [mass]"}, "image": {"category": "Manipulation", "title": "Image Molecules (image)", "description": "Image molecules into primary unit cell. Use autoimage for automatic imaging.", "syntax": "image [familiar] [bymol|byres|byatom] [] [origin] [center]"}, "unwrap": {"category": "Manipulation", "title": "Unwrap Trajectory (unwrap)", "description": "Unwrap trajectory to remove periodic boundary jumps.", "syntax": "unwrap [] [center] [bymol|byres]"}, "unstrip": {"category": "Manipulation", "title": "Restore Stripped Atoms (unstrip)", "description": "Restore previously stripped atoms back to the system.", "syntax": "unstrip"}, "translate": {"category": "Manipulation", "title": "Translate Coordinates (translate)", "description": "Translate coordinates by a vector.", "syntax": "translate [] [x ] [y ] [z ]"}, "rotate": {"category": "Manipulation", "title": "Rotate Coordinates (rotate)", "description": "Rotate coordinates around an axis.", "syntax": "rotate [] {axis degrees | x|y|z }"}, "scale": {"category": "Manipulation", "title": "Scale Coordinates (scale)", "description": "Scale coordinates by a factor along x/y/z.", "syntax": "scale [] [x ] [y ] [z ]"}, "box": {"category": "Manipulation", "title": "Set Box Dimensions (box)", "description": "Set or modify unit cell box dimensions.", "syntax": "box [x ] [y ] [z ] [alpha ] [beta ] [gamma ] [nobox]"}, "closest": {"category": "Manipulation", "title": "Keep Closest Solvent (closest)", "description": "Keep N closest solvent molecules to solute, remove the rest.", "syntax": "closest [noimage] [first|oxygen] [name ]"}, "addatom": {"category": "Manipulation", "title": "Add Atom (addatom)", "description": "Add atoms to the topology.", "syntax": "addatom {bond | nobond} []"}, "atommap": {"category": "Manipulation", "title": "Map Atoms (atommap)", "description": "Map atoms between two structures/topologies.", "syntax": "atommap [mapout ] [maponly]"}, "catcrd": {"category": "Manipulation", "title": "Concatenate COORDS (catcrd)", "description": "Concatenate multiple COORDS data sets.", "syntax": "catcrd [crdset ] [crdset ] ... name "}, "change": {"category": "Manipulation", "title": "Change Topology Properties (change)", "description": "Change topology atom/residue names, types, or other properties.", "syntax": "change {resname from to | atomname from to | ...}"}, "charge": {"category": "Manipulation", "title": "Print Total Charge (charge)", "description": "Print total charge for atom selection.", "syntax": "charge []"}, "checkchirality": {"category": "Manipulation", "title": "Check Chirality (checkchirality)", "description": "Check chirality of chiral centers.", "syntax": "checkchirality [] [out ]"}, "combinecrd": {"category": "Manipulation", "title": "Combine COORDS (combinecrd)", "description": "Combine two COORDS sets into one.", "syntax": "combinecrd name "}, "comparetop": {"category": "Manipulation", "title": "Compare Topologies (comparetop)", "description": "Compare two topology files.", "syntax": "comparetop [parm1 ] [parm2 ]"}, "crdaction": {"category": "Manipulation", "title": "Apply Action to COORDS (crdaction)", "description": "Apply an action to a COORDS data set.", "syntax": "crdaction []"}, "crdfluct": {"category": "Manipulation", "title": "COORDS Fluctuations (crdfluct)", "description": "Calculate fluctuations of a COORDS data set.", "syntax": "crdfluct [] [out ] [byres] [bfactor]"}, "crdtransform": {"category": "Manipulation", "title": "Transform COORDS (crdtransform)", "description": "Apply coordinate transformation to a COORDS set.", "syntax": "crdtransform []"}, "dihedralscan": {"category": "Manipulation", "title": "Dihedral Scan (dihedralscan)", "description": "Scan dihedral angles to generate conformations.", "syntax": "dihedralscan [] [rseed ] [out ] [outtraj ]"}, "emin": {"category": "Manipulation", "title": "Energy Minimization (emin)", "description": "Energy minimization using internal force field.", "syntax": "emin [] [nstep ] [out ] [step ]"}, "fiximagedbonds": {"category": "Manipulation", "title": "Fix Imaged Bonds (fiximagedbonds)", "description": "Fix broken bonds across periodic boundaries.", "syntax": "fiximagedbonds []"}, "fixatomorder": {"category": "Manipulation", "title": "Fix Atom Order (fixatomorder)", "description": "Reorder atoms to match topology.", "syntax": "fixatomorder [] [outprefix ]"}, "graft": {"category": "Manipulation", "title": "Graft Coordinates (graft)", "description": "Graft coordinates from one structure onto another.", "syntax": "graft [src ] [tgt ] [srcframe ] [mass]"}, "hmassrepartition": {"category": "Manipulation", "title": "H-mass Repartition (hmassrepartition)", "description": "Hydrogen mass repartitioning for longer MD timesteps.", "syntax": "hmassrepartition [] [factor ]"}, "lessplit": {"category": "Manipulation", "title": "Split LES Trajectory (lessplit)", "description": "Split LES trajectory into individual replicas.", "syntax": "lessplit [out ] []"}, "makestructure": {"category": "Manipulation", "title": "Build Structure (makestructure)", "description": "Build structure using idealized geometry.", "syntax": "makestructure :[,...] [out ]"}, "minimage": {"category": "Manipulation", "title": "Minimum Image (minimage)", "description": "Apply minimum image convention for periodic distance.", "syntax": "minimage [SETNAME] [out ]"}, "molinfo": {"category": "Manipulation", "title": "Molecule Info (molinfo)", "description": "Print molecular information for atom mask.", "syntax": "molinfo [] []"}, "parmbox": {"category": "Manipulation", "title": "Set Topology Box (parmbox)", "description": "Set periodic box dimensions in topology.", "syntax": "parmbox {x y z [alpha beta gamma ] | nobox}"}, "parminfo": {"category": "Manipulation", "title": "Topology Info (parminfo)", "description": "Print topology information summary.", "syntax": "parminfo [] []"}, "parmstrip": {"category": "Manipulation", "title": "Strip Topology (parmstrip)", "description": "Strip atoms from topology file permanently.", "syntax": "parmstrip []"}, "permutedihedrals": {"category": "Manipulation", "title": "Permute Dihedrals (permutedihedrals)", "description": "Randomly permute dihedral angles.", "syntax": "permutedihedrals [] [rseed ] [out ]"}, "prepareforleap": {"category": "Manipulation", "title": "Prepare for LEaP (prepareforleap)", "description": "Prepare structure for LEaP (add missing atoms, fix naming).", "syntax": "prepareforleap [] [out ] [pdbout ]"}, "randomizeions": {"category": "Manipulation", "title": "Randomize Ions (randomizeions)", "description": "Randomly swap ions with solvent molecules.", "syntax": "randomizeions [by ] [around ] [min ] [rseed ]"}, "remap": {"category": "Manipulation", "title": "Remap Atom Order (remap)", "description": "Remap atom ordering to match a reference.", "syntax": "remap [] "}, "replicatecell": {"category": "Manipulation", "title": "Replicate Unit Cell (replicatecell)", "description": "Replicate the unit cell in 3D.", "syntax": "replicatecell [] [out ] {all | dir X Y Z}"}, "resinfo": {"category": "Manipulation", "title": "Residue Info (resinfo)", "description": "Print residue information: resid, resname, atom count, etc.", "syntax": "resinfo [] []"}, "rotatedihedral": {"category": "Manipulation", "title": "Rotate Dihedral (rotatedihedral)", "description": "Rotate a specific dihedral angle to a target value.", "syntax": "rotatedihedral [] res type val "}, "scale": {"category": "Manipulation", "title": "Scale Coordinates (scale)", "description": "Scale coordinates by a factor along x/y/z.", "syntax": "scale [] [x ] [y ] [z ]"}, "select": {"category": "Manipulation", "title": "Select Atoms (select)", "description": "Select atoms by mask and print information.", "syntax": "select "}, "sequence": {"category": "Manipulation", "title": "Print Sequence (sequence)", "description": "Print amino acid or nucleic acid sequence.", "syntax": "sequence [] []"}, "setvelocity": {"category": "Manipulation", "title": "Set Velocities (setvelocity)", "description": "Assign velocities from Maxwell-Boltzmann distribution.", "syntax": "setvelocity [] [temp ] [rseed ]"}, "solvent": {"category": "Manipulation", "title": "Define Solvent (solvent)", "description": "Define solvent molecules in topology.", "syntax": "solvent [] []"}, "splitcoords": {"category": "Manipulation", "title": "Split COORDS (splitcoords)", "description": "Split COORDS set into separate sets by frame.", "syntax": "splitcoords [] name "}, "updateparameters": {"category": "Manipulation", "title": "Update Parameters (updateparameters)", "description": "Update force field parameters in topology.", "syntax": "updateparameters {||}"}, "bondparminfo": {"category": "Manipulation", "title": "Bond Parameter Info (bondparminfo)", "description": "Print bond parameter information.", "syntax": "bondparminfo [] []"}, # ── Analysis ───────────────────────────────────────────────────────────── "rmsd": {"category": "Analysis", "title": "RMSD (rmsd)", "description": "Calculate frame-by-frame RMSD of atoms relative to a reference. Use @CA,C,N,O for backbone. Most common MD analysis.", "syntax": "rmsd [SETNAME] [] [ref |first|reference] [out ] [nofit] [mass] [perres]"}, "atomicfluct": {"category": "Analysis", "title": "RMSF (atomicfluct)", "description": "Per-atom or per-residue root mean square fluctuation (B-factors). Use byres for per-residue.", "syntax": "atomicfluct [SETNAME] [] [out ] [byres] [byatom] [bfactor]"}, "radgyr": {"category": "Analysis", "title": "Radius of Gyration (radgyr)", "description": "Calculate radius of gyration — measures compactness. Always use mass keyword.", "syntax": "radgyr [SETNAME] [] [out ] [mass] [tensor]"}, "hbond": {"category": "Analysis", "title": "Hydrogen Bonds (hbond)", "description": "Detect and track hydrogen bonds. Default: dist ≤ 3.5 Å, angle ≥ 135°. Use avgout for statistics.", "syntax": "hbond [SETNAME] [] [out ] [avgout ] [dist ] [angle ] [series]"}, "secstruct": {"category": "Analysis", "title": "Secondary Structure (secstruct)", "description": "Assign secondary structure using DSSP algorithm. H=helix, E=strand, T=turn, C=coil.", "syntax": "secstruct [SETNAME] [] [out ] [sumout ]"}, "dssp": {"category": "Analysis", "title": "DSSP Secondary Structure (dssp)", "description": "DSSP secondary structure assignment — alias for secstruct.", "syntax": "dssp [SETNAME] [] [out ] [sumout ]"}, "cluster": {"category": "Analysis", "title": "Clustering (cluster)", "description": "Cluster trajectory frames by structural similarity. Use sieve for large trajectories.", "syntax": "cluster [SETNAME] [] [hieragglo|kmeans|dbscan] [epsilon ] [clusters ] [out ] [summary ] [repout ] [repfmt pdb]"}, "distance": {"category": "Analysis", "title": "Distance (distance)", "description": "Calculate distance between two atom masks (center-of-mass by default).", "syntax": "distance [SETNAME] [out ] [noimage] [geom]"}, "angle": {"category": "Analysis", "title": "Angle (angle)", "description": "Calculate angle between three atoms or groups. mask2 is the vertex.", "syntax": "angle [SETNAME] [out ]"}, "dihedral": {"category": "Analysis", "title": "Dihedral (dihedral)", "description": "Calculate dihedral (torsion) angle from four atoms. Output in −180 to +180 degrees.", "syntax": "dihedral [SETNAME] [out ]"}, "multidihedral": {"category": "Analysis", "title": "Backbone Dihedrals (multidihedral)", "description": "Calculate phi, psi, omega, chi1-chi4 for all or selected residues.", "syntax": "multidihedral [phi] [psi] [omega] [chin] [] [out ]"}, "phipsi": {"category": "Analysis", "title": "Phi/Psi Ramachandran (phipsi)", "description": "Calculate Ramachandran phi/psi angles for residues.", "syntax": "phipsi [] [out ] [name ] [resrange ]"}, "surf": {"category": "Analysis", "title": "SASA (surf)", "description": "Calculate solvent-accessible surface area using LCPO algorithm. 1.4 Å probe.", "syntax": "surf [SETNAME] [] [out ] [solvradius ]"}, "molsurf": {"category": "Analysis", "title": "MSMS SASA (molsurf)", "description": "MSMS/molsurf solvent accessible surface area.", "syntax": "molsurf [SETNAME] [] [out ] [probe ]"}, "nativecontacts": {"category": "Analysis", "title": "Native Contacts (nativecontacts)", "description": "Calculate fraction of native contacts (Q-value) relative to a reference structure.", "syntax": "nativecontacts [SETNAME] [] [ref |reference] [out ] [distance ]"}, "contacts": {"category": "Analysis", "title": "Contacts (contacts)", "description": "Calculate number of contacts. Legacy command — prefer nativecontacts.", "syntax": "contacts [first|reference|ref ] [byresidue] [out ] []"}, "density": {"category": "Analysis", "title": "Density Profile (density)", "description": "Calculate number or mass density along an axis. Useful for membrane systems.", "syntax": "density [SETNAME] [] [out ] [x|y|z] [delta ] [number|mass|electron]"}, "diffusion": {"category": "Analysis", "title": "Diffusion / MSD (diffusion)", "description": "Calculate mean square displacement and diffusion coefficient. D = slope of MSD / 6.", "syntax": "diffusion [SETNAME] [] [out ] [time
] [diffout ]"}, "stfcdiffusion": {"category": "Analysis", "title": "STFC Diffusion (stfcdiffusion)", "description": "Diffusion using STFC method for charged particles.", "syntax": "stfcdiffusion [] [out ] [time
] [x|y|z|xy|xz|yz|xyz]"}, "calcdiffusion": {"category": "Analysis", "title": "Calc Diffusion Coefficient (calcdiffusion)", "description": "Calculate diffusion coefficient from MSD data set.", "syntax": "calcdiffusion [out ] [time ]"}, "watershell": {"category": "Analysis", "title": "Water Shell (watershell)", "description": "Count water molecules in first and second solvation shells around a solute.", "syntax": "watershell [SETNAME] [out ] [lower ] [upper ]"}, "radial": {"category": "Analysis", "title": "Radial Distribution Function (radial)", "description": "Calculate radial distribution function (RDF) g(r).", "syntax": "radial [out ] [] [noimage]"}, "volmap": {"category": "Analysis", "title": "Volumetric Map (volmap)", "description": "Generate 3D volumetric density map (.dx file, viewable in VMD).", "syntax": "volmap [] [size ] [center ]"}, "grid": {"category": "Analysis", "title": "3D Density Grid (grid)", "description": "Calculate 3D density grid.", "syntax": "grid [origin] [] [box]"}, "pucker": {"category": "Analysis", "title": "Ring Pucker (pucker)", "description": "Calculate Cremer-Pople ring pucker parameters for sugars/nucleic acids.", "syntax": "pucker [SETNAME] [] [out ] [amplitude] [theta]"}, "multipucker": {"category": "Analysis", "title": "Multi Ring Pucker (multipucker)", "description": "Calculate ring pucker for multiple residues.", "syntax": "multipucker [] [out ] [amplitude] [theta]"}, "matrix": {"category": "Analysis", "title": "Covariance Matrix (matrix)", "description": "Build covariance or correlation matrix — first step for PCA.", "syntax": "matrix covar [SETNAME] [] [out ]"}, "diagmatrix": {"category": "Analysis", "title": "Diagonalize Matrix (diagmatrix)", "description": "Diagonalize a matrix to get eigenvalues and eigenvectors.", "syntax": "diagmatrix [out ] [vecs ] [reduce] [mass ]"}, "projection": {"category": "Analysis", "title": "PCA Projection (projection)", "description": "Project trajectory onto eigenvectors from matrix/analyze modes for PCA.", "syntax": "projection [SETNAME] evecvecs [] [out ] [beg ] [end ]"}, "modes": {"category": "Analysis", "title": "Normal Modes (modes)", "description": "Analyze normal modes from diagonalized matrix: fluct, displ, corr, eigenval, trajout.", "syntax": "modes {fluct|displ|corr|eigenval|trajout} name [beg ] [end ] [out ]"}, "tica": {"category": "Analysis", "title": "TICA (tica)", "description": "Time-lagged independent component analysis.", "syntax": "tica {crdset |data } [lag ] [nvecs ] [out ]"}, "atomiccorr": {"category": "Analysis", "title": "Atomic Correlation (atomiccorr)", "description": "Atomic correlation matrix between atom displacements.", "syntax": "atomiccorr [out ] [cut ] [] [datasave ]"}, "rms2d": {"category": "Analysis", "title": "Pairwise RMSD Matrix (rms2d)", "description": "Pairwise RMSD matrix between all frame pairs.", "syntax": "rms2d [SETNAME] [] [out ] [mass] [nofit] [reftraj ]"}, "rmsavgcorr": {"category": "Analysis", "title": "RMSD Running Average Correlation (rmsavgcorr)", "description": "Correlation of running-average RMSD vs window size.", "syntax": "rmsavgcorr [] [out ] [mass]"}, "symmrmsd": {"category": "Analysis", "title": "Symmetric RMSD (symmrmsd)", "description": "RMSD with symmetry correction for equivalent atoms.", "syntax": "symmrmsd [SETNAME] [] [ref |first] [out ] [remap]"}, "dihedralrms": {"category": "Analysis", "title": "Dihedral RMSD (dihedralrms)", "description": "RMSD of dihedral angles between frames.", "syntax": "dihedralrms [] [out ] [mass] [nofit]"}, "clusterdihedral": {"category": "Analysis", "title": "Dihedral Clustering (clusterdihedral)", "description": "Cluster by dihedral angles.", "syntax": "clusterdihedral [] [out ] [clusterout ] [...dihedrals...]"}, "average": {"category": "Analysis", "title": "Average Structure (average)", "description": "Compute average structure over trajectory frames.", "syntax": "average [SETNAME] [] [] [start ] [stop ] [offset ]"}, "avgcoord": {"category": "Analysis", "title": "Average Coordinates (avgcoord)", "description": "Average coordinates for each atom over trajectory.", "syntax": "avgcoord [SETNAME] [] [out ]"}, "avgbox": {"category": "Analysis", "title": "Average Box (avgbox)", "description": "Compute average box dimensions over trajectory.", "syntax": "avgbox [SETNAME] [out ]"}, "bounds": {"category": "Analysis", "title": "Bounding Box (bounds)", "description": "Calculate bounding box around atoms.", "syntax": "bounds [SETNAME] [] [out ] [dx ] [offset ]"}, "principal": {"category": "Analysis", "title": "Principal Axes (principal)", "description": "Calculate principal axes and moments of inertia.", "syntax": "principal [SETNAME] [] [out ] [dorotation] [mass]"}, "dipole": {"category": "Analysis", "title": "Dipole Moment (dipole)", "description": "Calculate dipole moment of selection.", "syntax": "dipole [SETNAME] [] [out ] []"}, "volume": {"category": "Analysis", "title": "Unit Cell Volume (volume)", "description": "Calculate unit cell volume over trajectory.", "syntax": "volume [SETNAME] [out ]"}, "temperature": {"category": "Analysis", "title": "Temperature (temperature)", "description": "Calculate instantaneous temperature from velocities.", "syntax": "temperature [SETNAME] [] [out ] [frame]"}, "energy": {"category": "Analysis", "title": "Energy (energy)", "description": "Calculate energy using internal force field (bond, angle, dihedral, VdW, electrostatic).", "syntax": "energy [] [out ] [bond] [angle] [dih] [vdw] [elec]"}, "esander": {"category": "Analysis", "title": "Energy via Sander (esander)", "description": "Calculate energy using sander AMBER engine.", "syntax": "esander [] [out ] [igb ] [cut ]"}, "enedecomp": {"category": "Analysis", "title": "Energy Decomposition (enedecomp)", "description": "Energy decomposition per residue.", "syntax": "enedecomp [] [out ] [cut ]"}, "pairwise": {"category": "Analysis", "title": "Pairwise Energy (pairwise)", "description": "Pairwise energy decomposition between residues.", "syntax": "pairwise [] [out ] [cut ] [cuteelec ] [cutevdw ]"}, "lie": {"category": "Analysis", "title": "Linear Interaction Energy (lie)", "description": "Linear interaction energy calculation.", "syntax": "lie [] [out ] [elec ] [vdw ]"}, "ti": {"category": "Analysis", "title": "Thermodynamic Integration (ti)", "description": "Thermodynamic integration (TI) free energy calculation.", "syntax": "ti [...] {nq |xvals } [out ] [name ]"}, "spam": {"category": "Analysis", "title": "SPAM (spam)", "description": "Solvation parameters from analysis of MD.", "syntax": "spam [out ] [name ] [DG ]"}, "nastruct": {"category": "Analysis", "title": "Nucleic Acid Structure (nastruct)", "description": "Nucleic acid structure parameters: base pairs, helical parameters.", "syntax": "nastruct [SETNAME] [resrange ] [naout ] [sscalc] [noheader]"}, "jcoupling": {"category": "Analysis", "title": "J-coupling (jcoupling)", "description": "Calculate J-coupling constants from dihedral angles using Karplus equation.", "syntax": "jcoupling [] [kfile ] [out ]"}, "ired": {"category": "Analysis", "title": "iRED NMR (ired)", "description": "iRED analysis of NMR order parameters.", "syntax": "ired [relax freq ] [order ] [orderparamfile ] [tstep ] [tcorr ] [out ]"}, "rotdif": {"category": "Analysis", "title": "Rotational Diffusion (rotdif)", "description": "Rotational diffusion analysis from NMR relaxation.", "syntax": "rotdif [out ] [rvecin ] [rseed ] [nvecs ]"}, "timecorr": {"category": "Analysis", "title": "Time Correlation (timecorr)", "description": "Time correlation function of vectors.", "syntax": "timecorr vec1 [vec2 ] [out ] [tstep ] [tcorr ]"}, "vector": {"category": "Analysis", "title": "Vector (vector)", "description": "Calculate a vector between two masks over time.", "syntax": "vector [SETNAME] [out ] [ired]"}, "multivector": {"category": "Analysis", "title": "Multi-vector (multivector)", "description": "Calculate vectors for multiple residue pairs.", "syntax": "multivector [] [out ] [ired]"}, "vectormath": {"category": "Analysis", "title": "Vector Math (vectormath)", "description": "Math operations on vector data sets: dot product, cross product, etc.", "syntax": "vectormath vec1 [vec2 ] {dotproduct|crossproduct|...} [out ]"}, "velocityautocorr": {"category": "Analysis", "title": "Velocity Autocorrelation (velocityautocorr)", "description": "Velocity autocorrelation function (VACF).", "syntax": "velocityautocorr [] [out ] [tstep ] [maxlag ] [norm]"}, "lipidorder": {"category": "Analysis", "title": "Lipid Order Parameters (lipidorder)", "description": "Calculate lipid tail order parameters (Scd) for membrane systems.", "syntax": "lipidorder [] [out ] [scd] [unsat]"}, "lipidscd": {"category": "Analysis", "title": "Lipid Scd (lipidscd)", "description": "Lipid Scd order parameter calculation.", "syntax": "lipidscd [] [out ]"}, "areapermol": {"category": "Analysis", "title": "Area per Molecule (areapermol)", "description": "Calculate area per molecule for lipid bilayers.", "syntax": "areapermol [SETNAME] [out ] [] [frame]"}, "mindist": {"category": "Analysis", "title": "Min/Max Distance (mindist)", "description": "Minimum and maximum distance between two masks.", "syntax": "mindist [SETNAME] [out ] [noimage]"}, "pairdist": {"category": "Analysis", "title": "Pairwise Distance (pairdist)", "description": "Pairwise distance histogram between all atom pairs.", "syntax": "pairdist [SETNAME] [] [out ] [delta ] [max ]"}, "hausdorff": {"category": "Analysis", "title": "Hausdorff Distance (hausdorff)", "description": "Calculate Hausdorff distance between two masks.", "syntax": "hausdorff [SETNAME] [out ]"}, "tordiff": {"category": "Analysis", "title": "Torsion Difference (tordiff)", "description": "Torsion angle difference between two structures.", "syntax": "tordiff [] [out ] [ref ]"}, "autocorr": {"category": "Analysis", "title": "Autocorrelation (autocorr)", "description": "Autocorrelation function of a data set.", "syntax": "autocorr [out ] [lagmax ] [norm] [direct]"}, "crosscorr": {"category": "Analysis", "title": "Cross-correlation (crosscorr)", "description": "Cross-correlation between two data sets.", "syntax": "crosscorr [out ] [lagmax ] [norm] [direct]"}, "lifetime": {"category": "Analysis", "title": "Lifetime Analysis (lifetime)", "description": "Lifetime analysis of hydrogen bonds or contacts.", "syntax": "lifetime [out ] [window ] [cut ] [name ]"}, "runningavg": {"category": "Analysis", "title": "Running Average (runningavg)", "description": "Running average (sliding window) of a data set.", "syntax": "runningavg [out ] [window ]"}, "integrate": {"category": "Analysis", "title": "Integrate (integrate)", "description": "Integrate a data set using the trapezoidal rule.", "syntax": "integrate [out ]"}, "slope": {"category": "Analysis", "title": "Slope / Linear Fit (slope)", "description": "Calculate slope of a data set by linear fit.", "syntax": "slope [out ]"}, "regress": {"category": "Analysis", "title": "Linear Regression (regress)", "description": "Linear regression of a data set.", "syntax": "regress [out ] [results ]"}, "curvefit": {"category": "Analysis", "title": "Curve Fitting (curvefit)", "description": "Fit data to a functional form.", "syntax": "curvefit [out ] [results ] [nofit]"}, "kde": {"category": "Analysis", "title": "KDE (kde)", "description": "Kernel density estimation of a data set.", "syntax": "kde [out ] [bandwidth ] [bins ]"}, "fft": {"category": "Analysis", "title": "FFT (fft)", "description": "Fast Fourier Transform of a data set.", "syntax": "fft [out ] [dt ] [fftout ]"}, "wavelet": {"category": "Analysis", "title": "Wavelet Analysis (wavelet)", "description": "Wavelet analysis of trajectory data.", "syntax": "wavelet [] [out ] [type ] [nb ]"}, "filter": {"category": "Analysis", "title": "Filter Frames (filter)", "description": "Filter frames based on dataset value criteria.", "syntax": "filter min max "}, "divergence": {"category": "Analysis", "title": "KL Divergence (divergence)", "description": "Calculate KL divergence between two distributions.", "syntax": "divergence [out ]"}, "lowestcurve": {"category": "Analysis", "title": "Lowest Free Energy Curve (lowestcurve)", "description": "Compute lowest free energy curve from 2D data.", "syntax": "lowestcurve [out ] [step ]"}, "meltcurve": {"category": "Analysis", "title": "Melting Curve (meltcurve)", "description": "Generate melting curve from temperature-dependent data.", "syntax": "meltcurve [out ] [norm]"}, "multicurve": {"category": "Analysis", "title": "Multi-curve Fit (multicurve)", "description": "Fit multiple exponential curves to data.", "syntax": "multicurve [] [out ] [nexp ]"}, "multihist": {"category": "Analysis", "title": "Multi-histogram (multihist)", "description": "Histogram multiple data sets simultaneously.", "syntax": "multihist [...] [out ] [bins ]"}, "calcstate": {"category": "Analysis", "title": "Calculate State (calcstate)", "description": "Calculate state of system using HMM or thresholds.", "syntax": "calcstate [name ] [out ] "}, "checkoverlap": {"category": "Analysis", "title": "Check Overlaps (checkoverlap)", "description": "Check for bad atomic overlaps/clashes.", "syntax": "check [] [cut ] [noimage] [out ]"}, "cphstats": {"category": "Analysis", "title": "Constant-pH Stats (cphstats)", "description": "Analyze constant-pH simulation statistics.", "syntax": "cphstats { [ ...]} [out ] [deprot ]"}, "remlog": {"category": "Analysis", "title": "REMD Log Analysis (remlog)", "description": "Analyze replica exchange log files.", "syntax": "remlog [out ] [nstlim ] [temp0 ]"}, # ── Reference ──────────────────────────────────────────────────────────── "mask_syntax": {"category": "Reference", "title": "Atom Mask Syntax", "description": "cpptraj atom selection: :resnum @atomname :resname ! & | < >. Examples: :1-100 @CA !:WAT :LIG<:5.0", "syntax": ":1-100 @CA !:WAT :LIG<:5.0 @CA,C,N,O"}, } SCRIPT_TEMPLATES = { "basic_rmsd": { "title": "RMSD + RMSF + Rg", "description": "Backbone RMSD, per-residue RMSF, radius of gyration", "script": "parm topology.prmtop\ntrajin trajectory.nc\n\nautoimage\ncenter !:WAT origin\n\nrmsd backbone @CA,C,N,O first out rmsd.dat\natomicfluct rmsf @CA byres out rmsf.dat\nradgyr rg !:WAT mass out rg.dat\n\ngo\n", }, "full_protein": { "title": "Full Protein Analysis", "description": "RMSD, RMSF, Rg, H-bonds, secondary structure", "script": "parm topology.prmtop\ntrajin trajectory.nc\n\nautoimage\ncenter !:WAT origin\n\nrmsd bb_rmsd @CA,C,N,O first out rmsd.dat\natomicfluct rmsf @CA byres out rmsf.dat\nradgyr rg !:WAT mass out rg.dat\nhbond hbonds !:WAT out hbond.dat avgout hbond_avg.dat\nsecstruct ss out secstruct.dat sumout secstruct_sum.dat\n\ngo\n", }, "clustering": { "title": "Trajectory Clustering", "description": "Hierarchical clustering + representative structures", "script": "parm topology.prmtop\ntrajin trajectory.nc\n\nautoimage\nstrip :WAT,Na+,Cl-\n\ncluster clusters @CA hieragglo epsilon 2.0 sieve 10 out cluster_assign.dat summary cluster_sum.dat info cluster_info.dat repout cluster_rep repfmt pdb\n\ngo\n", }, "pca": { "title": "PCA", "description": "Covariance matrix + projection onto first 3 modes", "script": "parm topology.prmtop\ntrajin trajectory.nc\n\nautoimage\nalign @CA reference\n\nmatrix covar pca_mat @CA out covar.dat\nanalyze modes eigenvalues evectors pca_mat out pca_modes.dat\nprojection pca_proj evecvecs pca_mat @CA out pca_proj.dat beg 1 end 3\n\ngo\n", }, "strip_solvent": { "title": "Strip Solvent & Save", "description": "Remove water/ions, write protein-only trajectory", "script": "parm topology.prmtop\ntrajin trajectory.nc\n\nautoimage\nstrip :WAT,Na+,Cl-\n\ntrajout protein_traj.nc\n\ngo\n", }, } # ───────────────────────────────────────────────────────────────────────────── # PDF TEXT EXTRACTION + CHUNKING # ───────────────────────────────────────────────────────────────────────────── # Section header patterns in the manual, e.g. "11.1 rmsd", "8.3 hbond" _SECTION_RE = re.compile( r'^(\d+\.\d+(?:\.\d+)?)\s+([a-zA-Z][a-zA-Z0-9_\-|/]{1,30})\s*$', re.MULTILINE, ) # Chapter headers like "8 General Commands", "11 Action Commands" _CHAPTER_RE = re.compile(r'^(\d+)\s+([A-Z][A-Za-z ]{3,50})\s*$', re.MULTILINE) _MIN_CHUNK_CHARS = 100 _MAX_CHUNK_CHARS = 6000 def _extract_pdf_text() -> list[dict]: """ Extract text from CpptrajManual.pdf and return a list of page dicts: [{"page": int, "text": str}, ...] Results are cached in cpptraj_manual_cache.json. """ if CACHE_PATH.exists(): with open(CACHE_PATH, encoding="utf-8") as f: return json.load(f) try: import pdfplumber except ImportError: raise ImportError("pdfplumber is required to parse the manual: pip install pdfplumber") print("[RAG] Extracting text from CpptrajManual.pdf (one-time, ~10 s)…") pages = [] with pdfplumber.open(PDF_PATH) as pdf: for i, page in enumerate(pdf.pages): text = page.extract_text() or "" pages.append({"page": i + 1, "text": text}) with open(CACHE_PATH, "w", encoding="utf-8") as f: json.dump(pages, f, ensure_ascii=False) print(f"[RAG] Extracted {len(pages)} pages, cached to {CACHE_PATH.name}") return pages def _chunk_manual(pages: list[dict]) -> list[dict]: """ Split the full manual text into semantic chunks. Strategy: - Detect section headers (e.g. "11.1 rmsd") as chunk boundaries. - Each chunk = one command section (header + body until next header). - Also add whole-page chunks for pages that don't fit the pattern. - Merge tiny chunks with the previous one. """ # Concatenate all pages preserving page breaks full_text = "" page_offsets = [] # (start_char, page_num) for p in pages: page_offsets.append((len(full_text), p["page"])) full_text += p["text"] + "\n\n" def char_to_page(pos: int) -> int: pg = 1 for start, pnum in page_offsets: if start > pos: break pg = pnum return pg # Find all section boundaries boundaries = [] for m in _SECTION_RE.finditer(full_text): boundaries.append((m.start(), m.group(0).strip(), m.group(2).lower())) # Also add chapter boundaries for m in _CHAPTER_RE.finditer(full_text): boundaries.append((m.start(), m.group(0).strip(), m.group(2).lower())) boundaries.sort(key=lambda x: x[0]) chunks = [] for i, (pos, header, cmd_name) in enumerate(boundaries): end = boundaries[i + 1][0] if i + 1 < len(boundaries) else len(full_text) text = full_text[pos:end].strip() if len(text) < _MIN_CHUNK_CHARS: continue # Trim very long chunks (take first MAX_CHUNK_CHARS) if len(text) > _MAX_CHUNK_CHARS: text = text[:_MAX_CHUNK_CHARS] + "\n… [truncated]" chunks.append({ "id": f"manual_sec_{i}", "header": header, "cmd_name": cmd_name, "text": text, "page": char_to_page(pos), "source": "CpptrajManual.pdf", }) # Fallback: if very few chunks found, fall back to page-level chunking if len(chunks) < 20: print("[RAG] Section detection found few chunks — falling back to page-level chunking") chunks = [] for p in pages: if len(p["text"]) < _MIN_CHUNK_CHARS: continue chunks.append({ "id": f"page_{p['page']}", "header": f"Page {p['page']}", "cmd_name": "", "text": p["text"][:_MAX_CHUNK_CHARS], "page": p["page"], "source": "CpptrajManual.pdf", }) return chunks # ───────────────────────────────────────────────────────────────────────────── # KNOWLEDGE BASE CLASS # ───────────────────────────────────────────────────────────────────────────── class CPPTrajKnowledgeBase: """ RAG over the real CpptrajManual.pdf using TF-IDF retrieval. Falls back gracefully if the PDF is not found. """ def __init__(self): self._chunks: list[dict] = [] self._texts: list[str] = [] self.vectorizer: TfidfVectorizer | None = None self.tfidf_matrix = None self._pdf_available = False self._load() def _load(self): if not PDF_PATH.exists(): print(f"[RAG] Warning: {PDF_PATH} not found — using built-in command docs only.") self._build_fallback_index() return try: pages = _extract_pdf_text() chunks = _chunk_manual(pages) if not chunks: self._build_fallback_index() return self._chunks = chunks self._texts = [c["text"] for c in chunks] self._pdf_available = True print(f"[RAG] Loaded {len(chunks)} chunks from manual (pages 1–{pages[-1]['page']})") except Exception as e: print(f"[RAG] PDF load error: {e} — using built-in docs.") self._build_fallback_index() return self._build_tfidf() def _build_fallback_index(self): """Build a minimal TF-IDF index from the built-in CPPTRAJ_COMMANDS.""" for k, doc in CPPTRAJ_COMMANDS.items(): text = f"{doc['title']} {doc['description']} {doc['syntax']} {k}" self._chunks.append({"id": k, "header": doc["title"], "cmd_name": k, "text": text, "page": 0, "source": "built-in"}) self._texts.append(text) self._build_tfidf() def _build_tfidf(self): self.vectorizer = TfidfVectorizer( ngram_range=(1, 2), stop_words="english", min_df=1, max_features=50_000, ) self.tfidf_matrix = self.vectorizer.fit_transform(self._texts) # ── Public API ──────────────────────────────────────────────────────── def retrieve(self, query: str, top_k: int = 6) -> list[dict]: """Return top-k most relevant chunks for a query.""" if self.vectorizer is None: return [] q_vec = self.vectorizer.transform([query]) scores = cosine_similarity(q_vec, self.tfidf_matrix).flatten() top_idx = np.argsort(scores)[::-1][:top_k] return [ {"chunk": self._chunks[i], "score": float(scores[i])} for i in top_idx if scores[i] > 0.0 ] def get_command_cheatsheet(self) -> str: """Compact one-liner per command — injected once into the system prompt.""" cats: dict[str, list[str]] = {} for key, v in CPPTRAJ_COMMANDS.items(): cat = v["category"] cats.setdefault(cat, []) cats[cat].append(f" {key:<20s} {v['syntax']}") lines = [ "## cpptraj Command Reference", "Syntax legend: [SETNAME] = positional output dataset name (first arg, no keyword); [] = optional named argument.\n", ] for cat in ("Setup", "Manipulation", "Analysis", "Output"): if cat not in cats: continue lines.append(f"# {cat}") lines.extend(cats[cat]) return "\n".join(lines) def get_context_for_llm(self, query: str, top_k: int = 3, score_threshold: float = 0.10) -> str: """ Return full manual chunks only when TF-IDF relevance > threshold. Returns empty string if nothing is relevant enough (model uses cheatsheet). """ results = self.retrieve(query, top_k=top_k) results = [r for r in results if r["score"] >= score_threshold] if not results: return "" lines = [ "=== CPPTRAJ MANUAL — RELEVANT SECTIONS ===", "(Use the EXACT command name from each section header, e.g. '11.65 radgyr' → use `radgyr`)\n", ] for r in results: c = r["chunk"] pg = f"p.{c['page']}" if c["page"] else c["source"] lines.append(f"--- {c['header']} [{pg} relevance:{r['score']:.2f}] ---") lines.append(c["text"]) lines.append("") return "\n".join(lines) def get_all_commands(self) -> dict: return CPPTRAJ_COMMANDS def get_command(self, key) -> dict | None: return CPPTRAJ_COMMANDS.get(key) def get_categories(self) -> list: return sorted(set(d["category"] for d in CPPTRAJ_COMMANDS.values())) def get_by_category(self, cat)-> dict: return {k: v for k, v in CPPTRAJ_COMMANDS.items() if v["category"] == cat} def get_script_templates(self)-> dict: return SCRIPT_TEMPLATES @property def pdf_available(self) -> bool: return self._pdf_available @property def n_chunks(self) -> int: return len(self._chunks)