| """ |
| cpptraj RAG knowledge base — built from the real CpptrajManual.pdf. |
| |
| Pipeline: |
| 1. Extract text from PDF with pdfplumber (cached to cpptraj_manual_cache.json) |
| 2. Split into per-command chunks using section-header heuristics |
| 3. TF-IDF index (scikit-learn) for fast semantic-ish retrieval |
| 4. Thin structured command registry for the left-panel UI (unchanged look) |
| """ |
|
|
| import json |
| import re |
| from pathlib import Path |
|
|
| import numpy as np |
| from sklearn.feature_extraction.text import TfidfVectorizer |
| from sklearn.metrics.pairwise import cosine_similarity |
|
|
| |
| |
| |
|
|
| # Directory two levels above this file (the parent of the folder that holds this module). |
| _HERE = Path(__file__).parent.parent |
| # Path of the cpptraj manual PDF, expected directly inside _HERE. |
| PDF_PATH = _HERE / "CpptrajManual.pdf" |
| # Path of the JSON cache file, located in the same directory as the PDF. |
| CACHE_PATH = _HERE / "cpptraj_manual_cache.json" |
|
|
| |
| |
| |
|
|
| CPPTRAJ_COMMANDS = { |
| |
| "parm": {"category": "Setup", "title": "Load Topology (parm)", "description": "Load a topology/parameter file (.prmtop, .psf, .gro, .pdb). Must be the first command.", "syntax": "parm <filename> [<tag>] [nobondsearch]"}, |
| "trajin": {"category": "Setup", "title": "Load Trajectory (trajin)", "description": "Load trajectory file(s). Multiple trajin statements concatenate frames. Use start/stop/offset to sub-sample.", "syntax": "trajin <filename> [start] [stop|last] [offset]"}, |
| "reference": {"category": "Setup", "title": "Load Reference (reference)", "description": "Load a reference structure used by rmsd, align, nativecontacts.", "syntax": "reference <filename> [<frame>] [<tag>]"}, |
| "activeref": {"category": "Setup", "title": "Set Active Reference (activeref)", "description": "Set the active reference structure by tag.", "syntax": "activeref <tag>"}, |
| "createcrd": {"category": "Setup", "title": "Create COORDS Set (createcrd)", "description": "Create an empty COORDS data set for in-memory trajectory storage.", "syntax": "createcrd <name>"}, |
| "createreservoir": {"category": "Setup", "title": "Create Reservoir (createreservoir)", "description": "Create structure reservoir for REST simulation.", "syntax": "createreservoir <name> <filename> [<fmt>] [<mask>] [ene <set>] [temp <T>]"}, |
| "createset": {"category": "Setup", "title": "Create Data Set (createset)", "description": "Create a new data set with specified values.", "syntax": "createset name <name> type <type> [values <v1>,<v2>,...] [<range>]"}, |
| "loadcrd": {"category": "Setup", "title": "Load COORDS (loadcrd)", "description": "Load trajectory into a named COORDS data set for later use.", "syntax": "loadcrd <filename> [<fmt>] [<mask>] name <setname>"}, |
| "loadtraj": {"category": "Setup", "title": "Load Trajectory (loadtraj)", "description": "Load trajectory (alias for trajin inside scripts).", "syntax": "loadtraj <filename> [<fmt>] [<mask>]"}, |
| "readdata": {"category": "Setup", "title": "Read Data (readdata)", "description": "Read data from file into data sets for analysis.", "syntax": "readdata <filename> [as <fmt>] [name <name>] [index <col>]"}, |
| "go": {"category": "Setup", "title": "Execute (go)", "description": "Execute all queued commands. Required at end of every script.", "syntax": "go"}, |
| |
| "trajout": {"category": "Output", "title": "Write Trajectory (trajout)", "description": "Write processed trajectory to a new file. Format auto-detected from extension.", "syntax": "trajout <filename> [format] [nobox]"}, |
| "outtraj": {"category": "Output", "title": "Write Frames On-the-fly (outtraj)", "description": "Write frames to trajectory file during processing.", "syntax": "outtraj <filename> [<fmt>] [<mask>] [nobox] [onlyframes <range>]"}, |
| "crdout": {"category": "Output", "title": "Write COORDS Set (crdout)", "description": "Write a COORDS data set to a trajectory file.", "syntax": "crdout <crdset> <filename> [<fmt>] [<mask>]"}, |
| "parmwrite": {"category": "Output", "title": "Write Topology (parmwrite)", "description": "Write topology to file in specified format.", "syntax": "parmwrite out <filename> [<fmt>] [<topology tag>]"}, |
| "datafile": {"category": "Output", "title": "Data File Options (datafile)", "description": "Set output options for a data file.", "syntax": "datafile <filename> [<options>]"}, |
| "datafilter": {"category": "Output", "title": "Filter Data (datafilter)", "description": "Filter data sets by criteria and write to file.", "syntax": "datafilter <dataset> min <min> max <max> [out <file>]"}, |
| "dataset": {"category": "Output", "title": "Data Set Operations (dataset)", "description": "Perform operations on data sets: legend, makexy, etc.", "syntax": "dataset {legend <legend> <set> | makexy <X> <Y> name <out> | ...}"}, |
| "flatten": {"category": "Output", "title": "Flatten Data (flatten)", "description": "Flatten multi-dimensional data sets to 1D.", "syntax": "flatten <dataset> [out <file>]"}, |
| "precision": {"category": "Output", "title": "Output Precision (precision)", "description": "Set output precision for data files.", "syntax": "precision <file> <width> [<digits>]"}, |
| "selectds": {"category": "Output", "title": "Select Data Sets (selectds)", "description": "Select data sets matching a string pattern.", "syntax": "selectds <selection>"}, |
| |
| "autoimage": {"category": "Manipulation", "title": "Fix PBC Imaging (autoimage)", "description": "Re-image molecules across periodic boundaries back into the primary unit cell. Always strip :WAT first.", "syntax": "autoimage [familiar] [byres|bymol] [anchor <mask>]"}, |
| "center": {"category": "Manipulation", "title": "Center System (center)", "description": "Translate coordinates so that specified atoms are at the origin or box center.", "syntax": "center [<mask>] [origin] [mass]"}, |
| "strip": {"category": "Manipulation", "title": "Strip Atoms (strip)", "description": "Remove atoms/residues/molecules from the trajectory.", "syntax": "strip <mask>"}, |
| "align": {"category": "Manipulation", "title": "Align Trajectory (align)", "description": "Rotate and translate frames to least-squares-fit selected atoms to a reference. Modifies coordinates.", "syntax": "align [<mask>] [ref <tag>|reference|first] [mass]"}, |
| "image": {"category": "Manipulation", "title": "Image Molecules (image)", "description": "Image molecules into primary unit cell. Use autoimage for automatic imaging.", "syntax": "image [familiar] [bymol|byres|byatom] [<mask>] [origin] [center]"}, |
| "unwrap": {"category": "Manipulation", "title": "Unwrap Trajectory (unwrap)", "description": "Unwrap trajectory to remove periodic boundary jumps.", "syntax": "unwrap [<mask>] [center] [bymol|byres]"}, |
| "unstrip": {"category": "Manipulation", "title": "Restore Stripped Atoms (unstrip)", "description": "Restore previously stripped atoms back to the system.", "syntax": "unstrip"}, |
| "translate": {"category": "Manipulation", "title": "Translate Coordinates (translate)", "description": "Translate coordinates by a vector.", "syntax": "translate [<mask>] [x <dx>] [y <dy>] [z <dz>]"}, |
| "rotate": {"category": "Manipulation", "title": "Rotate Coordinates (rotate)", "description": "Rotate coordinates around an axis.", "syntax": "rotate [<mask>] {axis <x,y,z> degrees <d> | x|y|z <deg>}"}, |
| "scale": {"category": "Manipulation", "title": "Scale Coordinates (scale)", "description": "Scale coordinates by a factor along x/y/z.", "syntax": "scale [<mask>] [x <fx>] [y <fy>] [z <fz>]"}, |
| "box": {"category": "Manipulation", "title": "Set Box Dimensions (box)", "description": "Set or modify unit cell box dimensions.", "syntax": "box [x <x>] [y <y>] [z <z>] [alpha <a>] [beta <b>] [gamma <g>] [nobox]"}, |
| "closest": {"category": "Manipulation", "title": "Keep Closest Solvent (closest)", "description": "Keep N closest solvent molecules to solute, remove the rest.", "syntax": "closest <N> <solvent_mask> [noimage] [first|oxygen] [name <name>]"}, |
| "addatom": {"category": "Manipulation", "title": "Add Atom (addatom)", "description": "Add atoms to the topology.", "syntax": "addatom {bond <mask> | nobond} <name> <type> <charge> <mass> [<coords>]"}, |
| "atommap": {"category": "Manipulation", "title": "Map Atoms (atommap)", "description": "Map atoms between two structures/topologies.", "syntax": "atommap <ref> <target> [mapout <file>] [maponly]"}, |
| "catcrd": {"category": "Manipulation", "title": "Concatenate COORDS (catcrd)", "description": "Concatenate multiple COORDS data sets.", "syntax": "catcrd [crdset <set1>] [crdset <set2>] ... name <output>"}, |
| "change": {"category": "Manipulation", "title": "Change Topology Properties (change)", "description": "Change topology atom/residue names, types, or other properties.", "syntax": "change {resname from <old> to <new> | atomname from <old> to <new> | ...}"}, |
| "charge": {"category": "Manipulation", "title": "Print Total Charge (charge)", "description": "Print total charge for atom selection.", "syntax": "charge [<mask>]"}, |
| "checkchirality": {"category": "Manipulation", "title": "Check Chirality (checkchirality)", "description": "Check chirality of chiral centers.", "syntax": "checkchirality [<mask>] [out <file>]"}, |
| "combinecrd": {"category": "Manipulation", "title": "Combine COORDS (combinecrd)", "description": "Combine two COORDS sets into one.", "syntax": "combinecrd <crdset1> <crdset2> name <output>"}, |
| "comparetop": {"category": "Manipulation", "title": "Compare Topologies (comparetop)", "description": "Compare two topology files.", "syntax": "comparetop [parm1 <tag>] [parm2 <tag>]"}, |
| "crdaction": {"category": "Manipulation", "title": "Apply Action to COORDS (crdaction)", "description": "Apply an action to a COORDS data set.", "syntax": "crdaction <crdset> <action> [<action_args>]"}, |
| "crdfluct": {"category": "Manipulation", "title": "COORDS Fluctuations (crdfluct)", "description": "Calculate fluctuations of a COORDS data set.", "syntax": "crdfluct <crdset> [<mask>] [out <file>] [byres] [bfactor]"}, |
| "crdtransform": {"category": "Manipulation", "title": "Transform COORDS (crdtransform)", "description": "Apply coordinate transformation to a COORDS set.", "syntax": "crdtransform <crdset> [<xform_args>]"}, |
| "dihedralscan": {"category": "Manipulation", "title": "Dihedral Scan (dihedralscan)", "description": "Scan dihedral angles to generate conformations.", "syntax": "dihedralscan [<mask>] [rseed <seed>] [out <file>] [outtraj <file>]"}, |
| "emin": {"category": "Manipulation", "title": "Energy Minimization (emin)", "description": "Energy minimization using internal force field.", "syntax": "emin [<mask>] [nstep <N>] [out <file>] [step <step>]"}, |
| "fiximagedbonds": {"category": "Manipulation", "title": "Fix Imaged Bonds (fiximagedbonds)", "description": "Fix broken bonds across periodic boundaries.", "syntax": "fiximagedbonds [<mask>]"}, |
| "fixatomorder": {"category": "Manipulation", "title": "Fix Atom Order (fixatomorder)", "description": "Reorder atoms to match topology.", "syntax": "fixatomorder [<mask>] [outprefix <prefix>]"}, |
| "graft": {"category": "Manipulation", "title": "Graft Coordinates (graft)", "description": "Graft coordinates from one structure onto another.", "syntax": "graft [src <mask>] [tgt <mask>] [srcframe <N>] [mass]"}, |
| "hmassrepartition": {"category": "Manipulation", "title": "H-mass Repartition (hmassrepartition)", "description": "Hydrogen mass repartitioning for longer MD timesteps.", "syntax": "hmassrepartition [<mask>] [factor <f>]"}, |
| "lessplit": {"category": "Manipulation", "title": "Split LES Trajectory (lessplit)", "description": "Split LES trajectory into individual replicas.", "syntax": "lessplit [out <prefix>] [<fmt>]"}, |
| "makestructure": {"category": "Manipulation", "title": "Build Structure (makestructure)", "description": "Build structure using idealized geometry.", "syntax": "makestructure <sstype>:<res_range>[,...] [out <prefix>]"}, |
| "minimage": {"category": "Manipulation", "title": "Minimum Image (minimage)", "description": "Apply minimum image convention for periodic distance.", "syntax": "minimage [SETNAME] <mask1> <mask2> [out <file>]"}, |
| "molinfo": {"category": "Manipulation", "title": "Molecule Info (molinfo)", "description": "Print molecular information for atom mask.", "syntax": "molinfo [<mask>] [<topology tag>]"}, |
| "parmbox": {"category": "Manipulation", "title": "Set Topology Box (parmbox)", "description": "Set periodic box dimensions in topology.", "syntax": "parmbox {x <x> y <y> z <z> [alpha <a> beta <b> gamma <g>] | nobox}"}, |
| "parminfo": {"category": "Manipulation", "title": "Topology Info (parminfo)", "description": "Print topology information summary.", "syntax": "parminfo [<mask>] [<topology tag>]"}, |
| "parmstrip": {"category": "Manipulation", "title": "Strip Topology (parmstrip)", "description": "Strip atoms from topology file permanently.", "syntax": "parmstrip <mask> [<topology tag>]"}, |
| "permutedihedrals": {"category": "Manipulation", "title": "Permute Dihedrals (permutedihedrals)", "description": "Randomly permute dihedral angles.", "syntax": "permutedihedrals [<mask>] [rseed <seed>] [out <file>]"}, |
| "prepareforleap": {"category": "Manipulation", "title": "Prepare for LEaP (prepareforleap)", "description": "Prepare structure for LEaP (add missing atoms, fix naming).", "syntax": "prepareforleap [<mask>] [out <file>] [pdbout <file>]"}, |
| "randomizeions": {"category": "Manipulation", "title": "Randomize Ions (randomizeions)", "description": "Randomly swap ions with solvent molecules.", "syntax": "randomizeions <ion_mask> [by <solvent_mask>] [around <solute_mask>] [min <d>] [rseed <s>]"}, |
| "remap": {"category": "Manipulation", "title": "Remap Atom Order (remap)", "description": "Remap atom ordering to match a reference.", "syntax": "remap [<mask>] <reference>"}, |
| "replicatecell": {"category": "Manipulation", "title": "Replicate Unit Cell (replicatecell)", "description": "Replicate the unit cell in 3D.", "syntax": "replicatecell [<mask>] [out <prefix>] {all | dir X Y Z}"}, |
| "resinfo": {"category": "Manipulation", "title": "Residue Info (resinfo)", "description": "Print residue information: resid, resname, atom count, etc.", "syntax": "resinfo [<mask>] [<topology tag>]"}, |
| "rotatedihedral": {"category": "Manipulation", "title": "Rotate Dihedral (rotatedihedral)", "description": "Rotate a specific dihedral angle to a target value.", "syntax": "rotatedihedral [<mask>] res <r> type <phi|psi|chi1...> val <degrees>"}, |
| "scale": {"category": "Manipulation", "title": "Scale Coordinates (scale)", "description": "Scale coordinates by a factor along x/y/z.", "syntax": "scale [<mask>] [x <fx>] [y <fy>] [z <fz>]"}, |
| "select": {"category": "Manipulation", "title": "Select Atoms (select)", "description": "Select atoms by mask and print information.", "syntax": "select <mask>"}, |
| "sequence": {"category": "Manipulation", "title": "Print Sequence (sequence)", "description": "Print amino acid or nucleic acid sequence.", "syntax": "sequence [<mask>] [<topology tag>]"}, |
| "setvelocity": {"category": "Manipulation", "title": "Set Velocities (setvelocity)", "description": "Assign velocities from Maxwell-Boltzmann distribution.", "syntax": "setvelocity [<mask>] [temp <T>] [rseed <seed>]"}, |
| "solvent": {"category": "Manipulation", "title": "Define Solvent (solvent)", "description": "Define solvent molecules in topology.", "syntax": "solvent [<mask>] [<topology tag>]"}, |
| "splitcoords": {"category": "Manipulation", "title": "Split COORDS (splitcoords)", "description": "Split COORDS set into separate sets by frame.", "syntax": "splitcoords <crdset> [<range>] name <prefix>"}, |
| "updateparameters": {"category": "Manipulation", "title": "Update Parameters (updateparameters)", "description": "Update force field parameters in topology.", "syntax": "updateparameters {<bond_args>|<angle_args>|<dih_args>}"}, |
| "bondparminfo": {"category": "Manipulation", "title": "Bond Parameter Info (bondparminfo)", "description": "Print bond parameter information.", "syntax": "bondparminfo [<mask>] [<topology tag>]"}, |
| |
| "rmsd": {"category": "Analysis", "title": "RMSD (rmsd)", "description": "Calculate frame-by-frame RMSD of atoms relative to a reference. Use @CA,C,N,O for backbone. Most common MD analysis.", "syntax": "rmsd [SETNAME] [<mask>] [ref <tag>|first|reference] [out <file>] [nofit] [mass] [perres]"}, |
| "atomicfluct": {"category": "Analysis", "title": "RMSF (atomicfluct)", "description": "Per-atom or per-residue root mean square fluctuation (B-factors). Use byres for per-residue.", "syntax": "atomicfluct [SETNAME] [<mask>] [out <file>] [byres] [byatom] [bfactor]"}, |
| "radgyr": {"category": "Analysis", "title": "Radius of Gyration (radgyr)", "description": "Calculate radius of gyration β measures compactness. Always use mass keyword.", "syntax": "radgyr [SETNAME] [<mask>] [out <file>] [mass] [tensor]"}, |
| "hbond": {"category": "Analysis", "title": "Hydrogen Bonds (hbond)", "description": "Detect and track hydrogen bonds. Default: dist β€ 3.5 Γ
, angle β₯ 135Β°. Use avgout for statistics.", "syntax": "hbond [SETNAME] [<mask>] [out <file>] [avgout <file>] [dist <A>] [angle <deg>] [series]"}, |
| "secstruct": {"category": "Analysis", "title": "Secondary Structure (secstruct)", "description": "Assign secondary structure using DSSP algorithm. H=helix, E=strand, T=turn, C=coil.", "syntax": "secstruct [SETNAME] [<mask>] [out <file>] [sumout <file>]"}, |
| "dssp": {"category": "Analysis", "title": "DSSP Secondary Structure (dssp)", "description": "DSSP secondary structure assignment β alias for secstruct.", "syntax": "dssp [SETNAME] [<mask>] [out <file>] [sumout <file>]"}, |
| "cluster": {"category": "Analysis", "title": "Clustering (cluster)", "description": "Cluster trajectory frames by structural similarity. Use sieve for large trajectories.", "syntax": "cluster [SETNAME] [<mask>] [hieragglo|kmeans|dbscan] [epsilon <val>] [clusters <N>] [out <file>] [summary <file>] [repout <prefix>] [repfmt pdb]"}, |
| "distance": {"category": "Analysis", "title": "Distance (distance)", "description": "Calculate distance between two atom masks (center-of-mass by default).", "syntax": "distance [SETNAME] <mask1> <mask2> [out <file>] [noimage] [geom]"}, |
| "angle": {"category": "Analysis", "title": "Angle (angle)", "description": "Calculate angle between three atoms or groups. mask2 is the vertex.", "syntax": "angle [SETNAME] <mask1> <mask2> <mask3> [out <file>]"}, |
| "dihedral": {"category": "Analysis", "title": "Dihedral (dihedral)", "description": "Calculate dihedral (torsion) angle from four atoms. Output in β180 to +180 degrees.", "syntax": "dihedral [SETNAME] <mask1> <mask2> <mask3> <mask4> [out <file>]"}, |
| "multidihedral": {"category": "Analysis", "title": "Backbone Dihedrals (multidihedral)", "description": "Calculate phi, psi, omega, chi1-chi4 for all or selected residues.", "syntax": "multidihedral [phi] [psi] [omega] [chin] [<mask>] [out <file>]"}, |
| "phipsi": {"category": "Analysis", "title": "Phi/Psi Ramachandran (phipsi)", "description": "Calculate Ramachandran phi/psi angles for residues.", "syntax": "phipsi [<mask>] [out <file>] [name <name>] [resrange <range>]"}, |
| "surf": {"category": "Analysis", "title": "SASA (surf)", "description": "Calculate solvent-accessible surface area using LCPO algorithm. 1.4 Γ
probe.", "syntax": "surf [SETNAME] [<mask>] [out <file>] [solvradius <val>]"}, |
| "molsurf": {"category": "Analysis", "title": "MSMS SASA (molsurf)", "description": "MSMS/molsurf solvent accessible surface area.", "syntax": "molsurf [SETNAME] [<mask>] [out <file>] [probe <r>]"}, |
| "nativecontacts": {"category": "Analysis", "title": "Native Contacts (nativecontacts)", "description": "Calculate fraction of native contacts (Q-value) relative to a reference structure.", "syntax": "nativecontacts [SETNAME] [<mask>] [ref <tag>|reference] [out <file>] [distance <cutoff>]"}, |
| "contacts": {"category": "Analysis", "title": "Contacts (contacts)", "description": "Calculate number of contacts. Legacy command β prefer nativecontacts.", "syntax": "contacts [first|reference|ref <ref>] [byresidue] [out <file>] [<mask>]"}, |
| "density": {"category": "Analysis", "title": "Density Profile (density)", "description": "Calculate number or mass density along an axis. Useful for membrane systems.", "syntax": "density [SETNAME] [<mask>] [out <file>] [x|y|z] [delta <dx>] [number|mass|electron]"}, |
| "diffusion": {"category": "Analysis", "title": "Diffusion / MSD (diffusion)", "description": "Calculate mean square displacement and diffusion coefficient. D = slope of MSD / 6.", "syntax": "diffusion [SETNAME] [<mask>] [out <file>] [time <dt>] [diffout <file>]"}, |
| "stfcdiffusion": {"category": "Analysis", "title": "STFC Diffusion (stfcdiffusion)", "description": "Diffusion using STFC method for charged particles.", "syntax": "stfcdiffusion [<mask>] [out <file>] [time <dt>] [x|y|z|xy|xz|yz|xyz]"}, |
| "calcdiffusion": {"category": "Analysis", "title": "Calc Diffusion Coefficient (calcdiffusion)", "description": "Calculate diffusion coefficient from MSD data set.", "syntax": "calcdiffusion <msd_set> [out <file>] [time <ts>]"}, |
| "watershell": {"category": "Analysis", "title": "Water Shell (watershell)", "description": "Count water molecules in first and second solvation shells around a solute.", "syntax": "watershell [SETNAME] <mask> [out <file>] [lower <A>] [upper <A>]"}, |
| "radial": {"category": "Analysis", "title": "Radial Distribution Function (radial)", "description": "Calculate radial distribution function (RDF) g(r).", "syntax": "radial [out <file>] <spacing> <maximum> <solvent_mask> [<solute_mask>] [noimage]"}, |
| "volmap": {"category": "Analysis", "title": "Volumetric Map (volmap)", "description": "Generate 3D volumetric density map (.dx file, viewable in VMD).", "syntax": "volmap <filename> [<mask>] [size <dx> <dy> <dz>] [center <mask>]"}, |
| "grid": {"category": "Analysis", "title": "3D Density Grid (grid)", "description": "Calculate 3D density grid.", "syntax": "grid <filename> <dx> <dy> <dz> [origin] [<mask>] [box]"}, |
| "pucker": {"category": "Analysis", "title": "Ring Pucker (pucker)", "description": "Calculate Cremer-Pople ring pucker parameters for sugars/nucleic acids.", "syntax": "pucker [SETNAME] <m1> <m2> <m3> <m4> <m5> [<m6>] [out <file>] [amplitude] [theta]"}, |
| "multipucker": {"category": "Analysis", "title": "Multi Ring Pucker (multipucker)", "description": "Calculate ring pucker for multiple residues.", "syntax": "multipucker [<mask>] [out <file>] [amplitude] [theta]"}, |
| "matrix": {"category": "Analysis", "title": "Covariance Matrix (matrix)", "description": "Build covariance or correlation matrix β first step for PCA.", "syntax": "matrix covar [SETNAME] [<mask>] [out <file>]"}, |
| "diagmatrix": {"category": "Analysis", "title": "Diagonalize Matrix (diagmatrix)", "description": "Diagonalize a matrix to get eigenvalues and eigenvectors.", "syntax": "diagmatrix <matrixset> [out <evecfile>] [vecs <N>] [reduce] [mass <mask>]"}, |
| "projection": {"category": "Analysis", "title": "PCA Projection (projection)", "description": "Project trajectory onto eigenvectors from matrix/analyze modes for PCA.", "syntax": "projection [SETNAME] evecvecs <data> [<mask>] [out <file>] [beg <n>] [end <n>]"}, |
| "modes": {"category": "Analysis", "title": "Normal Modes (modes)", "description": "Analyze normal modes from diagonalized matrix: fluct, displ, corr, eigenval, trajout.", "syntax": "modes {fluct|displ|corr|eigenval|trajout} name <modesname> [beg <b>] [end <e>] [out <file>]"}, |
| "tica": {"category": "Analysis", "title": "TICA (tica)", "description": "Time-lagged independent component analysis.", "syntax": "tica {crdset <COORDS>|data <sets>} [lag <lag>] [nvecs <N>] [out <file>]"}, |
| "atomiccorr": {"category": "Analysis", "title": "Atomic Correlation (atomiccorr)", "description": "Atomic correlation matrix between atom displacements.", "syntax": "atomiccorr [out <file>] [cut <cut>] [<mask>] [datasave <set>]"}, |
| "rms2d": {"category": "Analysis", "title": "Pairwise RMSD Matrix (rms2d)", "description": "Pairwise RMSD matrix between all frame pairs.", "syntax": "rms2d [SETNAME] [<mask>] [out <file>] [mass] [nofit] [reftraj <traj>]"}, |
| "rmsavgcorr": {"category": "Analysis", "title": "RMSD Running Average Correlation (rmsavgcorr)", "description": "Correlation of running-average RMSD vs window size.", "syntax": "rmsavgcorr [<mask>] [out <file>] [mass]"}, |
| "symmrmsd": {"category": "Analysis", "title": "Symmetric RMSD (symmrmsd)", "description": "RMSD with symmetry correction for equivalent atoms.", "syntax": "symmrmsd [SETNAME] [<mask>] [ref <ref>|first] [out <file>] [remap]"}, |
| "dihedralrms": {"category": "Analysis", "title": "Dihedral RMSD (dihedralrms)", "description": "RMSD of dihedral angles between frames.", "syntax": "dihedralrms [<mask>] [out <file>] [mass] [nofit]"}, |
| "clusterdihedral": {"category": "Analysis", "title": "Dihedral Clustering (clusterdihedral)", "description": "Cluster by dihedral angles.", "syntax": "clusterdihedral [<mask>] [out <file>] [clusterout <prefix>] [...dihedrals...]"}, |
| "average": {"category": "Analysis", "title": "Average Structure (average)", "description": "Compute average structure over trajectory frames.", "syntax": "average [SETNAME] <filename> [<fmt>] [<mask>] [start <s>] [stop <e>] [offset <o>]"}, |
| "avgcoord": {"category": "Analysis", "title": "Average Coordinates (avgcoord)", "description": "Average coordinates for each atom over trajectory.", "syntax": "avgcoord [SETNAME] [<mask>] [out <file>]"}, |
| "avgbox": {"category": "Analysis", "title": "Average Box (avgbox)", "description": "Compute average box dimensions over trajectory.", "syntax": "avgbox [SETNAME] [out <file>]"}, |
| "bounds": {"category": "Analysis", "title": "Bounding Box (bounds)", "description": "Calculate bounding box around atoms.", "syntax": "bounds [SETNAME] [<mask>] [out <file>] [dx <dx>] [offset <offset>]"}, |
| "principal": {"category": "Analysis", "title": "Principal Axes (principal)", "description": "Calculate principal axes and moments of inertia.", "syntax": "principal [SETNAME] [<mask>] [out <file>] [dorotation] [mass]"}, |
| "dipole": {"category": "Analysis", "title": "Dipole Moment (dipole)", "description": "Calculate dipole moment of selection.", "syntax": "dipole [SETNAME] [<mask>] [out <file>] [<grid_options>]"}, |
| "volume": {"category": "Analysis", "title": "Unit Cell Volume (volume)", "description": "Calculate unit cell volume over trajectory.", "syntax": "volume [SETNAME] [out <file>]"}, |
| "temperature": {"category": "Analysis", "title": "Temperature (temperature)", "description": "Calculate instantaneous temperature from velocities.", "syntax": "temperature [SETNAME] [<mask>] [out <file>] [frame]"}, |
| "energy": {"category": "Analysis", "title": "Energy (energy)", "description": "Calculate energy using internal force field (bond, angle, dihedral, VdW, electrostatic).", "syntax": "energy [<mask>] [out <file>] [bond] [angle] [dih] [vdw] [elec]"}, |
| "esander": {"category": "Analysis", "title": "Energy via Sander (esander)", "description": "Calculate energy using sander AMBER engine.", "syntax": "esander [<mask>] [out <file>] [igb <igb>] [cut <cut>]"}, |
| "enedecomp": {"category": "Analysis", "title": "Energy Decomposition (enedecomp)", "description": "Energy decomposition per residue.", "syntax": "enedecomp [<mask>] [out <file>] [cut <cut>]"}, |
| "pairwise": {"category": "Analysis", "title": "Pairwise Energy (pairwise)", "description": "Pairwise energy decomposition between residues.", "syntax": "pairwise [<mask>] [out <file>] [cut <cut>] [cuteelec <c>] [cutevdw <c>]"}, |
| "lie": {"category": "Analysis", "title": "Linear Interaction Energy (lie)", "description": "Linear interaction energy calculation.", "syntax": "lie <mask1> [<mask2>] [out <file>] [elec <scale>] [vdw <scale>]"}, |
| "ti": {"category": "Analysis", "title": "Thermodynamic Integration (ti)", "description": "Thermodynamic integration (TI) free energy calculation.", "syntax": "ti <dset0> [<dset1>...] {nq <n>|xvals <x>} [out <file>] [name <name>]"}, |
| "spam": {"category": "Analysis", "title": "SPAM (spam)", "description": "Solvation parameters from analysis of MD.", "syntax": "spam <site_file> [out <file>] [name <name>] [DG <dg>]"}, |
| "nastruct": {"category": "Analysis", "title": "Nucleic Acid Structure (nastruct)", "description": "Nucleic acid structure parameters: base pairs, helical parameters.", "syntax": "nastruct [SETNAME] [resrange <range>] [naout <suffix>] [sscalc] [noheader]"}, |
| "jcoupling": {"category": "Analysis", "title": "J-coupling (jcoupling)", "description": "Calculate J-coupling constants from dihedral angles using Karplus equation.", "syntax": "jcoupling [<mask>] [kfile <karplus_file>] [out <file>]"}, |
| "ired": {"category": "Analysis", "title": "iRED NMR (ired)", "description": "iRED analysis of NMR order parameters.", "syntax": "ired [relax freq <MHz>] [order <o>] [orderparamfile <f>] [tstep <t>] [tcorr <t>] [out <f>]"}, |
| "rotdif": {"category": "Analysis", "title": "Rotational Diffusion (rotdif)", "description": "Rotational diffusion analysis from NMR relaxation.", "syntax": "rotdif [out <file>] [rvecin <file>] [rseed <seed>] [nvecs <N>]"}, |
| "timecorr": {"category": "Analysis", "title": "Time Correlation (timecorr)", "description": "Time correlation function of vectors.", "syntax": "timecorr vec1 <set> [vec2 <set>] [out <file>] [tstep <t>] [tcorr <t>]"}, |
| "vector": {"category": "Analysis", "title": "Vector (vector)", "description": "Calculate a vector between two masks over time.", "syntax": "vector [SETNAME] <mask1> <mask2> [out <file>] [ired]"}, |
| "multivector": {"category": "Analysis", "title": "Multi-vector (multivector)", "description": "Calculate vectors for multiple residue pairs.", "syntax": "multivector [<mask>] [out <file>] [ired]"}, |
| "vectormath": {"category": "Analysis", "title": "Vector Math (vectormath)", "description": "Math operations on vector data sets: dot product, cross product, etc.", "syntax": "vectormath vec1 <set> [vec2 <set>] {dotproduct|crossproduct|...} [out <file>]"}, |
| "velocityautocorr": {"category": "Analysis", "title": "Velocity Autocorrelation (velocityautocorr)", "description": "Velocity autocorrelation function (VACF).", "syntax": "velocityautocorr [<mask>] [out <file>] [tstep <t>] [maxlag <m>] [norm]"}, |
| "lipidorder": {"category": "Analysis", "title": "Lipid Order Parameters (lipidorder)", "description": "Calculate lipid tail order parameters (Scd) for membrane systems.", "syntax": "lipidorder [<mask>] [out <file>] [scd] [unsat]"}, |
| "lipidscd": {"category": "Analysis", "title": "Lipid Scd (lipidscd)", "description": "Lipid Scd order parameter calculation.", "syntax": "lipidscd [<mask>] [out <file>]"}, |
| "areapermol": {"category": "Analysis", "title": "Area per Molecule (areapermol)", "description": "Calculate area per molecule for lipid bilayers.", "syntax": "areapermol [SETNAME] [out <file>] [<mask>] [frame]"}, |
| "mindist": {"category": "Analysis", "title": "Min/Max Distance (mindist)", "description": "Minimum and maximum distance between two masks.", "syntax": "mindist [SETNAME] <mask1> <mask2> [out <file>] [noimage]"}, |
| "pairdist": {"category": "Analysis", "title": "Pairwise Distance (pairdist)", "description": "Pairwise distance histogram between all atom pairs.", "syntax": "pairdist [SETNAME] [<mask>] [out <file>] [delta <dx>] [max <max>]"}, |
| "hausdorff": {"category": "Analysis", "title": "Hausdorff Distance (hausdorff)", "description": "Calculate Hausdorff distance between two masks.", "syntax": "hausdorff [SETNAME] <mask1> <mask2> [out <file>]"}, |
| "tordiff": {"category": "Analysis", "title": "Torsion Difference (tordiff)", "description": "Torsion angle difference between two structures.", "syntax": "tordiff [<mask>] [out <file>] [ref <ref>]"}, |
| "autocorr": {"category": "Analysis", "title": "Autocorrelation (autocorr)", "description": "Autocorrelation function of a data set.", "syntax": "autocorr <dataset> [out <file>] [lagmax <max>] [norm] [direct]"}, |
| "crosscorr": {"category": "Analysis", "title": "Cross-correlation (crosscorr)", "description": "Cross-correlation between two data sets.", "syntax": "crosscorr <set1> <set2> [out <file>] [lagmax <max>] [norm] [direct]"}, |
| "lifetime": {"category": "Analysis", "title": "Lifetime Analysis (lifetime)", "description": "Lifetime analysis of hydrogen bonds or contacts.", "syntax": "lifetime <dataset> [out <file>] [window <w>] [cut <cut>] [name <name>]"}, |
| "runningavg": {"category": "Analysis", "title": "Running Average (runningavg)", "description": "Running average (sliding window) of a data set.", "syntax": "runningavg <dataset> [out <file>] [window <w>]"}, |
| "integrate": {"category": "Analysis", "title": "Integrate (integrate)", "description": "Integrate a data set using the trapezoidal rule.", "syntax": "integrate <dataset> [out <file>]"}, |
| "slope": {"category": "Analysis", "title": "Slope / Linear Fit (slope)", "description": "Calculate slope of a data set by linear fit.", "syntax": "slope <dataset> [out <file>]"}, |
| "regress": {"category": "Analysis", "title": "Linear Regression (regress)", "description": "Linear regression of a data set.", "syntax": "regress <dataset> [out <file>] [results <file>]"}, |
| "curvefit": {"category": "Analysis", "title": "Curve Fitting (curvefit)", "description": "Fit data to a functional form.", "syntax": "curvefit <function> <dataset> [out <file>] [results <file>] [nofit]"}, |
| "kde": {"category": "Analysis", "title": "KDE (kde)", "description": "Kernel density estimation of a data set.", "syntax": "kde <dataset> [out <file>] [bandwidth <bw>] [bins <N>]"}, |
| "fft": {"category": "Analysis", "title": "FFT (fft)", "description": "Fast Fourier Transform of a data set.", "syntax": "fft <dataset> [out <file>] [dt <timestep>] [fftout <file>]"}, |
| "wavelet": {"category": "Analysis", "title": "Wavelet Analysis (wavelet)", "description": "Wavelet analysis of trajectory data.", "syntax": "wavelet [<mask>] [out <file>] [type <wavelet>] [nb <N>]"}, |
| "filter": {"category": "Analysis", "title": "Filter Frames (filter)", "description": "Filter frames based on dataset value criteria.", "syntax": "filter <dataset> min <min> max <max>"}, |
| "divergence": {"category": "Analysis", "title": "KL Divergence (divergence)", "description": "Calculate KL divergence between two distributions.", "syntax": "divergence <set1> <set2> [out <file>]"}, |
| "lowestcurve": {"category": "Analysis", "title": "Lowest Free Energy Curve (lowestcurve)", "description": "Compute lowest free energy curve from 2D data.", "syntax": "lowestcurve <dataset> [out <file>] [step <s>]"}, |
| "meltcurve": {"category": "Analysis", "title": "Melting Curve (meltcurve)", "description": "Generate melting curve from temperature-dependent data.", "syntax": "meltcurve <dataset> [out <file>] [norm]"}, |
| "multicurve": {"category": "Analysis", "title": "Multi-curve Fit (multicurve)", "description": "Fit multiple exponential curves to data.", "syntax": "multicurve [<dataset>] [out <file>] [nexp <N>]"}, |
| "multihist": {"category": "Analysis", "title": "Multi-histogram (multihist)", "description": "Histogram multiple data sets simultaneously.", "syntax": "multihist <set1> [<set2>...] [out <file>] [bins <N>]"}, |
| "calcstate": {"category": "Analysis", "title": "Calculate State (calcstate)", "description": "Calculate state of system using HMM or thresholds.", "syntax": "calcstate [name <name>] [out <file>] <state_args>"}, |
| "checkoverlap": {"category": "Analysis", "title": "Check Overlaps (checkoverlap)", "description": "Check for bad atomic overlaps/clashes.", "syntax": "check [<mask>] [cut <cut>] [noimage] [out <file>]"}, |
| "cphstats": {"category": "Analysis", "title": "Constant-pH Stats (cphstats)", "description": "Analyze constant-pH simulation statistics.", "syntax": "cphstats <cpin> {<cpout> [<cpout2> ...]} [out <file>] [deprot <file>]"}, |
| "remlog": {"category": "Analysis", "title": "REMD Log Analysis (remlog)", "description": "Analyze replica exchange log files.", "syntax": "remlog <remlogfile> [out <file>] [nstlim <N>] [temp0 <T>]"}, |
| |
| "mask_syntax": {"category": "Reference", "title": "Atom Mask Syntax", "description": "cpptraj atom selection: :resnum @atomname :resname ! & | < >. Examples: :1-100 @CA !:WAT :LIG<:5.0", "syntax": ":1-100 @CA !:WAT :LIG<:5.0 @CA,C,N,O"}, |
| } |
|
|
# Ready-made cpptraj scripts offered by the UI, keyed by a short template id.
# Every entry carries a display "title", a one-line "description", and the
# "script" text itself. Scripts are spelled one cpptraj input line per Python
# literal (adjacent literals are concatenated by the compiler), so the stored
# value is a single newline-terminated string.
SCRIPT_TEMPLATES = {
    "basic_rmsd": {
        "title": "RMSD + RMSF + Rg",
        "description": "Backbone RMSD, per-residue RMSF, radius of gyration",
        "script": (
            "parm topology.prmtop\n"
            "trajin trajectory.nc\n"
            "\n"
            "autoimage\n"
            "center !:WAT origin\n"
            "\n"
            "rmsd backbone @CA,C,N,O first out rmsd.dat\n"
            "atomicfluct rmsf @CA byres out rmsf.dat\n"
            "radgyr rg !:WAT mass out rg.dat\n"
            "\n"
            "go\n"
        ),
    },
    "full_protein": {
        "title": "Full Protein Analysis",
        "description": "RMSD, RMSF, Rg, H-bonds, secondary structure",
        "script": (
            "parm topology.prmtop\n"
            "trajin trajectory.nc\n"
            "\n"
            "autoimage\n"
            "center !:WAT origin\n"
            "\n"
            "rmsd bb_rmsd @CA,C,N,O first out rmsd.dat\n"
            "atomicfluct rmsf @CA byres out rmsf.dat\n"
            "radgyr rg !:WAT mass out rg.dat\n"
            "hbond hbonds !:WAT out hbond.dat avgout hbond_avg.dat\n"
            "secstruct ss out secstruct.dat sumout secstruct_sum.dat\n"
            "\n"
            "go\n"
        ),
    },
    "clustering": {
        "title": "Trajectory Clustering",
        "description": "Hierarchical clustering + representative structures",
        "script": (
            "parm topology.prmtop\n"
            "trajin trajectory.nc\n"
            "\n"
            "autoimage\n"
            "strip :WAT,Na+,Cl-\n"
            "\n"
            "cluster clusters @CA hieragglo epsilon 2.0 sieve 10"
            " out cluster_assign.dat summary cluster_sum.dat"
            " info cluster_info.dat repout cluster_rep repfmt pdb\n"
            "\n"
            "go\n"
        ),
    },
    # NOTE(review): this template uses the `reference` keyword without a
    # preceding `reference` command, and the `analyze modes ... evectors` /
    # `projection ... evecvecs` lines look doubtful -- TODO confirm against
    # the cpptraj manual (diagmatrix / projection sections) before relying
    # on it. The text is kept exactly as it was.
    "pca": {
        "title": "PCA",
        "description": "Covariance matrix + projection onto first 3 modes",
        "script": (
            "parm topology.prmtop\n"
            "trajin trajectory.nc\n"
            "\n"
            "autoimage\n"
            "align @CA reference\n"
            "\n"
            "matrix covar pca_mat @CA out covar.dat\n"
            "analyze modes eigenvalues evectors pca_mat out pca_modes.dat\n"
            "projection pca_proj evecvecs pca_mat @CA out pca_proj.dat beg 1 end 3\n"
            "\n"
            "go\n"
        ),
    },
    "strip_solvent": {
        "title": "Strip Solvent & Save",
        "description": "Remove water/ions, write protein-only trajectory",
        "script": (
            "parm topology.prmtop\n"
            "trajin trajectory.nc\n"
            "\n"
            "autoimage\n"
            "strip :WAT,Na+,Cl-\n"
            "\n"
            "trajout protein_traj.nc\n"
            "\n"
            "go\n"
        ),
    },
}
|
|
| |
| |
| |
|
|
| |
# Section header: a dotted section number ("11.1" or "11.1.2") followed by a
# single command-like token, alone on its line -- e.g. "11.65 radgyr".
# The separators use [^\S\n] ("whitespace except newline") rather than \s so
# that a header can never straddle a line break; with \s a bare number at the
# end of one line would pair up with whatever word opens the next line.
_SECTION_RE = re.compile(
    r'^(\d+\.\d+(?:\.\d+)?)[^\S\n]+([a-zA-Z][a-zA-Z0-9_\-|/]{1,30})[^\S\n]*$',
    re.MULTILINE,
)
# Chapter header: an integer followed by a capitalised title made of letters
# and spaces, alone on its line -- e.g. "11 Action Commands". Same
# newline-safe separators as above, so a page-number line followed by a
# capitalised line is not mistaken for a chapter.
_CHAPTER_RE = re.compile(
    r'^(\d+)[^\S\n]+([A-Z][A-Za-z ]{3,50})[^\S\n]*$',
    re.MULTILINE,
)

# Chunks shorter than this many characters are discarded by the chunker.
_MIN_CHUNK_CHARS = 100
# Chunks longer than this many characters are cut off by the chunker.
_MAX_CHUNK_CHARS = 6000
|
|
|
|
def _extract_pdf_text() -> list[dict]:
    """
    Extract the text of every page of the manual at PDF_PATH.

    The result is cached as JSON at CACHE_PATH. A readable cache holding a
    JSON list is returned as-is without opening the PDF. A cache that cannot
    be read or parsed (for instance a file truncated by an interrupted write)
    is reported and ignored, and the PDF is extracted again.

    Returns:
        A list of page dicts ``[{"page": int, "text": str}, ...]`` with
        1-based page numbers; pages without extractable text get "".

    Raises:
        ImportError: if the PDF has to be parsed and pdfplumber is missing.
        Other exceptions raised while opening/parsing the PDF propagate.
    """
    if CACHE_PATH.exists():
        try:
            with open(CACHE_PATH, encoding="utf-8") as f:
                cached = json.load(f)
        except (OSError, ValueError) as err:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"[RAG] Ignoring unreadable cache {CACHE_PATH.name}: {err}")
        else:
            if isinstance(cached, list):
                return cached
            print(f"[RAG] Ignoring cache {CACHE_PATH.name}: unexpected layout")

    # Imported lazily so the module stays usable (from cache, or with the
    # built-in docs) when pdfplumber is not installed.
    try:
        import pdfplumber
    except ImportError as err:
        raise ImportError(
            "pdfplumber is required to parse the manual: pip install pdfplumber"
        ) from err

    print("[RAG] Extracting text from CpptrajManual.pdf (one-time, ~10 s)…")
    with pdfplumber.open(PDF_PATH) as pdf:
        pages = [
            {"page": number, "text": page.extract_text() or ""}
            for number, page in enumerate(pdf.pages, start=1)
        ]

    # Write to a sibling temp file and rename it into place so an interrupted
    # run cannot leave a half-written cache behind. The cache is only an
    # optimisation: if it cannot be written the extracted pages are still
    # returned.
    tmp_path = CACHE_PATH.with_name(CACHE_PATH.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(pages, f, ensure_ascii=False)
        tmp_path.replace(CACHE_PATH)
    except OSError as err:
        print(f"[RAG] Extracted {len(pages)} pages (cache not written: {err})")
    else:
        print(f"[RAG] Extracted {len(pages)} pages, cached to {CACHE_PATH.name}")
    return pages
|
|
|
|
def _chunk_manual(pages: list[dict]) -> list[dict]:
    """
    Split the manual text into per-section chunks.

    Strategy:
      - Concatenate all pages (each followed by a blank line) into one string
        and remember the character offset at which every page starts.
      - Every match of _SECTION_RE (e.g. "11.1 rmsd") or _CHAPTER_RE opens a
        chunk that runs up to the next match, or to the end of the text.
        Text that precedes the first match is not emitted.
      - Chunks shorter than _MIN_CHUNK_CHARS are dropped (they are NOT merged
        into a neighbour); chunks longer than _MAX_CHUNK_CHARS are cut at
        that length and marked "[truncated]".
      - If fewer than 20 section chunks survive, header detection is treated
        as failed and one chunk per sufficiently long page is returned
        instead.

    Args:
        pages: page dicts ``{"page": int, "text": str}`` in reading order.

    Returns:
        Chunk dicts with the keys "id", "header", "cmd_name", "text", "page"
        and "source".
    """
    import bisect  # local import: only needed for the page lookup below

    min_section_chunks = 20  # below this, fall back to page-level chunks

    # Assemble the full text once and record where each page begins.
    page_starts: list[int] = []
    page_numbers: list[int] = []
    pieces: list[str] = []
    offset = 0
    for p in pages:
        page_starts.append(offset)
        page_numbers.append(p["page"])
        piece = p["text"] + "\n\n"
        pieces.append(piece)
        offset += len(piece)
    full_text = "".join(pieces)

    def char_to_page(pos: int) -> int:
        """Return the number of the page containing character offset *pos*."""
        idx = bisect.bisect_right(page_starts, pos) - 1
        return page_numbers[idx] if idx >= 0 else 1

    # (offset, header line, lower-cased name) for every header, in text order.
    boundaries = sorted(
        (
            (m.start(), m.group(0).strip(), m.group(2).lower())
            for pattern in (_SECTION_RE, _CHAPTER_RE)
            for m in pattern.finditer(full_text)
        ),
        key=lambda b: b[0],
    )

    chunks = []
    for i, (pos, header, cmd_name) in enumerate(boundaries):
        end = boundaries[i + 1][0] if i + 1 < len(boundaries) else len(full_text)
        text = full_text[pos:end].strip()

        if len(text) < _MIN_CHUNK_CHARS:
            continue

        if len(text) > _MAX_CHUNK_CHARS:
            text = text[:_MAX_CHUNK_CHARS] + "\n… [truncated]"

        chunks.append({
            # The id uses the boundary index, so ids stay unique but are not
            # consecutive when short chunks were skipped.
            "id": f"manual_sec_{i}",
            "header": header,
            "cmd_name": cmd_name,
            "text": text,
            "page": char_to_page(pos),
            "source": "CpptrajManual.pdf",
        })

    if len(chunks) < min_section_chunks:
        print("[RAG] Section detection found few chunks — falling back to page-level chunking")
        chunks = [
            {
                "id": f"page_{p['page']}",
                "header": f"Page {p['page']}",
                "cmd_name": "",
                "text": p["text"][:_MAX_CHUNK_CHARS],
                "page": p["page"],
                "source": "CpptrajManual.pdf",
            }
            for p in pages
            if len(p["text"]) >= _MIN_CHUNK_CHARS
        ]

    return chunks
|
|
|
|
| |
| |
| |
|
|
| class CPPTrajKnowledgeBase: |
| """ |
| RAG over the real CpptrajManual.pdf using TF-IDF retrieval. |
| Falls back gracefully if the PDF is not found. |
| """ |
|
|
| def __init__(self): |
| self._chunks: list[dict] = [] |
| self._texts: list[str] = [] |
| self.vectorizer: TfidfVectorizer | None = None |
| self.tfidf_matrix = None |
| self._pdf_available = False |
|
|
| self._load() |
|
|
| def _load(self): |
| if not PDF_PATH.exists(): |
| print(f"[RAG] Warning: {PDF_PATH} not found β using built-in command docs only.") |
| self._build_fallback_index() |
| return |
|
|
| try: |
| pages = _extract_pdf_text() |
| chunks = _chunk_manual(pages) |
| if not chunks: |
| self._build_fallback_index() |
| return |
|
|
| self._chunks = chunks |
| self._texts = [c["text"] for c in chunks] |
| self._pdf_available = True |
| print(f"[RAG] Loaded {len(chunks)} chunks from manual (pages 1β{pages[-1]['page']})") |
| except Exception as e: |
| print(f"[RAG] PDF load error: {e} β using built-in docs.") |
| self._build_fallback_index() |
| return |
|
|
| self._build_tfidf() |
|
|
| def _build_fallback_index(self): |
| """Build a minimal TF-IDF index from the built-in CPPTRAJ_COMMANDS.""" |
| for k, doc in CPPTRAJ_COMMANDS.items(): |
| text = f"{doc['title']} {doc['description']} {doc['syntax']} {k}" |
| self._chunks.append({"id": k, "header": doc["title"], "cmd_name": k, |
| "text": text, "page": 0, "source": "built-in"}) |
| self._texts.append(text) |
| self._build_tfidf() |
|
|
| def _build_tfidf(self): |
| self.vectorizer = TfidfVectorizer( |
| ngram_range=(1, 2), |
| stop_words="english", |
| min_df=1, |
| max_features=50_000, |
| ) |
| self.tfidf_matrix = self.vectorizer.fit_transform(self._texts) |
|
|
| |
|
|
| def retrieve(self, query: str, top_k: int = 6) -> list[dict]: |
| """Return top-k most relevant chunks for a query.""" |
| if self.vectorizer is None: |
| return [] |
| q_vec = self.vectorizer.transform([query]) |
| scores = cosine_similarity(q_vec, self.tfidf_matrix).flatten() |
| top_idx = np.argsort(scores)[::-1][:top_k] |
| return [ |
| {"chunk": self._chunks[i], "score": float(scores[i])} |
| for i in top_idx if scores[i] > 0.0 |
| ] |
|
|
| def get_command_cheatsheet(self) -> str: |
| """Compact one-liner per command β injected once into the system prompt.""" |
| cats: dict[str, list[str]] = {} |
| for key, v in CPPTRAJ_COMMANDS.items(): |
| cat = v["category"] |
| cats.setdefault(cat, []) |
| cats[cat].append(f" {key:<20s} {v['syntax']}") |
| lines = [ |
| "## cpptraj Command Reference", |
| "Syntax legend: [SETNAME] = positional output dataset name (first arg, no keyword); [<arg>] = optional named argument.\n", |
| ] |
| for cat in ("Setup", "Manipulation", "Analysis", "Output"): |
| if cat not in cats: |
| continue |
| lines.append(f"# {cat}") |
| lines.extend(cats[cat]) |
| return "\n".join(lines) |
|
|
| def get_context_for_llm(self, query: str, top_k: int = 3, |
| score_threshold: float = 0.10) -> str: |
| """ |
| Return full manual chunks only when TF-IDF relevance > threshold. |
| Returns empty string if nothing is relevant enough (model uses cheatsheet). |
| """ |
| results = self.retrieve(query, top_k=top_k) |
| results = [r for r in results if r["score"] >= score_threshold] |
| if not results: |
| return "" |
|
|
| lines = [ |
| "=== CPPTRAJ MANUAL β RELEVANT SECTIONS ===", |
| "(Use the EXACT command name from each section header, e.g. '11.65 radgyr' β use `radgyr`)\n", |
| ] |
| for r in results: |
| c = r["chunk"] |
| pg = f"p.{c['page']}" if c["page"] else c["source"] |
| lines.append(f"--- {c['header']} [{pg} relevance:{r['score']:.2f}] ---") |
| lines.append(c["text"]) |
| lines.append("") |
| return "\n".join(lines) |
|
|
| def get_all_commands(self) -> dict: return CPPTRAJ_COMMANDS |
| def get_command(self, key) -> dict | None: return CPPTRAJ_COMMANDS.get(key) |
| def get_categories(self) -> list: return sorted(set(d["category"] for d in CPPTRAJ_COMMANDS.values())) |
| def get_by_category(self, cat)-> dict: return {k: v for k, v in CPPTRAJ_COMMANDS.items() if v["category"] == cat} |
| def get_script_templates(self)-> dict: return SCRIPT_TEMPLATES |
|
|
| @property |
| def pdf_available(self) -> bool: |
| return self._pdf_available |
|
|
| @property |
| def n_chunks(self) -> int: |
| return len(self._chunks) |
|
|