Line data Source code
1 : /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 : Copyright (c) 2012-2023 The plumed team
3 : (see the PEOPLE file at the root of the distribution for a list of names)
4 :
5 : See http://www.plumed.org for more information.
6 :
7 : This file is part of plumed, version 2.
8 :
9 : plumed is free software: you can redistribute it and/or modify
10 : it under the terms of the GNU Lesser General Public License as published by
11 : the Free Software Foundation, either version 3 of the License, or
12 : (at your option) any later version.
13 :
14 : plumed is distributed in the hope that it will be useful,
15 : but WITHOUT ANY WARRANTY; without even the implied warranty of
16 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 : GNU Lesser General Public License for more details.
18 :
19 : You should have received a copy of the GNU Lesser General Public License
20 : along with plumed. If not, see <http://www.gnu.org/licenses/>.
21 : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
22 : #include "core/ActionShortcut.h"
23 : #include "core/ActionRegister.h"
24 : #include "core/ActionWithArguments.h"
25 : #include "core/PlumedMain.h"
26 : #include "tools/PDB.h"
27 :
28 : namespace PLMD {
29 : namespace generic {
30 :
31 : //+PLUMEDOC COLVAR PDB2CONSTANT
32 : /*
33 : Create a constant value from a PDB input file
34 :
35 : This shortcut converts the contents of a PDB file to one or more [CONSTANT](CONSTANT.md) actions.
36 : Converting PDB files to Values in this way is useful because it means that when we implement methods, like those
37 : in the [refdist](module_refdist.md) or [mapping](module_mapping.md) modules or the [RMSD](RMSD.md) action, that calculate the distance between two
38 : configurations those two configurations are both stored in PLUMED values. The same code can thus be used to
39 : calculate the difference between the instantaneous configuration
40 : and a constant reference configuration that was read from a file or between two reference configuration.
41 :
42 : The following example illustrates how this action can be used to read a set of reference atomic positions
43 :
44 : ```plumed
45 : #SETTINGS INPUTFILES=regtest/basic/rt19/test0.pdb
46 : ref: PDB2CONSTANT REFERENCE=regtest/basic/rt19/test0.pdb
47 : ```
48 :
49 : You can see how the reference positions are converted to [CONSTANT](CONSTANT.md) action that outputs a vector by expanding the shortcut.
50 :
51 : You can also use this command to read in multiple reference positions as illustrated below:
52 :
53 : ```plumed
54 : #SETTINGS INPUTFILES=regtest/mapping/rt-pathtools-3/all.pdb
55 : ref: PDB2CONSTANT REFERENCE=regtest/mapping/rt-pathtools-3/all.pdb
56 : ```
57 :
58 : The [CONSTANT](CONSTANT.md) that is created by this action is a matrix. Each row of the output matrix contains one set of reference positions.
59 : Notice also that if you have a PDB input which contains multiple reference configurations you can create a vector constant by using the `NUMBER`
60 : keyword to specify the particular configuration that you would like to read in as shown below:
61 :
62 : ```plumed
63 : #SETTINGS INPUTFILES=regtest/mapping/rt-pathtools-3/all.pdb
64 : ref: PDB2CONSTANT REFERENCE=regtest/mapping/rt-pathtools-3/all.pdb NUMBER=4
65 : ```
66 :
67 : The input above will reads in the fourth configuration in the input PDB file only.
68 :
69 : ## The PDB file format
70 :
71 : PLUMED uses the PDB file format here and in several other places
72 :
73 : - To read the molecular structure ([MOLINFO](MOLINFO.md)).
74 : - To read reference conformations ([RMSD](RMSD.md), but also in methods such as [FIT_TO_TEMPLATE](FIT_TO_TEMPLATE.md), etc).
75 :
76 : The implemented PDB reader expects a file formatted correctly according to the
77 : [PDB standard](http://www.wwpdb.org/documentation/file-format-content/format33/v3.3.html).
78 : In particular, the following columns are read from ATOM records
79 :
80 : ````
81 : columns | content
82 : 1-6 | record name (ATOM or HETATM)
83 : 7-11 | serial number of the atom (starting from 1)
84 : 13-16 | atom name
85 : 18-20 | residue name
86 : 22 | chain id
87 : 23-26 | residue number
88 : 31-38 | x coordinate
89 : 39-46 | y coordinate
90 : 47-54 | z coordinate
91 : 55-60 | occupancy
92 : 61-66 | beta factor
93 : ````
94 :
95 : The PLUMED parser is slightly more permissive than the official PDB format
96 : in the fact that the format of real numbers is not fixed. In other words,
97 : any real number that can be parsed is OK and the dot can be placed anywhere. However,
98 : __columns are interpret strictly__. A sample PDB should look like the following
99 :
100 : ````
101 : ATOM 2 CH3 ACE 1 12.932 -14.718 -6.016 1.00 1.00
102 : ATOM 5 C ACE 1 21.312 -9.928 -5.946 1.00 1.00
103 : ATOM 9 CA ALA 2 19.462 -11.088 -8.986 1.00 1.00
104 : ````
105 :
106 : Notice that serial numbers need not to be consecutive. In the three-line example above,
107 : only the coordinates of three atoms are provided. This is perfectly legal and indicates to PLUMED
108 : that information about these atoms only is available. This could be both for structural
109 : information in [MOLINFO](MOLINFO.md), where the other atoms would have no name assigned, and for
110 : reference structures used in [RMSD](RMSD.md), where only the provided atoms would be used to compute RMSD.
111 :
112 : ## Including arguments in PDB files
113 :
114 : If you wish to specify reference values for PLUMED Values in the REMARKS of a PLUMED input file like this:
115 :
116 : ````
117 : REMARK t1=-4.3345
118 : REMARK t2=3.4725
119 : END
120 : ````
121 :
122 : You can read in these reference values by using the PDB2CONSTANT command as follows:
123 :
124 : ```plumed
125 : #SETTINGS INPUTFILES=regtest/mapping/rt-pathtools-4/epath.pdb
126 : t1: TORSION ATOMS=1,2,3,4
127 : t2: TORSION ATOMS=5,6,7,8
128 : t1_ref: PDB2CONSTANT REFERENCE=regtest/mapping/rt-pathtools-4/epath.pdb ARG=t1
129 : t2_ref: PDB2CONSTANT REFERENCE=regtest/mapping/rt-pathtools-4/epath.pdb ARG=t2
130 : ```
131 :
132 : In this case the input must define values with the labels that are being read in from the reference file
133 : and separate PDB2CONSTANT commands are required for reading in `t1` and `t2`. Furthermore, because the
134 : input PDB file contains multiple frames vectors containing all the values for `t1` and `t2` are output from
135 : the constant commands that are created by the shortcuts in the above input. If you want to read only one of the
136 : configurations in the input PDB file you can use a pdb with a single frame or the `NUMBER` keyword described above.
137 :
138 : If, for any reason, you want to read data from a PDB file that is not a reference value for one of the values defined in
139 : your PLUMED input file you use the NOARGS flag as shown below:
140 :
141 : ```plumed
142 : #SETTINGS INPUTFILES=regtest/mapping/rt-pathtools-4/epath.pdb
143 : t1_ref: PDB2CONSTANT REFERENCE=regtest/mapping/rt-pathtools-4/epath.pdb NOARGS ARG=t1
144 : t2_ref: PDB2CONSTANT REFERENCE=regtest/mapping/rt-pathtools-4/epath.pdb NOARGS ARG=t2
145 : ```
146 :
147 : ## Occupancy and beta factors
148 :
149 : PLUMED also reads the occupancy and beta factors from the input PDB files. However, these columns of data are
150 : given a very special meaning.
151 : In cases where the PDB structure is used as a reference for an alignment (that's the case
152 : for instance in [RMSD](RMSD.md) and in [FIT_TO_TEMPLATE](FIT_TO_TEMPLATE.md)), the occupancy column is used
153 : to provide the weight of each atom in the alignment. In cases where, perhaps after alignment,
154 : the displacement between running coordinates and the provided PDB is computed, the beta factors
155 : are used as weight for the displacement.
156 : Since setting the weights to zero is the same as __not__ including an atom in the alignment or
157 : displacement calculation, the two following reference files would be equivalent when used in an [RMSD](RMSD.md)
158 : calculation. First file:
159 :
160 : ````
161 : ATOM 2 CH3 ACE 1 12.932 -14.718 -6.016 1.00 1.00
162 : ATOM 5 C ACE 1 21.312 -9.928 -5.946 1.00 1.00
163 : ATOM 9 CA ALA 2 19.462 -11.088 -8.986 0.00 0.00
164 : ````
165 :
166 : Second file:
167 :
168 : ````
169 : ATOM 2 CH3 ACE 1 12.932 -14.718 -6.016 1.00 1.00
170 : ATOM 5 C ACE 1 21.312 -9.928 -5.946 1.00 1.00
171 : ````
172 :
173 : However notice that many extra atoms with zero weight might slow down the calculation, so
174 : removing lines is better than setting their weights to zero.
175 : In addition, weights for alignment need not to be equivalent to weights for displacement.
176 : Starting with PLUMED 2.7, if all the weights are set to zero they will be normalized to be equal to the
177 : inverse of the number of involved atoms. This means that it will be possible to use files with
178 : the weight columns set to zero obtaining a meaningful result. In previous PLUMED versions,
179 : setting all weights to zero was resulting in an error instead.
180 :
181 : ## Systems with more than 100k atoms
182 :
183 : Notice that it very likely does not make any sense to compute the [RMSD](RMSD.md) or any other structural
184 : deviation __using__ many atoms. However, if the protein for which you want to compute [RMSD](RMSD.md)
185 : has atoms with large serial numbers (e.g. because it is located __after__ solvent in the sorted list of atoms)
186 : you might end up with troubles with the limitations of the PDB format. Indeed, since there are 5
187 : columns available for atom serial number, this number cannot be larger than 99999.
188 : In addition, providing [MOLINFO](MOLINFO.md) with names associated to atoms with a serial larger than 99999 would be impossible.
189 :
190 : Since PLUMED 2.4 we allow the [hybrid 36](http://cci.lbl.gov/hybrid_36/) format to be used to specify atom numbers.
191 : This format is not particularly widespread, but has the nice feature that it provides a one-to-one mapping
192 : between numbers up to approximately 80 millions and strings with 5 characters, plus it is backward compatible
193 : for numbers smaller than 100000. This is not true for notations like the hex notation exported by VMD.
194 : Using the hybrid 36 format, the ATOM records for atom ranging from 99997 to 100002 would read like these:
195 :
196 : ````
197 : ATOM 99997 Ar X 1 45.349 38.631 15.116 1.00 1.00
198 : ATOM 99998 Ar X 1 46.189 38.631 15.956 1.00 1.00
199 : ATOM 99999 Ar X 1 46.189 39.471 15.116 1.00 1.00
200 : ATOM A0000 Ar X 1 45.349 39.471 15.956 1.00 1.00
201 : ATOM A0000 Ar X 1 45.349 38.631 16.796 1.00 1.00
202 : ATOM A0001 Ar X 1 46.189 38.631 17.636 1.00 1.00
203 : ````
204 :
205 : There are tools that can be found to translate from integers to strings and back using hybrid 36 format
206 : (a simple python script can be found [here](https://sourceforge.net/p/cctbx/code/HEAD/tree/trunk/iotbx/pdb/hybrid_36.py)).
207 : In addition, as of PLUMED 2.5, we provide a command line tool that can be used to renumber atoms in a PDB file.
208 :
209 : */
210 : //+ENDPLUMEDOC
211 :
212 : class PDB2Constant : public ActionShortcut {
213 : public:
214 : static void registerKeywords(Keywords& keys);
215 : explicit PDB2Constant(const ActionOptions&);
216 : };
217 :
218 : PLUMED_REGISTER_ACTION(PDB2Constant,"PDB2CONSTANT")
219 :
220 170 : void PDB2Constant::registerKeywords(Keywords& keys) {
221 170 : ActionShortcut::registerKeywords( keys );
222 170 : keys.add("compulsory","REFERENCE","a file in pdb format containing the reference structure");
223 170 : keys.add("compulsory","NUMBER","0","if there are multiple structures in the pdb file you can specify that you want the RMSD from a specific structure by specifying its place in the file here. If NUMBER=0 then the RMSD from all structures are computed");
224 170 : keys.addFlag("NOARGS",false,"the arguments that are being read from the PDB file are not in the plumed input");
225 340 : keys.addInputKeyword("optional","ARG","scalar/vector","read this single argument from the input rather than the atomic structure");
226 340 : keys.setValueDescription("scalar/vector/matrix","a value that is constructed from the information in the PDB file");
227 170 : keys.needsAction("CONSTANT");
228 170 : }
229 :
230 116 : PDB2Constant::PDB2Constant(const ActionOptions& ao):
231 : Action(ao),
232 116 : ActionShortcut(ao) {
233 : std::string input;
234 116 : parse("REFERENCE",input);
235 : unsigned frame;
236 116 : parse("NUMBER",frame);
237 116 : bool noargs=false;
238 : std::vector<std::string> argn;
239 232 : parseVector("ARG",argn);
240 : std::vector<Value*> theargs;
241 116 : if( argn.size()>0 ) {
242 75 : parseFlag("NOARGS",noargs);
243 75 : if( !noargs ) {
244 73 : ActionWithArguments::interpretArgumentList( argn, plumed.getActionSet(), this, theargs );
245 2 : } else if( argn.size()>1 ) {
246 0 : error("can only read one argument at a time from input pdb file");
247 : } else {
248 2 : log.printf(" reading argument %s from file \n", argn[0].c_str() );
249 : }
250 : }
251 116 : if( theargs.size()>1 ) {
252 0 : error("can only read one argument at a time from input pdb file");
253 : }
254 :
255 116 : FILE* fp=std::fopen(input.c_str(),"r");
256 : bool do_read=true;
257 : std::vector<double> vals;
258 116 : if(!fp) {
259 0 : plumed_merror("could not open reference file " + input);
260 : }
261 116 : unsigned natoms=0, nframes=0;
262 :
263 2278 : while ( do_read ) {
264 2278 : PDB mypdb;
265 2278 : do_read=mypdb.readFromFilepointer(fp,plumed.usingNaturalUnits(),0.1/plumed.getUnits().getLength());
266 2278 : if( !do_read && nframes>0 ) {
267 : break ;
268 : }
269 :
270 2162 : if( natoms==0 ) {
271 1379 : natoms = mypdb.getPositions().size();
272 783 : } else if( mypdb.getPositions().size()!=natoms ) {
273 0 : plumed_merror("mismatch between sizes of reference configurations");
274 : }
275 :
276 2162 : if( nframes+1==frame || frame==0 ) {
277 1684 : std::vector<double> align( mypdb.getOccupancy() );
278 : double asum=0;
279 10914 : for(unsigned i=0; i<align.size(); ++i) {
280 9230 : asum += align[i];
281 : }
282 1684 : if( asum>epsilon ) {
283 716 : double iasum = 1 / asum;
284 9946 : for(unsigned i=0; i<align.size(); ++i) {
285 9230 : align[i] *= iasum;
286 : }
287 968 : } else if( mypdb.size()>0 ) {
288 0 : double iasum = 1 / mypdb.size();
289 0 : for(unsigned i=0; i<align.size(); ++i) {
290 0 : align[i] = iasum;
291 : }
292 : }
293 : Vector center;
294 : center.zero();
295 10914 : for(unsigned i=0; i<mypdb.getPositions().size(); ++i) {
296 9230 : center += align[i]*mypdb.getPositions()[i];
297 : }
298 :
299 1684 : if( theargs.size()==0 && argn.size()==0 ) {
300 1856 : for(unsigned j=0; j<3; ++j) {
301 19254 : for(unsigned i=0; i<mypdb.getPositions().size(); ++i) {
302 17862 : vals.push_back( mypdb.getPositions()[i][j] - center[j] );
303 : }
304 : }
305 1220 : } else if( noargs ) {
306 20 : std::vector<double> argvals( 1 );
307 20 : if( !mypdb.getArgumentValue(argn[0], argvals ) ) {
308 0 : error("argument " + argn[0] + " was not set in pdb input");
309 : }
310 20 : vals.push_back( argvals[0] );
311 : } else {
312 1200 : std::vector<double> argvals( theargs[0]->getNumberOfValues() );
313 1200 : if( !mypdb.getArgumentValue(theargs[0]->getName(), argvals ) ) {
314 0 : error("argument " + theargs[0]->getName() + " was not set in pdb input");
315 : }
316 2402 : for(unsigned i=0; i<argvals.size(); ++i) {
317 1202 : vals.push_back( argvals[i] );
318 : }
319 : }
320 : }
321 2162 : nframes++;
322 2278 : }
323 116 : if( frame>0 ) {
324 68 : nframes=1;
325 : }
326 116 : std::fclose(fp);
327 : std::string rnum;
328 116 : plumed_assert( vals.size()>0 );
329 116 : Tools::convert( vals[0], rnum );
330 116 : std::string valstr = " VALUES=" + rnum;
331 19084 : for(unsigned i=1; i<vals.size(); ++i) {
332 18968 : Tools::convert( vals[i], rnum );
333 37936 : valstr += "," + rnum;
334 : }
335 116 : if( vals.size()>nframes ) {
336 : std::string nc, nr;
337 42 : Tools::convert( nframes, nr );
338 42 : Tools::convert( vals.size()/nframes, nc );
339 84 : readInputLine( getShortcutLabel() + ": CONSTANT NROWS=" + nr + " NCOLS=" + nc + valstr );
340 : } else {
341 148 : readInputLine( getShortcutLabel() + ": CONSTANT" + valstr );
342 : }
343 232 : }
344 :
345 : }
346 : }
|