Line data Source code
1 : /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 : Copyright (c) 2019-2023 The plumed team
3 : (see the PEOPLE file at the root of the distribution for a list of names)
4 :
5 : See http://www.plumed.org for more information.
6 :
7 : This file is part of plumed, version 2.
8 :
9 : plumed is free software: you can redistribute it and/or modify
10 : it under the terms of the GNU Lesser General Public License as published by
11 : the Free Software Foundation, either version 3 of the License, or
12 : (at your option) any later version.
13 :
14 : plumed is distributed in the hope that it will be useful,
15 : but WITHOUT ANY WARRANTY; without even the implied warranty of
16 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 : GNU Lesser General Public License for more details.
18 :
19 : You should have received a copy of the GNU Lesser General Public License
20 : along with plumed. If not, see <http://www.gnu.org/licenses/>.
21 : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
22 : #include "CLTool.h"
23 : #include "core/CLToolRegister.h"
24 : #include "tools/Communicator.h"
25 : #include "tools/Tools.h"
26 : #include "config/Config.h"
27 : #include "tools/PlumedHandle.h"
28 : #include "tools/Stopwatch.h"
29 : #include "tools/Log.h"
30 : #include "tools/DLLoader.h"
31 : #include "tools/Random.h"
32 :
33 : #include <cstdio>
34 : #include <string>
35 : #include <vector>
36 : #include <fstream>
37 : #include <atomic>
38 : #include <csignal>
39 : #include <cstdio>
40 : #include <random>
41 : #include <algorithm>
42 : #include <chrono>
43 : #include <string_view>
44 :
45 : namespace PLMD {
46 : namespace cltools {
47 :
48 : //+PLUMEDOC TOOLS benchmark
49 : /*
50 : benchmark is a lightweight reimplementation of driver focused on running benchmarks
51 :
52 : The main difference wrt driver is that it generates a trajectory in memory rather than reading it
53 : from a file. This allows to better time the overhead of the plumed library, without including
54 : the time needed to read the trajectory.
55 :
56 : It is also possible to load a separate version of the plumed kernel. This enables running
57 : benchmarks agaist previous plumed versions in a controlled setting, where systematic errors
58 : in the comparison are minimized.
59 :
60 : \par Examples
61 :
62 : First, you should create a sample `plumed.dat` file for testing. For instance:
63 :
64 : \plumedfile
65 : WHOLEMOLECULES ENTITY0=1-10000
66 : p: POSITION ATOM=1
67 : RESTRAINT ARG=p.x KAPPA=1 AT=0
68 : \endplumedfile
69 :
70 : Then you can test the performance of this input with the following command:
71 : \verbatim
72 : plumed benchmark
73 : \endverbatim
74 :
75 : You can also test a different (older) version of PLUMED with the same input. To do so,
76 : you should run
77 : \verbatim
78 : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so
79 : \endverbatim
80 :
81 : \warning It is necessary to use the `plumed-runtime` executable here to avoid conflicts between different
82 : plumed versions. You will find it in your path if you are using the non installed version of plumed,
83 : and in `$prefix/lib/plumed` if you installed plumed in $prefix,.
84 :
85 : \par Comparing multiple versions
86 :
87 : The best way to compare two versions of plumed on the same input is to pass multiple colon-separated kernels:
88 :
89 : \verbatim
90 : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so:/path2/to/lib/libplumedKernel.so:this
91 : \endverbatim
92 :
93 : Here `this` means the kernel of the version with which you are running the benchmark. This comparison runs the three
94 : instances simultaneously (alternating them) so that systematic differences in the load of your machine will affect them
95 : to the same extent.
96 :
97 : In case the different versions require modified plumed.dat files, or if you simply want to compare
98 : two different plumed input files that compute the same thing, you can also use multiple plumed input files:
99 :
100 : \verbatim
101 : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so:this --plumed plumed1.dat:plumed2.dat
102 : \endverbatim
103 :
104 : Similarly, you might want to run two different inputs using the same kernel, which can be obtained with:
105 :
106 : \verbatim
107 : plumed-runtime benchmark --plumed plumed1.dat:plumed2.dat
108 : \endverbatim
109 :
110 : \par Profiling
111 :
112 : If you want to attach a profiler on the fly to the process, you might find it convenient to use `--nsteps -1`.
113 : The simulation will run forever and can be interrupted with CTRL-C. When interrupted, the result of the timers
114 : should be displayed anyway.
115 : You can also run setting a maximum time with `--maxtime`.
116 :
117 : If you run a profiler when testing multiple PLUMED versions you might be confused by which function is from
118 : each version. It is recommended to recompile separate instances with a separate C++ namespace (`-DPLMD=PLUMED_version_1`)
119 : so that you will be able to distinguish them. In addition, compiling with `CXXFLAGS="-g -O3"` will make the profiling
120 : report more complete, likely including code lines.
121 :
122 : \par MPI runs
123 :
124 : You can run emulating a domain decomposition. This is done automatically if plumed has been compiled with MPI
125 : and you run with `mpirun`
126 :
127 : \verbatim
128 : mpirun -np 4 plumed-runtime benchmark
129 : \endverbatim
130 :
131 : If you load separate PLUMED instances as discussed above, they should all be compiled against the same MPI version.
132 : Notice that when using MPI signals (CTRL-C) might not work.
133 :
134 : Since some of the data transfer could happen asynchronously, you might want to use the `--sleep` option
135 : to simulate a lag between the `prepareCalc` and `performCalc` actions. This part of the calculation will not contribute
136 : to timer, but will obviously slow down your test.
137 :
138 : \par Output
139 :
140 : In the output you will see the usual reports about timing produced by the internal
141 : timers of the tested plumed instances.
142 : In addition, this tool will monitor the timing externally, with some slightly different criterion:
143 : - First, the initialization (construction of the input) will be shown with a separate timer,
144 : as well as the timing for the first step.
145 : - Second, the timer corresponding to the calculation will be split in three parts, reporting
146 : execution of the first 20% (warm-up) and the next two blocks of 40% each.
147 : - Finally, you might notice some discrepancy because some of the actions that are usually
148 : not expensive are not included in the internal timers. The external timer will
149 : thus provide a better estimate of the total elapsed time, including everything.
150 :
151 : The internal timers are still useful to monitor what happens at the different stages
152 : and, with \ref DEBUG `DETAILED_TIMERS`, what happens in each action.
153 :
154 : When you run multiple version, a comparative analisys of the time spent within PLUMED in the various
155 : instances will be done, showing the ratio between the total time and the time measured on the first
156 : instance, which will act as a reference. Errors will be estimated with bootstrapping. The warm-up phase will be discarded for
157 : this analysis.
158 :
159 : */
160 : //+ENDPLUMEDOC
161 :
162 : // We use an anonymous namespace here to avoid clashes with variables
163 : // declared in other parts of the code
164 : namespace {
165 :
166 : //this is a sugar for changing idea faster about the rng
167 : using generator = std::mt19937;
168 :
169 : std::atomic<bool> signalReceived(false);
170 :
171 : class SignalHandlerGuard {
172 : public:
173 0 : SignalHandlerGuard(int signal, void (*newHandler)(int)) : signal_(signal) {
174 : // Store the current handler before setting the new one
175 0 : prevHandler_ = std::signal(signal, newHandler);
176 0 : if (prevHandler_ == SIG_ERR) {
177 0 : throw std::runtime_error("Failed to set signal handler");
178 : }
179 0 : }
180 :
181 : ~SignalHandlerGuard() {
182 : // Restore the previous handler upon destruction
183 0 : std::signal(signal_, prevHandler_);
184 0 : }
185 :
186 : // Delete copy constructor and assignment operator to prevent copying
187 : SignalHandlerGuard(const SignalHandlerGuard&) = delete;
188 : SignalHandlerGuard& operator=(const SignalHandlerGuard&) = delete;
189 :
190 : private:
191 : int signal_;
192 : void (*prevHandler_)(int);
193 : };
194 :
195 0 : extern "C" void signalHandler(int signal) {
196 0 : if (signal == SIGINT) {
197 : signalReceived.store(true);
198 0 : fprintf(stderr, "Signal interrupt received\n");
199 : }
200 0 : if (signal == SIGTERM) {
201 : signalReceived.store(true);
202 0 : fprintf(stderr, "Signal termination received\n");
203 : }
204 0 : }
205 :
206 : /// This base class contains members that are movable with default operations
207 : struct KernelBase {
208 : std::string path;
209 : std::string plumed_dat;
210 : PlumedHandle handle;
211 : Stopwatch stopwatch;
212 : std::vector<long long int> timings;
213 : double comparative_timing=-1.0;
214 : double comparative_timing_error=-1.0;
215 0 : KernelBase(const std::string & path_,const std::string & plumed_dat_, Log* log_):
216 0 : path(path_),
217 0 : plumed_dat(plumed_dat_),
218 0 : handle([&]() {
219 0 : if(path_=="this") {
220 0 : return PlumedHandle();
221 : } else {
222 0 : return PlumedHandle::dlopen(path.c_str());
223 : }
224 : }()),
225 0 : stopwatch(*log_) {
226 0 : }
227 : };
228 :
229 : /// Local structure handling a kernel and the related timers.
230 : /// This structure specifically contain the Log, which needs special treatment
231 : /// in move semantics
232 : struct Kernel :
233 : public KernelBase {
234 : Log* log=nullptr;
235 0 : Kernel(const std::string & path_,const std::string & the_plumed_dat, Log* log_):
236 : KernelBase(path_,the_plumed_dat,log_),
237 0 : log(log_) {
238 : }
239 :
240 0 : ~Kernel() {
241 0 : if(log) {
242 0 : (*log)<<"\n";
243 0 : (*log) <<"Kernel: "<<path<<"\n";
244 0 : (*log) <<"Input: "<<plumed_dat<<"\n";
245 0 : if(comparative_timing>0.0) {
246 0 : (*log).printf("Comparative: %.3f +- %.3f\n",comparative_timing,comparative_timing_error);
247 : }
248 : }
249 0 : }
250 :
251 0 : Kernel(Kernel && other) noexcept:
252 : KernelBase(std::move(other)),
253 0 : log(other.log) {
254 0 : other.log=nullptr; // ensure no log is done in the moved away object
255 : }
256 :
257 0 : Kernel & operator=(Kernel && other) noexcept {
258 0 : if(this != &other) {
259 0 : KernelBase::operator=(std::move(other));
260 0 : log=other.log;
261 0 : other.log=nullptr; // ensure no log is done in the moved away object
262 : }
263 0 : return *this;
264 : }
265 : };
266 :
267 : namespace {
268 :
269 : class UniformSphericalVector {
270 : //double rminCub;
271 : double rCub;
272 :
273 : public:
274 : //assuming rmin=0
275 0 : UniformSphericalVector(const double rmax):
276 0 : rCub (rmax*rmax*rmax/*-rminCub*/) {}
277 0 : PLMD::Vector operator()(Random& rng) {
278 0 : double rho = std::cbrt (/*rminCub + */rng.RandU01()*rCub);
279 0 : double theta =std::acos (2.0*rng.RandU01() -1.0);
280 0 : double phi = 2.0 * PLMD::pi * rng.RandU01();
281 : return Vector (
282 0 : rho * sin (theta) * cos (phi),
283 0 : rho * sin (theta) * sin (phi),
284 0 : rho * cos (theta));
285 : }
286 : };
287 :
288 : ///Acts as a template for any distribution
289 : struct AtomDistribution {
290 : virtual void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random&)=0;
291 0 : virtual void box(std::vector<double>& box, unsigned /*natoms*/, unsigned /*step*/, Random&) {
292 : std::fill(box.begin(), box.end(),0);
293 0 : };
294 : virtual ~AtomDistribution() noexcept {}
295 : };
296 :
297 0 : struct theLine:public AtomDistribution {
298 0 : void positions(std::vector<Vector>& posToUpdate, unsigned step, Random&rng) override {
299 : auto nat = posToUpdate.size();
300 : UniformSphericalVector usv(0.5);
301 :
302 0 : for (unsigned i=0; i<nat; ++i) {
303 0 : posToUpdate[i] = Vector(i, 0, 0) + usv(rng);
304 : }
305 0 : }
306 : };
307 :
308 0 : struct uniformSphere:public AtomDistribution {
309 0 : void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
310 :
311 : //giving more or less a cubic udm of volume for each atom: V=nat
312 0 : const double rmax= std::cbrt ((3.0/(4.0*PLMD::pi)) * posToUpdate.size());
313 :
314 : UniformSphericalVector usv(rmax);
315 : auto s=posToUpdate.begin();
316 : auto e=posToUpdate.end();
317 : //I am using the iterators:this is slightly faster,
318 : // enough to overcome the cost of the vtable that I added
319 0 : for (unsigned i=0; s!=e; ++s,++i) {
320 0 : *s = usv (rng);
321 : }
322 :
323 0 : }
324 0 : void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
325 0 : const double rmax= 2.0*std::cbrt((3.0/(4.0*PLMD::pi)) * natoms);
326 0 : box[0]=rmax;
327 0 : box[1]=0.0;
328 0 : box[2]=0.0;
329 0 : box[3]=0.0;
330 0 : box[4]=rmax;
331 0 : box[5]=0.0;
332 0 : box[6]=0.0;
333 0 : box[7]=0.0;
334 0 : box[8]=rmax;
335 :
336 0 : }
337 : };
338 :
339 0 : struct twoGlobs: public AtomDistribution {
340 0 : virtual void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random&rng) {
341 : //I am using two unigform spheres and 2V=n
342 0 : const double rmax= std::cbrt ((3.0/(8.0*PLMD::pi)) * posToUpdate.size());
343 :
344 : UniformSphericalVector usv(rmax);
345 : std::array<Vector,2> centers{
346 : PLMD::Vector{0.0,0.0,0.0},
347 : //so they do not overlap
348 : PLMD::Vector{2.0*rmax,2.0*rmax,2.0*rmax}
349 0 : };
350 0 : std::generate(posToUpdate.begin(),posToUpdate.end(),[&]() {
351 : //RandInt is only declared
352 : // return usv (rng) + centers[rng.RandInt(1)];
353 0 : return usv (rng) + centers[rng.RandU01()>0.5];
354 : });
355 0 : }
356 :
357 0 : virtual void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) {
358 :
359 0 : const double rmax= 4.0 * std::cbrt ((3.0/(8.0*PLMD::pi)) * natoms);
360 0 : box[0]=rmax;
361 0 : box[1]=0.0;
362 0 : box[2]=0.0;
363 0 : box[3]=0.0;
364 0 : box[4]=rmax;
365 0 : box[5]=0.0;
366 0 : box[6]=0.0;
367 0 : box[7]=0.0;
368 0 : box[8]=rmax;
369 0 : };
370 : };
371 :
372 0 : struct uniformCube:public AtomDistribution {
373 0 : void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
374 : //giving more or less a cubic udm of volume for each atom: V = nat
375 0 : const double rmax = std::cbrt(static_cast<double>(posToUpdate.size()));
376 :
377 :
378 :
379 : // std::generate(posToUpdate.begin(),posToUpdate.end(),[&]() {
380 : // return Vector (rndR(rng),rndR(rng),rndR(rng));
381 : // });
382 : auto s=posToUpdate.begin();
383 : auto e=posToUpdate.end();
384 : //I am using the iterators:this is slightly faster,
385 : // enough to overcome the cost of the vtable that I added
386 0 : for (unsigned i=0; s!=e; ++s,++i) {
387 0 : *s = Vector (rng.RandU01()*rmax,rng.RandU01()*rmax,rng.RandU01()*rmax);
388 : }
389 0 : }
390 0 : void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
391 : //+0.05 to avoid overlap
392 0 : const double rmax= std::cbrt(natoms)+0.05;
393 0 : box[0]=rmax;
394 0 : box[1]=0.0;
395 0 : box[2]=0.0;
396 0 : box[3]=0.0;
397 0 : box[4]=rmax;
398 0 : box[5]=0.0;
399 0 : box[6]=0.0;
400 0 : box[7]=0.0;
401 0 : box[8]=rmax;
402 :
403 0 : }
404 : };
405 :
406 0 : struct tiledSimpleCubic:public AtomDistribution {
407 0 : void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
408 : //Tiling the space in this way will not tests 100% the pbc, but
409 : //I do not think that write a spacefilling curve, like Hilbert, Peano or Morton
410 : //could be a good idea, in this case
411 :
412 : // the lambda is here because in some case a perfect cube is not well represented
413 : // by double and will get uncorrectly ceiled up (for example 24389 will result in a 30 and not in 29)
414 0 : const unsigned rmax = [&] {
415 0 : const size_t x = std::ceil(std::cbrt(static_cast<double>(posToUpdate.size())))-1;
416 0 : if ( x*x*x >= posToUpdate.size()) {
417 : return x;
418 : }
419 0 : return x+1;
420 0 : }();
421 :
422 : auto s=posToUpdate.begin();
423 : auto e=posToUpdate.end();
424 : //I am using the iterators:this is slightly faster,
425 : // enough to overcome the cost of the vtable that I added
426 0 : for (unsigned k=0; k<rmax&&s!=e; ++k) {
427 0 : for (unsigned j=0; j<rmax&&s!=e; ++j) {
428 0 : for (unsigned i=0; i<rmax&&s!=e; ++i) {
429 0 : *s = Vector (i,j,k);
430 : ++s;
431 : }
432 : }
433 : }
434 0 : }
435 0 : void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
436 0 : const double rmax= std::ceil(std::cbrt(static_cast<double>(natoms)));;
437 0 : box[0]=rmax;
438 0 : box[1]=0.0;
439 0 : box[2]=0.0;
440 0 : box[3]=0.0;
441 0 : box[4]=rmax;
442 0 : box[5]=0.0;
443 0 : box[6]=0.0;
444 0 : box[7]=0.0;
445 0 : box[8]=rmax;
446 :
447 0 : }
448 : };
449 0 : std::unique_ptr<AtomDistribution> getAtomDistribution(std::string_view atomicDistr) {
450 0 : std::unique_ptr<AtomDistribution> distribution;
451 0 : if(atomicDistr == "line") {
452 0 : distribution = std::make_unique<theLine>();
453 0 : } else if (atomicDistr == "cube") {
454 0 : distribution = std::make_unique<uniformCube>();
455 0 : } else if (atomicDistr == "sphere") {
456 0 : distribution = std::make_unique<uniformSphere>();
457 0 : } else if (atomicDistr == "globs") {
458 0 : distribution = std::make_unique<twoGlobs>();
459 0 : } else if (atomicDistr == "sc") {
460 0 : distribution = std::make_unique<tiledSimpleCubic>();
461 : } else {
462 0 : plumed_error() << R"(The atomic distribution can be only "line", "cube", "sphere", "globs" and "sc", the input was ")"
463 0 : << atomicDistr <<'"';
464 : }
465 0 : return distribution;
466 0 : }
467 : } //anonymus namespace for benchmark distributions
468 : class Benchmark:
469 : public CLTool {
470 : public:
471 : static void registerKeywords( Keywords& keys );
472 : explicit Benchmark(const CLToolOptions& co );
473 : int main(FILE* in, FILE*out,Communicator& pc) override;
474 4 : std::string description()const override {
475 4 : return "run a calculation with a fixed trajectory to find bottlenecks in PLUMED";
476 : }
477 : };
478 :
479 17653 : PLUMED_REGISTER_CLTOOL(Benchmark,"benchmark")
480 :
481 5883 : void Benchmark::registerKeywords( Keywords& keys ) {
482 5883 : CLTool::registerKeywords( keys );
483 11766 : keys.add("compulsory","--plumed","plumed.dat","colon separated path(s) to the input file(s)");
484 11766 : keys.add("compulsory","--kernel","this","colon separated path(s) to kernel(s)");
485 11766 : keys.add("compulsory","--natoms","100000","the number of atoms to use for the simulation");
486 11766 : keys.add("compulsory","--nsteps","2000","number of steps of MD to perform (-1 means forever)");
487 11766 : keys.add("compulsory","--maxtime","-1","maximum number of seconds (-1 means forever)");
488 11766 : keys.add("compulsory","--sleep","0","number of seconds of sleep, mimicking MD calculation");
489 11766 : keys.add("compulsory","--atom-distribution","line","the kind of possible atomic displacement at each step");
490 11766 : keys.add("optional","--dump-trajectory","dump the trajectory to this file");
491 11766 : keys.addFlag("--domain-decomposition",false,"simulate domain decomposition, implies --shuffle");
492 11766 : keys.addFlag("--shuffled",false,"reshuffle atoms");
493 5883 : }
494 :
495 4 : Benchmark::Benchmark(const CLToolOptions& co ):
496 4 : CLTool(co) {
497 4 : inputdata=commandline;
498 4 : }
499 :
500 :
501 0 : int Benchmark::main(FILE* in, FILE*out,Communicator& pc) {
502 : // deterministic initializations to avoid issues with MPI
503 : generator rng;
504 0 : PLMD::Random atomicGenerator;
505 0 : std::unique_ptr<AtomDistribution> distribution;
506 :
507 : struct FileDeleter {
508 : void operator()(FILE*f) const noexcept {
509 : if(f) {
510 0 : std::fclose(f);
511 : }
512 0 : }
513 : };
514 :
515 0 : std::unique_ptr<FILE,FileDeleter> log_dev_null{std::fopen("/dev/null","w")};
516 :
517 0 : Log log;
518 0 : if(pc.Get_rank()==0) {
519 0 : log.link(out);
520 : } else {
521 0 : log.link(log_dev_null.get());
522 : }
523 0 : log.setLinePrefix("BENCH: ");
524 0 : log <<"Welcome to PLUMED benchmark\n";
525 : std::vector<Kernel> kernels;
526 :
527 : // perform comparative analysis
528 : // ensure that kernels vector is destroyed from last to first element upon exit
529 0 : auto kernels_deleter=[&log](auto f) {
530 0 : if(!f) {
531 0 : return;
532 : }
533 0 : if(f->empty()) {
534 : return;
535 : }
536 : generator bootstrapRng;
537 :
538 : const auto size=f->back().timings.size();
539 : //B are the bootstrap iterations
540 : constexpr int B=200;
541 0 : const size_t numblocks=size;
542 : // for some reasons, blocked bootstrap underestimates error
543 : // For now I keep it off. If I remove it, the code below could be simplified
544 : // if(numblocks>20) numblocks=20;
545 0 : const auto blocksize=size/numblocks;
546 :
547 0 : if(f->size()<2) {
548 0 : log<<"Single run, skipping comparative analysis\n";
549 0 : } else if(size<10) {
550 0 : log<<"Too small sample, skipping comparative analysis\n";
551 : } else
552 : try {
553 :
554 0 : log<<"Running comparative analysis, "<<numblocks<<" blocks with size "<<blocksize<<"\n";
555 :
556 0 : std::vector<std::size_t> choice(size);
557 0 : std::uniform_int_distribution<> distrib(0, numblocks-1);
558 0 : std::vector<std::vector<long long int>> blocks(f->size());
559 :
560 : {
561 : int i=0;
562 0 : for(auto it = f->rbegin(); it != f->rend(); ++it,++i) {
563 : size_t l=0;
564 0 : blocks[i].assign(numblocks,0);
565 0 : for(auto j=0ULL; j<numblocks; j++) {
566 0 : for(auto k=0ULL; k<blocksize; k++) {
567 0 : plumed_assert(l<it->timings.size());
568 0 : blocks[i][j]+=it->timings[l];
569 0 : l++;
570 : }
571 : }
572 : }
573 : }
574 :
575 0 : std::vector<std::vector<double>> ratios(f->size());
576 0 : for(auto & r : ratios) {
577 : //B are the bootstrap iterations
578 0 : r.resize(B);
579 : }
580 :
581 : //B are the bootstrap iterations
582 0 : for(unsigned b=0; b<B; b++) {
583 0 : for(auto & c : choice) {
584 0 : c=distrib(bootstrapRng);
585 : }
586 : long long int reference=0;
587 0 : for(auto & c : choice) {
588 0 : reference+=blocks[0][c];
589 : }
590 0 : for(auto i=0ULL; i<blocks.size(); i++) {
591 : long long int estimate=0;
592 : // this would lead to separate bootstrap samples for each estimate:
593 : // for(auto & c : choice){c=distrib(bootstrapRng);}
594 0 : for(auto & c : choice) {
595 0 : estimate+=blocks[i][c];
596 : }
597 0 : ratios[i][b]=double(estimate)/double(reference);
598 : }
599 : }
600 :
601 : {
602 : int i=0;
603 0 : for(auto it = f->rbegin(); it != f->rend(); ++it,++i) {
604 : double sum=0.0;
605 : double sum2=0.0;
606 0 : for(auto r : ratios[i]) {
607 0 : sum+=r;
608 0 : sum2+=r*r;
609 : }
610 : //B are the bootstrap iterations
611 0 : it->comparative_timing=sum/B;
612 0 : it->comparative_timing_error=std::sqrt(sum2/B-sum*sum/(B*B));
613 : }
614 : }
615 :
616 0 : } catch(std::exception & e) {
617 0 : log<<"Unexpected error during comparative analysis\n";
618 0 : log<<e.what()<<"\n";
619 : }
620 0 : while(!f->empty()) {
621 : f->pop_back();
622 : }
623 :
624 : };
625 : std::unique_ptr<decltype(kernels),decltype(kernels_deleter)> kernels_deleter_obj(&kernels,kernels_deleter);
626 :
627 :
628 : // construct the kernels vector:
629 : {
630 : std::vector<std::string> allpaths;
631 :
632 : {
633 : std::string paths;
634 0 : parse("--kernel",paths);
635 0 : log <<"Using --kernel=" << paths << "\n";
636 0 : allpaths=Tools::getWords(paths,":");
637 : }
638 :
639 : std::vector<std::string> allplumed;
640 : {
641 : std::string paths;
642 0 : parse("--plumed",paths);
643 0 : log <<"Using --plumed=" << paths << "\n";
644 0 : allplumed=Tools::getWords(paths,":");
645 : }
646 :
647 0 : plumed_assert(allplumed.size()>0 && allpaths.size()>0);
648 :
649 : // this check only works on MacOS
650 : #if defined(__APPLE__)
651 : // if any of the paths if different from "this", we check if libplumed was loaded locally to avoid conflicts.
652 : if(std::any_of(allpaths.begin(),allpaths.end(),[](auto value) {
653 : return value != "this";
654 : })) {
655 : if(DLLoader::isPlumedGlobal()) {
656 : plumed_error()<<"It looks like libplumed is loaded in the global namespace, you cannot load a different version of the kernel\n"
657 : <<"Please make sure you use the plumed-runtime executable and that the env var PLUMED_LOAD_NAMESPACE is not set to GLOBAL";
658 : }
659 : }
660 : #endif
661 :
662 0 : if(allplumed.size()>1 && allpaths.size()>1 && allplumed.size() != allpaths.size()) {
663 0 : plumed_error() << "--kernel and --plumed should have either one element or the same number of elements";
664 : }
665 :
666 0 : if(allplumed.size()>1 && allpaths.size()==1)
667 0 : for(unsigned i=1; i<allplumed.size(); i++) {
668 0 : allpaths.push_back(allpaths[0]);
669 : }
670 0 : if(allplumed.size()==1 && allpaths.size()>1)
671 0 : for(unsigned i=1; i<allpaths.size(); i++) {
672 0 : allplumed.push_back(allplumed[0]);
673 : }
674 :
675 0 : for(unsigned i=0; i<allpaths.size(); i++) {
676 0 : kernels.emplace_back(allpaths[i],allplumed[i],&log);
677 : }
678 0 : }
679 :
680 : // reverse order so that log happens in the forward order:
681 0 : std::reverse(kernels.begin(),kernels.end());
682 :
683 : // read other flags:
684 0 : bool shuffled=false;
685 0 : parseFlag("--shuffled",shuffled);
686 :
687 : int nf;
688 0 : parse("--nsteps",nf);
689 0 : log << "Using --nsteps=" << nf << "\n";
690 : unsigned natoms;
691 0 : parse("--natoms",natoms);
692 0 : log << "Using --natoms=" << natoms << "\n";
693 : double maxtime;
694 0 : parse("--maxtime",maxtime);
695 0 : log << "Using --maxtime=" << maxtime << "\n";
696 :
697 0 : bool domain_decomposition=false;
698 0 : parseFlag("--domain-decomposition",domain_decomposition);
699 :
700 0 : if(pc.Get_size()>1) {
701 0 : domain_decomposition=true;
702 : }
703 0 : if(domain_decomposition) {
704 0 : shuffled=true;
705 : }
706 :
707 0 : if (shuffled) {
708 0 : log << "Using --shuffled\n";
709 : }
710 0 : if (domain_decomposition) {
711 0 : log << "Using --domain-decomposition\n";
712 : }
713 :
714 : double timeToSleep;
715 0 : parse("--sleep",timeToSleep);
716 0 : log << "Using --sleep=" << timeToSleep << "\n";
717 :
718 : std::vector<int> shuffled_indexes;
719 :
720 : {
721 : std::string atomicDistr;
722 0 : parse("--atom-distribution",atomicDistr);
723 0 : distribution = getAtomDistribution(atomicDistr);
724 0 : log << "Using --atom-distribution=" << atomicDistr << "\n";
725 : }
726 :
727 : {
728 : std::string fileToDump;
729 0 : if(parse("--dump-trajectory",fileToDump)) {
730 0 : log << "Saving the trajectory to \"" << fileToDump << "\" and exiting\n";
731 0 : std::vector<double> cell(9);
732 0 : std::vector<Vector> pos(natoms);
733 0 : std::ofstream ofile(fileToDump);
734 0 : if (nf<0) {
735 : //if the user accidentally sets infinite steps, we set it to print only one
736 0 : nf=1;
737 : }
738 0 : for(int step=0; step<nf; ++step) {
739 0 : auto sw=kernels[0].stopwatch.startStop("TrajectoryGeneration");
740 0 : distribution->positions(pos,step,atomicGenerator);
741 0 : distribution->box(cell,natoms,step,atomicGenerator);
742 0 : ofile << natoms << "\n"
743 0 : << cell[0] << " " << cell[1] << " " << cell[2] << " "
744 0 : << cell[3] << " " << cell[4] << " " << cell[5] << " "
745 0 : << cell[6] << " " << cell[7] << " " << cell[8] << "\n";
746 0 : for(int i=0; i<natoms; ++i) {
747 0 : ofile << "X\t" << pos[i]<< "\n";
748 : }
749 0 : }
750 0 : ofile.close();
751 : return 0;
752 0 : }
753 : }
754 :
755 0 : log <<"Initializing the setup of the kernel(s)\n";
756 0 : const auto initial_time=std::chrono::high_resolution_clock::now();
757 :
758 0 : for(auto & k : kernels) {
759 : auto & p(k.handle);
760 0 : auto sw=k.stopwatch.startStop("A Initialization");
761 0 : if(Communicator::plumedHasMPI() && domain_decomposition) {
762 0 : p.cmd("setMPIComm",&pc.Get_comm());
763 : }
764 0 : p.cmd("setRealPrecision",(int)sizeof(double));
765 0 : p.cmd("setMDLengthUnits",1.0);
766 0 : p.cmd("setMDChargeUnits",1.0);
767 0 : p.cmd("setMDMassUnits",1.0);
768 0 : p.cmd("setMDEngine","benchmarks");
769 0 : p.cmd("setTimestep",1.0);
770 0 : p.cmd("setPlumedDat",k.plumed_dat.c_str());
771 0 : p.cmd("setLog",out);
772 0 : p.cmd("setNatoms",natoms);
773 0 : p.cmd("init");
774 0 : }
775 :
776 0 : std::vector<double> cell( 9 ), virial( 9 );
777 0 : std::vector<Vector> pos( natoms ), forces( natoms );
778 0 : std::vector<double> masses( natoms, 1 ), charges( natoms, 0 );
779 :
780 0 : if(shuffled) {
781 0 : shuffled_indexes.resize(natoms);
782 0 : for(unsigned i=0; i<natoms; i++) {
783 0 : shuffled_indexes[i]=i;
784 : }
785 0 : std::shuffle(shuffled_indexes.begin(),shuffled_indexes.end(),rng);
786 : }
787 :
788 : // non owning pointers, used for shuffling the execution order
789 : std::vector<Kernel*> kernels_ptr;
790 0 : for(unsigned i=0; i<kernels.size(); i++) {
791 0 : kernels_ptr.push_back(&kernels[i]);
792 : }
793 :
794 0 : int plumedStopCondition=0;
795 : bool fast_finish=false;
796 : int part=0;
797 :
798 0 : log<<"Starting MD loop\n";
799 0 : log<<"Use CTRL+C to stop at any time and collect timers (not working in MPI runs)\n";
800 : // trap signals:
801 0 : SignalHandlerGuard sigIntGuard(SIGINT, signalHandler);
802 0 : SignalHandlerGuard sigTermGuard(SIGTERM, signalHandler);
803 :
804 0 : for(int step=0; nf<0 || step<nf; ++step) {
805 0 : std::shuffle(kernels_ptr.begin(),kernels_ptr.end(),rng);
806 0 : distribution->positions(pos,step,atomicGenerator);
807 0 : distribution->box(cell,natoms,step,atomicGenerator);
808 : double* pos_ptr;
809 : double* for_ptr;
810 : double* charges_ptr;
811 : double* masses_ptr;
812 : int* indexes_ptr=nullptr;
813 : int n_local_atoms;
814 :
815 0 : if(domain_decomposition) {
816 0 : const auto nproc=pc.Get_size();
817 0 : const auto nn=natoms/nproc;
818 : //using int to remove warning, MPI don't work with unsigned
819 0 : int excess=natoms%nproc;
820 0 : const auto myrank=pc.Get_rank();
821 : auto shift=0;
822 0 : n_local_atoms=nn;
823 0 : if(myrank<excess) {
824 0 : n_local_atoms+=1;
825 : }
826 0 : for(int i=0; i<myrank; i++) {
827 0 : shift+=nn;
828 0 : if(i<excess) {
829 0 : shift+=1;
830 : }
831 : }
832 0 : pos_ptr=&pos[shift][0];
833 0 : for_ptr=&forces[shift][0];
834 : charges_ptr=&charges[shift];
835 : masses_ptr=&masses[shift];
836 0 : indexes_ptr=shuffled_indexes.data()+shift;
837 : } else {
838 0 : pos_ptr=&pos[0][0];
839 0 : for_ptr=&forces[0][0];
840 : charges_ptr=&charges[0];
841 : masses_ptr=&masses[0];
842 0 : n_local_atoms=natoms;
843 : indexes_ptr=shuffled_indexes.data();
844 : }
845 :
846 : const char* sw_name;
847 0 : if(part==0) {
848 : sw_name="B0 First step";
849 0 : } else if(part==1) {
850 : sw_name="B1 Warm-up";
851 0 : } else if(part==2) {
852 : sw_name="B2 Calculation part 1";
853 : } else {
854 : sw_name="B3 Calculation part 2";
855 : }
856 :
857 :
858 0 : for(unsigned i=0; i<kernels_ptr.size(); i++) {
859 0 : auto & p(kernels_ptr[i]->handle);
860 :
861 : {
862 0 : auto sw=kernels_ptr[i]->stopwatch.startPause(sw_name);
863 0 : p.cmd("setStep",step);
864 0 : p.cmd("setStopFlag",&plumedStopCondition);
865 0 : p.cmd("setForces",for_ptr, {n_local_atoms,3});
866 0 : p.cmd("setBox",&cell[0], {3,3});
867 0 : p.cmd("setVirial",&virial[0], {3,3});
868 0 : p.cmd("setPositions",pos_ptr, {n_local_atoms,3});
869 0 : p.cmd("setMasses",masses_ptr, {n_local_atoms});
870 0 : p.cmd("setCharges",charges_ptr, {n_local_atoms});
871 0 : if(shuffled) {
872 0 : p.cmd("setAtomsNlocal",n_local_atoms);
873 0 : p.cmd("setAtomsGatindex",indexes_ptr, {n_local_atoms});
874 : }
875 0 : p.cmd("prepareCalc");
876 0 : }
877 :
878 : // mimick MD calculation here
879 : {
880 : unsigned k=0;
881 0 : auto start=std::chrono::high_resolution_clock::now();
882 0 : while(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now()-start).count()<(long long int)1e9*timeToSleep) {
883 0 : k+=i*i;
884 : }
885 : std::fprintf(log_dev_null.get(),"%u",k);
886 : }
887 :
888 : {
889 0 : auto sw=kernels_ptr[i]->stopwatch.startStop(sw_name);
890 0 : p.cmd("performCalc");
891 0 : }
892 :
893 0 : if(kernels_ptr.size()>1 && part>1) {
894 0 : kernels_ptr[i]->timings.push_back(kernels_ptr[i]->stopwatch.getLastCycle(sw_name));
895 : }
896 0 : if(plumedStopCondition || signalReceived.load()) {
897 : fast_finish=true;
898 : }
899 : }
900 0 : auto elapsed=std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now()-initial_time).count();
901 0 : if(part==0) {
902 : part=1;
903 : }
904 0 : if(part<2) {
905 0 : if((maxtime>0 && elapsed>(long long int)(0.2*1e9*maxtime)) || (nf>0 && step+1>=nf/5) || (maxtime<0 && nf<0 && step+1>=100)) {
906 : part=2;
907 0 : log<<"Warm-up completed\n";
908 : }
909 : }
910 0 : if(part<3) {
911 0 : if((maxtime>0 && elapsed>(long long int)(0.6*1e9*maxtime)) || (nf>0 && step+1>=3*nf/5)) {
912 : part=3;
913 0 : log<<"60% completed\n";
914 : }
915 : }
916 :
917 0 : if(maxtime>0 && elapsed>(long long int)(1e9*maxtime)) {
918 : fast_finish=true;
919 : }
920 :
921 : {
922 0 : unsigned tmp=fast_finish;
923 0 : pc.Bcast(tmp,0);
924 0 : fast_finish=tmp;
925 : }
926 0 : if(fast_finish) {
927 : break;
928 : }
929 : }
930 :
931 : return 0;
932 0 : }
933 :
934 : } // namespace unnamed
935 : } // namespace cltools
936 : } // namespace PLMD
|