Line data Source code
1 : /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 : Copyright (c) 2019-2023 The plumed team
3 : (see the PEOPLE file at the root of the distribution for a list of names)
4 :
5 : See http://www.plumed.org for more information.
6 :
7 : This file is part of plumed, version 2.
8 :
9 : plumed is free software: you can redistribute it and/or modify
10 : it under the terms of the GNU Lesser General Public License as published by
11 : the Free Software Foundation, either version 3 of the License, or
12 : (at your option) any later version.
13 :
14 : plumed is distributed in the hope that it will be useful,
15 : but WITHOUT ANY WARRANTY; without even the implied warranty of
16 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 : GNU Lesser General Public License for more details.
18 :
19 : You should have received a copy of the GNU Lesser General Public License
20 : along with plumed. If not, see <http://www.gnu.org/licenses/>.
21 : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
22 : #include "CLTool.h"
23 : #include "core/CLToolRegister.h"
24 : #include "tools/Communicator.h"
25 : #include "tools/Tools.h"
26 : #include "config/Config.h"
27 : #include "tools/PlumedHandle.h"
28 : #include "tools/Stopwatch.h"
29 : #include "tools/Log.h"
30 : #include "tools/DLLoader.h"
31 : #include "tools/Random.h"
32 :
33 : #include <cstdio>
34 : #include <string>
35 : #include <vector>
36 : #include <fstream>
37 : #include <atomic>
38 : #include <csignal>
39 : #include <cstdio>
40 : #include <random>
41 : #include <algorithm>
42 : #include <chrono>
43 : #include <string_view>
44 :
45 : namespace PLMD {
46 : namespace cltools {
47 :
48 : //+PLUMEDOC TOOLS benchmark
49 : /*
50 : benchmark is a lightweight reimplementation of driver focused on running benchmarks
51 :
52 : The main difference wrt driver is that it generates a trajectory in memory rather than reading it
53 : from a file. This allows to better time the overhead of the plumed library, without including
54 : the time needed to read the trajectory.
55 :
56 : It is also possible to load a separate version of the plumed kernel. This enables running
57 : benchmarks agaist previous plumed versions in a controlled setting, where systematic errors
58 : in the comparison are minimized.
59 :
60 : \par Examples
61 :
62 : First, you should create a sample `plumed.dat` file for testing. For instance:
63 :
64 : \plumedfile
65 : WHOLEMOLECULES ENTITY0=1-10000
66 : p: POSITION ATOM=1
67 : RESTRAINT ARG=p.x KAPPA=1 AT=0
68 : \endplumedfile
69 :
70 : Then you can test the performance of this input with the following command:
71 : \verbatim
72 : plumed benchmark
73 : \endverbatim
74 :
75 : You can also test a different (older) version of PLUMED with the same input. To do so,
76 : you should run
77 : \verbatim
78 : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so
79 : \endverbatim
80 :
81 : \warning It is necessary to use the `plumed-runtime` executable here to avoid conflicts between different
82 : plumed versions. You will find it in your path if you are using the non installed version of plumed,
83 : and in `$prefix/lib/plumed` if you installed plumed in $prefix,.
84 :
85 : \par Comparing multiple versions
86 :
87 : The best way to compare two versions of plumed on the same input is to pass multiple colon-separated kernels:
88 :
89 : \verbatim
90 : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so:/path2/to/lib/libplumedKernel.so:this
91 : \endverbatim
92 :
93 : Here `this` means the kernel of the version with which you are running the benchmark. This comparison runs the three
94 : instances simultaneously (alternating them) so that systematic differences in the load of your machine will affect them
95 : to the same extent.
96 :
97 : In case the different versions require modified plumed.dat files, or if you simply want to compare
98 : two different plumed input files that compute the same thing, you can also use multiple plumed input files:
99 :
100 : \verbatim
101 : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so:this --plumed plumed1.dat:plumed2.dat
102 : \endverbatim
103 :
104 : Similarly, you might want to run two different inputs using the same kernel, which can be obtained with:
105 :
106 : \verbatim
107 : plumed-runtime benchmark --plumed plumed1.dat:plumed2.dat
108 : \endverbatim
109 :
110 : \par Profiling
111 :
112 : If you want to attach a profiler on the fly to the process, you might find it convenient to use `--nsteps -1`.
113 : The simulation will run forever and can be interrupted with CTRL-C. When interrupted, the result of the timers
114 : should be displayed anyway.
115 : You can also run setting a maximum time with `--maxtime`.
116 :
117 : If you run a profiler when testing multiple PLUMED versions you might be confused by which function is from
118 : each version. It is recommended to recompile separate instances with a separate C++ namespace (`-DPLMD=PLUMED_version_1`)
119 : so that you will be able to distinguish them. In addition, compiling with `CXXFLAGS="-g -O3"` will make the profiling
120 : report more complete, likely including code lines.
121 :
122 : \par MPI runs
123 :
124 : You can run emulating a domain decomposition. This is done automatically if plumed has been compiled with MPI
125 : and you run with `mpirun`
126 :
127 : \verbatim
128 : mpirun -np 4 plumed-runtime benchmark
129 : \endverbatim
130 :
131 : If you load separate PLUMED instances as discussed above, they should all be compiled against the same MPI version.
132 : Notice that when using MPI signals (CTRL-C) might not work.
133 :
134 : Since some of the data transfer could happen asynchronously, you might want to use the `--sleep` option
135 : to simulate a lag between the `prepareCalc` and `performCalc` actions. This part of the calculation will not contribute
136 : to timer, but will obviously slow down your test.
137 :
138 : \par Output
139 :
140 : In the output you will see the usual reports about timing produced by the internal
141 : timers of the tested plumed instances.
142 : In addition, this tool will monitor the timing externally, with some slightly different criterion:
143 : - First, the initialization (construction of the input) will be shown with a separate timer,
144 : as well as the timing for the first step.
145 : - Second, the timer corresponding to the calculation will be split in three parts, reporting
146 : execution of the first 20% (warm-up) and the next two blocks of 40% each.
147 : - Finally, you might notice some discrepancy because some of the actions that are usually
148 : not expensive are not included in the internal timers. The external timer will
149 : thus provide a better estimate of the total elapsed time, including everything.
150 :
151 : The internal timers are still useful to monitor what happens at the different stages
152 : and, with \ref DEBUG `DETAILED_TIMERS`, what happens in each action.
153 :
154 : When you run multiple version, a comparative analisys of the time spent within PLUMED in the various
155 : instances will be done, showing the ratio between the total time and the time measured on the first
156 : instance, which will act as a reference. Errors will be estimated with bootstrapping. The warm-up phase will be discarded for
157 : this analysis.
158 :
159 : */
160 : //+ENDPLUMEDOC
161 :
162 : // We use an anonymous namespace here to avoid clashes with variables
163 : // declared in other parts of the code
164 : namespace {
165 :
166 : //this is a sugar for changing idea faster about the rng
167 : using generator = std::mt19937;
168 :
169 : std::atomic<bool> signalReceived(false);
170 :
171 : class SignalHandlerGuard {
172 : public:
173 0 : SignalHandlerGuard(int signal, void (*newHandler)(int)) : signal_(signal) {
174 : // Store the current handler before setting the new one
175 0 : prevHandler_ = std::signal(signal, newHandler);
176 0 : if (prevHandler_ == SIG_ERR) {
177 0 : throw std::runtime_error("Failed to set signal handler");
178 : }
179 0 : }
180 :
181 : ~SignalHandlerGuard() {
182 : // Restore the previous handler upon destruction
183 0 : std::signal(signal_, prevHandler_);
184 0 : }
185 :
186 : // Delete copy constructor and assignment operator to prevent copying
187 : SignalHandlerGuard(const SignalHandlerGuard&) = delete;
188 : SignalHandlerGuard& operator=(const SignalHandlerGuard&) = delete;
189 :
190 : private:
191 : int signal_;
192 : void (*prevHandler_)(int);
193 : };
194 :
195 0 : extern "C" void signalHandler(int signal) {
196 0 : if (signal == SIGINT) {
197 : signalReceived.store(true);
198 0 : fprintf(stderr, "Signal interrupt received\n");
199 : }
200 0 : if (signal == SIGTERM) {
201 : signalReceived.store(true);
202 0 : fprintf(stderr, "Signal termination received\n");
203 : }
204 0 : }
205 :
206 : /// This base class contains members that are movable with default operations
207 : struct KernelBase {
208 : std::string path;
209 : std::string plumed_dat;
210 : PlumedHandle handle;
211 : Stopwatch stopwatch;
212 : std::vector<long long int> timings;
213 : double comparative_timing=-1.0;
214 : double comparative_timing_error=-1.0;
215 0 : KernelBase(const std::string & path_,const std::string & plumed_dat_, Log* log_):
216 0 : path(path_),
217 0 : plumed_dat(plumed_dat_),
218 0 : handle([&]() {
219 0 : if(path_=="this") {
220 0 : return PlumedHandle();
221 : } else {
222 0 : return PlumedHandle::dlopen(path.c_str());
223 : }
224 : }()),
225 0 : stopwatch(*log_) {
226 0 : }
227 : };
228 :
229 : /// Local structure handling a kernel and the related timers.
230 : /// This structure specifically contain the Log, which needs special treatment
231 : /// in move semantics
232 : struct Kernel :
233 : public KernelBase {
234 : Log* log=nullptr;
235 0 : Kernel(const std::string & path_,const std::string & the_plumed_dat, Log* log_):
236 : KernelBase(path_,the_plumed_dat,log_),
237 0 : log(log_) {
238 : }
239 :
240 0 : ~Kernel() {
241 0 : if(log) {
242 0 : (*log)<<"\n";
243 0 : (*log) <<"Kernel: "<<path<<"\n";
244 0 : (*log) <<"Input: "<<plumed_dat<<"\n";
245 0 : if(comparative_timing>0.0) {
246 0 : (*log).printf("Comparative: %.3f +- %.3f\n",comparative_timing,comparative_timing_error);
247 : }
248 : }
249 0 : }
250 :
251 0 : Kernel(Kernel && other) noexcept:
252 : KernelBase(std::move(other)),
253 0 : log(other.log) {
254 0 : other.log=nullptr; // ensure no log is done in the moved away object
255 : }
256 :
257 0 : Kernel & operator=(Kernel && other) noexcept {
258 0 : if(this != &other) {
259 0 : KernelBase::operator=(std::move(other));
260 0 : log=other.log;
261 0 : other.log=nullptr; // ensure no log is done in the moved away object
262 : }
263 0 : return *this;
264 : }
265 : };
266 :
267 : namespace {
268 :
269 : class UniformSphericalVector {
270 : //double rminCub;
271 : double rCub;
272 :
273 : public:
274 : //assuming rmin=0
275 0 : UniformSphericalVector(const double rmax):
276 0 : rCub (rmax*rmax*rmax/*-rminCub*/) {}
277 0 : PLMD::Vector operator()(Random& rng) {
278 0 : double rho = std::cbrt (/*rminCub + */rng.RandU01()*rCub);
279 0 : double theta =std::acos (2.0*rng.RandU01() -1.0);
280 0 : double phi = 2.0 * PLMD::pi * rng.RandU01();
281 : return Vector (
282 0 : rho * sin (theta) * cos (phi),
283 0 : rho * sin (theta) * sin (phi),
284 0 : rho * cos (theta));
285 : }
286 : };
287 :
288 : ///Acts as a template for any distribution
289 : struct AtomDistribution {
290 : virtual void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random&)=0;
291 0 : virtual void box(std::vector<double>& box, unsigned /*natoms*/, unsigned /*step*/, Random&) {
292 : std::fill(box.begin(), box.end(),0);
293 0 : };
294 : virtual ~AtomDistribution() noexcept {}
295 : };
296 :
297 0 : struct theLine:public AtomDistribution {
298 0 : void positions(std::vector<Vector>& posToUpdate, unsigned step, Random&rng) override {
299 : auto nat = posToUpdate.size();
300 : UniformSphericalVector usv(0.5);
301 :
302 0 : for (unsigned i=0; i<nat; ++i) {
303 0 : posToUpdate[i] = Vector(i, 0, 0) + usv(rng);
304 : }
305 0 : }
306 : };
307 :
308 0 : struct uniformSphere:public AtomDistribution {
309 0 : void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
310 :
311 : //giving more or less a cubic udm of volume for each atom: V=nat
312 0 : const double rmax= std::cbrt ((3.0/(4.0*PLMD::pi)) * posToUpdate.size());
313 :
314 : UniformSphericalVector usv(rmax);
315 : auto s=posToUpdate.begin();
316 : auto e=posToUpdate.end();
317 : //I am using the iterators:this is slightly faster,
318 : // enough to overcome the cost of the vtable that I added
319 0 : for (unsigned i=0; s!=e; ++s,++i) {
320 0 : *s = usv (rng);
321 : }
322 :
323 0 : }
324 0 : void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
325 0 : const double rmax= 2.0*std::cbrt((3.0/(4.0*PLMD::pi)) * natoms);
326 0 : box[0]=rmax;
327 0 : box[1]=0.0;
328 0 : box[2]=0.0;
329 0 : box[3]=0.0;
330 0 : box[4]=rmax;
331 0 : box[5]=0.0;
332 0 : box[6]=0.0;
333 0 : box[7]=0.0;
334 0 : box[8]=rmax;
335 :
336 0 : }
337 : };
338 :
339 0 : struct twoGlobs: public AtomDistribution {
340 0 : virtual void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random&rng) {
341 : //I am using two unigform spheres and 2V=n
342 0 : const double rmax= std::cbrt ((3.0/(8.0*PLMD::pi)) * posToUpdate.size());
343 :
344 : UniformSphericalVector usv(rmax);
345 : std::array<Vector,2> centers{
346 : PLMD::Vector{0.0,0.0,0.0},
347 : //so they do not overlap
348 : PLMD::Vector{2.0*rmax,2.0*rmax,2.0*rmax}
349 0 : };
350 0 : std::generate(posToUpdate.begin(),posToUpdate.end(),[&]() {
351 : //RandInt is only declared
352 : // return usv (rng) + centers[rng.RandInt(1)];
353 0 : return usv (rng) + centers[rng.RandU01()>0.5];
354 : });
355 0 : }
356 :
357 0 : virtual void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) {
358 :
359 0 : const double rmax= 4.0 * std::cbrt ((3.0/(8.0*PLMD::pi)) * natoms);
360 0 : box[0]=rmax;
361 0 : box[1]=0.0;
362 0 : box[2]=0.0;
363 0 : box[3]=0.0;
364 0 : box[4]=rmax;
365 0 : box[5]=0.0;
366 0 : box[6]=0.0;
367 0 : box[7]=0.0;
368 0 : box[8]=rmax;
369 0 : };
370 : };
371 :
372 0 : struct uniformCube:public AtomDistribution {
373 0 : void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
374 : //giving more or less a cubic udm of volume for each atom: V = nat
375 0 : const double rmax = std::cbrt(static_cast<double>(posToUpdate.size()));
376 :
377 :
378 :
379 : // std::generate(posToUpdate.begin(),posToUpdate.end(),[&]() {
380 : // return Vector (rndR(rng),rndR(rng),rndR(rng));
381 : // });
382 : auto s=posToUpdate.begin();
383 : auto e=posToUpdate.end();
384 : //I am using the iterators:this is slightly faster,
385 : // enough to overcome the cost of the vtable that I added
386 0 : for (unsigned i=0; s!=e; ++s,++i) {
387 0 : *s = Vector (rng.RandU01()*rmax,rng.RandU01()*rmax,rng.RandU01()*rmax);
388 : }
389 0 : }
390 0 : void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
391 : //+0.05 to avoid overlap
392 0 : const double rmax= std::cbrt(natoms)+0.05;
393 0 : box[0]=rmax;
394 0 : box[1]=0.0;
395 0 : box[2]=0.0;
396 0 : box[3]=0.0;
397 0 : box[4]=rmax;
398 0 : box[5]=0.0;
399 0 : box[6]=0.0;
400 0 : box[7]=0.0;
401 0 : box[8]=rmax;
402 :
403 0 : }
404 : };
405 :
406 0 : struct tiledSimpleCubic:public AtomDistribution {
407 0 : void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
408 : //Tiling the space in this way will not tests 100% the pbc, but
409 : //I do not think that write a spacefilling curve, like Hilbert, Peano or Morton
410 : //could be a good idea, in this case
411 0 : const unsigned rmax = std::ceil(std::cbrt(static_cast<double>(posToUpdate.size())));
412 :
413 : auto s=posToUpdate.begin();
414 : auto e=posToUpdate.end();
415 : //I am using the iterators:this is slightly faster,
416 : // enough to overcome the cost of the vtable that I added
417 0 : for (unsigned k=0; k<rmax&&s!=e; ++k) {
418 0 : for (unsigned j=0; j<rmax&&s!=e; ++j) {
419 0 : for (unsigned i=0; i<rmax&&s!=e; ++i) {
420 0 : *s = Vector (i,j,k);
421 : ++s;
422 : }
423 : }
424 : }
425 0 : }
426 0 : void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
427 0 : const double rmax= std::ceil(std::cbrt(static_cast<double>(natoms)));;
428 0 : box[0]=rmax;
429 0 : box[1]=0.0;
430 0 : box[2]=0.0;
431 0 : box[3]=0.0;
432 0 : box[4]=rmax;
433 0 : box[5]=0.0;
434 0 : box[6]=0.0;
435 0 : box[7]=0.0;
436 0 : box[8]=rmax;
437 :
438 0 : }
439 : };
440 0 : std::unique_ptr<AtomDistribution> getAtomDistribution(std::string_view atomicDistr) {
441 0 : std::unique_ptr<AtomDistribution> distribution;
442 0 : if(atomicDistr == "line") {
443 0 : distribution = std::make_unique<theLine>();
444 0 : } else if (atomicDistr == "cube") {
445 0 : distribution = std::make_unique<uniformCube>();
446 0 : } else if (atomicDistr == "sphere") {
447 0 : distribution = std::make_unique<uniformSphere>();
448 0 : } else if (atomicDistr == "globs") {
449 0 : distribution = std::make_unique<twoGlobs>();
450 0 : } else if (atomicDistr == "sc") {
451 0 : distribution = std::make_unique<tiledSimpleCubic>();
452 : } else {
453 0 : plumed_error() << R"(The atomic distribution can be only "line", "cube", "sphere", "globs" and "sc", the input was ")"
454 0 : << atomicDistr <<'"';
455 : }
456 0 : return distribution;
457 0 : }
458 : } //anonymus namespace for benchmark distributions
459 : class Benchmark:
460 : public CLTool {
461 : public:
462 : static void registerKeywords( Keywords& keys );
463 : explicit Benchmark(const CLToolOptions& co );
464 : int main(FILE* in, FILE*out,Communicator& pc) override;
465 4 : std::string description()const override {
466 4 : return "run a calculation with a fixed trajectory to find bottlenecks in PLUMED";
467 : }
468 : };
469 :
470 16330 : PLUMED_REGISTER_CLTOOL(Benchmark,"benchmark")
471 :
472 5442 : void Benchmark::registerKeywords( Keywords& keys ) {
473 5442 : CLTool::registerKeywords( keys );
474 10884 : keys.add("compulsory","--plumed","plumed.dat","colon separated path(s) to the input file(s)");
475 10884 : keys.add("compulsory","--kernel","this","colon separated path(s) to kernel(s)");
476 10884 : keys.add("compulsory","--natoms","100000","the number of atoms to use for the simulation");
477 10884 : keys.add("compulsory","--nsteps","2000","number of steps of MD to perform (-1 means forever)");
478 10884 : keys.add("compulsory","--maxtime","-1","maximum number of seconds (-1 means forever)");
479 10884 : keys.add("compulsory","--sleep","0","number of seconds of sleep, mimicking MD calculation");
480 10884 : keys.add("compulsory","--atom-distribution","line","the kind of possible atomic displacement at each step");
481 10884 : keys.add("optional","--dump-trajectory","dump the trajectory to this file");
482 10884 : keys.addFlag("--domain-decomposition",false,"simulate domain decomposition, implies --shuffle");
483 10884 : keys.addFlag("--shuffled",false,"reshuffle atoms");
484 5442 : }
485 :
486 4 : Benchmark::Benchmark(const CLToolOptions& co ):
487 4 : CLTool(co) {
488 4 : inputdata=commandline;
489 4 : }
490 :
491 :
492 0 : int Benchmark::main(FILE* in, FILE*out,Communicator& pc) {
493 : // deterministic initializations to avoid issues with MPI
494 : generator rng;
495 0 : PLMD::Random atomicGenerator;
496 0 : std::unique_ptr<AtomDistribution> distribution;
497 :
498 : struct FileDeleter {
499 : void operator()(FILE*f) const noexcept {
500 : if(f) {
501 0 : std::fclose(f);
502 : }
503 0 : }
504 : };
505 :
506 0 : std::unique_ptr<FILE,FileDeleter> log_dev_null{std::fopen("/dev/null","w")};
507 :
508 0 : Log log;
509 0 : if(pc.Get_rank()==0) {
510 0 : log.link(out);
511 : } else {
512 0 : log.link(log_dev_null.get());
513 : }
514 0 : log.setLinePrefix("BENCH: ");
515 0 : log <<"Welcome to PLUMED benchmark\n";
516 : std::vector<Kernel> kernels;
517 :
518 : // perform comparative analysis
519 : // ensure that kernels vector is destroyed from last to first element upon exit
520 0 : auto kernels_deleter=[&log](auto f) {
521 0 : if(!f) {
522 0 : return;
523 : }
524 0 : if(f->empty()) {
525 : return;
526 : }
527 : generator bootstrapRng;
528 :
529 : const auto size=f->back().timings.size();
530 : //B are the bootstrap iterations
531 : constexpr int B=200;
532 0 : const size_t numblocks=size;
533 : // for some reasons, blocked bootstrap underestimates error
534 : // For now I keep it off. If I remove it, the code below could be simplified
535 : // if(numblocks>20) numblocks=20;
536 0 : const auto blocksize=size/numblocks;
537 :
538 0 : if(f->size()<2) {
539 0 : log<<"Single run, skipping comparative analysis\n";
540 0 : } else if(size<10) {
541 0 : log<<"Too small sample, skipping comparative analysis\n";
542 : } else
543 : try {
544 :
545 0 : log<<"Running comparative analysis, "<<numblocks<<" blocks with size "<<blocksize<<"\n";
546 :
547 0 : std::vector<std::size_t> choice(size);
548 0 : std::uniform_int_distribution<> distrib(0, numblocks-1);
549 0 : std::vector<std::vector<long long int>> blocks(f->size());
550 :
551 : {
552 : int i=0;
553 0 : for(auto it = f->rbegin(); it != f->rend(); ++it,++i) {
554 : size_t l=0;
555 0 : blocks[i].assign(numblocks,0);
556 0 : for(auto j=0ULL; j<numblocks; j++) {
557 0 : for(auto k=0ULL; k<blocksize; k++) {
558 0 : plumed_assert(l<it->timings.size());
559 0 : blocks[i][j]+=it->timings[l];
560 0 : l++;
561 : }
562 : }
563 : }
564 : }
565 :
566 0 : std::vector<std::vector<double>> ratios(f->size());
567 0 : for(auto & r : ratios) {
568 : //B are the bootstrap iterations
569 0 : r.resize(B);
570 : }
571 :
572 : //B are the bootstrap iterations
573 0 : for(unsigned b=0; b<B; b++) {
574 0 : for(auto & c : choice) {
575 0 : c=distrib(bootstrapRng);
576 : }
577 : long long int reference=0;
578 0 : for(auto & c : choice) {
579 0 : reference+=blocks[0][c];
580 : }
581 0 : for(auto i=0ULL; i<blocks.size(); i++) {
582 : long long int estimate=0;
583 : // this would lead to separate bootstrap samples for each estimate:
584 : // for(auto & c : choice){c=distrib(bootstrapRng);}
585 0 : for(auto & c : choice) {
586 0 : estimate+=blocks[i][c];
587 : }
588 0 : ratios[i][b]=double(estimate)/double(reference);
589 : }
590 : }
591 :
592 : {
593 : int i=0;
594 0 : for(auto it = f->rbegin(); it != f->rend(); ++it,++i) {
595 : double sum=0.0;
596 : double sum2=0.0;
597 0 : for(auto r : ratios[i]) {
598 0 : sum+=r;
599 0 : sum2+=r*r;
600 : }
601 : //B are the bootstrap iterations
602 0 : it->comparative_timing=sum/B;
603 0 : it->comparative_timing_error=std::sqrt(sum2/B-sum*sum/(B*B));
604 : }
605 : }
606 :
607 0 : } catch(std::exception & e) {
608 0 : log<<"Unexpected error during comparative analysis\n";
609 0 : log<<e.what()<<"\n";
610 : }
611 0 : while(!f->empty()) {
612 : f->pop_back();
613 : }
614 :
615 : };
616 : std::unique_ptr<decltype(kernels),decltype(kernels_deleter)> kernels_deleter_obj(&kernels,kernels_deleter);
617 :
618 :
619 : // construct the kernels vector:
620 : {
621 : std::vector<std::string> allpaths;
622 :
623 : {
624 : std::string paths;
625 0 : parse("--kernel",paths);
626 0 : log <<"Using --kernel=" << paths << "\n";
627 0 : allpaths=Tools::getWords(paths,":");
628 : }
629 :
630 : std::vector<std::string> allplumed;
631 : {
632 : std::string paths;
633 0 : parse("--plumed",paths);
634 0 : log <<"Using --plumed=" << paths << "\n";
635 0 : allplumed=Tools::getWords(paths,":");
636 : }
637 :
638 0 : plumed_assert(allplumed.size()>0 && allpaths.size()>0);
639 :
640 : // this check only works on MacOS
641 : #if defined(__APPLE__)
642 : // if any of the paths if different from "this", we check if libplumed was loaded locally to avoid conflicts.
643 : if(std::any_of(allpaths.begin(),allpaths.end(),[](auto value) {
644 : return value != "this";
645 : })) {
646 : if(DLLoader::isPlumedGlobal()) {
647 : plumed_error()<<"It looks like libplumed is loaded in the global namespace, you cannot load a different version of the kernel\n"
648 : <<"Please make sure you use the plumed-runtime executable and that the env var PLUMED_LOAD_NAMESPACE is not set to GLOBAL";
649 : }
650 : }
651 : #endif
652 :
653 0 : if(allplumed.size()>1 && allpaths.size()>1 && allplumed.size() != allpaths.size()) {
654 0 : plumed_error() << "--kernel and --plumed should have either one element or the same number of elements";
655 : }
656 :
657 0 : if(allplumed.size()>1 && allpaths.size()==1)
658 0 : for(unsigned i=1; i<allplumed.size(); i++) {
659 0 : allpaths.push_back(allpaths[0]);
660 : }
661 0 : if(allplumed.size()==1 && allpaths.size()>1)
662 0 : for(unsigned i=1; i<allpaths.size(); i++) {
663 0 : allplumed.push_back(allplumed[0]);
664 : }
665 :
666 0 : for(unsigned i=0; i<allpaths.size(); i++) {
667 0 : kernels.emplace_back(allpaths[i],allplumed[i],&log);
668 : }
669 0 : }
670 :
671 : // reverse order so that log happens in the forward order:
672 0 : std::reverse(kernels.begin(),kernels.end());
673 :
674 : // read other flags:
675 0 : bool shuffled=false;
676 0 : parseFlag("--shuffled",shuffled);
677 :
678 : int nf;
679 0 : parse("--nsteps",nf);
680 0 : log << "Using --nsteps=" << nf << "\n";
681 : unsigned natoms;
682 0 : parse("--natoms",natoms);
683 0 : log << "Using --natoms=" << natoms << "\n";
684 : double maxtime;
685 0 : parse("--maxtime",maxtime);
686 0 : log << "Using --maxtime=" << maxtime << "\n";
687 :
688 0 : bool domain_decomposition=false;
689 0 : parseFlag("--domain-decomposition",domain_decomposition);
690 :
691 0 : if(pc.Get_size()>1) {
692 0 : domain_decomposition=true;
693 : }
694 0 : if(domain_decomposition) {
695 0 : shuffled=true;
696 : }
697 :
698 0 : if (shuffled) {
699 0 : log << "Using --shuffled\n";
700 : }
701 0 : if (domain_decomposition) {
702 0 : log << "Using --domain-decomposition\n";
703 : }
704 :
705 : double timeToSleep;
706 0 : parse("--sleep",timeToSleep);
707 0 : log << "Using --sleep=" << timeToSleep << "\n";
708 :
709 : std::vector<int> shuffled_indexes;
710 :
711 : {
712 : std::string atomicDistr;
713 0 : parse("--atom-distribution",atomicDistr);
714 0 : distribution = getAtomDistribution(atomicDistr);
715 0 : log << "Using --atom-distribution=" << atomicDistr << "\n";
716 : }
717 :
718 : {
719 : std::string fileToDump;
720 0 : if(parse("--dump-trajectory",fileToDump)) {
721 0 : log << "Saving the trajectory to \"" << fileToDump << "\" and exiting\n";
722 0 : std::vector<double> cell(9);
723 0 : std::vector<Vector> pos(natoms);
724 0 : std::ofstream ofile(fileToDump);
725 0 : if (nf<0) {
726 : //if the user accidentally sets infinite steps, we set it to print only one
727 0 : nf=1;
728 : }
729 0 : for(int step=0; step<nf; ++step) {
730 0 : auto sw=kernels[0].stopwatch.startStop("TrajectoryGeneration");
731 0 : distribution->positions(pos,step,atomicGenerator);
732 0 : distribution->box(cell,natoms,step,atomicGenerator);
733 0 : ofile << natoms << "\n"
734 0 : << cell[0] << " " << cell[1] << " " << cell[2] << " "
735 0 : << cell[3] << " " << cell[4] << " " << cell[5] << " "
736 0 : << cell[6] << " " << cell[7] << " " << cell[8] << "\n";
737 0 : for(int i=0; i<natoms; ++i) {
738 0 : ofile << "X\t" << pos[i]<< "\n";
739 : }
740 0 : }
741 0 : ofile.close();
742 : return 0;
743 0 : }
744 : }
745 :
746 0 : log <<"Initializing the setup of the kernel(s)\n";
747 0 : const auto initial_time=std::chrono::high_resolution_clock::now();
748 :
749 0 : for(auto & k : kernels) {
750 : auto & p(k.handle);
751 0 : auto sw=k.stopwatch.startStop("A Initialization");
752 0 : if(Communicator::plumedHasMPI() && domain_decomposition) {
753 0 : p.cmd("setMPIComm",&pc.Get_comm());
754 : }
755 0 : p.cmd("setRealPrecision",(int)sizeof(double));
756 0 : p.cmd("setMDLengthUnits",1.0);
757 0 : p.cmd("setMDChargeUnits",1.0);
758 0 : p.cmd("setMDMassUnits",1.0);
759 0 : p.cmd("setMDEngine","benchmarks");
760 0 : p.cmd("setTimestep",1.0);
761 0 : p.cmd("setPlumedDat",k.plumed_dat.c_str());
762 0 : p.cmd("setLog",out);
763 0 : p.cmd("setNatoms",natoms);
764 0 : p.cmd("init");
765 0 : }
766 :
767 0 : std::vector<double> cell( 9 ), virial( 9 );
768 0 : std::vector<Vector> pos( natoms ), forces( natoms );
769 0 : std::vector<double> masses( natoms, 1 ), charges( natoms, 0 );
770 :
771 0 : if(shuffled) {
772 0 : shuffled_indexes.resize(natoms);
773 0 : for(unsigned i=0; i<natoms; i++) {
774 0 : shuffled_indexes[i]=i;
775 : }
776 0 : std::shuffle(shuffled_indexes.begin(),shuffled_indexes.end(),rng);
777 : }
778 :
779 : // non owning pointers, used for shuffling the execution order
780 : std::vector<Kernel*> kernels_ptr;
781 0 : for(unsigned i=0; i<kernels.size(); i++) {
782 0 : kernels_ptr.push_back(&kernels[i]);
783 : }
784 :
785 0 : int plumedStopCondition=0;
786 : bool fast_finish=false;
787 : int part=0;
788 :
789 0 : log<<"Starting MD loop\n";
790 0 : log<<"Use CTRL+C to stop at any time and collect timers (not working in MPI runs)\n";
791 : // trap signals:
792 0 : SignalHandlerGuard sigIntGuard(SIGINT, signalHandler);
793 0 : SignalHandlerGuard sigTermGuard(SIGTERM, signalHandler);
794 :
795 0 : for(int step=0; nf<0 || step<nf; ++step) {
796 0 : std::shuffle(kernels_ptr.begin(),kernels_ptr.end(),rng);
797 0 : distribution->positions(pos,step,atomicGenerator);
798 0 : distribution->box(cell,natoms,step,atomicGenerator);
799 : double* pos_ptr;
800 : double* for_ptr;
801 : double* charges_ptr;
802 : double* masses_ptr;
803 : int* indexes_ptr=nullptr;
804 : int n_local_atoms;
805 :
806 0 : if(domain_decomposition) {
807 0 : const auto nproc=pc.Get_size();
808 0 : const auto nn=natoms/nproc;
809 : //using int to remove warning, MPI don't work with unsigned
810 0 : int excess=natoms%nproc;
811 0 : const auto myrank=pc.Get_rank();
812 : auto shift=0;
813 0 : n_local_atoms=nn;
814 0 : if(myrank<excess) {
815 0 : n_local_atoms+=1;
816 : }
817 0 : for(int i=0; i<myrank; i++) {
818 0 : shift+=nn;
819 0 : if(i<excess) {
820 0 : shift+=1;
821 : }
822 : }
823 0 : pos_ptr=&pos[shift][0];
824 0 : for_ptr=&forces[shift][0];
825 : charges_ptr=&charges[shift];
826 : masses_ptr=&masses[shift];
827 0 : indexes_ptr=shuffled_indexes.data()+shift;
828 : } else {
829 0 : pos_ptr=&pos[0][0];
830 0 : for_ptr=&forces[0][0];
831 : charges_ptr=&charges[0];
832 : masses_ptr=&masses[0];
833 0 : n_local_atoms=natoms;
834 : indexes_ptr=shuffled_indexes.data();
835 : }
836 :
837 : const char* sw_name;
838 0 : if(part==0) {
839 : sw_name="B0 First step";
840 0 : } else if(part==1) {
841 : sw_name="B1 Warm-up";
842 0 : } else if(part==2) {
843 : sw_name="B2 Calculation part 1";
844 : } else {
845 : sw_name="B3 Calculation part 2";
846 : }
847 :
848 :
849 0 : for(unsigned i=0; i<kernels_ptr.size(); i++) {
850 0 : auto & p(kernels_ptr[i]->handle);
851 :
852 : {
853 0 : auto sw=kernels_ptr[i]->stopwatch.startPause(sw_name);
854 0 : p.cmd("setStep",step);
855 0 : p.cmd("setStopFlag",&plumedStopCondition);
856 0 : p.cmd("setForces",for_ptr, {n_local_atoms,3});
857 0 : p.cmd("setBox",&cell[0], {3,3});
858 0 : p.cmd("setVirial",&virial[0], {3,3});
859 0 : p.cmd("setPositions",pos_ptr, {n_local_atoms,3});
860 0 : p.cmd("setMasses",masses_ptr, {n_local_atoms});
861 0 : p.cmd("setCharges",charges_ptr, {n_local_atoms});
862 0 : if(shuffled) {
863 0 : p.cmd("setAtomsNlocal",n_local_atoms);
864 0 : p.cmd("setAtomsGatindex",indexes_ptr, {n_local_atoms});
865 : }
866 0 : p.cmd("prepareCalc");
867 0 : }
868 :
869 : // mimick MD calculation here
870 : {
871 : unsigned k=0;
872 0 : auto start=std::chrono::high_resolution_clock::now();
873 0 : while(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now()-start).count()<(long long int)1e9*timeToSleep) {
874 0 : k+=i*i;
875 : }
876 : std::fprintf(log_dev_null.get(),"%u",k);
877 : }
878 :
879 : {
880 0 : auto sw=kernels_ptr[i]->stopwatch.startStop(sw_name);
881 0 : p.cmd("performCalc");
882 0 : }
883 :
884 0 : if(kernels_ptr.size()>1 && part>1) {
885 0 : kernels_ptr[i]->timings.push_back(kernels_ptr[i]->stopwatch.getLastCycle(sw_name));
886 : }
887 0 : if(plumedStopCondition || signalReceived.load()) {
888 : fast_finish=true;
889 : }
890 : }
891 0 : auto elapsed=std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now()-initial_time).count();
892 0 : if(part==0) {
893 : part=1;
894 : }
895 0 : if(part<2) {
896 0 : if((maxtime>0 && elapsed>(long long int)(0.2*1e9*maxtime)) || (nf>0 && step+1>=nf/5) || (maxtime<0 && nf<0 && step+1>=100)) {
897 : part=2;
898 0 : log<<"Warm-up completed\n";
899 : }
900 : }
901 0 : if(part<3) {
902 0 : if((maxtime>0 && elapsed>(long long int)(0.6*1e9*maxtime)) || (nf>0 && step+1>=3*nf/5)) {
903 : part=3;
904 0 : log<<"60% completed\n";
905 : }
906 : }
907 :
908 0 : if(maxtime>0 && elapsed>(long long int)(1e9*maxtime)) {
909 : fast_finish=true;
910 : }
911 :
912 : {
913 0 : unsigned tmp=fast_finish;
914 0 : pc.Bcast(tmp,0);
915 0 : fast_finish=tmp;
916 : }
917 0 : if(fast_finish) {
918 : break;
919 : }
920 : }
921 :
922 : return 0;
923 0 : }
924 :
925 : } // namespace unnamed
926 : } // namespace cltools
927 : } // namespace PLMD
|