LCOV - plumed test coverage - cltools/Benchmark.cpp

LCOV - code coverage report

Current view:	top level - cltools - Benchmark.cpp (source / functions)		Hit	Total	Coverage
Test:	plumed test coverage	Lines:	20	381	5.2 %
Date:	2025-11-25 13:55:50	Functions:	6	27	22.2 %

          Line data    Source code

       1             : /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
       2             :    Copyright (c) 2019-2023 The plumed team
       3             :    (see the PEOPLE file at the root of the distribution for a list of names)
       4             : 
       5             :    See http://www.plumed.org for more information.
       6             : 
       7             :    This file is part of plumed, version 2.
       8             : 
       9             :    plumed is free software: you can redistribute it and/or modify
      10             :    it under the terms of the GNU Lesser General Public License as published by
      11             :    the Free Software Foundation, either version 3 of the License, or
      12             :    (at your option) any later version.
      13             : 
      14             :    plumed is distributed in the hope that it will be useful,
      15             :    but WITHOUT ANY WARRANTY; without even the implied warranty of
      16             :    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      17             :    GNU Lesser General Public License for more details.
      18             : 
      19             :    You should have received a copy of the GNU Lesser General Public License
      20             :    along with plumed.  If not, see <http://www.gnu.org/licenses/>.
      21             : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
      22             : #include "CLTool.h"
      23             : #include "core/CLToolRegister.h"
      24             : #include "tools/Communicator.h"
      25             : #include "tools/Tools.h"
      26             : #include "config/Config.h"
      27             : #include "tools/PlumedHandle.h"
      28             : #include "tools/Stopwatch.h"
      29             : #include "tools/Log.h"
      30             : #include "tools/DLLoader.h"
      31             : #include "tools/Random.h"
      32             : 
      33             : #include <cstdio>
      34             : #include <string>
      35             : #include <vector>
      36             : #include <fstream>
      37             : #include <atomic>
      38             : #include <csignal>
      39             : #include <cstdio>
      40             : #include <random>
      41             : #include <algorithm>
      42             : #include <chrono>
      43             : #include <string_view>
      44             : 
      45             : namespace PLMD {
      46             : namespace cltools {
      47             : 
      48             : //+PLUMEDOC TOOLS benchmark
      49             : /*
      50             : benchmark is a lightweight reimplementation of driver focused on running benchmarks
      51             : 
      52             : The main difference wrt driver is that it generates a trajectory in memory rather than reading it
      53             : from a file. This allows to better time the overhead of the plumed library, without including
      54             : the time needed to read the trajectory.
      55             : 
      56             : It is also possible to load a separate version of the plumed kernel. This enables running
      57             : benchmarks agaist previous plumed versions in a controlled setting, where systematic errors
      58             : in the comparison are minimized.
      59             : 
      60             : \par Examples
      61             : 
      62             : First, you should create a sample `plumed.dat` file for testing. For instance:
      63             : 
      64             : \plumedfile
      65             : WHOLEMOLECULES ENTITY0=1-10000
      66             : p: POSITION ATOM=1
      67             : RESTRAINT ARG=p.x KAPPA=1 AT=0
      68             : \endplumedfile
      69             : 
      70             : Then you can test the performance of this input with the following command:
      71             : \verbatim
      72             : plumed benchmark
      73             : \endverbatim
      74             : 
      75             : You can also test a different (older) version of PLUMED with the same input. To do so,
      76             : you should run
      77             : \verbatim
      78             : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so
      79             : \endverbatim
      80             : 
      81             : \warning It is necessary to use the `plumed-runtime` executable here to avoid conflicts between different
      82             : plumed versions. You will find it in your path if you are using the non installed version of plumed,
      83             : and in `$prefix/lib/plumed` if you installed plumed in $prefix,.
      84             : 
      85             : \par Comparing multiple versions
      86             : 
      87             : The best way to compare two versions of plumed on the same input is to pass multiple colon-separated kernels:
      88             : 
      89             : \verbatim
      90             : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so:/path2/to/lib/libplumedKernel.so:this
      91             : \endverbatim
      92             : 
      93             : Here `this` means the kernel of the version with which you are running the benchmark. This comparison runs the three
      94             : instances simultaneously (alternating them) so that systematic differences in the load of your machine will affect them
      95             : to the same extent.
      96             : 
      97             : In case the different versions require modified plumed.dat files, or if you simply want to compare
      98             : two different plumed input files that compute the same thing, you can also use multiple plumed input files:
      99             : 
     100             : \verbatim
     101             : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so:this --plumed plumed1.dat:plumed2.dat
     102             : \endverbatim
     103             : 
     104             : Similarly, you might want to run two different inputs using the same kernel, which can be obtained with:
     105             : 
     106             : \verbatim
     107             : plumed-runtime benchmark --plumed plumed1.dat:plumed2.dat
     108             : \endverbatim
     109             : 
     110             : \par Profiling
     111             : 
     112             : If you want to attach a profiler on the fly to the process, you might find it convenient to use `--nsteps -1`.
     113             : The simulation will run forever and can be interrupted with CTRL-C. When interrupted, the result of the timers
     114             : should be displayed anyway.
     115             : You can also run setting a maximum time with `--maxtime`.
     116             : 
     117             : If you run a profiler when testing multiple PLUMED versions you might be confused by which function is from
     118             : each version. It is recommended to recompile separate instances with a separate C++ namespace (`-DPLMD=PLUMED_version_1`)
     119             : so that you will be able to distinguish them. In addition, compiling with `CXXFLAGS="-g -O3"` will make the profiling
     120             : report more complete, likely including code lines.
     121             : 
     122             : \par MPI runs
     123             : 
     124             : You can run emulating a domain decomposition. This is done automatically if plumed has been compiled with MPI
     125             : and you run with `mpirun`
     126             : 
     127             : \verbatim
     128             : mpirun -np 4 plumed-runtime benchmark
     129             : \endverbatim
     130             : 
     131             : If you load separate PLUMED instances as discussed above, they should all be compiled against the same MPI version.
     132             : Notice that when using MPI signals (CTRL-C) might not work.
     133             : 
     134             : Since some of the data transfer could happen asynchronously, you might want to use the `--sleep` option
     135             : to simulate a lag between the `prepareCalc` and `performCalc` actions. This part of the calculation will not contribute
     136             : to timer, but will obviously slow down your test.
     137             : 
     138             : \par Output
     139             : 
     140             : In the output you will see the usual reports about timing produced by the internal
     141             : timers of the tested plumed instances.
     142             : In addition, this tool will monitor the timing externally, with some slightly different criterion:
     143             : - First, the initialization (construction of the input) will be shown with a separate timer,
     144             :   as well as the timing for the first step.
     145             : - Second, the timer corresponding to the calculation will be split in three parts, reporting
     146             :   execution of the first 20% (warm-up) and the next two blocks of 40% each.
     147             : - Finally, you might notice some discrepancy because some of the actions that are usually
     148             :   not expensive are not included in the internal timers. The external timer will
     149             :   thus provide a better estimate of the total elapsed time, including everything.
     150             : 
     151             : The internal timers are still useful to monitor what happens at the different stages
     152             : and, with \ref DEBUG `DETAILED_TIMERS`, what happens in each action.
     153             : 
     154             : When you run multiple version, a comparative analisys of the time spent within PLUMED in the various
     155             : instances will be done, showing the ratio between the total time and the time measured on the first
     156             : instance, which will act as a reference. Errors will be estimated with bootstrapping. The warm-up phase will be discarded for
     157             : this analysis.
     158             : 
     159             : */
     160             : //+ENDPLUMEDOC
     161             : 
     162             : // We use an anonymous namespace here to avoid clashes with variables
     163             : // declared in other parts of the code
     164             : namespace {
     165             : 
     166             : //this is a sugar for changing idea faster about the rng
     167             : using generator = std::mt19937;
     168             : 
     169             : std::atomic<bool> signalReceived(false);
     170             : 
     171             : class SignalHandlerGuard {
     172             : public:
     173           0 :   SignalHandlerGuard(int signal, void (*newHandler)(int)) : signal_(signal) {
     174             :     // Store the current handler before setting the new one
     175           0 :     prevHandler_ = std::signal(signal, newHandler);
     176           0 :     if (prevHandler_ == SIG_ERR) {
     177           0 :       throw std::runtime_error("Failed to set signal handler");
     178             :     }
     179           0 :   }
     180             : 
     181             :   ~SignalHandlerGuard() {
     182             :     // Restore the previous handler upon destruction
     183           0 :     std::signal(signal_, prevHandler_);
     184           0 :   }
     185             : 
     186             :   // Delete copy constructor and assignment operator to prevent copying
     187             :   SignalHandlerGuard(const SignalHandlerGuard&) = delete;
     188             :   SignalHandlerGuard& operator=(const SignalHandlerGuard&) = delete;
     189             : 
     190             : private:
     191             :   int signal_;
     192             :   void (*prevHandler_)(int);
     193             : };
     194             : 
     195           0 : extern "C" void signalHandler(int signal) {
     196           0 :   if (signal == SIGINT) {
     197             :     signalReceived.store(true);
     198           0 :     fprintf(stderr, "Signal interrupt received\n");
     199             :   }
     200           0 :   if (signal == SIGTERM) {
     201             :     signalReceived.store(true);
     202           0 :     fprintf(stderr, "Signal termination received\n");
     203             :   }
     204           0 : }
     205             : 
     206             : /// This base class contains members that are movable with default operations
     207             : struct KernelBase {
     208             :   std::string path;
     209             :   std::string plumed_dat;
     210             :   PlumedHandle handle;
     211             :   Stopwatch stopwatch;
     212             :   std::vector<long long int> timings;
     213             :   double comparative_timing=-1.0;
     214             :   double comparative_timing_error=-1.0;
     215           0 :   KernelBase(const std::string & path_,const std::string & plumed_dat_, Log* log_):
     216           0 :     path(path_),
     217           0 :     plumed_dat(plumed_dat_),
     218           0 :     handle([&]() {
     219           0 :     if(path_=="this") {
     220           0 :       return PlumedHandle();
     221             :     } else {
     222           0 :       return PlumedHandle::dlopen(path.c_str());
     223             :     }
     224             :   }()),
     225           0 :   stopwatch(*log_) {
     226           0 :   }
     227             : };
     228             : 
     229             : /// Local structure handling a kernel and the related timers.
     230             : /// This structure specifically contain the Log, which needs special treatment
     231             : /// in move semantics
     232             : struct Kernel :
     233             :   public KernelBase {
     234             :   Log* log=nullptr;
     235           0 :   Kernel(const std::string & path_,const std::string & the_plumed_dat, Log* log_):
     236             :     KernelBase(path_,the_plumed_dat,log_),
     237           0 :     log(log_) {
     238             :   }
     239             : 
     240           0 :   ~Kernel() {
     241           0 :     if(log) {
     242           0 :       (*log)<<"\n";
     243           0 :       (*log)        <<"Kernel:      "<<path<<"\n";
     244           0 :       (*log)        <<"Input:       "<<plumed_dat<<"\n";
     245           0 :       if(comparative_timing>0.0) {
     246           0 :         (*log).printf("Comparative: %.3f +- %.3f\n",comparative_timing,comparative_timing_error);
     247             :       }
     248             :     }
     249           0 :   }
     250             : 
     251           0 :   Kernel(Kernel && other) noexcept:
     252             :     KernelBase(std::move(other)),
     253           0 :     log(other.log) {
     254           0 :     other.log=nullptr; // ensure no log is done in the moved away object
     255             :   }
     256             : 
     257           0 :   Kernel & operator=(Kernel && other) noexcept {
     258           0 :     if(this != &other) {
     259           0 :       KernelBase::operator=(std::move(other));
     260           0 :       log=other.log;
     261           0 :       other.log=nullptr; // ensure no log is done in the moved away object
     262             :     }
     263           0 :     return *this;
     264             :   }
     265             : };
     266             : 
     267             : namespace  {
     268             : 
     269             : class UniformSphericalVector {
     270             :   //double rminCub;
     271             :   double rCub;
     272             : 
     273             : public:
     274             :   //assuming rmin=0
     275           0 :   UniformSphericalVector(const double rmax):
     276           0 :     rCub (rmax*rmax*rmax/*-rminCub*/) {}
     277           0 :   PLMD::Vector operator()(Random& rng) {
     278           0 :     double rho = std::cbrt (/*rminCub + */rng.RandU01()*rCub);
     279           0 :     double theta =std::acos (2.0*rng.RandU01() -1.0);
     280           0 :     double phi = 2.0 * PLMD::pi * rng.RandU01();
     281             :     return Vector (
     282           0 :              rho * sin (theta) * cos (phi),
     283           0 :              rho * sin (theta) * sin (phi),
     284           0 :              rho * cos (theta));
     285             :   }
     286             : };
     287             : 
     288             : ///Acts as a template for any distribution
     289             : struct AtomDistribution {
     290             :   virtual void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random&)=0;
     291           0 :   virtual void box(std::vector<double>& box, unsigned /*natoms*/, unsigned /*step*/, Random&) {
     292             :     std::fill(box.begin(), box.end(),0);
     293           0 :   };
     294             :   virtual ~AtomDistribution() noexcept {}
     295             : };
     296             : 
     297           0 : struct theLine:public AtomDistribution {
     298           0 :   void positions(std::vector<Vector>& posToUpdate, unsigned step, Random&rng) override {
     299             :     auto nat = posToUpdate.size();
     300             :     UniformSphericalVector usv(0.5);
     301             : 
     302           0 :     for (unsigned i=0; i<nat; ++i) {
     303           0 :       posToUpdate[i] = Vector(i, 0, 0) + usv(rng);
     304             :     }
     305           0 :   }
     306             : };
     307             : 
     308           0 : struct uniformSphere:public AtomDistribution {
     309           0 :   void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
     310             : 
     311             :     //giving more or less a cubic udm of volume for each atom: V=nat
     312           0 :     const double rmax= std::cbrt ((3.0/(4.0*PLMD::pi)) * posToUpdate.size());
     313             : 
     314             :     UniformSphericalVector usv(rmax);
     315             :     auto s=posToUpdate.begin();
     316             :     auto e=posToUpdate.end();
     317             :     //I am using the iterators:this is slightly faster,
     318             :     // enough to overcome the cost of the vtable that I added
     319           0 :     for (unsigned i=0; s!=e; ++s,++i) {
     320           0 :       *s = usv (rng);
     321             :     }
     322             : 
     323           0 :   }
     324           0 :   void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
     325           0 :     const double rmax= 2.0*std::cbrt((3.0/(4.0*PLMD::pi)) * natoms);
     326           0 :     box[0]=rmax;
     327           0 :     box[1]=0.0;
     328           0 :     box[2]=0.0;
     329           0 :     box[3]=0.0;
     330           0 :     box[4]=rmax;
     331           0 :     box[5]=0.0;
     332           0 :     box[6]=0.0;
     333           0 :     box[7]=0.0;
     334           0 :     box[8]=rmax;
     335             : 
     336           0 :   }
     337             : };
     338             : 
     339           0 : struct twoGlobs: public AtomDistribution {
     340           0 :   virtual void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random&rng) {
     341             :     //I am using two unigform spheres and 2V=n
     342           0 :     const double rmax= std::cbrt ((3.0/(8.0*PLMD::pi)) * posToUpdate.size());
     343             : 
     344             :     UniformSphericalVector usv(rmax);
     345             :     std::array<Vector,2> centers{
     346             :       PLMD::Vector{0.0,0.0,0.0},
     347             : //so they do not overlap
     348             :       PLMD::Vector{2.0*rmax,2.0*rmax,2.0*rmax}
     349           0 :     };
     350           0 :     std::generate(posToUpdate.begin(),posToUpdate.end(),[&]() {
     351             :       //RandInt is only declared
     352             :       // return usv (rng) + centers[rng.RandInt(1)];
     353           0 :       return usv (rng) + centers[rng.RandU01()>0.5];
     354             :     });
     355           0 :   }
     356             : 
     357           0 :   virtual void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) {
     358             : 
     359           0 :     const double rmax= 4.0 * std::cbrt ((3.0/(8.0*PLMD::pi)) * natoms);
     360           0 :     box[0]=rmax;
     361           0 :     box[1]=0.0;
     362           0 :     box[2]=0.0;
     363           0 :     box[3]=0.0;
     364           0 :     box[4]=rmax;
     365           0 :     box[5]=0.0;
     366           0 :     box[6]=0.0;
     367           0 :     box[7]=0.0;
     368           0 :     box[8]=rmax;
     369           0 :   };
     370             : };
     371             : 
     372           0 : struct uniformCube:public AtomDistribution {
     373           0 :   void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
     374             :     //giving more or less a cubic udm of volume for each atom: V = nat
     375           0 :     const double rmax = std::cbrt(static_cast<double>(posToUpdate.size()));
     376             : 
     377             : 
     378             : 
     379             :     // std::generate(posToUpdate.begin(),posToUpdate.end(),[&]() {
     380             :     //   return Vector (rndR(rng),rndR(rng),rndR(rng));
     381             :     // });
     382             :     auto s=posToUpdate.begin();
     383             :     auto e=posToUpdate.end();
     384             :     //I am using the iterators:this is slightly faster,
     385             :     // enough to overcome the cost of the vtable that I added
     386           0 :     for (unsigned i=0; s!=e; ++s,++i) {
     387           0 :       *s = Vector (rng.RandU01()*rmax,rng.RandU01()*rmax,rng.RandU01()*rmax);
     388             :     }
     389           0 :   }
     390           0 :   void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
     391             :     //+0.05 to avoid overlap
     392           0 :     const double rmax= std::cbrt(natoms)+0.05;
     393           0 :     box[0]=rmax;
     394           0 :     box[1]=0.0;
     395           0 :     box[2]=0.0;
     396           0 :     box[3]=0.0;
     397           0 :     box[4]=rmax;
     398           0 :     box[5]=0.0;
     399           0 :     box[6]=0.0;
     400           0 :     box[7]=0.0;
     401           0 :     box[8]=rmax;
     402             : 
     403           0 :   }
     404             : };
     405             : 
     406           0 : struct tiledSimpleCubic:public AtomDistribution {
     407           0 :   void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
     408             :     //Tiling the space in this way will not tests 100% the pbc, but
     409             :     //I do not think that write a spacefilling curve, like Hilbert, Peano or Morton
     410             :     //could be a good idea, in this case
     411           0 :     const unsigned rmax = std::ceil(std::cbrt(static_cast<double>(posToUpdate.size())));
     412             : 
     413             :     auto s=posToUpdate.begin();
     414             :     auto e=posToUpdate.end();
     415             :     //I am using the iterators:this is slightly faster,
     416             :     // enough to overcome the cost of the vtable that I added
     417           0 :     for (unsigned k=0; k<rmax&&s!=e; ++k) {
     418           0 :       for (unsigned j=0; j<rmax&&s!=e; ++j) {
     419           0 :         for (unsigned i=0; i<rmax&&s!=e; ++i) {
     420           0 :           *s = Vector (i,j,k);
     421             :           ++s;
     422             :         }
     423             :       }
     424             :     }
     425           0 :   }
     426           0 :   void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
     427           0 :     const double rmax= std::ceil(std::cbrt(static_cast<double>(natoms)));;
     428           0 :     box[0]=rmax;
     429           0 :     box[1]=0.0;
     430           0 :     box[2]=0.0;
     431           0 :     box[3]=0.0;
     432           0 :     box[4]=rmax;
     433           0 :     box[5]=0.0;
     434           0 :     box[6]=0.0;
     435           0 :     box[7]=0.0;
     436           0 :     box[8]=rmax;
     437             : 
     438           0 :   }
     439             : };
     440           0 : std::unique_ptr<AtomDistribution> getAtomDistribution(std::string_view atomicDistr) {
     441           0 :   std::unique_ptr<AtomDistribution> distribution;
     442           0 :   if(atomicDistr == "line") {
     443           0 :     distribution = std::make_unique<theLine>();
     444           0 :   } else if (atomicDistr == "cube") {
     445           0 :     distribution = std::make_unique<uniformCube>();
     446           0 :   } else if (atomicDistr == "sphere") {
     447           0 :     distribution = std::make_unique<uniformSphere>();
     448           0 :   } else if (atomicDistr == "globs") {
     449           0 :     distribution = std::make_unique<twoGlobs>();
     450           0 :   } else if (atomicDistr == "sc") {
     451           0 :     distribution = std::make_unique<tiledSimpleCubic>();
     452             :   } else {
     453           0 :     plumed_error() << R"(The atomic distribution can be only "line", "cube", "sphere", "globs" and "sc", the input was ")"
     454           0 :                    << atomicDistr <<'"';
     455             :   }
     456           0 :   return distribution;
     457           0 : }
     458             : } //anonymus namespace for benchmark distributions
     459             : class Benchmark:
     460             :   public CLTool {
     461             : public:
     462             :   static void registerKeywords( Keywords& keys );
     463             :   explicit Benchmark(const CLToolOptions& co );
     464             :   int main(FILE* in, FILE*out,Communicator& pc) override;
     465           4 :   std::string description()const override {
     466           4 :     return "run a calculation with a fixed trajectory to find bottlenecks in PLUMED";
     467             :   }
     468             : };
     469             : 
     470       16330 : PLUMED_REGISTER_CLTOOL(Benchmark,"benchmark")
     471             : 
     472        5442 : void Benchmark::registerKeywords( Keywords& keys ) {
     473        5442 :   CLTool::registerKeywords( keys );
     474       10884 :   keys.add("compulsory","--plumed","plumed.dat","colon separated path(s) to the input file(s)");
     475       10884 :   keys.add("compulsory","--kernel","this","colon separated path(s) to kernel(s)");
     476       10884 :   keys.add("compulsory","--natoms","100000","the number of atoms to use for the simulation");
     477       10884 :   keys.add("compulsory","--nsteps","2000","number of steps of MD to perform (-1 means forever)");
     478       10884 :   keys.add("compulsory","--maxtime","-1","maximum number of seconds (-1 means forever)");
     479       10884 :   keys.add("compulsory","--sleep","0","number of seconds of sleep, mimicking MD calculation");
     480       10884 :   keys.add("compulsory","--atom-distribution","line","the kind of possible atomic displacement at each step");
     481       10884 :   keys.add("optional","--dump-trajectory","dump the trajectory to this file");
     482       10884 :   keys.addFlag("--domain-decomposition",false,"simulate domain decomposition, implies --shuffle");
     483       10884 :   keys.addFlag("--shuffled",false,"reshuffle atoms");
     484        5442 : }
     485             : 
     486           4 : Benchmark::Benchmark(const CLToolOptions& co ):
     487           4 :   CLTool(co) {
     488           4 :   inputdata=commandline;
     489           4 : }
     490             : 
     491             : 
     492           0 : int Benchmark::main(FILE* in, FILE*out,Communicator& pc) {
     493             :   // deterministic initializations to avoid issues with MPI
     494             :   generator rng;
     495           0 :   PLMD::Random atomicGenerator;
     496           0 :   std::unique_ptr<AtomDistribution> distribution;
     497             : 
     498             :   struct FileDeleter {
     499             :     void operator()(FILE*f) const noexcept {
     500             :       if(f) {
     501           0 :         std::fclose(f);
     502             :       }
     503           0 :     }
     504             :   };
     505             : 
     506           0 :   std::unique_ptr<FILE,FileDeleter> log_dev_null{std::fopen("/dev/null","w")};
     507             : 
     508           0 :   Log log;
     509           0 :   if(pc.Get_rank()==0) {
     510           0 :     log.link(out);
     511             :   } else {
     512           0 :     log.link(log_dev_null.get());
     513             :   }
     514           0 :   log.setLinePrefix("BENCH:  ");
     515           0 :   log <<"Welcome to PLUMED benchmark\n";
     516             :   std::vector<Kernel> kernels;
     517             : 
     518             :   // perform comparative analysis
     519             :   // ensure that kernels vector is destroyed from last to first element upon exit
     520           0 :   auto kernels_deleter=[&log](auto f) {
     521           0 :     if(!f) {
     522           0 :       return;
     523             :     }
     524           0 :     if(f->empty()) {
     525             :       return;
     526             :     }
     527             :     generator bootstrapRng;
     528             : 
     529             :     const auto size=f->back().timings.size();
     530             :     //B are the bootstrap iterations
     531             :     constexpr int B=200;
     532           0 :     const size_t numblocks=size;
     533             :     // for some reasons, blocked bootstrap underestimates error
     534             :     // For now I keep it off. If I remove it, the code below could be simplified
     535             :     // if(numblocks>20) numblocks=20;
     536           0 :     const auto blocksize=size/numblocks;
     537             : 
     538           0 :     if(f->size()<2) {
     539           0 :       log<<"Single run, skipping comparative analysis\n";
     540           0 :     } else if(size<10) {
     541           0 :       log<<"Too small sample, skipping comparative analysis\n";
     542             :     } else
     543             :       try {
     544             : 
     545           0 :         log<<"Running comparative analysis, "<<numblocks<<" blocks with size "<<blocksize<<"\n";
     546             : 
     547           0 :         std::vector<std::size_t> choice(size);
     548           0 :         std::uniform_int_distribution<> distrib(0, numblocks-1);
     549           0 :         std::vector<std::vector<long long int>> blocks(f->size());
     550             : 
     551             :         {
     552             :           int i=0;
     553           0 :           for(auto it = f->rbegin(); it != f->rend(); ++it,++i) {
     554             :             size_t l=0;
     555           0 :             blocks[i].assign(numblocks,0);
     556           0 :             for(auto j=0ULL; j<numblocks; j++) {
     557           0 :               for(auto k=0ULL; k<blocksize; k++) {
     558           0 :                 plumed_assert(l<it->timings.size());
     559           0 :                 blocks[i][j]+=it->timings[l];
     560           0 :                 l++;
     561             :               }
     562             :             }
     563             :           }
     564             :         }
     565             : 
     566           0 :         std::vector<std::vector<double>> ratios(f->size());
     567           0 :         for(auto & r : ratios) {
     568             :           //B are the bootstrap iterations
     569           0 :           r.resize(B);
     570             :         }
     571             : 
     572             :         //B are the bootstrap iterations
     573           0 :         for(unsigned b=0; b<B; b++) {
     574           0 :           for(auto & c : choice) {
     575           0 :             c=distrib(bootstrapRng);
     576             :           }
     577             :           long long int reference=0;
     578           0 :           for(auto & c : choice) {
     579           0 :             reference+=blocks[0][c];
     580             :           }
     581           0 :           for(auto i=0ULL; i<blocks.size(); i++) {
     582             :             long long int estimate=0;
     583             :             // this would lead to separate bootstrap samples for each estimate:
     584             :             // for(auto & c : choice){c=distrib(bootstrapRng);}
     585           0 :             for(auto & c : choice) {
     586           0 :               estimate+=blocks[i][c];
     587             :             }
     588           0 :             ratios[i][b]=double(estimate)/double(reference);
     589             :           }
     590             :         }
     591             : 
     592             :         {
     593             :           int i=0;
     594           0 :           for(auto it = f->rbegin(); it != f->rend(); ++it,++i) {
     595             :             double sum=0.0;
     596             :             double sum2=0.0;
     597           0 :             for(auto r : ratios[i]) {
     598           0 :               sum+=r;
     599           0 :               sum2+=r*r;
     600             :             }
     601             :             //B are the bootstrap iterations
     602           0 :             it->comparative_timing=sum/B;
     603           0 :             it->comparative_timing_error=std::sqrt(sum2/B-sum*sum/(B*B));
     604             :           }
     605             :         }
     606             : 
     607           0 :       } catch(std::exception & e) {
     608           0 :         log<<"Unexpected error during comparative analysis\n";
     609           0 :         log<<e.what()<<"\n";
     610             :       }
     611           0 :     while(!f->empty()) {
     612             :       f->pop_back();
     613             :     }
     614             : 
     615             :   };
     616             :   std::unique_ptr<decltype(kernels),decltype(kernels_deleter)> kernels_deleter_obj(&kernels,kernels_deleter);
     617             : 
     618             : 
     619             :   // construct the kernels vector:
     620             :   {
     621             :     std::vector<std::string> allpaths;
     622             : 
     623             :     {
     624             :       std::string paths;
     625           0 :       parse("--kernel",paths);
     626           0 :       log <<"Using --kernel=" << paths << "\n";
     627           0 :       allpaths=Tools::getWords(paths,":");
     628             :     }
     629             : 
     630             :     std::vector<std::string> allplumed;
     631             :     {
     632             :       std::string paths;
     633           0 :       parse("--plumed",paths);
     634           0 :       log <<"Using --plumed=" << paths << "\n";
     635           0 :       allplumed=Tools::getWords(paths,":");
     636             :     }
     637             : 
     638           0 :     plumed_assert(allplumed.size()>0 && allpaths.size()>0);
     639             : 
     640             :     // this check only works on MacOS
     641             : #if defined(__APPLE__)
     642             :     // if any of the paths if different from "this", we check if libplumed was loaded locally to avoid conflicts.
     643             :     if(std::any_of(allpaths.begin(),allpaths.end(),[](auto value) {
     644             :     return value != "this";
     645             :   })) {
     646             :       if(DLLoader::isPlumedGlobal()) {
     647             :         plumed_error()<<"It looks like libplumed is loaded in the global namespace, you cannot load a different version of the kernel\n"
     648             :                       <<"Please make sure you use the plumed-runtime executable and that the env var PLUMED_LOAD_NAMESPACE is not set to GLOBAL";
     649             :       }
     650             :     }
     651             : #endif
     652             : 
     653           0 :     if(allplumed.size()>1 && allpaths.size()>1 && allplumed.size() != allpaths.size()) {
     654           0 :       plumed_error() << "--kernel and --plumed should have either one element or the same number of elements";
     655             :     }
     656             : 
     657           0 :     if(allplumed.size()>1 && allpaths.size()==1)
     658           0 :       for(unsigned i=1; i<allplumed.size(); i++) {
     659           0 :         allpaths.push_back(allpaths[0]);
     660             :       }
     661           0 :     if(allplumed.size()==1 && allpaths.size()>1)
     662           0 :       for(unsigned i=1; i<allpaths.size(); i++) {
     663           0 :         allplumed.push_back(allplumed[0]);
     664             :       }
     665             : 
     666           0 :     for(unsigned i=0; i<allpaths.size(); i++) {
     667           0 :       kernels.emplace_back(allpaths[i],allplumed[i],&log);
     668             :     }
     669           0 :   }
     670             : 
     671             :   // reverse order so that log happens in the forward order:
     672           0 :   std::reverse(kernels.begin(),kernels.end());
     673             : 
     674             :   // read other flags:
     675           0 :   bool shuffled=false;
     676           0 :   parseFlag("--shuffled",shuffled);
     677             : 
     678             :   int nf;
     679           0 :   parse("--nsteps",nf);
     680           0 :   log << "Using --nsteps=" << nf << "\n";
     681             :   unsigned natoms;
     682           0 :   parse("--natoms",natoms);
     683           0 :   log << "Using --natoms=" << natoms << "\n";
     684             :   double maxtime;
     685           0 :   parse("--maxtime",maxtime);
     686           0 :   log << "Using --maxtime=" << maxtime << "\n";
     687             : 
     688           0 :   bool domain_decomposition=false;
     689           0 :   parseFlag("--domain-decomposition",domain_decomposition);
     690             : 
     691           0 :   if(pc.Get_size()>1) {
     692           0 :     domain_decomposition=true;
     693             :   }
     694           0 :   if(domain_decomposition) {
     695           0 :     shuffled=true;
     696             :   }
     697             : 
     698           0 :   if (shuffled) {
     699           0 :     log << "Using --shuffled\n";
     700             :   }
     701           0 :   if (domain_decomposition) {
     702           0 :     log << "Using --domain-decomposition\n";
     703             :   }
     704             : 
     705             :   double timeToSleep;
     706           0 :   parse("--sleep",timeToSleep);
     707           0 :   log << "Using --sleep=" << timeToSleep << "\n";
     708             : 
     709             :   std::vector<int> shuffled_indexes;
     710             : 
     711             :   {
     712             :     std::string atomicDistr;
     713           0 :     parse("--atom-distribution",atomicDistr);
     714           0 :     distribution = getAtomDistribution(atomicDistr);
     715           0 :     log << "Using --atom-distribution=" << atomicDistr << "\n";
     716             :   }
     717             : 
     718             :   {
     719             :     std::string fileToDump;
     720           0 :     if(parse("--dump-trajectory",fileToDump)) {
     721           0 :       log << "Saving the trajectory to \"" << fileToDump << "\" and exiting\n";
     722           0 :       std::vector<double> cell(9);
     723           0 :       std::vector<Vector> pos(natoms);
     724           0 :       std::ofstream ofile(fileToDump);
     725           0 :       if (nf<0) {
     726             :         //if the user accidentally sets infinite steps, we set it to print only one
     727           0 :         nf=1;
     728             :       }
     729           0 :       for(int step=0; step<nf; ++step) {
     730           0 :         auto sw=kernels[0].stopwatch.startStop("TrajectoryGeneration");
     731           0 :         distribution->positions(pos,step,atomicGenerator);
     732           0 :         distribution->box(cell,natoms,step,atomicGenerator);
     733           0 :         ofile << natoms << "\n"
     734           0 :               << cell[0] << " " << cell[1] << " " << cell[2] << " "
     735           0 :               << cell[3] << " " << cell[4] << " " << cell[5] << " "
     736           0 :               << cell[6] << " " << cell[7] << " " << cell[8] << "\n";
     737           0 :         for(int i=0; i<natoms; ++i) {
     738           0 :           ofile << "X\t" << pos[i]<< "\n";
     739             :         }
     740           0 :       }
     741           0 :       ofile.close();
     742             :       return 0;
     743           0 :     }
     744             :   }
     745             : 
     746           0 :   log <<"Initializing the setup of the kernel(s)\n";
     747           0 :   const auto initial_time=std::chrono::high_resolution_clock::now();
     748             : 
     749           0 :   for(auto & k : kernels) {
     750             :     auto & p(k.handle);
     751           0 :     auto sw=k.stopwatch.startStop("A Initialization");
     752           0 :     if(Communicator::plumedHasMPI() && domain_decomposition) {
     753           0 :       p.cmd("setMPIComm",&pc.Get_comm());
     754             :     }
     755           0 :     p.cmd("setRealPrecision",(int)sizeof(double));
     756           0 :     p.cmd("setMDLengthUnits",1.0);
     757           0 :     p.cmd("setMDChargeUnits",1.0);
     758           0 :     p.cmd("setMDMassUnits",1.0);
     759           0 :     p.cmd("setMDEngine","benchmarks");
     760           0 :     p.cmd("setTimestep",1.0);
     761           0 :     p.cmd("setPlumedDat",k.plumed_dat.c_str());
     762           0 :     p.cmd("setLog",out);
     763           0 :     p.cmd("setNatoms",natoms);
     764           0 :     p.cmd("init");
     765           0 :   }
     766             : 
     767           0 :   std::vector<double> cell( 9 ), virial( 9 );
     768           0 :   std::vector<Vector> pos( natoms ), forces( natoms );
     769           0 :   std::vector<double> masses( natoms, 1 ), charges( natoms, 0 );
     770             : 
     771           0 :   if(shuffled) {
     772           0 :     shuffled_indexes.resize(natoms);
     773           0 :     for(unsigned i=0; i<natoms; i++) {
     774           0 :       shuffled_indexes[i]=i;
     775             :     }
     776           0 :     std::shuffle(shuffled_indexes.begin(),shuffled_indexes.end(),rng);
     777             :   }
     778             : 
     779             :   // non owning pointers, used for shuffling the execution order
     780             :   std::vector<Kernel*> kernels_ptr;
     781           0 :   for(unsigned i=0; i<kernels.size(); i++) {
     782           0 :     kernels_ptr.push_back(&kernels[i]);
     783             :   }
     784             : 
     785           0 :   int plumedStopCondition=0;
     786             :   bool fast_finish=false;
     787             :   int part=0;
     788             : 
     789           0 :   log<<"Starting MD loop\n";
     790           0 :   log<<"Use CTRL+C to stop at any time and collect timers (not working in MPI runs)\n";
     791             :   // trap signals:
     792           0 :   SignalHandlerGuard sigIntGuard(SIGINT, signalHandler);
     793           0 :   SignalHandlerGuard sigTermGuard(SIGTERM, signalHandler);
     794             : 
     795           0 :   for(int step=0; nf<0 || step<nf; ++step) {
     796           0 :     std::shuffle(kernels_ptr.begin(),kernels_ptr.end(),rng);
     797           0 :     distribution->positions(pos,step,atomicGenerator);
     798           0 :     distribution->box(cell,natoms,step,atomicGenerator);
     799             :     double* pos_ptr;
     800             :     double* for_ptr;
     801             :     double* charges_ptr;
     802             :     double* masses_ptr;
     803             :     int* indexes_ptr=nullptr;
     804             :     int n_local_atoms;
     805             : 
     806           0 :     if(domain_decomposition) {
     807           0 :       const auto nproc=pc.Get_size();
     808           0 :       const auto nn=natoms/nproc;
     809             :       //using int to remove warning, MPI don't work with unsigned
     810           0 :       int excess=natoms%nproc;
     811           0 :       const auto myrank=pc.Get_rank();
     812             :       auto shift=0;
     813           0 :       n_local_atoms=nn;
     814           0 :       if(myrank<excess) {
     815           0 :         n_local_atoms+=1;
     816             :       }
     817           0 :       for(int i=0; i<myrank; i++) {
     818           0 :         shift+=nn;
     819           0 :         if(i<excess) {
     820           0 :           shift+=1;
     821             :         }
     822             :       }
     823           0 :       pos_ptr=&pos[shift][0];
     824           0 :       for_ptr=&forces[shift][0];
     825             :       charges_ptr=&charges[shift];
     826             :       masses_ptr=&masses[shift];
     827           0 :       indexes_ptr=shuffled_indexes.data()+shift;
     828             :     } else {
     829           0 :       pos_ptr=&pos[0][0];
     830           0 :       for_ptr=&forces[0][0];
     831             :       charges_ptr=&charges[0];
     832             :       masses_ptr=&masses[0];
     833           0 :       n_local_atoms=natoms;
     834             :       indexes_ptr=shuffled_indexes.data();
     835             :     }
     836             : 
     837             :     const char* sw_name;
     838           0 :     if(part==0) {
     839             :       sw_name="B0 First step";
     840           0 :     } else if(part==1) {
     841             :       sw_name="B1 Warm-up";
     842           0 :     } else if(part==2) {
     843             :       sw_name="B2 Calculation part 1";
     844             :     } else {
     845             :       sw_name="B3 Calculation part 2";
     846             :     }
     847             : 
     848             : 
     849           0 :     for(unsigned i=0; i<kernels_ptr.size(); i++) {
     850           0 :       auto & p(kernels_ptr[i]->handle);
     851             : 
     852             :       {
     853           0 :         auto sw=kernels_ptr[i]->stopwatch.startPause(sw_name);
     854           0 :         p.cmd("setStep",step);
     855           0 :         p.cmd("setStopFlag",&plumedStopCondition);
     856           0 :         p.cmd("setForces",for_ptr, {n_local_atoms,3});
     857           0 :         p.cmd("setBox",&cell[0], {3,3});
     858           0 :         p.cmd("setVirial",&virial[0], {3,3});
     859           0 :         p.cmd("setPositions",pos_ptr, {n_local_atoms,3});
     860           0 :         p.cmd("setMasses",masses_ptr, {n_local_atoms});
     861           0 :         p.cmd("setCharges",charges_ptr, {n_local_atoms});
     862           0 :         if(shuffled) {
     863           0 :           p.cmd("setAtomsNlocal",n_local_atoms);
     864           0 :           p.cmd("setAtomsGatindex",indexes_ptr, {n_local_atoms});
     865             :         }
     866           0 :         p.cmd("prepareCalc");
     867           0 :       }
     868             : 
     869             :       // mimick MD calculation here
     870             :       {
     871             :         unsigned k=0;
     872           0 :         auto start=std::chrono::high_resolution_clock::now();
     873           0 :         while(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now()-start).count()<(long long int)1e9*timeToSleep) {
     874           0 :           k+=i*i;
     875             :         }
     876             :         std::fprintf(log_dev_null.get(),"%u",k);
     877             :       }
     878             : 
     879             :       {
     880           0 :         auto sw=kernels_ptr[i]->stopwatch.startStop(sw_name);
     881           0 :         p.cmd("performCalc");
     882           0 :       }
     883             : 
     884           0 :       if(kernels_ptr.size()>1 && part>1) {
     885           0 :         kernels_ptr[i]->timings.push_back(kernels_ptr[i]->stopwatch.getLastCycle(sw_name));
     886             :       }
     887           0 :       if(plumedStopCondition || signalReceived.load()) {
     888             :         fast_finish=true;
     889             :       }
     890             :     }
     891           0 :     auto elapsed=std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now()-initial_time).count();
     892           0 :     if(part==0) {
     893             :       part=1;
     894             :     }
     895           0 :     if(part<2) {
     896           0 :       if((maxtime>0 && elapsed>(long long int)(0.2*1e9*maxtime)) || (nf>0 && step+1>=nf/5) || (maxtime<0 && nf<0 && step+1>=100)) {
     897             :         part=2;
     898           0 :         log<<"Warm-up completed\n";
     899             :       }
     900             :     }
     901           0 :     if(part<3) {
     902           0 :       if((maxtime>0 && elapsed>(long long int)(0.6*1e9*maxtime)) || (nf>0 && step+1>=3*nf/5)) {
     903             :         part=3;
     904           0 :         log<<"60% completed\n";
     905             :       }
     906             :     }
     907             : 
     908           0 :     if(maxtime>0 && elapsed>(long long int)(1e9*maxtime)) {
     909             :       fast_finish=true;
     910             :     }
     911             : 
     912             :     {
     913           0 :       unsigned tmp=fast_finish;
     914           0 :       pc.Bcast(tmp,0);
     915           0 :       fast_finish=tmp;
     916             :     }
     917           0 :     if(fast_finish) {
     918             :       break;
     919             :     }
     920             :   }
     921             : 
     922             :   return 0;
     923           0 : }
     924             : 
     925             : } // namespace unnamed
     926             : } // namespace cltools
     927             : } // namespace PLMD

Generated by: LCOV version 1.16