LCOV - code coverage report
Current view: top level - cltools - Benchmark.cpp (source / functions) Hit Total Coverage
Test: plumed test coverage Lines: 20 385 5.2 %
Date: 2026-03-30 11:13:23 Functions: 6 28 21.4 %

          Line data    Source code
       1             : /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
       2             :    Copyright (c) 2019-2023 The plumed team
       3             :    (see the PEOPLE file at the root of the distribution for a list of names)
       4             : 
       5             :    See http://www.plumed.org for more information.
       6             : 
       7             :    This file is part of plumed, version 2.
       8             : 
       9             :    plumed is free software: you can redistribute it and/or modify
      10             :    it under the terms of the GNU Lesser General Public License as published by
      11             :    the Free Software Foundation, either version 3 of the License, or
      12             :    (at your option) any later version.
      13             : 
      14             :    plumed is distributed in the hope that it will be useful,
      15             :    but WITHOUT ANY WARRANTY; without even the implied warranty of
      16             :    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      17             :    GNU Lesser General Public License for more details.
      18             : 
      19             :    You should have received a copy of the GNU Lesser General Public License
      20             :    along with plumed.  If not, see <http://www.gnu.org/licenses/>.
      21             : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
      22             : #include "CLTool.h"
      23             : #include "core/CLToolRegister.h"
      24             : #include "tools/Communicator.h"
      25             : #include "tools/Tools.h"
      26             : #include "config/Config.h"
      27             : #include "tools/PlumedHandle.h"
      28             : #include "tools/Stopwatch.h"
      29             : #include "tools/Log.h"
      30             : #include "tools/DLLoader.h"
      31             : #include "tools/Random.h"
      32             : 
      33             : #include <cstdio>
      34             : #include <string>
      35             : #include <vector>
      36             : #include <fstream>
      37             : #include <atomic>
      38             : #include <csignal>
      39             : #include <cstdio>
      40             : #include <random>
      41             : #include <algorithm>
      42             : #include <chrono>
      43             : #include <string_view>
      44             : 
      45             : namespace PLMD {
      46             : namespace cltools {
      47             : 
      48             : //+PLUMEDOC TOOLS benchmark
      49             : /*
      50             : benchmark is a lightweight reimplementation of driver focused on running benchmarks
      51             : 
      52             : The main difference wrt driver is that it generates a trajectory in memory rather than reading it
      53             : from a file. This allows to better time the overhead of the plumed library, without including
      54             : the time needed to read the trajectory.
      55             : 
      56             : It is also possible to load a separate version of the plumed kernel. This enables running
      57             : benchmarks agaist previous plumed versions in a controlled setting, where systematic errors
      58             : in the comparison are minimized.
      59             : 
      60             : \par Examples
      61             : 
      62             : First, you should create a sample `plumed.dat` file for testing. For instance:
      63             : 
      64             : \plumedfile
      65             : WHOLEMOLECULES ENTITY0=1-10000
      66             : p: POSITION ATOM=1
      67             : RESTRAINT ARG=p.x KAPPA=1 AT=0
      68             : \endplumedfile
      69             : 
      70             : Then you can test the performance of this input with the following command:
      71             : \verbatim
      72             : plumed benchmark
      73             : \endverbatim
      74             : 
      75             : You can also test a different (older) version of PLUMED with the same input. To do so,
      76             : you should run
      77             : \verbatim
      78             : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so
      79             : \endverbatim
      80             : 
      81             : \warning It is necessary to use the `plumed-runtime` executable here to avoid conflicts between different
      82             : plumed versions. You will find it in your path if you are using the non installed version of plumed,
      83             : and in `$prefix/lib/plumed` if you installed plumed in $prefix,.
      84             : 
      85             : \par Comparing multiple versions
      86             : 
      87             : The best way to compare two versions of plumed on the same input is to pass multiple colon-separated kernels:
      88             : 
      89             : \verbatim
      90             : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so:/path2/to/lib/libplumedKernel.so:this
      91             : \endverbatim
      92             : 
      93             : Here `this` means the kernel of the version with which you are running the benchmark. This comparison runs the three
      94             : instances simultaneously (alternating them) so that systematic differences in the load of your machine will affect them
      95             : to the same extent.
      96             : 
      97             : In case the different versions require modified plumed.dat files, or if you simply want to compare
      98             : two different plumed input files that compute the same thing, you can also use multiple plumed input files:
      99             : 
     100             : \verbatim
     101             : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so:this --plumed plumed1.dat:plumed2.dat
     102             : \endverbatim
     103             : 
     104             : Similarly, you might want to run two different inputs using the same kernel, which can be obtained with:
     105             : 
     106             : \verbatim
     107             : plumed-runtime benchmark --plumed plumed1.dat:plumed2.dat
     108             : \endverbatim
     109             : 
     110             : \par Profiling
     111             : 
     112             : If you want to attach a profiler on the fly to the process, you might find it convenient to use `--nsteps -1`.
     113             : The simulation will run forever and can be interrupted with CTRL-C. When interrupted, the result of the timers
     114             : should be displayed anyway.
     115             : You can also run setting a maximum time with `--maxtime`.
     116             : 
     117             : If you run a profiler when testing multiple PLUMED versions you might be confused by which function is from
     118             : each version. It is recommended to recompile separate instances with a separate C++ namespace (`-DPLMD=PLUMED_version_1`)
     119             : so that you will be able to distinguish them. In addition, compiling with `CXXFLAGS="-g -O3"` will make the profiling
     120             : report more complete, likely including code lines.
     121             : 
     122             : \par MPI runs
     123             : 
     124             : You can run emulating a domain decomposition. This is done automatically if plumed has been compiled with MPI
     125             : and you run with `mpirun`
     126             : 
     127             : \verbatim
     128             : mpirun -np 4 plumed-runtime benchmark
     129             : \endverbatim
     130             : 
     131             : If you load separate PLUMED instances as discussed above, they should all be compiled against the same MPI version.
     132             : Notice that when using MPI signals (CTRL-C) might not work.
     133             : 
     134             : Since some of the data transfer could happen asynchronously, you might want to use the `--sleep` option
     135             : to simulate a lag between the `prepareCalc` and `performCalc` actions. This part of the calculation will not contribute
     136             : to timer, but will obviously slow down your test.
     137             : 
     138             : \par Output
     139             : 
     140             : In the output you will see the usual reports about timing produced by the internal
     141             : timers of the tested plumed instances.
     142             : In addition, this tool will monitor the timing externally, with some slightly different criterion:
     143             : - First, the initialization (construction of the input) will be shown with a separate timer,
     144             :   as well as the timing for the first step.
     145             : - Second, the timer corresponding to the calculation will be split in three parts, reporting
     146             :   execution of the first 20% (warm-up) and the next two blocks of 40% each.
     147             : - Finally, you might notice some discrepancy because some of the actions that are usually
     148             :   not expensive are not included in the internal timers. The external timer will
     149             :   thus provide a better estimate of the total elapsed time, including everything.
     150             : 
     151             : The internal timers are still useful to monitor what happens at the different stages
     152             : and, with \ref DEBUG `DETAILED_TIMERS`, what happens in each action.
     153             : 
     154             : When you run multiple version, a comparative analisys of the time spent within PLUMED in the various
     155             : instances will be done, showing the ratio between the total time and the time measured on the first
     156             : instance, which will act as a reference. Errors will be estimated with bootstrapping. The warm-up phase will be discarded for
     157             : this analysis.
     158             : 
     159             : */
     160             : //+ENDPLUMEDOC
     161             : 
     162             : // We use an anonymous namespace here to avoid clashes with variables
     163             : // declared in other parts of the code
     164             : namespace {
     165             : 
     166             : //this is a sugar for changing idea faster about the rng
     167             : using generator = std::mt19937;
     168             : 
     169             : std::atomic<bool> signalReceived(false);
     170             : 
     171             : class SignalHandlerGuard {
     172             : public:
     173           0 :   SignalHandlerGuard(int signal, void (*newHandler)(int)) : signal_(signal) {
     174             :     // Store the current handler before setting the new one
     175           0 :     prevHandler_ = std::signal(signal, newHandler);
     176           0 :     if (prevHandler_ == SIG_ERR) {
     177           0 :       throw std::runtime_error("Failed to set signal handler");
     178             :     }
     179           0 :   }
     180             : 
     181             :   ~SignalHandlerGuard() {
     182             :     // Restore the previous handler upon destruction
     183           0 :     std::signal(signal_, prevHandler_);
     184           0 :   }
     185             : 
     186             :   // Delete copy constructor and assignment operator to prevent copying
     187             :   SignalHandlerGuard(const SignalHandlerGuard&) = delete;
     188             :   SignalHandlerGuard& operator=(const SignalHandlerGuard&) = delete;
     189             : 
     190             : private:
     191             :   int signal_;
     192             :   void (*prevHandler_)(int);
     193             : };
     194             : 
     195           0 : extern "C" void signalHandler(int signal) {
     196           0 :   if (signal == SIGINT) {
     197             :     signalReceived.store(true);
     198           0 :     fprintf(stderr, "Signal interrupt received\n");
     199             :   }
     200           0 :   if (signal == SIGTERM) {
     201             :     signalReceived.store(true);
     202           0 :     fprintf(stderr, "Signal termination received\n");
     203             :   }
     204           0 : }
     205             : 
     206             : /// This base class contains members that are movable with default operations
     207             : struct KernelBase {
     208             :   std::string path;
     209             :   std::string plumed_dat;
     210             :   PlumedHandle handle;
     211             :   Stopwatch stopwatch;
     212             :   std::vector<long long int> timings;
     213             :   double comparative_timing=-1.0;
     214             :   double comparative_timing_error=-1.0;
     215           0 :   KernelBase(const std::string & path_,const std::string & plumed_dat_, Log* log_):
     216           0 :     path(path_),
     217           0 :     plumed_dat(plumed_dat_),
     218           0 :     handle([&]() {
     219           0 :     if(path_=="this") {
     220           0 :       return PlumedHandle();
     221             :     } else {
     222           0 :       return PlumedHandle::dlopen(path.c_str());
     223             :     }
     224             :   }()),
     225           0 :   stopwatch(*log_) {
     226           0 :   }
     227             : };
     228             : 
     229             : /// Local structure handling a kernel and the related timers.
     230             : /// This structure specifically contain the Log, which needs special treatment
     231             : /// in move semantics
     232             : struct Kernel :
     233             :   public KernelBase {
     234             :   Log* log=nullptr;
     235           0 :   Kernel(const std::string & path_,const std::string & the_plumed_dat, Log* log_):
     236             :     KernelBase(path_,the_plumed_dat,log_),
     237           0 :     log(log_) {
     238             :   }
     239             : 
     240           0 :   ~Kernel() {
     241           0 :     if(log) {
     242           0 :       (*log)<<"\n";
     243           0 :       (*log)        <<"Kernel:      "<<path<<"\n";
     244           0 :       (*log)        <<"Input:       "<<plumed_dat<<"\n";
     245           0 :       if(comparative_timing>0.0) {
     246           0 :         (*log).printf("Comparative: %.3f +- %.3f\n",comparative_timing,comparative_timing_error);
     247             :       }
     248             :     }
     249           0 :   }
     250             : 
     251           0 :   Kernel(Kernel && other) noexcept:
     252             :     KernelBase(std::move(other)),
     253           0 :     log(other.log) {
     254           0 :     other.log=nullptr; // ensure no log is done in the moved away object
     255             :   }
     256             : 
     257           0 :   Kernel & operator=(Kernel && other) noexcept {
     258           0 :     if(this != &other) {
     259           0 :       KernelBase::operator=(std::move(other));
     260           0 :       log=other.log;
     261           0 :       other.log=nullptr; // ensure no log is done in the moved away object
     262             :     }
     263           0 :     return *this;
     264             :   }
     265             : };
     266             : 
     267             : namespace  {
     268             : 
     269             : class UniformSphericalVector {
     270             :   //double rminCub;
     271             :   double rCub;
     272             : 
     273             : public:
     274             :   //assuming rmin=0
     275           0 :   UniformSphericalVector(const double rmax):
     276           0 :     rCub (rmax*rmax*rmax/*-rminCub*/) {}
     277           0 :   PLMD::Vector operator()(Random& rng) {
     278           0 :     double rho = std::cbrt (/*rminCub + */rng.RandU01()*rCub);
     279           0 :     double theta =std::acos (2.0*rng.RandU01() -1.0);
     280           0 :     double phi = 2.0 * PLMD::pi * rng.RandU01();
     281             :     return Vector (
     282           0 :              rho * sin (theta) * cos (phi),
     283           0 :              rho * sin (theta) * sin (phi),
     284           0 :              rho * cos (theta));
     285             :   }
     286             : };
     287             : 
     288             : ///Acts as a template for any distribution
     289             : struct AtomDistribution {
     290             :   virtual void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random&)=0;
     291           0 :   virtual void box(std::vector<double>& box, unsigned /*natoms*/, unsigned /*step*/, Random&) {
     292             :     std::fill(box.begin(), box.end(),0);
     293           0 :   };
     294             :   virtual ~AtomDistribution() noexcept {}
     295             : };
     296             : 
     297           0 : struct theLine:public AtomDistribution {
     298           0 :   void positions(std::vector<Vector>& posToUpdate, unsigned step, Random&rng) override {
     299             :     auto nat = posToUpdate.size();
     300             :     UniformSphericalVector usv(0.5);
     301             : 
     302           0 :     for (unsigned i=0; i<nat; ++i) {
     303           0 :       posToUpdate[i] = Vector(i, 0, 0) + usv(rng);
     304             :     }
     305           0 :   }
     306             : };
     307             : 
     308           0 : struct uniformSphere:public AtomDistribution {
     309           0 :   void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
     310             : 
     311             :     //giving more or less a cubic udm of volume for each atom: V=nat
     312           0 :     const double rmax= std::cbrt ((3.0/(4.0*PLMD::pi)) * posToUpdate.size());
     313             : 
     314             :     UniformSphericalVector usv(rmax);
     315             :     auto s=posToUpdate.begin();
     316             :     auto e=posToUpdate.end();
     317             :     //I am using the iterators:this is slightly faster,
     318             :     // enough to overcome the cost of the vtable that I added
     319           0 :     for (unsigned i=0; s!=e; ++s,++i) {
     320           0 :       *s = usv (rng);
     321             :     }
     322             : 
     323           0 :   }
     324           0 :   void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
     325           0 :     const double rmax= 2.0*std::cbrt((3.0/(4.0*PLMD::pi)) * natoms);
     326           0 :     box[0]=rmax;
     327           0 :     box[1]=0.0;
     328           0 :     box[2]=0.0;
     329           0 :     box[3]=0.0;
     330           0 :     box[4]=rmax;
     331           0 :     box[5]=0.0;
     332           0 :     box[6]=0.0;
     333           0 :     box[7]=0.0;
     334           0 :     box[8]=rmax;
     335             : 
     336           0 :   }
     337             : };
     338             : 
     339           0 : struct twoGlobs: public AtomDistribution {
     340           0 :   virtual void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random&rng) {
     341             :     //I am using two unigform spheres and 2V=n
     342           0 :     const double rmax= std::cbrt ((3.0/(8.0*PLMD::pi)) * posToUpdate.size());
     343             : 
     344             :     UniformSphericalVector usv(rmax);
     345             :     std::array<Vector,2> centers{
     346             :       PLMD::Vector{0.0,0.0,0.0},
     347             : //so they do not overlap
     348             :       PLMD::Vector{2.0*rmax,2.0*rmax,2.0*rmax}
     349           0 :     };
     350           0 :     std::generate(posToUpdate.begin(),posToUpdate.end(),[&]() {
     351             :       //RandInt is only declared
     352             :       // return usv (rng) + centers[rng.RandInt(1)];
     353           0 :       return usv (rng) + centers[rng.RandU01()>0.5];
     354             :     });
     355           0 :   }
     356             : 
     357           0 :   virtual void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) {
     358             : 
     359           0 :     const double rmax= 4.0 * std::cbrt ((3.0/(8.0*PLMD::pi)) * natoms);
     360           0 :     box[0]=rmax;
     361           0 :     box[1]=0.0;
     362           0 :     box[2]=0.0;
     363           0 :     box[3]=0.0;
     364           0 :     box[4]=rmax;
     365           0 :     box[5]=0.0;
     366           0 :     box[6]=0.0;
     367           0 :     box[7]=0.0;
     368           0 :     box[8]=rmax;
     369           0 :   };
     370             : };
     371             : 
     372           0 : struct uniformCube:public AtomDistribution {
     373           0 :   void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
     374             :     //giving more or less a cubic udm of volume for each atom: V = nat
     375           0 :     const double rmax = std::cbrt(static_cast<double>(posToUpdate.size()));
     376             : 
     377             : 
     378             : 
     379             :     // std::generate(posToUpdate.begin(),posToUpdate.end(),[&]() {
     380             :     //   return Vector (rndR(rng),rndR(rng),rndR(rng));
     381             :     // });
     382             :     auto s=posToUpdate.begin();
     383             :     auto e=posToUpdate.end();
     384             :     //I am using the iterators:this is slightly faster,
     385             :     // enough to overcome the cost of the vtable that I added
     386           0 :     for (unsigned i=0; s!=e; ++s,++i) {
     387           0 :       *s = Vector (rng.RandU01()*rmax,rng.RandU01()*rmax,rng.RandU01()*rmax);
     388             :     }
     389           0 :   }
     390           0 :   void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
     391             :     //+0.05 to avoid overlap
     392           0 :     const double rmax= std::cbrt(natoms)+0.05;
     393           0 :     box[0]=rmax;
     394           0 :     box[1]=0.0;
     395           0 :     box[2]=0.0;
     396           0 :     box[3]=0.0;
     397           0 :     box[4]=rmax;
     398           0 :     box[5]=0.0;
     399           0 :     box[6]=0.0;
     400           0 :     box[7]=0.0;
     401           0 :     box[8]=rmax;
     402             : 
     403           0 :   }
     404             : };
     405             : 
     406           0 : struct tiledSimpleCubic:public AtomDistribution {
     407           0 :   void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
     408             :     //Tiling the space in this way will not tests 100% the pbc, but
     409             :     //I do not think that write a spacefilling curve, like Hilbert, Peano or Morton
     410             :     //could be a good idea, in this case
     411             : 
     412             :     // the lambda is here because in some case a perfect cube is not well represented
     413             :     // by double and will get uncorrectly ceiled up (for example 24389 will result in a 30 and not in 29)
     414           0 :     const unsigned rmax = [&] {
     415           0 :       const size_t x = std::ceil(std::cbrt(static_cast<double>(posToUpdate.size())))-1;
     416           0 :       if ( x*x*x >= posToUpdate.size()) {
     417             :         return x;
     418             :       }
     419           0 :       return x+1;
     420           0 :     }();
     421             : 
     422             :     auto s=posToUpdate.begin();
     423             :     auto e=posToUpdate.end();
     424             :     //I am using the iterators:this is slightly faster,
     425             :     // enough to overcome the cost of the vtable that I added
     426           0 :     for (unsigned k=0; k<rmax&&s!=e; ++k) {
     427           0 :       for (unsigned j=0; j<rmax&&s!=e; ++j) {
     428           0 :         for (unsigned i=0; i<rmax&&s!=e; ++i) {
     429           0 :           *s = Vector (i,j,k);
     430             :           ++s;
     431             :         }
     432             :       }
     433             :     }
     434           0 :   }
     435           0 :   void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
     436           0 :     const double rmax= std::ceil(std::cbrt(static_cast<double>(natoms)));;
     437           0 :     box[0]=rmax;
     438           0 :     box[1]=0.0;
     439           0 :     box[2]=0.0;
     440           0 :     box[3]=0.0;
     441           0 :     box[4]=rmax;
     442           0 :     box[5]=0.0;
     443           0 :     box[6]=0.0;
     444           0 :     box[7]=0.0;
     445           0 :     box[8]=rmax;
     446             : 
     447           0 :   }
     448             : };
     449           0 : std::unique_ptr<AtomDistribution> getAtomDistribution(std::string_view atomicDistr) {
     450           0 :   std::unique_ptr<AtomDistribution> distribution;
     451           0 :   if(atomicDistr == "line") {
     452           0 :     distribution = std::make_unique<theLine>();
     453           0 :   } else if (atomicDistr == "cube") {
     454           0 :     distribution = std::make_unique<uniformCube>();
     455           0 :   } else if (atomicDistr == "sphere") {
     456           0 :     distribution = std::make_unique<uniformSphere>();
     457           0 :   } else if (atomicDistr == "globs") {
     458           0 :     distribution = std::make_unique<twoGlobs>();
     459           0 :   } else if (atomicDistr == "sc") {
     460           0 :     distribution = std::make_unique<tiledSimpleCubic>();
     461             :   } else {
     462           0 :     plumed_error() << R"(The atomic distribution can be only "line", "cube", "sphere", "globs" and "sc", the input was ")"
     463           0 :                    << atomicDistr <<'"';
     464             :   }
     465           0 :   return distribution;
     466           0 : }
     467             : } //anonymus namespace for benchmark distributions
     468             : class Benchmark:
     469             :   public CLTool {
     470             : public:
     471             :   static void registerKeywords( Keywords& keys );
     472             :   explicit Benchmark(const CLToolOptions& co );
     473             :   int main(FILE* in, FILE*out,Communicator& pc) override;
     474           4 :   std::string description()const override {
     475           4 :     return "run a calculation with a fixed trajectory to find bottlenecks in PLUMED";
     476             :   }
     477             : };
     478             : 
     479       17653 : PLUMED_REGISTER_CLTOOL(Benchmark,"benchmark")
     480             : 
     481        5883 : void Benchmark::registerKeywords( Keywords& keys ) {
     482        5883 :   CLTool::registerKeywords( keys );
     483       11766 :   keys.add("compulsory","--plumed","plumed.dat","colon separated path(s) to the input file(s)");
     484       11766 :   keys.add("compulsory","--kernel","this","colon separated path(s) to kernel(s)");
     485       11766 :   keys.add("compulsory","--natoms","100000","the number of atoms to use for the simulation");
     486       11766 :   keys.add("compulsory","--nsteps","2000","number of steps of MD to perform (-1 means forever)");
     487       11766 :   keys.add("compulsory","--maxtime","-1","maximum number of seconds (-1 means forever)");
     488       11766 :   keys.add("compulsory","--sleep","0","number of seconds of sleep, mimicking MD calculation");
     489       11766 :   keys.add("compulsory","--atom-distribution","line","the kind of possible atomic displacement at each step");
     490       11766 :   keys.add("optional","--dump-trajectory","dump the trajectory to this file");
     491       11766 :   keys.addFlag("--domain-decomposition",false,"simulate domain decomposition, implies --shuffle");
     492       11766 :   keys.addFlag("--shuffled",false,"reshuffle atoms");
     493        5883 : }
     494             : 
     495           4 : Benchmark::Benchmark(const CLToolOptions& co ):
     496           4 :   CLTool(co) {
     497           4 :   inputdata=commandline;
     498           4 : }
     499             : 
     500             : 
     501           0 : int Benchmark::main(FILE* in, FILE*out,Communicator& pc) {
     502             :   // deterministic initializations to avoid issues with MPI
     503             :   generator rng;
     504           0 :   PLMD::Random atomicGenerator;
     505           0 :   std::unique_ptr<AtomDistribution> distribution;
     506             : 
     507             :   struct FileDeleter {
     508             :     void operator()(FILE*f) const noexcept {
     509             :       if(f) {
     510           0 :         std::fclose(f);
     511             :       }
     512           0 :     }
     513             :   };
     514             : 
     515           0 :   std::unique_ptr<FILE,FileDeleter> log_dev_null{std::fopen("/dev/null","w")};
     516             : 
     517           0 :   Log log;
     518           0 :   if(pc.Get_rank()==0) {
     519           0 :     log.link(out);
     520             :   } else {
     521           0 :     log.link(log_dev_null.get());
     522             :   }
     523           0 :   log.setLinePrefix("BENCH:  ");
     524           0 :   log <<"Welcome to PLUMED benchmark\n";
     525             :   std::vector<Kernel> kernels;
     526             : 
     527             :   // perform comparative analysis
     528             :   // ensure that kernels vector is destroyed from last to first element upon exit
     529           0 :   auto kernels_deleter=[&log](auto f) {
     530           0 :     if(!f) {
     531           0 :       return;
     532             :     }
     533           0 :     if(f->empty()) {
     534             :       return;
     535             :     }
     536             :     generator bootstrapRng;
     537             : 
     538             :     const auto size=f->back().timings.size();
     539             :     //B are the bootstrap iterations
     540             :     constexpr int B=200;
     541           0 :     const size_t numblocks=size;
     542             :     // for some reasons, blocked bootstrap underestimates error
     543             :     // For now I keep it off. If I remove it, the code below could be simplified
     544             :     // if(numblocks>20) numblocks=20;
     545           0 :     const auto blocksize=size/numblocks;
     546             : 
     547           0 :     if(f->size()<2) {
     548           0 :       log<<"Single run, skipping comparative analysis\n";
     549           0 :     } else if(size<10) {
     550           0 :       log<<"Too small sample, skipping comparative analysis\n";
     551             :     } else
     552             :       try {
     553             : 
     554           0 :         log<<"Running comparative analysis, "<<numblocks<<" blocks with size "<<blocksize<<"\n";
     555             : 
     556           0 :         std::vector<std::size_t> choice(size);
     557           0 :         std::uniform_int_distribution<> distrib(0, numblocks-1);
     558           0 :         std::vector<std::vector<long long int>> blocks(f->size());
     559             : 
     560             :         {
     561             :           int i=0;
     562           0 :           for(auto it = f->rbegin(); it != f->rend(); ++it,++i) {
     563             :             size_t l=0;
     564           0 :             blocks[i].assign(numblocks,0);
     565           0 :             for(auto j=0ULL; j<numblocks; j++) {
     566           0 :               for(auto k=0ULL; k<blocksize; k++) {
     567           0 :                 plumed_assert(l<it->timings.size());
     568           0 :                 blocks[i][j]+=it->timings[l];
     569           0 :                 l++;
     570             :               }
     571             :             }
     572             :           }
     573             :         }
     574             : 
     575           0 :         std::vector<std::vector<double>> ratios(f->size());
     576           0 :         for(auto & r : ratios) {
     577             :           //B are the bootstrap iterations
     578           0 :           r.resize(B);
     579             :         }
     580             : 
     581             :         //B are the bootstrap iterations
     582           0 :         for(unsigned b=0; b<B; b++) {
     583           0 :           for(auto & c : choice) {
     584           0 :             c=distrib(bootstrapRng);
     585             :           }
     586             :           long long int reference=0;
     587           0 :           for(auto & c : choice) {
     588           0 :             reference+=blocks[0][c];
     589             :           }
     590           0 :           for(auto i=0ULL; i<blocks.size(); i++) {
     591             :             long long int estimate=0;
     592             :             // this would lead to separate bootstrap samples for each estimate:
     593             :             // for(auto & c : choice){c=distrib(bootstrapRng);}
     594           0 :             for(auto & c : choice) {
     595           0 :               estimate+=blocks[i][c];
     596             :             }
     597           0 :             ratios[i][b]=double(estimate)/double(reference);
     598             :           }
     599             :         }
     600             : 
     601             :         {
     602             :           int i=0;
     603           0 :           for(auto it = f->rbegin(); it != f->rend(); ++it,++i) {
     604             :             double sum=0.0;
     605             :             double sum2=0.0;
     606           0 :             for(auto r : ratios[i]) {
     607           0 :               sum+=r;
     608           0 :               sum2+=r*r;
     609             :             }
     610             :             //B are the bootstrap iterations
     611           0 :             it->comparative_timing=sum/B;
     612           0 :             it->comparative_timing_error=std::sqrt(sum2/B-sum*sum/(B*B));
     613             :           }
     614             :         }
     615             : 
     616           0 :       } catch(std::exception & e) {
     617           0 :         log<<"Unexpected error during comparative analysis\n";
     618           0 :         log<<e.what()<<"\n";
     619             :       }
     620           0 :     while(!f->empty()) {
     621             :       f->pop_back();
     622             :     }
     623             : 
     624             :   };
     625             :   std::unique_ptr<decltype(kernels),decltype(kernels_deleter)> kernels_deleter_obj(&kernels,kernels_deleter);
     626             : 
     627             : 
     628             :   // construct the kernels vector:
     629             :   {
     630             :     std::vector<std::string> allpaths;
     631             : 
     632             :     {
     633             :       std::string paths;
     634           0 :       parse("--kernel",paths);
     635           0 :       log <<"Using --kernel=" << paths << "\n";
     636           0 :       allpaths=Tools::getWords(paths,":");
     637             :     }
     638             : 
     639             :     std::vector<std::string> allplumed;
     640             :     {
     641             :       std::string paths;
     642           0 :       parse("--plumed",paths);
     643           0 :       log <<"Using --plumed=" << paths << "\n";
     644           0 :       allplumed=Tools::getWords(paths,":");
     645             :     }
     646             : 
     647           0 :     plumed_assert(allplumed.size()>0 && allpaths.size()>0);
     648             : 
     649             :     // this check only works on MacOS
     650             : #if defined(__APPLE__)
     651             :     // if any of the paths if different from "this", we check if libplumed was loaded locally to avoid conflicts.
     652             :     if(std::any_of(allpaths.begin(),allpaths.end(),[](auto value) {
     653             :     return value != "this";
     654             :   })) {
     655             :       if(DLLoader::isPlumedGlobal()) {
     656             :         plumed_error()<<"It looks like libplumed is loaded in the global namespace, you cannot load a different version of the kernel\n"
     657             :                       <<"Please make sure you use the plumed-runtime executable and that the env var PLUMED_LOAD_NAMESPACE is not set to GLOBAL";
     658             :       }
     659             :     }
     660             : #endif
     661             : 
     662           0 :     if(allplumed.size()>1 && allpaths.size()>1 && allplumed.size() != allpaths.size()) {
     663           0 :       plumed_error() << "--kernel and --plumed should have either one element or the same number of elements";
     664             :     }
     665             : 
     666           0 :     if(allplumed.size()>1 && allpaths.size()==1)
     667           0 :       for(unsigned i=1; i<allplumed.size(); i++) {
     668           0 :         allpaths.push_back(allpaths[0]);
     669             :       }
     670           0 :     if(allplumed.size()==1 && allpaths.size()>1)
     671           0 :       for(unsigned i=1; i<allpaths.size(); i++) {
     672           0 :         allplumed.push_back(allplumed[0]);
     673             :       }
     674             : 
     675           0 :     for(unsigned i=0; i<allpaths.size(); i++) {
     676           0 :       kernels.emplace_back(allpaths[i],allplumed[i],&log);
     677             :     }
     678           0 :   }
     679             : 
     680             :   // reverse order so that log happens in the forward order:
     681           0 :   std::reverse(kernels.begin(),kernels.end());
     682             : 
     683             :   // read other flags:
     684           0 :   bool shuffled=false;
     685           0 :   parseFlag("--shuffled",shuffled);
     686             : 
     687             :   int nf;
     688           0 :   parse("--nsteps",nf);
     689           0 :   log << "Using --nsteps=" << nf << "\n";
     690             :   unsigned natoms;
     691           0 :   parse("--natoms",natoms);
     692           0 :   log << "Using --natoms=" << natoms << "\n";
     693             :   double maxtime;
     694           0 :   parse("--maxtime",maxtime);
     695           0 :   log << "Using --maxtime=" << maxtime << "\n";
     696             : 
     697           0 :   bool domain_decomposition=false;
     698           0 :   parseFlag("--domain-decomposition",domain_decomposition);
     699             : 
     700           0 :   if(pc.Get_size()>1) {
     701           0 :     domain_decomposition=true;
     702             :   }
     703           0 :   if(domain_decomposition) {
     704           0 :     shuffled=true;
     705             :   }
     706             : 
     707           0 :   if (shuffled) {
     708           0 :     log << "Using --shuffled\n";
     709             :   }
     710           0 :   if (domain_decomposition) {
     711           0 :     log << "Using --domain-decomposition\n";
     712             :   }
     713             : 
     714             :   double timeToSleep;
     715           0 :   parse("--sleep",timeToSleep);
     716           0 :   log << "Using --sleep=" << timeToSleep << "\n";
     717             : 
     718             :   std::vector<int> shuffled_indexes;
     719             : 
     720             :   {
     721             :     std::string atomicDistr;
     722           0 :     parse("--atom-distribution",atomicDistr);
     723           0 :     distribution = getAtomDistribution(atomicDistr);
     724           0 :     log << "Using --atom-distribution=" << atomicDistr << "\n";
     725             :   }
     726             : 
     727             :   {
     728             :     std::string fileToDump;
     729           0 :     if(parse("--dump-trajectory",fileToDump)) {
     730           0 :       log << "Saving the trajectory to \"" << fileToDump << "\" and exiting\n";
     731           0 :       std::vector<double> cell(9);
     732           0 :       std::vector<Vector> pos(natoms);
     733           0 :       std::ofstream ofile(fileToDump);
     734           0 :       if (nf<0) {
     735             :         //if the user accidentally sets infinite steps, we set it to print only one
     736           0 :         nf=1;
     737             :       }
     738           0 :       for(int step=0; step<nf; ++step) {
     739           0 :         auto sw=kernels[0].stopwatch.startStop("TrajectoryGeneration");
     740           0 :         distribution->positions(pos,step,atomicGenerator);
     741           0 :         distribution->box(cell,natoms,step,atomicGenerator);
     742           0 :         ofile << natoms << "\n"
     743           0 :               << cell[0] << " " << cell[1] << " " << cell[2] << " "
     744           0 :               << cell[3] << " " << cell[4] << " " << cell[5] << " "
     745           0 :               << cell[6] << " " << cell[7] << " " << cell[8] << "\n";
     746           0 :         for(int i=0; i<natoms; ++i) {
     747           0 :           ofile << "X\t" << pos[i]<< "\n";
     748             :         }
     749           0 :       }
     750           0 :       ofile.close();
     751             :       return 0;
     752           0 :     }
     753             :   }
     754             : 
     755           0 :   log <<"Initializing the setup of the kernel(s)\n";
     756           0 :   const auto initial_time=std::chrono::high_resolution_clock::now();
     757             : 
     758           0 :   for(auto & k : kernels) {
     759             :     auto & p(k.handle);
     760           0 :     auto sw=k.stopwatch.startStop("A Initialization");
     761           0 :     if(Communicator::plumedHasMPI() && domain_decomposition) {
     762           0 :       p.cmd("setMPIComm",&pc.Get_comm());
     763             :     }
     764           0 :     p.cmd("setRealPrecision",(int)sizeof(double));
     765           0 :     p.cmd("setMDLengthUnits",1.0);
     766           0 :     p.cmd("setMDChargeUnits",1.0);
     767           0 :     p.cmd("setMDMassUnits",1.0);
     768           0 :     p.cmd("setMDEngine","benchmarks");
     769           0 :     p.cmd("setTimestep",1.0);
     770           0 :     p.cmd("setPlumedDat",k.plumed_dat.c_str());
     771           0 :     p.cmd("setLog",out);
     772           0 :     p.cmd("setNatoms",natoms);
     773           0 :     p.cmd("init");
     774           0 :   }
     775             : 
     776           0 :   std::vector<double> cell( 9 ), virial( 9 );
     777           0 :   std::vector<Vector> pos( natoms ), forces( natoms );
     778           0 :   std::vector<double> masses( natoms, 1 ), charges( natoms, 0 );
     779             : 
     780           0 :   if(shuffled) {
     781           0 :     shuffled_indexes.resize(natoms);
     782           0 :     for(unsigned i=0; i<natoms; i++) {
     783           0 :       shuffled_indexes[i]=i;
     784             :     }
     785           0 :     std::shuffle(shuffled_indexes.begin(),shuffled_indexes.end(),rng);
     786             :   }
     787             : 
     788             :   // non owning pointers, used for shuffling the execution order
     789             :   std::vector<Kernel*> kernels_ptr;
     790           0 :   for(unsigned i=0; i<kernels.size(); i++) {
     791           0 :     kernels_ptr.push_back(&kernels[i]);
     792             :   }
     793             : 
     794           0 :   int plumedStopCondition=0;
     795             :   bool fast_finish=false;
     796             :   int part=0;
     797             : 
     798           0 :   log<<"Starting MD loop\n";
     799           0 :   log<<"Use CTRL+C to stop at any time and collect timers (not working in MPI runs)\n";
     800             :   // trap signals:
     801           0 :   SignalHandlerGuard sigIntGuard(SIGINT, signalHandler);
     802           0 :   SignalHandlerGuard sigTermGuard(SIGTERM, signalHandler);
     803             : 
     804           0 :   for(int step=0; nf<0 || step<nf; ++step) {
     805           0 :     std::shuffle(kernels_ptr.begin(),kernels_ptr.end(),rng);
     806           0 :     distribution->positions(pos,step,atomicGenerator);
     807           0 :     distribution->box(cell,natoms,step,atomicGenerator);
     808             :     double* pos_ptr;
     809             :     double* for_ptr;
     810             :     double* charges_ptr;
     811             :     double* masses_ptr;
     812             :     int* indexes_ptr=nullptr;
     813             :     int n_local_atoms;
     814             : 
     815           0 :     if(domain_decomposition) {
     816           0 :       const auto nproc=pc.Get_size();
     817           0 :       const auto nn=natoms/nproc;
     818             :       //using int to remove warning, MPI don't work with unsigned
     819           0 :       int excess=natoms%nproc;
     820           0 :       const auto myrank=pc.Get_rank();
     821             :       auto shift=0;
     822           0 :       n_local_atoms=nn;
     823           0 :       if(myrank<excess) {
     824           0 :         n_local_atoms+=1;
     825             :       }
     826           0 :       for(int i=0; i<myrank; i++) {
     827           0 :         shift+=nn;
     828           0 :         if(i<excess) {
     829           0 :           shift+=1;
     830             :         }
     831             :       }
     832           0 :       pos_ptr=&pos[shift][0];
     833           0 :       for_ptr=&forces[shift][0];
     834             :       charges_ptr=&charges[shift];
     835             :       masses_ptr=&masses[shift];
     836           0 :       indexes_ptr=shuffled_indexes.data()+shift;
     837             :     } else {
     838           0 :       pos_ptr=&pos[0][0];
     839           0 :       for_ptr=&forces[0][0];
     840             :       charges_ptr=&charges[0];
     841             :       masses_ptr=&masses[0];
     842           0 :       n_local_atoms=natoms;
     843             :       indexes_ptr=shuffled_indexes.data();
     844             :     }
     845             : 
     846             :     const char* sw_name;
     847           0 :     if(part==0) {
     848             :       sw_name="B0 First step";
     849           0 :     } else if(part==1) {
     850             :       sw_name="B1 Warm-up";
     851           0 :     } else if(part==2) {
     852             :       sw_name="B2 Calculation part 1";
     853             :     } else {
     854             :       sw_name="B3 Calculation part 2";
     855             :     }
     856             : 
     857             : 
     858           0 :     for(unsigned i=0; i<kernels_ptr.size(); i++) {
     859           0 :       auto & p(kernels_ptr[i]->handle);
     860             : 
     861             :       {
     862           0 :         auto sw=kernels_ptr[i]->stopwatch.startPause(sw_name);
     863           0 :         p.cmd("setStep",step);
     864           0 :         p.cmd("setStopFlag",&plumedStopCondition);
     865           0 :         p.cmd("setForces",for_ptr, {n_local_atoms,3});
     866           0 :         p.cmd("setBox",&cell[0], {3,3});
     867           0 :         p.cmd("setVirial",&virial[0], {3,3});
     868           0 :         p.cmd("setPositions",pos_ptr, {n_local_atoms,3});
     869           0 :         p.cmd("setMasses",masses_ptr, {n_local_atoms});
     870           0 :         p.cmd("setCharges",charges_ptr, {n_local_atoms});
     871           0 :         if(shuffled) {
     872           0 :           p.cmd("setAtomsNlocal",n_local_atoms);
     873           0 :           p.cmd("setAtomsGatindex",indexes_ptr, {n_local_atoms});
     874             :         }
     875           0 :         p.cmd("prepareCalc");
     876           0 :       }
     877             : 
     878             :       // mimick MD calculation here
     879             :       {
     880             :         unsigned k=0;
     881           0 :         auto start=std::chrono::high_resolution_clock::now();
     882           0 :         while(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now()-start).count()<(long long int)1e9*timeToSleep) {
     883           0 :           k+=i*i;
     884             :         }
     885             :         std::fprintf(log_dev_null.get(),"%u",k);
     886             :       }
     887             : 
     888             :       {
     889           0 :         auto sw=kernels_ptr[i]->stopwatch.startStop(sw_name);
     890           0 :         p.cmd("performCalc");
     891           0 :       }
     892             : 
     893           0 :       if(kernels_ptr.size()>1 && part>1) {
     894           0 :         kernels_ptr[i]->timings.push_back(kernels_ptr[i]->stopwatch.getLastCycle(sw_name));
     895             :       }
     896           0 :       if(plumedStopCondition || signalReceived.load()) {
     897             :         fast_finish=true;
     898             :       }
     899             :     }
     900           0 :     auto elapsed=std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now()-initial_time).count();
     901           0 :     if(part==0) {
     902             :       part=1;
     903             :     }
     904           0 :     if(part<2) {
     905           0 :       if((maxtime>0 && elapsed>(long long int)(0.2*1e9*maxtime)) || (nf>0 && step+1>=nf/5) || (maxtime<0 && nf<0 && step+1>=100)) {
     906             :         part=2;
     907           0 :         log<<"Warm-up completed\n";
     908             :       }
     909             :     }
     910           0 :     if(part<3) {
     911           0 :       if((maxtime>0 && elapsed>(long long int)(0.6*1e9*maxtime)) || (nf>0 && step+1>=3*nf/5)) {
     912             :         part=3;
     913           0 :         log<<"60% completed\n";
     914             :       }
     915             :     }
     916             : 
     917           0 :     if(maxtime>0 && elapsed>(long long int)(1e9*maxtime)) {
     918             :       fast_finish=true;
     919             :     }
     920             : 
     921             :     {
     922           0 :       unsigned tmp=fast_finish;
     923           0 :       pc.Bcast(tmp,0);
     924           0 :       fast_finish=tmp;
     925             :     }
     926           0 :     if(fast_finish) {
     927             :       break;
     928             :     }
     929             :   }
     930             : 
     931             :   return 0;
     932           0 : }
     933             : 
     934             : } // namespace unnamed
     935             : } // namespace cltools
     936             : } // namespace PLMD

Generated by: LCOV version 1.16