Line data Source code
1 : /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 : Copyright (c) 2017-2019 The plumed team
3 : (see the PEOPLE file at the root of the distribution for a list of names)
4 :
5 : See http://www.plumed.org for more information.
6 :
7 : This file is part of plumed, version 2.
8 :
9 : plumed is free software: you can redistribute it and/or modify
10 : it under the terms of the GNU Lesser General Public License as published by
11 : the Free Software Foundation, either version 3 of the License, or
12 : (at your option) any later version.
13 :
14 : plumed is distributed in the hope that it will be useful,
15 : but WITHOUT ANY WARRANTY; without even the implied warranty of
16 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 : GNU Lesser General Public License for more details.
18 :
19 : You should have received a copy of the GNU Lesser General Public License
20 : along with plumed. If not, see <http://www.gnu.org/licenses/>.
21 : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
22 : #include "colvar/Colvar.h"
23 : #include "colvar/ActionRegister.h"
24 : #include "core/PlumedMain.h"
25 : #include "tools/Matrix.h"
26 : #include "core/SetupMolInfo.h"
27 : #include "core/ActionSet.h"
28 : #include "tools/File.h"
29 :
30 : #include <string>
31 : #include <cmath>
32 : #include <map>
33 : #include <numeric>
34 : #include <ctime>
35 : #include "tools/Random.h"
36 :
37 : using namespace std;
38 :
39 : namespace PLMD {
40 : namespace isdb {
41 :
42 : //+PLUMEDOC ISDB_COLVAR EMMI
43 : /*
44 : Calculate the fit of a structure or ensemble of structures with a cryo-EM density map.
45 :
46 : This action implements the multi-scale Bayesian approach to cryo-EM data fitting introduced in Ref. \cite Hanot113951 .
47 : This method allows efficient and accurate structural modeling of cryo-electron microscopy density maps at multiple scales, from coarse-grained to atomistic resolution, by addressing the presence of random and systematic errors in the data, sample heterogeneity, data correlation, and noise correlation.
48 :
49 : The experimental density map is fit by a Gaussian Mixture Model (GMM), which is provided as an external file specified by the keyword
50 : GMM_FILE. We are currently working on a web server to perform
51 : this operation. In the meantime, the user can request a stand-alone version of the GMM code at massimiliano.bonomi_AT_gmail.com.
52 :
53 : When run in single-replica mode, this action allows atomistic, flexible refinement of an individual structure into a density map.
54 : Combined with a multi-replica framework (such as the -multi option in GROMACS), the user can model an ensemble of structures using
55 : the Metainference approach \cite Bonomi:2016ip .
56 :
57 : \warning
58 : To use \ref EMMI, the user should always add a \ref MOLINFO line and specify a pdb file of the system.
59 :
60 : \note
61 : To enhance sampling in single-structure refinement, one can use a Replica Exchange Method, such as Parallel Tempering.
62 : In this case, the user should add the NO_AVER flag to the input line.
63 :
64 : \note
65 : \ref EMMI can be used in combination with periodic and non-periodic systems. In the latter case, one should
66 : add the NOPBC flag to the input line
67 :
68 : \par Examples
69 :
70 : In this example, we perform a single-structure refinement based on an experimental cryo-EM map. The map is fit with a GMM, whose
71 : parameters are listed in the file GMM_fit.dat. This file contains one line per GMM component in the following format:
72 :
73 : \plumedfile
74 : #! FIELDS Id Weight Mean_0 Mean_1 Mean_2 Cov_00 Cov_01 Cov_02 Cov_11 Cov_12 Cov_22 Beta
75 : 0 2.9993805e+01 6.54628 10.37820 -0.92988 2.078920e-02 1.216254e-03 5.990827e-04 2.556246e-02 8.411835e-03 2.486254e-02 1
76 : 1 2.3468312e+01 6.56095 10.34790 -0.87808 1.879859e-02 6.636049e-03 3.682865e-04 3.194490e-02 1.750524e-03 3.017100e-02 1
77 : ...
78 : \endplumedfile
79 :
80 : To accelerate the computation of the Bayesian score, one can:
81 : - use neighbor lists, specified by the keywords NL_CUTOFF and NL_STRIDE;
82 : - calculate the restraint every other step (or more).
83 :
84 : All the heavy atoms of the system are used to calculate the density map. This list can conveniently be provided
85 : using a GROMACS index file.
86 :
87 : The input file looks as follows:
88 :
89 : \plumedfile
90 : # include pdb info
91 : MOLINFO STRUCTURE=prot.pdb
92 :
93 : # all heavy atoms
94 : protein-h: GROUP NDX_FILE=index.ndx NDX_GROUP=Protein-H
95 :
96 : # create EMMI score
97 : gmm: EMMI NOPBC SIGMA_MIN=0.01 TEMP=300.0 NL_STRIDE=100 NL_CUTOFF=0.01 GMM_FILE=GMM_fit.dat ATOMS=protein-h
98 :
99 : # translate into bias - apply every 2 steps
100 : emr: BIASVALUE ARG=gmm.scoreb STRIDE=2
101 :
102 : PRINT ARG=emr.* FILE=COLVAR STRIDE=500 FMT=%20.10f
103 : \endplumedfile
104 :
105 :
106 : */
107 : //+ENDPLUMEDOC
108 :
109 90 : class EMMI : public Colvar {
110 :
111 : private:
112 :
113 : // temperature in kbt
114 : double kbt_;
115 : // model GMM - atom types
116 : vector<unsigned> GMM_m_type_;
117 : // model GMM - list of atom sigmas - one per atom type
118 : vector<double> GMM_m_s_;
119 : // model GMM - list of atom weights - one per atom type
120 : vector<double> GMM_m_w_;
121 : // data GMM - means, weights, and covariances + beta option
122 : vector<Vector> GMM_d_m_;
123 : vector<double> GMM_d_w_;
124 : vector< VectorGeneric<6> > GMM_d_cov_;
125 : vector<int> GMM_d_beta_;
126 : vector < vector<int> > GMM_d_grps_;
127 : // overlaps
128 : vector<double> ovmd_;
129 : vector<double> ovdd_;
130 : vector<double> ovmd_ave_;
131 : // and derivatives
132 : vector<Vector> ovmd_der_;
133 : vector<Vector> atom_der_;
134 : vector<double> GMMid_der_;
135 : // constants
136 : double cfact_;
137 : double inv_sqrt2_, sqrt2_pi_;
138 : // metainference
139 : unsigned nrep_;
140 : unsigned replica_;
141 : vector<double> sigma_;
142 : vector<double> sigma_min_;
143 : vector<double> sigma_max_;
144 : vector<double> dsigma_;
145 : // list of prefactors for overlap between two Gaussians
146 : // pre_fact = 1.0 / (2pi)**1.5 / sqrt(det_md) * Wm * Wd
147 : vector<double> pre_fact_;
148 : // inverse of the sum of model and data covariances matrices
149 : vector< VectorGeneric<6> > inv_cov_md_;
150 : // neighbor list
151 : double nl_cutoff_;
152 : unsigned nl_stride_;
153 : bool first_time_;
154 : bool no_aver_;
155 : vector<unsigned> nl_;
156 : // parallel stuff
157 : unsigned size_;
158 : unsigned rank_;
159 : // pbc
160 : bool pbc_;
161 : // Monte Carlo stuff
162 : int MCstride_;
163 : double MCaccept_;
164 : double MCtrials_;
165 : Random random_;
166 : // status stuff
167 : unsigned int statusstride_;
168 : string statusfilename_;
169 : OFile statusfile_;
170 : bool first_status_;
171 : // regression
172 : unsigned nregres_;
173 : double scale_;
174 : double scale_min_;
175 : double scale_max_;
176 : double dscale_;
177 : // tabulated exponential
178 : double dpcutoff_;
179 : double dexp_;
180 : unsigned nexp_;
181 : vector<double> tab_exp_;
182 : // simulated annealing
183 : unsigned nanneal_;
184 : double kanneal_;
185 : double anneal_;
186 : // prior exponent
187 : double prior_;
188 : // noise type
189 : unsigned noise_;
190 : // total score and virial;
191 : double ene_;
192 : Tensor virial_;
193 : // model overlap file
194 : unsigned int ovstride_;
195 : string ovfilename_;
196 :
197 : // write file with model overlap
198 : void write_model_overlap(long int step);
199 : // get median of vector
200 : double get_median(vector<double> &v);
201 : // annealing
202 : double get_annealing(long int step);
203 : // do regression
204 : double scaleEnergy(double s);
205 : double doRegression();
206 : // read and write status
207 : void read_status();
208 : void print_status(long int step);
209 : // accept or reject
210 : bool doAccept(double oldE, double newE, double kbt);
211 : // do MonteCarlo
212 : void doMonteCarlo();
213 : // read error file
214 : vector<double> read_exp_errors(string errfile);
215 : // read experimental overlaps
216 : vector<double> read_exp_overlaps(string ovfile);
217 : // calculate model GMM weights and covariances
218 : vector<double> get_GMM_m(vector<AtomNumber> &atoms);
219 : // read data GMM file
220 : void get_GMM_d(string gmm_file);
221 : // check GMM data
222 : void check_GMM_d(VectorGeneric<6> &cov, double w);
223 : // auxiliary method
224 : void calculate_useful_stuff(double reso);
225 : // get pref_fact and inv_cov_md
226 : double get_prefactor_inverse (const VectorGeneric<6> &GMM_cov_0, const VectorGeneric<6> &GMM_cov_1,
227 : double &GMM_w_0, double &GMM_w_1,
228 : VectorGeneric<6> &sum, VectorGeneric<6> &inv_sum);
229 : // calculate self overlaps between data GMM components - ovdd_
230 : double get_self_overlap(unsigned id);
231 : // calculate overlap between two Gaussians
232 : double get_overlap(const Vector &m_m, const Vector &d_m, double &pre_fact,
233 : const VectorGeneric<6> &inv_cov_md, Vector &ov_der);
234 : // calculate exponent of overlap for neighbor list update
235 : double get_exp_overlap(const Vector &m_m, const Vector &d_m,
236 : const VectorGeneric<6> &inv_cov_md);
237 : // update the neighbor list
238 : void update_neighbor_list();
239 : // calculate overlap
240 : void calculate_overlap();
241 : // Gaussian noise
242 : void calculate_Gauss();
243 : // Outliers noise
244 : void calculate_Outliers();
245 : // Marginal noise
246 : void calculate_Marginal();
247 :
248 : public:
249 : static void registerKeywords( Keywords& keys );
250 : explicit EMMI(const ActionOptions&);
251 : // active methods:
252 : void prepare() override;
253 : void calculate() override;
254 : };
255 :
256 7868 : PLUMED_REGISTER_ACTION(EMMI,"EMMI")
257 :
258 19 : void EMMI::registerKeywords( Keywords& keys ) {
259 19 : Colvar::registerKeywords( keys );
260 76 : keys.add("atoms","ATOMS","atoms for which we calculate the density map, typically all heavy atoms");
261 76 : keys.add("compulsory","GMM_FILE","file with the parameters of the GMM components");
262 76 : keys.add("compulsory","NL_CUTOFF","The cutoff in overlap for the neighbor list");
263 76 : keys.add("compulsory","NL_STRIDE","The frequency with which we are updating the neighbor list");
264 76 : keys.add("compulsory","SIGMA_MIN","minimum uncertainty");
265 76 : keys.add("compulsory","RESOLUTION", "Cryo-EM map resolution");
266 76 : keys.add("compulsory","NOISETYPE","functional form of the noise (GAUSS, OUTLIERS, MARGINAL)");
267 76 : keys.add("optional","SIGMA0","initial value of the uncertainty");
268 76 : keys.add("optional","DSIGMA","MC step for uncertainties");
269 76 : keys.add("optional","MC_STRIDE", "Monte Carlo stride");
270 76 : keys.add("optional","ERR_FILE","file with experimental or GMM fit errors");
271 76 : keys.add("optional","OV_FILE","file with experimental overlaps");
272 76 : keys.add("optional","NORM_DENSITY","integral of the experimental density");
273 76 : keys.add("optional","STATUS_FILE","write a file with all the data useful for restart");
274 76 : keys.add("optional","WRITE_STRIDE","write the status to a file every N steps, this can be used for restart");
275 76 : keys.add("optional","REGRESSION","regression stride");
276 76 : keys.add("optional","REG_SCALE_MIN","regression minimum scale");
277 76 : keys.add("optional","REG_SCALE_MAX","regression maximum scale");
278 76 : keys.add("optional","REG_DSCALE","regression maximum scale MC move");
279 76 : keys.add("optional","SCALE","scale factor");
280 76 : keys.add("optional","ANNEAL", "Length of annealing cycle");
281 76 : keys.add("optional","ANNEAL_FACT", "Annealing temperature factor");
282 76 : keys.add("optional","TEMP","temperature");
283 76 : keys.add("optional","PRIOR", "exponent of uncertainty prior");
284 76 : keys.add("optional","WRITE_OV_STRIDE","write model overlaps every N steps");
285 76 : keys.add("optional","WRITE_OV","write a file with model overlaps");
286 57 : keys.addFlag("NO_AVER",false,"don't do ensemble averaging in multi-replica mode");
287 19 : componentsAreNotOptional(keys);
288 76 : keys.addOutputComponent("scoreb","default","Bayesian score");
289 76 : keys.addOutputComponent("acc", "NOISETYPE","MC acceptance for uncertainty");
290 76 : keys.addOutputComponent("scale", "REGRESSION","scale factor");
291 76 : keys.addOutputComponent("accscale", "REGRESSION","MC acceptance for scale regression");
292 76 : keys.addOutputComponent("enescale", "REGRESSION","MC energy for scale regression");
293 76 : keys.addOutputComponent("anneal","ANNEAL","annealing factor");
294 19 : }
295 :
296 18 : EMMI::EMMI(const ActionOptions&ao):
297 : PLUMED_COLVAR_INIT(ao),
298 : inv_sqrt2_(0.707106781186548),
299 : sqrt2_pi_(0.797884560802865),
300 : first_time_(true), no_aver_(false), pbc_(true),
301 : MCstride_(1), MCaccept_(0.), MCtrials_(0.),
302 : statusstride_(0), first_status_(true),
303 : nregres_(0), scale_(1.),
304 : dpcutoff_(15.0), nexp_(1000000), nanneal_(0),
305 108 : kanneal_(0.), anneal_(1.), prior_(1.), ovstride_(0)
306 : {
307 : // periodic boundary conditions
308 18 : bool nopbc=!pbc_;
309 36 : parseFlag("NOPBC",nopbc);
310 18 : pbc_=!nopbc;
311 :
312 : // list of atoms
313 : vector<AtomNumber> atoms;
314 36 : parseAtomList("ATOMS", atoms);
315 :
316 : // file with data GMM
317 : string GMM_file;
318 36 : parse("GMM_FILE", GMM_file);
319 :
320 : // type of data noise
321 : string noise;
322 36 : parse("NOISETYPE",noise);
323 18 : if (noise=="GAUSS") noise_ = 0;
324 12 : else if(noise=="OUTLIERS") noise_ = 1;
325 6 : else if(noise=="MARGINAL") noise_ = 2;
326 0 : else error("Unknown noise type!");
327 :
328 : // minimum value for error
329 : double sigma_min;
330 36 : parse("SIGMA_MIN", sigma_min);
331 18 : if(sigma_min<0) error("SIGMA_MIN should be greater or equal to zero");
332 :
333 : // the following parameters must be specified with noise type 0 and 1
334 : double sigma_ini, dsigma;
335 18 : if(noise_!=2) {
336 : // initial value of the uncertainty
337 24 : parse("SIGMA0", sigma_ini);
338 12 : if(sigma_ini<=0) error("you must specify a positive SIGMA0");
339 : // MC parameters
340 24 : parse("DSIGMA", dsigma);
341 12 : if(dsigma<0) error("you must specify a positive DSIGMA");
342 24 : parse("MC_STRIDE", MCstride_);
343 12 : if(dsigma>0 && MCstride_<=0) error("you must specify a positive MC_STRIDE");
344 : // status file parameters
345 24 : parse("WRITE_STRIDE", statusstride_);
346 12 : if(statusstride_<=0) error("you must specify a positive WRITE_STRIDE");
347 24 : parse("STATUS_FILE", statusfilename_);
348 24 : if(statusfilename_=="") statusfilename_ = "MISTATUS"+getLabel();
349 0 : else statusfilename_ = statusfilename_+getLabel();
350 : }
351 :
352 : // error file
353 : string errfile;
354 36 : parse("ERR_FILE", errfile);
355 :
356 : // file with experimental overlaps
357 : string ovfile;
358 36 : parse("OV_FILE", ovfile);
359 :
360 : // integral of the experimetal density
361 18 : double norm_d = 0.0;
362 36 : parse("NORM_DENSITY", norm_d);
363 :
364 : // temperature
365 18 : double temp=0.0;
366 36 : parse("TEMP",temp);
367 : // convert temp to kbt
368 36 : if(temp>0.0) kbt_=plumed.getAtoms().getKBoltzmann()*temp;
369 0 : else kbt_=plumed.getAtoms().getKbT();
370 :
371 : // exponent of uncertainty prior
372 36 : parse("PRIOR",prior_);
373 :
374 : // simulated annealing stuff
375 36 : parse("ANNEAL", nanneal_);
376 36 : parse("ANNEAL_FACT", kanneal_);
377 18 : if(nanneal_>0 && kanneal_<=1.0) error("with ANNEAL, ANNEAL_FACT must be greater than 1");
378 :
379 : // regression stride
380 36 : parse("REGRESSION",nregres_);
381 : // other regression parameters
382 18 : if(nregres_>0) {
383 0 : parse("REG_SCALE_MIN",scale_min_);
384 0 : parse("REG_SCALE_MAX",scale_max_);
385 0 : parse("REG_DSCALE",dscale_);
386 : // checks
387 0 : if(scale_max_<=scale_min_) error("with REGRESSION, REG_SCALE_MAX must be greater than REG_SCALE_MIN");
388 0 : if(dscale_<=0.) error("with REGRESSION, REG_DSCALE must be positive");
389 : }
390 :
391 : // scale factor
392 36 : parse("SCALE", scale_);
393 :
394 : // read map resolution
395 : double reso;
396 36 : parse("RESOLUTION", reso);
397 18 : if(reso<=0.) error("RESOLUTION should be strictly positive");
398 :
399 : // neighbor list stuff
400 36 : parse("NL_CUTOFF",nl_cutoff_);
401 18 : if(nl_cutoff_<=0.0) error("NL_CUTOFF should be explicitly specified and positive");
402 36 : parse("NL_STRIDE",nl_stride_);
403 18 : if(nl_stride_<=0) error("NL_STRIDE should be explicitly specified and positive");
404 :
405 : // averaging or not
406 36 : parseFlag("NO_AVER",no_aver_);
407 :
408 : // write overlap file
409 36 : parse("WRITE_OV_STRIDE", ovstride_);
410 36 : parse("WRITE_OV", ovfilename_);
411 18 : if(ovstride_>0 && ovfilename_=="") error("With WRITE_OV_STRIDE you must specify WRITE_OV");
412 :
413 18 : checkRead();
414 :
415 : // set parallel stuff
416 18 : size_=comm.Get_size();
417 18 : rank_=comm.Get_rank();
418 :
419 : // get number of replicas
420 18 : if(rank_==0) {
421 12 : if(no_aver_) {
422 12 : nrep_ = 1;
423 : } else {
424 0 : nrep_ = multi_sim_comm.Get_size();
425 : }
426 12 : replica_ = multi_sim_comm.Get_rank();
427 : } else {
428 6 : nrep_ = 0;
429 6 : replica_ = 0;
430 : }
431 18 : comm.Sum(&nrep_,1);
432 18 : comm.Sum(&replica_,1);
433 :
434 18 : log.printf(" atoms involved : ");
435 21690 : for(unsigned i=0; i<atoms.size(); ++i) log.printf("%d ",atoms[i].serial());
436 18 : log.printf("\n");
437 18 : log.printf(" GMM data file : %s\n", GMM_file.c_str());
438 18 : if(no_aver_) log.printf(" without ensemble averaging\n");
439 18 : log.printf(" type of data noise : %s\n", noise.c_str());
440 18 : log.printf(" neighbor list cutoff : %lf\n", nl_cutoff_);
441 18 : log.printf(" neighbor list stride : %u\n", nl_stride_);
442 18 : log.printf(" minimum uncertainty : %f\n",sigma_min);
443 18 : log.printf(" scale factor : %lf\n",scale_);
444 18 : if(nregres_>0) {
445 0 : log.printf(" regression stride : %u\n", nregres_);
446 0 : log.printf(" regression minimum scale : %lf\n", scale_min_);
447 0 : log.printf(" regression maximum scale : %lf\n", scale_max_);
448 0 : log.printf(" regression maximum scale MC move : %lf\n", dscale_);
449 : }
450 18 : if(noise_!=2) {
451 12 : log.printf(" initial value of the uncertainty : %f\n",sigma_ini);
452 12 : log.printf(" max MC move in uncertainty : %f\n",dsigma);
453 12 : log.printf(" MC stride : %u\n", MCstride_);
454 12 : log.printf(" reading/writing to status file : %s\n",statusfilename_.c_str());
455 12 : log.printf(" with stride : %u\n",statusstride_);
456 : }
457 18 : if(errfile.size()>0) log.printf(" reading experimental errors from file : %s\n", errfile.c_str());
458 18 : if(ovfile.size()>0) log.printf(" reading experimental overlaps from file : %s\n", ovfile.c_str());
459 18 : log.printf(" temperature of the system in energy unit : %f\n",kbt_);
460 18 : log.printf(" prior exponent : %f\n",prior_);
461 18 : log.printf(" number of replicas for averaging: %u\n",nrep_);
462 18 : log.printf(" id of the replica : %u\n",replica_);
463 18 : if(nanneal_>0) {
464 0 : log.printf(" length of annealing cycle : %u\n",nanneal_);
465 0 : log.printf(" annealing factor : %f\n",kanneal_);
466 : }
467 18 : if(ovstride_>0) {
468 0 : log.printf(" stride for writing model overlaps : %u\n",ovstride_);
469 0 : log.printf(" file for writing model overlaps : %s\n", ovfilename_.c_str());
470 : }
471 :
472 : // set constant quantity before calculating stuff
473 18 : cfact_ = 1.0/pow( 2.0*pi, 1.5 );
474 :
475 : // calculate model GMM constant parameters
476 18 : vector<double> GMM_m_w = get_GMM_m(atoms);
477 :
478 : // read data GMM parameters
479 36 : get_GMM_d(GMM_file);
480 18 : log.printf(" number of GMM components : %u\n", static_cast<unsigned>(GMM_d_m_.size()));
481 :
482 : // normalize atom weight map
483 36 : if(norm_d <= 0.0) norm_d = accumulate(GMM_d_w_.begin(), GMM_d_w_.end(), 0.0);
484 : double norm_m = accumulate(GMM_m_w.begin(), GMM_m_w.end(), 0.0);
485 : // renormalization
486 162 : for(unsigned i=0; i<GMM_m_w_.size(); ++i) GMM_m_w_[i] *= norm_d / norm_m;
487 :
488 : // read experimental errors
489 : vector<double> exp_err;
490 18 : if(errfile.size()>0) exp_err = read_exp_errors(errfile);
491 :
492 : // get self overlaps between data GMM components
493 18 : if(ovfile.size()>0) {
494 0 : ovdd_ = read_exp_overlaps(ovfile);
495 : } else {
496 342 : for(unsigned i=0; i<GMM_d_m_.size(); ++i) {
497 162 : double ov = get_self_overlap(i);
498 162 : ovdd_.push_back(ov);
499 : }
500 : }
501 :
502 18 : log.printf(" number of GMM groups : %u\n", static_cast<unsigned>(GMM_d_grps_.size()));
503 : // cycle on GMM groups
504 54 : for(unsigned Gid=0; Gid<GMM_d_grps_.size(); ++Gid) {
505 18 : log.printf(" group %d\n", Gid);
506 : // calculate median overlap and experimental error
507 : vector<double> ovdd;
508 : vector<double> err;
509 : // cycle on the group members
510 360 : for(unsigned i=0; i<GMM_d_grps_[Gid].size(); ++i) {
511 : // GMM id
512 162 : int GMMid = GMM_d_grps_[Gid][i];
513 : // add to experimental error
514 162 : if(errfile.size()>0) err.push_back(exp_err[GMMid]);
515 324 : else err.push_back(0.);
516 : // add to GMM overlap
517 324 : ovdd.push_back(ovdd_[GMMid]);
518 : }
519 : // calculate median quantities
520 18 : double ovdd_m = get_median(ovdd);
521 18 : double err_m = get_median(err);
522 : // print out statistics
523 18 : log.printf(" # of members : %zu\n", GMM_d_grps_[Gid].size());
524 18 : log.printf(" median overlap : %lf\n", ovdd_m);
525 18 : log.printf(" median error : %lf\n", err_m);
526 : // add minimum value of sigma for this group of GMMs
527 36 : sigma_min_.push_back(sqrt(err_m*err_m+sigma_min*ovdd_m*sigma_min*ovdd_m));
528 : // these are only needed with Gaussian and Outliers noise models
529 18 : if(noise_!=2) {
530 : // set dsigma
531 24 : dsigma_.push_back(dsigma * ovdd_m);
532 : // set sigma max
533 48 : sigma_max_.push_back(10.0*ovdd_m + sigma_min_[Gid] + dsigma_[Gid]);
534 : // initialize sigma
535 24 : sigma_.push_back(std::max(sigma_min_[Gid],std::min(sigma_ini*ovdd_m,sigma_max_[Gid])));
536 : }
537 : }
538 :
539 : // read status file if restarting
540 18 : if(getRestart() && noise_!=2) read_status();
541 :
542 : // calculate auxiliary stuff
543 18 : calculate_useful_stuff(reso);
544 :
545 : // prepare data and derivative vectors
546 18 : ovmd_.resize(ovdd_.size());
547 18 : atom_der_.resize(GMM_m_type_.size());
548 18 : GMMid_der_.resize(ovdd_.size());
549 :
550 : // clear things that are no longer needed
551 : GMM_d_cov_.clear();
552 :
553 : // add components
554 54 : addComponentWithDerivatives("scoreb"); componentIsNotPeriodic("scoreb");
555 :
556 42 : if(noise_!=2) {addComponent("acc"); componentIsNotPeriodic("acc");}
557 :
558 18 : if(nregres_>0) {
559 0 : addComponent("scale"); componentIsNotPeriodic("scale");
560 0 : addComponent("accscale"); componentIsNotPeriodic("accscale");
561 0 : addComponent("enescale"); componentIsNotPeriodic("enescale");
562 : }
563 :
564 18 : if(nanneal_>0) {addComponent("anneal"); componentIsNotPeriodic("anneal");}
565 :
566 : // initialize random seed
567 : unsigned iseed;
568 18 : if(rank_==0) iseed = time(NULL)+replica_;
569 6 : else iseed = 0;
570 18 : comm.Sum(&iseed, 1);
571 18 : random_.setSeed(-iseed);
572 :
573 : // request the atoms
574 18 : requestAtoms(atoms);
575 :
576 : // print bibliography
577 54 : log<<" Bibliography "<<plumed.cite("Bonomi, Camilloni, Bioinformatics, 33, 3999 (2017)");
578 54 : log<<plumed.cite("Hanot, Bonomi, Greenberg, Sali, Nilges, Vendruscolo, Pellarin, bioRxiv doi: 10.1101/113951 (2017)");
579 54 : log<<plumed.cite("Bonomi, Pellarin, Vendruscolo, Biophys. J. 114, 1604 (2018)");
580 18 : if(!no_aver_ && nrep_>1)log<<plumed.cite("Bonomi, Camilloni, Cavalli, Vendruscolo, Sci. Adv. 2, e150117 (2016)");
581 18 : log<<"\n";
582 18 : }
583 :
584 0 : void EMMI::write_model_overlap(long int step)
585 : {
586 0 : OFile ovfile;
587 0 : ovfile.link(*this);
588 0 : std::string num; Tools::convert(step,num);
589 0 : string name = ovfilename_+"-"+num;
590 0 : ovfile.open(name);
591 : ovfile.setHeavyFlush();
592 0 : ovfile.fmtField("%10.7e ");
593 : // write overlaps
594 0 : for(int i=0; i<ovmd_.size(); ++i) {
595 0 : ovfile.printField("Model", ovmd_[i]);
596 0 : ovfile.printField("ModelScaled", scale_ * ovmd_[i]);
597 0 : ovfile.printField("Data", ovdd_[i]);
598 0 : ovfile.printField();
599 : }
600 0 : ovfile.close();
601 0 : }
602 :
603 36 : double EMMI::get_median(vector<double> &v)
604 : {
605 : // dimension of vector
606 36 : unsigned size = v.size();
607 : // in case of only one entry
608 36 : if (size==1) {
609 0 : return v[0];
610 : } else {
611 : // reorder vector
612 36 : sort(v.begin(), v.end());
613 : // odd or even?
614 36 : if (size%2==0) {
615 0 : return (v[size/2-1]+v[size/2])/2.0;
616 : } else {
617 72 : return v[size/2];
618 : }
619 : }
620 : }
621 :
622 0 : void EMMI::read_status()
623 : {
624 : double MDtime;
625 : // open file
626 0 : IFile *ifile = new IFile();
627 0 : ifile->link(*this);
628 0 : if(ifile->FileExist(statusfilename_)) {
629 0 : ifile->open(statusfilename_);
630 0 : while(ifile->scanField("MD_time", MDtime)) {
631 0 : for(unsigned i=0; i<sigma_.size(); ++i) {
632 : // convert i to string
633 0 : std::string num; Tools::convert(i,num);
634 : // read entries
635 0 : ifile->scanField("s"+num, sigma_[i]);
636 : }
637 : // new line
638 0 : ifile->scanField();
639 : }
640 0 : ifile->close();
641 : } else {
642 0 : error("Cannot find status file "+statusfilename_+"\n");
643 : }
644 0 : delete ifile;
645 0 : }
646 :
647 10902 : void EMMI::print_status(long int step)
648 : {
649 : // if first time open the file
650 10902 : if(first_status_) {
651 12 : first_status_ = false;
652 12 : statusfile_.link(*this);
653 12 : statusfile_.open(statusfilename_);
654 : statusfile_.setHeavyFlush();
655 24 : statusfile_.fmtField("%6.3e ");
656 : }
657 : // write fields
658 10902 : double MDtime = static_cast<double>(step)*getTimeStep();
659 21804 : statusfile_.printField("MD_time", MDtime);
660 43608 : for(unsigned i=0; i<sigma_.size(); ++i) {
661 : // convert i to string
662 10902 : std::string num; Tools::convert(i,num);
663 : // print entry
664 21804 : statusfile_.printField("s"+num, sigma_[i]);
665 : }
666 10902 : statusfile_.printField();
667 10902 : }
668 :
669 0 : bool EMMI::doAccept(double oldE, double newE, double kbt) {
670 : bool accept = false;
671 : // calculate delta energy
672 0 : double delta = ( newE - oldE ) / kbt;
673 : // if delta is negative always accept move
674 0 : if( delta < 0.0 ) {
675 : accept = true;
676 : } else {
677 : // otherwise extract random number
678 0 : double s = random_.RandU01();
679 0 : if( s < exp(-delta) ) { accept = true; }
680 : }
681 0 : return accept;
682 : }
683 :
684 0 : void EMMI::doMonteCarlo()
685 : {
686 : // extract random GMM group
687 0 : unsigned nGMM = static_cast<unsigned>(floor(random_.RandU01()*static_cast<double>(GMM_d_grps_.size())));
688 0 : if(nGMM==GMM_d_grps_.size()) nGMM -= 1;
689 :
690 : // generate random move
691 0 : double shift = dsigma_[nGMM] * ( 2.0 * random_.RandU01() - 1.0 );
692 : // new sigma
693 0 : double new_s = sigma_[nGMM] + shift;
694 : // check boundaries
695 0 : if(new_s > sigma_max_[nGMM]) {new_s = 2.0 * sigma_max_[nGMM] - new_s;}
696 0 : if(new_s < sigma_min_[nGMM]) {new_s = 2.0 * sigma_min_[nGMM] - new_s;}
697 : // old s2
698 0 : double old_inv_s2 = 1.0 / sigma_[nGMM] / sigma_[nGMM];
699 : // new s2
700 0 : double new_inv_s2 = 1.0 / new_s / new_s;
701 :
702 : // cycle on GMM group and calculate old and new energy
703 : double old_ene = 0.0;
704 : double new_ene = 0.0;
705 0 : double ng = static_cast<double>(GMM_d_grps_[nGMM].size());
706 :
707 : // in case of Gaussian noise
708 0 : if(noise_==0) {
709 : double chi2 = 0.0;
710 0 : for(unsigned i=0; i<GMM_d_grps_[nGMM].size(); ++i) {
711 : // id GMM component
712 0 : int GMMid = GMM_d_grps_[nGMM][i];
713 : // deviation
714 0 : double dev = ( scale_*ovmd_[GMMid]-ovdd_[GMMid] );
715 : // add to chi2
716 0 : chi2 += dev * dev;
717 : }
718 : // final energy calculation: add normalization and prior
719 0 : old_ene = 0.5 * kbt_ * ( chi2 * old_inv_s2 - (ng+prior_) * std::log(old_inv_s2) );
720 0 : new_ene = 0.5 * kbt_ * ( chi2 * new_inv_s2 - (ng+prior_) * std::log(new_inv_s2) );
721 : }
722 :
723 : // in case of Outliers noise
724 0 : if(noise_==1) {
725 0 : for(unsigned i=0; i<GMM_d_grps_[nGMM].size(); ++i) {
726 : // id GMM component
727 0 : int GMMid = GMM_d_grps_[nGMM][i];
728 : // calculate deviation
729 0 : double dev = ( scale_*ovmd_[GMMid]-ovdd_[GMMid] );
730 : // add to energies
731 0 : old_ene += std::log( 1.0 + 0.5 * dev * dev * old_inv_s2);
732 0 : new_ene += std::log( 1.0 + 0.5 * dev * dev * new_inv_s2);
733 : }
734 : // final energy calculation: add normalization and prior
735 0 : old_ene = kbt_ * ( old_ene + (ng+prior_) * std::log(sigma_[nGMM]) );
736 0 : new_ene = kbt_ * ( new_ene + (ng+prior_) * std::log(new_s) );
737 : }
738 :
739 : // increment number of trials
740 0 : MCtrials_ += 1.0;
741 :
742 : // accept or reject
743 0 : bool accept = doAccept(old_ene/anneal_, new_ene/anneal_, kbt_);
744 0 : if(accept) {
745 0 : sigma_[nGMM] = new_s;
746 0 : MCaccept_ += 1.0;
747 : }
748 : // local communication
749 0 : if(rank_!=0) {
750 0 : for(unsigned i=0; i<sigma_.size(); ++i) sigma_[i] = 0.0;
751 0 : MCaccept_ = 0.0;
752 : }
753 0 : if(size_>1) {
754 0 : comm.Sum(&sigma_[0], sigma_.size());
755 0 : comm.Sum(&MCaccept_, 1);
756 : }
757 0 : }
758 :
759 0 : vector<double> EMMI::read_exp_errors(string errfile)
760 : {
761 : int nexp, idcomp;
762 : double err;
763 : vector<double> exp_err;
764 : // open file
765 0 : IFile *ifile = new IFile();
766 0 : if(ifile->FileExist(errfile)) {
767 0 : ifile->open(errfile);
768 : // scan for number of experimental errors
769 0 : ifile->scanField("Nexp", nexp);
770 : // cycle on GMM components
771 0 : while(ifile->scanField("Id",idcomp)) {
772 : // total experimental error
773 0 : double err_tot = 0.0;
774 : // cycle on number of experimental overlaps
775 0 : for(unsigned i=0; i<nexp; ++i) {
776 0 : string ss; Tools::convert(i,ss);
777 0 : ifile->scanField("Err"+ss, err);
778 : // add to total error
779 0 : err_tot += err*err;
780 : }
781 : // new line
782 0 : ifile->scanField();
783 : // calculate RMSE
784 0 : err_tot = sqrt(err_tot/static_cast<double>(nexp));
785 : // add to global
786 0 : exp_err.push_back(err_tot);
787 : }
788 0 : ifile->close();
789 : } else {
790 0 : error("Cannot find ERR_FILE "+errfile+"\n");
791 : }
792 0 : return exp_err;
793 : }
794 :
795 0 : vector<double> EMMI::read_exp_overlaps(string ovfile)
796 : {
797 : int idcomp;
798 : double ov;
799 : vector<double> ovdd;
800 : // open file
801 0 : IFile *ifile = new IFile();
802 0 : if(ifile->FileExist(ovfile)) {
803 0 : ifile->open(ovfile);
804 : // cycle on GMM components
805 0 : while(ifile->scanField("Id",idcomp)) {
806 : // read experimental overlap
807 0 : ifile->scanField("Overlap", ov);
808 : // add to ovdd
809 0 : ovdd.push_back(ov);
810 : // new line
811 0 : ifile->scanField();
812 : }
813 0 : ifile->close();
814 : } else {
815 0 : error("Cannot find OV_FILE "+ovfile+"\n");
816 : }
817 0 : return ovdd;
818 : }
819 :
820 18 : vector<double> EMMI::get_GMM_m(vector<AtomNumber> &atoms)
821 : {
822 : // list of weights - one per atom
823 : vector<double> GMM_m_w;
824 :
825 36 : vector<SetupMolInfo*> moldat=plumed.getActionSet().select<SetupMolInfo*>();
826 : // map of atom types to A and B coefficients of scattering factor
827 : // f(s) = A * exp(-B*s**2)
828 : // B is in Angstrom squared
829 : // map between an atom type and an index
830 : map<string, unsigned> type_map;
831 36 : type_map["C"]=0;
832 36 : type_map["O"]=1;
833 36 : type_map["N"]=2;
834 36 : type_map["S"]=3;
835 : // fill in sigma vector
836 36 : GMM_m_s_.push_back(0.01*15.146); // type 0
837 36 : GMM_m_s_.push_back(0.01*8.59722); // type 1
838 36 : GMM_m_s_.push_back(0.01*11.1116); // type 2
839 36 : GMM_m_s_.push_back(0.01*15.8952); // type 3
840 : // fill in weight vector
841 36 : GMM_m_w_.push_back(2.49982); // type 0
842 36 : GMM_m_w_.push_back(1.97692); // type 1
843 36 : GMM_m_w_.push_back(2.20402); // type 2
844 36 : GMM_m_w_.push_back(5.14099); // type 3
845 :
846 : // check if MOLINFO line is present
847 18 : if( moldat.size()==1 ) {
848 18 : log<<" MOLINFO DATA found, using proper atom names\n";
849 21690 : for(unsigned i=0; i<atoms.size(); ++i) {
850 : // get atom name
851 10836 : string name = moldat[0]->getAtomName(atoms[i]);
852 : char type;
853 : // get atom type
854 10836 : char first = name.at(0);
855 : // GOLDEN RULE: type is first letter, if not a number
856 10836 : if (!isdigit(first)) {
857 : type = first;
858 : // otherwise is the second
859 : } else {
860 0 : type = name.at(1);
861 : }
862 : // check if key in map
863 10836 : std::string type_s = std::string(1,type);
864 10836 : if(type_map.find(type_s) != type_map.end()) {
865 : // save atom type
866 10836 : GMM_m_type_.push_back(type_map[type_s]);
867 : // this will be normalized in the final density
868 21672 : GMM_m_w.push_back(GMM_m_w_[type_map[type_s]]);
869 : } else {
870 0 : error("Wrong atom type "+type_s+" from atom name "+name+"\n");
871 : }
872 : }
873 : } else {
874 0 : error("MOLINFO DATA not found\n");
875 : }
876 18 : return GMM_m_w;
877 : }
878 :
879 162 : void EMMI::check_GMM_d(VectorGeneric<6> &cov, double w)
880 : {
881 :
882 : // check if positive defined, by calculating the 3 leading principal minors
883 162 : double pm1 = cov[0];
884 162 : double pm2 = cov[0]*cov[3]-cov[1]*cov[1];
885 162 : double pm3 = cov[0]*(cov[3]*cov[5]-cov[4]*cov[4])-cov[1]*(cov[1]*cov[5]-cov[4]*cov[2])+cov[2]*(cov[1]*cov[4]-cov[3]*cov[2]);
886 : // apply Sylvester’s criterion
887 162 : if(pm1<=0.0 || pm2<=0.0 || pm3<=0.0)
888 0 : error("check data GMM: covariance matrix is not positive defined");
889 :
890 : // check if weight is positive
891 162 : if(w<=0) error("check data GMM: weight must be positive");
892 162 : }
893 :
894 : // read GMM data file in PLUMED format:
895 18 : void EMMI::get_GMM_d(string GMM_file)
896 : {
897 18 : VectorGeneric<6> cov;
898 :
899 : // open file
900 18 : std::unique_ptr<IFile> ifile(new IFile);
901 18 : if(ifile->FileExist(GMM_file)) {
902 18 : ifile->open(GMM_file);
903 : int idcomp;
904 360 : while(ifile->scanField("Id",idcomp)) {
905 : int beta;
906 : double w, m0, m1, m2;
907 324 : ifile->scanField("Weight",w);
908 324 : ifile->scanField("Mean_0",m0);
909 324 : ifile->scanField("Mean_1",m1);
910 324 : ifile->scanField("Mean_2",m2);
911 324 : ifile->scanField("Cov_00",cov[0]);
912 324 : ifile->scanField("Cov_01",cov[1]);
913 324 : ifile->scanField("Cov_02",cov[2]);
914 324 : ifile->scanField("Cov_11",cov[3]);
915 324 : ifile->scanField("Cov_12",cov[4]);
916 324 : ifile->scanField("Cov_22",cov[5]);
917 324 : ifile->scanField("Beta",beta);
918 : // check input
919 162 : check_GMM_d(cov, w);
920 : // check beta
921 162 : if(beta<0) error("Beta must be positive!");
922 : // center of the Gaussian
923 324 : GMM_d_m_.push_back(Vector(m0,m1,m2));
924 : // covariance matrix
925 162 : GMM_d_cov_.push_back(cov);
926 : // weight
927 162 : GMM_d_w_.push_back(w);
928 : // beta
929 162 : GMM_d_beta_.push_back(beta);
930 : // new line
931 162 : ifile->scanField();
932 : }
933 : } else {
934 0 : error("Cannot find GMM_FILE "+GMM_file+"\n");
935 : }
936 : // now create a set from beta (unique set of values)
937 18 : set<int> bu(GMM_d_beta_.begin(), GMM_d_beta_.end());
938 : // now prepare the group vector
939 18 : GMM_d_grps_.resize(bu.size());
940 : // and fill it in
941 342 : for(unsigned i=0; i<GMM_d_beta_.size(); ++i) {
942 324 : if(GMM_d_beta_[i]>=GMM_d_grps_.size()) error("Check Beta values");
943 486 : GMM_d_grps_[GMM_d_beta_[i]].push_back(i);
944 : }
945 18 : }
946 :
947 18 : void EMMI::calculate_useful_stuff(double reso)
948 : {
949 : // We use the following definition for resolution:
950 : // the Fourier transform of the density distribution in real space
951 : // f(s) falls to 1/e of its maximum value at wavenumber 1/resolution
952 : // i.e. from f(s) = A * exp(-B*s**2) -> Res = sqrt(B).
953 : // average value of B
954 : double Bave = 0.0;
955 21708 : for(unsigned i=0; i<GMM_m_type_.size(); ++i) {
956 21672 : Bave += GMM_m_s_[GMM_m_type_[i]];
957 : }
958 18 : Bave /= static_cast<double>(GMM_m_type_.size());
959 : // calculate blur factor
960 : double blur = 0.0;
961 18 : if(reso*reso>Bave) blur = reso*reso-Bave;
962 36 : else warning("PLUMED should not be used with maps at resolution better than 0.3 nm");
963 : // add blur to B
964 180 : for(unsigned i=0; i<GMM_m_s_.size(); ++i) GMM_m_s_[i] += blur;
965 : // calculate average resolution
966 : double ave_res = 0.0;
967 21690 : for(unsigned i=0; i<GMM_m_type_.size(); ++i) {
968 21672 : ave_res += sqrt(GMM_m_s_[GMM_m_type_[i]]);
969 : }
970 18 : ave_res = ave_res / static_cast<double>(GMM_m_type_.size());
971 18 : log.printf(" experimental map resolution : %3.2f\n", reso);
972 18 : log.printf(" predicted map resolution : %3.2f\n", ave_res);
973 18 : log.printf(" blur factor : %f\n", blur);
974 : // now calculate useful stuff
975 18 : VectorGeneric<6> cov, sum, inv_sum;
976 : // cycle on all atoms types (4 for the moment)
977 180 : for(unsigned i=0; i<GMM_m_s_.size(); ++i) {
978 : // the Gaussian in density (real) space is the FT of scattering factor
979 : // f(r) = A * (pi/B)**1.5 * exp(-pi**2/B*r**2)
980 72 : double s = sqrt ( 0.5 * GMM_m_s_[i] ) / pi;
981 : // covariance matrix for spherical Gaussian
982 72 : cov[0]=s*s; cov[1]=0.0; cov[2]=0.0;
983 72 : cov[3]=s*s; cov[4]=0.0;
984 72 : cov[5]=s*s;
985 : // cycle on all data GMM
986 1440 : for(unsigned j=0; j<GMM_d_m_.size(); ++j) {
987 : // we need the sum of the covariance matrices
988 7776 : for(unsigned k=0; k<6; ++k) sum[k] = cov[k] + GMM_d_cov_[j][k];
989 : // and to calculate its determinant
990 648 : double det = sum[0]*(sum[3]*sum[5]-sum[4]*sum[4]);
991 648 : det -= sum[1]*(sum[1]*sum[5]-sum[4]*sum[2]);
992 648 : det += sum[2]*(sum[1]*sum[4]-sum[3]*sum[2]);
993 : // calculate prefactor - model weights are already normalized
994 1944 : double pre_fact = cfact_ / sqrt(det) * GMM_d_w_[j] * GMM_m_w_[i];
995 : // and its inverse
996 648 : inv_sum[0] = (sum[3]*sum[5] - sum[4]*sum[4])/det;
997 648 : inv_sum[1] = (sum[2]*sum[4] - sum[1]*sum[5])/det;
998 648 : inv_sum[2] = (sum[1]*sum[4] - sum[2]*sum[3])/det;
999 648 : inv_sum[3] = (sum[0]*sum[5] - sum[2]*sum[2])/det;
1000 648 : inv_sum[4] = (sum[2]*sum[1] - sum[0]*sum[4])/det;
1001 648 : inv_sum[5] = (sum[0]*sum[3] - sum[1]*sum[1])/det;
1002 : // now we store the prefactor
1003 648 : pre_fact_.push_back(pre_fact);
1004 : // and the inverse of the sum
1005 648 : inv_cov_md_.push_back(inv_sum);
1006 : }
1007 : }
1008 : // tabulate exponential
1009 18 : dexp_ = dpcutoff_ / static_cast<double> (nexp_-1);
1010 18000018 : for(unsigned i=0; i<nexp_; ++i) {
1011 36000000 : tab_exp_.push_back(exp(-static_cast<double>(i) * dexp_));
1012 : }
1013 18 : }
1014 :
1015 : // get prefactors
1016 1458 : double EMMI::get_prefactor_inverse
1017 : (const VectorGeneric<6> &GMM_cov_0, const VectorGeneric<6> &GMM_cov_1,
1018 : double &GMM_w_0, double &GMM_w_1,
1019 : VectorGeneric<6> &sum, VectorGeneric<6> &inv_sum)
1020 : {
1021 : // we need the sum of the covariance matrices
1022 1458 : for(unsigned k=0; k<6; ++k) sum[k] = GMM_cov_0[k] + GMM_cov_1[k];
1023 :
1024 : // and to calculate its determinant
1025 1458 : double det = sum[0]*(sum[3]*sum[5]-sum[4]*sum[4]);
1026 1458 : det -= sum[1]*(sum[1]*sum[5]-sum[4]*sum[2]);
1027 1458 : det += sum[2]*(sum[1]*sum[4]-sum[3]*sum[2]);
1028 :
1029 : // the prefactor is
1030 1458 : double pre_fact = cfact_ / sqrt(det) * GMM_w_0 * GMM_w_1;
1031 :
1032 : // and its inverse
1033 1458 : inv_sum[0] = (sum[3]*sum[5] - sum[4]*sum[4])/det;
1034 1458 : inv_sum[1] = (sum[2]*sum[4] - sum[1]*sum[5])/det;
1035 1458 : inv_sum[2] = (sum[1]*sum[4] - sum[2]*sum[3])/det;
1036 1458 : inv_sum[3] = (sum[0]*sum[5] - sum[2]*sum[2])/det;
1037 1458 : inv_sum[4] = (sum[2]*sum[1] - sum[0]*sum[4])/det;
1038 1458 : inv_sum[5] = (sum[0]*sum[3] - sum[1]*sum[1])/det;
1039 :
1040 : // return pre-factor
1041 1458 : return pre_fact;
1042 : }
1043 :
1044 162 : double EMMI::get_self_overlap(unsigned id)
1045 : {
1046 : double ov_tot = 0.0;
1047 162 : VectorGeneric<6> sum, inv_sum;
1048 162 : Vector ov_der;
1049 : // start loop
1050 3240 : for(unsigned i=0; i<GMM_d_m_.size(); ++i) {
1051 : // call auxiliary method
1052 : double pre_fact = get_prefactor_inverse(GMM_d_cov_[id], GMM_d_cov_[i],
1053 2916 : GMM_d_w_[id], GMM_d_w_[i], sum, inv_sum);
1054 : // add overlap to ov_tot
1055 1458 : ov_tot += get_overlap(GMM_d_m_[id], GMM_d_m_[i], pre_fact, inv_sum, ov_der);
1056 : }
1057 : // and return it
1058 162 : return ov_tot;
1059 : }
1060 :
1061 : // get overlap and derivatives
1062 2083740 : double EMMI::get_overlap(const Vector &m_m, const Vector &d_m, double &pre_fact,
1063 : const VectorGeneric<6> &inv_cov_md, Vector &ov_der)
1064 : {
1065 2083740 : Vector md;
1066 : // calculate vector difference m_m-d_m with/without pbc
1067 2083740 : if(pbc_) md = pbcDistance(d_m, m_m);
1068 2083740 : else md = delta(d_m, m_m);
1069 : // calculate product of transpose of md and inv_cov_md
1070 2083740 : double p_x = md[0]*inv_cov_md[0]+md[1]*inv_cov_md[1]+md[2]*inv_cov_md[2];
1071 2083740 : double p_y = md[0]*inv_cov_md[1]+md[1]*inv_cov_md[3]+md[2]*inv_cov_md[4];
1072 2083740 : double p_z = md[0]*inv_cov_md[2]+md[1]*inv_cov_md[4]+md[2]*inv_cov_md[5];
1073 : // calculate product of prod and md
1074 2083740 : double ov = md[0]*p_x+md[1]*p_y+md[2]*p_z;
1075 : // final calculation
1076 2083740 : ov = pre_fact * exp(-0.5*ov);
1077 : // derivatives
1078 2083740 : ov_der = ov * Vector(p_x, p_y, p_z);
1079 2083740 : return ov;
1080 : }
1081 :
1082 : // get the exponent of the overlap
1083 59067036 : double EMMI::get_exp_overlap(const Vector &m_m, const Vector &d_m,
1084 : const VectorGeneric<6> &inv_cov_md)
1085 : {
1086 59067036 : Vector md;
1087 : // calculate vector difference m_m-d_m with/without pbc
1088 59067036 : if(pbc_) md = pbcDistance(d_m, m_m);
1089 59067036 : else md = delta(d_m, m_m);
1090 : // calculate product of transpose of md and inv_cov_md
1091 59067036 : double p_x = md[0]*inv_cov_md[0]+md[1]*inv_cov_md[1]+md[2]*inv_cov_md[2];
1092 59067036 : double p_y = md[0]*inv_cov_md[1]+md[1]*inv_cov_md[3]+md[2]*inv_cov_md[4];
1093 59067036 : double p_z = md[0]*inv_cov_md[2]+md[1]*inv_cov_md[4]+md[2]*inv_cov_md[5];
1094 : // calculate product of prod and md
1095 59067036 : double ov = md[0]*p_x+md[1]*p_y+md[2]*p_z;
1096 59067036 : return ov;
1097 : }
1098 :
1099 16353 : void EMMI::update_neighbor_list()
1100 : {
1101 : // dimension of GMM and atom vectors
1102 16353 : unsigned GMM_d_size = GMM_d_m_.size();
1103 16353 : unsigned GMM_m_size = GMM_m_type_.size();
1104 : // local neighbor list
1105 : vector < unsigned > nl_l;
1106 : // clear old neighbor list
1107 : nl_.clear();
1108 :
1109 : // cycle on GMM components - in parallel
1110 114471 : for(unsigned id=rank_; id<GMM_d_size; id+=size_) {
1111 : // overlap lists and map
1112 : vector<double> ov_l;
1113 : map<double, unsigned> ov_m;
1114 : // total overlap with id
1115 : double ov_tot = 0.0;
1116 : // cycle on all atoms
1117 59165154 : for(unsigned im=0; im<GMM_m_size; ++im) {
1118 : // get index in auxiliary lists
1119 118134072 : unsigned kaux = GMM_m_type_[im] * GMM_d_size + id;
1120 : // calculate exponent of overlap
1121 236268144 : double expov = get_exp_overlap(GMM_d_m_[id], getPosition(im), inv_cov_md_[kaux]);
1122 : // get index of 0.5*expov in tabulated exponential
1123 59067036 : unsigned itab = static_cast<unsigned> (round( 0.5*expov/dexp_ ));
1124 : // check boundaries and skip atom in case
1125 173080152 : if(itab >= tab_exp_.size()) continue;
1126 : // in case calculate overlap
1127 8241912 : double ov = pre_fact_[kaux] * tab_exp_[itab];
1128 : // add to list
1129 4120956 : ov_l.push_back(ov);
1130 : // and map to retrieve atom index
1131 4120956 : ov_m[ov] = im;
1132 : // increase ov_tot
1133 4120956 : ov_tot += ov;
1134 : }
1135 : // check if zero size -> ov_tot = 0
1136 98118 : if(ov_l.size()==0) continue;
1137 : // define cutoff
1138 98118 : double ov_cut = ov_tot * nl_cutoff_;
1139 : // sort ov_l in ascending order
1140 98118 : std::sort(ov_l.begin(), ov_l.end());
1141 : // integrate ov_l
1142 : double res = 0.0;
1143 4273584 : for(unsigned i=0; i<ov_l.size(); ++i) {
1144 2136792 : res += ov_l[i];
1145 : // if exceeding the cutoff for overlap, stop
1146 2136792 : if(res >= ov_cut) break;
1147 : else ov_m.erase(ov_l[i]);
1148 : }
1149 : // now add atoms to neighborlist
1150 2278518 : for(map<double, unsigned>::iterator it=ov_m.begin(); it!=ov_m.end(); ++it)
1151 4164564 : nl_l.push_back(id*GMM_m_size+it->second);
1152 : // end cycle on GMM components in parallel
1153 : }
1154 : // find total dimension of neighborlist
1155 16353 : vector <int> recvcounts(size_, 0);
1156 32706 : recvcounts[rank_] = nl_l.size();
1157 32706 : comm.Sum(&recvcounts[0], size_);
1158 : int tot_size = accumulate(recvcounts.begin(), recvcounts.end(), 0);
1159 : // resize neighbor stuff
1160 16353 : nl_.resize(tot_size);
1161 : // calculate vector of displacement
1162 16353 : vector<int> disp(size_);
1163 16353 : disp[0] = 0;
1164 : int rank_size = 0;
1165 43608 : for(unsigned i=0; i<size_-1; ++i) {
1166 21804 : rank_size += recvcounts[i];
1167 21804 : disp[i+1] = rank_size;
1168 : }
1169 : // Allgather neighbor list
1170 49059 : comm.Allgatherv(&nl_l[0], recvcounts[rank_], &nl_[0], &recvcounts[0], &disp[0]);
1171 : // now resize derivatives
1172 16353 : ovmd_der_.resize(tot_size);
1173 16353 : }
1174 :
1175 18 : void EMMI::prepare()
1176 : {
1177 18 : if(getExchangeStep()) first_time_=true;
1178 18 : }
1179 :
1180 : // overlap calculator
1181 16353 : void EMMI::calculate_overlap() {
1182 :
1183 16353 : if(first_time_ || getExchangeStep() || getStep()%nl_stride_==0) {
1184 16353 : update_neighbor_list();
1185 16353 : first_time_=false;
1186 : }
1187 :
1188 : // clean temporary vectors
1189 310707 : for(unsigned i=0; i<ovmd_.size(); ++i) ovmd_[i] = 0.0;
1190 6263199 : for(unsigned i=0; i<ovmd_der_.size(); ++i) ovmd_der_[i] = Vector(0,0,0);
1191 :
1192 : // we have to cycle over all model and data GMM components in the neighbor list
1193 16353 : unsigned GMM_d_size = GMM_d_m_.size();
1194 16353 : unsigned GMM_m_size = GMM_m_type_.size();
1195 4197270 : for(unsigned i=rank_; i<nl_.size(); i=i+size_) {
1196 : // get data (id) and atom (im) indexes
1197 2082282 : unsigned id = nl_[i] / GMM_m_size;
1198 2082282 : unsigned im = nl_[i] % GMM_m_size;
1199 : // get index in auxiliary lists
1200 4164564 : unsigned kaux = GMM_m_type_[im] * GMM_d_size + id;
1201 : // add overlap with im component of model GMM
1202 4164564 : ovmd_[id] += get_overlap(GMM_d_m_[id], getPosition(im), pre_fact_[kaux],
1203 6246846 : inv_cov_md_[kaux], ovmd_der_[i]);
1204 : }
1205 : // communicate stuff
1206 16353 : if(size_>1) {
1207 10902 : comm.Sum(&ovmd_[0], ovmd_.size());
1208 10902 : comm.Sum(&ovmd_der_[0][0], 3*ovmd_der_.size());
1209 : }
1210 16353 : }
1211 :
1212 0 : double EMMI::scaleEnergy(double s)
1213 : {
1214 : double ene = 0.0;
1215 0 : for(unsigned i=0; i<ovdd_.size(); ++i) {
1216 0 : ene += std::log( abs ( s * ovmd_[i] - ovdd_[i] ) );
1217 : }
1218 0 : return ene;
1219 : }
1220 :
1221 0 : double EMMI::doRegression()
1222 : {
1223 : // standard MC parameters
1224 : unsigned MCsteps = 100000;
1225 : double kbtmin = 1.0;
1226 : double kbtmax = 10.0;
1227 : unsigned ncold = 5000;
1228 : unsigned nhot = 2000;
1229 : double MCacc = 0.0;
1230 : double kbt, ebest, scale_best;
1231 :
1232 : // initial value of scale factor and energy
1233 0 : double scale = random_.RandU01() * ( scale_max_ - scale_min_ ) + scale_min_;
1234 0 : double ene = scaleEnergy(scale);
1235 : // set best energy
1236 0 : ebest = ene;
1237 :
1238 : // MC loop
1239 0 : for(unsigned istep=0; istep<MCsteps; ++istep) {
1240 : // get temperature
1241 0 : if(istep%(ncold+nhot)<ncold) kbt = kbtmin;
1242 : else kbt = kbtmax;
1243 : // propose move in scale
1244 0 : double ds = dscale_ * ( 2.0 * random_.RandU01() - 1.0 );
1245 0 : double new_scale = scale + ds;
1246 : // check boundaries
1247 0 : if(new_scale > scale_max_) {new_scale = 2.0 * scale_max_ - new_scale;}
1248 0 : if(new_scale < scale_min_) {new_scale = 2.0 * scale_min_ - new_scale;}
1249 : // new energy
1250 0 : double new_ene = scaleEnergy(new_scale);
1251 : // accept or reject
1252 0 : bool accept = doAccept(ene, new_ene, kbt);
1253 : // in case of acceptance
1254 0 : if(accept) {
1255 : scale = new_scale;
1256 : ene = new_ene;
1257 0 : MCacc += 1.0;
1258 : }
1259 : // save best
1260 0 : if(ene<ebest) {
1261 0 : ebest = ene;
1262 0 : scale_best = scale;
1263 : }
1264 : }
1265 : // calculate acceptance
1266 0 : double accscale = MCacc / static_cast<double>(MCsteps);
1267 : // global communication
1268 0 : if(!no_aver_ && nrep_>1) {
1269 0 : if(replica_!=0) {
1270 0 : scale_best = 0.0;
1271 0 : ebest = 0.0;
1272 0 : accscale = 0.0;
1273 : }
1274 0 : if(rank_==0) {
1275 0 : multi_sim_comm.Sum(&scale_best, 1);
1276 0 : multi_sim_comm.Sum(&ebest, 1);
1277 0 : multi_sim_comm.Sum(&accscale, 1);
1278 : }
1279 : }
1280 : // local communication
1281 0 : if(rank_!=0) {
1282 0 : scale_best = 0.0;
1283 0 : ebest = 0.0;
1284 0 : accscale = 0.0;
1285 : }
1286 0 : if(size_>1) {
1287 0 : comm.Sum(&scale_best, 1);
1288 0 : comm.Sum(&ebest, 1);
1289 0 : comm.Sum(&accscale, 1);
1290 : }
1291 : // set scale parameters
1292 0 : getPntrToComponent("accscale")->set(accscale);
1293 0 : getPntrToComponent("enescale")->set(ebest);
1294 : // return scale value
1295 0 : return scale_best;
1296 : }
1297 :
1298 0 : double EMMI::get_annealing(long int step)
1299 : {
1300 : // default no annealing
1301 : double fact = 1.0;
1302 : // position in annealing cycle
1303 0 : unsigned nc = step%(4*nanneal_);
1304 : // useful doubles
1305 0 : double ncd = static_cast<double>(nc);
1306 0 : double nn = static_cast<double>(nanneal_);
1307 : // set fact
1308 0 : if(nc>=nanneal_ && nc<2*nanneal_) fact = (kanneal_-1.0) / nn * ( ncd - nn ) + 1.0;
1309 0 : if(nc>=2*nanneal_ && nc<3*nanneal_) fact = kanneal_;
1310 0 : if(nc>=3*nanneal_) fact = (1.0-kanneal_) / nn * ( ncd - 3.0*nn) + kanneal_;
1311 0 : return fact;
1312 : }
1313 :
1314 16353 : void EMMI::calculate()
1315 : {
1316 :
1317 : // calculate CV
1318 16353 : calculate_overlap();
1319 :
1320 : // rescale factor for ensemble average
1321 16353 : double escale = 1.0 / static_cast<double>(nrep_);
1322 :
1323 : // in case of ensemble averaging, calculate average overlap
1324 16353 : if(!no_aver_ && nrep_>1) {
1325 : // if master node, calculate average across replicas
1326 0 : if(rank_==0) {
1327 0 : multi_sim_comm.Sum(&ovmd_[0], ovmd_.size());
1328 0 : for(unsigned i=0; i<ovmd_.size(); ++i) ovmd_[i] *= escale;
1329 : } else {
1330 0 : for(unsigned i=0; i<ovmd_.size(); ++i) ovmd_[i] = 0.0;
1331 : }
1332 : // local communication
1333 0 : if(size_>1) comm.Sum(&ovmd_[0], ovmd_.size());
1334 : }
1335 :
1336 : // get time step
1337 16353 : long int step = getStep();
1338 :
1339 : // do regression
1340 16353 : if(nregres_>0) {
1341 0 : if(step%nregres_==0 && !getExchangeStep()) scale_ = doRegression();
1342 : // set scale component
1343 0 : getPntrToComponent("scale")->set(scale_);
1344 : }
1345 :
1346 : // write model overlap to file
1347 16353 : if(ovstride_>0 && step%ovstride_==0) write_model_overlap(step);
1348 :
1349 : // clear energy and virial
1350 16353 : ene_ = 0.0;
1351 16353 : virial_.zero();
1352 :
1353 : // Gaussian noise
1354 16353 : if(noise_==0) calculate_Gauss();
1355 :
1356 : // Outliers noise
1357 16353 : if(noise_==1) calculate_Outliers();
1358 :
1359 : // Marginal noise
1360 16353 : if(noise_==2) calculate_Marginal();
1361 :
1362 : // get annealing rescale factor
1363 16353 : if(nanneal_>0) {
1364 0 : anneal_ = get_annealing(step);
1365 0 : getPntrToComponent("anneal")->set(anneal_);
1366 : }
1367 :
1368 : // annealing rescale
1369 16353 : ene_ /= anneal_;
1370 :
1371 : // in case of ensemble averaging
1372 16353 : if(!no_aver_ && nrep_>1) {
1373 : // if master node, sum der_GMMid derivatives and ene
1374 0 : if(rank_==0) {
1375 0 : multi_sim_comm.Sum(&GMMid_der_[0], GMMid_der_.size());
1376 0 : multi_sim_comm.Sum(&ene_, 1);
1377 : } else {
1378 : // set der_GMMid derivatives and energy to zero
1379 0 : for(unsigned i=0; i<GMMid_der_.size(); ++i) GMMid_der_[i]=0.0;
1380 0 : ene_ = 0.0;
1381 : }
1382 : // local communication
1383 0 : if(size_>1) {
1384 0 : comm.Sum(&GMMid_der_[0], GMMid_der_.size());
1385 0 : comm.Sum(&ene_, 1);
1386 : }
1387 : }
1388 :
1389 : // clean temporary vector
1390 19705365 : for(unsigned i=0; i<atom_der_.size(); ++i) atom_der_[i] = Vector(0,0,0);
1391 :
1392 : // get derivatives of bias with respect to atoms
1393 4197270 : for(unsigned i=rank_; i<nl_.size(); i=i+size_) {
1394 : // get indexes of data and model component
1395 4164564 : unsigned id = nl_[i] / GMM_m_type_.size();
1396 2082282 : unsigned im = nl_[i] % GMM_m_type_.size();
1397 : // chain rule + replica normalization
1398 6246846 : Vector tot_der = GMMid_der_[id] * ovmd_der_[i] * escale * scale_ / anneal_;
1399 2082282 : Vector pos;
1400 2082282 : if(pbc_) pos = pbcDistance(GMM_d_m_[id], getPosition(im)) + GMM_d_m_[id];
1401 4164564 : else pos = getPosition(im);
1402 : // increment derivatives and virial
1403 4164564 : atom_der_[im] += tot_der;
1404 2082282 : virial_ += Tensor(pos, -tot_der);
1405 : }
1406 :
1407 : // communicate local derivatives and virial
1408 16353 : if(size_>1) {
1409 10902 : comm.Sum(&atom_der_[0][0], 3*atom_der_.size());
1410 10902 : comm.Sum(virial_);
1411 : }
1412 :
1413 : // set derivatives, virial, and score
1414 29549871 : for(unsigned i=0; i<atom_der_.size(); ++i) setAtomsDerivatives(getPntrToComponent("scoreb"), i, atom_der_[i]);
1415 32706 : setBoxDerivatives(getPntrToComponent("scoreb"), virial_);
1416 32706 : getPntrToComponent("scoreb")->set(ene_);
1417 :
1418 : // This part is needed only for Gaussian and Outliers noise models
1419 16353 : if(noise_!=2) {
1420 :
1421 : // do Montecarlo
1422 10902 : if(dsigma_[0]>0 && step%MCstride_==0 && !getExchangeStep()) doMonteCarlo();
1423 :
1424 : // print status
1425 10902 : if(step%statusstride_==0) print_status(step);
1426 :
1427 : // calculate acceptance ratio
1428 10902 : double acc = MCaccept_ / MCtrials_;
1429 :
1430 : // set value
1431 21804 : getPntrToComponent("acc")->set(acc);
1432 :
1433 : }
1434 :
1435 16353 : }
1436 :
1437 5451 : void EMMI::calculate_Gauss()
1438 : {
1439 : // cycle on all the GMM groups
1440 21804 : for(unsigned i=0; i<GMM_d_grps_.size(); ++i) {
1441 : double eneg = 0.0;
1442 : // cycle on all the members of the group
1443 103569 : for(unsigned j=0; j<GMM_d_grps_[i].size(); ++j) {
1444 : // id of the GMM component
1445 49059 : int GMMid = GMM_d_grps_[i][j];
1446 : // calculate deviation
1447 196236 : double dev = ( scale_*ovmd_[GMMid]-ovdd_[GMMid] ) / sigma_[i];
1448 : // add to group energy
1449 49059 : eneg += 0.5 * dev * dev;
1450 : // store derivative for later
1451 49059 : GMMid_der_[GMMid] = kbt_ * dev / sigma_[i];
1452 : }
1453 : // add to total energy along with normalizations and prior
1454 10902 : ene_ += kbt_ * ( eneg + (static_cast<double>(GMM_d_grps_[i].size())+prior_) * std::log(sigma_[i]) );
1455 : }
1456 5451 : }
1457 :
1458 5451 : void EMMI::calculate_Outliers()
1459 : {
1460 : // cycle on all the GMM groups
1461 21804 : for(unsigned i=0; i<GMM_d_grps_.size(); ++i) {
1462 : // cycle on all the members of the group
1463 : double eneg = 0.0;
1464 103569 : for(unsigned j=0; j<GMM_d_grps_[i].size(); ++j) {
1465 : // id of the GMM component
1466 49059 : int GMMid = GMM_d_grps_[i][j];
1467 : // calculate deviation
1468 196236 : double dev = ( scale_*ovmd_[GMMid]-ovdd_[GMMid] ) / sigma_[i];
1469 : // add to group energy
1470 49059 : eneg += std::log( 1.0 + 0.5 * dev * dev );
1471 : // store derivative for later
1472 98118 : GMMid_der_[GMMid] = kbt_ / ( 1.0 + 0.5 * dev * dev ) * dev / sigma_[i];
1473 : }
1474 : // add to total energy along with normalizations and prior
1475 10902 : ene_ += kbt_ * ( eneg + (static_cast<double>(GMM_d_grps_[i].size())+prior_) * std::log(sigma_[i]) );
1476 : }
1477 5451 : }
1478 :
1479 5451 : void EMMI::calculate_Marginal()
1480 : {
1481 : // cycle on all the GMM groups
1482 21804 : for(unsigned i=0; i<GMM_d_grps_.size(); ++i) {
1483 : // cycle on all the members of the group
1484 103569 : for(unsigned j=0; j<GMM_d_grps_[i].size(); ++j) {
1485 : // id of the GMM component
1486 49059 : int GMMid = GMM_d_grps_[i][j];
1487 : // calculate deviation
1488 147177 : double dev = ( scale_*ovmd_[GMMid]-ovdd_[GMMid] );
1489 : // calculate errf
1490 98118 : double errf = erf ( dev * inv_sqrt2_ / sigma_min_[i] );
1491 : // add to group energy
1492 49059 : ene_ += -kbt_ * std::log ( 0.5 / dev * errf ) ;
1493 : // store derivative for later
1494 147177 : GMMid_der_[GMMid] = - kbt_/errf*sqrt2_pi_*exp(-0.5*dev*dev/sigma_min_[i]/sigma_min_[i])/sigma_min_[i]+kbt_/dev;
1495 : }
1496 : }
1497 5451 : }
1498 :
1499 : }
1500 5874 : }
|