IsoSpec  2.2.1
fixedEnvelopes.cpp
1 /*
2  * Copyright (C) 2015-2020 Mateusz Łącki and Michał Startek.
3  *
4  * This file is part of IsoSpec.
5  *
6  * IsoSpec is free software: you can redistribute it and/or modify
7  * it under the terms of the Simplified ("2-clause") BSD licence.
8  *
9  * IsoSpec is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12  *
13  * You should have received a copy of the Simplified BSD Licence
14  * along with IsoSpec. If not, see <https://opensource.org/licenses/BSD-2-Clause>.
15  */
16 
17 #include "fixedEnvelopes.h"
18 #include <limits>
19 #include "isoMath.h"
20 
21 namespace IsoSpec
22 {
23 
24 FixedEnvelope::FixedEnvelope(const FixedEnvelope& other) :
25 _masses(array_copy<double>(other._masses, other._confs_no)),
26 _probs(array_copy<double>(other._probs, other._confs_no)),
27 _confs(array_copy_nptr<int>(other._confs, other._confs_no*other.allDim)),
28 _confs_no(other._confs_no),
29 allDim(other.allDim),
30 sorted_by_mass(other.sorted_by_mass),
31 sorted_by_prob(other.sorted_by_prob),
32 total_prob(other.total_prob)
33 {}
34 
35 FixedEnvelope::FixedEnvelope(FixedEnvelope&& other) :
36 _masses(other._masses),
37 _probs(other._probs),
38 _confs(other._confs),
39 _confs_no(other._confs_no),
40 allDim(other.allDim),
41 sorted_by_mass(other.sorted_by_mass),
42 sorted_by_prob(other.sorted_by_prob),
43 total_prob(other.total_prob)
44 {
45 other._masses = nullptr;
46 other._probs = nullptr;
47 other._confs = nullptr;
48 other._confs_no = 0;
49 other.total_prob = 0.0;
50 }
51 
52 FixedEnvelope::FixedEnvelope(double* in_masses, double* in_probs, size_t in_confs_no, bool masses_sorted, bool probs_sorted, double _total_prob) :
53 _masses(in_masses),
54 _probs(in_probs),
55 _confs(nullptr),
56 _confs_no(in_confs_no),
57 allDim(0),
58 sorted_by_mass(masses_sorted),
59 sorted_by_prob(probs_sorted),
60 total_prob(_total_prob)
61 {}
62 
63 FixedEnvelope FixedEnvelope::operator+(const FixedEnvelope& other) const
64 {
65  double* nprobs = reinterpret_cast<double*>(malloc(sizeof(double) * (_confs_no+other._confs_no)));
66  if(nprobs == nullptr)
67  throw std::bad_alloc();
68  double* nmasses = reinterpret_cast<double*>(malloc(sizeof(double) * (_confs_no+other._confs_no)));
69  if(nmasses == nullptr)
70  {
71  free(nprobs);
72  throw std::bad_alloc();
73  }
74 
75  memcpy(nprobs, _probs, sizeof(double) * _confs_no);
76  memcpy(nmasses, _masses, sizeof(double) * _confs_no);
77 
78  memcpy(nprobs+_confs_no, other._probs, sizeof(double) * other._confs_no);
79  memcpy(nmasses+_confs_no, other._masses, sizeof(double) * other._confs_no);
80 
81  return FixedEnvelope(nmasses, nprobs, _confs_no + other._confs_no);
82 }
83 
84 FixedEnvelope FixedEnvelope::operator*(const FixedEnvelope& other) const
85 {
86  double* nprobs = reinterpret_cast<double*>(malloc(sizeof(double) * _confs_no * other._confs_no));
87  if(nprobs == nullptr)
88  throw std::bad_alloc();
89  // deepcode ignore CMemoryLeak: It's not a memleak: the memory is passed to FixedEnvelope which
90  // deepcode ignore CMemoryLeak: takes ownership of it, and will properly free() it in destructor.
91  double* nmasses = reinterpret_cast<double*>(malloc(sizeof(double) * _confs_no * other._confs_no));
92  if(nmasses == nullptr)
93  {
94  free(nprobs);
95  throw std::bad_alloc();
96  }
97 
98  size_t tgt_idx = 0;
99 
100  for(size_t ii = 0; ii < _confs_no; ii++)
101  for(size_t jj = 0; jj < other._confs_no; jj++)
102  {
103  nprobs[tgt_idx] = _probs[ii] * other._probs[jj];
104  nmasses[tgt_idx] = _masses[ii] + other._masses[jj];
105  tgt_idx++;
106  }
107 
108  return FixedEnvelope(nmasses, nprobs, tgt_idx);
109 }
110 
111 void FixedEnvelope::sort_by_mass()
112 {
113  if(sorted_by_mass)
114  return;
115 
116  sort_by(_masses);
117 
118  sorted_by_mass = true;
119  sorted_by_prob = false;
120 }
121 
122 
123 void FixedEnvelope::sort_by_prob()
124 {
125  if(sorted_by_prob)
126  return;
127 
128  sort_by(_probs);
129 
130  sorted_by_prob = true;
131  sorted_by_mass = false;
132 }
133 
134 template<typename T> void reorder_array(T* arr, size_t* order, size_t size, bool can_destroy = false)
135 {
136  if(!can_destroy)
137  {
138  size_t* order_c = new size_t[size];
139  memcpy(order_c, order, sizeof(size_t)*size);
140  order = order_c;
141  }
142 
143  for(size_t ii = 0; ii < size; ii++)
144  while(order[ii] != ii)
145  {
146  std::swap(arr[ii], arr[order[ii]]);
147  std::swap(order[order[ii]], order[ii]);
148  }
149 
150  if(!can_destroy)
151  delete[] order;
152 }
153 
154 void FixedEnvelope::sort_by(double* order)
155 {
156  size_t* indices = new size_t[_confs_no];
157 
158  if(_confs_no <= 1)
159  return;
160 
161  for(size_t ii = 0; ii < _confs_no; ii++)
162  indices[ii] = ii;
163 
164  std::sort<size_t*>(indices, indices + _confs_no, TableOrder<double>(order));
165 
166  size_t* inverse = new size_t[_confs_no];
167 
168  for(size_t ii = 0; ii < _confs_no; ii++)
169  inverse[indices[ii]] = ii;
170 
171  delete[] indices;
172 
173  reorder_array(_masses, inverse, _confs_no);
174  reorder_array(_probs, inverse, _confs_no, _confs == nullptr);
175  if(_confs != nullptr)
176  {
177  int* swapspace = new int[allDim];
178  for(size_t ii = 0; ii < _confs_no; ii++)
179  while(inverse[ii] != ii)
180  {
181  memcpy(swapspace, &_confs[ii*allDim], allDimSizeofInt);
182  memcpy(&_confs[ii*allDim], &_confs[inverse[ii]*allDim], allDimSizeofInt);
183  memcpy(&_confs[inverse[ii]*allDim], swapspace, allDimSizeofInt);
184  std::swap(inverse[inverse[ii]], inverse[ii]);
185  }
186  delete[] swapspace;
187  }
188  delete[] inverse;
189 }
190 
191 
192 double FixedEnvelope::get_total_prob()
193 {
194  if(std::isnan(total_prob))
195  {
196  total_prob = 0.0;
197  for(size_t ii = 0; ii < _confs_no; ii++)
198  total_prob += _probs[ii];
199  }
200  return total_prob;
201 }
202 
203 void FixedEnvelope::scale(double factor)
204 {
205  for(size_t ii = 0; ii < _confs_no; ii++)
206  _probs[ii] *= factor;
207  total_prob *= factor;
208 }
209 
210 void FixedEnvelope::normalize()
211 {
212  double tp = get_total_prob();
213  if(tp != 1.0)
214  {
215  scale(1.0/tp);
216  total_prob = 1.0;
217  }
218 }
219 
220 void FixedEnvelope::shift_mass(double value)
221 {
222  for(size_t ii = 0; ii < _confs_no; ii++)
223  _masses[ii] += value;
224 }
225 
226 void FixedEnvelope::resample(size_t samples, double beta_bias)
227 {
228  if(_confs_no == 0)
229  throw std::logic_error("Resample called on an empty spectrum");
230 
231  double pprob = 0.0;
232  double cprob = 0.0;
233  size_t pidx = -1; // Overflows - but it doesn't matter.
234 
235  _probs[_confs_no-1] = (std::numeric_limits<double>::max)();
236 
237  while(samples > 0)
238  {
239  pprob += _probs[++pidx];
240  _probs[pidx] = 0.0;
241  double covered_part = (pprob - cprob) / (1.0 - cprob);
242  while(samples * covered_part < beta_bias && samples > 0)
243  {
244  cprob += rdvariate_beta_1_b(samples) * (1.0 - cprob);
245  while(pprob < cprob)
246  {
247  pprob += _probs[++pidx];
248  _probs[pidx] = 0.0;
249  }
250  _probs[pidx] += 1.0;
251  samples--;
252  covered_part = (pprob - cprob) / (1.0 - cprob);
253  }
254  if(samples <= 0)
255  break;
256  size_t nrtaken = rdvariate_binom(samples, covered_part);
257  _probs[pidx] += static_cast<double>(nrtaken);
258  samples -= nrtaken;
259  cprob = pprob;
260  }
261 
262  pidx++;
263  memset(_probs + pidx, 0, sizeof(double)*(_confs_no - pidx));
264 }
265 
266 FixedEnvelope FixedEnvelope::LinearCombination(const std::vector<const FixedEnvelope*>& spectra, const std::vector<double>& intensities)
267 {
268  return LinearCombination(spectra.data(), intensities.data(), spectra.size());
269 }
270 
271 FixedEnvelope FixedEnvelope::LinearCombination(const FixedEnvelope* const * spectra, const double* intensities, size_t size)
272 {
273  size_t ret_size = 0;
274  for(size_t ii = 0; ii < size; ii++)
275  ret_size += spectra[ii]->_confs_no;
276 
277  double* newprobs = reinterpret_cast<double*>(malloc(sizeof(double)*ret_size));
278  if(newprobs == nullptr)
279  throw std::bad_alloc();
280  double* newmasses = reinterpret_cast<double*>(malloc(sizeof(double)*ret_size));
281  if(newmasses == nullptr)
282  {
283  free(newprobs);
284  throw std::bad_alloc();
285  }
286 
287  size_t cntr = 0;
288  for(size_t ii = 0; ii < size; ii++)
289  {
290  double mul = intensities[ii];
291  for(size_t jj = 0; jj < spectra[ii]->_confs_no; jj++)
292  newprobs[jj+cntr] = spectra[ii]->_probs[jj] * mul;
293  memcpy(newmasses + cntr, spectra[ii]->_masses, sizeof(double) * spectra[ii]->_confs_no);
294  cntr += spectra[ii]->_confs_no;
295  }
296  return FixedEnvelope(newmasses, newprobs, cntr);
297 }
298 
299 double FixedEnvelope::WassersteinDistance(FixedEnvelope& other)
300 {
301  double ret = 0.0;
302  if((get_total_prob()*0.999 > other.get_total_prob()) || (other.get_total_prob() > get_total_prob()*1.001))
303  throw std::logic_error("Spectra must be normalized before computing Wasserstein Distance");
304 
305  if(_confs_no == 0 || other._confs_no == 0)
306  return 0.0;
307 
308  sort_by_mass();
309  other.sort_by_mass();
310 
311  size_t idx_this = 0;
312  size_t idx_other = 0;
313 
314  double acc_prob = 0.0;
315  double last_point = 0.0;
316 
317 
318  while(idx_this < _confs_no && idx_other < other._confs_no)
319  {
320  if(_masses[idx_this] < other._masses[idx_other])
321  {
322  ret += (_masses[idx_this] - last_point) * std::abs(acc_prob);
323  acc_prob += _probs[idx_this];
324  last_point = _masses[idx_this];
325  idx_this++;
326  }
327  else
328  {
329  ret += (other._masses[idx_other] - last_point) * std::abs(acc_prob);
330  acc_prob -= other._probs[idx_other];
331  last_point = other._masses[idx_other];
332  idx_other++;
333  }
334  }
335 
336  acc_prob = std::abs(acc_prob);
337 
338  while(idx_this < _confs_no)
339  {
340  ret += (_masses[idx_this] - last_point) * acc_prob;
341  acc_prob -= _probs[idx_this];
342  last_point = _masses[idx_this];
343  idx_this++;
344  }
345 
346  while(idx_other < other._confs_no)
347  {
348  ret += (other._masses[idx_other] - last_point) * acc_prob;
349  acc_prob -= other._probs[idx_other];
350  last_point = other._masses[idx_other];
351  idx_other++;
352  }
353 
354  return ret;
355 }
356 
357 
358 double FixedEnvelope::OrientedWassersteinDistance(FixedEnvelope& other)
359 {
360  double ret = 0.0;
361  if((get_total_prob()*0.999 > other.get_total_prob()) || (other.get_total_prob() > get_total_prob()*1.001))
362  throw std::logic_error("Spectra must be normalized before computing Wasserstein Distance");
363 
364  if(_confs_no == 0 || other._confs_no == 0)
365  return 0.0;
366 
367  sort_by_mass();
368  other.sort_by_mass();
369 
370  size_t idx_this = 0;
371  size_t idx_other = 0;
372 
373  double acc_prob = 0.0;
374  double last_point = 0.0;
375 
376 
377  while(idx_this < _confs_no && idx_other < other._confs_no)
378  {
379  if(_masses[idx_this] < other._masses[idx_other])
380  {
381  ret += (_masses[idx_this] - last_point) * acc_prob;
382  acc_prob += _probs[idx_this];
383  last_point = _masses[idx_this];
384  idx_this++;
385  }
386  else
387  {
388  ret += (other._masses[idx_other] - last_point) * acc_prob;
389  acc_prob -= other._probs[idx_other];
390  last_point = other._masses[idx_other];
391  idx_other++;
392  }
393  }
394 
395  while(idx_this < _confs_no)
396  {
397  ret += (_masses[idx_this] - last_point) * acc_prob;
398  acc_prob -= _probs[idx_this];
399  last_point = _masses[idx_this];
400  idx_this++;
401  }
402 
403  while(idx_other < other._confs_no)
404  {
405  ret += (other._masses[idx_other] - last_point) * acc_prob;
406  acc_prob -= other._probs[idx_other];
407  last_point = other._masses[idx_other];
408  idx_other++;
409  }
410 
411  return ret;
412 }
413 
414 double FixedEnvelope::AbyssalWassersteinDistance(FixedEnvelope& other, double abyss_depth, double other_scale)
415 {
416  sort_by_mass();
417  other.sort_by_mass();
418 
419  std::vector<std::pair<double, double>> carried;
420 
421  size_t idx_this = 0;
422  size_t idx_other = 0;
423 
424  //std::cout << "AAA" << std::endl;
425 
426  auto finished = [&]() -> bool { return idx_this >= _confs_no && idx_other >= other._confs_no; };
427  auto next = [&]() -> std::pair<double, double> {
428  if(idx_this >= _confs_no || (idx_other < other._confs_no && _masses[idx_this] > other._masses[idx_other]))
429  {
430  std::pair<double, double> res = std::pair<double, double>(other._masses[idx_other], other._probs[idx_other]*other_scale);
431  idx_other++;
432  return res;
433  }
434  else
435  {
436  std::pair<double, double> res = std::pair<double, double>(_masses[idx_this], -_probs[idx_this]);
437  idx_this++;
438  return res;
439  }
440  };
441  double accd = 0.0;
442  double condemned = 0.0;
443 
444  while(!finished())
445  {
446  auto [m, p] = next();
447  if(!carried.empty() && carried[0].second * p > 0.0)
448  {
449  carried.emplace_back(m, p);
450  continue;
451  }
452 
453  while(!carried.empty())
454  {
455  auto& [cm, cp] = carried.back();
456  if(m - cm >= abyss_depth)
457  {
458  for(auto it = carried.cbegin(); it != carried.cend(); it++)
459  condemned += fabs(it->second);
460  carried.clear();
461  break;
462  }
463  if((cp+p)*p > 0.0)
464  {
465  accd += fabs((m-cm)*cp);
466  p += cp;
467  carried.pop_back();
468  }
469  else
470  {
471  accd += fabs((m-cm)*p);
472  cp += p;
473  p = 0.0;
474  break;
475  }
476  }
477  if(p != 0.0)
478  carried.emplace_back(m, p);
479  //std::cout << m << " " << p << std::endl;
480  }
481 
482  for(auto it = carried.cbegin(); it != carried.cend(); it++)
483  condemned += fabs(it->second);
484 
485  return accd + condemned * abyss_depth * 0.5;
486 }
487 
488 #if 0
489 double FixedEnvelope::ScaledAbyssalWassersteinDistance(FixedEnvelope * const * others, double abyss_depth, const double* other_scales, const size_t N)
490 {
491  sort_by_mass();
492 
493  std::priority_queue<std::pair<double, size_t>> PQ;
494  std::unique_ptr<size_t[]> indices = std::make_unique<size_t[]>(N);
495  memset(indices.get(), 0, sizeof(size_t)*N);
496 
497  for(size_t ii=0; ii<N; ii++)
498  {
499  others[ii]->sort_by_mass();
500  if(others[ii]->_confs_no > 0)
501  PQ.emplace_back({others._probs[0], ii});
502  }
503 
504 
505 
506 
507  std::vector<std::pair<double, double>> carried;
508 
509  size_t idx_this = 0;
510  size_t idx_other = 0;
511 
512  //std::cout << "AAA" << std::endl;
513 
514  auto finished = [&]() -> bool { return idx_this >= _confs_no && PQ.empty(); };
515  auto next = [&]() -> std::tuple<double, double, size_t> {
516  if(idx_this >= _confs_no || (idx_other < other._confs_no && _masses[idx_this] > other._masses[idx_other]))
517  {
518  std::pair<double, double> res = std::pair<double, double>(other._masses[idx_other], other._probs[idx_other]*other_scale);
519  idx_other++;
520  return res;
521  }
522  else
523  {
524  std::pair<double, double> res = std::pair<double, double>(_masses[idx_this], -_probs[idx_this]);
525  idx_this++;
526  return res;
527  }
528  };
529  double accd = 0.0;
530  double condemned = 0.0;
531 
532  while(!finished())
533  {
534  auto [m, p] = next();
535  if(!carried.empty() && carried[0].second * p > 0.0)
536  {
537  carried.emplace_back(m, p);
538  continue;
539  }
540 
541  while(!carried.empty())
542  {
543  auto& [cm, cp] = carried.back();
544  if(m - cm >= abyss_depth)
545  {
546  for(auto it = carried.cbegin(); it != carried.cend(); it++)
547  condemned += fabs(it->second);
548  carried.clear();
549  break;
550  }
551  if((cp+p)*p > 0.0)
552  {
553  accd += fabs((m-cm)*cp);
554  p += cp;
555  carried.pop_back();
556  }
557  else
558  {
559  accd += fabs((m-cm)*p);
560  cp += p;
561  p = 0.0;
562  break;
563  }
564  }
565  if(p != 0.0)
566  carried.emplace_back(m, p);
567  //std::cout << m << " " << p << std::endl;
568  }
569 
570  for(auto it = carried.cbegin(); it != carried.cend(); it++)
571  condemned += fabs(it->second);
572 
573  return accd + condemned * abyss_depth * 0.5;
574 }
575 
576 #endif
577 
578 #if 0
579 double AbyssalWassersteinDistanceGrad(FixedEnvelope* const* envelopes, const double* scales, double* ret_gradient, size_t N, double abyss_depth_exp, double abyss_depth_the)
580 {
581 return 0.0;
582  std::unique_ptr<size_t[]> env_idx = std::make_unique<size_t[]>(N+1);
583  memset(env_idx.get(), 0, (N+1)*sizeof(size_t));
584  memset(ret_gradient, 0, (N+1)*sizeof(double));
585 
586  auto pq_cmp = [](std::pair<double, size_t>& p1, std::pair<double, size_t>& p2) { return p1.first > p2.first; };
587  std::priority_queue<std::pair<double, size_t>, std::vector<std::pair<double, size_t>>, decltype(pq_cmp)> PQ(pq_cmp);
588 
589  for(size_t ii=0; ii<=N; ii++)
590  {
591  envelopes[ii]->sort_by_mass();
592  if(envelopes[ii]->_confs_no > 0)
593  {
594  std::cout << "Initial push: " << envelopes[ii]->_masses[0] << " " << ii << '\n';
595  PQ.push({envelopes[ii]->_masses[0], ii});
596  }
597  }
598 
599  std::vector<std::tuple<double, double, size_t>> carried;
600 
601  auto next = [&]() -> std::tuple<double, double, size_t> {
602  auto [mass, eidx] = PQ.top();
603  double prob = envelopes[eidx]->_probs[env_idx[eidx]];
604  PQ.pop();
605  if(eidx == 0)
606  prob = -prob;
607  else
608  prob = prob * scales[eidx];
609  std::cout << "Next: " << mass << " " << prob << " " << eidx << '\n';
610  env_idx[eidx]++;
611  if(env_idx[eidx] < envelopes[eidx]->_confs_no)
612  PQ.push({envelopes[eidx]->_masses[env_idx[eidx]], eidx});
613 
614  return {mass, prob, eidx};
615  };
616  double accd = 0.0;
617  double condemned = 0.0;
618  //double flow;
619  const double max_flow_dist = abyss_depth_exp + abyss_depth_the;
620  max_flow_dist *= 2.0;
621 
622  while(!PQ.empty())
623  {
624  auto [m, p, eidx] = next();
625  if(!carried.empty() && std::get<1>(carried[0]) * p > 0.0)
626  {
627  carried.emplace_back(m, p, eidx);
628  continue;
629  }
630 
631  while(!carried.empty())
632  {
633  auto& [cm, cp, ceidx] = carried.back();
634  if(m - cm >= max_flow_dist)
635  {
636  for(auto it = carried.cbegin(); it != carried.cend(); it++)
637  condemned += fabs(std::get<1>(*it));
638  carried.clear();
639  break;
640  }
641  std::cout << "accing\n";
642  if((cp+p)*p > 0.0)
643  {
644  accd += fabs((m-cm)*cp);
645  p += cp;
646  carried.pop_back();
647  std::cout << "cprob:" << cp << '\n';
648  }
649  else
650  {
651  accd += fabs((m-cm)*p);
652  cp += p;
653  std::cout << "prob:" << p << '\n';
654  p = 0.0;
655  break;
656  }
657  }
658  if(p != 0.0)
659  carried.emplace_back(m, p, eidx);
660  //std::cout << m << " " << p << std::endl;
661  }
662 
663  for(auto it = carried.cbegin(); it != carried.cend(); it++)
664  condemned += fabs(std::get<1>(*it));
665 
666  std::cout << "ISO:" << accd << " " << condemned << '\n';
667  return accd + condemned * max_flow_dist * 0.5;
668  while(!PQ.empty())
669  {
670  auto [m, p, eidx] = next();
671  if(!carried.empty() && (std::get<1>(carried[0]) * p > 0.0))
672  {
673  carried.emplace_back(m, p, eidx);
674  continue;
675  }
676 
677  while(!carried.empty())
678  {
679  auto& [cm, cp, ceidx] = carried.back();
680  if(m - cm >= max_flow_dist)
681  {
682  for(auto it = carried.cbegin(); it != carried.cend(); it++)
683  {
684  flow = fabs(std::get<1>(*it));
685  const size_t target_idx = std::get<2>(*it);
686  flow *= target_idx == 0 ? abyss_depth_exp : abyss_depth_the;
687  ret_gradient[target_idx] += flow;
688  condemned += flow;
689  std::cout << "condemnin': " << m << " " << p << " " << eidx << " | " << cm << " " << cp << " " << ceidx << '\n';
690  }
691  carried.clear();
692  break;
693  }
694  if((cp+p)*p > 0.0)
695  {
696  flow = fabs((m-cm)*cp);
697  accd += flow;
698  p += cp;
699  ret_gradient[ceidx] += flow;
700  carried.pop_back();
701  }
702  else
703  {
704  flow = fabs((m-cm)*p);
705  accd += flow;
706  cp += p;
707  ret_gradient[eidx] += flow;
708  p = 0.0;
709  break;
710  }
711  }
712  if(p != 0.0)
713  carried.emplace_back(m, p, eidx);
714  //std::cout << m << " " << p << std::endl;
715  }
716 
717  for(auto it = carried.cbegin(); it != carried.cend(); it++)
718  condemned += fabs(std::get<1>(*it));
719 
720 
721  return accd + condemned * (abyss_depth_exp + abyss_depth_the) * 0.5;
722 }
723 #endif
724 
725 
726 std::tuple<double, double, double> FixedEnvelope::WassersteinMatch(FixedEnvelope& other, double flow_distance, double other_scale)
727 {
728  if(_confs_no == 0)
729  return {0.0, other.get_total_prob() * other_scale, 0.0};
730 
731  double unmatched1 = 0.0;
732  double unmatched2 = 0.0;
733  double massflow = 0.0;
734 
735  sort_by_mass();
736  other.sort_by_mass();
737 
738  size_t idx_this = 0;
739  size_t idx_other = 0;
740  double used_prob_this = 0.0;
741  double used_prob_other = 0.0;
742 
743  while(idx_this < _confs_no && idx_other < other._confs_no)
744  {
745  bool moved = true;
746  while(moved && idx_this < _confs_no && idx_other < other._confs_no)
747  {
748  moved = false;
749  if(_masses[idx_this] < other._masses[idx_other] - flow_distance)
750  {
751  unmatched1 += _probs[idx_this] - used_prob_this;
752  used_prob_this = 0.0;
753  idx_this++;
754  moved = true;
755  }
756  if(other._masses[idx_other] < _masses[idx_this] - flow_distance)
757  {
758  unmatched2 += other._probs[idx_other]*other_scale - used_prob_other;
759  used_prob_other = 0.0;
760  idx_other++;
761  moved = true;
762  }
763  }
764  if(idx_this < _confs_no && idx_other < other._confs_no)
765  {
766  assert(_probs[idx_this] - used_prob_this >= 0.0);
767  assert(other._probs[idx_other]*other_scale - used_prob_other >= 0.0);
768 
769  if(_probs[idx_this] - used_prob_this < other._probs[idx_other]*other_scale - used_prob_other)
770  {
771  massflow += _probs[idx_this] - used_prob_this;
772  used_prob_other += _probs[idx_this] - used_prob_this;
773  assert(used_prob_other >= 0.0);
774  used_prob_this = 0.0;
775  idx_this++;
776  }
777  else
778  {
779  massflow += other._probs[idx_other]*other_scale - used_prob_other;
780  used_prob_this += other._probs[idx_other]*other_scale - used_prob_other;
781  assert(used_prob_this >= 0.0);
782  used_prob_other = 0.0;
783  idx_other++;
784  }
785  }
786  }
787 
788  unmatched1 -= used_prob_this;
789  unmatched2 -= used_prob_other;
790 
791  for(; idx_this < _confs_no; idx_this++)
792  unmatched1 += _probs[idx_this];
793  for(; idx_other < other._confs_no; idx_other++)
794  unmatched2 += other._probs[idx_other]*other_scale;
795 
796  return {unmatched1, unmatched2, massflow};
797 }
798 
799 FixedEnvelope FixedEnvelope::bin(double bin_width, double middle)
800 {
801  sort_by_mass();
802 
803  FixedEnvelope ret;
804 
805  if(_confs_no == 0)
806  return ret;
807 
808  ret.reallocate_memory<false>(ISOSPEC_INIT_TABLE_SIZE);
809 
810  if(bin_width == 0)
811  {
812  double curr_mass = _masses[0];
813  double accd_prob = _probs[0];
814  for(size_t ii = 1; ii<_confs_no; ii++)
815  {
816  if(curr_mass != _masses[ii])
817  {
818  ret.store_conf(curr_mass, accd_prob);
819  curr_mass = _masses[ii];
820  accd_prob = _probs[ii];
821  }
822  else
823  accd_prob += _probs[ii];
824  }
825  ret.store_conf(curr_mass, accd_prob);
826  return ret;
827  }
828 
829  size_t ii = 0;
830 
831  double half_width = 0.5*bin_width;
832  double hwmm = half_width-middle;
833 
834  while(ii < _confs_no)
835  {
836  double current_bin_middle = floor((_masses[ii]+hwmm)/bin_width)*bin_width + middle;
837  double current_bin_end = current_bin_middle + half_width;
838  double bin_prob = 0.0;
839 
840  while(ii < _confs_no && _masses[ii] <= current_bin_end)
841  {
842  bin_prob += _probs[ii];
843  ii++;
844  }
845  ret.store_conf(current_bin_middle, bin_prob);
846  }
847 
848  return ret;
849 }
850 
851 template<bool tgetConfs> void FixedEnvelope::reallocate_memory(size_t new_size)
852 {
853  current_size = new_size;
854  // FIXME: Handle overflow gracefully here. It definitely could happen for people still stuck on 32 bits...
855  _masses = reinterpret_cast<double*>(realloc(_masses, new_size * sizeof(double)));
856  if(_masses == nullptr)
857  throw std::bad_alloc();
858  tmasses = _masses + _confs_no;
859 
860  _probs = reinterpret_cast<double*>(realloc(_probs, new_size * sizeof(double)));
861  if(_probs == nullptr)
862  throw std::bad_alloc();
863  tprobs = _probs + _confs_no;
864 
865  constexpr_if(tgetConfs)
866  {
867  _confs = reinterpret_cast<int*>(realloc(_confs, new_size * allDimSizeofInt));
868  if(_confs == nullptr)
869  throw std::bad_alloc();
870  tconfs = _confs + (allDim * _confs_no);
871  }
872 }
873 
874 void FixedEnvelope::slow_reallocate_memory(size_t new_size)
875 {
876  current_size = new_size;
877  // FIXME: Handle overflow gracefully here. It definitely could happen for people still stuck on 32 bits...
878  _masses = reinterpret_cast<double*>(realloc(_masses, new_size * sizeof(double)));
879  if(_masses == nullptr)
880  throw std::bad_alloc();
881  tmasses = _masses + _confs_no;
882 
883  _probs = reinterpret_cast<double*>(realloc(_probs, new_size * sizeof(double)));
884  if(_probs == nullptr)
885  throw std::bad_alloc();
886  tprobs = _probs + _confs_no;
887 
888  if(_confs != nullptr)
889  {
890  _confs = reinterpret_cast<int*>(realloc(_confs, new_size * allDimSizeofInt));
891  if(_confs == nullptr)
892  throw std::bad_alloc();
893  tconfs = _confs + (allDim * _confs_no);
894  }
895 }
896 
897 template<bool tgetConfs> void FixedEnvelope::threshold_init(Iso&& iso, double threshold, bool absolute)
898 {
899  IsoThresholdGenerator generator(std::move(iso), threshold, absolute);
900 
901  size_t tab_size = generator.count_confs();
902  this->allDim = generator.getAllDim();
903  this->allDimSizeofInt = this->allDim * sizeof(int);
904 
905  this->reallocate_memory<tgetConfs>(tab_size);
906 
907  double* ttmasses = this->_masses;
908  double* ttprobs = this->_probs;
909  ISOSPEC_MAYBE_UNUSED int* ttconfs;
910  constexpr_if(tgetConfs)
911  ttconfs = _confs;
912 
913  while(generator.advanceToNextConfiguration())
914  {
915  *ttmasses = generator.mass(); ttmasses++;
916  *ttprobs = generator.prob(); ttprobs++;
917  constexpr_if(tgetConfs) { generator.get_conf_signature(ttconfs); ttconfs += allDim; }
918  }
919 
920  this->_confs_no = tab_size;
921 }
922 
923 template void FixedEnvelope::threshold_init<true>(Iso&& iso, double threshold, bool absolute);
924 template void FixedEnvelope::threshold_init<false>(Iso&& iso, double threshold, bool absolute);
925 
926 
927 template<bool tgetConfs> void FixedEnvelope::total_prob_init(Iso&& iso, double target_total_prob, bool optimize)
928 {
929  if(target_total_prob <= 0.0)
930  return;
931 
932  if(target_total_prob >= 1.0)
933  {
934  threshold_init<tgetConfs>(std::move(iso), 0.0, true);
935  return;
936  }
937 
938  IsoLayeredGenerator generator(std::move(iso), 1000, 1000, true, std::min<double>(target_total_prob, 0.9999));
939 
940  this->allDim = generator.getAllDim();
941  this->allDimSizeofInt = this->allDim*sizeof(int);
942 
943 
944  this->reallocate_memory<tgetConfs>(ISOSPEC_INIT_TABLE_SIZE);
945 
946  size_t last_switch = 0;
947  double prob_at_last_switch = 0.0;
948  double prob_so_far = 0.0;
949  double layer_delta;
950 
951  const double sum_above = log1p(-target_total_prob) - 2.3025850929940455; // log(0.1);
952 
953  do
954  { // Store confs until we accumulate more prob than needed - and, if optimizing,
955  // store also the rest of the last layer
956  while(generator.advanceToNextConfigurationWithinLayer())
957  {
958  this->template addConfILG<tgetConfs>(generator);
959  prob_so_far += *(tprobs-1); // The just-stored probability
960  if(prob_so_far >= target_total_prob)
961  {
962  if (optimize)
963  {
964  while(generator.advanceToNextConfigurationWithinLayer())
965  this->template addConfILG<tgetConfs>(generator);
966  break;
967  }
968  else
969  return;
970  }
971  }
972  if(prob_so_far >= target_total_prob)
973  break;
974 
975  last_switch = this->_confs_no;
976  prob_at_last_switch = prob_so_far;
977 
978  layer_delta = sum_above - log1p(-prob_so_far);
979  layer_delta = (std::max)((std::min)(layer_delta, -0.1), -5.0);
980  } while(generator.nextLayer(layer_delta));
981 
982  if(!optimize || prob_so_far <= target_total_prob)
983  return;
984 
985  // Right. We have extra configurations and we have been asked to produce an optimal p-set, so
986  // now we shall trim unneeded configurations, using an algorithm dubbed "quicktrim"
987  // - similar to the quickselect algorithm, except that we use the cumulative sum of elements
988  // left of pivot to decide whether to go left or right, instead of the positional index.
989  // We'll be sorting by the prob array, permuting the other ones in parallel.
990 
991  int* conf_swapspace = nullptr;
992  constexpr_if(tgetConfs)
993  conf_swapspace = reinterpret_cast<int*>(malloc(this->allDimSizeofInt));
994 
995  size_t start = last_switch;
996  size_t end = this->_confs_no;
997  double sum_to_start = prob_at_last_switch;
998 
999  while(start < end)
1000  {
1001  // Partition part
1002  size_t len = end - start;
1003 #if ISOSPEC_BUILDING_R
1004  size_t pivot = len/2 + start;
1005 #else
1006  size_t pivot = random_gen() % len + start; // Using Mersenne twister directly - we don't
1007  // need a very uniform distribution just for pivot
1008  // selection
1009 #endif
1010  double pprob = this->_probs[pivot];
1011  swap<tgetConfs>(pivot, end-1, conf_swapspace);
1012 
1013  double new_csum = sum_to_start;
1014 
1015  size_t loweridx = start;
1016  for(size_t ii = start; ii < end-1; ii++)
1017  if(this->_probs[ii] > pprob)
1018  {
1019  swap<tgetConfs>(ii, loweridx, conf_swapspace);
1020  new_csum += this->_probs[loweridx];
1021  loweridx++;
1022  }
1023 
1024  swap<tgetConfs>(end-1, loweridx, conf_swapspace);
1025 
1026  // Selection part
1027  if(new_csum < target_total_prob)
1028  {
1029  start = loweridx + 1;
1030  sum_to_start = new_csum + this->_probs[loweridx];
1031  }
1032  else
1033  end = loweridx;
1034  }
1035 
1036  constexpr_if(tgetConfs)
1037  free(conf_swapspace);
1038 
1039  if(end <= current_size/2)
1040  // Overhead in memory of 2x or more, shrink to fit
1041  this->template reallocate_memory<tgetConfs>(end);
1042 
1043  this->_confs_no = end;
1044 }
1045 
1046 template void FixedEnvelope::total_prob_init<true>(Iso&& iso, double target_total_prob, bool optimize);
1047 template void FixedEnvelope::total_prob_init<false>(Iso&& iso, double target_total_prob, bool optimize);
1048 
1049 template<bool tgetConfs> void FixedEnvelope::stochastic_init(Iso&& iso, size_t _no_molecules, double _precision, double _beta_bias)
1050 {
1051  IsoStochasticGenerator generator(std::move(iso), _no_molecules, _precision, _beta_bias);
1052 
1053  this->allDim = generator.getAllDim();
1054  this->allDimSizeofInt = this->allDim * sizeof(int);
1055 
1056  this->reallocate_memory<tgetConfs>(ISOSPEC_INIT_TABLE_SIZE);
1057 
1058  while(generator.advanceToNextConfiguration())
1059  addConfILG<tgetConfs, IsoStochasticGenerator>(generator);
1060 }
1061 
1062 template void FixedEnvelope::stochastic_init<true>(Iso&& iso, size_t _no_molecules, double _precision, double _beta_bias);
1063 template void FixedEnvelope::stochastic_init<false>(Iso&& iso, size_t _no_molecules, double _precision, double _beta_bias);
1064 
1065 double FixedEnvelope::empiric_average_mass()
1066 {
1067  double ret = 0.0;
1068  for(size_t ii = 0; ii < _confs_no; ii++)
1069  {
1070  ret += _masses[ii] * _probs[ii];
1071  }
1072  return ret / get_total_prob();
1073 }
1074 
1075 double FixedEnvelope::empiric_variance()
1076 {
1077  double ret = 0.0;
1078  double avg = empiric_average_mass();
1079  for(size_t ii = 0; ii < _confs_no; ii++)
1080  {
1081  double msq = _masses[ii] - avg;
1082  ret += msq * msq * _probs[ii];
1083  }
1084 
1085  return ret / get_total_prob();
1086 }
1087 
1088 FixedEnvelope FixedEnvelope::Binned(Iso&& iso, double target_total_prob, double bin_width, double bin_middle)
1089 {
1090  FixedEnvelope ret;
1091 
1092  double min_mass = iso.getLightestPeakMass();
1093  double range_len = iso.getHeaviestPeakMass() - min_mass;
1094  size_t no_bins = static_cast<size_t>(range_len / bin_width) + 2;
1095  double half_width = 0.5*bin_width;
1096  double hwmm = half_width-bin_middle;
1097  size_t idx_min = floor((min_mass + hwmm) / bin_width);
1098  size_t idx_max = idx_min + no_bins;
1099 
1100  double* acc;
1101 # if ISOSPEC_GOT_MMAN
1102  acc = reinterpret_cast<double*>(mmap(nullptr, sizeof(double)*no_bins, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0));
1103 #else
1104  // This will probably crash for large molecules and high resolutions...
1105  acc = reinterpret_cast<double*>(calloc(no_bins, sizeof(double)));
1106 #endif
1107  if(acc == NULL)
1108  throw std::bad_alloc();
1109 
1110  acc -= idx_min;
1111 
1112  IsoLayeredGenerator ITG(std::move(iso));
1113 
1114 
1115  bool non_empty;
1116  while((non_empty = ITG.advanceToNextConfiguration()) && ITG.prob() == 0.0)
1117  {}
1118 
1119  if(non_empty)
1120  {
1121  double accum_prob = ITG.prob();
1122  size_t nonzero_idx = floor((ITG.mass() + hwmm)/bin_width);
1123  acc[nonzero_idx] = accum_prob;
1124 
1125  while(ITG.advanceToNextConfiguration() && accum_prob < target_total_prob)
1126  {
1127  double prob = ITG.prob();
1128  accum_prob += prob;
1129  size_t bin_idx = floor((ITG.mass() + hwmm)/bin_width);
1130  acc[bin_idx] += prob;
1131  }
1132 
1133  // Making the assumption that there won't be gaps of more than 10 Da in the spectrum. This is true for all
1134  // molecules made of natural elements.
1135  size_t distance_10da = static_cast<size_t>(10.0/bin_width) + 1;
1136 
1137  size_t empty_steps = 0;
1138 
1139  ret.reallocate_memory<false>(ISOSPEC_INIT_TABLE_SIZE);
1140 
1141  for(size_t ii = nonzero_idx; ii >= idx_min && empty_steps < distance_10da; ii--)
1142  {
1143  if(acc[ii] > 0.0)
1144  {
1145  empty_steps = 0;
1146  ret.store_conf(static_cast<double>(ii)*bin_width + bin_middle, acc[ii]);
1147  }
1148  else
1149  empty_steps++;
1150  }
1151 
1152  empty_steps = 0;
1153  for(size_t ii = nonzero_idx+1; ii < idx_max && empty_steps < distance_10da; ii++)
1154  {
1155  if(acc[ii] > 0.0)
1156  {
1157  empty_steps = 0;
1158  ret.store_conf(static_cast<double>(ii)*bin_width + bin_middle, acc[ii]);
1159  }
1160  else
1161  empty_steps++;
1162  }
1163  }
1164 
1165  acc += idx_min;
1166 
1167 # if ISOSPEC_GOT_MMAN
1168  munmap(acc, sizeof(double)*no_bins);
1169 #else
1170  free(acc);
1171 #endif
1172 
1173  return ret;
1174 }
1175 
1176 } // namespace IsoSpec