fastq_to_fasta
A template for creation of SeqAn3 apps, with a FASTQ to FASTA example app.
hierarchical_interleaved_bloom_filter.hpp
Go to the documentation of this file.
1 // --------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md
6 // --------------------------------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include <ranges>
11 
12 #include <seqan3/search/dream_index/interleaved_bloom_filter.hpp>
13 
14 #ifndef RAPTOR_HIBF_HAS_COUNT
15 # define RAPTOR_HIBF_HAS_COUNT 0
16 #endif
17 
18 namespace raptor
19 {
20 
76 template <seqan3::data_layout data_layout_mode_ = seqan3::data_layout::uncompressed>
78 {
79 public:
80  // Forward declaration
81  class user_bins;
82 
83  // Forward declaration
84  class membership_agent;
85 
86 #if RAPTOR_HIBF_HAS_COUNT
87  // Forward declaration
88  template <std::integral value_t>
89  class counting_agent_type;
90 #endif
91 
93  static constexpr seqan3::data_layout data_layout_mode = data_layout_mode_;
94 
96  using ibf_t = seqan3::interleaved_bloom_filter<data_layout_mode_>;
97 
109 
111 
113  std::vector<ibf_t> ibf_vector;
114 
122  std::vector<std::vector<int64_t>> next_ibf_id;
123 
126 
129  {
131  }
132 
133 #if RAPTOR_HIBF_HAS_COUNT
137  template <std::integral value_t = uint16_t>
138  counting_agent_type<value_t> counting_agent() const
139  {
140  return counting_agent_type<value_t>{*this};
141  }
142 #endif
143 
151  template <seqan3::cereal_archive archive_t>
152  void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
153  {
154  archive(ibf_vector);
155  archive(next_ibf_id);
156  archive(user_bins);
157  }
159 };
160 
163 template <seqan3::data_layout data_layout_mode>
165 {
166 private:
168  std::vector<std::string> user_bin_filenames;
169 
176  std::vector<std::vector<int64_t>> ibf_bin_to_filename_position{};
177 
178 public:
180  size_t num_user_bins() const noexcept
181  {
182  return user_bin_filenames.size();
183  }
184 
186  void set_ibf_count(size_t const size)
187  {
188  ibf_bin_to_filename_position.resize(size);
189  }
190 
192  void set_user_bin_count(size_t const size)
193  {
194  user_bin_filenames.resize(size);
195  }
196 
198  std::vector<int64_t> & bin_indices_of_ibf(size_t const idx)
199  {
200  return ibf_bin_to_filename_position[idx];
201  }
202 
204  std::string & filename_of_user_bin(size_t const idx)
205  {
206  return user_bin_filenames[idx];
207  }
208 
210  std::string const & operator[](std::pair<size_t, size_t> const & index_pair) const
211  {
212  return user_bin_filenames[ibf_bin_to_filename_position[index_pair.first][index_pair.second]];
213  }
214 
218  auto operator[](size_t const ibf_idx) const
219  {
220  return ibf_bin_to_filename_position[ibf_idx]
221  | std::views::transform(
222  [this](int64_t i)
223  {
224  if (i == -1)
225  return std::string{};
226  else
227  return user_bin_filenames[i];
228  });
229  }
230 
232  int64_t filename_index(size_t const ibf_idx, size_t const bin_idx) const
233  {
234  return ibf_bin_to_filename_position[ibf_idx][bin_idx];
235  }
236 
242  template <typename stream_t>
243  void write_filenames(stream_t & out_stream) const
244  {
245  size_t position{};
246  std::string line{};
247  for (auto const & filename : user_bin_filenames)
248  {
249  line.clear();
250  line = '#';
251  line += std::to_string(position);
252  line += '\t';
253  line += filename;
254  line += '\n';
255  out_stream << line;
256  ++position;
257  }
258  }
259 
267  template <typename archive_t>
268  void serialize(archive_t & archive)
269  {
270  archive(user_bin_filenames);
271  archive(ibf_bin_to_filename_position);
272  }
274 };
275 
281 template <seqan3::data_layout data_layout_mode> // TODO: value_t as template?
283 {
284 private:
287 
289  hibf_t const * const hibf_ptr{nullptr};
290 
292  template <std::ranges::forward_range value_range_t>
293  void bulk_contains_impl(value_range_t && values, int64_t const ibf_idx, size_t const threshold)
294  {
295  auto agent = hibf_ptr->ibf_vector[ibf_idx].template counting_agent<uint16_t>();
296  auto & result = agent.bulk_count(values);
297 
298  uint16_t sum{};
299 
300  for (size_t bin{}; bin < result.size(); ++bin)
301  {
302  sum += result[bin];
303 
304  auto const current_filename_index = hibf_ptr->user_bins.filename_index(ibf_idx, bin);
305 
306  if (current_filename_index < 0) // merged bin
307  {
308  if (sum >= threshold)
309  bulk_contains_impl(values, hibf_ptr->next_ibf_id[ibf_idx][bin], threshold);
310  sum = 0u;
311  }
312  else if (bin + 1u == result.size() || // last bin
313  current_filename_index != hibf_ptr->user_bins.filename_index(ibf_idx, bin + 1)) // end of split bin
314  {
315  if (sum >= threshold)
316  result_buffer.emplace_back(current_filename_index);
317  sum = 0u;
318  }
319  }
320  }
321 
322 public:
326  membership_agent() = default;
327  membership_agent(membership_agent const &) = default;
331  ~membership_agent() = default;
332 
337  explicit membership_agent(hibf_t const & hibf) : hibf_ptr(std::addressof(hibf))
338  {}
340 
342  std::vector<int64_t> result_buffer;
343 
361  template <std::ranges::forward_range value_range_t>
362  [[nodiscard]] std::vector<int64_t> const & bulk_contains(value_range_t && values, size_t const threshold) & noexcept
363  {
364  assert(hibf_ptr != nullptr);
365 
366  static_assert(std::ranges::forward_range<value_range_t>, "The values must model forward_range.");
367  static_assert(std::unsigned_integral<std::ranges::range_value_t<value_range_t>>,
368  "An individual value must be an unsigned integral.");
369 
370  result_buffer.clear();
371 
372  bulk_contains_impl(values, 0, threshold);
373 
374  std::ranges::sort(result_buffer); // TODO: necessary?
375 
376  return result_buffer;
377  }
378 
379  // `bulk_contains` cannot be called on a temporary, since the object the returned reference points to
380  // is immediately destroyed.
381  template <std::ranges::range value_range_t>
382  [[nodiscard]] std::vector<int64_t> const & bulk_contains(value_range_t && values,
383  size_t const threshold) && noexcept = delete;
385 };
386 
387 #if RAPTOR_HIBF_HAS_COUNT
390 template <seqan3::data_layout data_layout_mode>
391 template <std::integral value_t>
393 {
394 private:
397 
399  hibf_t const * const hibf_ptr{nullptr};
400 
402  template <std::ranges::forward_range value_range_t>
403  void bulk_count_impl(value_range_t && values, int64_t const ibf_idx, size_t const threshold)
404  {
405  auto agent = hibf_ptr->ibf_vector[ibf_idx].template counting_agent<value_t>();
406  auto & result = agent.bulk_count(values);
407 
408  value_t sum{};
409 
410  for (size_t bin{}; bin < result.size(); ++bin)
411  {
412  sum += result[bin];
413  auto const current_filename_index = hibf_ptr->user_bins.filename_index(ibf_idx, bin);
414 
415  if (current_filename_index < 0) // merged bin
416  {
417  if (sum >= threshold)
418  bulk_count_impl(values, hibf_ptr->next_ibf_id[ibf_idx][bin], threshold);
419  sum = 0u;
420  }
421  else if (bin + 1u == result.size() || // last bin
422  current_filename_index != hibf_ptr->user_bins.filename_index(ibf_idx, bin + 1)) // end of split bin
423  {
424  if (sum >= threshold)
425  result_buffer[current_filename_index] = sum;
426  sum = 0u;
427  }
428  }
429  }
430 
431 public:
435  counting_agent_type() = default;
440  ~counting_agent_type() = default;
441 
446  explicit counting_agent_type(hibf_t const & hibf) :
447  hibf_ptr(std::addressof(hibf)),
448  result_buffer(hibf_ptr->user_bins.num_user_bins())
449  {}
451 
453  seqan3::counting_vector<value_t> result_buffer;
454 
474  template <std::ranges::forward_range value_range_t>
475  [[nodiscard]] seqan3::counting_vector<value_t> const & bulk_count(value_range_t && values,
476  size_t const threshold = 1u) & noexcept
477  {
478  assert(hibf_ptr != nullptr);
479  assert(threshold > 0u);
480  assert(result_buffer.size() == hibf_ptr->user_bins.num_user_bins());
481 
482  static_assert(std::ranges::forward_range<value_range_t>, "The values must model forward_range.");
483  static_assert(std::unsigned_integral<std::ranges::range_value_t<value_range_t>>,
484  "An individual value must be an unsigned integral.");
485 
486  std::ranges::fill(result_buffer, static_cast<value_t>(0u));
487 
488  bulk_count_impl(values, 0, threshold);
489 
490  return result_buffer;
491  }
492 
493  // `bulk_count` cannot be called on a temporary, since the object the returned reference points to
494  // is immediately destroyed.
495  template <std::ranges::range value_range_t>
496  [[nodiscard]] seqan3::counting_vector<value_t> const & bulk_count(value_range_t && values,
497  size_t const threshold = 1u) && noexcept = delete;
499 };
500 #endif // RAPTOR_HIBF_HAS_COUNT
501 
502 } // namespace raptor
Manages counting ranges of values for the hibf::hierarchical_interleaved_bloom_filter.
Definition: hierarchical_interleaved_bloom_filter.hpp:393
counting_agent_type(counting_agent_type &&)=default
Defaulted.
seqan3::counting_vector< value_t > const & bulk_count(value_range_t &&values, size_t const threshold=1u) &&noexcept=delete
counting_agent_type & operator=(counting_agent_type const &)=default
Defaulted.
counting_agent_type & operator=(counting_agent_type &&)=default
Defaulted.
counting_agent_type(counting_agent_type const &)=default
Defaulted.
seqan3::counting_vector< value_t > const & bulk_count(value_range_t &&values, size_t const threshold=1u) &noexcept
Counts the occurrences in each bin for all values in a range.
Definition: hierarchical_interleaved_bloom_filter.hpp:475
seqan3::counting_vector< value_t > result_buffer
Stores the result of bulk_count().
Definition: hierarchical_interleaved_bloom_filter.hpp:453
Manages membership queries for the hibf::hierarchical_interleaved_bloom_filter.
Definition: hierarchical_interleaved_bloom_filter.hpp:283
membership_agent(membership_agent const &)=default
Defaulted.
std::vector< int64_t > const & bulk_contains(value_range_t &&values, size_t const threshold) &&noexcept=delete
std::vector< int64_t > const & bulk_contains(value_range_t &&values, size_t const threshold) &noexcept
Determines set membership of given values, and returns the user bin indices of occurrences.
Definition: hierarchical_interleaved_bloom_filter.hpp:362
membership_agent & operator=(membership_agent const &)=default
Defaulted.
membership_agent & operator=(membership_agent &&)=default
Defaulted.
std::vector< int64_t > result_buffer
Stores the result of bulk_contains().
Definition: hierarchical_interleaved_bloom_filter.hpp:342
membership_agent(membership_agent &&)=default
Defaulted.
Bookkeeping for user and technical bins.
Definition: hierarchical_interleaved_bloom_filter.hpp:165
int64_t filename_index(size_t const ibf_idx, size_t const bin_idx) const
Returns the filename index of the ibf_idxth IBF for bin bin_idx.
Definition: hierarchical_interleaved_bloom_filter.hpp:232
void set_ibf_count(size_t const size)
Changes the number of managed IBFs.
Definition: hierarchical_interleaved_bloom_filter.hpp:186
void set_user_bin_count(size_t const size)
Changes the number of managed user bins.
Definition: hierarchical_interleaved_bloom_filter.hpp:192
auto operator[](size_t const ibf_idx) const
Returns a view over the user bin filenames for the ibf_idxth IBF. An empty string is returned for mer...
Definition: hierarchical_interleaved_bloom_filter.hpp:218
void write_filenames(stream_t &out_stream) const
Writes all filenames to a stream. Index and filename are tab-separated.
Definition: hierarchical_interleaved_bloom_filter.hpp:243
std::string const & operator[](std::pair< size_t, size_t > const &index_pair) const
For a pair (a,b), returns a const reference to the filename of the user bin at IBF a,...
Definition: hierarchical_interleaved_bloom_filter.hpp:210
std::string & filename_of_user_bin(size_t const idx)
Returns the filename of the idxth user bin.
Definition: hierarchical_interleaved_bloom_filter.hpp:204
std::vector< int64_t > & bin_indices_of_ibf(size_t const idx)
Returns a vector containing user bin indices for each bin in the idxth IBF.
Definition: hierarchical_interleaved_bloom_filter.hpp:198
size_t num_user_bins() const noexcept
Returns the number of managed user bins.
Definition: hierarchical_interleaved_bloom_filter.hpp:180
The HIBF binning directory. A data structure that efficiently answers set-membership queries for mult...
Definition: hierarchical_interleaved_bloom_filter.hpp:78
hierarchical_interleaved_bloom_filter(hierarchical_interleaved_bloom_filter const &)=default
Defaulted.
~hierarchical_interleaved_bloom_filter()=default
Defaulted.
std::vector< ibf_t > ibf_vector
The individual interleaved Bloom filters.
Definition: hierarchical_interleaved_bloom_filter.hpp:113
hierarchical_interleaved_bloom_filter & operator=(hierarchical_interleaved_bloom_filter &&)=default
Defaulted.
hierarchical_interleaved_bloom_filter & operator=(hierarchical_interleaved_bloom_filter const &)=default
Defaulted.
membership_agent membership_agent() const
Returns a membership_agent to be used for counting.
Definition: hierarchical_interleaved_bloom_filter.hpp:128
static constexpr seqan3::data_layout data_layout_mode
Indicates whether the Interleaved Bloom Filter is compressed.
Definition: hierarchical_interleaved_bloom_filter.hpp:93
std::vector< std::vector< int64_t > > next_ibf_id
Stores for each bin in each IBF of the HIBF the ID of the next IBF.
Definition: hierarchical_interleaved_bloom_filter.hpp:122
hierarchical_interleaved_bloom_filter(hierarchical_interleaved_bloom_filter &&)=default
Defaulted.
seqan3::interleaved_bloom_filter< data_layout_mode_ > ibf_t
The type of an individual Bloom filter.
Definition: hierarchical_interleaved_bloom_filter.hpp:96
hierarchical_interleaved_bloom_filter()=default
Defaulted.
user_bins user_bins
The underlying user bins.
Definition: hierarchical_interleaved_bloom_filter.hpp:125
hierarchical_interleaved_bloom_filter< seqan3::data_layout::uncompressed > hibf
Definition: index.hpp:24
Definition: adjust_seed.hpp:13