fastq_to_fasta
A template for creation of SeqAn3 apps, with a FASTQ to FASTA example app.
search_single.hpp
Go to the documentation of this file.
1 // --------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md
6 // --------------------------------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include <seqan3/search/views/minimiser_hash.hpp>
11 
12 #include <raptor/adjust_seed.hpp>
13 #include <raptor/dna4_traits.hpp>
18 
19 namespace raptor
20 {
21 
22 template <typename index_t>
23 void search_single(search_arguments const & arguments, index_t && index)
24 {
25  constexpr bool is_ibf = std::same_as<index_t, raptor_index<index_structure::ibf>>
26  || std::same_as<index_t, raptor_index<index_structure::ibf_compressed>>;
27 
28  double index_io_time{0.0};
29  double reads_io_time{0.0};
30  double compute_time{0.0};
31 
32  auto cereal_worker = [&]()
33  {
34  load_index(index, arguments, index_io_time);
35  };
36  auto cereal_handle = std::async(std::launch::async, cereal_worker);
37 
38  seqan3::sequence_file_input<dna4_traits, seqan3::fields<seqan3::field::id, seqan3::field::seq>> fin{
39  arguments.query_file};
40  using record_type = typename decltype(fin)::record_type;
41  std::vector<record_type> records{};
42 
43  sync_out synced_out{arguments.out_file};
44 
45  {
46  size_t position{};
47  std::string line{};
48  for (auto const & file_list : arguments.bin_path)
49  {
50  line.clear();
51  line = '#';
52  line += std::to_string(position);
53  line += '\t';
54  for (auto const & filename : file_list)
55  {
56  line += filename;
57  line += ',';
58  }
59  line.back() = '\n';
60  synced_out << line;
61  ++position;
62  }
63  synced_out << "#QUERY_NAME\tUSER_BINS\n";
64  }
65 
66  raptor::threshold::threshold const thresholder{arguments.make_threshold_parameters()};
67 
68  auto worker = [&](size_t const start, size_t const end)
69  {
70  auto counter = [&index]()
71  {
72  if constexpr (is_ibf)
73  return index.ibf().template counting_agent<uint16_t>();
74  else
75  return index.ibf().membership_agent();
76  }();
77  std::string result_string{};
78  std::vector<uint64_t> minimiser;
79 
80  auto hash_adaptor = seqan3::views::minimiser_hash(arguments.shape,
81  seqan3::window_size{arguments.window_size},
82  seqan3::seed{adjust_seed(arguments.shape_weight)});
83 
84  for (auto && [id, seq] : records | seqan3::views::slice(start, end))
85  {
86  result_string.clear();
87  result_string += id;
88  result_string += '\t';
89 
90  auto minimiser_view = seq | hash_adaptor | std::views::common;
91  minimiser.assign(minimiser_view.begin(), minimiser_view.end());
92 
93  size_t const minimiser_count{minimiser.size()};
94  size_t const threshold = thresholder.get(minimiser_count);
95 
96  if constexpr (is_ibf)
97  {
98  auto & result = counter.bulk_count(minimiser);
99  size_t current_bin{0};
100  for (auto && count : result)
101  {
102  if (count >= threshold)
103  {
104  result_string += std::to_string(current_bin);
105  result_string += ',';
106  }
107  ++current_bin;
108  }
109  }
110  else
111  {
112  auto & result = counter.bulk_contains(minimiser, threshold); // Results contains user bin IDs
113  for (auto && count : result)
114  {
115  result_string += std::to_string(count);
116  result_string += ',';
117  }
118  }
119 
120  if (auto & last_char = result_string.back(); last_char == ',')
121  last_char = '\n';
122  else
123  result_string += '\n';
124  synced_out.write(result_string);
125  }
126  };
127 
128  for (auto && chunked_records : fin | seqan3::views::chunk((1ULL << 20) * 10))
129  {
130  records.clear();
131  auto start = std::chrono::high_resolution_clock::now();
132  std::ranges::move(chunked_records, std::back_inserter(records));
133  auto end = std::chrono::high_resolution_clock::now();
134  reads_io_time += std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count();
135 
136  cereal_handle.wait();
137 
138  do_parallel(worker, records.size(), arguments.threads, compute_time);
139  }
140 
141  // GCOVR_EXCL_START
142  if (arguments.write_time)
143  {
144  std::filesystem::path file_path{arguments.out_file};
145  file_path += ".time";
146  std::ofstream file_handle{file_path};
147  file_handle << "Index I/O\tReads I/O\tCompute\n";
148  file_handle << std::fixed << std::setprecision(2) << index_io_time << '\t' << reads_io_time << '\t'
149  << compute_time;
150  }
151  // GCOVR_EXCL_STOP
152 }
153 
154 } // namespace raptor
Definition: sync_out.hpp:18
Definition: threshold.hpp:17
Definition: adjust_seed.hpp:13
void do_parallel(algorithm_t &&worker, size_t const num_records, size_t const threads, double &compute_time)
Definition: do_parallel.hpp:18
void search_single(search_arguments const &arguments, index_t &&index)
Definition: search_single.hpp:23
void load_index(index_t &index, search_arguments const &arguments, size_t const part, double &index_io_time)
Definition: load_index.hpp:19
Definition: search_arguments.hpp:27
uint8_t threads
Definition: search_arguments.hpp:33
raptor::threshold::threshold_parameters make_threshold_parameters() const noexcept
Definition: search_arguments.hpp:58
seqan3::shape shape
Definition: search_arguments.hpp:30
std::filesystem::path query_file
Definition: search_arguments.hpp:51
std::vector< std::vector< std::string > > bin_path
Definition: search_arguments.hpp:50
bool write_time
Definition: search_arguments.hpp:53
std::filesystem::path out_file
Definition: search_arguments.hpp:52