Seastar
High performance C++ framework for concurrent servers
io_queue.hh
1/*
2 * This file is open source software, licensed to you under the terms
3 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
4 * distributed with this work for additional information regarding copyright
5 * ownership. You may not use this file except in compliance with the License.
6 *
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing,
12 * software distributed under the License is distributed on an
13 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 * KIND, either express or implied. See the License for the
15 * specific language governing permissions and limitations
16 * under the License.
17 */
18/*
19 * Copyright 2019 ScyllaDB
20 */
21
22#pragma once
23
24#ifndef SEASTAR_MODULE
25#include <boost/container/static_vector.hpp>
26#include <chrono>
27#include <memory>
28#include <vector>
29#include <sys/uio.h>
30#endif
31#include <seastar/core/sstring.hh>
32#include <seastar/core/fair_queue.hh>
34#include <seastar/core/future.hh>
35#include <seastar/core/internal/io_request.hh>
36#include <seastar/core/lowres_clock.hh>
37#include <seastar/util/spinlock.hh>
38#include <seastar/util/modules.hh>
39
40struct io_queue_for_tests;
41
42namespace seastar {
43
44class io_queue;
45namespace internal {
46const fair_group& get_fair_group(const io_queue& ioq, unsigned stream);
47}
48
49SEASTAR_MODULE_EXPORT
50class io_intent;
51
52namespace internal {
53class io_sink;
54}
55
// Classes that are only referenced by pointer/reference in this header.
class io_desc_read_write;
class queued_io_request;
class io_group;

// Plain integer identifiers.
using shard_id = unsigned;
using stream_id = unsigned;

// Shared handle to an io_group.
using io_group_ptr = std::shared_ptr<io_group>;
// Owning container for the iovec array that accompanies a request.
using iovec_keeper = std::vector<::iovec>;
65
66namespace internal {
67struct maybe_priority_class_ref;
68class priority_class {
69 unsigned _id;
70public:
71 explicit priority_class(const scheduling_group& sg) noexcept;
72 explicit priority_class(internal::maybe_priority_class_ref pc) noexcept;
73 unsigned id() const noexcept { return _id; }
74};
75}
76
77class io_queue {
78public:
79 class priority_class_data;
80
81private:
82 std::vector<std::unique_ptr<priority_class_data>> _priority_classes;
83 io_group_ptr _group;
84 boost::container::static_vector<fair_queue, 2> _streams;
85 internal::io_sink& _sink;
86
87 friend struct ::io_queue_for_tests;
88 friend const fair_group& internal::get_fair_group(const io_queue& ioq, unsigned stream);
89
90 priority_class_data& find_or_create_class(internal::priority_class pc);
91 future<size_t> queue_request(internal::priority_class pc, internal::io_direction_and_length dnl, internal::io_request req, io_intent* intent, iovec_keeper iovs) noexcept;
92 future<size_t> queue_one_request(internal::priority_class pc, internal::io_direction_and_length dnl, internal::io_request req, io_intent* intent, iovec_keeper iovs) noexcept;
93
94 // The fields below are going away, they are just here so we can implement deprecated
95 // functions that used to be provided by the fair_queue and are going away (from both
96 // the fair_queue and the io_queue). Double-accounting for now will allow for easier
97 // decoupling and is temporary
98 size_t _queued_requests = 0;
99 size_t _requests_executing = 0;
100 uint64_t _requests_dispatched = 0;
101 uint64_t _requests_completed = 0;
102
103 // Flow monitor
104 uint64_t _prev_dispatched = 0;
105 uint64_t _prev_completed = 0;
106 double _flow_ratio = 1.0;
107
108 timer<lowres_clock> _averaging_decay_timer;
109
110 const std::chrono::milliseconds _stall_threshold_min;
111 std::chrono::milliseconds _stall_threshold;
112
113 void update_flow_ratio() noexcept;
114 void lower_stall_threshold() noexcept;
115
116 metrics::metric_groups _metric_groups;
117public:
118
119 using clock_type = std::chrono::steady_clock;
120
121 // We want to represent the fact that write requests are (maybe) more expensive
122 // than read requests. To avoid dealing with floating point math we will scale one
123 // read request to be counted by this amount.
124 //
125 // A write request that is 30% more expensive than a read will be accounted as
126 // (read_request_base_count * 130) / 100.
127 // It is also technically possible for reads to be the expensive ones, in which case
128 // writes will have an integer value lower than read_request_base_count.
129 static constexpr unsigned read_request_base_count = 128;
130 static constexpr unsigned block_size_shift = 9;
131
132 struct config {
133 dev_t devid;
134 unsigned long req_count_rate = std::numeric_limits<int>::max();
135 unsigned long blocks_count_rate = std::numeric_limits<int>::max();
136 unsigned disk_req_write_to_read_multiplier = read_request_base_count;
137 unsigned disk_blocks_write_to_read_multiplier = read_request_base_count;
138 size_t disk_read_saturation_length = std::numeric_limits<size_t>::max();
139 size_t disk_write_saturation_length = std::numeric_limits<size_t>::max();
140 sstring mountpoint = "undefined";
141 bool duplex = false;
142 std::chrono::duration<double> rate_limit_duration = std::chrono::milliseconds(1);
143 size_t block_count_limit_min = 1;
144 unsigned averaging_decay_ticks = 100;
145 double flow_ratio_ema_factor = 0.95;
146 double flow_ratio_backpressure_threshold = 1.1;
147 std::chrono::milliseconds stall_threshold = std::chrono::milliseconds(100);
148 };
149
150 io_queue(io_group_ptr group, internal::io_sink& sink);
151 ~io_queue();
152
153 stream_id request_stream(internal::io_direction_and_length dnl) const noexcept;
154
155 future<size_t> submit_io_read(internal::priority_class priority_class,
156 size_t len, internal::io_request req, io_intent* intent, iovec_keeper iovs = {}) noexcept;
157 future<size_t> submit_io_write(internal::priority_class priority_class,
158 size_t len, internal::io_request req, io_intent* intent, iovec_keeper iovs = {}) noexcept;
159
160 void submit_request(io_desc_read_write* desc, internal::io_request req) noexcept;
161 void cancel_request(queued_io_request& req) noexcept;
162 void complete_cancelled_request(queued_io_request& req) noexcept;
163 void complete_request(io_desc_read_write& desc, std::chrono::duration<double> delay) noexcept;
164
165 [[deprecated("I/O queue users should not track individual requests, but resources (weight, size) passing through the queue")]]
166 size_t queued_requests() const {
167 return _queued_requests;
168 }
169
170 // How many requests are sent to disk but not yet returned.
171 [[deprecated("I/O queue users should not track individual requests, but resources (weight, size) passing through the queue")]]
172 size_t requests_currently_executing() const {
173 return _requests_executing;
174 }
175
176 // Dispatch requests that are pending in the I/O queue
177 void poll_io_queue();
178
179 clock_type::time_point next_pending_aio() const noexcept;
180 fair_queue_entry::capacity_t request_capacity(internal::io_direction_and_length dnl) const noexcept;
181
182 sstring mountpoint() const;
183 dev_t dev_id() const noexcept;
184
185 void update_shares_for_class(internal::priority_class pc, size_t new_shares);
186 future<> update_bandwidth_for_class(internal::priority_class pc, uint64_t new_bandwidth);
187 void rename_priority_class(internal::priority_class pc, sstring new_name);
188 void throttle_priority_class(const priority_class_data& pc) noexcept;
189 void unthrottle_priority_class(const priority_class_data& pc) noexcept;
190
192 size_t max_read;
193 size_t max_write;
194 };
195
196 request_limits get_request_limits() const noexcept;
197 const config& get_config() const noexcept;
198
199private:
200 static fair_queue::config make_fair_queue_config(const config& cfg, sstring label);
201 void register_stats(sstring name, priority_class_data& pc);
202};
203
204class io_group {
205public:
206 explicit io_group(io_queue::config io_cfg, unsigned nr_queues);
207 ~io_group();
208 struct priority_class_data;
209
210 std::chrono::duration<double> io_latency_goal() const noexcept;
211
212private:
213 friend class io_queue;
214 friend struct ::io_queue_for_tests;
215 friend const fair_group& internal::get_fair_group(const io_queue& ioq, unsigned stream);
216
217 const io_queue::config _config;
218 size_t _max_request_length[2];
219 boost::container::static_vector<fair_group, 2> _fgs;
220 std::vector<std::unique_ptr<priority_class_data>> _priority_classes;
221 util::spinlock _lock;
222 const shard_id _allocated_on;
223
224 static fair_group::config make_fair_group_config(const io_queue::config& qcfg) noexcept;
225 priority_class_data& find_or_create_class(internal::priority_class pc);
226};
227
228inline const io_queue::config& io_queue::get_config() const noexcept {
229 return _group->_config;
230}
231
232inline sstring io_queue::mountpoint() const {
233 return get_config().mountpoint;
234}
235
236inline dev_t io_queue::dev_id() const noexcept {
237 return get_config().devid;
238}
239
240namespace internal {
241double request_tokens(io_direction_and_length dnl, const io_queue::config& cfg) noexcept;
242}
243
244}
Group of queues class.
Definition: fair_queue.hh:138
Definition: fair_queue.hh:236
Fair queuing class.
Definition: fair_queue.hh:289
A representation of a possibly not-yet-computed value.
Definition: future.hh:1240
Definition: io_queue.hh:204
Definition: io_intent.hh:44
Definition: io_queue.hh:77
Definition: io_queue.hh:132
Definition: io_queue.hh:191
holds the metric definition.
Definition: metrics_registration.hh:94
Definition: stream.hh:60
Definition: spinlock.hh:88
Holds the metric_groups definition needed by a class that reports metrics.
Seastar API namespace.
Definition: abort_on_ebadf.hh:26