24#include <seastar/core/aligned_buffer.hh>
25#include <seastar/core/cacheline.hh>
26#include <seastar/core/circular_buffer.hh>
28#include <seastar/core/condition-variable.hh>
29#include <seastar/core/enum.hh>
30#include <seastar/core/file.hh>
31#include <seastar/core/future.hh>
33#include <seastar/core/internal/io_desc.hh>
34#include <seastar/core/internal/io_request.hh>
35#include <seastar/core/internal/io_sink.hh>
36#include <seastar/core/iostream.hh>
37#include <seastar/core/lowres_clock.hh>
38#include <seastar/core/make_task.hh>
39#include <seastar/core/manual_clock.hh>
40#include <seastar/core/memory.hh>
42#include <seastar/core/internal/estimated_histogram.hh>
44#include <seastar/core/reactor_config.hh>
45#include <seastar/core/scattered_message.hh>
47#include <seastar/core/scheduling_specific.hh>
48#include <seastar/core/seastar.hh>
49#include <seastar/core/semaphore.hh>
50#include <seastar/core/sstring.hh>
51#include <seastar/core/temporary_buffer.hh>
52#include <seastar/core/thread_cputime_clock.hh>
54#include <seastar/core/gate.hh>
55#include <seastar/net/api.hh>
56#include <seastar/util/eclipse.hh>
57#include <seastar/util/log.hh>
58#include <seastar/util/modules.hh>
59#include <seastar/util/noncopyable_function.hh>
60#include <seastar/util/std-compat.hh>
61#include "internal/pollable_fd.hh"
64#include <boost/container/static_vector.hpp>
71#include <unordered_map>
76#include <sys/socket.h>
77#include <netinet/ip.h>
82struct _Unwind_Exception;
86using shard_id = unsigned;
/// Hash functor for ::sockaddr_in.
///
/// The hash value is the bitwise XOR of the address's sin_port and
/// sin_addr.s_addr fields; no other field of the structure contributes.
99struct hash<::sockaddr_in> {
100 size_t operator()(::sockaddr_in a)
const {
101 return a.sin_port ^ a.sin_addr.s_addr;
107bool operator==(const ::sockaddr_in a, const ::sockaddr_in b);
114class reactor_backend_selector;
116class reactor_backend;
121class reactor_stall_sampler;
122class cpu_stall_detector;
123class buffer_allocator;
127size_t scheduling_group_count();
129void increase_thrown_exceptions_counter() noexcept;
139 virtual void complete_with(ssize_t res)
final override;
141 virtual void complete(
size_t res)
noexcept = 0;
142 virtual void set_exception(std::exception_ptr eptr)
noexcept = 0;
153 class batch_flush_pollfn;
155 class drain_cross_cpu_freelist_pollfn;
156 class lowres_timer_pollfn;
157 class manual_timer_pollfn;
159 class reap_kernel_completions_pollfn;
160 class kernel_submit_work_pollfn;
161 class io_queue_submission_pollfn;
162 class syscall_pollfn;
163 class execution_stage_pollfn;
165 friend class file_data_source_impl;
166 friend class internal::reactor_stall_sampler;
167 friend class preempt_io_context;
168 friend struct hrtimer_aio_completion;
169 friend class reactor_backend_epoll;
170 friend class reactor_backend_aio;
171 friend class reactor_backend_uring;
172 friend class reactor_backend_selector;
175 friend class aio_storage_context;
177 using poller = internal::poller;
183 uint64_t aio_reads = 0;
184 uint64_t aio_read_bytes = 0;
185 uint64_t aio_writes = 0;
186 uint64_t aio_write_bytes = 0;
187 uint64_t aio_outsizes = 0;
188 uint64_t aio_errors = 0;
189 uint64_t fstream_reads = 0;
190 uint64_t fstream_read_bytes = 0;
191 uint64_t fstream_reads_blocked = 0;
192 uint64_t fstream_read_bytes_blocked = 0;
193 uint64_t fstream_read_aheads_discarded = 0;
194 uint64_t fstream_read_ahead_discarded_bytes = 0;
201 uint64_t tasks_processed = 0;
203 friend void io_completion::complete_with(ssize_t);
210 std::shared_ptr<smp> _smp;
215 std::unique_ptr<reactor_backend> _backend;
216 sigset_t _active_sigmask;
217 std::vector<pollfn*> _pollers;
219 static constexpr unsigned max_aio_per_queue = 128;
220 static constexpr unsigned max_queues = 8;
221 static constexpr unsigned max_aio = max_aio_per_queue * max_queues;
224 std::unordered_map<dev_t, std::unique_ptr<io_queue>> _io_queues;
226 internal::io_sink _io_sink;
227 unsigned _num_io_groups = 0;
229 std::vector<noncopyable_function<future<> ()>> _exit_funcs;
231 bool _stopping =
false;
232 bool _stopped =
false;
233 bool _finished_running_tasks =
false;
235 std::optional<future<std::unique_ptr<network_stack>>> _network_stack_ready;
239 internal::preemption_monitor _preemption_monitor{};
240 uint64_t _global_tasks_processed = 0;
242 metrics::internal::time_estimated_histogram _stalls_histogram;
243 std::unique_ptr<internal::cpu_stall_detector> _cpu_stall_detector;
245 timer<>::set_t _timers;
246 timer<>::set_t::timer_list_t _expired_timers;
247 timer<lowres_clock>::set_t _lowres_timers;
248 timer<lowres_clock>::set_t::timer_list_t _expired_lowres_timers;
249 timer<manual_clock>::set_t _manual_timers;
250 timer<manual_clock>::set_t::timer_list_t _expired_manual_timers;
252 uint64_t _fsyncs = 0;
253 uint64_t _cxx_exceptions = 0;
254 uint64_t _abandoned_failed_futures = 0;
256 explicit task_queue(
unsigned id, sstring name, sstring shortname,
float shares);
257 int64_t _vruntime = 0;
259 int64_t _reciprocal_shares_times_2_power_32;
260 bool _active =
false;
262 sched_clock::time_point _ts;
263 sched_clock::duration _runtime = {};
264 sched_clock::duration _waittime = {};
265 sched_clock::duration _starvetime = {};
266 uint64_t _tasks_processed = 0;
267 circular_buffer<task*> _q;
271 static constexpr size_t shortname_size = 4;
273 int64_t to_vruntime(sched_clock::duration runtime)
const;
274 void set_shares(
float shares)
noexcept;
275 struct indirect_compare;
276 sched_clock::duration _time_spent_on_task_quota_violations = {};
278 void rename(sstring new_name, sstring new_shortname);
280 void register_stats();
283 boost::container::static_vector<std::unique_ptr<task_queue>, max_scheduling_groups()> _task_queues;
284 internal::scheduling_group_specific_thread_local_data _scheduling_group_specific_data;
285 int64_t _last_vruntime = 0;
286 task_queue_list _active_task_queues;
287 task_queue_list _activating_task_queues;
288 task_queue* _at_destroy_tasks;
289 task* _current_task =
nullptr;
299 std::unique_ptr<network_stack> _network_stack;
300 lowres_clock::time_point _lowres_next_timeout = lowres_clock::time_point::max();
301 std::optional<pollable_fd> _aio_eventfd;
302 const bool _reuseport;
303 circular_buffer<double> _loads;
305 sched_clock::duration _total_idle{0};
306 sched_clock::duration _total_sleep;
307 sched_clock::time_point _start_time =
now();
308 output_stream<char>::batch_flush_list_t _flush_batching;
309 std::atomic<bool> _sleeping
alignas(seastar::cache_line_size){0};
310 pthread_t _thread_id
alignas(seastar::cache_line_size) = pthread_self();
311 std::atomic<bool> _dying{
false};
312 gate _background_gate;
315 static std::chrono::nanoseconds calculate_poll_time();
316 static void block_notifier(
int);
317 bool flush_pending_aio();
318 steady_clock_type::time_point next_pending_aio() const noexcept;
319 bool reap_kernel_completions();
320 bool flush_tcp_batches();
321 void update_lowres_clocks() noexcept;
322 bool do_expire_lowres_timers() noexcept;
323 bool do_check_lowres_timers() const noexcept;
324 void expire_manual_timers() noexcept;
325 void start_aio_eventfd_loop();
326 void stop_aio_eventfd_loop();
335 bool pure_poll_once();
/// Returns the current value of the _stopped flag.
342 bool stopped() const noexcept {
return _stopped; }
/// Returns the current value of the _polls member.
344 uint64_t polls() const noexcept {
return _polls; }
353 bool pure_poll_signal()
const;
354 void handle_signal(
int signo, noncopyable_function<
void ()>&& handler);
355 void handle_signal_once(
int signo, noncopyable_function<
void ()>&& handler);
356 static void action(
int signo, siginfo_t* siginfo,
void* ignore);
357 static void failed_to_handle(
int signo);
359 struct signal_handler {
360 signal_handler(
int signo, noncopyable_function<
void ()>&& handler);
361 noncopyable_function<void ()> _handler;
363 std::atomic<uint64_t> _pending_signals;
364 std::unordered_map<int, signal_handler> _signal_handlers;
368 std::unique_ptr<thread_pool> _thread_pool;
369 friend class thread_pool;
370 friend class thread_context;
371 friend class internal::cpu_stall_detector;
375 uint64_t pending_task_count()
const;
376 void run_tasks(task_queue& tq);
377 bool have_more_tasks()
const;
378 bool posix_reuseport_detect();
379 void run_some_tasks();
380 void activate(task_queue& tq);
381 void insert_active_task_queue(task_queue* tq);
382 task_queue* pop_active_task_queue(sched_clock::time_point
now);
383 void insert_activating_task_queues();
384 void account_runtime(task_queue& tq, sched_clock::duration runtime);
385 void account_idle(sched_clock::duration idletime);
386 void allocate_scheduling_group_specific_data(
scheduling_group sg,
unsigned long key_id);
391 uint64_t tasks_processed()
const;
392 uint64_t min_vruntime()
const;
393 void request_preemption();
394 void start_handling_signal();
395 void reset_preemption_monitor();
396 void service_highres_timer() noexcept;
420 explicit
reactor(
std::
shared_ptr<
smp>
smp, alien::instance& alien,
unsigned id, reactor_backend_selector rbs, reactor_config cfg);
423 void operator=(const
reactor&) = delete;
425 static sched_clock::time_point
now() noexcept {
/// Returns the time elapsed since _start_time, computed as the current
/// now() reading minus _start_time.
428 sched_clock::duration uptime() {
429 return now() - _start_time;
/// Returns the io_queue stored in _io_queues under \p devid.
///
/// \param devid key to look up; defaults to 0.
/// \return a reference to the matching queue, or to the queue stored under
///         key 0 when \p devid has no entry of its own.
432 io_queue& get_io_queue(dev_t devid = 0) {
433 auto queue = _io_queues.find(devid);
434 if (queue == _io_queues.end()) {
// No queue for this devid: fall back to the entry under key 0.
// std::unordered_map::at throws std::out_of_range if that entry is absent too.
435 return *_io_queues.at(0);
437 return *(queue->second);
442 future<> update_bandwidth_for_queues(internal::priority_class pc, uint64_t bandwidth);
444 void rename_queues(internal::priority_class pc, sstring new_name);
446 void update_shares_for_queues(internal::priority_class pc, uint32_t shares);
448 server_socket
listen(socket_address sa, listen_options opts = {});
450 future<connected_socket>
connect(socket_address sa);
451 future<connected_socket>
connect(socket_address, socket_address, transport proto = transport::TCP);
453 pollable_fd posix_listen(socket_address sa, listen_options opts = {});
/// Returns the _reuseport flag. The member is declared `const bool`, so the
/// answer does not change after construction.
455 bool posix_reuseport_available()
const {
return _reuseport; }
457 pollable_fd make_pollable_fd(socket_address sa,
int proto);
459 future<> posix_connect(pollable_fd pfd, socket_address sa, socket_address local);
461 future<> send_all(pollable_fd_state& fd,
const void* buffer,
size_t size);
463 future<file> open_file_dma(std::string_view name,
open_flags flags, file_open_options options = {})
noexcept;
464 future<file> open_directory(std::string_view name)
noexcept;
465 future<>
make_directory(std::string_view name, file_permissions permissions = file_permissions::default_dir_permissions)
noexcept;
466 future<>
touch_directory(std::string_view name, file_permissions permissions = file_permissions::default_dir_permissions)
noexcept;
467 future<std::optional<directory_entry_type>>
file_type(std::string_view name, follow_symlink = follow_symlink::yes)
noexcept;
468 future<stat_data>
file_stat(std::string_view pathname, follow_symlink)
noexcept;
469 future<>
chown(std::string_view filepath, uid_t owner, gid_t group);
470 future<std::optional<struct group_details>>
getgrnam(std::string_view name);
471 future<uint64_t>
file_size(std::string_view pathname)
noexcept;
472 future<bool>
file_accessible(std::string_view pathname, access_flags flags)
noexcept;
473 future<bool>
file_exists(std::string_view pathname)
noexcept {
476 future<fs_type>
file_system_at(std::string_view pathname)
noexcept;
477 future<struct statvfs> statvfs(std::string_view pathname)
noexcept;
478 future<>
remove_file(std::string_view pathname)
noexcept;
479 future<>
rename_file(std::string_view old_pathname, std::string_view new_pathname)
noexcept;
480 future<>
link_file(std::string_view oldpath, std::string_view newpath)
noexcept;
481 future<>
chmod(std::string_view name, file_permissions permissions)
noexcept;
483 future<size_t> read_directory(
int fd,
char* buffer,
size_t buffer_size);
485 future<int> inotify_add_watch(
int fd, std::string_view path, uint32_t flags);
487 future<std::tuple<file_desc, file_desc>>
make_pipe();
488 future<std::tuple<pid_t, file_desc, file_desc, file_desc>>
489 spawn(std::string_view pathname,
490 std::vector<sstring> argv,
491 std::vector<sstring> env = {});
492 future<int> waitpid(pid_t pid);
493 void kill(pid_t pid,
int sig);
/// Returns the future obtained from _start_promise.
// NOTE(review): presumably resolved once the reactor has started — the code
// that fulfils _start_promise is not visible here; confirm.
497 future<> when_started() {
return _start_promise.
get_future(); }
/// Waits on _stop_requested with a timeout, using "_stopping is true" as the
/// wake-up predicate.
///
/// \param timeout duration passed through to _stop_requested.wait().
/// \return the future produced by _stop_requested.wait().
500 template <
typename Rep,
typename Period>
501 future<> wait_for_stop(std::chrono::duration<Rep, Period> timeout) {
502 return _stop_requested.
wait(timeout, [
this] {
return _stopping; });
505 void at_exit(noncopyable_function<future<> ()> func);
/// Wraps \p func in a task created under default_scheduling_group() and
/// appends it to the _q queue of _at_destroy_tasks.
///
/// \param func callable forwarded into make_task().
// NOTE(review): presumably these tasks run during reactor teardown — the
// code that drains _at_destroy_tasks is not visible here; confirm.
507 template <
typename Func>
508 void at_destroy(Func&& func) {
509 _at_destroy_tasks->_q.push_back(make_task(default_scheduling_group(), std::forward<Func>(func)));
/// Returns the _current_task pointer. It may be nullptr, which is the
/// member's initial value.
512 task* current_task()
const {
return _current_task; }
/// Overwrites _current_task with \p t. No validation is performed on \p t.
517 void set_current_task(task* t) { _current_task = t; }
519 void add_task(task* t)
noexcept;
520 void add_urgent_task(task* t)
noexcept;
522 void run_in_background(future<> f);
/// Convenience overload: invokes \p func through futurize_invoke() and hands
/// the result to the run_in_background(future<>) overload.
///
/// \param func callable forwarded into futurize_invoke().
524 template <
typename Func>
525 void run_in_background(Func&& func) {
526 run_in_background(futurize_invoke(std::forward<Func>(func)));
530 static future<> drain();
541 _idle_cpu_handler = std::move(handler);
545 void add_high_priority_task(
task*)
noexcept;
549 [[deprecated(
"Use this_shard_id")]]
550 shard_id cpu_id()
const;
554 steady_clock_type::duration total_idle_time();
555 steady_clock_type::duration total_busy_time();
556 std::chrono::nanoseconds total_steal_time();
/// Returns a read-only reference to the _io_stats member (no copy is made).
558 const io_stats& get_io_stats()
const {
return _io_stats; }
/// Returns the current value of the _abandoned_failed_futures counter.
565 uint64_t abandoned_failed_futures()
const {
return _abandoned_failed_futures; }
575 void register_poller(
pollfn* p);
576 void unregister_poller(
pollfn* p);
578 void register_metrics();
581 future<> fdatasync(
int fd)
noexcept;
597 friend class posix_file_impl;
598 friend class blockdev_file_impl;
599 friend class timer<>;
603 friend class internal::poller;
606 friend void seastar::internal::increase_thrown_exceptions_counter() noexcept;
607 friend
void report_failed_future(const
std::exception_ptr& eptr) noexcept;
608 metrics::metric_groups _metric_groups;
614 future<struct statfs> fstatfs(
int fd) noexcept;
621 void enable_timer(steady_clock_type::time_point when) noexcept;
631 void set_strict_dma(
bool value);
632 void set_bypass_fsync(
bool value);
633 void update_blocked_reactor_notify_ms(
std::chrono::milliseconds ms);
634 std::chrono::milliseconds get_blocked_reactor_notify_ms() const;
638 static void with_allow_abandoned_failed_futures(
unsigned count,
noncopyable_function<
void ()> func);
644 static std::function<void ()> get_stall_detector_report_function();
648extern __thread
reactor* local_engine;
649extern __thread
size_t task_quota;
653 return *local_engine;
/// Reports whether the thread-local local_engine pointer is non-null.
657inline bool engine_is_ready() {
658 return local_engine !=
nullptr;
661inline int hrtimer_signal() {
668extern logger seastar_logger;
Definition: circular_buffer_fixed_capacity.hh:53
Condition variable.
Definition: condition-variable.hh:73
future wait() noexcept
Definition: condition-variable.hh:214
A representation of a possibly not-yet-computed value.
Definition: future.hh:1240
Definition: reactor.hh:137
Definition: io_queue.hh:77
Definition: io_desc.hh:28
Low-resolution and efficient steady clock.
Definition: lowres_clock.hh:56
Definition: manual_clock.hh:35
Holds the metric definition.
Definition: metrics_registration.hh:94
Definition: pollable_fd.hh:62
Definition: pollable_fd.hh:136
Definition: reactor.hh:636
static void set_stall_detector_report_function(std::function< void()> report)
Definition: reactor.hh:146
sched_stats get_sched_stats() const
friend void handle_signal(int signo, noncopyable_function< void()> &&handler, bool once)
Sets a signal handler for the specified signal.
void set_idle_cpu_handler(idle_cpu_handler &&handler)
Definition: reactor.hh:540
alien::instance & alien()
Definition: reactor.hh:207
Definition: reactor.hh:182
Scheduling statistics.
Definition: reactor.hh:197
Definition: scheduling.hh:183
Identifies function calls that are accounted as a group.
Definition: scheduling.hh:285
Definition: shared_ptr.hh:507
Definition: socket_defs.hh:47
Definition: temporary_buffer.hh:67
future touch_directory(std::string_view name, file_permissions permissions=file_permissions::default_dir_permissions) noexcept
future remove_file(std::string_view name) noexcept
future rename_file(std::string_view old_name, std::string_view new_name) noexcept
future chmod(std::string_view name, file_permissions permissions) noexcept
future< uint64_t > file_size(std::string_view name) noexcept
future< std::optional< struct group_details > > getgrnam(std::string_view name)
future< bool > file_exists(std::string_view name) noexcept
future< fs_type > file_system_at(std::string_view name) noexcept
future< bool > file_accessible(std::string_view name, access_flags flags) noexcept
future link_file(std::string_view oldpath, std::string_view newpath) noexcept
future chown(std::string_view filepath, uid_t owner, gid_t group)
future< std::optional< directory_entry_type > > file_type(std::string_view name, follow_symlink follow=follow_symlink::yes) noexcept
open_flags
Definition: file-types.hh:41
future< stat_data > file_stat(std::string_view name, follow_symlink fs=follow_symlink::yes) noexcept
future make_directory(std::string_view name, file_permissions permissions=file_permissions::default_dir_permissions) noexcept
future< T > get_future() noexcept
Gets the promise's associated future.
Definition: future.hh:1926
future now()
Returns a ready future.
Definition: later.hh:35
future< std::tuple< file_desc, file_desc > > make_pipe()
server_socket listen(socket_address sa)
future< connected_socket > connect(socket_address sa)
Holds the metric_groups definition needed by a class that reports metrics.
future configure(const options &opts)
Sets the metrics configuration.
Seastar API namespace.
Definition: abort_on_ebadf.hh:26
const noncopyable_function< bool()> & work_waiting_on_reactor
Definition: idle_cpu_handler.hh:46
void handle_signal(int signo, noncopyable_function< void()> &&handler, bool once=false)
Sets a signal handler for the specified signal.
future sleep(std::chrono::duration< Rep, Period > dur)
Definition: sleep.hh:48
future destroy_scheduling_group(scheduling_group sg) noexcept
future rename_scheduling_group(scheduling_group sg, sstring new_name) noexcept
idle_cpu_handler_result
Definition: idle_cpu_handler.hh:37
@ no_more_work
The user callback has no more work to perform.
noncopyable_function< idle_cpu_handler_result(work_waiting_on_reactor poll)> idle_cpu_handler
Definition: idle_cpu_handler.hh:52
future< scheduling_group > create_scheduling_group(sstring name, float shares) noexcept
future< scheduling_group_key > scheduling_group_key_create(scheduling_group_key_config cfg) noexcept
Definition: noncopyable_function.hh:37
Configuration for the reactor.
Definition: reactor_config.hh:53
Definition: scheduling.hh:143