Seastar
High performance C++ framework for concurrent servers
fair_queue.hh
/*
 * This file is open source software, licensed to you under the terms
 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
 * distributed with this work for additional information regarding copyright
 * ownership. You may not use this file except in compliance with the License.
 *
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Copyright (C) 2016 ScyllaDB
 */
#pragma once

#include <boost/intrusive/slist.hpp>
#include <seastar/core/sstring.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/circular_buffer.hh>
#include <seastar/util/shared_token_bucket.hh>

#include <chrono>
#include <cstdint>
#include <functional>
#include <optional>
#include <queue>
#include <unordered_set>

namespace bi = boost::intrusive;

namespace seastar {

/// \brief describes a request that passes through the fair_queue.
class fair_queue_ticket {
    uint32_t _weight = 0;
    uint32_t _size = 0;
public:
    fair_queue_ticket(uint32_t weight, uint32_t size) noexcept;
    fair_queue_ticket() noexcept {}
    fair_queue_ticket operator+(fair_queue_ticket desc) const noexcept;
    fair_queue_ticket operator-(fair_queue_ticket desc) const noexcept;
    fair_queue_ticket& operator+=(fair_queue_ticket desc) noexcept;
    fair_queue_ticket& operator-=(fair_queue_ticket desc) noexcept;
    bool operator==(const fair_queue_ticket& desc) const noexcept;

    explicit operator bool() const noexcept;
    bool is_non_zero() const noexcept;

    friend std::ostream& operator<<(std::ostream& os, fair_queue_ticket t);

    float normalize(fair_queue_ticket axis) const noexcept;

    /*
     * For both dimensions checks if the first rover is ahead of the
     * second and returns the difference. If behind, returns zero.
     */
    friend fair_queue_ticket wrapping_difference(const fair_queue_ticket& a, const fair_queue_ticket& b) noexcept;
};


class fair_queue_entry {
public:
    // The capacity_t represents tokens each entry needs to get dispatched, in
    // a 'normalized' form -- converted from floating-point to fixed-point number
    // and scaled according to fair-group's token-bucket duration
    using capacity_t = uint64_t;
    friend class fair_queue;

private:
    capacity_t _capacity;
    bi::slist_member_hook<> _hook;

public:
    fair_queue_entry(capacity_t c) noexcept
        : _capacity(c) {}
    using container_list_t = bi::slist<fair_queue_entry,
            bi::constant_time_size<false>,
            bi::cache_last<true>,
            bi::member_hook<fair_queue_entry, bi::slist_member_hook<>, &fair_queue_entry::_hook>>;

    capacity_t capacity() const noexcept { return _capacity; }
};
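
// Entries are linked intrusively (via _hook) into their priority class's list,
// so the fair queue does not allocate per-request nodes; the caller must keep
// each entry alive until it has been dispatched or cancelled.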

/// \brief Group of queues class.
class fair_group {
public:
    using capacity_t = fair_queue_entry::capacity_t;
    using clock_type = std::chrono::steady_clock;

    /*
     * tldr; The math
     *
     * Bw, Br -- write/read bandwidth (bytes per second)
     * Ow, Or -- write/read iops (ops per second)
     *
     * xx_max -- their maximum values (configured)
     *
     * Throttling formula:
     *
     *    Bw/Bw_max + Br/Br_max + Ow/Ow_max + Or/Or_max <= K
     *
     * where K is a scalar value <= 1.0 (also configured)
     *
     * Bandwidth is the time derivative of bytes, iops is the time derivative
     * of ops, i.e. Bx = d(bx)/dt, Ox = d(ox)/dt. Then the formula turns into
     *
     *    d(bw/Bw_max + br/Br_max + ow/Ow_max + or/Or_max)/dt <= K
     *
     * Fair queue tickets are {w, s} weight-size pairs that are
     *
     *    s = read_base_count * br, for reads
     *        Br_max/Bw_max * read_base_count * bw, for writes
     *
     *    w = read_base_count, for reads
     *        Or_max/Ow_max * read_base_count, for writes
     *
     * Thus the formula turns into
     *
     *    d(sum(w/W + s/S))/dt <= K
     *
     * where {w, s} is the ticket value of a request and sum accumulates the
     * ticket values of all the requests seen so far, and {W, S} is the ticket
     * value that corresponds to a virtual summary of Or_max requests of
     * Br_max size total.
     */
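
    /*
     * Illustrative example (hypothetical numbers, not Seastar defaults):
     * with Br_max = 2 GB/s, Bw_max = 1 GB/s, Or_max = 200k ops/s,
     * Ow_max = 100k ops/s and read_base_count = 128, a 64 KiB write gets
     *
     *    s = Br_max/Bw_max * 128 * 65536 = 2 * 128 * 65536
     *    w = Or_max/Ow_max * 128 = 2 * 128 = 256
     *
     * i.e. it costs twice the ticket of a 64 KiB read in both dimensions,
     * reflecting that this hypothetical device writes at half its read speed.
     */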

    /*
     * The normalization results in a float of the 2^-30 seconds order of
     * magnitude. To avoid inventing floating-point atomic arithmetic, the
     * result is converted to an integer by multiplying it by a factor that's
     * large enough to turn these values into a non-zero integer.
     *
     * Also, the rates in bytes/sec, when adjusted by the io-queue according
     * to the multipliers, become too large to be stored in a 32-bit ticket
     * value. Thus the rate resolution is applied. The token bucket is
     * configured with the time period over which the speeds from the formula
     * above are taken.
     */

    static constexpr float fixed_point_factor = float(1 << 24);
    using rate_resolution = std::milli;
    using token_bucket_t = internal::shared_token_bucket<capacity_t, rate_resolution, internal::capped_release::no>;

private:

    /*
     * The dF/dt <= K limitation is managed by a modified token bucket
     * algorithm where the tokens are ticket.normalize(cost_capacity) and
     * the refill rate is K.
     *
     * The token bucket algorithm must have a limit on the number of tokens
     * accumulated. Here it's configured so that it accumulates for the
     * latency_goal duration.
     *
     * The replenish threshold is the minimal number of tokens to put back.
     * It's reserved for future use to reduce the load on the replenish
     * timestamp.
     *
     * The timestamp, in turn, is the time when the bucket was last
     * replenished. Every time a shard tries to get tokens from the bucket
     * it first tries to convert the time that has passed since this
     * timestamp into more tokens in the bucket.
     */

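    /*
     * Illustration (assumed numbers): if the refill rate corresponds to 1000
     * tokens per millisecond and 3 ms have passed since replenished_ts, a
     * replenish first adds up to 3000 tokens (never exceeding the bucket's
     * limit), and only then does the shard try to take what it needs.
     */
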
    token_bucket_t _token_bucket;
    const capacity_t _per_tick_threshold;

public:

    // Convert internal capacity value back into the real token
    static double capacity_tokens(capacity_t cap) noexcept {
        return (double)cap / fixed_point_factor / token_bucket_t::rate_cast(std::chrono::seconds(1)).count();
    }

    // Convert floating-point tokens into the token bucket capacity
    static capacity_t tokens_capacity(double tokens) noexcept {
        return tokens * token_bucket_t::rate_cast(std::chrono::seconds(1)).count() * fixed_point_factor;
    }
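
    // Note: tokens_capacity() and capacity_tokens() are mutually inverse
    // conversions, up to the integer rounding of capacity_t, so
    // capacity_tokens(tokens_capacity(x)) is approximately x.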

    auto capacity_duration(capacity_t cap) const noexcept {
        return _token_bucket.duration_for(cap);
    }

    struct config {
        sstring label = "";
        /*
         * There are two "min" values that can be configured. The former is
         * the minimal weight:size pair that the upper layer is going to
         * submit. However, it can submit _larger_ values, and the fair queue
         * must accept values at least as large as the latter pair (it can
         * accept even larger values, of course).
         */
        double min_tokens = 0.0;
        double limit_min_tokens = 0.0;
        float rate_factor = 1.0;
        std::chrono::duration<double> rate_limit_duration = std::chrono::milliseconds(1);
    };
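
    /*
     * For instance (hypothetical values): min_tokens = 1.0 together with
     * limit_min_tokens = 4.0 would mean the upper layer promises not to
     * submit requests cheaper than 1.0 tokens, while the group must still
     * be able to admit a single request costing up to 4.0 tokens.
     */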

    explicit fair_group(config cfg, unsigned nr_queues);
    fair_group(fair_group&&) = delete;

    capacity_t maximum_capacity() const noexcept { return _token_bucket.limit(); }
    capacity_t per_tick_grab_threshold() const noexcept { return _per_tick_threshold; }
    capacity_t grab_capacity(capacity_t cap) noexcept;
    clock_type::time_point replenished_ts() const noexcept { return _token_bucket.replenished_ts(); }
    void replenish_capacity(clock_type::time_point now) noexcept;
    void maybe_replenish_capacity(clock_type::time_point& local_ts) noexcept;

    capacity_t capacity_deficiency(capacity_t from) const noexcept;

    std::chrono::duration<double> rate_limit_duration() const noexcept {
        std::chrono::duration<double, rate_resolution> dur((double)_token_bucket.limit() / _token_bucket.rate());
        return std::chrono::duration_cast<std::chrono::duration<double>>(dur);
    }

    const token_bucket_t& token_bucket() const noexcept { return _token_bucket; }
};

/// \brief Fair queuing class.
class fair_queue {
public:
    /// \brief Fair Queue configuration structure.
    struct config {
        sstring label = "";
        std::chrono::microseconds tau = std::chrono::milliseconds(5);
    };

    using class_id = unsigned int;
    class priority_class_data;
    using capacity_t = fair_group::capacity_t;
    using signed_capacity_t = std::make_signed_t<capacity_t>;

private:
    using clock_type = std::chrono::steady_clock;
    using priority_class_ptr = priority_class_data*;
    struct class_compare {
        bool operator() (const priority_class_ptr& lhs, const priority_class_ptr& rhs) const noexcept;
    };

    class priority_queue : public std::priority_queue<priority_class_ptr, std::vector<priority_class_ptr>, class_compare> {
        using super = std::priority_queue<priority_class_ptr, std::vector<priority_class_ptr>, class_compare>;
    public:
        void reserve(size_t len) {
            c.reserve(len);
        }

        void assert_enough_capacity() const noexcept {
            assert(c.size() < c.capacity());
        }
    };

    config _config;
    fair_group& _group;
    clock_type::time_point _group_replenish;
    fair_queue_ticket _resources_executing;
    fair_queue_ticket _resources_queued;
    priority_queue _handles;
    std::vector<std::unique_ptr<priority_class_data>> _priority_classes;
    size_t _nr_classes = 0;
    capacity_t _last_accumulated = 0;

    /*
     * When the shared capacity is over, the local queue delays
     * further dispatching until better times.
     *
     * \head -- the value the group head rover is expected to cross
     * \cap -- the capacity that's accounted on the group
     *
     * The last field is needed to "rearm" the wait in case the
     * queue decides that it wants to dispatch another capacity
     * in the middle of the waiting.
     */
    struct pending {
        capacity_t head;
        capacity_t cap;

        pending(capacity_t t, capacity_t c) noexcept : head(t), cap(c) {}
    };

    std::optional<pending> _pending;

    void push_priority_class(priority_class_data& pc) noexcept;
    void push_priority_class_from_idle(priority_class_data& pc) noexcept;
    void pop_priority_class(priority_class_data& pc) noexcept;
    void plug_priority_class(priority_class_data& pc) noexcept;
    void unplug_priority_class(priority_class_data& pc) noexcept;

    enum class grab_result { grabbed, cant_preempt, pending };
    grab_result grab_capacity(const fair_queue_entry& ent) noexcept;
    grab_result grab_pending_capacity(const fair_queue_entry& ent) noexcept;
public:
    explicit fair_queue(fair_group& shared, config cfg);
    ~fair_queue();

    sstring label() const noexcept { return _config.label; }

    void register_priority_class(class_id c, uint32_t shares);

    void unregister_priority_class(class_id c);

    void update_shares_for_class(class_id c, uint32_t new_shares);

    fair_queue_ticket resources_currently_executing() const;
    fair_queue_ticket resources_currently_waiting() const;

    capacity_t tokens_capacity(double tokens) const noexcept {
        return _group.tokens_capacity(tokens);
    }

    capacity_t maximum_capacity() const noexcept {
        return _group.maximum_capacity();
    }

    void queue(class_id c, fair_queue_entry& ent) noexcept;

    void plug_class(class_id c) noexcept;
    void unplug_class(class_id c) noexcept;

    void notify_request_finished(fair_queue_entry::capacity_t cap) noexcept;
    void notify_request_cancelled(fair_queue_entry& ent) noexcept;

    /// Try to execute new requests if there is capacity left in the queue.
    void dispatch_requests(std::function<void(fair_queue_entry&)> cb);

    clock_type::time_point next_pending_aio() const noexcept;

    std::vector<seastar::metrics::impl::metric_definition_impl> metrics(class_id c);
};

}

#if FMT_VERSION >= 90000
template <> struct fmt::formatter<seastar::fair_queue_ticket> : fmt::ostream_formatter {};
#endif
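
For orientation, the following is a minimal sketch of how the declarations above fit together. It is illustrative only: the real user of this interface is Seastar's io_queue, a fair_group is normally shared between shards with one fair_queue per shard, and plugging, cancellation and metrics are left out. The function name and all concrete values are invented for the example.

#include <seastar/core/fair_queue.hh>

void fair_queue_example() {
    // One shared group (token bucket) and one local queue attached to it.
    seastar::fair_group group(seastar::fair_group::config{}, 1 /* nr_queues */);
    seastar::fair_queue fq(group, seastar::fair_queue::config{});

    // Each stream of requests gets a priority class with some share count.
    const seastar::fair_queue::class_id cid = 0;
    fq.register_priority_class(cid, 100 /* shares */);

    // Wrap a request cost into an entry; tokens_capacity() converts a
    // floating-point token count into the queue's fixed-point capacity units.
    seastar::fair_queue_entry entry(fq.tokens_capacity(1.5));
    fq.queue(cid, entry);

    // Typically driven by the reactor: dispatch whatever fits into the
    // group's current capacity and start the corresponding I/O.
    fq.dispatch_requests([] (seastar::fair_queue_entry& ent) {
        // start the I/O described by `ent` here
    });

    // When a request completes, its capacity is returned to the queue.
    fq.notify_request_finished(entry.capacity());

    fq.unregister_priority_class(cid);
}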