Do small vector optimization for all communication

I once read years ago somewhere on Stack Overflow that `std::vector`s that are preinitialized to a default size would allocate their initial capacity on the stack. This of course doesn't make any sense (run time sized stack allocations can cause all kinds of issues), so we were still allocating on the heap with our default 64-byte sized buffers, just not as often.
2026-05-07 20:10:13 +02:00 · 2021-05-23 14:43:02 +02:00
parent a5ba3bdf33
commit 6f321649c4
6 changed files with 96 additions and 27 deletions
@@ -40,6 +40,9 @@ Versioning](https://semver.org/spec/v2.0.0.html).
  event and parameter change queues.
 - VST2 audio processing also received the same small vector optimization to get
  rid of any last potential allocations during audio processing.
+- The same small vector optimization has been applied across yabridge's entire
+  communication architecture, meaning that most function calls should no longer
+  produce any allocations for both VST2 and VST3 plugins.
 - Changed the way mutual recursion in VST3 plugins on the plugin side works to
  counter any potential GUI related timing issues with VST3 plugins when using
  multiple instances of a plugin.
@@ -30,11 +30,18 @@
 #include <boost/asio/local/stream_protocol.hpp>
 #include <boost/asio/read.hpp>
 #include <boost/asio/write.hpp>
+#include <boost/container/small_vector.hpp>
 #include <boost/filesystem.hpp>

+#include "../bitsery/traits/small-vector.h"
 #include "../logging/common.h"
 #include "../utils.h"

+// Our input and output adapters for binary serialization always expect the data
+// to be encoded in little endian format. This should not make any difference
+// currently, but this would make it possible (somewhat, it would probably still
+// be too slow) to have yabridge be usable with Wine run through Qemu on
+// big-endian architectures.
 namespace bitsery {
 struct LittleEndianConfig {
    // In case we ever want to bridge from some big-endian architecture to
@@ -56,6 +63,62 @@ template <typename B>
 using InputAdapter =
    bitsery::InputBufferAdapter<B, bitsery::LittleEndianConfig>;

+/**
+ * For binary serialization we use these small vectors that preallocate a small
+ * capacity on the stack as part of our binary serialization process. For most
+ * messages we don't need more than the default capacity (which would usually be
+ * 64 bytes), so we can avoid a lot of allocations in the serialization process
+ * this way.
+ */
+template <size_t N>
+using SerializationBuffer = boost::container::small_vector<uint8_t, N>;
+
+/**
+ * The base class every `SerializationBuffer<N>` derives from. Taking this type
+ * lets functions work with these buffers regardless of their initial capacity.
+ */
+using SerializationBufferBase = boost::container::small_vector_base<uint8_t>;
+
+namespace boost {
+namespace asio {
+
+template <typename PodType, typename Allocator>
+inline BOOST_ASIO_MUTABLE_BUFFER buffer(
+    boost::container::small_vector_base<PodType, Allocator>& data)
+    BOOST_ASIO_NOEXCEPT {
+    return BOOST_ASIO_MUTABLE_BUFFER(
+        data.size() ? &data[0] : 0, data.size() * sizeof(PodType)
+#if defined(BOOST_ASIO_ENABLE_BUFFER_DEBUGGING)
+                                        ,
+        detail::buffer_debug_check<
+            typename std::vector<PodType, Allocator>::iterator>(data.begin())
+#endif  // BOOST_ASIO_ENABLE_BUFFER_DEBUGGING
+    );
+}
+
+// These are copied verbatim from
+// `boost::asio::buffer(std::vector<PodType, Allocator>&, std::size_t)`, since
+// `boost::container::small_vector` is compatible with the STL vector.
+template <typename PodType, typename Allocator>
+inline BOOST_ASIO_MUTABLE_BUFFER buffer(
+    boost::container::small_vector_base<PodType, Allocator>& data,
+    std::size_t max_size_in_bytes) BOOST_ASIO_NOEXCEPT {
+    return BOOST_ASIO_MUTABLE_BUFFER(
+        data.size() ? &data[0] : 0,
+        data.size() * sizeof(PodType) < max_size_in_bytes
+            ? data.size() * sizeof(PodType)
+            : max_size_in_bytes
+#if defined(BOOST_ASIO_ENABLE_BUFFER_DEBUGGING)
+        ,
+        detail::buffer_debug_check<
+            typename std::vector<PodType, Allocator>::iterator>(data.begin())
+#endif  // BOOST_ASIO_ENABLE_BUFFER_DEBUGGING
+    );
+}
+
+}  // namespace asio
+}  // namespace boost
+
 /**
 * Serialize an object using bitsery and write it to a socket. This will write
 * both the size of the serialized object and the object itself over the socket.
@@ -74,9 +137,9 @@ using InputAdapter =
 template <typename T, typename Socket>
 inline void write_object(Socket& socket,
                         const T& object,
-                         std::vector<uint8_t>& buffer) {
+                         SerializationBufferBase& buffer) {
    const size_t size =
-        bitsery::quickSerialization<OutputAdapter<std::vector<uint8_t>>>(
+        bitsery::quickSerialization<OutputAdapter<SerializationBufferBase>>(
            buffer, object);

    // Tell the other side how large the object is so it can prepare a buffer
@@ -100,7 +163,7 @@ inline void write_object(Socket& socket,
 */
 template <typename T, typename Socket>
 inline void write_object(Socket& socket, const T& object) {
-    std::vector<uint8_t> buffer(64);
+    SerializationBuffer<64> buffer{};
    write_object(socket, object, buffer);
 }

@@ -123,7 +186,9 @@ inline void write_object(Socket& socket, const T& object) {
 * @relates write_object
 */
 template <typename T, typename Socket>
-inline T& read_object(Socket& socket, T& object, std::vector<uint8_t>& buffer) {
+inline T& read_object(Socket& socket,
+                      T& object,
+                      SerializationBufferBase& buffer) {
    // See the note above on the use of `uint64_t` instead of `size_t`
    std::array<uint64_t, 1> message_length;
    boost::asio::read(socket, boost::asio::buffer(message_length),
@@ -140,7 +205,7 @@ inline T& read_object(Socket& socket, T& object, std::vector<uint8_t>& buffer) {
                      boost::asio::transfer_exactly(size));

    auto [_, success] =
-        bitsery::quickDeserialization<InputAdapter<std::vector<uint8_t>>>(
+        bitsery::quickDeserialization<InputAdapter<SerializationBufferBase>>(
            {buffer.begin(), size}, object);

    if (BOOST_UNLIKELY(!success)) {
@@ -158,7 +223,7 @@ inline T& read_object(Socket& socket, T& object, std::vector<uint8_t>& buffer) {
 * @overload
 */
 template <typename T, typename Socket>
-inline T read_object(Socket& socket, std::vector<uint8_t>& buffer) {
+inline T read_object(Socket& socket, SerializationBufferBase& buffer) {
    T object;
    read_object<T>(socket, object, buffer);

@@ -173,7 +238,7 @@ inline T read_object(Socket& socket, std::vector<uint8_t>& buffer) {
 */
 template <typename T, typename Socket>
 inline T& read_object(Socket& socket, T& object) {
-    std::vector<uint8_t> buffer(64);
+    SerializationBuffer<64> buffer{};
    return read_object<T>(socket, object, buffer);
 }

@@ -186,7 +251,7 @@ inline T& read_object(Socket& socket, T& object) {
 template <typename T, typename Socket>
 inline T read_object(Socket& socket) {
    T object;
-    std::vector<uint8_t> buffer(64);
+    SerializationBuffer<64> buffer{};
    read_object<T>(socket, object, buffer);

    return object;
@@ -360,7 +425,7 @@ class SocketHandler {
     * @see SocketHandler::receive_multi
     */
    template <typename T>
-    inline void send(const T& object, std::vector<uint8_t>& buffer) {
+    inline void send(const T& object, SerializationBufferBase& buffer) {
        write_object(socket, object, buffer);
    }

@@ -402,7 +467,7 @@ class SocketHandler {
     * @see SocketHandler::receive_multi
     */
    template <typename T>
-    inline T receive_single(std::vector<uint8_t>& buffer) {
+    inline T receive_single(SerializationBufferBase& buffer) {
        return read_object<T>(socket, buffer);
    }

@@ -425,19 +490,19 @@ class SocketHandler {
     *   we'd probably want to do some more stuff after sending a reply, calling
     *   `send()` is the responsibility of this function.
     *
-     * @tparam F A function type in the form of `void(T, std::vector<uint8_t>&)`
-     *   that does something with the object, and then calls `send()`. The
-     *   reading/writing buffer is passed along so it can be reused for sending
-     *   large amounts of data.
+     * @tparam F A function type in the form of `void(T,
+     *   SerializationBufferBase&)` that does something with the object, and
+     *   then calls `send()`. The reading/writing buffer is passed along so it
+     *   can be reused for sending large amounts of data.
     *
     * @relates SocketHandler::send
     *
     * @see read_object
     * @see SocketHandler::receive_single
     */
-    template <typename T, std::invocable<T, std::vector<uint8_t>&> F>
+    template <typename T, std::invocable<T, SerializationBufferBase&> F>
    void receive_multi(F&& callback) {
-        std::vector<uint8_t> buffer{};
+        SerializationBuffer<64> buffer{};
        while (true) {
            try {
                auto object = receive_single<T>(buffer);
@@ -87,7 +87,7 @@ class Vst3MessageHandler : public AdHocSocketHandler<Thread> {
    typename T::Response send_message(
        const T& object,
        std::optional<std::pair<Vst3Logger&, bool>> logging,
-        std::vector<uint8_t>& buffer) {
+        SerializationBufferBase& buffer) {
        typename T::Response response_object;
        receive_into(object, response_object, logging, buffer);

@@ -122,7 +122,7 @@ class Vst3MessageHandler : public AdHocSocketHandler<Thread> {
        const T& object,
        typename T::Response& response_object,
        std::optional<std::pair<Vst3Logger&, bool>> logging,
-        std::vector<uint8_t>& buffer) {
+        SerializationBufferBase& buffer) {
        using TResponse = typename T::Response;

        // Since a lot of messages just return a `tresult`, we can't filter out
@@ -161,7 +161,7 @@ class Vst3MessageHandler : public AdHocSocketHandler<Thread> {
        const T& object,
        typename T::Response& response_object,
        std::optional<std::pair<Vst3Logger&, bool>> logging) {
-        std::vector<uint8_t> buffer(64);
+        SerializationBuffer<64> buffer{};
        return receive_into(object, response_object, std::move(logging),
                            buffer);
    }
@@ -217,7 +217,7 @@ class Vst3MessageHandler : public AdHocSocketHandler<Thread> {
                // every time, but on the audio processor side we store the
                // actual variant within an object and we then use some hackery
                // to always keep the large process data object in memory.
-                thread_local std::vector<uint8_t> persistent_buffer{};
+                thread_local SerializationBuffer<64> persistent_buffer{};
                thread_local Request persistent_object;

                auto& request =
@@ -506,7 +506,7 @@ class Vst3Sockets : public Sockets {
        typename T::Response& response_object,
        size_t instance_id,
        std::optional<std::pair<Vst3Logger&, bool>> logging) {
-        thread_local std::vector<uint8_t> audio_processor_buffer{};
+        thread_local SerializationBuffer<64> audio_processor_buffer{};

        return audio_processor_sockets.at(instance_id)
            .receive_into(object, response_object, logging,
@@ -593,7 +593,7 @@ void Vst2PluginBridge::do_process(T** inputs, T** outputs, int sample_frames) {
    }

    // The inputs and outputs arrays should be `[num_inputs][sample_frames]` and
-    // `[num_outputs][sample_frames]` floats large respectfully.
+    // `[num_outputs][sample_frames]` floats large respectively.
    std::vector<std::vector<T>> input_buffers(plugin.numInputs,
                                              std::vector<T>(sample_frames));
    for (int channel = 0; channel < plugin.numInputs; channel++) {
@@ -154,10 +154,11 @@ class Vst2PluginBridge : PluginBridge<Vst2Sockets<std::jthread>> {
    Vst2Logger logger;

    /**
-     * A scratch buffer for sending and receiving data during `process`,
-     * `processReplacing` and `processDoubleReplacing` calls.
+     * A scratch buffer for sending and receiving binary data during the
+     * `process()`, `processReplacing()` and `processDoubleReplacing()` calls.
+     * This buffer also needs to stay alive.
     */
-    std::vector<uint8_t> process_buffer;
+    SerializationBuffer<0> process_buffer;

    /**
     * We'll periodically synchronize the Wine host's audio thread priority with
@@ -179,7 +179,7 @@ Vst2Bridge::Vst2Bridge(MainContext& main_context,

    parameters_handler = Win32Thread([&]() {
        sockets.host_vst_parameters.receive_multi<Parameter>(
-            [&](Parameter request, std::vector<uint8_t>& buffer) {
+            [&](Parameter request, SerializationBufferBase& buffer) {
                // Both `getParameter` and `setParameter` functions are passed
                // through on this socket since they have a lot of overlap. The
                // presence of the `value` field tells us which one we're
@@ -216,7 +216,7 @@ Vst2Bridge::Vst2Bridge(MainContext& main_context,
            plugin->numOutputs);

        sockets.host_vst_process_replacing.receive_multi<AudioBuffers>(
-            [&](AudioBuffers request, std::vector<uint8_t>& buffer) {
+            [&](AudioBuffers request, SerializationBufferBase& buffer) {
                // Since the value cannot change during this processing cycle,
                // we'll send the current transport information as part of the
                // request so we prefetch it to avoid unnecessary callbacks from