From 6f321649c4e620df9de540734cb633cb3d0b42a7 Mon Sep 17 00:00:00 2001
From: Robbert van der Helm <mail@robbertvanderhelm.nl>
Date: Sun, 23 May 2021 14:43:02 +0200
Subject: [PATCH] Do small vector optimization for all communication

I once read years ago somewhere on Stack Overflow that `std::vector`s
that are preinitialized to a default size would allocate their initial
capacity on the stack. This of course doesn't make any sense (run time
sized stack allocations can cause all kinds of issues), so we were still
allocating with our default 64-byte sized buffers, just not as often.
All communication buffers now use `boost::container::small_vector`, which
stores its initial capacity inline instead of on the heap.
---
 CHANGELOG.md                      |  3 +
 src/common/communication/common.h | 97 ++++++++++++++++++++++++++-----
 src/common/communication/vst3.h   | 10 ++--
 src/plugin/bridges/vst2.cpp       |  2 +-
 src/plugin/bridges/vst2.h         |  7 ++-
 src/wine-host/bridges/vst2.cpp    |  4 +-
 6 files changed, 96 insertions(+), 27 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f97dd683..e587ed1d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -40,6 +40,9 @@ Versioning](https://semver.org/spec/v2.0.0.html).
   event and parameter change queues.
 - VST2 audio processing also received the same small vector optimization to get
   rid of any last potential allocations during audio processing.
+- The same small vector optimization has been applied across yabridge's entire
+  communication architecture, meaning that most function calls should no longer
+  produce any allocations for both VST2 and VST3 plugins.
 - Changed the way mutual recursion in VST3 plugins on the plugin side works to
   counter any potential GUI related timing issues with VST3 plugins when using
   multiple instances of a plugin.
diff --git a/src/common/communication/common.h b/src/common/communication/common.h
index 980d6c90..d71cb7ab 100644
--- a/src/common/communication/common.h
+++ b/src/common/communication/common.h
@@ -30,11 +30,18 @@
 #include <boost/asio/local/stream_protocol.hpp>
 #include <boost/asio/read.hpp>
 #include <boost/asio/write.hpp>
+#include <boost/container/small_vector.hpp>
 #include <boost/filesystem.hpp>
 
+#include "../bitsery/traits/small-vector.h"
 #include "../logging/common.h"
 #include "../utils.h"
 
+// Our input and output adapters for binary serialization always expect the data
+// to be encoded in little endian format. This should not make any difference
+// currently, but this would make it possible (somewhat, it would probably still
+// be too slow) to have yabridge be usable with Wine run through Qemu on
+// big-endian architectures.
 namespace bitsery {
 struct LittleEndianConfig {
     // In case we ever want to bridge from some big-endian architecture to
@@ -56,6 +63,62 @@ template <typename B>
 using InputAdapter =
     bitsery::InputBufferAdapter<B, bitsery::LittleEndianConfig>;
 
+/**
+ * For binary serialization we use these small vectors that preallocate a small
+ * capacity on the stack as part of our binary serialization process. For most
+ * messages we don't need more than the default capacity (which would usually be
+ * 64 bytes), so we can avoid a lot of allocations in the serialization process
+ * this way.
+ */
+template <size_t N>
+using SerializationBuffer = boost::container::small_vector<uint8_t, N>;
+
+/**
+ * The base class every `SerializationBuffer<N>` derives from. Functions take
+ * this type so they can work with buffers of any initial capacity `N`.
+ */
+using SerializationBufferBase = boost::container::small_vector_base<uint8_t>;
+
+namespace boost {
+namespace asio {
+
+// These overloads mirror `boost::asio::buffer(std::vector<PodType,
+// Allocator>&)` and its `max_size_in_bytes` variant, since
+// `boost::container::small_vector_base` has an STL vector-compatible interface.
+template <typename PodType, typename Allocator>
+inline BOOST_ASIO_MUTABLE_BUFFER buffer(
+    boost::container::small_vector_base<PodType, Allocator>& data)
+    BOOST_ASIO_NOEXCEPT {
+    return BOOST_ASIO_MUTABLE_BUFFER(
+        data.size() ? &data[0] : 0, data.size() * sizeof(PodType)
+#if defined(BOOST_ASIO_ENABLE_BUFFER_DEBUGGING)
+                                        ,
+        detail::buffer_debug_check<typename boost::container::small_vector_base<
+            PodType, Allocator>::iterator>(data.begin())
+#endif  // BOOST_ASIO_ENABLE_BUFFER_DEBUGGING
+    );
+}
+
+template <typename PodType, typename Allocator>
+inline BOOST_ASIO_MUTABLE_BUFFER buffer(
+    boost::container::small_vector_base<PodType, Allocator>& data,
+    std::size_t max_size_in_bytes) BOOST_ASIO_NOEXCEPT {
+    return BOOST_ASIO_MUTABLE_BUFFER(
+        data.size() ? &data[0] : 0,
+        data.size() * sizeof(PodType) < max_size_in_bytes
+            ? data.size() * sizeof(PodType)
+            : max_size_in_bytes
+#if defined(BOOST_ASIO_ENABLE_BUFFER_DEBUGGING)
+        ,
+        detail::buffer_debug_check<typename boost::container::small_vector_base<
+            PodType, Allocator>::iterator>(data.begin())
+#endif  // BOOST_ASIO_ENABLE_BUFFER_DEBUGGING
+    );
+}
+
+}  // namespace asio
+}  // namespace boost
+
 /**
  * Serialize an object using bitsery and write it to a socket. This will write
  * both the size of the serialized object and the object itself over the socket.
@@ -74,9 +137,9 @@ using InputAdapter =
 template <typename T, typename Socket>
 inline void write_object(Socket& socket,
                          const T& object,
-                         std::vector<uint8_t>& buffer) {
+                         SerializationBufferBase& buffer) {
     const size_t size =
-        bitsery::quickSerialization<OutputAdapter<std::vector<uint8_t>>>(
+        bitsery::quickSerialization<OutputAdapter<SerializationBufferBase>>(
             buffer, object);
 
     // Tell the other side how large the object is so it can prepare a buffer
@@ -100,7 +163,7 @@ inline void write_object(Socket& socket,
  */
 template <typename T, typename Socket>
 inline void write_object(Socket& socket, const T& object) {
-    std::vector<uint8_t> buffer(64);
+    SerializationBuffer<64> buffer{};
     write_object(socket, object, buffer);
 }
 
@@ -123,7 +186,9 @@ inline void write_object(Socket& socket, const T& object) {
  * @relates write_object
  */
 template <typename T, typename Socket>
-inline T& read_object(Socket& socket, T& object, std::vector<uint8_t>& buffer) {
+inline T& read_object(Socket& socket,
+                      T& object,
+                      SerializationBufferBase& buffer) {
     // See the note above on the use of `uint64_t` instead of `size_t`
     std::array<uint64_t, 1> message_length;
     boost::asio::read(socket, boost::asio::buffer(message_length),
@@ -140,7 +205,7 @@ inline T& read_object(Socket& socket, T& object, std::vector<uint8_t>& buffer) {
                       boost::asio::transfer_exactly(size));
 
     auto [_, success] =
-        bitsery::quickDeserialization<InputAdapter<std::vector<uint8_t>>>(
+        bitsery::quickDeserialization<InputAdapter<SerializationBufferBase>>(
             {buffer.begin(), size}, object);
 
     if (BOOST_UNLIKELY(!success)) {
@@ -158,7 +223,7 @@ inline T& read_object(Socket& socket, T& object, std::vector<uint8_t>& buffer) {
  * @overload
  */
 template <typename T, typename Socket>
-inline T read_object(Socket& socket, std::vector<uint8_t>& buffer) {
+inline T read_object(Socket& socket, SerializationBufferBase& buffer) {
     T object;
     read_object<T>(socket, object, buffer);
 
@@ -173,7 +238,7 @@ inline T read_object(Socket& socket, std::vector<uint8_t>& buffer) {
  */
 template <typename T, typename Socket>
 inline T& read_object(Socket& socket, T& object) {
-    std::vector<uint8_t> buffer(64);
+    SerializationBuffer<64> buffer{};
     return read_object<T>(socket, object, buffer);
 }
 
@@ -186,7 +251,7 @@ inline T& read_object(Socket& socket, T& object) {
 template <typename T, typename Socket>
 inline T read_object(Socket& socket) {
     T object;
-    std::vector<uint8_t> buffer(64);
+    SerializationBuffer<64> buffer{};
     read_object<T>(socket, object, buffer);
 
     return object;
@@ -360,7 +425,7 @@ class SocketHandler {
      * @see SocketHandler::receive_multi
      */
     template <typename T>
-    inline void send(const T& object, std::vector<uint8_t>& buffer) {
+    inline void send(const T& object, SerializationBufferBase& buffer) {
         write_object(socket, object, buffer);
     }
 
@@ -402,7 +467,7 @@ class SocketHandler {
      * @see SocketHandler::receive_multi
      */
     template <typename T>
-    inline T receive_single(std::vector<uint8_t>& buffer) {
+    inline T receive_single(SerializationBufferBase& buffer) {
         return read_object<T>(socket, buffer);
     }
 
@@ -425,19 +490,19 @@ class SocketHandler {
      *   we'd probably want to do some more stuff after sending a reply, calling
      *   `send()` is the responsibility of this function.
      *
-     * @tparam F A function type in the form of `void(T, std::vector<uint8_t>&)`
-     *   that does something with the object, and then calls `send()`. The
-     *   reading/writing buffer is passed along so it can be reused for sending
-     *   large amounts of data.
+     * @tparam F A function type in the form of `void(T,
+     *   SerializationBufferBase&)` that does something with the object, and
+     *   then calls `send()`. The reading/writing buffer is passed along so it
+     *   can be reused for sending large amounts of data.
      *
      * @relates SocketHandler::send
      *
      * @see read_object
      * @see SocketHandler::receive_single
      */
-    template <typename T, std::invocable<T, std::vector<uint8_t>&> F>
+    template <typename T, std::invocable<T, SerializationBufferBase&> F>
     void receive_multi(F&& callback) {
-        std::vector<uint8_t> buffer{};
+        SerializationBuffer<64> buffer{};
         while (true) {
             try {
                 auto object = receive_single<T>(buffer);
diff --git a/src/common/communication/vst3.h b/src/common/communication/vst3.h
index 5642823f..bd6ab66b 100644
--- a/src/common/communication/vst3.h
+++ b/src/common/communication/vst3.h
@@ -87,7 +87,7 @@ class Vst3MessageHandler : public AdHocSocketHandler<Thread> {
     typename T::Response send_message(
         const T& object,
         std::optional<std::pair<Vst3Logger&, bool>> logging,
-        std::vector<uint8_t>& buffer) {
+        SerializationBufferBase& buffer) {
         typename T::Response response_object;
         receive_into(object, response_object, logging, buffer);
 
@@ -122,7 +122,7 @@ class Vst3MessageHandler : public AdHocSocketHandler<Thread> {
         const T& object,
         typename T::Response& response_object,
         std::optional<std::pair<Vst3Logger&, bool>> logging,
-        std::vector<uint8_t>& buffer) {
+        SerializationBufferBase& buffer) {
         using TResponse = typename T::Response;
 
         // Since a lot of messages just return a `tresult`, we can't filter out
@@ -161,7 +161,7 @@ class Vst3MessageHandler : public AdHocSocketHandler<Thread> {
         const T& object,
         typename T::Response& response_object,
         std::optional<std::pair<Vst3Logger&, bool>> logging) {
-        std::vector<uint8_t> buffer(64);
+        SerializationBuffer<64> buffer{};
         return receive_into(object, response_object, std::move(logging),
                             buffer);
     }
@@ -217,7 +217,7 @@ class Vst3MessageHandler : public AdHocSocketHandler<Thread> {
                 // every time, but on the audio processor side we store the
                 // actual variant within an object and we then use some hackery
                 // to always keep the large process data object in memory.
-                thread_local std::vector<uint8_t> persistent_buffer{};
+                thread_local SerializationBuffer<64> persistent_buffer{};
                 thread_local Request persistent_object;
 
                 auto& request =
@@ -506,7 +506,7 @@ class Vst3Sockets : public Sockets {
         typename T::Response& response_object,
         size_t instance_id,
         std::optional<std::pair<Vst3Logger&, bool>> logging) {
-        thread_local std::vector<uint8_t> audio_processor_buffer{};
+        thread_local SerializationBuffer<64> audio_processor_buffer{};
 
         return audio_processor_sockets.at(instance_id)
             .receive_into(object, response_object, logging,
diff --git a/src/plugin/bridges/vst2.cpp b/src/plugin/bridges/vst2.cpp
index 09abc335..9d043858 100644
--- a/src/plugin/bridges/vst2.cpp
+++ b/src/plugin/bridges/vst2.cpp
@@ -593,7 +593,7 @@ void Vst2PluginBridge::do_process(T** inputs, T** outputs, int sample_frames) {
     }
 
     // The inputs and outputs arrays should be `[num_inputs][sample_frames]` and
-    // `[num_outputs][sample_frames]` floats large respectfully.
+    // `[num_outputs][sample_frames]` floats large respectively.
     std::vector<std::vector<T>> input_buffers(plugin.numInputs,
                                               std::vector<T>(sample_frames));
     for (int channel = 0; channel < plugin.numInputs; channel++) {
diff --git a/src/plugin/bridges/vst2.h b/src/plugin/bridges/vst2.h
index 441d2595..11f29b31 100644
--- a/src/plugin/bridges/vst2.h
+++ b/src/plugin/bridges/vst2.h
@@ -154,10 +154,11 @@ class Vst2PluginBridge : PluginBridge<Vst2Sockets<std::jthread>> {
     Vst2Logger logger;
 
     /**
-     * A scratch buffer for sending and receiving data during `process`,
-     * `processReplacing` and `processDoubleReplacing` calls.
+     * A scratch buffer for sending and receiving binary data during the
+     * `process()`, `processReplacing()` and `processDoubleReplacing()` calls.
+     * This buffer also needs to stay alive.
      */
-    std::vector<uint8_t> process_buffer;
+    SerializationBuffer<0> process_buffer;
 
     /**
      * We'll periodically synchronize the Wine host's audio thread priority with
diff --git a/src/wine-host/bridges/vst2.cpp b/src/wine-host/bridges/vst2.cpp
index 78ff9fd6..709f5826 100644
--- a/src/wine-host/bridges/vst2.cpp
+++ b/src/wine-host/bridges/vst2.cpp
@@ -179,7 +179,7 @@ Vst2Bridge::Vst2Bridge(MainContext& main_context,
 
     parameters_handler = Win32Thread([&]() {
         sockets.host_vst_parameters.receive_multi<Parameter>(
-            [&](Parameter request, std::vector<uint8_t>& buffer) {
+            [&](Parameter request, SerializationBufferBase& buffer) {
                 // Both `getParameter` and `setParameter` functions are passed
                 // through on this socket since they have a lot of overlap. The
                 // presence of the `value` field tells us which one we're
@@ -216,7 +216,7 @@ Vst2Bridge::Vst2Bridge(MainContext& main_context,
             plugin->numOutputs);
 
         sockets.host_vst_process_replacing.receive_multi<AudioBuffers>(
-            [&](AudioBuffers request, std::vector<uint8_t>& buffer) {
+            [&](AudioBuffers request, SerializationBufferBase& buffer) {
                 // Since the value cannot change during this processing cycle,
                 // we'll send the current transport information as part of the
                 // request so we prefetch it to avoid unnecessary callbacks from