tangara-fw/src/database/records.cpp

/*
 * Copyright 2023 jacqueline <me@jacqueline.id.au>
 *
 * SPDX-License-Identifier: GPL-3.0-only
 */

#include "records.hpp"

#include <stdint.h>
#include <sys/_stdint.h>

#include <functional>
#include <iomanip>
#include <memory_resource>
#include <sstream>
#include <string>
#include <vector>

#include "cppbor.h"
#include "cppbor_parse.h"
#include "esp_log.h"

#include "index.hpp"
#include "komihash.h"
#include "memory_resource.hpp"
#include "track.hpp"

// As LevelDB is a key-value store, each record in the database consists of a
// key and an optional value.
//
// Values, when present, are always cbor-encoded. This is fast, compact, and
// very easy to evolve over time due to its inclusion of type information.
//
// Keys have a more complicated scheme, as for performance we rely heavily on
// LevelDB's sorted storage format. We must therefore worry about clustering of
// similar records, and the sortability of our encoding format.
//    Each kind of key consists of a single-byte prefix, then one or more
// fields separated by null (0) bytes. Each field may be cbor-encoded, or may
// use some bespoke encoding; it depends on whether we want to be able to sort
// by that field.
//    For debugging and discussion purposes, we represent field separators
// textually as '/', and write each field as its hex encoding. e.g. a data key
// for the track with id 17 would be written as 'D / 0x11'.

namespace database {

// Tag string for log output from this file.
static constexpr const char* kTag = "RECORDS";

// Single-byte prefixes that identify the kind of record a key belongs to.
static constexpr char kDataPrefix = 'D';
static constexpr char kHashPrefix = 'H';
static constexpr char kIndexPrefix = 'I';
// Null byte placed between the fields of a key.
static constexpr char kFieldSeparator = '\0';

/* 'D/' */
auto EncodeDataPrefix() -> std::string {
  return {kDataPrefix, kFieldSeparator};
}

/* 'D/ 0xACAB' - the data prefix followed by the encoded track id. */
auto EncodeDataKey(const TrackId& id) -> std::string {
  std::string key = EncodeDataPrefix();
  key += TrackIdToBytes(id);
  return key;
}

auto EncodeDataValue(const TrackData& track) -> std::string {
  cppbor::Array val{
      cppbor::Uint{track.id()},
      cppbor::Tstr{track.filepath()},
      cppbor::Uint{track.tags_hash()},
      cppbor::Bool{track.is_tombstoned()},
      cppbor::Uint{track.modified_at().first},
      cppbor::Uint{track.modified_at().second},
  };
  return val.toString();
}

/*
 * Decodes a data record value laid out as written by EncodeDataValue.
 *
 * The value must be a cbor array of exactly six items: uint id, text
 * filepath, uint tags hash, bool tombstone flag, and two uint modification
 * time fields. Returns nullptr if the bytes do not parse as cbor or do not
 * match that layout.
 */
auto ParseDataValue(const leveldb::Slice& slice) -> std::shared_ptr<TrackData> {
  auto [item, unused, err] = cppbor::parseWithViews(
      reinterpret_cast<const uint8_t*>(slice.data()), slice.size());
  if (!item || item->type() != cppbor::ARRAY) {
    return nullptr;
  }
  auto vals = item->asArray();
  if (vals->size() != 6 || vals->get(0)->type() != cppbor::UINT ||
      vals->get(1)->type() != cppbor::TSTR ||
      vals->get(2)->type() != cppbor::UINT ||
      vals->get(3)->type() != cppbor::SIMPLE ||
      vals->get(4)->type() != cppbor::UINT ||
      vals->get(5)->type() != cppbor::UINT) {
    return nullptr;
  }

  // The major-type checks above are not sufficient on their own: SIMPLE also
  // covers non-boolean values (e.g. cbor null), and libcppbor's as*()
  // downcasts return nullptr on a mismatch. Reject such records instead of
  // dereferencing a null pointer.
  const auto* path_item = vals->get(1)->asViewTstr();
  const auto* tombstoned_item = vals->get(3)->asBool();
  if (path_item == nullptr || tombstoned_item == nullptr) {
    return nullptr;
  }

  TrackId id = vals->get(0)->asUint()->unsignedValue();
  auto path = path_item->view();
  uint64_t hash = vals->get(2)->asUint()->unsignedValue();
  bool tombstoned = tombstoned_item->value();
  auto modified_at = std::make_pair<uint16_t, uint16_t>(
      vals->get(4)->asUint()->unsignedValue(),
      vals->get(5)->asUint()->unsignedValue());
  // The path is a view into the slice's bytes, so copy it into an owned
  // string before handing it to TrackData.
  return std::make_shared<TrackData>(id,
                                     std::pmr::string{path.data(), path.size()},
                                     hash, tombstoned, modified_at);
}

/* 'H/ 0xBEEF' - the hash prefix followed by the cbor-encoded hash. */
auto EncodeHashKey(const uint64_t& hash) -> std::string {
  std::string key{kHashPrefix, kFieldSeparator};
  key += cppbor::Uint{hash}.toString();
  return key;
}

/*
 * Decodes the value of a hash record, which holds an encoded track id.
 * Returns whatever BytesToTrackId yields for the slice's bytes.
 */
auto ParseHashValue(const leveldb::Slice& slice) -> std::optional<TrackId> {
  cpp::span<const char> bytes{slice.data(), slice.size()};
  return BytesToTrackId(bytes);
}

/* Encodes the value of a hash record: the bytes of the given track id. */
auto EncodeHashValue(TrackId id) -> std::string {
  std::string encoded = TrackIdToBytes(id);
  return encoded;
}

/* 'I/' - the two-byte prefix (index marker, then separator) of index keys. */
auto EncodeAllIndexesPrefix() -> std::string {
  std::string prefix(1, kIndexPrefix);
  prefix.push_back(kFieldSeparator);
  return prefix;
}

/*
 * Builds the leading part of an index key: the index prefix, then the header
 * encoded as a cbor array of [id, depth, components_hash], then a trailing
 * field separator.
 */
auto EncodeIndexPrefix(const IndexKey::Header& header) -> std::string {
  cppbor::Array header_fields{
      cppbor::Uint{header.id},
      cppbor::Uint{header.depth},
      cppbor::Uint{header.components_hash},
  };

  std::string encoded{kIndexPrefix, kFieldSeparator};
  encoded += header_fields.toString();
  encoded += kFieldSeparator;
  return encoded;
}

/*
 * 'I/0xa2/0x686921/0xb9'
 *                   ^ --- trailer
 *          ^ --- component ("hi!")
 *     ^ -------- header
 *
 *  The components *must* be encoded in a way that is easy to sort
 *  lexicographically. The header and trailer do not have this restriction, so
 *  cbor is fine.
 *
 *  We store grouping information within the header; which index, filtered
 *  components. We store disambiguation information in the trailer; just a track
 *  id for now, but could reasonably be something like 'release year' as well.
 */
auto EncodeIndexKey(const IndexKey& key) -> std::string {
  // Every index key starts with the prefix and header, ending in a separator.
  std::string encoded = EncodeIndexPrefix(key.header);

  // The component should already be UTF-8 encoded, so its bytes are written
  // as-is and terminated with a separator.
  if (key.item.has_value()) {
    encoded.append(key.item->data(), key.item->size());
    encoded += kFieldSeparator;
  }

  // The trailer, when present, is the encoded track id.
  if (key.track.has_value()) {
    encoded += TrackIdToBytes(*key.track);
  }

  return encoded;
}

/*
 * Parses an index key back into its header, optional item text and optional
 * track id.
 *
 * Returns an empty optional if the key does not start with the index prefix,
 * if its header is not a cbor array of exactly three uints, or if nothing
 * follows the header. The item is only set when it is non-empty, and the
 * track is only set when there are bytes after the item's separator.
 */
auto ParseIndexKey(const leveldb::Slice& slice) -> std::optional<IndexKey> {
  IndexKey result{};

  auto prefix = EncodeAllIndexesPrefix();
  if (!slice.starts_with(prefix)) {
    return {};
  }

  std::string key_data = slice.ToString().substr(prefix.size());
  auto [key, end_of_key, err] = cppbor::parseWithViews(
      reinterpret_cast<const uint8_t*>(key_data.data()), key_data.size());
  if (!key || key->type() != cppbor::ARRAY) {
    return {};
  }
  auto as_array = key->asArray();
  if (as_array->size() != 3 || as_array->get(0)->type() != cppbor::UINT ||
      as_array->get(1)->type() != cppbor::UINT ||
      as_array->get(2)->type() != cppbor::UINT) {
    return {};
  }
  result.header.id = as_array->get(0)->asUint()->unsignedValue();
  result.header.depth = as_array->get(1)->asUint()->unsignedValue();
  result.header.components_hash = as_array->get(2)->asUint()->unsignedValue();

  // The parser reports where the header's cbor ended; everything after that
  // point (and the separator that follows it) is the item and trailer.
  size_t header_length =
      reinterpret_cast<const char*>(end_of_key) - key_data.data();

  if (header_length == 0 || header_length >= key_data.size()) {
    return {};
  }

  // Skip the separator following the header.
  const std::string rest = key_data.substr(header_length + 1);

  // Split on the first separator with plain string operations. The track id
  // is binary cbor, so it must not go through a text-oriented stream read: a
  // newline-delimited read would cut the id short at any 0x0A byte (e.g.
  // track id 10), and an empty item would leave the stream in a failed state
  // so that the id was never read at all.
  const size_t item_end = rest.find(kFieldSeparator);

  if (item_end != 0 && !rest.empty()) {
    // With no separator (npos) the whole remainder is taken as the item.
    result.item = rest.substr(0, item_end);
  }

  if (item_end != std::string::npos && item_end + 1 < rest.size()) {
    result.track = BytesToTrackId(rest.substr(item_end + 1));
  }

  return result;
}

/* Encodes a track id as a cbor unsigned integer. */
auto TrackIdToBytes(TrackId id) -> std::string {
  cppbor::Uint encoded_id{id};
  return encoded_id.toString();
}

/*
 * Decodes a track id from its cbor encoding. Returns an empty optional if
 * the bytes do not parse, or parse to something other than an unsigned
 * integer.
 */
auto BytesToTrackId(cpp::span<const char> bytes) -> std::optional<TrackId> {
  const auto* begin = reinterpret_cast<const uint8_t*>(bytes.data());
  auto [parsed, parse_end, parse_err] = cppbor::parse(begin, bytes.size());
  if (parsed && parsed->type() == cppbor::UINT) {
    return parsed->asUint()->unsignedValue();
  }
  return {};
}

}  // namespace database
Annote E V E R Y T H I N G with license info 2 years ago			`/*`
			`* Copyright 2023 jacqueline <me@jacqueline.id.au>`
			`*`
			`* SPDX-License-Identifier: GPL-3.0-only`
			`*/`

Database init is now stable! 2 years ago			`#include "records.hpp"`

			`#include <stdint.h>`
Use libcppbor for much much nicer db encoding 2 years ago			`#include <sys/_stdint.h>`
Database init is now stable! 2 years ago
std::string -> std::pmr::string in psram 2 years ago			`#include <functional>`
add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`#include <iomanip>`
std::string -> std::pmr::string in psram 2 years ago			`#include <memory_resource>`
Database init is now stable! 2 years ago			`#include <sstream>`
add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`#include <string>`
Database init is now stable! 2 years ago			`#include <vector>`

Use libcppbor for much much nicer db encoding 2 years ago			`#include "cppbor.h"`
			`#include "cppbor_parse.h"`
Add some basic tests for the database 2 years ago			`#include "esp_log.h"`

add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`#include "index.hpp"`
			`#include "komihash.h"`
std::string -> std::pmr::string in psram 2 years ago			`#include "memory_resource.hpp"`
song -> track 2 years ago			`#include "track.hpp"`
Database init is now stable! 2 years ago
add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`// As LevelDB is a key-value store, each record in the database consists of a`
			`// key and an optional value.`
			`//`
			`// Values, when present, are always cbor-encoded. This is fast, compact, and`
			`// very easy to evolve over time due to its inclusion of type information.`
			`//`
			`// Keys have a more complicated scheme, as for performance we rely heavily on`
			`// LevelDB's sorted storage format. We must therefore worry about clustering of`
			`// similar records, and the sortability of our encoding format.`
			`// Each kind of key consists of a a single-byte prefix, then one or more`
			`// fields separated by null (0) bytes. Each field may be cbor-encoded, or may`
			`// use some bespoke encoding; it depends on whether we want to be able to sort`
			`// by that field.`
			`// For debugging and discussion purposes, we represent field separators`
			`// textually as '/', and write each field as its hex encoding. e.g. a data key`
			`// for the track with id 17 would be written as 'D / 0x11'.`

Database init is now stable! 2 years ago			`namespace database {`

			`static const char* kTag = "RECORDS";`

			`static const char kDataPrefix = 'D';`
			`static const char kHashPrefix = 'H';`
add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`static const char kIndexPrefix = 'I';`
Database init is now stable! 2 years ago			`static const char kFieldSeparator = '\0';`

add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`/* 'D/' */`
Use libcppbor for much much nicer db encoding 2 years ago			`auto EncodeDataPrefix() -> std::string {`
			`return {kDataPrefix, kFieldSeparator};`
Database init is now stable! 2 years ago			`}`

add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`/* 'D/ 0xACAB' */`
Use libcppbor for much much nicer db encoding 2 years ago			`auto EncodeDataKey(const TrackId& id) -> std::string {`
			`return EncodeDataPrefix() + TrackIdToBytes(id);`
Database init is now stable! 2 years ago			`}`

Use libcppbor for much much nicer db encoding 2 years ago			`auto EncodeDataValue(const TrackData& track) -> std::string {`
			`cppbor::Array val{`
			`cppbor::Uint{track.id()},`
			`cppbor::Tstr{track.filepath()},`
			`cppbor::Uint{track.tags_hash()},`
			`cppbor::Bool{track.is_tombstoned()},`
Add modified time to TrackData 2 years ago			`cppbor::Uint{track.modified_at().first},`
			`cppbor::Uint{track.modified_at().second},`
Use libcppbor for much much nicer db encoding 2 years ago			`};`
			`return val.toString();`
Database init is now stable! 2 years ago			`}`

Use bindey for databinding instead of hand rolling ui updates 2 years ago			`auto ParseDataValue(const leveldb::Slice& slice) -> std::shared_ptr<TrackData> {`
Use libcppbor for much much nicer db encoding 2 years ago			`auto [item, unused, err] = cppbor::parseWithViews(`
			`reinterpret_cast<const uint8_t*>(slice.data()), slice.size());`
			`if (!item \|\| item->type() != cppbor::ARRAY) {`
			`return nullptr;`
			`}`
			`auto vals = item->asArray();`
Add modified time to TrackData 2 years ago			`if (vals->size() != 6 \|\| vals->get(0)->type() != cppbor::UINT \|\|`
Use libcppbor for much much nicer db encoding 2 years ago			`vals->get(1)->type() != cppbor::TSTR \|\|`
			`vals->get(2)->type() != cppbor::UINT \|\|`
Add modified time to TrackData 2 years ago			`vals->get(3)->type() != cppbor::SIMPLE \|\|`
			`vals->get(4)->type() != cppbor::UINT \|\|`
			`vals->get(5)->type() != cppbor::UINT) {`
Database init is now stable! 2 years ago			`return {};`
			`}`
Use libcppbor for much much nicer db encoding 2 years ago			`TrackId id = vals->get(0)->asUint()->unsignedValue();`
			`auto path = vals->get(1)->asViewTstr()->view();`
			`uint64_t hash = vals->get(2)->asUint()->unsignedValue();`
			`bool tombstoned = vals->get(3)->asBool()->value();`
Add modified time to TrackData 2 years ago			`auto modified_at = std::make_pair<uint16_t, uint16_t>(`
			`vals->get(4)->asUint()->unsignedValue(),`
			`vals->get(5)->asUint()->unsignedValue());`
			`return std::make_shared<TrackData>(id,`
			`std::pmr::string{path.data(), path.size()},`
			`hash, tombstoned, modified_at);`
Database init is now stable! 2 years ago			`}`

add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`/* 'H/ 0xBEEF' */`
Use libcppbor for much much nicer db encoding 2 years ago			`auto EncodeHashKey(const uint64_t& hash) -> std::string {`
			`return std::string{kHashPrefix, kFieldSeparator} +`
			`cppbor::Uint{hash}.toString();`
Database init is now stable! 2 years ago			`}`

song -> track 2 years ago			`auto ParseHashValue(const leveldb::Slice& slice) -> std::optional<TrackId> {`
std::string -> std::pmr::string in psram 2 years ago			`return BytesToTrackId({slice.data(), slice.size()});`
Database init is now stable! 2 years ago			`}`

Use libcppbor for much much nicer db encoding 2 years ago			`auto EncodeHashValue(TrackId id) -> std::string {`
song -> track 2 years ago			`return TrackIdToBytes(id);`
Database init is now stable! 2 years ago			`}`

add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`/* 'I/' */`
Use libcppbor for much much nicer db encoding 2 years ago			`auto EncodeAllIndexesPrefix() -> std::string {`
			`return {kIndexPrefix, kFieldSeparator};`
add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`}`

Use libcppbor for much much nicer db encoding 2 years ago			`auto EncodeIndexPrefix(const IndexKey::Header& header) -> std::string {`
			`std::ostringstream out;`
			`out.put(kIndexPrefix).put(kFieldSeparator);`
			`cppbor::Array val{`
			`cppbor::Uint{header.id},`
			`cppbor::Uint{header.depth},`
			`cppbor::Uint{header.components_hash},`
			`};`
			`out << val.toString() << kFieldSeparator;`
			`return out.str();`
add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`}`

			`/*`
			`* 'I/0xa2/0x686921/0xb9'`
			`* ^ --- trailer`
			`* ^ --- component ("hi!")`
			`* ^ -------- header`
			`*`
			`* The components must be encoded in a way that is easy to sort`
			`* lexicographically. The header and footer do not have this restriction, so`
			`* cbor is fine.`
			`*`
			`* We store grouping information within the header; which index, filtered`
			`* components. We store disambiguation information in the trailer; just a track`
			`* id for now, but could reasonably be something like 'release year' as well.`
			`*/`
Use libcppbor for much much nicer db encoding 2 years ago			`auto EncodeIndexKey(const IndexKey& key) -> std::string {`
			`std::ostringstream out{};`
add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago
Use libcppbor for much much nicer db encoding 2 years ago			`out << EncodeIndexPrefix(key.header);`
add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago
			`// The component should already be UTF-8 encoded, so just write it.`
			`if (key.item) {`
Use libcppbor for much much nicer db encoding 2 years ago			`out << *key.item << kFieldSeparator;`
add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`}`

			`if (key.track) {`
Use libcppbor for much much nicer db encoding 2 years ago			`out << TrackIdToBytes(*key.track);`
add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`}`
Use libcppbor for much much nicer db encoding 2 years ago
			`return out.str();`
add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`}`

			`auto ParseIndexKey(const leveldb::Slice& slice) -> std::optional<IndexKey> {`
			`IndexKey result{};`

			`auto prefix = EncodeAllIndexesPrefix();`
Use libcppbor for much much nicer db encoding 2 years ago			`if (!slice.starts_with(prefix)) {`
add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`return {};`
			`}`

Use libcppbor for much much nicer db encoding 2 years ago			`std::string key_data = slice.ToString().substr(prefix.size());`
			`auto [key, end_of_key, err] = cppbor::parseWithViews(`
			`reinterpret_cast<const uint8_t*>(key_data.data()), key_data.size());`
			`if (!key \|\| key->type() != cppbor::ARRAY) {`
			`return {};`
add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`}`
Use libcppbor for much much nicer db encoding 2 years ago			`auto as_array = key->asArray();`
			`if (as_array->size() != 3 \|\| as_array->get(0)->type() != cppbor::UINT \|\|`
			`as_array->get(1)->type() != cppbor::UINT \|\|`
			`as_array->get(2)->type() != cppbor::UINT) {`
add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`return {};`
			`}`
Use libcppbor for much much nicer db encoding 2 years ago			`result.header.id = as_array->get(0)->asUint()->unsignedValue();`
			`result.header.depth = as_array->get(1)->asUint()->unsignedValue();`
			`result.header.components_hash = as_array->get(2)->asUint()->unsignedValue();`

			`size_t header_length =`
			`reinterpret_cast<const char*>(end_of_key) - key_data.data();`
add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago
Use libcppbor for much much nicer db encoding 2 years ago			`if (header_length == 0 \|\| header_length >= key_data.size()) {`
add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`return {};`
			`}`

			`std::istringstream in(key_data.substr(header_length + 1));`
			`std::stringbuf buffer{};`

			`in.get(buffer, kFieldSeparator);`
			`if (buffer.str().size() > 0) {`
			`result.item = buffer.str();`
			`}`

			`buffer = {};`
			`in.get(buffer);`
Fix missing track number issues 2 years ago			`std::string id_str = buffer.str();`
			`if (id_str.size() > 1) {`
			`result.track = BytesToTrackId(id_str.substr(1));`
add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat 2 years ago			`}`

			`return result;`
			`}`

Use libcppbor for much much nicer db encoding 2 years ago			`auto TrackIdToBytes(TrackId id) -> std::string {`
			`return cppbor::Uint{id}.toString();`
Database init is now stable! 2 years ago			`}`

std::string -> std::pmr::string in psram 2 years ago			`auto BytesToTrackId(cpp::span<const char> bytes) -> std::optional<TrackId> {`
Use libcppbor for much much nicer db encoding 2 years ago			`auto [res, unused, err] = cppbor::parse(`
			`reinterpret_cast<const uint8_t*>(bytes.data()), bytes.size());`
			`if (!res \|\| res->type() != cppbor::UINT) {`
Add some basic tests for the database 2 years ago			`return {};`
			`}`
Use libcppbor for much much nicer db encoding 2 years ago			`return res->asUint()->unsignedValue();`
Database init is now stable! 2 years ago			`}`

			`} // namespace database`