#include "wayy_db/string_column.hpp"

#include <algorithm>
#include <bit>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <string>
#include <system_error>

namespace fs = std::filesystem;

namespace wayy_db {

namespace {

// Grows a validity bitmap so it has one bit per row for `rows` rows.
// Newly added bytes are zero, i.e. the new rows start out as null until
// the caller marks them valid.
template <typename Bitmap>
void grow_bitmap(Bitmap& bits, size_t rows) {
    const size_t needed_bytes = (rows + 7) / 8;
    if (bits.size() < needed_bytes) {
        bits.resize(needed_bytes, 0);
    }
}

// Writes one column file: a uint64 prefix followed by `payload_bytes` raw
// bytes. Throws WayyException if the file cannot be created or if any write
// fails (e.g. disk full), so a truncated file is never reported as success.
void write_prefixed_file(const std::string& path, const std::string& kind,
                         uint64_t prefix, const void* payload,
                         size_t payload_bytes) {
    std::ofstream f(path, std::ios::binary);
    if (!f) throw WayyException("Failed to create " + kind + " file: " + path);

    f.write(reinterpret_cast<const char*>(&prefix), sizeof(prefix));
    if (payload_bytes > 0) {
        f.write(static_cast<const char*>(payload),
                static_cast<std::streamsize>(payload_bytes));
    }
    f.flush();
    if (!f) throw WayyException("Failed to write " + kind + " file: " + path);
}

// Reads the uint64 prefix of a column file and checks that the file is large
// enough to actually hold `prefix * element_size` payload bytes. This keeps a
// corrupt prefix from triggering an enormous allocation.
uint64_t read_length_prefix(std::ifstream& f, const std::string& path,
                            uint64_t element_size) {
    uint64_t prefix = 0;
    f.read(reinterpret_cast<char*>(&prefix), sizeof(prefix));
    if (!f) throw WayyException("Failed to read length prefix: " + path);

    std::error_code ec;
    const auto file_bytes = fs::file_size(path, ec);
    if (ec || file_bytes < sizeof(prefix) ||
        prefix > (file_bytes - sizeof(prefix)) / element_size) {
        throw WayyException("Corrupt or truncated file: " + path);
    }
    return prefix;
}

// Reads exactly `num_bytes` into `dst`; throws WayyException on a short read.
void read_exact(std::ifstream& f, void* dst, uint64_t num_bytes,
                const std::string& path) {
    if (num_bytes == 0) return;
    f.read(static_cast<char*>(dst), static_cast<std::streamsize>(num_bytes));
    if (!f) throw WayyException("Corrupt or truncated file: " + path);
}

}  // namespace

// Creates an empty column. offsets_ always holds one more entry than there
// are rows: row i occupies data_[offsets_[i], offsets_[i + 1]).
StringColumn::StringColumn(std::string name) : name_(std::move(name)) {
    offsets_.push_back(0);  // Initial offset
}

// Returns a view of the string stored at `row`.
// A null row (validity bit cleared) yields an empty view.
// Throws InvalidOperation if `row` is out of range.
// The view points into this column's buffer, so any later mutation of the
// column may invalidate it.
std::string_view StringColumn::get(size_t row) const {
    if (row >= size()) {
        throw InvalidOperation("StringColumn row out of range");
    }
    if (has_validity_ && !is_valid(row)) {
        return {};  // Null row returns empty view
    }
    const int64_t start = offsets_[row];
    const int64_t end = offsets_[row + 1];
    return std::string_view(reinterpret_cast<const char*>(data_.data() + start),
                            static_cast<size_t>(end - start));
}

// Appends `val` as a new, valid row.
// NOTE(review): `val` is inserted straight into data_, so it must not be a
// view into this column's own buffer (e.g. the result of get()) -- confirm
// that no caller does this.
void StringColumn::append(std::string_view val) {
    const int64_t offset = offsets_.back();
    data_.insert(data_.end(), val.begin(), val.end());
    offsets_.push_back(offset + static_cast<int64_t>(val.size()));

    if (has_validity_) {
        grow_bitmap(validity_, size());
        set_valid(size() - 1, true);
    }
}

// Appends a null row: a zero-length entry whose validity bit is cleared.
// Creates the validity bitmap on first use.
void StringColumn::append_null() {
    offsets_.push_back(offsets_.back());  // Zero-length entry

    if (has_validity_) {
        // ensure_validity() is a no-op once the bitmap exists, so the bitmap
        // has to be grown here; otherwise set_valid() would index past its
        // end whenever the new row starts a fresh byte.
        grow_bitmap(validity_, size());
    } else {
        ensure_validity();
    }
    set_valid(size() - 1, false);
}

// Replaces the string stored at `row` with `val` and marks the row valid when
// a validity bitmap is present.
// Throws InvalidOperation if `row` is out of range.
//
// Consecutive rows share their boundary in offsets_ (the end of row i is the
// start of row i + 1), so a row cannot change length or be relocated without
// touching its neighbours. The replacement is therefore spliced into data_
// and every later offset is shifted by the length difference. This is O(n)
// in the amount of data after `row`, but leaves all other rows intact and
// the buffer free of dead space.
void StringColumn::set(size_t row, std::string_view val) {
    if (row >= size()) {
        throw InvalidOperation("StringColumn row out of range in set");
    }

    const int64_t old_start = offsets_[row];
    const int64_t old_end = offsets_[row + 1];
    const int64_t old_len = old_end - old_start;
    const int64_t new_len = static_cast<int64_t>(val.size());
    const int64_t delta = new_len - old_len;

    // Copy first: `val` may be a view into data_ itself (e.g. set(i, get(j))),
    // and the splice below moves or reallocates that storage.
    const std::string replacement(val);

    if (delta > 0) {
        // Growing: reuse the old slot for the head, insert the tail after it.
        std::copy(replacement.begin(), replacement.begin() + old_len,
                  data_.begin() + old_start);
        data_.insert(data_.begin() + old_end,
                     replacement.begin() + old_len, replacement.end());
    } else {
        // Same size or shrinking: overwrite, then drop the unused remainder.
        std::copy(replacement.begin(), replacement.end(),
                  data_.begin() + old_start);
        data_.erase(data_.begin() + old_start + new_len,
                    data_.begin() + old_end);
    }

    if (delta != 0) {
        for (size_t i = row + 1; i < offsets_.size(); ++i) {
            offsets_[i] += delta;
        }
    }

    if (has_validity_) {
        set_valid(row, true);
    }
}

// --- Validity bitmap ---

// Creates the validity bitmap if it does not exist yet, marking every current
// row as valid. Unused high bits of the last byte are left cleared so that
// count_valid() can popcount whole bytes.
void StringColumn::ensure_validity() {
    if (has_validity_) return;

    const size_t n = size();
    const size_t num_bytes = (n + 7) / 8;
    validity_.assign(num_bytes, 0xFF);
    if (n % 8 != 0) {
        const uint8_t mask = static_cast<uint8_t>((1u << (n % 8)) - 1);
        validity_.back() = mask;
    }
    has_validity_ = true;
}

// Returns whether `row` holds a value. Without a bitmap every row is valid;
// with a bitmap, an out-of-range row is reported as not valid.
bool StringColumn::is_valid(size_t row) const {
    if (!has_validity_) return true;
    if (row >= size()) return false;
    return (validity_[row / 8] >> (row % 8)) & 1;
}

// Sets or clears the validity bit of `row`, creating the bitmap if needed.
// Out-of-range rows are ignored.
void StringColumn::set_valid(size_t row, bool valid) {
    if (!has_validity_) ensure_validity();
    if (row >= size()) return;

    if (valid) {
        validity_[row / 8] |= (1u << (row % 8));
    } else {
        validity_[row / 8] &= ~(1u << (row % 8));
    }
}

// Returns the number of valid (non-null) rows.
size_t StringColumn::count_valid() const {
    if (!has_validity_) return size();

    size_t count = 0;
    for (auto byte : validity_) {
        count += std::popcount(byte);
    }
    return count;
}

// --- Persistence ---
// Files: <dir_path>/<col_name>.offsets, .data, .validity
// Each file is a uint64 prefix followed by the raw array. The prefix is the
// element count for .offsets and the byte count for .data and .validity.

// Writes the column to `dir_path`, creating the directory if necessary.
// Throws WayyException if a file cannot be created or written. When the
// column has no validity bitmap, a .validity file left over from an earlier
// save is removed so that load() does not apply it to this data.
void StringColumn::save(const std::string& dir_path,
                        const std::string& col_name) const {
    fs::create_directories(dir_path);

    const std::string base = dir_path + "/" + col_name;

    write_prefixed_file(base + ".offsets", "offsets", offsets_.size(),
                        offsets_.data(), offsets_.size() * sizeof(int64_t));

    write_prefixed_file(base + ".data", "data", data_.size(), data_.data(),
                        data_.size());

    const std::string validity_path = base + ".validity";
    if (has_validity_) {
        write_prefixed_file(validity_path, "validity", validity_.size(),
                            validity_.data(), validity_.size());
    } else {
        std::error_code ec;
        fs::remove(validity_path, ec);  // no error if the file is absent
        if (ec) {
            throw WayyException("Failed to remove stale validity file: " +
                                validity_path);
        }
    }
}

// Reads a column previously written by save().
// Throws WayyException if a required file is missing, unreadable or
// truncated, or if the contents are inconsistent (no offsets, offsets that
// decrease or point outside the data buffer, validity bitmap too small for
// the row count). The .validity file is optional; when absent, all rows are
// treated as valid.
StringColumn StringColumn::load(const std::string& dir_path,
                                const std::string& col_name) {
    StringColumn sc(col_name);
    sc.offsets_.clear();

    const std::string base = dir_path + "/" + col_name;

    // Read offsets
    {
        const std::string path = base + ".offsets";
        std::ifstream f(path, std::ios::binary);
        if (!f) throw WayyException("Failed to open offsets file: " + path);

        const uint64_t count = read_length_prefix(f, path, sizeof(int64_t));
        if (count == 0) {
            // A valid column always carries the initial 0 offset.
            throw WayyException("Corrupt offsets file (no entries): " + path);
        }
        sc.offsets_.resize(static_cast<size_t>(count));
        read_exact(f, sc.offsets_.data(), count * sizeof(int64_t), path);
    }

    // Read data
    {
        const std::string path = base + ".data";
        std::ifstream f(path, std::ios::binary);
        if (!f) throw WayyException("Failed to open data file: " + path);

        const uint64_t sz = read_length_prefix(f, path, 1);
        sc.data_.resize(static_cast<size_t>(sz));
        read_exact(f, sc.data_.data(), sz, path);
    }

    // Every row must describe a non-negative range inside the data buffer,
    // otherwise get() would read out of bounds.
    {
        const int64_t data_size = static_cast<int64_t>(sc.data_.size());
        int64_t previous = 0;
        for (const int64_t offset : sc.offsets_) {
            if (offset < previous || offset > data_size) {
                throw WayyException("Corrupt offsets for column: " + base);
            }
            previous = offset;
        }
    }

    // Read validity if present
    {
        const std::string path = base + ".validity";
        if (fs::exists(path)) {
            std::ifstream f(path, std::ios::binary);
            if (!f) throw WayyException("Failed to open validity file: " + path);

            const uint64_t sz = read_length_prefix(f, path, 1);
            if (sz < (sc.size() + 7) / 8) {
                throw WayyException("Validity file too small for row count: " +
                                    path);
            }
            sc.validity_.resize(static_cast<size_t>(sz));
            read_exact(f, sc.validity_.data(), sz, path);
            sc.has_validity_ = true;
        }
    }

    return sc;
}

// Copies every row into an owning std::string; null rows become empty
// strings.
std::vector<std::string> StringColumn::to_vector() const {
    std::vector<std::string> result;
    result.reserve(size());
    for (size_t i = 0; i < size(); ++i) {
        if (is_valid(i)) {
            result.emplace_back(get(i));
        } else {
            result.emplace_back();
        }
    }
    return result;
}

}  // namespace wayy_db