B+ Tree — Hands-On Tasks¶
Audience: Engineers who have read
junior.mdthroughprofessional.mdand want code under their fingers. Each task ships with a reference solution in Go, Java, and Python, test cases, complexity notes, and lessons learned.
The B+ tree is the structure you most need to type out to truly internalize. Reading about leaf chains is easy. Writing one — and getting the sibling pointers right after a split that propagates three levels up — is what makes the structure click. Work through these tasks in order; each builds on the last.
Tasks¶
- In-memory B+ tree with search / insert / delete
- Leaf-chain pointers and range scan
- Lehman-Yao right-links for concurrent descent
- Bulk-load from a sorted stream in O(n)
- Prefix compression on leaf keys
- Disk-backed B+ tree with a fixed page size
- Copy-on-write B+ tree with snapshot reads
- MVCC index returning row versions per timestamp
- Order-statistic B+ tree with subtree counts
- Benchmark B+ tree vs classic B-tree
Task 1: In-Memory B+ Tree (search / insert / delete)¶
Problem¶
Build a B+ tree of order t over (int key, string value) pairs. Support search(k), insert(k, v), and delete(k). Both leaves and internals must respect the t <= keys <= 2t rule (root excepted). Use the two structural rules from junior.md: data lives only at leaves; leaves form a sorted linked list.
Hints¶
- Two node types:
LeafandInternal. Keep them as distinct structs to avoidisLeafbranches everywhere. - Internal node has
len(children) == len(keys) + 1. Leaf haslen(values) == len(keys). - After a leaf split, the separator copied up is the first key of the new right leaf. (Contrast with classic B-tree where the median is moved up.)
- For delete, prefer borrowing from a sibling before merging; merging propagates upward.
Reference solution — Go¶
package bplustree
const T = 3 // minimum number of keys per non-root node
type Node interface {
isLeaf() bool
}
type Leaf struct {
keys []int
vals []string
next *Leaf
}
func (*Leaf) isLeaf() bool { return true }
type Internal struct {
keys []int
children []Node
}
func (*Internal) isLeaf() bool { return false }
type Tree struct {
root Node
}
func New() *Tree { return &Tree{root: &Leaf{}} }
func (t *Tree) Search(k int) (string, bool) {
n := t.root
for !n.isLeaf() {
in := n.(*Internal)
i := upperBound(in.keys, k)
n = in.children[i]
}
lf := n.(*Leaf)
i := lowerBound(lf.keys, k)
if i < len(lf.keys) && lf.keys[i] == k {
return lf.vals[i], true
}
return "", false
}
func (t *Tree) Insert(k int, v string) {
sepKey, newChild, split := insert(t.root, k, v)
if split {
t.root = &Internal{
keys: []int{sepKey},
children: []Node{t.root, newChild},
}
}
}
func insert(n Node, k int, v string) (int, Node, bool) {
if n.isLeaf() {
lf := n.(*Leaf)
i := lowerBound(lf.keys, k)
if i < len(lf.keys) && lf.keys[i] == k {
lf.vals[i] = v
return 0, nil, false
}
lf.keys = insertAt(lf.keys, i, k)
lf.vals = insertStrAt(lf.vals, i, v)
if len(lf.keys) <= 2*T {
return 0, nil, false
}
mid := len(lf.keys) / 2
right := &Leaf{
keys: append([]int{}, lf.keys[mid:]...),
vals: append([]string{}, lf.vals[mid:]...),
next: lf.next,
}
lf.keys = lf.keys[:mid]
lf.vals = lf.vals[:mid]
lf.next = right
return right.keys[0], right, true
}
in := n.(*Internal)
i := upperBound(in.keys, k)
sep, child, split := insert(in.children[i], k, v)
if !split {
return 0, nil, false
}
in.keys = insertAt(in.keys, i, sep)
in.children = insertChildAt(in.children, i+1, child)
if len(in.keys) <= 2*T {
return 0, nil, false
}
mid := len(in.keys) / 2
sepUp := in.keys[mid]
right := &Internal{
keys: append([]int{}, in.keys[mid+1:]...),
children: append([]Node{}, in.children[mid+1:]...),
}
in.keys = in.keys[:mid]
in.children = in.children[:mid+1]
return sepUp, right, true
}
// Delete, borrow, merge, lowerBound, upperBound, and insertAt helpers
// follow classic B-tree mechanics but mind the leaf-chain pointer:
// on a leaf merge, splice the merged leaf's `next` into the predecessor's `next`.
Reference solution — Java¶
public final class BPlusTree {
static final int T = 3;
abstract static class Node { abstract boolean isLeaf(); }
static final class Leaf extends Node {
int[] keys = new int[2 * T + 1];
String[] vals = new String[2 * T + 1];
int n;
Leaf next;
boolean isLeaf() { return true; }
}
static final class Internal extends Node {
int[] keys = new int[2 * T + 1];
Node[] children = new Node[2 * T + 2];
int n;
boolean isLeaf() { return false; }
}
Node root = new Leaf();
String search(int k) {
Node n = root;
while (!n.isLeaf()) {
Internal in = (Internal) n;
int i = upperBound(in.keys, in.n, k);
n = in.children[i];
}
Leaf lf = (Leaf) n;
int i = lowerBound(lf.keys, lf.n, k);
return (i < lf.n && lf.keys[i] == k) ? lf.vals[i] : null;
}
void insert(int k, String v) {
Object[] r = insertRec(root, k, v);
if (r != null) {
Internal nr = new Internal();
nr.keys[0] = (int) r[0]; nr.n = 1;
nr.children[0] = root;
nr.children[1] = (Node) r[1];
root = nr;
}
}
// insertRec returns null on no split, {sepKey, newNode} on split.
}
Reference solution — Python¶
T = 3
class Leaf:
__slots__ = ("keys", "vals", "next")
def __init__(self):
self.keys = []
self.vals = []
self.next = None
class Internal:
__slots__ = ("keys", "children")
def __init__(self):
self.keys = []
self.children = []
class BPlusTree:
def __init__(self):
self.root = Leaf()
def search(self, k):
n = self.root
while isinstance(n, Internal):
i = next((j for j, x in enumerate(n.keys) if k < x), len(n.keys))
n = n.children[i]
for j, x in enumerate(n.keys):
if x == k:
return n.vals[j]
return None
def insert(self, k, v):
split = self._insert(self.root, k, v)
if split:
sep, right = split
new_root = Internal()
new_root.keys = [sep]
new_root.children = [self.root, right]
self.root = new_root
def _insert(self, n, k, v):
if isinstance(n, Leaf):
i = 0
while i < len(n.keys) and n.keys[i] < k:
i += 1
if i < len(n.keys) and n.keys[i] == k:
n.vals[i] = v
return None
n.keys.insert(i, k)
n.vals.insert(i, v)
if len(n.keys) <= 2 * T:
return None
mid = len(n.keys) // 2
right = Leaf()
right.keys = n.keys[mid:]
right.vals = n.vals[mid:]
right.next = n.next
n.keys = n.keys[:mid]
n.vals = n.vals[:mid]
n.next = right
return right.keys[0], right
i = 0
while i < len(n.keys) and n.keys[i] <= k:
i += 1
split = self._insert(n.children[i], k, v)
if not split:
return None
sep, child = split
n.keys.insert(i, sep)
n.children.insert(i + 1, child)
if len(n.keys) <= 2 * T:
return None
mid = len(n.keys) // 2
sep_up = n.keys[mid]
right = Internal()
right.keys = n.keys[mid + 1:]
right.children = n.children[mid + 1:]
n.keys = n.keys[:mid]
n.children = n.children[:mid + 1]
return sep_up, right
Test cases¶
- Insert
1..100in order; assertsearch(k) == str(k)for every k. - Insert
100..1reversed; assert in-order traversal yields1..100. - Insert random 10,000 integers, then delete half; assert remaining search succeeds.
- Insert duplicate key with different value; assert the value is overwritten.
Complexity¶
search:O(log_t n).insert,delete:O(log_t n)amortized; the split / merge cost is paid by the height term.
Lessons¶
- The separator copied up on a leaf split is the first key of the new right leaf, not the median.
- Always update
lf.nextbefore truncatinglf.keys; reversing the order silently corrupts the chain. - Don't use a single
Nodestruct with anisLeafboolean — the type confusion bugs cost hours. Two structs, one interface.
Task 2: Leaf Chain Range Scan¶
Problem¶
Extend Task 1 with rangeScan(lo, hi) []int that returns every key k such that lo <= k <= hi, in order. The descent cost must be O(log_t n) and the per-result cost O(1) (amortized over leaf I/O).
Hints¶
- Descend to the leaf containing
loexactly assearch(lo)does. - Once at the leaf, linearly scan forward, following
leaf.next, accumulating keys until you passhi. - Do not descend the tree more than once.
Reference solution — Go¶
func (t *Tree) RangeScan(lo, hi int) []int {
n := t.root
for !n.isLeaf() {
in := n.(*Internal)
i := upperBound(in.keys, lo)
n = in.children[i]
}
var out []int
lf := n.(*Leaf)
i := lowerBound(lf.keys, lo)
for lf != nil {
for ; i < len(lf.keys); i++ {
if lf.keys[i] > hi {
return out
}
out = append(out, lf.keys[i])
}
lf = lf.next
i = 0
}
return out
}
Reference solution — Java¶
List<Integer> rangeScan(int lo, int hi) {
Node n = root;
while (!n.isLeaf()) {
Internal in = (Internal) n;
int idx = upperBound(in.keys, in.n, lo);
n = in.children[idx];
}
List<Integer> out = new ArrayList<>();
Leaf lf = (Leaf) n;
int i = lowerBound(lf.keys, lf.n, lo);
while (lf != null) {
for (; i < lf.n; i++) {
if (lf.keys[i] > hi) return out;
out.add(lf.keys[i]);
}
lf = lf.next;
i = 0;
}
return out;
}
Reference solution — Python¶
def range_scan(self, lo, hi):
n = self.root
while isinstance(n, Internal):
i = next((j for j, x in enumerate(n.keys) if lo < x), len(n.keys))
n = n.children[i]
out = []
while n is not None:
for k in n.keys:
if k < lo:
continue
if k > hi:
return out
out.append(k)
n = n.next
return out
Test cases¶
- After inserting
1..1000,rangeScan(250, 750)returns[250..750]. - Empty range:
rangeScan(2000, 3000)returns[]. - Range that hits exactly one leaf: returns the slice inside that leaf only.
Complexity¶
O(log_t n + k / b)wherekis the answer size andbthe leaf capacity.
Lessons¶
- The point of the linked leaf chain is one root-to-leaf descent regardless of how big the range is.
- Forgetting to reset
i = 0when crossing leaves is the classic off-by-one for first-time implementers.
Task 3: Lehman-Yao Right-Links¶
Problem¶
Add right-link pointers at every level (not just leaves) and a highKey per node. Implement a reader that tolerates a concurrent split: if it arrives at a node and key > node.highKey, it follows rightLink instead of restarting from root.
Hints¶
- Right-links go in both
LeafandInternal. - The
highKeyis the maximum key in the subtree the node is currently responsible for; when a split moves keys to a new right sibling, the old node'shighKeybecomes the separator. - For this task, simulate the race: in test, kick off a goroutine / thread that does inserts while another reads.
Reference solution — Go¶
type Internal struct {
keys []int
children []Node
highKey int
rightLink *Internal
}
type Leaf struct {
keys []int
vals []string
highKey int
next *Leaf // also serves as rightLink
}
func (t *Tree) ConcurrentSearch(k int) (string, bool) {
n := t.root
for !n.isLeaf() {
in := n.(*Internal)
for k > in.highKey && in.rightLink != nil {
in = in.rightLink
}
i := upperBound(in.keys, k)
n = in.children[i]
}
lf := n.(*Leaf)
for k > lf.highKey && lf.next != nil {
lf = lf.next
}
i := lowerBound(lf.keys, k)
if i < len(lf.keys) && lf.keys[i] == k {
return lf.vals[i], true
}
return "", false
}
Reference solution — Java¶
String concurrentSearch(int k) {
Node n = root;
while (!n.isLeaf()) {
Internal in = (Internal) n;
while (k > in.highKey && in.rightLink != null) in = in.rightLink;
int i = upperBound(in.keys, in.n, k);
n = in.children[i];
}
Leaf lf = (Leaf) n;
while (k > lf.highKey && lf.next != null) lf = lf.next;
int i = lowerBound(lf.keys, lf.n, k);
return (i < lf.n && lf.keys[i] == k) ? lf.vals[i] : null;
}
Reference solution — Python¶
def concurrent_search(self, k):
n = self.root
while isinstance(n, Internal):
while k > n.high_key and n.right_link is not None:
n = n.right_link
i = next((j for j, x in enumerate(n.keys) if k < x), len(n.keys))
n = n.children[i]
while k > n.high_key and n.next is not None:
n = n.next
for j, x in enumerate(n.keys):
if x == k:
return n.vals[j]
return None
Test cases¶
- Single-threaded: identical to Task 1.
- Concurrent: spawn 100 inserters and 100 readers on the same tree; assert no reader misses a key it should have seen.
- Force the race by inserting in batches that always cause a split.
Complexity¶
- Same as Task 1; the extra right-link follows are amortized away because splits are rare and the link chain length is bounded by 2.
Lessons¶
- The right-link must be set before the new sibling is published into the parent. Otherwise readers can be stranded.
- High key is the max key the node is responsible for, not the max key it currently stores — they coincide except during a split's brief window.
Task 4: Bulk Load from a Sorted Stream in O(n)¶
Problem¶
Given a presorted stream of (k, v) pairs, build a B+ tree without going through n single-key inserts. The build should run in O(n) total work and produce leaves that are filled to a target ratio (e.g. 75%).
Hints¶
- Stream the input into leaves of capacity
2t; each time a leaf fills, emit a separator and the leaf pointer to a queue for the level above. - After streaming the bottom level, the parent level is built the same way from the queue, recursively, until one node remains — that's the root.
- Link leaves as you emit them.
Reference solution — Go¶
func BulkLoad(pairs []struct{ K int; V string }, fillRatio float64) *Tree {
leafCap := int(float64(2*T) * fillRatio)
if leafCap < T { leafCap = T }
var leaves []*Leaf
cur := &Leaf{}
for _, p := range pairs {
if len(cur.keys) == leafCap {
leaves = append(leaves, cur)
nxt := &Leaf{}
cur.next = nxt
cur = nxt
}
cur.keys = append(cur.keys, p.K)
cur.vals = append(cur.vals, p.V)
}
if len(cur.keys) > 0 {
leaves = append(leaves, cur)
}
if len(leaves) == 1 {
return &Tree{root: leaves[0]}
}
var level []Node
for _, lf := range leaves { level = append(level, lf) }
seps := make([]int, 0, len(leaves)-1)
for i := 1; i < len(leaves); i++ {
seps = append(seps, leaves[i].keys[0])
}
for len(level) > 1 {
nextLevel := []Node{}
nextSeps := []int{}
intCap := 2 * T
i := 0
for i < len(level) {
in := &Internal{}
end := i + intCap + 1
if end > len(level) { end = len(level) }
in.children = level[i:end]
if i > 0 {
nextSeps = append(nextSeps, seps[i-1])
}
for j := i; j < end-1; j++ {
in.keys = append(in.keys, seps[j])
}
nextLevel = append(nextLevel, in)
i = end
}
level = nextLevel
seps = nextSeps
}
return &Tree{root: level[0]}
}
Reference solution — Java¶
static BPlusTree bulkLoad(int[] ks, String[] vs, double fillRatio) {
int cap = Math.max(T, (int)(2 * T * fillRatio));
List<Leaf> leaves = new ArrayList<>();
Leaf cur = new Leaf();
for (int i = 0; i < ks.length; i++) {
if (cur.n == cap) {
leaves.add(cur);
Leaf nx = new Leaf();
cur.next = nx;
cur = nx;
}
cur.keys[cur.n] = ks[i];
cur.vals[cur.n] = vs[i];
cur.n++;
}
if (cur.n > 0) leaves.add(cur);
BPlusTree t = new BPlusTree();
if (leaves.size() == 1) { t.root = leaves.get(0); return t; }
// Build parent levels bottom-up; mirrors the Go version.
return t;
}
Reference solution — Python¶
def bulk_load(pairs, fill_ratio=0.75):
cap = max(T, int(2 * T * fill_ratio))
leaves = []
cur = Leaf()
for k, v in pairs:
if len(cur.keys) == cap:
leaves.append(cur)
nxt = Leaf()
cur.next = nxt
cur = nxt
cur.keys.append(k)
cur.vals.append(v)
if cur.keys:
leaves.append(cur)
t = BPlusTree()
if len(leaves) == 1:
t.root = leaves[0]
return t
level = list(leaves)
seps = [leaves[i].keys[0] for i in range(1, len(leaves))]
while len(level) > 1:
nxt_level, nxt_seps = [], []
cap_i = 2 * T
i = 0
while i < len(level):
node = Internal()
end = min(i + cap_i + 1, len(level))
node.children = level[i:end]
if i > 0:
nxt_seps.append(seps[i - 1])
node.keys = seps[i:end - 1]
nxt_level.append(node)
i = end
level, seps = nxt_level, nxt_seps
t.root = level[0]
return t
Test cases¶
- Bulk-load
1..1,000,000with 75% fill; check height isceil(log_t(n))plus 1 at most. - Compare against single-insert load: bulk should be 5–30x faster.
- Verify all keys are searchable and leaf chain walks the full range.
Complexity¶
O(n)total; each element is touched a constant number of times across all levels.
Lessons¶
- Bulk load is how
CREATE INDEXworks in real databases. The query planner sorts the table once, then streams. - A common bug: forgetting to chain the last leaf's
next = None.
Task 5: Prefix Compression on Leaf Keys¶
Problem¶
Extend the leaf format to store a shared prefix once. Each record becomes (suffix_bytes). Decompress on read. Measure how much space you save for sorted timestamp keys.
Hints¶
- Compute the common prefix of the leaf's first and last key.
- Store the prefix length and bytes at the leaf header.
- Each entry stores only the suffix; reconstruct the full key for comparison during search.
Reference solution — Go¶
type CompressedLeaf struct {
prefix []byte
suffixes [][]byte
vals []string
next *CompressedLeaf
}
func (lf *CompressedLeaf) full(i int) []byte {
b := make([]byte, 0, len(lf.prefix)+len(lf.suffixes[i]))
b = append(b, lf.prefix...)
b = append(b, lf.suffixes[i]...)
return b
}
func commonPrefix(a, b []byte) int {
n := len(a)
if len(b) < n { n = len(b) }
for i := 0; i < n; i++ {
if a[i] != b[i] { return i }
}
return n
}
func compressLeaf(keys [][]byte, vals []string) *CompressedLeaf {
if len(keys) == 0 { return &CompressedLeaf{} }
p := commonPrefix(keys[0], keys[len(keys)-1])
out := &CompressedLeaf{prefix: keys[0][:p], vals: vals}
for _, k := range keys {
out.suffixes = append(out.suffixes, append([]byte{}, k[p:]...))
}
return out
}
Reference solution — Java¶
final class CompressedLeaf {
byte[] prefix;
List<byte[]> suffixes = new ArrayList<>();
List<String> vals = new ArrayList<>();
CompressedLeaf next;
byte[] fullKey(int i) {
byte[] s = suffixes.get(i);
byte[] out = new byte[prefix.length + s.length];
System.arraycopy(prefix, 0, out, 0, prefix.length);
System.arraycopy(s, 0, out, prefix.length, s.length);
return out;
}
}
Reference solution — Python¶
class CompressedLeaf:
__slots__ = ("prefix", "suffixes", "vals", "next")
def __init__(self):
self.prefix = b""
self.suffixes = []
self.vals = []
self.next = None
def full(self, i):
return self.prefix + self.suffixes[i]
def common_prefix(a, b):
n = min(len(a), len(b))
for i in range(n):
if a[i] != b[i]:
return i
return n
def compress_leaf(keys, vals):
lf = CompressedLeaf()
if not keys:
return lf
p = common_prefix(keys[0], keys[-1])
lf.prefix = keys[0][:p]
lf.suffixes = [k[p:] for k in keys]
lf.vals = list(vals)
return lf
Test cases¶
- Compress
[b"2026-05-01T00:00:00", b"2026-05-01T00:01:00", ..., b"2026-05-01T23:59:00"]; verify the prefix isb"2026-05-01T"and suffixes are 8 bytes. - After compression, reconstructed
full(i)equals the original key. - Compute byte savings: 1440 keys at 19 bytes = 27,360 bytes uncompressed; with prefix
2026-05-01T(11 bytes) and 8-byte suffixes: 11 + 1440 * 8 = 11,531 bytes. About 58% savings.
Complexity¶
- Compression:
O(b * k_avg)per leaf. - Lookup: same
O(log b)after reconstruction (or compare suffix directly if you know the prefix matches).
Lessons¶
- Prefix compression is the most universal page-format optimization; almost free for sorted strings.
- For lookup, you do not need to materialize the full key — you can compare prefix once, then binary-search by suffix.
Task 6: Disk-Backed B+ Tree with Fixed Page Size¶
Problem¶
Build a B+ tree on top of a binary file with 4 KB pages. Each page is either a LeafPage or InternalPage. Node pointers become 4-byte page IDs. Implement a tiny LRU page cache.
Hints¶
- Allocate pages with a free-list (first unused ID, grow file as needed).
- Page header:
pageType(1 byte),nKeys(2 bytes),rightLink(4 bytes),nextLeaf(4 bytes only for leaves). - Use
binary.LittleEndian.PutUint32/ByteBuffer.putInt/struct.packto write fixed-width fields.
Reference solution — Go (skeleton)¶
const PageSize = 4096
const (
PageLeaf byte = 1
PageInternal byte = 2
)
type Pager struct {
f *os.File
next uint32
lru map[uint32]*Page
}
type Page struct {
id uint32
dirty bool
data [PageSize]byte
}
func (p *Pager) Read(id uint32) (*Page, error) {
if pg, ok := p.lru[id]; ok { return pg, nil }
pg := &Page{id: id}
_, err := p.f.ReadAt(pg.data[:], int64(id)*PageSize)
if err != nil { return nil, err }
p.lru[id] = pg
return pg, nil
}
func (p *Pager) Write(pg *Page) error {
_, err := p.f.WriteAt(pg.data[:], int64(pg.id)*PageSize)
pg.dirty = false
return err
}
// Leaf encoding: [type:1][n:2][nextLeaf:4][ keys[]:4 * n ][ valOffs[]:2 * n ][ vals[] ]
// Fits ~340 int4 keys + int4 row pointers per 4 KB leaf depending on value size.
Reference solution — Java (skeleton)¶
final class Pager {
static final int PAGE_SIZE = 4096;
final RandomAccessFile file;
final Map<Integer, ByteBuffer> cache = new LinkedHashMap<>(64, 0.75f, true);
int nextPage = 1;
Pager(File f) throws IOException { this.file = new RandomAccessFile(f, "rw"); }
ByteBuffer read(int id) throws IOException {
ByteBuffer b = cache.get(id);
if (b != null) return b;
b = ByteBuffer.allocate(PAGE_SIZE).order(ByteOrder.LITTLE_ENDIAN);
file.seek((long) id * PAGE_SIZE);
file.readFully(b.array());
cache.put(id, b);
return b;
}
}
Reference solution — Python (skeleton)¶
import struct, os
PAGE_SIZE = 4096
LEAF, INTERNAL = 1, 2
class Pager:
def __init__(self, path):
self.f = open(path, "r+b" if os.path.exists(path) else "w+b")
self.next_id = 1
self.cache = {}
def read(self, pid):
if pid in self.cache:
return self.cache[pid]
self.f.seek(pid * PAGE_SIZE)
data = bytearray(self.f.read(PAGE_SIZE))
self.cache[pid] = data
return data
def write(self, pid, data):
self.f.seek(pid * PAGE_SIZE)
self.f.write(bytes(data))
def encode_leaf(keys, vals, next_leaf):
buf = bytearray(PAGE_SIZE)
buf[0] = LEAF
struct.pack_into("<H", buf, 1, len(keys))
struct.pack_into("<I", buf, 3, next_leaf)
off = 7
for k in keys:
struct.pack_into("<i", buf, off, k); off += 4
return bytes(buf)
Test cases¶
- Insert
1..100_000integers; close the file; reopen; search every key. - Force a leaf split mid-file: verify the new page has a new ID and the parent's child pointer is updated and fsynced before the old page is freed.
- Crash test: kill the process after a partial write, reopen, run
verify(); assert it detects the inconsistency or recovers via WAL (extension).
Complexity¶
- Same asymptotic as in-memory, but each node touch is an I/O cost — make the cache count.
Lessons¶
- Fixed page size is the constraint that defines real databases. Your fanout is
floor((PageSize - header) / (key_size + child_pointer_size)). - Always write the new page first, then update the parent. Reversing order corrupts on crash.
Task 7: Copy-on-Write B+ Tree¶
Problem¶
Modify Task 1 so every update produces a new version of each touched node — original nodes are never mutated. Maintain a list of root pointers, one per "snapshot." A reader uses the root at its snapshot timestamp and is immune to concurrent writes.
Hints¶
- Insert recursively returns either the same node (if no changes) or a new copy.
- The path from root to the mutated leaf gets fully copied; siblings are shared.
- For real systems (LMDB, bbolt), this is the whole concurrency story — no locks, no MVCC bookkeeping.
Reference solution — Go¶
type CowTree struct {
roots []Node // roots[i] is the root at snapshot i
}
func cowInsert(n Node, k int, v string) (Node, int, Node, bool) {
if n.isLeaf() {
old := n.(*Leaf)
nl := &Leaf{
keys: append([]int{}, old.keys...),
vals: append([]string{}, old.vals...),
next: old.next,
}
i := lowerBound(nl.keys, k)
if i < len(nl.keys) && nl.keys[i] == k {
nl.vals[i] = v
} else {
nl.keys = insertAt(nl.keys, i, k)
nl.vals = insertStrAt(nl.vals, i, v)
}
if len(nl.keys) <= 2*T {
return nl, 0, nil, false
}
mid := len(nl.keys) / 2
right := &Leaf{
keys: append([]int{}, nl.keys[mid:]...),
vals: append([]string{}, nl.vals[mid:]...),
next: nl.next,
}
nl.keys = nl.keys[:mid]
nl.vals = nl.vals[:mid]
nl.next = right
return nl, right.keys[0], right, true
}
old := n.(*Internal)
ni := &Internal{
keys: append([]int{}, old.keys...),
children: append([]Node{}, old.children...),
}
i := upperBound(ni.keys, k)
newChild, sep, sib, split := cowInsert(ni.children[i], k, v)
ni.children[i] = newChild
if !split {
return ni, 0, nil, false
}
ni.keys = insertAt(ni.keys, i, sep)
ni.children = insertChildAt(ni.children, i+1, sib)
if len(ni.keys) <= 2*T {
return ni, 0, nil, false
}
mid := len(ni.keys) / 2
sepUp := ni.keys[mid]
right := &Internal{
keys: append([]int{}, ni.keys[mid+1:]...),
children: append([]Node{}, ni.children[mid+1:]...),
}
ni.keys = ni.keys[:mid]
ni.children = ni.children[:mid+1]
return ni, sepUp, right, true
}
func (t *CowTree) Insert(k int, v string) int {
root := t.roots[len(t.roots)-1]
newRoot, sep, sib, split := cowInsert(root, k, v)
if split {
newRoot = &Internal{keys: []int{sep}, children: []Node{newRoot, sib}}
}
t.roots = append(t.roots, newRoot)
return len(t.roots) - 1
}
Reference solution — Java (sketch)¶
final class CowTree {
List<Node> roots = new ArrayList<>();
int insert(int k, String v) {
Node curr = roots.get(roots.size() - 1);
Object[] r = cowInsert(curr, k, v);
Node newRoot = (Node) r[0];
if (r[2] != null) {
Internal in = new Internal();
in.keys[0] = (int) r[1]; in.n = 1;
in.children[0] = newRoot;
in.children[1] = (Node) r[2];
newRoot = in;
}
roots.add(newRoot);
return roots.size() - 1;
}
// cowInsert returns {newNode, sepKey, newSibling-or-null}.
}
Reference solution — Python¶
class CowTree:
def __init__(self):
self.roots = [Leaf()]
def insert(self, k, v):
new_root, split = self._cow(self.roots[-1], k, v)
if split:
sep, sib = split
r = Internal()
r.keys = [sep]
r.children = [new_root, sib]
new_root = r
self.roots.append(new_root)
return len(self.roots) - 1
def _cow(self, n, k, v):
if isinstance(n, Leaf):
nl = Leaf()
nl.keys = list(n.keys); nl.vals = list(n.vals); nl.next = n.next
i = 0
while i < len(nl.keys) and nl.keys[i] < k:
i += 1
if i < len(nl.keys) and nl.keys[i] == k:
nl.vals[i] = v
else:
nl.keys.insert(i, k); nl.vals.insert(i, v)
if len(nl.keys) <= 2 * T:
return nl, None
mid = len(nl.keys) // 2
right = Leaf()
right.keys = nl.keys[mid:]; right.vals = nl.vals[mid:]; right.next = nl.next
nl.keys = nl.keys[:mid]; nl.vals = nl.vals[:mid]; nl.next = right
return nl, (right.keys[0], right)
ni = Internal()
ni.keys = list(n.keys); ni.children = list(n.children)
i = 0
while i < len(ni.keys) and ni.keys[i] <= k:
i += 1
child, split = self._cow(ni.children[i], k, v)
ni.children[i] = child
if not split:
return ni, None
sep, sib = split
ni.keys.insert(i, sep); ni.children.insert(i + 1, sib)
if len(ni.keys) <= 2 * T:
return ni, None
mid = len(ni.keys) // 2
sep_up = ni.keys[mid]
right = Internal()
right.keys = ni.keys[mid + 1:]; right.children = ni.children[mid + 1:]
ni.keys = ni.keys[:mid]; ni.children = ni.children[:mid + 1]
return ni, (sep_up, right)
Test cases¶
- Insert
1, 2, 3; snapshot 1 sees only1; snapshot 2 sees1, 2; snapshot 3 sees1, 2, 3. - After 1000 inserts, snapshot 500 still has exactly 500 keys.
- Garbage-collect snapshots earlier than the oldest reader; assert memory comes back.
Complexity¶
O(log_t n)extra memory per insert (the copied path).- Each snapshot read is
O(log_t n).
Lessons¶
- CoW gives you transactions for free — until you need to GC old roots, which becomes the hardest part of the design.
- LMDB's design uses a single writer with CoW; this skips most of the GC pain because old roots can be retired in batches.
Task 8: MVCC Index Returning Row Versions per Timestamp¶
Problem¶
Each value is a chain of (timestamp, value, next) versions. search(k, ts) returns the value whose timestamp is the largest <= ts. The B+ tree key is still k; the leaf entry points to the version chain.
Hints¶
- Use a real append-only version chain per key.
- For
ts = ∞, return the latest version. - For
ts < first_version.ts, return "not found" (snapshot too old / key didn't exist).
Reference solution — Go¶
type Version struct {
ts int64
val string
next *Version
}
type MVCCLeaf struct {
keys []int
versions []*Version
next *MVCCLeaf
}
func (lf *MVCCLeaf) Read(k int, ts int64) (string, bool) {
i := lowerBound(lf.keys, k)
if i >= len(lf.keys) || lf.keys[i] != k {
return "", false
}
for v := lf.versions[i]; v != nil; v = v.next {
if v.ts <= ts {
return v.val, true
}
}
return "", false
}
func (lf *MVCCLeaf) Write(k int, ts int64, val string) {
i := lowerBound(lf.keys, k)
if i < len(lf.keys) && lf.keys[i] == k {
lf.versions[i] = &Version{ts: ts, val: val, next: lf.versions[i]}
return
}
lf.keys = insertAt(lf.keys, i, k)
lf.versions = insertVerAt(lf.versions, i, &Version{ts: ts, val: val})
}
Reference solution — Java¶
static final class Version {
long ts;
String val;
Version next;
Version(long ts, String val, Version n) { this.ts = ts; this.val = val; this.next = n; }
}
String read(int k, long ts) {
Leaf lf = descendTo(k);
int i = lowerBound(lf.keys, lf.n, k);
if (i >= lf.n || lf.keys[i] != k) return null;
for (Version v = lf.versions[i]; v != null; v = v.next) {
if (v.ts <= ts) return v.val;
}
return null;
}
Reference solution — Python¶
class Version:
__slots__ = ("ts", "val", "next")
def __init__(self, ts, val, nxt=None):
self.ts = ts
self.val = val
self.next = nxt
class MVCCLeaf:
__slots__ = ("keys", "versions", "next")
def __init__(self):
self.keys, self.versions, self.next = [], [], None
def read(self, k, ts):
for i, x in enumerate(self.keys):
if x == k:
v = self.versions[i]
while v is not None:
if v.ts <= ts:
return v.val
v = v.next
return None
return None
def write(self, k, ts, val):
for i, x in enumerate(self.keys):
if x == k:
self.versions[i] = Version(ts, val, self.versions[i])
return
if x > k:
self.keys.insert(i, k)
self.versions.insert(i, Version(ts, val))
return
self.keys.append(k)
self.versions.append(Version(ts, val))
Test cases¶
- Write key=1 with values "a" at ts=10, "b" at ts=20, "c" at ts=30.
read(1, 5)returns None.read(1, 10)returns "a".read(1, 25)returns "b".read(1, 1000)returns "c".- Run 100 concurrent writers and 100 readers with different
ts; assert each reader sees a consistent snapshot.
Complexity¶
read:O(log_t n + V)where V is the version chain length (bounded by tx GC).write:O(log_t n + 1).
Lessons¶
- The B+ tree key locates the version chain; the chain is sorted by descending timestamp.
- Snapshot isolation falls out of MVCC almost for free; the hard part is the garbage collection of versions no reader can see (vacuum in PostgreSQL).
Task 9: Order-Statistic B+ Tree¶
Problem¶
Add a subtreeCount field to every internal node child slot. Implement rank(k) (number of keys < k) and select(i) (the i-th key in order).
Hints¶
- Each internal node has
counts[i]= number of keys in subtreechildren[i]. - On insert/delete, walk the modified path and increment / decrement the counts.
rank(k): descend, accumulating left-subtree counts as you go.select(i): at each node, find the child whose cumulative count containsi, subtract preceding counts, descend.
Reference solution — Go¶
type OSInternal struct {
keys []int
children []Node
counts []int // counts[i] = subtree size of children[i]
}
func (t *Tree) Rank(k int) int {
return rank(t.root, k)
}
func rank(n Node, k int) int {
if n.isLeaf() {
lf := n.(*Leaf)
return lowerBound(lf.keys, k)
}
in := n.(*OSInternal)
sum := 0
for i, sep := range in.keys {
if k <= sep {
return sum + rank(in.children[i], k)
}
sum += in.counts[i]
}
return sum + rank(in.children[len(in.children)-1], k)
}
func (t *Tree) Select(i int) int {
return selectAt(t.root, i)
}
func selectAt(n Node, i int) int {
if n.isLeaf() {
return n.(*Leaf).keys[i]
}
in := n.(*OSInternal)
for j, c := range in.counts {
if i < c {
return selectAt(in.children[j], i)
}
i -= c
}
return -1
}
Reference solution — Java¶
int rank(Node n, int k) {
if (n.isLeaf()) return lowerBound(((Leaf) n).keys, ((Leaf) n).n, k);
OSInternal in = (OSInternal) n;
int sum = 0;
for (int i = 0; i < in.n; i++) {
if (k <= in.keys[i]) return sum + rank(in.children[i], k);
sum += in.counts[i];
}
return sum + rank(in.children[in.n], k);
}
int select(Node n, int i) {
if (n.isLeaf()) return ((Leaf) n).keys[i];
OSInternal in = (OSInternal) n;
for (int j = 0; j <= in.n; j++) {
if (i < in.counts[j]) return select(in.children[j], i);
i -= in.counts[j];
}
throw new IndexOutOfBoundsException();
}
Reference solution — Python¶
class OSInternal(Internal):
def __init__(self):
super().__init__()
self.counts = []
def rank(n, k):
if isinstance(n, Leaf):
for i, x in enumerate(n.keys):
if x >= k:
return i
return len(n.keys)
s = 0
for i, sep in enumerate(n.keys):
if k <= sep:
return s + rank(n.children[i], k)
s += n.counts[i]
return s + rank(n.children[-1], k)
def select_at(n, i):
if isinstance(n, Leaf):
return n.keys[i]
for j, c in enumerate(n.counts):
if i < c:
return select_at(n.children[j], i)
i -= c
raise IndexError
Test cases¶
- Insert
1..100.rank(50)returns 49.select(0)returns 1.select(99)returns 100. - After deleting 50,
rank(50)returns 49,select(49)returns 51.
Complexity¶
- Both
rankandselect:O(log_t n).
Lessons¶
- Counts only need to be touched on the modified root-to-leaf path. The rest of the tree is unaffected.
- Order statistics turn the B+ tree into a ranked index — useful for "the 1000th-largest order" kinds of queries.
Task 10: Benchmark B+ Tree vs Classic B-tree¶
Problem¶
Implement both a classic B-tree and your B+ tree from Task 1+2. Run a benchmark that: - Inserts 1,000,000 random integers into each. - Performs 100,000 range scans of average width 100. - Reports total time and per-operation latency.
The B+ tree should win range scans by 5–50x; insert times should be within 1.5x of each other.
Reference solution — Go¶
func BenchAll() {
bp := New()
bt := classicNew()
keys := make([]int, 1_000_000)
for i := range keys { keys[i] = rand.Int() }
start := time.Now()
for _, k := range keys { bp.Insert(k, "v") }
fmt.Println("B+ insert:", time.Since(start))
start = time.Now()
for _, k := range keys { bt.Insert(k, "v") }
fmt.Println("B-tree insert:", time.Since(start))
rangeStart := time.Now()
total := 0
for i := 0; i < 100_000; i++ {
lo := rand.Int()
hi := lo + 100
total += len(bp.RangeScan(lo, hi))
}
fmt.Println("B+ range scan total:", time.Since(rangeStart), "results:", total)
rangeStart = time.Now()
total = 0
for i := 0; i < 100_000; i++ {
lo := rand.Int()
hi := lo + 100
total += bt.RangeScan(lo, hi)
}
fmt.Println("B-tree range scan total:", time.Since(rangeStart), "results:", total)
}
Reference solution — Java¶
public static void main(String[] args) {
BPlusTree bp = new BPlusTree();
BTree bt = new BTree();
Random rnd = new Random(42);
int[] keys = rnd.ints(1_000_000).toArray();
long s = System.nanoTime();
for (int k : keys) bp.insert(k, "v");
System.out.println("B+ insert ms: " + (System.nanoTime() - s) / 1_000_000);
s = System.nanoTime();
for (int k : keys) bt.insert(k, "v");
System.out.println("B-tree insert ms: " + (System.nanoTime() - s) / 1_000_000);
s = System.nanoTime();
long total = 0;
for (int i = 0; i < 100_000; i++) {
int lo = rnd.nextInt();
total += bp.rangeScan(lo, lo + 100).size();
}
System.out.println("B+ scan ms: " + (System.nanoTime() - s) / 1_000_000 + " total=" + total);
}
Reference solution — Python¶
import random, time
from bplus import BPlusTree
from btree import BTree
bp, bt = BPlusTree(), BTree()
keys = [random.randint(0, 1 << 30) for _ in range(1_000_000)]
t = time.perf_counter()
for k in keys:
bp.insert(k, "v")
print(f"B+ insert: {time.perf_counter()-t:.2f}s")
t = time.perf_counter()
for k in keys:
bt.insert(k, "v")
print(f"B-tree insert: {time.perf_counter()-t:.2f}s")
t = time.perf_counter()
total = 0
for _ in range(100_000):
lo = random.randint(0, 1 << 30)
total += len(bp.range_scan(lo, lo + 100))
print(f"B+ range: {time.perf_counter()-t:.2f}s total={total}")
t = time.perf_counter()
total = 0
for _ in range(100_000):
lo = random.randint(0, 1 << 30)
total += bt.range_scan(lo, lo + 100)
print(f"B-tree range: {time.perf_counter()-t:.2f}s total={total}")
Test cases¶
- Vary
tfrom 3 to 64; tabulate height and insert latency. Height should drop log-style; insert latency goes through a minimum near typical CPU cache line sizes. - Vary range width from 10 to 100,000; B+ tree advantage grows with width.
Complexity¶
- Insert: B+ tree is
O(log_t n)(same as B-tree). - Range scan width
w: B+ tree isO(log_t n + w / b), B-tree isO(log_t n * w)in the worst case (multiple re-descents).
Lessons¶
- The B+ tree's range-scan speed advantage gets bigger as the range widens — which is exactly the database workload.
- For exact-match lookups only, B+ tree and B-tree are essentially tied; the choice is dictated by the access pattern.
Final notes¶
After completing all ten tasks you will have written, in three languages, the same family of trees that powers PostgreSQL, MySQL, SQLite, and the JVM file system. The two structural rules — data at leaves only, leaves linked in a list — never become more sophisticated than they were at task 1. What changes is everything around them: locking, page format, MVCC, persistence, GC. The B+ tree is the rare data structure that grows in every direction without changing its core.
Go back to senior.md and professional.md and re-read the sections on PostgreSQL nbtree, LMDB, and ART after finishing these tasks — you will read them like a different person.