Huffman Coding — Practice Tasks¶
All tasks must be solved in Go, Java, and Python. Each task ships with a problem statement, constraints, hints, and a reference solution in all three languages. Build a round-trip tester early — encode then decode must reproduce the input. Known checks:
"abracadabra"→ 23 bits with an optimal code; entropy bandH ≤ L < H + 1; Kraft sumΣ 2^(−len) = 1for a complete code.
Beginner Tasks (5)¶
Task 1 — Count frequencies and report the distribution¶
Problem. Given a string, count how often each character appears and print the symbols sorted, with their counts and probabilities.
Constraints. 1 ≤ |text| ≤ 10⁶, ASCII.
Hint. A map/dict keyed by character. Probability = count / total.
Go¶
package main
import (
"fmt"
"sort"
)
func main() {
text := "abracadabra"
freq := map[byte]int{}
for i := 0; i < len(text); i++ {
freq[text[i]]++
}
syms := make([]int, 0, len(freq))
for s := range freq {
syms = append(syms, int(s))
}
sort.Ints(syms)
for _, s := range syms {
fmt.Printf("%c: %d (%.3f)\n", byte(s), freq[byte(s)], float64(freq[byte(s)])/float64(len(text)))
}
}
Java¶
import java.util.*;
public class Task1 {
public static void main(String[] args) {
String text = "abracadabra";
Map<Character, Integer> freq = new TreeMap<>();
for (char c : text.toCharArray()) freq.merge(c, 1, Integer::sum);
for (var e : freq.entrySet())
System.out.printf("%c: %d (%.3f)%n", e.getKey(), e.getValue(),
(double) e.getValue() / text.length());
}
}
Python¶
from collections import Counter
text = "abracadabra"
freq = Counter(text)
for s in sorted(freq):
print(f"{s}: {freq[s]} ({freq[s] / len(text):.3f})")
Task 2 — Build the Huffman tree and emit the code table¶
Problem. Build the Huffman tree with a min-heap and produce the symbol → codeword table.
Constraints. 1 ≤ n ≤ 256. Deterministic ties.
Hint. Pop two smallest, merge (sum freq), push; n−1 times. Then DFS: left=0, right=1.
Go¶
package main
import (
"container/heap"
"fmt"
"sort"
)
type node struct {
freq, order int
sym byte
leaf bool
left, right *node
}
type mh []*node
func (h mh) Len() int { return len(h) }
func (h mh) Less(i, j int) bool {
if h[i].freq != h[j].freq {
return h[i].freq < h[j].freq
}
return h[i].order < h[j].order
}
func (h mh) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h *mh) Push(x any) { *h = append(*h, x.(*node)) }
func (h *mh) Pop() any { o := *h; n := len(o); x := o[n-1]; *h = o[:n-1]; return x }
func build(freq map[byte]int) *node {
syms := make([]int, 0, len(freq))
for s := range freq {
syms = append(syms, int(s))
}
sort.Ints(syms)
h := &mh{}
ord := 0
for _, s := range syms {
heap.Push(h, &node{freq: freq[byte(s)], sym: byte(s), leaf: true, order: ord})
ord++
}
if h.Len() == 1 {
only := heap.Pop(h).(*node)
return &node{freq: only.freq, left: only, order: ord}
}
for h.Len() > 1 {
a := heap.Pop(h).(*node)
b := heap.Pop(h).(*node)
heap.Push(h, &node{freq: a.freq + b.freq, left: a, right: b, order: ord})
ord++
}
return heap.Pop(h).(*node)
}
func codes(root *node) map[byte]string {
out := map[byte]string{}
var walk func(n *node, p string)
walk = func(n *node, p string) {
if n.leaf {
if p == "" {
p = "0"
}
out[n.sym] = p
return
}
walk(n.left, p+"0")
walk(n.right, p+"1")
}
walk(root, "")
return out
}
func main() {
text := "abracadabra"
freq := map[byte]int{}
for i := 0; i < len(text); i++ {
freq[text[i]]++
}
table := codes(build(freq))
keys := make([]int, 0, len(table))
for s := range table {
keys = append(keys, int(s))
}
sort.Ints(keys)
for _, s := range keys {
fmt.Printf("%c -> %s\n", byte(s), table[byte(s)])
}
}
Java¶
import java.util.*;
public class Task2 {
static class N { int freq, order; char sym; boolean leaf; N left, right;
N(int f, int o, char s, boolean l) { freq=f; order=o; sym=s; leaf=l; } }
static N build(Map<Character, Integer> freq) {
List<Character> syms = new ArrayList<>(freq.keySet());
Collections.sort(syms);
PriorityQueue<N> h = new PriorityQueue<>(
(a, b) -> a.freq != b.freq ? a.freq - b.freq : a.order - b.order);
int[] ord = {0};
for (char s : syms) h.add(new N(freq.get(s), ord[0]++, s, true));
if (h.size() == 1) { N o = h.poll(); N p = new N(o.freq, ord[0]++, '\0', false); p.left = o; return p; }
while (h.size() > 1) {
N a = h.poll(), b = h.poll();
N m = new N(a.freq + b.freq, ord[0]++, '\0', false); m.left = a; m.right = b; h.add(m);
}
return h.poll();
}
static void walk(N n, String p, Map<Character, String> out) {
if (n.leaf) { out.put(n.sym, p.isEmpty() ? "0" : p); return; }
walk(n.left, p + "0", out); walk(n.right, p + "1", out);
}
public static void main(String[] args) {
String text = "abracadabra";
Map<Character, Integer> freq = new HashMap<>();
for (char c : text.toCharArray()) freq.merge(c, 1, Integer::sum);
Map<Character, String> table = new TreeMap<>();
walk(build(freq), "", table);
for (var e : table.entrySet()) System.out.println(e.getKey() + " -> " + e.getValue());
}
}
Python¶
import heapq
from collections import Counter
from itertools import count
def build(freq):
tie = count()
heap = [[freq[s], next(tie), s, None, None] for s in sorted(freq)]
heapq.heapify(heap)
if len(heap) == 1:
only = heap[0]
return [only[0], next(tie), None, only, None]
while len(heap) > 1:
a, b = heapq.heappop(heap), heapq.heappop(heap)
heapq.heappush(heap, [a[0] + b[0], next(tie), None, a, b])
return heap[0]
def codes(root):
out = {}
def walk(node, p):
_, _, sym, left, right = node
if sym is not None:
out[sym] = p or "0"
else:
walk(left, p + "0"); walk(right, p + "1")
walk(root, "")
return out
table = codes(build(Counter("abracadabra")))
for s in sorted(table):
print(f"{s} -> {table[s]}")
Task 3 — Encode and decode (round trip)¶
Problem. Encode a string with the code table, then decode the bits by walking the tree; assert the result equals the input.
Constraints. 1 ≤ |text| ≤ 10⁶.
Hint. Decode: descend left on 0, right on 1; emit symbol at each leaf and reset to root.
Go¶
package main
import (
"fmt"
"strings"
)
func main() {
text := "abracadabra"
freq := map[byte]int{}
for i := 0; i < len(text); i++ {
freq[text[i]]++
}
root := build(freq) // from Task 2
table := codes(root) // from Task 2
var enc strings.Builder
for i := 0; i < len(text); i++ {
enc.WriteString(table[text[i]])
}
bits := enc.String()
var dec strings.Builder
cur := root
for _, b := range bits {
if b == '0' {
cur = cur.left
} else {
cur = cur.right
}
if cur.leaf {
dec.WriteByte(cur.sym)
cur = root
}
}
fmt.Println("match:", dec.String() == text, "bits:", len(bits))
}
Java¶
public class Task3 {
public static void main(String[] args) {
String text = "abracadabra";
var freq = new java.util.HashMap<Character, Integer>();
for (char c : text.toCharArray()) freq.merge(c, 1, Integer::sum);
Task2.N root = Task2.build(freq);
var table = new java.util.HashMap<Character, String>();
Task2.walk(root, "", table);
StringBuilder enc = new StringBuilder();
for (char c : text.toCharArray()) enc.append(table.get(c));
String bits = enc.toString();
StringBuilder dec = new StringBuilder(); Task2.N cur = root;
for (char b : bits.toCharArray()) {
cur = b == '0' ? cur.left : cur.right;
if (cur.leaf) { dec.append(cur.sym); cur = root; }
}
System.out.println("match: " + dec.toString().equals(text) + " bits: " + bits.length());
}
}
Python¶
from collections import Counter
text = "abracadabra"
root = build(Counter(text)) # from Task 2
table = codes(root) # from Task 2
bits = "".join(table[c] for c in text)
out, node = [], root
for b in bits:
node = node[3] if b == "0" else node[4]
if node[2] is not None:
out.append(node[2]); node = root
print("match:", "".join(out) == text, "bits:", len(bits))
Task 4 — Compare to fixed-length size¶
Problem. Compute the Huffman-encoded size in bits and compare to a fixed-length code (⌈log₂ n⌉ bits/symbol). Report the saving.
Constraints. 1 ≤ n ≤ 256.
Hint. Huffman bits = Σ freq·len. Fixed = N · ⌈log₂ n⌉.
Go¶
package main
import (
"fmt"
"math"
)
func main() {
text := "abracadabra"
freq := map[byte]int{}
for i := 0; i < len(text); i++ {
freq[text[i]]++
}
table := codes(build(freq)) // Task 2
huff := 0
for i := 0; i < len(text); i++ {
huff += len(table[text[i]])
}
n := len(freq)
fixedBits := int(math.Ceil(math.Log2(float64(n))))
if n == 1 {
fixedBits = 1
}
fixed := len(text) * fixedBits
fmt.Printf("huffman=%d fixed=%d saving=%.1f%%\n", huff, fixed, 100*float64(fixed-huff)/float64(fixed))
}
Java¶
public class Task4 {
public static void main(String[] args) {
String text = "abracadabra";
var freq = new java.util.HashMap<Character, Integer>();
for (char c : text.toCharArray()) freq.merge(c, 1, Integer::sum);
var table = new java.util.HashMap<Character, String>();
Task2.walk(Task2.build(freq), "", table);
int huff = 0;
for (char c : text.toCharArray()) huff += table.get(c).length();
int n = freq.size();
int fixedBits = n == 1 ? 1 : (int) Math.ceil(Math.log(n) / Math.log(2));
int fixed = text.length() * fixedBits;
System.out.printf("huffman=%d fixed=%d saving=%.1f%%%n", huff, fixed, 100.0 * (fixed - huff) / fixed);
}
}
Python¶
import math
from collections import Counter
text = "abracadabra"
freq = Counter(text)
table = codes(build(freq)) # Task 2
huff = sum(len(table[c]) for c in text)
n = len(freq)
fixed_bits = 1 if n == 1 else math.ceil(math.log2(n))
fixed = len(text) * fixed_bits
print(f"huffman={huff} fixed={fixed} saving={100 * (fixed - huff) / fixed:.1f}%")
Task 5 — Handle single-symbol and empty inputs¶
Problem. Confirm your builder handles "" (empty) and a string of one repeated character ("aaaa") without crashing; the lone symbol must get a 1-bit code.
Constraints. Inputs include "", "a", "aaaa".
Hint. Empty → empty table, zero bits. Single distinct symbol → code "0".
Go¶
package main
import "fmt"
func main() {
for _, text := range []string{"", "a", "aaaa"} {
freq := map[byte]int{}
for i := 0; i < len(text); i++ {
freq[text[i]]++
}
if len(freq) == 0 {
fmt.Printf("%q -> empty\n", text)
continue
}
table := codes(build(freq)) // Task 2
fmt.Printf("%q -> %v\n", text, table)
}
}
Java¶
public class Task5 {
public static void main(String[] args) {
for (String text : new String[]{"", "a", "aaaa"}) {
var freq = new java.util.HashMap<Character, Integer>();
for (char c : text.toCharArray()) freq.merge(c, 1, Integer::sum);
if (freq.isEmpty()) { System.out.println("\"" + text + "\" -> empty"); continue; }
var table = new java.util.HashMap<Character, String>();
Task2.walk(Task2.build(freq), "", table);
System.out.println("\"" + text + "\" -> " + table);
}
}
}
Python¶
from collections import Counter
for text in ["", "a", "aaaa"]:
freq = Counter(text)
if not freq:
print(repr(text), "-> empty")
continue
print(repr(text), "->", codes(build(freq))) # Task 2
Intermediate Tasks (5)¶
Task 6 — Canonical Huffman from lengths¶
Problem. Compute Huffman code lengths, then assign canonical codewords from the lengths alone. Verify decoding works from the canonical table.
Constraints. 1 ≤ n ≤ 10⁵.
Hint. Sort by (length, symbol); code <<= Δlength; assign; code++.
Go¶
package main
import (
"container/heap"
"fmt"
"sort"
)
func lengths(freq map[byte]int) map[byte]int {
syms := make([]int, 0, len(freq))
for s := range freq {
syms = append(syms, int(s))
}
sort.Ints(syms)
h := &mh{} // mh from Task 2
ord := 0
for _, s := range syms {
heap.Push(h, &node{freq: freq[byte(s)], sym: byte(s), leaf: true, order: ord})
ord++
}
L := map[byte]int{}
if h.Len() == 1 {
L[(*h)[0].sym] = 1
return L
}
for h.Len() > 1 {
a := heap.Pop(h).(*node)
b := heap.Pop(h).(*node)
heap.Push(h, &node{freq: a.freq + b.freq, left: a, right: b, order: ord})
ord++
}
var walk func(n *node, d int)
walk = func(n *node, d int) {
if n.leaf {
L[n.sym] = d
return
}
walk(n.left, d+1)
walk(n.right, d+1)
}
walk(heap.Pop(h).(*node), 0)
return L
}
func canonical(L map[byte]int) map[byte]string {
type sl struct {
s byte
l int
}
arr := make([]sl, 0, len(L))
for s, l := range L {
arr = append(arr, sl{s, l})
}
sort.Slice(arr, func(i, j int) bool {
if arr[i].l != arr[j].l {
return arr[i].l < arr[j].l
}
return arr[i].s < arr[j].s
})
out := map[byte]string{}
code, prev := 0, 0
for _, e := range arr {
code <<= (e.l - prev)
out[e.s] = fmt.Sprintf("%0*b", e.l, code)
code++
prev = e.l
}
return out
}
func main() {
freq := map[byte]int{'a': 5, 'b': 2, 'r': 2, 'c': 1, 'd': 1}
for s, c := range canonical(lengths(freq)) {
fmt.Printf("%c -> %s\n", s, c)
}
}
Java¶
import java.util.*;
public class Task6 {
static Map<Character, Integer> lengths(Map<Character, Integer> freq) {
List<Character> syms = new ArrayList<>(freq.keySet());
Collections.sort(syms);
PriorityQueue<Task2.N> h = new PriorityQueue<>(
(a, b) -> a.freq != b.freq ? a.freq - b.freq : a.order - b.order);
int[] ord = {0};
for (char s : syms) h.add(new Task2.N(freq.get(s), ord[0]++, s, true));
Map<Character, Integer> L = new HashMap<>();
if (h.size() == 1) { L.put(h.peek().sym, 1); return L; }
while (h.size() > 1) {
Task2.N a = h.poll(), b = h.poll();
Task2.N m = new Task2.N(a.freq + b.freq, ord[0]++, '\0', false);
m.left = a; m.right = b; h.add(m);
}
walk(h.poll(), 0, L);
return L;
}
static void walk(Task2.N n, int d, Map<Character, Integer> L) {
if (n.leaf) { L.put(n.sym, d); return; }
walk(n.left, d + 1, L); walk(n.right, d + 1, L);
}
static Map<Character, String> canonical(Map<Character, Integer> L) {
List<Map.Entry<Character, Integer>> a = new ArrayList<>(L.entrySet());
a.sort((x, y) -> x.getValue().equals(y.getValue())
? Character.compare(x.getKey(), y.getKey()) : x.getValue() - y.getValue());
Map<Character, String> out = new TreeMap<>();
int code = 0, prev = 0;
for (var e : a) {
int Lv = e.getValue(); code <<= (Lv - prev);
out.put(e.getKey(), String.format("%" + Lv + "s",
Integer.toBinaryString(code)).replace(' ', '0'));
code++; prev = Lv;
}
return out;
}
public static void main(String[] args) {
var freq = new HashMap<Character, Integer>();
for (char c : "abracadabra".toCharArray()) freq.merge(c, 1, Integer::sum);
canonical(lengths(freq)).forEach((s, c) -> System.out.println(s + " -> " + c));
}
}
Python¶
import heapq
from collections import Counter
from itertools import count
def lengths(freq):
tie = count()
heap = [[freq[s], next(tie), s, None, None] for s in sorted(freq)]
heapq.heapify(heap)
if len(heap) == 1:
return {heap[0][2]: 1}
while len(heap) > 1:
a, b = heapq.heappop(heap), heapq.heappop(heap)
heapq.heappush(heap, [a[0] + b[0], next(tie), None, a, b])
L = {}
def walk(node, d):
_, _, s, left, right = node
if s is not None: L[s] = d
else: walk(left, d + 1); walk(right, d + 1)
walk(heap[0], 0)
return L
def canonical(L):
out, code, prev = {}, 0, 0
for s in sorted(L, key=lambda x: (L[x], x)):
l = L[s]; code <<= (l - prev); out[s] = format(code, f"0{l}b"); code += 1; prev = l
return out
for s, c in canonical(lengths(Counter("abracadabra"))).items():
print(f"{s} -> {c}")
Task 7 — Verify the entropy bound¶
Problem. For several distributions, build the code, compute L and H, and assert H ≤ L < H + 1.
Constraints. Include a power-of-two distribution (where L = H) and a uniform one.
Hint. H = −Σ p log₂ p; L = Σ p·len.
Go¶
package main
import (
"fmt"
"math"
)
func main() {
dists := []map[byte]int{
{'a': 5, 'b': 2, 'r': 2, 'c': 1, 'd': 1},
{'a': 4, 'b': 2, 'c': 1, 'd': 1}, // 1/2,1/4,1/8,1/8 -> L=H=1.75
{'w': 1, 'x': 1, 'y': 1, 'z': 1},
}
for _, freq := range dists {
L := lengths(freq) // Task 6
total := 0
for _, f := range freq {
total += f
}
avg, H := 0.0, 0.0
for s, f := range freq {
p := float64(f) / float64(total)
avg += p * float64(L[s])
H += -p * math.Log2(p)
}
fmt.Printf("H=%.4f L=%.4f ok=%v\n", H, avg, H <= avg+1e-9 && avg < H+1)
}
}
Java¶
import java.util.*;
public class Task7 {
public static void main(String[] args) {
List<Map<Character, Integer>> dists = List.of(
Map.of('a',5,'b',2,'r',2,'c',1,'d',1),
Map.of('a',4,'b',2,'c',1,'d',1),
Map.of('w',1,'x',1,'y',1,'z',1));
for (var freq : dists) {
var L = Task6.lengths(freq);
int total = freq.values().stream().mapToInt(Integer::intValue).sum();
double avg = 0, H = 0;
for (var e : freq.entrySet()) {
double p = (double) e.getValue() / total;
avg += p * L.get(e.getKey());
H += -p * (Math.log(p) / Math.log(2));
}
System.out.printf("H=%.4f L=%.4f ok=%b%n", H, avg, H <= avg + 1e-9 && avg < H + 1);
}
}
}
Python¶
import math
from collections import Counter
for freq in [Counter("abracadabra"),
{"a": 4, "b": 2, "c": 1, "d": 1}, # L = H = 1.75
{"w": 1, "x": 1, "y": 1, "z": 1}]:
L = lengths(freq) # Task 6
total = sum(freq.values())
avg = sum((freq[s] / total) * L[s] for s in freq)
H = -sum((freq[s] / total) * math.log2(freq[s] / total) for s in freq)
print(f"H={H:.4f} L={avg:.4f} ok={H - 1e-9 <= avg < H + 1}")
Task 8 — Validate the Kraft inequality¶
Problem. Confirm the computed code lengths satisfy Σ 2^(−len) = 1 (a complete Huffman code is always full).
Constraints. 1 ≤ n ≤ 10⁵.
Hint. Accumulate 2^(−len); it must equal 1 for a full tree.
Go¶
package main
import "fmt"
func main() {
freq := map[byte]int{'a': 5, 'b': 2, 'r': 2, 'c': 1, 'd': 1}
L := lengths(freq) // Task 6
kraft := 0.0
for _, l := range L {
kraft += 1.0 / float64(int64(1)<<l)
}
fmt.Printf("Kraft = %.6f (== 1: %v)\n", kraft, kraft > 0.999999 && kraft < 1.000001)
}
Java¶
public class Task8 {
public static void main(String[] args) {
var freq = new java.util.HashMap<Character, Integer>();
for (char c : "abracadabra".toCharArray()) freq.merge(c, 1, Integer::sum);
var L = Task6.lengths(freq);
double kraft = 0;
for (int l : L.values()) kraft += 1.0 / (1L << l);
System.out.printf("Kraft = %.6f (== 1: %b)%n", kraft, Math.abs(kraft - 1) < 1e-6);
}
}
Python¶
from collections import Counter
L = lengths(Counter("abracadabra")) # Task 6
kraft = sum(2 ** -l for l in L.values())
print(f"Kraft = {kraft:.6f} (== 1: {abs(kraft - 1) < 1e-9})")
Task 9 — Decode from a canonical table (no tree)¶
Problem. Given canonical lengths and a bit stream, decode by accumulating bits into (length, value) and matching against the canonical table — the way zlib/JPEG decode.
Constraints. Stream is well-formed.
Hint. Build table[(L, code)] = symbol; accumulate bits, reset on match.
Python¶
def decode_canonical(L, bits):
code, prev, table = 0, 0, {}
for s in sorted(L, key=lambda x: (L[x], x)):
l = L[s]; code <<= (l - prev); table[(l, code)] = s; code += 1; prev = l
out, acc, length = [], 0, 0
for b in bits:
acc = (acc << 1) | (1 if b == "1" else 0); length += 1
if (length, acc) in table:
out.append(table[(length, acc)]); acc, length = 0, 0
return "".join(out)
from collections import Counter
text = "abracadabra"
L = lengths(Counter(text)) # Task 6
table = canonical(L) # Task 6
bits = "".join(table[c] for c in text)
print(decode_canonical(L, bits) == text) # True
Go / Java¶
// Build canonical codes from lengths; map (length, value) -> symbol. Accumulate bits,
// shifting into an integer; when (length, value) matches, emit and reset. Mirror Python.
Task 10 — Pack bits into bytes with a length header¶
Problem. Pack the encoded bit string into bytes, store the true bit count, then unpack and decode — proving padding is not misread.
Constraints. Arbitrary |text|.
Hint. Write the bit count first; on unpack, read exactly that many bits.
Python¶
from collections import Counter
def pack(bits):
n = len(bits)
by = bytearray()
for i in range(0, n, 8):
chunk = bits[i:i + 8].ljust(8, "0")
by.append(int(chunk, 2))
return n, bytes(by)
def unpack(n, by):
bits = "".join(format(b, "08b") for b in by)
return bits[:n]
text = "abracadabra"
root = build(Counter(text)); table = codes(root) # Task 2
bits = "".join(table[c] for c in text)
n, packed = pack(bits)
assert unpack(n, packed) == bits
print("packed bytes:", len(packed), "for", n, "bits")
Go / Java¶
// Pack: for each 8-bit group, build a byte (pad last group with zeros); store bit count n.
// Unpack: expand bytes to bits, take the first n. Decode via the tree (Task 3).
// Verify the round trip survives byte packing.
Advanced Tasks (5)¶
Task 11 — Length-limited Huffman (package-merge)¶
Problem. Compute optimal code lengths with max len ≤ L_max using package-merge. Verify max len ≤ L_max and Kraft ≤ 1, and that WPL is ≥ the unlimited Huffman WPL.
Constraints. 1 ≤ n ≤ 10⁴, ⌈log₂ n⌉ ≤ L_max ≤ 32.
Hint. Coins of dyadic width 2^(−d); at each level package adjacent pairs, merge with originals, keep cheapest; select 2(n−1) cheapest.
Python¶
def package_merge(freqs, limit):
n = len(freqs)
if n == 1:
return [1]
order = sorted(range(n), key=lambda i: freqs[i])
orig = [(freqs[i], (i,)) for i in order]
prev = list(orig)
for _ in range(limit - 1):
packaged = [(prev[i][0] + prev[i + 1][0], prev[i][1] + prev[i + 1][1])
for i in range(0, len(prev) - 1, 2)]
prev = sorted(orig + packaged, key=lambda x: x[0])
lengths = [0] * n
for _, syms in prev[:2 * (n - 1)]:
for s in syms:
lengths[s] += 1
return lengths
freqs = [20, 17, 6, 3, 2, 2]
L = package_merge(freqs, 4)
print("lengths:", L, "max:", max(L))
assert max(L) <= 4
assert sum(2 ** -l for l in L) <= 1 + 1e-9
Go¶
package main
import (
"fmt"
"sort"
)
func packageMerge(freqs []int, limit int) []int {
n := len(freqs)
if n == 1 {
return []int{1}
}
type item struct {
w int
syms []int
}
idx := make([]int, n)
for i := range idx {
idx[i] = i
}
sort.Slice(idx, func(a, b int) bool { return freqs[idx[a]] < freqs[idx[b]] })
orig := make([]item, n)
for i, id := range idx {
orig[i] = item{freqs[id], []int{id}}
}
prev := append([]item(nil), orig...)
for L := 0; L < limit-1; L++ {
var pk []item
for i := 0; i+1 < len(prev); i += 2 {
s := append(append([]int{}, prev[i].syms...), prev[i+1].syms...)
pk = append(pk, item{prev[i].w + prev[i+1].w, s})
}
merged := append(append([]item{}, orig...), pk...)
sort.Slice(merged, func(a, b int) bool { return merged[a].w < merged[b].w })
prev = merged
}
lengths := make([]int, n)
for i := 0; i < 2*(n-1) && i < len(prev); i++ {
for _, s := range prev[i].syms {
lengths[s]++
}
}
return lengths
}
func main() {
L := packageMerge([]int{20, 17, 6, 3, 2, 2}, 4)
mx := 0
for _, l := range L {
if l > mx {
mx = l
}
}
fmt.Println("lengths:", L, "max:", mx)
}
Java¶
import java.util.*;
public class Task11 {
static int[] packageMerge(int[] freqs, int limit) {
int n = freqs.length;
if (n == 1) return new int[]{1};
Integer[] idx = new Integer[n];
for (int i = 0; i < n; i++) idx[i] = i;
Arrays.sort(idx, (a, b) -> freqs[a] - freqs[b]);
List<int[]> origW = new ArrayList<>(); List<List<Integer>> origS = new ArrayList<>();
for (int id : idx) { origW.add(new int[]{freqs[id]}); origS.add(new ArrayList<>(List.of(id))); }
List<int[]> pw = new ArrayList<>(origW); List<List<Integer>> ps = new ArrayList<>(origS);
for (int L = 0; L < limit - 1; L++) {
List<int[]> nw = new ArrayList<>(); List<List<Integer>> ns = new ArrayList<>();
for (int i = 0; i + 1 < pw.size(); i += 2) {
nw.add(new int[]{pw.get(i)[0] + pw.get(i + 1)[0]});
List<Integer> s = new ArrayList<>(ps.get(i)); s.addAll(ps.get(i + 1)); ns.add(s);
}
List<Integer> order = new ArrayList<>();
List<int[]> allW = new ArrayList<>(origW); allW.addAll(nw);
List<List<Integer>> allS = new ArrayList<>(origS); allS.addAll(ns);
for (int i = 0; i < allW.size(); i++) order.add(i);
order.sort((a, b) -> allW.get(a)[0] - allW.get(b)[0]);
pw = new ArrayList<>(); ps = new ArrayList<>();
for (int i : order) { pw.add(allW.get(i)); ps.add(allS.get(i)); }
}
int[] lengths = new int[n];
for (int i = 0; i < 2 * (n - 1) && i < ps.size(); i++)
for (int s : ps.get(i)) lengths[s]++;
return lengths;
}
public static void main(String[] args) {
int[] L = packageMerge(new int[]{20, 17, 6, 3, 2, 2}, 4);
System.out.println(Arrays.toString(L));
}
}
Task 12 — Adaptive (one-pass) Huffman skeleton¶
Problem. Implement a simplified adaptive scheme: maintain running counts, rebuild the static code after each symbol, and decode in lockstep. (Demonstrates the one-pass idea without the full FGK tree-rotation machinery.)
Constraints. |text| ≤ 10⁴ (rebuild-per-symbol is O(N·n log n) — fine for the demo).
Hint. Both encoder and decoder must rebuild from the same counts before processing each next symbol, using a deterministic build.
Python¶
from collections import Counter
def adaptive_roundtrip(text):
# Encoder: build code from counts seen SO FAR (start uniform over alphabet).
alphabet = sorted(set(text))
counts = Counter({c: 1 for c in alphabet}) # +1 smoothing so all codes exist
out_syms = []
for ch in text:
table = codes(build(counts)) # Task 2 build/codes
out_syms.append((ch, table[ch]))
counts[ch] += 1
# Decoder mirrors: rebuild from identical evolving counts.
counts2 = Counter({c: 1 for c in alphabet})
decoded = []
for ch, bits in out_syms:
table = codes(build(counts2))
rev = {v: k for k, v in table.items()}
decoded.append(rev[bits])
counts2[rev[bits]] += 1
return "".join(decoded)
print(adaptive_roundtrip("abracadabra") == "abracadabra") # True
Go / Java¶
// Maintain a counts map starting at 1 per known symbol. Before each symbol, rebuild the
// Huffman code deterministically (Task 2), emit the codeword, then increment the count.
// Decoder rebuilds from the identical evolving counts and reverses the codeword.
Task 13 — k-ary (ternary) Huffman¶
Problem. Build a ternary (k = 3) Huffman code: merge the 3 smallest each step, padding with zero-weight dummies until (n − 1) mod (k − 1) == 0.
Constraints. 1 ≤ n ≤ 10⁴, k = 3.
Hint. Pad count: pad = (k - 1 - (n - 1) % (k - 1)) % (k - 1). Codewords are over {0,1,2}.
Python¶
import heapq
from collections import Counter
from itertools import count
def kary(freq, k=3):
tie = count()
items = [[freq[s], next(tie), s, []] for s in sorted(freq)]
n = len(items)
pad = (k - 1 - (n - 1) % (k - 1)) % (k - 1) if n > 1 else 0
for _ in range(pad):
items.append([0, next(tie), None, []]) # dummy leaf
heapq.heapify(items)
while len(items) > 1:
kids = [heapq.heappop(items) for _ in range(k)]
heapq.heappush(items, [sum(x[0] for x in kids), next(tie), None, kids])
root = items[0]
out = {}
def walk(node, path):
f, o, s, kids = node
if s is not None:
out[s] = path or "0"
elif not kids:
return
else:
for d, child in enumerate(kids):
walk(child, path + str(d))
walk(root, "")
return out
table = kary(Counter("abracadabra"), 3)
for s in sorted(table):
print(s, "->", table[s])
assert all(set(c) <= set("012") for c in table.values())
Go / Java¶
// Pad with zero-weight dummies until (n-1) % (k-1) == 0. Heap by (freq, order). Each step
// pop k smallest, merge, push. Codewords use digits 0..k-1. Dummies get no real symbol.
Task 14 — Build a fast decode lookup table¶
Problem. Build a single-level 2^MAX_BITS lookup table mapping the next bits to (symbol, length), and decode using it (advance the cursor by length).
Constraints. MAX_BITS = max code length ≤ 16.
Hint. For a canonical code of length L and value v, every table index whose top L bits equal v maps to that symbol.
Python¶
from collections import Counter
def build_decode_table(L):
codes_map = canonical(L) # Task 6
max_bits = max(L.values())
table = [None] * (1 << max_bits)
for sym, code in codes_map.items():
l = len(code)
v = int(code, 2)
# fill all indices whose top l bits == v
low = v << (max_bits - l)
high = low + (1 << (max_bits - l))
for idx in range(low, high):
table[idx] = (sym, l)
return table, max_bits
def decode_with_table(L, bits):
table, max_bits = build_decode_table(L)
out, i = [], 0
bits = bits + "0" * max_bits # pad so the last read is safe
while i < len(bits) - max_bits:
window = int(bits[i:i + max_bits], 2)
sym, l = table[window]
out.append(sym)
i += l
return "".join(out)
text = "abracadabra"
L = lengths(Counter(text)) # Task 6
table = canonical(L)
bits = "".join(table[c] for c in text)
print(decode_with_table(L, bits) == text) # True
Go / Java¶
// Build a 1<<maxBits table; for each canonical (sym, len, value), fill every index whose
// top `len` bits equal `value` with (sym, len). Decode: read maxBits, index, emit symbol,
// advance cursor by the stored length. This is the zlib/JPEG fast-decode trick.
Task 15 — Compress a real file and report the ratio¶
Problem. Read a file's bytes, build a Huffman code over the 256 byte values, encode, pack into bytes (with a header of the code lengths + bit count), and report the compression ratio. Decode and verify byte-for-byte.
Constraints. Any file up to a few MB. Use a 256-entry frequency array.
Hint. Header = 256 lengths (1 byte each is fine for ≤255-bit, but use ≤32) + the bit count; payload = packed bits. Ratio = compressed_size / original_size.
Python¶
from collections import Counter
def compress(data: bytes):
if not data:
return b"", {}, 0
freq = Counter(data)
L = lengths({bytes([b]): c for b, c in freq.items()}) # adapt keys as needed
# Simpler: operate on ints directly
freq_i = Counter(data)
Li = lengths(freq_i) # Task 6 works on hashable keys
table = canonical(Li)
bits = "".join(table[b] for b in data)
return bits, Li, len(bits)
def decompress(bits, Li, n):
# decode_canonical from Task 9 (keys are ints)
code, prev, dec = 0, 0, {}
for s in sorted(Li, key=lambda x: (Li[x], x)):
l = Li[s]; code <<= (l - prev); dec[(l, code)] = s; code += 1; prev = l
out, acc, length = bytearray(), 0, 0
for b in bits[:n]:
acc = (acc << 1) | (1 if b == "1" else 0); length += 1
if (length, acc) in dec:
out.append(dec[(length, acc)]); acc, length = 0, 0
return bytes(out)
data = b"abracadabra" * 1000
bits, Li, n = compress(data)
restored = decompress(bits, Li, n)
print("match:", restored == data, "ratio:", (n / 8) / len(data))
Go / Java¶
// Use a 256-int frequency array. Build canonical lengths (Task 6/11). Encode to a bit buffer,
// pack to bytes with a header (lengths + bit count). Decode with a lookup table (Task 14) and
// assert byte-equality. Report (compressed_bytes / original_bytes).
Benchmark Task — Tree-walk vs Table Decode¶
Problem. Decode a large encoded stream two ways: bit-by-bit tree walk and 2^MAX_BITS lookup table. Time both; report the speedup.
Expected findings. - The tree walk does one branch per bit — branch-heavy and cache-unfriendly. - The table decode does one memory read per symbol and advances by the code length — typically an order of magnitude faster on real data. - The table costs O(2^MAX_BITS) memory and a one-time build; it pays off across a long stream.
Python¶
import time
from collections import Counter
def bench(text):
root = build(Counter(text)); table = codes(root) # Task 2
bits = "".join(table[c] for c in text)
Li = lengths(Counter(text)) # Task 6
t0 = time.perf_counter()
out, node = [], root
for b in bits:
node = node[3] if b == "0" else node[4]
if node[2] is not None: out.append(node[2]); node = root
t_walk = time.perf_counter() - t0
t0 = time.perf_counter()
r = decode_with_table(Li, bits) # Task 14
t_table = time.perf_counter() - t0
print(f"walk={t_walk*1000:.1f}ms table={t_table*1000:.1f}ms speedup={t_walk/t_table:.1f}x")
bench("abracadabra" * 50000)
Go / Java¶
// Encode a large string. Time (a) tree-walk decode and (b) table decode (Task 14).
// Go: time.Now()/time.Since. Java: System.nanoTime(). Report the speedup; expect the
// table decode to win by a large factor on long streams.
Evaluation criteria. - Correctness: both decoders reproduce the input exactly. - Performance: report wall-clock for each; explain why the table wins. - Discussion: when is the table not worth it? (Tiny streams where the build cost dominates, or huge MAX_BITS forcing multi-level tables.)