Queue -- Optimization Exercises¶
Each exercise starts with a correct but slow implementation. Your goal is to optimize it for better time complexity, space complexity, or practical performance.
Table of Contents¶
- Exercise 1: Queue Dequeue -- O(n) to O(1)
- Exercise 2: Sliding Window Maximum -- O(n*k) to O(n)
- Exercise 3: BFS Shortest Path -- Reduce Queue Size
- Exercise 4: Moving Average -- O(n) to O(1) per Query
- Exercise 5: Task Scheduler -- O(n*m) to O(n)
- Exercise 6: First Non-Repeating -- O(n^2) to O(n)
- Exercise 7: Priority Queue -- O(n) Extract to O(log n)
- Exercise 8: Queue Memory Leak -- Fix Unbounded Growth
- Exercise 9: Multi-Queue Merge -- O(nk) to O(nlog k)
- Exercise 10: Rate Limiter -- O(n) Cleanup to O(1) Amortized
- Exercise 11: BFS on Matrix -- Reduce Allocations
- Exercise 12: Circular Buffer -- Branching to Branchless
Exercise 1: Queue Dequeue -- O(n) to O(1)¶
Slow Version¶
// O(n) dequeue: shifting all elements left
type SlowQueue struct {
data []int
}
func (q *SlowQueue) Enqueue(val int) {
q.data = append(q.data, val)
}
func (q *SlowQueue) Dequeue() int {
val := q.data[0]
// O(n): copy all elements one position left
copy(q.data, q.data[1:])
q.data = q.data[:len(q.data)-1]
return val
}
Optimized Version¶
// O(1) amortized dequeue using a circular buffer
type FastQueue struct {
data []int
front, size int
capacity int
}
func NewFastQueue(cap int) *FastQueue {
return &FastQueue{data: make([]int, cap), capacity: cap}
}
func (q *FastQueue) Enqueue(val int) {
if q.size == q.capacity {
q.resize(q.capacity * 2)
}
rear := (q.front + q.size) % q.capacity
q.data[rear] = val
q.size++
}
func (q *FastQueue) Dequeue() int {
val := q.data[q.front]
q.front = (q.front + 1) % q.capacity
q.size--
return val
}
func (q *FastQueue) resize(newCap int) {
newData := make([]int, newCap)
for i := 0; i < q.size; i++ {
newData[i] = q.data[(q.front+i)%q.capacity]
}
q.data = newData
q.front = 0
q.capacity = newCap
}
Why it is faster: The circular buffer avoids shifting elements. Dequeue simply advances the front pointer with modulo arithmetic. No memory copies needed.
Exercise 2: Sliding Window Maximum -- O(n*k) to O(n)¶
Slow Version¶
// O(n*k): scan each window for maximum
public static int[] maxSlidingWindow(int[] nums, int k) {
int[] result = new int[nums.length - k + 1];
for (int i = 0; i <= nums.length - k; i++) {
int max = nums[i];
for (int j = i + 1; j < i + k; j++) {
max = Math.max(max, nums[j]);
}
result[i] = max;
}
return result;
}
Optimized Version¶
// O(n): monotonic deque -- each element is added/removed at most once
public static int[] maxSlidingWindow(int[] nums, int k) {
Deque<Integer> dq = new ArrayDeque<>(); // stores indices
int[] result = new int[nums.length - k + 1];
int idx = 0;
for (int i = 0; i < nums.length; i++) {
if (!dq.isEmpty() && dq.peekFirst() <= i - k) {
dq.pollFirst();
}
while (!dq.isEmpty() && nums[dq.peekLast()] <= nums[i]) {
dq.pollLast();
}
dq.offerLast(i);
if (i >= k - 1) {
result[idx++] = nums[dq.peekFirst()];
}
}
return result;
}
Why it is faster: Each element enters and leaves the deque at most once, giving O(n) total instead of O(n*k). The deque maintains a decreasing order so the front is always the current maximum.
Exercise 3: BFS Shortest Path -- Reduce Queue Size¶
Slow Version¶
from collections import deque
def shortest_path(grid, start, end):
"""BFS on grid -- marks visited at dequeue time."""
rows, cols = len(grid), len(grid[0])
queue = deque([(start[0], start[1], 0)]) # (row, col, dist)
visited = set()
while queue:
r, c, dist = queue.popleft()
if (r, c) in visited:
continue
visited.add((r, c))
if (r, c) == end:
return dist
for dr, dc in [(0,1),(0,-1),(1,0),(-1,0)]:
nr, nc = r+dr, c+dc
if 0 <= nr < rows and 0 <= nc < cols and grid[nr][nc] == 0:
queue.append((nr, nc, dist+1))
return -1
Optimized Version¶
from collections import deque
def shortest_path(grid, start, end):
"""BFS on grid -- marks visited at enqueue time."""
rows, cols = len(grid), len(grid[0])
queue = deque([(start[0], start[1], 0)])
visited = {(start[0], start[1])} # mark visited immediately
while queue:
r, c, dist = queue.popleft()
if (r, c) == end:
return dist
for dr, dc in [(0,1),(0,-1),(1,0),(-1,0)]:
nr, nc = r+dr, c+dc
if (0 <= nr < rows and 0 <= nc < cols
and grid[nr][nc] == 0 and (nr, nc) not in visited):
visited.add((nr, nc)) # mark before enqueue
queue.append((nr, nc, dist+1))
return -1
Why it is faster: Marking visited at enqueue time prevents the same cell from being enqueued by multiple neighbors. The queue stays at most O(perimeter) instead of O(edges), reducing both memory and time.
Exercise 4: Moving Average -- O(n) to O(1) per Query¶
Slow Version¶
// O(n) per query: recompute sum from scratch
type MovingAverage struct {
window []float64
size int
}
func (ma *MovingAverage) Next(val float64) float64 {
ma.window = append(ma.window, val)
if len(ma.window) > ma.size {
ma.window = ma.window[1:]
}
sum := 0.0
for _, v := range ma.window {
sum += v
}
return sum / float64(len(ma.window))
}
Optimized Version¶
// O(1) per query: maintain running sum with circular buffer
type MovingAverage struct {
data []float64
size int
count int
idx int
sum float64
}
func NewMovingAverage(size int) *MovingAverage {
return &MovingAverage{data: make([]float64, size), size: size}
}
func (ma *MovingAverage) Next(val float64) float64 {
if ma.count >= ma.size {
ma.sum -= ma.data[ma.idx] // subtract outgoing value
}
ma.data[ma.idx] = val
ma.sum += val
ma.idx = (ma.idx + 1) % ma.size
if ma.count < ma.size {
ma.count++
}
return ma.sum / float64(ma.count)
}
Why it is faster: Instead of summing the entire window each time, we maintain a running sum. When a new value enters, we subtract the oldest value (about to be overwritten in the circular buffer) and add the new value. O(1) per query.
Exercise 5: Task Scheduler -- O(n*m) to O(n)¶
Slow Version¶
// O(n * m): simulate each time slot
public int leastInterval(char[] tasks, int n) {
Map<Character, Integer> freq = new HashMap<>();
for (char t : tasks) freq.merge(t, 1, Integer::sum);
int time = 0;
while (!freq.isEmpty()) {
List<Map.Entry<Character, Integer>> sorted = new ArrayList<>(freq.entrySet());
sorted.sort((a, b) -> b.getValue() - a.getValue());
int cycle = n + 1;
int executed = 0;
for (var entry : sorted) {
if (cycle <= 0) break;
entry.setValue(entry.getValue() - 1);
if (entry.getValue() == 0) freq.remove(entry.getKey());
executed++;
cycle--;
}
time += freq.isEmpty() ? executed : n + 1;
}
return time;
}
Optimized Version¶
// O(n): math formula based on max frequency
public int leastInterval(char[] tasks, int n) {
int[] freq = new int[26];
for (char t : tasks) freq[t - 'A']++;
int maxFreq = 0, maxCount = 0;
for (int f : freq) {
if (f > maxFreq) { maxFreq = f; maxCount = 1; }
else if (f == maxFreq) maxCount++;
}
int intervals = (maxFreq - 1) * (n + 1) + maxCount;
return Math.max(intervals, tasks.length);
}
Why it is faster: Instead of simulating each time slot, we use a formula. The minimum time is determined by the most frequent task and the cooldown period. One pass to count frequencies, then O(1) computation.
Exercise 6: First Non-Repeating -- O(n^2) to O(n)¶
Slow Version¶
# O(n^2): for each position, scan all previous characters
def first_non_repeating(stream):
result = []
for i in range(len(stream)):
found = '#'
for j in range(i + 1):
count = stream[:i+1].count(stream[j])
if count == 1:
found = stream[j]
break
result.append(found)
return result
Optimized Version¶
from collections import deque
def first_non_repeating(stream):
freq = {}
queue = deque() # candidates for non-repeating
result = []
for ch in stream:
freq[ch] = freq.get(ch, 0) + 1
queue.append(ch)
# Remove characters that are now repeating
while queue and freq[queue[0]] > 1:
queue.popleft()
result.append(queue[0] if queue else '#')
return result
Why it is faster: The queue tracks non-repeating candidates. Each character enters and leaves the queue at most once, giving O(n) total. The frequency map provides O(1) lookups.
Exercise 7: Priority Queue -- O(n) Extract to O(log n)¶
Slow Version¶
// O(n) extract: linear scan for minimum
type SlowPQ struct {
data []int
}
func (pq *SlowPQ) Insert(val int) {
pq.data = append(pq.data, val) // O(1)
}
func (pq *SlowPQ) ExtractMin() int {
minIdx := 0
for i := 1; i < len(pq.data); i++ {
if pq.data[i] < pq.data[minIdx] {
minIdx = i
}
}
min := pq.data[minIdx]
pq.data[minIdx] = pq.data[len(pq.data)-1]
pq.data = pq.data[:len(pq.data)-1]
return min
}
Optimized Version¶
// O(log n) extract: binary heap
type MinHeap struct {
data []int
}
func (h *MinHeap) Insert(val int) {
h.data = append(h.data, val)
i := len(h.data) - 1
for i > 0 {
parent := (i - 1) / 2
if h.data[parent] <= h.data[i] { break }
h.data[parent], h.data[i] = h.data[i], h.data[parent]
i = parent
}
}
func (h *MinHeap) ExtractMin() int {
min := h.data[0]
h.data[0] = h.data[len(h.data)-1]
h.data = h.data[:len(h.data)-1]
i, n := 0, len(h.data)
for {
smallest, left, right := i, 2*i+1, 2*i+2
if left < n && h.data[left] < h.data[smallest] { smallest = left }
if right < n && h.data[right] < h.data[smallest] { smallest = right }
if smallest == i { break }
h.data[i], h.data[smallest] = h.data[smallest], h.data[i]
i = smallest
}
return min
}
Why it is faster: A binary heap maintains the min at the root. Insert sifts up in O(log n). Extract replaces the root with the last element and sifts down in O(log n). Both are much faster than O(n) linear scan.
Exercise 8: Queue Memory Leak -- Fix Unbounded Growth¶
Slow Version (Memory Leak)¶
// Memory leak: dequeued elements are never freed
public class LeakyQueue {
private Object[] data = new Object[16];
private int front = 0, rear = 0;
public void enqueue(Object val) {
if (rear == data.length) {
data = Arrays.copyOf(data, data.length * 2);
}
data[rear++] = val;
}
public Object dequeue() {
return data[front++]; // old reference stays in array!
}
}
Optimized Version¶
// Fixed: null out dequeued slots + use circular buffer
public class FixedQueue {
private Object[] data;
private int front = 0, size = 0, capacity;
public FixedQueue(int cap) {
this.capacity = cap;
this.data = new Object[cap];
}
public void enqueue(Object val) {
if (size == capacity) resize(capacity * 2);
data[(front + size) % capacity] = val;
size++;
}
public Object dequeue() {
Object val = data[front];
data[front] = null; // null out for GC
front = (front + 1) % capacity;
size--;
return val;
}
private void resize(int newCap) {
Object[] newData = new Object[newCap];
for (int i = 0; i < size; i++)
newData[i] = data[(front + i) % capacity];
data = newData;
front = 0;
capacity = newCap;
}
}
Why it is better: The leaky version never reuses slots at the front and never nulls out references. The array grows unboundedly even if only a few elements exist. The fixed version uses a circular buffer (constant memory for fixed throughput) and nulls out dequeued slots so the GC can reclaim objects.
Exercise 9: Multi-Queue Merge -- O(nk) to O(nlog k)¶
Slow Version¶
# O(n*k): find minimum across k queues each time
def merge_k_queues(queues):
result = []
while any(queues):
min_val = float('inf')
min_idx = -1
for i, q in enumerate(queues):
if q and q[0] < min_val:
min_val = q[0]
min_idx = i
result.append(queues[min_idx].popleft())
return result
Optimized Version¶
import heapq
from collections import deque
def merge_k_queues(queues):
# O(n * log k): use a min-heap to track the smallest front
heap = []
for i, q in enumerate(queues):
if q:
heapq.heappush(heap, (q[0], i))
result = []
while heap:
val, idx = heapq.heappop(heap)
queues[idx].popleft()
result.append(val)
if queues[idx]:
heapq.heappush(heap, (queues[idx][0], idx))
return result
Why it is faster: Instead of scanning all k queues to find the minimum (O(k) per extraction), we use a min-heap of size k. Each extraction and insertion takes O(log k). Total: O(n * log k) instead of O(n * k).
Exercise 10: Rate Limiter -- O(n) Cleanup to O(1) Amortized¶
Slow Version¶
// O(n) per check: stores all timestamps, cleans up linearly
type SlowLimiter struct {
timestamps []int64
window int64 // window in milliseconds
limit int
}
func (l *SlowLimiter) Allow(now int64) bool {
// Remove old timestamps -- O(n) scan
newTs := []int64{}
for _, ts := range l.timestamps {
if now-ts < l.window {
newTs = append(newTs, ts)
}
}
l.timestamps = newTs
if len(l.timestamps) < l.limit {
l.timestamps = append(l.timestamps, now)
return true
}
return false
}
Optimized Version¶
// O(1) amortized: use a circular buffer, timestamps are naturally ordered
type FastLimiter struct {
timestamps []int64
capacity int
idx int
count int
window int64
}
func NewFastLimiter(limit int, window int64) *FastLimiter {
return &FastLimiter{
timestamps: make([]int64, limit),
capacity: limit,
window: window,
}
}
func (l *FastLimiter) Allow(now int64) bool {
if l.count < l.capacity {
l.timestamps[l.idx] = now
l.idx = (l.idx + 1) % l.capacity
l.count++
return true
}
// Check oldest timestamp (it is at current idx in circular buffer)
oldest := l.timestamps[l.idx]
if now-oldest >= l.window {
l.timestamps[l.idx] = now
l.idx = (l.idx + 1) % l.capacity
return true
}
return false
}
Why it is faster: The slow version scans and rebuilds the timestamp list every call. The optimized version uses a circular buffer of size limit. Since timestamps are added in order, the oldest is always at the current write position. One comparison determines whether to allow -- O(1).
Exercise 11: BFS on Matrix -- Reduce Allocations¶
Slow Version¶
// High allocation: creates a new int[] for each cell in the queue
public int bfsDistance(int[][] grid, int[] start, int[] end) {
int rows = grid.length, cols = grid[0].length;
boolean[][] visited = new boolean[rows][cols];
Queue<int[]> queue = new LinkedList<>();
queue.offer(new int[]{start[0], start[1], 0}); // each enqueue allocates
visited[start[0]][start[1]] = true;
while (!queue.isEmpty()) {
int[] cell = queue.poll();
if (cell[0] == end[0] && cell[1] == end[1]) return cell[2];
for (int[] d : new int[][]{{0,1},{0,-1},{1,0},{-1,0}}) {
int nr = cell[0]+d[0], nc = cell[1]+d[1];
if (nr>=0 && nr<rows && nc>=0 && nc<cols && !visited[nr][nc] && grid[nr][nc]==0) {
visited[nr][nc] = true;
queue.offer(new int[]{nr, nc, cell[2]+1});
}
}
}
return -1;
}
Optimized Version¶
// Low allocation: use a distance matrix instead of storing distance in queue
public int bfsDistance(int[][] grid, int[] start, int[] end) {
int rows = grid.length, cols = grid[0].length;
int[][] dist = new int[rows][cols];
for (int[] row : dist) Arrays.fill(row, -1);
dist[start[0]][start[1]] = 0;
// Encode (r, c) as single int to avoid array allocation
ArrayDeque<Integer> queue = new ArrayDeque<>();
queue.offer(start[0] * cols + start[1]);
int[][] dirs = {{0,1},{0,-1},{1,0},{-1,0}};
while (!queue.isEmpty()) {
int encoded = queue.poll();
int r = encoded / cols, c = encoded % cols;
if (r == end[0] && c == end[1]) return dist[r][c];
for (int[] d : dirs) {
int nr = r+d[0], nc = c+d[1];
if (nr>=0 && nr<rows && nc>=0 && nc<cols && dist[nr][nc]==-1 && grid[nr][nc]==0) {
dist[nr][nc] = dist[r][c] + 1;
queue.offer(nr * cols + nc);
}
}
}
return -1;
}
Why it is faster: (1) Distances are stored in a matrix, not per-queue-entry. (2) Cell coordinates are encoded as a single integer (row * cols + col), eliminating per-cell array allocation. (3) ArrayDeque is used instead of LinkedList for better cache locality. These changes reduce GC pressure significantly.
Exercise 12: Circular Buffer -- Branching to Branchless¶
Slow Version¶
class CircularBuffer:
def __init__(self, capacity):
self._data = [0] * capacity
self._head = 0
self._cap = capacity
def advance(self):
# Branch: if-else for wraparound
if self._head == self._cap - 1:
self._head = 0
else:
self._head += 1
Optimized Version¶
class CircularBuffer:
def __init__(self, capacity):
self._data = [0] * capacity
self._head = 0
self._cap = capacity
# Ensure capacity is power of 2 for bitmask
assert capacity & (capacity - 1) == 0, "capacity must be power of 2"
self._mask = capacity - 1
def advance(self):
# Branchless: bitmask instead of modulo or if-else
self._head = (self._head + 1) & self._mask
Why it is faster: When the capacity is a power of 2, the modulo operation % capacity can be replaced with a bitwise AND & (capacity - 1). This is branchless (no if-else, no division) and executes in a single CPU cycle. For tight loops processing millions of items (audio buffers, network packets), this makes a measurable difference.
Note: The modulo version (head + 1) % cap also works and is branchless on modern CPUs, but the bitmask is still faster because integer division (which modulo uses internally) is more expensive than bitwise AND.