Design IP/CIDR rule matcher
Company: Databricks
Role: Software Engineer
Category: Coding & Algorithms
Difficulty: Medium
Interview Round: Technical Screen
Quick Answer: This question evaluates understanding of IPv4 addressing, CIDR and numeric range representations, plus competency in designing efficient data structures and algorithms for low-latency rule matching with dynamic updates.
Part 1: IPv4 Rule Matcher with Add, Remove, and Query
Constraints
- 0 <= len(operations) <= 100000
- All IPv4 addresses are valid dotted-decimal IPv4 strings
- CIDR prefix lengths are between 0 and 32 inclusive
- Actions are either 'accept' or 'deny'
- For range rules, start and end fit in the IPv4 space; treat the rule as inclusive
Examples
Input: [('add_cidr', '10.0.0.0/8', 'accept'), ('add_cidr', '10.1.0.0/16', 'deny'), ('query', '10.1.2.3'), ('query', '10.2.3.4'), ('query', '11.0.0.1')]
Expected Output: ['deny', 'accept', 'none']
Explanation: The /16 rule is more specific than the /8 rule for 10.1.2.3. The address 10.2.3.4 matches only the /8. The address 11.0.0.1 matches nothing.
Input: [('add_cidr', '192.168.1.1/32', 'deny'), ('add_range', '192.168.1.1', '192.168.1.1', 'accept'), ('query', '192.168.1.1')]
Expected Output: ['accept']
Explanation: Both rules cover exactly one IP, so they have equal specificity. The newer rule wins.
Input: [('add_range', '1.1.1.0', '1.1.1.255', 'deny'), ('add_range', '1.1.1.0', '1.1.1.255', 'accept'), ('query', '1.1.1.1'), ('remove_range', '1.1.1.0', '1.1.1.255', 'accept'), ('query', '1.1.1.1'), ('remove_range', '1.1.1.0', '1.1.1.255', 'deny'), ('query', '1.1.1.1')]
Expected Output: ['accept', 'deny', 'none']
Explanation: Two identical ranges are active, so newest wins first. After removing the newer one, the older deny remains. After removing that too, no rule matches.
Input: [('remove_cidr', '0.0.0.0/0', 'deny'), ('query', '8.8.8.8'), ('add_cidr', '0.0.0.0/0', 'deny'), ('add_cidr', '255.255.255.255/32', 'accept'), ('query', '255.255.255.255'), ('query', '0.0.0.0')]
Expected Output: ['none', 'accept', 'deny']
Explanation: Removing a non-existent rule does nothing. The /32 for 255.255.255.255 is more specific than /0, while 0.0.0.0 matches only the /0 rule.
Solution
def solution(operations):
from bisect import bisect_left, bisect_right
from collections import defaultdict
import heapq
def ip_to_int(ip):
a, b, c, d = map(int, ip.split('.'))
return ((a << 24) | (b << 16) | (c << 8) | d)
def parse_cidr(cidr):
ip, prefix = cidr.split('/')
prefix = int(prefix)
base = ip_to_int(ip)
if prefix == 0:
return 0, 0xFFFFFFFF, prefix
mask = ((0xFFFFFFFF << (32 - prefix)) & 0xFFFFFFFF)
start = (base & mask)
size = (1 << (32 - prefix))
end = start + size - 1
return start, end, prefix
query_values = [ip_to_int(op[1]) for op in operations if op[0] == 'query']
coords = sorted(set(query_values))
index_of = {value: i for i, value in enumerate(coords)}
m = len(coords)
tree = [[] for _ in range(4 * m if m else 1)]
active = {}
rule_stacks = defaultdict(list)
timestamp = 0
next_id = 1
def clean(node):
heap = tree[node]
while heap and not active.get(heap[0][2], False):
heapq.heappop(heap)
def update(node, left, right, ql, qr, item):
if ql > right or qr < left or ql > qr:
return
if ql <= left and right <= qr:
heapq.heappush(tree[node], item)
return
mid = (left + right) // 2
if ql <= mid:
update(node * 2, left, mid, ql, qr, item)
if qr > mid:
update(node * 2 + 1, mid + 1, right, ql, qr, item)
def query_best(node, left, right, idx):
clean(node)
best_here = tree[node][0] if tree[node] else None
if left == right:
return best_here
mid = (left + right) // 2
if idx <= mid:
best_child = query_best(node * 2, left, mid, idx)
else:
best_child = query_best(node * 2 + 1, mid + 1, right, idx)
if best_here is None:
return best_child
if best_child is None:
return best_here
return best_here if best_here < best_child else best_child
answers = []
for op in operations:
kind = op[0]
if kind == 'add_range':
_, start_ip, end_ip, action = op
start = ip_to_int(start_ip)
end = ip_to_int(end_ip)
if start > end:
start, end = end, start
timestamp += 1
rid = next_id
next_id += 1
active[rid] = True
length = end - start + 1
key = ('range', start, end, action)
rule_stacks[key].append(rid)
if m:
l = bisect_left(coords, start)
r = bisect_right(coords, end) - 1
if l <= r:
update(1, 0, m - 1, l, r, (length, -timestamp, rid, action))
elif kind == 'add_cidr':
_, cidr, action = op
start, end, prefix = parse_cidr(cidr)
timestamp += 1
rid = next_id
next_id += 1
active[rid] = True
length = end - start + 1
key = ('cidr', start, end, prefix, action)
rule_stacks[key].append(rid)
if m:
l = bisect_left(coords, start)
r = bisect_right(coords, end) - 1
if l <= r:
update(1, 0, m - 1, l, r, (length, -timestamp, rid, action))
elif kind == 'remove_range':
_, start_ip, end_ip, action = op
start = ip_to_int(start_ip)
end = ip_to_int(end_ip)
if start > end:
start, end = end, start
key = ('range', start, end, action)
stack = rule_stacks[key]
while stack and not active.get(stack[-1], False):
stack.pop()
if stack:
active[stack.pop()] = False
elif kind == 'remove_cidr':
_, cidr, action = op
start, end, prefix = parse_cidr(cidr)
key = ('cidr', start, end, prefix, action)
stack = rule_stacks[key]
while stack and not active.get(stack[-1], False):
stack.pop()
if stack:
active[stack.pop()] = False
elif kind == 'query':
_, ip = op
if not m:
answers.append('none')
continue
value = ip_to_int(ip)
idx = index_of[value]
best = query_best(1, 0, m - 1, idx)
answers.append(best[3] if best else 'none')
return answers
Time complexity: Preprocessing the queried IPs takes O(Q log Q), where Q is the number of query operations. Each add operation costs O(log^2 Q) in the worst case because it touches O(log Q) segment-tree nodes and pushes into heaps. Each remove is O(1) amortized plus lazy cleanup later. Each query is O(log Q) path traversal plus amortized heap cleanup.. Space complexity: O(Q + R log Q), where Q is the number of distinct queried IPs and R is the number of added rules that cover at least one queried IP..
Hints
- Convert every IPv4 address to a 32-bit integer before comparing ranges. Use explicit parentheses when shifting and masking.
- Because the full operation list is known in advance, compress only the IPs that are actually queried, then support range updates and point queries on those coordinates.
Part 2: Choose the Best Rule-Index Data Structure for Firewall Workloads
Constraints
- 1 <= len(scenarios) <= 100000
- All count fields are integers between 0 and 10^9
- coordinate_count is an integer between 0 and 10^9
- Use L(x) = 0 if x <= 1, otherwise ceil(log2(x))
- If multiple strategies have the same cost, use priority: sorted_disjoint_intervals < ordered_map < interval_tree < segment_tree < radix_trie < hybrid(...)
Examples
Input: [{'prefix_rules': 0, 'range_rules': 8, 'prefix_updates': 0, 'range_updates': 0, 'queries': 100, 'disjoint_ranges': True, 'coordinate_count': 0}]
Expected Output: ['sorted_disjoint_intervals']
Explanation: This is a pure static disjoint-range workload, which is exactly what sorted disjoint intervals are best at under the given model.
Input: [{'prefix_rules': 10, 'range_rules': 0, 'prefix_updates': 5, 'range_updates': 0, 'queries': 20, 'disjoint_ranges': False, 'coordinate_count': 0}]
Expected Output: ['radix_trie']
Explanation: This is a pure prefix workload, and only the radix trie directly targets prefixes without converting them to ranges.
Input: [{'prefix_rules': 50, 'range_rules': 10, 'prefix_updates': 10, 'range_updates': 5, 'queries': 100, 'disjoint_ranges': True, 'coordinate_count': 50000}]
Expected Output: ['hybrid(ordered_map+radix_trie)']
Explanation: For the mixed workload, the trie handles prefixes and the ordered map is the cheapest valid range structure, so the hybrid beats a full segment tree.
Input: [{'prefix_rules': 5, 'range_rules': 50, 'prefix_updates': 1, 'range_updates': 20, 'queries': 30, 'disjoint_ranges': False, 'coordinate_count': 64}]
Expected Output: ['segment_tree']
Explanation: The coordinate count is small enough that a segment tree over compressed coordinates becomes cheaper than the hybrid option.
Input: [{'prefix_rules': 0, 'range_rules': 0, 'prefix_updates': 0, 'range_updates': 0, 'queries': 0, 'disjoint_ranges': True, 'coordinate_count': 0}]
Expected Output: ['sorted_disjoint_intervals']
Explanation: All valid strategies have zero cost on an empty workload, so the required tie-break picks sorted_disjoint_intervals.
Solution
def solution(scenarios):
def clog2(x):
return 0 if x <= 1 else (x - 1).bit_length()
overall_priority = {
'sorted_disjoint_intervals': 0,
'ordered_map': 1,
'interval_tree': 2,
'segment_tree': 3,
'radix_trie': 4,
'hybrid': 5,
}
range_priority = {
'sorted_disjoint_intervals': 0,
'ordered_map': 1,
'interval_tree': 2,
'segment_tree': 3,
}
def best_range_strategy(range_rules, range_updates, queries, disjoint_ranges, coordinate_count):
candidates = []
lr = clog2(range_rules + range_updates)
l0 = clog2(range_rules)
if disjoint_ranges and range_updates == 0:
cost = range_rules * l0 + queries * l0
candidates.append(('sorted_disjoint_intervals', cost))
if disjoint_ranges:
cost = 2 * range_rules * lr + 3 * range_updates * lr + queries * lr
candidates.append(('ordered_map', cost))
cost = 2 * range_rules * lr + 2 * range_updates * lr + queries * (lr + 1)
candidates.append(('interval_tree', cost))
if 0 < coordinate_count <= 200000:
lc = clog2(coordinate_count)
cost = coordinate_count + 2 * (range_rules + range_updates) * lc + queries * lc
candidates.append(('segment_tree', cost))
best = min(candidates, key=lambda item: (item[1], range_priority[item[0]], item[0]))
return best, {name: cost for name, cost in candidates}
def overall_key(name):
if name.startswith('hybrid('):
return overall_priority['hybrid']
return overall_priority[name]
answers = []
for sc in scenarios:
prefix_rules = sc['prefix_rules']
range_rules = sc['range_rules']
prefix_updates = sc['prefix_updates']
range_updates = sc['range_updates']
queries = sc['queries']
disjoint_ranges = sc['disjoint_ranges']
coordinate_count = sc['coordinate_count']
has_prefix_work = (prefix_rules > 0 or prefix_updates > 0)
has_range_work = (range_rules > 0 or range_updates > 0)
candidates = []
if not has_prefix_work and not has_range_work:
best_range, all_range = best_range_strategy(0, 0, queries, True, coordinate_count)
for name, cost in all_range.items():
candidates.append((name, cost))
candidates.append(('radix_trie', 32 * queries))
elif has_range_work and not has_prefix_work:
_, all_range = best_range_strategy(range_rules, range_updates, queries, disjoint_ranges, coordinate_count)
for name, cost in all_range.items():
candidates.append((name, cost))
elif has_prefix_work and not has_range_work:
candidates.append(('radix_trie', 32 * (prefix_rules + prefix_updates + queries)))
if 0 < coordinate_count <= 200000:
lc = clog2(coordinate_count)
cost = coordinate_count + 2 * (prefix_rules + prefix_updates) * lc + queries * lc
candidates.append(('segment_tree', cost))
else:
if 0 < coordinate_count <= 200000:
lc = clog2(coordinate_count)
cost = coordinate_count + 2 * (prefix_rules + range_rules + prefix_updates + range_updates) * lc + queries * lc
candidates.append(('segment_tree', cost))
best_range, _ = best_range_strategy(range_rules, range_updates, queries, disjoint_ranges, coordinate_count)
range_name, range_cost = best_range
trie_cost = 32 * (prefix_rules + prefix_updates + queries)
hybrid_name = 'hybrid(' + range_name + '+radix_trie)'
candidates.append((hybrid_name, trie_cost + range_cost))
best_name, _ = min(candidates, key=lambda item: (item[1], overall_key(item[0]), item[0]))
answers.append(best_name)
return answers
Time complexity: O(S), where S is the number of scenarios. Each scenario evaluates only a constant number of candidate strategies.. Space complexity: O(1) auxiliary space per scenario, excluding the output list..
Hints
- First classify each scenario as pure range, pure prefix, mixed, or empty. Different strategy sets are valid in different categories.
- Avoid floating-point logs. In Python, ceil(log2(x)) for x > 1 can be computed as (x - 1).bit_length().