Implement KV store serialization
Company: OpenAI
Role: Software Engineer
Category: Coding & Algorithms
Difficulty: Medium
Interview Round: Technical Screen
Quick Answer: This question evaluates skills in binary serialization and deserialization, data encoding and layout, type tagging, endianness, checksumming, versioning for forward/backward compatibility, and algorithmic efficiency and space usage for an in-memory key-value store; it falls under the Coding & Algorithms domain.
Constraints
- All keys are strings, all strings are UTF-8, and all integers fit in signed 64-bit range.
- Serialize entries in the dict's existing iteration order; do not sort keys.
- The total serialized size is at most 10^6 bytes, and nesting depth is at most 100.
- Within a map, keys are unique.
Examples
Input: ('encode', {})
Expected Output: b'KVSB\x01\x04\x00\x00\x00\x00\x00\x00\x00;\x01\x00\x00'
Explanation: An empty map body is just a 4-byte entry count of 0. The checksum is the sum of the header and body bytes modulo 2^32.
Input: ('decode', b'KVSB\x01\x04\x00\x00\x00\x00\x00\x00\x00;\x01\x00\x00')
Expected Output: {}
Explanation: This is the valid encoding of an empty root map, so decoding returns an empty dict.
Input: ('encode', {'pi': 1.5})
Expected Output: b'KVSB\x01\x17\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00pi\x02\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf8?\x6b\x03\x00\x00'
Explanation: The map has one entry with key 'pi', type tag 2 for float64, and the IEEE-754 little-endian bytes for 1.5.
Input: ('decode', b'KVSB\x01\x24\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00m\x05\x16\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00a\x01\x08\x00\x00\x00\xfe\xff\xff\xff\xff\xff\xff\xff\x48\x0a\x00\x00')
Expected Output: {'m': {'a': -2}}
Explanation: The outer map contains one nested map under key 'm'. The nested map contains key 'a' with the int64 value -2.
Input: ('decode', b'KVSB\x01\x1b\x00\x00\x00\x02\x00\x00\x00\x01\x00\x00\x00t\x03\x01\x00\x00\x00\x01\x01\x00\x00\x00u\x04\x02\x00\x00\x00\xc3\xa9\xb6\x03\x00\x00')
Expected Output: {'t': True, 'u': 'é'}
Explanation: This buffer contains a bool entry and a UTF-8 string entry. The decoder must interpret tag 3 as bool and tag 4 as UTF-8 text.
Solution
def solution(operation, data):
import struct
MAGIC = b'KVSB'
VERSION = 1
def checksum(data_bytes):
return sum(data_bytes) & 0xFFFFFFFF
def require(condition, message):
if not condition:
raise ValueError(message)
def encode_map(obj):
if not isinstance(obj, dict):
raise TypeError('store must be a dict')
parts = [struct.pack('<I', len(obj))]
for key, value in obj.items():
if not isinstance(key, str):
raise TypeError('keys must be strings')
try:
key_bytes = key.encode('utf-8')
except UnicodeEncodeError:
raise ValueError('invalid UTF-8 key')
if type(value) is bool:
tag = 3
value_bytes = b'\x01' if value else b'\x00'
elif isinstance(value, int):
if not (-2**63 <= value <= 2**63 - 1):
raise ValueError('int out of int64 range')
tag = 1
value_bytes = struct.pack('<q', value)
elif isinstance(value, float):
tag = 2
value_bytes = struct.pack('<d', value)
elif isinstance(value, str):
tag = 4
try:
value_bytes = value.encode('utf-8')
except UnicodeEncodeError:
raise ValueError('invalid UTF-8 string')
elif isinstance(value, dict):
tag = 5
value_bytes = encode_map(value)
else:
raise TypeError('unsupported value type')
parts.append(struct.pack('<I', len(key_bytes)))
parts.append(key_bytes)
parts.append(bytes([tag]))
parts.append(struct.pack('<I', len(value_bytes)))
parts.append(value_bytes)
return b''.join(parts)
def decode_map(buf, start, end):
require(end - start >= 4, 'truncated map payload')
pos = start
entry_count = struct.unpack_from('<I', buf, pos)[0]
pos += 4
result = {}
for _ in range(entry_count):
require(end - pos >= 4, 'truncated key length')
key_len = struct.unpack_from('<I', buf, pos)[0]
pos += 4
require(key_len <= end - pos, 'invalid key length')
key_bytes = buf[pos:pos + key_len]
pos += key_len
try:
key = key_bytes.decode('utf-8')
except UnicodeDecodeError:
raise ValueError('invalid UTF-8 key')
require(end - pos >= 1, 'truncated type tag')
tag = buf[pos]
pos += 1
require(end - pos >= 4, 'truncated value length')
value_len = struct.unpack_from('<I', buf, pos)[0]
pos += 4
require(value_len <= end - pos, 'invalid value length')
value_bytes = buf[pos:pos + value_len]
pos += value_len
if tag == 1:
require(value_len == 8, 'invalid int64 payload length')
value = struct.unpack('<q', value_bytes)[0]
elif tag == 2:
require(value_len == 8, 'invalid float64 payload length')
value = struct.unpack('<d', value_bytes)[0]
elif tag == 3:
require(value_len == 1, 'invalid bool payload length')
require(value_bytes[0] in (0, 1), 'invalid bool payload')
value = (value_bytes[0] == 1)
elif tag == 4:
try:
value = value_bytes.decode('utf-8')
except UnicodeDecodeError:
raise ValueError('invalid UTF-8 string')
elif tag == 5:
value, nested_pos = decode_map(value_bytes, 0, len(value_bytes))
require(nested_pos == len(value_bytes), 'invalid nested map payload')
else:
raise ValueError('unknown type tag')
result[key] = value
return result, pos
if operation == 'encode':
body = encode_map(data)
header = MAGIC + bytes([VERSION]) + struct.pack('<I', len(body))
prefix = header + body
return prefix + struct.pack('<I', checksum(prefix))
if operation == 'decode':
if not isinstance(data, (bytes, bytearray)):
raise TypeError('buffer must be bytes or bytearray')
buf = bytes(data)
require(len(buf) >= 13, 'buffer too short')
require(buf[:4] == MAGIC, 'invalid magic')
version = buf[4]
require(version == VERSION, 'unsupported version')
body_len = struct.unpack_from('<I', buf, 5)[0]
total_len = 9 + body_len + 4
require(len(buf) == total_len, 'invalid total length')
expected_checksum = struct.unpack_from('<I', buf, 9 + body_len)[0]
actual_checksum = checksum(buf[:9 + body_len])
require(actual_checksum == expected_checksum, 'checksum mismatch')
body_start = 9
body_end = 9 + body_len
store, pos = decode_map(buf, body_start, body_end)
require(pos == body_end, 'body length mismatch')
return store
raise ValueError("operation must be 'encode' or 'decode'")Time complexity: O(B), where B is the total number of bytes written or read. Space complexity: O(B + D), where D is the nesting depth.
Hints
- Give every value a type tag and a length prefix. That makes recursive decoding simpler and leaves room for future type tags.
- While decoding, keep one index into the buffer and advance it carefully. For nested maps, decode only within the subrange described by that value's length.