Handle multi-source string matching and tagging
Company: Harvey AI
Role: Software Engineer
Category: Coding & Algorithms
Difficulty: Medium
Interview Round: Technical Screen
##### Question
Given an LLM output string and a list of source strings, design an algorithm to count how many times each source appears in the output. Extend the solution to return the output string where every occurrence of a source string is wrapped in <tag></tag> while correctly handling overlapping matches. Modify the algorithm so that each tag also includes annotations of the indices of the sources that matched (e.g., <tag>text</tag>[1][3]).
Quick Answer: This question evaluates skills in string matching, pattern recognition, overlap handling, text annotation, and counting occurrences across multiple source strings.
Given a string output and a list of source strings sources, return two results: (1) counts: a list where counts[i] is the number of occurrences (allowing overlaps) of sources[i] in output; (2) tagged: a new string constructed by wrapping each maximal contiguous region of output that is covered by at least one match with <tag> and </tag>. After each such tag, append bracketed source indices in ascending order (e.g., [0][3]) indicating all distinct i for which at least one occurrence of sources[i] overlaps that region. Indices are 0-based. Overlapping or touching matches are merged into a single tagged region. Matching is exact and case-sensitive.
Constraints
- 1 <= len(output) <= 200000
- 1 <= len(sources) <= 5000
- 1 <= sum(len(s) for s in sources) <= 200000
- All sources[i] are non-empty; duplicates allowed
- Matching is exact and case-sensitive
- Indices in annotations are 0-based, unique per region, and sorted ascending
Solution
from typing import List, Tuple
from collections import deque
import heapq
def multi_source_tagging(output: str, sources: List[str]) -> Tuple[List[int], str]:
n = len(output)
m = len(sources)
# Build Aho-Corasick automaton
nexts: List[dict] = []
fail: List[int] = []
out: List[List[int]] = []
def new_node() -> int:
nexts.append({})
fail.append(0)
out.append([])
return len(nexts) - 1
new_node() # root = 0
lens = [len(p) for p in sources]
for idx, pat in enumerate(sources):
# Sources are guaranteed non-empty by constraints
node = 0
for ch in pat:
if ch not in nexts[node]:
nexts[node][ch] = new_node()
node = nexts[node][ch]
out[node].append(idx)
# Build failure links and output links
q = deque()
for ch, v in nexts[0].items():
fail[v] = 0
q.append(v)
while q:
u = q.popleft()
for ch, v in nexts[u].items():
f = fail[u]
while f and ch not in nexts[f]:
f = fail[f]
fail[v] = nexts[f][ch] if ch in nexts[f] else 0
# inherit outputs from failure link
if out[fail[v]]:
out[v].extend(out[fail[v]])
q.append(v)
# Traverse output string to collect matches and counts
counts = [0] * m
coverage_diff = [0] * (n + 1)
matches: List[Tuple[int, int, int]] = [] # (start, end, source_index)
state = 0
for i, ch in enumerate(output):
while state and ch not in nexts[state]:
state = fail[state]
if ch in nexts[state]:
state = nexts[state][ch]
else:
state = 0
if out[state]:
for idx in out[state]:
L = i + 1 - lens[idx]
R = i + 1
# L >= 0 always holds; lens[idx] > 0 by constraints
counts[idx] += 1
matches.append((L, R, idx))
coverage_diff[L] += 1
coverage_diff[R] -= 1
# Build maximal covered regions (merge overlapping or touching via coverage)
segments: List[Tuple[int, int]] = []
cover = 0
seg_start = None
for i in range(n):
cover += coverage_diff[i]
if cover > 0 and seg_start is None:
seg_start = i
elif cover == 0 and seg_start is not None:
segments.append((seg_start, i))
seg_start = None
if seg_start is not None:
segments.append((seg_start, n))
if not segments:
return counts, output
# Annotate each segment with indices of sources that have at least one match overlapping the segment
matches.sort(key=lambda x: x[0]) # sort by start
M = len(matches)
p = 0
heap: List[Tuple[int, int]] = [] # (end, match_id)
active_counts = {} # source_idx -> count of active overlapping matches
res_parts: List[str] = []
prev = 0
for L, R in segments:
# Add all matches starting before R
while p < M and matches[p][0] < R:
_, end, idx = matches[p]
heapq.heappush(heap, (end, p))
active_counts[idx] = active_counts.get(idx, 0) + 1
p += 1
# Remove all matches that end at or before L
while heap and heap[0][0] <= L:
_, mid = heapq.heappop(heap)
_, _, idx = matches[mid]
cnt = active_counts.get(idx, 0)
if cnt <= 1:
active_counts.pop(idx, None)
else:
active_counts[idx] = cnt - 1
# Emit segment with tag and annotations
res_parts.append(output[prev:L])
res_parts.append("<tag>")
res_parts.append(output[L:R])
res_parts.append("</tag>")
if active_counts:
for k in sorted(active_counts.keys()):
res_parts.append(f"[{k}]")
prev = R
res_parts.append(output[prev:])
tagged = ''.join(res_parts)
return counts, tagged