Multipattern Search

This is a simple multipattern search algorithm. It isn't particularly speedy (`grep -Ff words target`, which uses Aho–Corasick, will outperform this algorithm) but it is quick to implement. It reads a list of words, sorts them into sublists grouped by length, and converts each sublist into a trie. Then, for each trie, it slides a window of that length (trie depth) over the input and walks the trie for each value in the window. Matches are stored and output at the end.

Approximately 1 MB of random sample data may be generated with openssl rand -out target -base64 $(( 2**20 * 3/4 )).

Usage: `./<script> words target`, where `words` is the name of a file with one word per line and `target` is the name of the file to be searched. If `debug = True`, the search terms, search trees, and progress meters will be displayed. If `debug = False`, only matches will be printed. Matches are displayed one to a line: the match location in bytes from the beginning, a space, and the word matched.

#!/usr/bin/env python

from pprint import pprint as pp
import sys

debug = True  # when True, print the search terms, the tries, and progress meters

#   Input data
#   Read the word list (one word per line, filename in argv[1]) into
#   (word, length) tuples.
words = []
with open(sys.argv[1], 'r') as f:
    for a_word in f:
        # Strip a single trailing newline, if present.
        a_word = a_word[:-1] if a_word[-1] == '\n' else a_word
        if not a_word:
            continue  # skip blank lines: a zero-length word matches nothing
        words.append((a_word, len(a_word)))

#   Length sort
words.sort(key=lambda w: w[1])

#   Split into lists by length: each sublist of sorted_words holds all
#   words of one length, sublists in ascending order of length.
current_length  = 0
sorted_words    = []

for a_word, word_length in words:
    if current_length != word_length:
        # First word of a new length group: start a fresh sublist.
        current_length = word_length
        sorted_words.append([])
    sorted_words[-1].append(a_word)

if debug:
    pp(sorted_words)
#   Trie class to store search terms
class Trie(dict):
    def add(self, data):
        if not data:
        except KeyError:
            self[data[0]] = Trie()

    def find(self, data):
        if not data:
            return True
            return self[data[0]].find(data[1:])
        except KeyError:
            return False

    def depth(self, n):
        if not ((self and True) or False):
            return n

        return self.values()[0].depth(n + 1)

#   Construct tries out of pre-sorted words: one trie per length group,
#   in ascending order of word length.
tries = []

for current_sublist, a_sublist in enumerate(sorted_words):
    tries.append(Trie())
    for a_word in a_sublist:
        tries[current_sublist].add(a_word)

if debug:
    pp(tries)

#   Make a pass for each trie, sliding a window of that trie's depth
#   over the input and walking the trie for matches.
found = []

for a_trie in tries:
    current_depth   = a_trie.depth(0)
    #   Relative seek applied after each window read, so the window
    #   advances by exactly one byte per iteration.
    step_back       = 1 - current_depth

    if debug: print("Current depth: %d" % current_depth)

    #   seeks over the file to avoid reading it all in at once
    with open(sys.argv[2], 'rb') as f:
        #   Seek to the end to learn the file size.
        f.seek(0, 2)
        endpoint = f.tell()
        #   Tries are in ascending depth order: once the file is shorter
        #   than this depth, no remaining trie can match either.
        if endpoint < current_depth: break

        #   progress bar: update roughly every 0.01% of the input
        i       = 0
        last_i  = 5
        per     = endpoint // 10000      # floor div: keep `per` an int in Python 3
        per     = per if per != 0 else 1

        if debug:
            sys.stdout.write("0.00%")

        #   main search loop (sliding window)
        f.seek(0)
        while True:
            if (endpoint - f.tell()) < current_depth: break
            #   latin-1 maps every byte to exactly one character, so the
            #   window always decodes and ASCII search terms compare
            #   correctly against the str-keyed tries.
            current_read = f.read(current_depth).decode('latin-1')
            if not current_read: break
            if a_trie.find(current_read):
                found.append("%d %s" % (f.tell() - current_depth, current_read))
            #   slide the window forward one byte
            f.seek(step_back, 1)

            #   progress bar
            if debug:
                if (i % per) == 0:
                    i_percent = "%.2f" % (i * 100.0 / endpoint)
                    sys.stdout.write("%s%s%%" % (('\b' * last_i), i_percent))
                    last_i = len(i_percent) + 1
                i += 1
        if debug:
            sys.stdout.write("%s100.00%%\n" % ('\b' * last_i))

#   Final results: one match per line — byte offset, a space, the word.
for an_item in found:
    print(an_item)