## 07 March 2006

### Mix'n'matching

Have you ever did mental math to figure out how to best fit a collection of data into a set of DVDs, trying to squeeze the most into every single DVD? It happens more and more to me, so I wrote a Python script to do it for me.

The algorithm used to efficiently find the largest path combinations below a threshold is inspired in the apriori algorithm for association rule discovery. Since the largest path combination is a superset of smaller combinations, we can start building those starting from single paths, combine those with the initial to make two-item sets while removing all larger than the threshold, then three-item, four-item, and so on; until no larger combination below the threshold can be found.

Here is the script:

```#!/usr/bin/env python
# mixnmatch.py - find combination of files/dirs that sum below a given threshold
# -- Jose Fonseca

import os
import os.path
import optparse
import sys

from sets import ImmutableSet as set

def get_size(path):
if os.path.isdir(path):
result = 0
for name in os.listdir(path):
result += get_size(os.path.join(path, name))
return result
else:
return os.path.getsize(path)

def mix_and_match(limit, items, verbose = False):

# filter items
items = [(size, name) for size, name in items if size <= limit]
# sort them by size
items.sort(lambda (xsize, xname), (ysize, yname): cmp(xsize, ysize))

# initialize variables
added_collections = dict([(set([name]), size) for size, name in items])

while True:
if verbose:
sys.stderr.write("%d\n" % len(collections))

# find unique combinations of the recent collections
new_collections = {}
for size2, name2 in items:
size3 = size1 + size2
if size3 > limit:
# we can break here as all collections that follow are
#  bigger in size due to the sorting above
break
if name2 in names1:
continue
names3 = names1.union(set([name2]))
if names3 in new_collections:
continue
new_collections[names3] = size3

if len(new_collections) == 0:
break

collections.update(new_collections)

return [(size, names) for names, size in collections.iteritems()]

def main():
parser = optparse.OptionParser(usage="\n\t%prog [options] path ...")
'-l', '--limit',
type="int", dest="limit", default=4700000000,
help="total size limit")
'-s', '--show',
type="int", dest="show", default=10,
help="number of combinations to show")
'-v', '--verbose',
action="store_true", dest="verbose", default=False,
help="verbose output")
(options, args) = parser.parse_args(sys.argv[1:])

limit = options.limit

items = [(get_size(arg), arg) for arg in args]

collections = mix_and_match(limit, items, options.verbose)
collections.sort(lambda (xsize, xnames), (ysize, ynames): -cmp(xsize, ysize))
if options.show != 0:
collections = collections[0:options.show]

for size, names in collections:
percentage = 100.0*float(size)/float(limit)
try:
sys.stdout.write("%10d\t%02.2f%%\t%s\n" % (size, percentage, " ".join(names)))
except IOError:
# ignore broken pipe
pass

if __name__ == '__main__':
main()
```

This script has also been posted as a Python Cookbook Recipe.