Source code for skml.datasets.sample_down_label_space

import numpy as np
from scipy.sparse import issparse
from operator import itemgetter


def sample_down_label_space(y, k, method='most-frequent'):
    """
    Samples down label space, such that the returned label space retains
    order of the original labels, but removes labels which do not meet
    certain criteria (see `method`).

    Parameters
    ----------
    y : (sparse) array-like, shape = [n_samples, n_classes]
        Multi-label targets, one column per label. Must be two-dimensional.
        Dense array-likes (e.g. nested lists) are converted with
        ``np.asanyarray``; sparse inputs are used as given and must support
        column indexing (``y[:, indices]``).
    k : number
        Number of returned labels. Must satisfy ``0 <= k <= n_classes``.
    method : string, default = 'most-frequent'
        Method to sample the label space down. Currently supported is only
        by top k most frequent labels. Ties in frequency are resolved in
        favour of the label with the lower column index.

    Returns
    -------
    y_sampled : shape = [n_samples, k]
        The columns of `y` belonging to the selected labels, in their
        original column order.

    Raises
    ------
    ValueError
        If `y` is not two-dimensional, if `k` is negative or larger than
        the number of labels in `y`, or if `method` is not supported.
    """
    if not issparse(y):
        # accept plain array-likes (e.g. nested lists); ndarrays pass
        # through unchanged
        y = np.asanyarray(y)

    if y.ndim != 2:
        raise ValueError(
            'Expected y of shape [n_samples, n_classes], got {0} '
            'dimension(s)'.format(y.ndim))

    n_labels = y.shape[1]
    if k > n_labels:
        raise ValueError('Cannot sample more labels than given')
    if k < 0:
        # a negative k would otherwise silently slice from the end of the
        # ranking and return the wrong set of labels
        raise ValueError('Cannot sample a negative number of labels')

    if method == 'most-frequent':
        # frequency per column (label). Summing a sparse matrix yields a
        # 1 x n_classes np.matrix, while sparse arrays and dense arrays
        # yield a 1-d array; asarray + ravel flattens all of them alike.
        freqs = np.asarray(y.sum(axis=0)).ravel()

        # rank the column indices by descending frequency. Python's sort is
        # stable even with reverse=True, so equally frequent labels keep
        # their original relative order.
        ranked = sorted(range(n_labels), key=freqs.__getitem__, reverse=True)

        # select top k labels, restore original order
        # if we wouldn't restore the original order, the labels would
        # be ordered not by original column, but by "most frequent occurring"
        sampled_indices = sorted(ranked[:k])

        return y[:, sampled_indices]
    else:
        raise ValueError('No such sample method {0}'.format(method))