Source code for mouffet.data.split

import random

from ..utils.file import list_folder


def random_split(path, split_props, extensions):
    """Randomly split all matching files found under a path.

    The whole tree under ``path`` is searched recursively and every matching
    file is shuffled into a single pool before being divided, regardless of
    the folder it comes from.

    Each proportion is applied to the files that are still unassigned at that
    point (not to the initial total) and the resulting count is truncated with
    ``int``. Whatever is left after the last proportion forms one extra,
    final split.

    Args:
        path (Path): path where to start searching for files
        split_props (list): list of proportions for each split
        extensions (list): list of file extensions to include, written the
            way ``Path.suffix`` reports them (with the leading dot). Matching
            is case-insensitive. A single string is treated as one extension.

    Returns:
        list: ``len(split_props) + 1`` lists of file paths (as strings)
    """
    # A bare string would otherwise be matched by substring, which lets
    # suffix-less entries through because "" is contained in any string.
    if isinstance(extensions, str):
        extensions = [extensions]
    # The suffix is lower-cased below, so the requested extensions must be
    # lower-cased too or an upper-case request could never match.
    wanted = {ext.lower() for ext in extensions}

    # rglob("*") also yields directories; keep real files only so a folder
    # named e.g. "takes.wav" is not returned as if it were a file.
    files = [
        str(p)
        for p in path.rglob("*")
        if p.suffix.lower() in wanted and p.is_file()
    ]
    random.shuffle(files)

    splits = []
    start_idx = 0
    for split_prop in split_props:
        # Proportion of what remains, i.e. everything from start_idx onwards.
        n_split = int(split_prop * (len(files) - start_idx))
        splits.append(files[start_idx : start_idx + n_split])
        start_idx += n_split
    # The remainder always becomes the last split.
    splits.append(files[start_idx:])
    return splits


def split_list(data, split_props):
    """Shuffle a copy of a list and cut it into consecutive random splits.

    The input itself is left untouched: a copy is shuffled. Each proportion
    is applied to the items that are still unassigned at that point (not to
    the original length) and the resulting size is rounded with ``round``.
    The items left over after the last proportion form one extra, final split.

    Args:
        data (list): items to split; must provide a ``copy`` method
        split_props (list): list of proportions for each split

    Returns:
        list: ``len(split_props) + 1`` lists holding the shuffled items
    """
    remaining = data.copy()
    random.shuffle(remaining)

    chunks = []
    for fraction in split_props:
        cut = round(len(remaining) * fraction)
        # Peel the next chunk off the front and keep the tail for later.
        head, remaining = remaining[0:cut], remaining[cut:]
        chunks.append(head)

    # Whatever was never claimed becomes the last split.
    chunks.append(remaining)
    return chunks


def split_folder(path, split_props, extensions):
    """Split files folder by folder, walking the tree recursively.

    The files listed directly in ``path`` are split on their own with
    ``split_list``; every subfolder is then processed the same way and its
    splits are appended to the matching splits of the parent.

    Args:
        path (Path): path where to start searching for files
        split_props (list): list of proportions for each split
        extensions (list): list of file extensions to include

    Returns:
        list: list with all the splits (``len(split_props) + 1`` lists)
    """
    merged = [[] for _ in range(len(split_props) + 1)]

    def absorb(partial):
        # Append each partial split to the accumulated split at the same rank.
        for bucket, chunk in zip(merged, partial):
            bucket.extend(chunk)

    subfolders, found = list_folder(path, extensions)

    # Files sitting directly in this folder are split among themselves.
    if found:
        absorb(split_list(found, split_props))

    # Each subfolder contributes its own, independently drawn splits.
    for subfolder in subfolders or ():
        absorb(split_folder(subfolder, split_props, extensions))

    return merged