Source code for mouffet.data.dataset

import pickle
from pathlib import Path

import feather
import pandas as pd

from ..utils import file_utils
from .data_loader import DataLoader
from .data_structure import DataStructure


[docs]class Dataset(DataStructure):

    LOADERS = {"default": DataLoader}

    def __init__(self, db_type="test", database=None, file_list=None):
        super().__init__()
        # self._paths = {}
        self.database = database
        self.db_type = db_type
        self.data = self.get_structure_copy()
        if database is not None:
            self.paths = self.update_paths(database.paths)
        self.file_list = file_list

[docs]    def get_class_subfolder_path(self):
        """Function called if data is to be saved using a class subfolder.
        By default uses the value of the "class_type" option.

        Args:
            database (dict): The dictionary holding all option for the specific database.

        Returns:
            str: The class name
        """
        return self.database.class_type

    def default_file_name(self, key, db_type, database):
        return db_type + "_" + key + "." + self.get_extension(key)

    def get_file_name(self, key, db_type, database):
        return self.get_structure_function(key, "file_name", db_type, database)

    def get_subfolders(self, key):
        return self.get_structure_function(key, "subfolders")

[docs]    def default_subfolders(self, file_type):
        """Generate subfolders based on a list provided in the 'use_subfolders' option.
        For each item in the list, this function will try to call the
        get_itemname_folder_path(database) method from the DataHandler instance, where itemname is
        the name of the current item in the list. For example, if the item is "class", then the
        function will attempt to call the 'get_class_folder_path' method.
        If the method is not found, the option is skipped.
        Note that the called function should have the following signature:
        get_itemname_folder_path(database) -> str or pathlib.Path

        Args:
            database (dict): The dictionary holding all option for the specific database.

        Returns:
            pathlib.Path: a Path
        """
        res = Path("")
        subfolders = self.database.subfolders
        if subfolders:
            if isinstance(subfolders, str):
                subfolders = [{"type": subfolders}]
            for subfolder in subfolders:
                func_name = "_".join(["get", subfolder["type"], "subfolder_path"])
                if hasattr(self, func_name) and callable(getattr(self, func_name)):
                    res /= getattr(self, func_name)(subfolder)
                else:
                    print(
                        "Warning! No function found for getting the subfolder path for the '"
                        + subfolder
                        + "' option. Check if this is the correct value in the "
                        + "'use_subfolders' option or create a '"
                        + func_name
                        + "' function in your DataHandler instance."
                    )
        return res

    def get_subfolder_options(self, name):
        subfolder_opts = self.database.get("subfolders", [])
        for subfolder in subfolder_opts:
            if subfolder.get("type", "") == name:
                return subfolder
        return {}

[docs]    def get_save_dest_paths(self, dest_dir):
        """Create

        Args:
            dest_dir ([type]): [description]
            db_type ([type]): [description]
            subfolders ([type]): [description]

        Returns:
            [type]: [description]
        """
        res = {}

        for file_type in self.structure.keys():
            res[file_type] = (
                dest_dir
                / self.get_subfolders(file_type)
                / self.get_file_name(
                    file_type, db_type=self.db_type, database=self.database
                )
            )
        return res

    def update_paths(self, paths):
        if paths:
            paths["save_dests"][self.db_type] = self.get_save_dest_paths(
                paths["dest"][self.db_type]
            )
        return paths

    def generate(self, file_list, missing, overwrite):
        loader_cls = self.LOADERS[self.database.get("loader", "default")]
        loader = loader_cls(self.get_structure_copy())
        loader.generate_dataset(
            database=self.database,
            paths=self.paths,
            file_list=file_list,
            db_type=self.db_type,
            missing=missing,
            overwrite=overwrite,
        )
        self.save(loader.data, missing)

[docs]    def save(self, data, missing=None):
        """_summary_

        Args:
            data (_type_): _description_
        """
        if data:
            for key, value in data.items():
                if missing and key in missing:
                    if isinstance(value, pd.DataFrame):
                        if value.empty:
                            continue
                    else:
                        if not value:
                            continue
                    path = self.paths["save_dests"][self.db_type][key]
                    if path.suffix == ".pkl":
                        with open(
                            file_utils.ensure_path_exists(path, is_file=True), "wb"
                        ) as f:
                            pickle.dump(value, f, -1)
                            print("Saved file: ", path)
                    elif path.suffix == ".feather":
                        if isinstance(value, pd.DataFrame):
                            value = value.reset_index(drop=True)
                            feather.write_dataframe(value, path)
                        else:
                            raise (
                                ValueError(
                                    (
                                        "Trying to write feather data from a source that"
                                        + " is not a dataframe for key {}"
                                    ).format(key)
                                )
                            )

    def get_loader(self):
        loader_cls = self.LOADERS[self.database.get("loader", "default")]
        loader = loader_cls(self.structure)
        return loader

[docs]    def load(self, load_opts=None):
        """_summary_

        Args:
            load_opts (_type_, optional): _description_. Defaults to None.
        """
        loader = self.get_loader()
        loader.load_dataset(self.paths, self.db_type, load_opts)
        self.data = loader.data

[docs]    def exists(self, file_types=None):
        """_summary_

        Returns:
            _type_: _description_
        """
        res = []
        file_types = file_types or {}
        for key in self.structure.keys():
            if file_types and key not in file_types:
                continue
            if not self.paths["save_dests"][self.db_type][key].exists():
                res.append(key)
        return res

[docs]    def summarize(self):
        """_summary_

        Returns:
            _type_: _description_
        """
        return {}

[docs]    def get_raw_data(self):
        """_summary_"""
        pass

[docs]    def get_ground_truth(self):
        """_summary_"""
        pass

    def __getitem__(self, key):
        return self.data[key]

    # def from_file_list(self, file_list, opts):
    #     pass

    # def from_folder(self, folder_path, opts):
    #     pass

    # def __iter__(self):
    #     loader = self.get_loader()
    #     if self.file_list is not None:
    #         for file_path in self.file_list:
    #             yield

    def copy(self):
        return self.__class__(self.db_type)