Source code for composeml.label_times.object

import json
import os

import pandas as pd

from ..version import __version__
from .description import describe_label_times
from .plots import LabelPlots

SCHEMA_VERSION = "0.1.0"


[docs]class LabelTimes(pd.DataFrame):
    """The data frame that contains labels and cutoff times for the target entity."""
[docs]    def __init__(
        self,
        data=None,
        target_entity=None,
        target_types=None,
        target_columns=None,
        search_settings=None,
        transforms=None,
        *args,
        **kwargs,
    ):
        super().__init__(data=data, *args, **kwargs)
        self.target_entity = target_entity
        self.target_columns = target_columns or []
        self.target_types = target_types or {}
        self.search_settings = search_settings or {}
        self.transforms = transforms or []
        self.plot = LabelPlots(self)

        if not self.empty: self._check_label_times()

    def _assert_single_target(self):
        """Asserts that the label times object contains a single target."""
        info = 'must first select an individual target'
        assert self._is_single_target, info

    def _check_target_columns(self):
        """Validates the target columns."""
        if not self.target_columns:
            self.target_columns = self._infer_target_columns()
        else:
            for target in self.target_columns:
                info = 'target "%s" not found in data frame'
                assert target in self.columns, info % target

    def _check_target_types(self):
        """Validates the target types."""
        if isinstance(self.target_types, dict):
            self.target_types = pd.Series(self.target_types)

        if self.target_types.empty:
            self.target_types = self._infer_target_types()
        else:
            target_names = self.target_types.index.tolist()
            match = target_names == self.target_columns
            assert match, 'target names in types must match target columns'

    def _check_label_times(self):
        """Validates the lables times object."""
        self._check_target_columns()
        self._check_target_types()

    def _infer_target_columns(self):
        """Infers the names of the targets in the data frame.

        Returns:
            value (list): A list of the target names.
        """
        not_targets = [self.target_entity, 'time']
        target_columns = self.columns.difference(not_targets)
        assert not target_columns.empty, 'target columns not found'
        value = target_columns.tolist()
        return value

    @property
    def _is_single_target(self):
        return len(self.target_columns) == 1

    def _get_target_type(self, dtype):
        is_discrete = pd.api.types.is_bool_dtype(dtype)
        is_discrete |= pd.api.types.is_categorical_dtype(dtype)
        is_discrete |= pd.api.types.is_object_dtype(dtype)
        value = 'discrete' if is_discrete else 'continuous'
        return value

    def _infer_target_types(self):
        """Infers the target type from the data type.

        Returns:
            types (Series): Inferred label type. Either "continuous" or "discrete".
        """
        dtypes = self.dtypes[self.target_columns]
        types = dtypes.apply(self._get_target_type)
        return types

[docs]    def select(self, target):
        """Selects one of the target variables.

        Args:
            target (str): The name of the target column.

        Returns:
            lt (LabelTimes): A label times object that contains a single target.

        Examples:
            Create a label times object that contains multiple target variables.

            >>> entity = [0, 0, 1, 1]
            >>> labels = [True, False, True, False]
            >>> time = ['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04']
            >>> data = {'entity': entity, 'time': time, 'A': labels, 'B': labels}
            >>> lt = LabelTimes(data=data, target_entity='entity', target_columns=['A', 'B'])
            >>> lt
               entity        time      A      B
            0       0  2020-01-01   True   True
            1       0  2020-01-02  False  False
            2       1  2020-01-03   True   True
            3       1  2020-01-04  False  False

            Select a single target from the label times.

            >>> lt.select('B')
               entity        time      B
            0       0  2020-01-01   True
            1       0  2020-01-02  False
            2       1  2020-01-03   True
            3       1  2020-01-04  False
        """
        assert not self._is_single_target, 'only one target exists'
        if not isinstance(target, str): raise TypeError('target name must be string')
        assert target in self.target_columns, 'target "%s" not found' % target

        lt = self.copy()
        lt.target_columns = [target]
        lt.target_types = lt.target_types[[target]]
        lt = lt[[self.target_entity, 'time', target]]
        return lt

    @property
    def settings(self):
        """Returns metadata about the label times."""
        return {
            'compose_version': __version__,
            'schema_version': SCHEMA_VERSION,
            'label_times': {
                'target_entity': self.target_entity,
                'target_columns': self.target_columns,
                'target_types': self.target_types.to_dict(),
                'search_settings': self.search_settings,
                'transforms': self.transforms,
            }
        }

    @property
    def is_discrete(self):
        """Whether labels are discrete."""
        return self.target_types.eq('discrete')

    @property
    def distribution(self):
        """Returns label distribution if labels are discrete."""
        self._assert_single_target()
        target_column = self.target_columns[0]

        if self.is_discrete[target_column]:
            labels = self.assign(count=1)
            labels = labels.groupby(target_column)
            distribution = labels['count'].count()
            return distribution

    @property
    def count(self):
        """Returns label count per instance."""
        self._assert_single_target()
        count = self.groupby(self.target_entity)
        count = count[self.target_columns[0]].count()
        count = count.to_frame('count')
        return count

    @property
    def count_by_time(self):
        """Returns label count across cutoff times."""
        self._assert_single_target()
        target_column = self.target_columns[0]

        if self.is_discrete[target_column]:
            keys = ['time', target_column]
            value = self.groupby(keys).time.count()
            value = value.unstack(target_column).fillna(0)
        else:
            value = self.groupby('time')
            value = value[target_column].count()

        value = value.cumsum()  # In Python 3.5, these values automatically convert to float.
        value = value.astype('int')
        return value

[docs]    def describe(self):
        """Prints out the settings used to make the label times."""
        if not self.empty:
            self._assert_single_target()
            describe_label_times(self)

[docs]    def copy(self, deep=True):
        """Make a copy of this object's indices and data.

        Args:
            deep (bool): Make a deep copy, including a copy of the data and the indices.
                With ``deep=False`` neither the indices nor the data are copied. Default is True.

        Returns:
            lt (LabelTimes): A copy of the label times object.
        """
        lt = super().copy(deep=deep)
        lt.target_entity = self.target_entity
        lt.target_columns = self.target_columns
        lt.target_types = self.target_types.copy()
        lt.search_settings = self.search_settings.copy()
        lt.transforms = self.transforms.copy()
        return lt

[docs]    def threshold(self, value, inplace=False):
        """Creates binary labels by testing if labels are above threshold.

        Args:
            value (float) : Value of threshold.
            inplace (bool) : Modify labels in place.

        Returns:
            labels (LabelTimes) : Instance of labels.
        """
        self._assert_single_target()
        target_column = self.target_columns[0]
        labels = self if inplace else self.copy()
        labels[target_column] = labels[target_column].gt(value)
        labels.target_types[target_column] = 'discrete'

        transform = {'transform': 'threshold', 'value': value}
        labels.transforms.append(transform)

        if not inplace:
            return labels

[docs]    def apply_lead(self, value, inplace=False):
        """Shifts the label times earlier for predicting in advance.

        Args:
            value (str) : Time to shift earlier.
            inplace (bool) : Modify labels in place.

        Returns:
            labels (LabelTimes) : Instance of labels.
        """
        labels = self if inplace else self.copy()
        labels['time'] = labels['time'].sub(pd.Timedelta(value))

        transform = {'transform': 'apply_lead', 'value': value}
        labels.transforms.append(transform)

        if not inplace:
            return labels

[docs]    def bin(self, bins, quantiles=False, labels=None, right=True, precision=3):
        """Bin labels into discrete intervals.

        Args:
            bins (int or array): The criteria to bin by.
                As an integer, the value can be the number of equal-width or quantile-based bins.
                If :code:`quantiles` is False, the value is defined as the number of equal-width bins.
                The range is extended by .1% on each side to include the minimum and maximum values.
                If :code:`quantiles` is True, the value is defined as the number of quantiles (e.g. 10 for deciles, 4 for quartiles, etc.)
                As an array, the value can be custom or quantile-based edges.
                If :code:`quantiles` is False, the value is defined as bin edges allowing for non-uniform width. No extension is done.
                If :code:`quantiles` is True, the value is defined as bin edges usings an array of quantiles (e.g. [0, .25, .5, .75, 1.] for quartiles)

            quantiles (bool): Determines whether to use a quantile-based discretization function.
            labels (array): Specifies the labels for the returned bins. Must be the same length as the resulting bins.
            right (bool) : Indicates whether bins includes the rightmost edge or not. Does not apply to quantile-based bins.
            precision (int): The precision at which to store and display the bins labels. Default value is 3.

        Returns:
            LabelTimes : Instance of labels.

        Examples:
            These are the target values for the examples.

            >>> data = [226.93, 47.95, 283.46, 31.54]
            >>> lt = LabelTimes({'target': data})
            >>> lt
               target
            0  226.93
            1   47.95
            2  283.46
            3   31.54

            Bin values using equal-widths.

            >>> lt.bin(2)
                        target
            0  (157.5, 283.46]
            1  (31.288, 157.5]
            2  (157.5, 283.46]
            3  (31.288, 157.5]

            Bin values using custom-widths.

            >>> lt.bin([0, 200, 400])
                   target
            0  (200, 400]
            1    (0, 200]
            2  (200, 400]
            3    (0, 200]

            Bin values using infinite edges.

            >>> lt.bin(['-inf', 100, 'inf'])
                      target
            0   (100.0, inf]
            1  (-inf, 100.0]
            2   (100.0, inf]
            3  (-inf, 100.0]

            Bin values using quartiles.

            >>> lt.bin(4, quantiles=True)
                                     target
            0             (137.44, 241.062]
            1              (43.848, 137.44]
            2             (241.062, 283.46]
            3  (31.538999999999998, 43.848]

            Bin values using custom quantiles with precision.

            >>> lt.bin([0, .5, 1], quantiles=True, precision=1)
                       target
            0  (137.4, 283.5]
            1   (31.4, 137.4]
            2  (137.4, 283.5]
            3   (31.4, 137.4]

            Assign labels to bins.

            >>> lt.bin(2, labels=['low', 'high'])
              target
            0   high
            1    low
            2   high
            3    low
        """  # noqa
        self._assert_single_target()
        target_column = self.target_columns[0]
        values = self[target_column].values

        if quantiles:
            values = pd.qcut(values, q=bins, labels=labels, precision=precision)

        else:
            if isinstance(bins, list):
                for i, edge in enumerate(bins):
                    if edge in ['-inf', 'inf']:
                        bins[i] = float(edge)

            values = pd.cut(values, bins=bins, labels=labels, right=right, precision=precision)

        transform = {
            'transform': 'bin',
            'bins': bins,
            'quantiles': quantiles,
            'labels': labels,
            'right': right,
            'precision': precision,
        }

        lt = self.copy()
        lt[target_column] = values
        lt.transforms.append(transform)
        lt.target_types[target_column] = 'discrete'
        return lt

    def _sample(self, key, value, settings, random_state=None, replace=False):
        """Returns a random sample of labels.

        Args:
            key (str) : Determines the sampling method. Can either be 'n' or 'frac'.
            value (int or float) : Quantity to sample.
            settings (dict) : Transform settings used for sampling.
            random_state (int) : Seed for the random number generator.
            replace (bool) : Sample with or without replacement. Default value is False.

        Returns:
            LabelTimes : Random sample of labels.
        """
        sample = super().sample(random_state=random_state, replace=replace, **{key: value})
        return sample

    def _sample_per_label(self, key, value, settings, random_state=None, replace=False):
        """Returns a random sample per label.

        Args:
            key (str) : Determines the sampling method. Can either be 'n' or 'frac'.
            value (dict) : Quantity to sample per label.
            settings (dict) : Transform settings used for sampling.
            random_state (int) : Seed for the random number generator.
            replace (bool) : Sample with or without replacement. Default value is False.

        Returns:
            LabelTimes : Random sample per label.
        """
        sample_per_label = []
        target_column = self.target_columns[0]

        for label, value, in value.items():
            label = self[self[target_column] == label]
            sample = label._sample(key, value, settings, random_state=random_state, replace=replace)
            sample_per_label.append(sample)

        sample = pd.concat(sample_per_label, axis=0, sort=False)
        return sample

[docs]    def sample(self, n=None, frac=None, random_state=None, replace=False, per_instance=False):
        """Return a random sample of labels.

        Args:
            n (int or dict) : Sample number of labels. A dictionary returns
                the number of samples to each label. Cannot be used with frac.
            frac (float or dict) : Sample fraction of labels. A dictionary returns
                the sample fraction to each label. Cannot be used with n.
            random_state (int) : Seed for the random number generator.
            replace (bool) : Sample with or without replacement. Default value is False.
            per_instance (bool): Whether to apply sampling to each group. Default is False.

        Returns:
            LabelTimes : Random sample of labels.

        Examples:
            Create a label times object.

            >>> entity = [0, 0, 1, 1]
            >>> labels = [True, False, True, False]
            >>> data = {'entity': entity, 'labels': labels}
            >>> lt = LabelTimes(data=data, target_entity='entity', target_columns=['labels'])
            >>> lt
               entity  labels
            0       0    True
            1       0   False
            2       1    True
            3       1   False

            Sample a number of the examples.

            >>> lt.sample(n=3, random_state=0)
               entity  labels
            1       0   False
            2       1    True
            3       1   False

            Sample a fraction of the examples.

            >>> lt.sample(frac=.25, random_state=0)
               entity  labels
            2       1    True

            Sample a number of the examples for specific labels.

            >>> n = {True: 1, False: 1}
            >>> lt.sample(n=n, random_state=0)
               entity  labels
            2       1    True
            3       1   False

            Sample a fraction of the examples for specific labels.

            >>> frac = {True: .5, False: .5}
            >>> lt.sample(frac=frac, random_state=0)
               entity  labels
            2       1    True
            3       1   False

            Sample a number of the examples from each entity group.

            >>> lt.sample(n={True: 1}, per_instance=True, random_state=0)
               entity  labels
            0       0    True
            2       1    True

            Sample a fraction of the examples from each entity group.

            >>> lt.sample(frac=.5, per_instance=True, random_state=0)
               entity  labels
            1       0   False
            3       1   False
        """  # noqa
        self._assert_single_target()

        settings = {
            'transform': 'sample',
            'n': n,
            'frac': frac,
            'random_state': random_state,
            'replace': replace,
            'per_instance': per_instance,
        }

        key, value = ('n', n) if n else ('frac', frac)
        assert value, "must set value for 'n' or 'frac'"

        per_label = isinstance(value, dict)
        method = '_sample_per_label' if per_label else '_sample'

        def transform(lt):
            sample = getattr(lt, method)(
                key=key,
                value=value,
                settings=settings,
                random_state=random_state,
                replace=replace,
            )
            return sample

        if per_instance:
            groupby = self.groupby(self.target_entity, group_keys=False)
            sample = groupby.apply(transform)
        else:
            sample = transform(self)

        sample = sample.copy()
        sample.sort_index(inplace=True)
        sample.transforms.append(settings)
        return sample

[docs]    def equals(self, other, **kwargs):
        """Determines if two label time objects are the same.

        Args:
            other (LabelTimes) : Other label time object for comparison.
            **kwargs: Keyword arguments to pass to underlying pandas.DataFrame.equals method

        Returns:
            bool : Whether label time objects are the same.
        """
        is_equal = super().equals(other, **kwargs)
        is_equal &= self.settings == other.settings
        return is_equal

    def _save_settings(self, path):
        """Write the settings in json format to disk.

        Args:
            path (str) : Directory on disk to write to.
        """
        settings = self.settings
        dtypes = self.dtypes.astype('str')
        settings['dtypes'] = dtypes.to_dict()

        file = os.path.join(path, 'settings.json')
        with open(file, 'w') as file:
            json.dump(settings, file)

[docs]    def to_csv(self, path, save_settings=True, **kwargs):
        """Write label times in csv format to disk.

        Args:
            path (str) : Location on disk to write to (will be created as a directory).
            save_settings (bool) : Whether to save the settings used to make the label times.
            **kwargs: Keyword arguments to pass to underlying pandas.DataFrame.to_csv method
        """
        os.makedirs(path, exist_ok=True)
        file = os.path.join(path, 'data.csv')
        super().to_csv(file, index=False, **kwargs)

        if save_settings:
            self._save_settings(path)

[docs]    def to_parquet(self, path, save_settings=True, **kwargs):
        """Write label times in parquet format to disk.

        Args:
            path (str) : Location on disk to write to (will be created as a directory).
            save_settings (bool) : Whether to save the settings used to make the label times.
            **kwargs: Keyword arguments to pass to underlying pandas.DataFrame.to_parquet method
        """
        os.makedirs(path, exist_ok=True)
        file = os.path.join(path, 'data.parquet')
        super().to_parquet(file, compression=None, engine='auto', **kwargs)

        if save_settings:
            self._save_settings(path)

[docs]    def to_pickle(self, path, save_settings=True, **kwargs):
        """Write label times in pickle format to disk.

        Args:
            path (str) : Location on disk to write to (will be created as a directory).
            save_settings (bool) : Whether to save the settings used to make the label times.
            **kwargs: Keyword arguments to pass to underlying pandas.DataFrame.to_pickle method
        """
        os.makedirs(path, exist_ok=True)
        file = os.path.join(path, 'data.pickle')
        super().to_pickle(file, **kwargs)

        if save_settings:
            self._save_settings(path)

    # ----------------------------------------
    # Subclassing Pandas Data Frame
    # ----------------------------------------

    _metadata = [
        'search_settings',
        'target_columns',
        'target_entity',
        'target_types',
        'transforms',
    ]

    def __finalize__(self, other, method=None, **kwargs):
        """Propagate metadata from other label times data frames.

        Args:
            other (LabelTimes) : The label times from which to get the attributes from.
            method (str) : A passed method name for optionally taking different types of propagation actions based on this value.
        """
        if method == 'concat':
            other = other.objs[0]

            for key in self._metadata:
                value = getattr(other, key, None)
                setattr(self, key, value)

            return self

        return super().__finalize__(other=other, method=method, **kwargs)

    @property
    def _constructor(self):
        return LabelTimes