Source code for composeml.label_times.object

import json
import os

import pandas as pd

from ..version import __version__
from .description import describe_label_times
from .plots import LabelPlots

# Version string reported under the ``schema_version`` key of ``LabelTimes.settings``
# (and therefore written to ``settings.json`` whenever settings are saved).
SCHEMA_VERSION = "0.1.0"


class LabelTimes(pd.DataFrame):
    """The data frame that contains labels and cutoff times for the target entity."""

    def __init__(
        self,
        data=None,
        target_entity=None,
        target_types=None,
        target_columns=None,
        search_settings=None,
        transforms=None,
        *args,
        **kwargs,
    ):
        """Creates the label times and validates the targets when data is present.

        Args:
            data: Data forwarded to the :class:`pandas.DataFrame` constructor.
            target_entity (str): Name of the column that identifies the target entity.
            target_types (dict or Series): Type of each target, keyed by target name.
                Inferred from the column data types when omitted.
            target_columns (list): Names of the target columns. Inferred as every
                column other than the target entity and ``'time'`` when omitted.
            search_settings (dict): Settings stored with the label times and reported
                in :attr:`settings`.
            transforms (list): Records of the transforms applied so far.
            *args: Extra positional arguments for the data frame constructor.
            **kwargs: Extra keyword arguments for the data frame constructor.
        """
        # ``data`` is forwarded positionally: passing it by keyword in front of
        # ``*args`` raises "multiple values for argument 'data'" as soon as any
        # extra positional argument is supplied.
        super().__init__(data, *args, **kwargs)
        self.target_entity = target_entity
        self.target_columns = target_columns or []
        self.target_types = target_types or {}
        self.search_settings = search_settings or {}
        self.transforms = transforms or []
        self.plot = LabelPlots(self)

        # Empty frames are not validated: there are no columns to check or infer from.
        if not self.empty:
            self._check_label_times()

    def _assert_single_target(self):
        """Asserts that the label times object contains a single target."""
        info = 'must first select an individual target'
        assert self._is_single_target, info

    def _check_target_columns(self):
        """Validates the target columns, inferring them when none were given."""
        if not self.target_columns:
            self.target_columns = self._infer_target_columns()
            return

        info = 'target "%s" not found in data frame'
        for target in self.target_columns:
            assert target in self.columns, info % target

    def _check_target_types(self):
        """Validates the target types, inferring them when none were given."""
        # Types are kept as a series indexed by target name.
        if isinstance(self.target_types, dict):
            self.target_types = pd.Series(self.target_types)

        if self.target_types.empty:
            self.target_types = self._infer_target_types()
            return

        # The comparison is between lists, so the order of the names matters.
        target_names = self.target_types.index.tolist()
        match = target_names == self.target_columns
        assert match, 'target names in types must match target columns'

    def _check_label_times(self):
        """Validates the label times object."""
        # Columns first: inferring the types relies on the target columns.
        self._check_target_columns()
        self._check_target_types()

    def _infer_target_columns(self):
        """Infers the names of the targets in the data frame.

        Every column other than the target entity and ``'time'`` is a target.

        Returns:
            value (list): A list of the target names.
        """
        not_targets = [self.target_entity, 'time']
        target_columns = self.columns.difference(not_targets)
        assert not target_columns.empty, 'target columns not found'
        return target_columns.tolist()

    @property
    def _is_single_target(self):
        """Whether exactly one target column is registered."""
        return len(self.target_columns) == 1

    def _get_target_type(self, dtype):
        """Maps a data type to a target type.

        Args:
            dtype: The data type of a target column.

        Returns:
            value (str): ``'discrete'`` for boolean, categorical and object
                data types, otherwise ``'continuous'``.
        """
        # ``pd.api.types.is_categorical_dtype`` is deprecated in recent pandas
        # releases; the isinstance check is its documented replacement. Going
        # through ``.dtype`` keeps array-like inputs working as before.
        resolved = getattr(dtype, 'dtype', dtype)
        is_discrete = bool(pd.api.types.is_bool_dtype(dtype))
        is_discrete |= isinstance(resolved, pd.CategoricalDtype)
        is_discrete |= bool(pd.api.types.is_object_dtype(dtype))
        return 'discrete' if is_discrete else 'continuous'

    def _infer_target_types(self):
        """Infers the target type from the data type.

        Returns:
            types (Series): Inferred label type. Either "continuous" or "discrete".
        """
        dtypes = self.dtypes[self.target_columns]
        return dtypes.apply(self._get_target_type)

    def select(self, target):
        """Selects one of the target variables.

        Args:
            target (str): The name of the target column.

        Returns:
            lt (LabelTimes): A label times object that contains a single target.

        Raises:
            TypeError: If the target name is not a string.

        Examples:
            Create a label times object that contains multiple target variables.

            >>> entity = [0, 0, 1, 1]
            >>> labels = [True, False, True, False]
            >>> time = ['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04']
            >>> data = {'entity': entity, 'time': time, 'A': labels, 'B': labels}
            >>> lt = LabelTimes(data=data, target_entity='entity', target_columns=['A', 'B'])
            >>> lt
               entity        time      A      B
            0       0  2020-01-01   True   True
            1       0  2020-01-02  False  False
            2       1  2020-01-03   True   True
            3       1  2020-01-04  False  False

            Select a single target from the label times.

            >>> lt.select('B')
               entity        time      B
            0       0  2020-01-01   True
            1       0  2020-01-02  False
            2       1  2020-01-03   True
            3       1  2020-01-04  False
        """  # noqa
        assert not self._is_single_target, 'only one target exists'
        if not isinstance(target, str):
            raise TypeError('target name must be string')
        assert target in self.target_columns, 'target "%s" not found' % target

        # Narrow the metadata on a copy first so that the column selection
        # below propagates the single-target metadata to the result.
        lt = self.copy()
        lt.target_columns = [target]
        lt.target_types = lt.target_types[[target]]
        lt = lt[[self.target_entity, 'time', target]]
        return lt

    @property
    def settings(self):
        """Returns metadata about the label times."""
        label_times = {
            'target_entity': self.target_entity,
            'target_columns': self.target_columns,
            'target_types': self.target_types.to_dict(),
            'search_settings': self.search_settings,
            'transforms': self.transforms,
        }
        return {
            'compose_version': __version__,
            'schema_version': SCHEMA_VERSION,
            'label_times': label_times,
        }

    @property
    def is_discrete(self):
        """Whether labels are discrete."""
        return self.target_types.eq('discrete')

    @property
    def distribution(self):
        """Returns label distribution if labels are discrete.

        The value is None when the target is not discrete.
        """
        self._assert_single_target()
        target_column = self.target_columns[0]

        if not self.is_discrete[target_column]:
            return None

        labels = self.assign(count=1)
        labels = labels.groupby(target_column)
        return labels['count'].count()

    @property
    def count(self):
        """Returns label count per instance."""
        self._assert_single_target()
        grouped = self.groupby(self.target_entity)
        counts = grouped[self.target_columns[0]].count()
        return counts.to_frame('count')

    @property
    def count_by_time(self):
        """Returns label count across cutoff times."""
        self._assert_single_target()
        target_column = self.target_columns[0]

        if self.is_discrete[target_column]:
            # One column of counts per label value.
            keys = ['time', target_column]
            value = self.groupby(keys).time.count()
            value = value.unstack(target_column).fillna(0)
        else:
            value = self.groupby('time')
            value = value[target_column].count()

        value = value.cumsum()
        # In Python 3.5, these values automatically convert to float.
        value = value.astype('int')
        return value

    def describe(self):
        """Prints out the settings used to make the label times."""
        if not self.empty:
            self._assert_single_target()
            describe_label_times(self)

    def copy(self, deep=True):
        """Make a copy of this object's indices and data.

        Args:
            deep (bool): Make a deep copy, including a copy of the data and the indices.
                With ``deep=False`` neither the indices nor the data are copied.
                Default is True.

        Returns:
            lt (LabelTimes): A copy of the label times object.
        """
        lt = super().copy(deep=deep)
        lt.target_entity = self.target_entity
        # Every metadata container is copied, so changing the copy's metadata
        # never leaks back into this object.
        lt.target_columns = list(self.target_columns)
        lt.target_types = self.target_types.copy()
        lt.search_settings = self.search_settings.copy()
        lt.transforms = self.transforms.copy()
        return lt

    def threshold(self, value, inplace=False):
        """Creates binary labels by testing if labels are above threshold.

        Args:
            value (float) : Value of threshold.
            inplace (bool) : Modify labels in place.

        Returns:
            labels (LabelTimes) : Instance of labels. Nothing is returned
                when the labels are modified in place.
        """
        self._assert_single_target()
        target_column = self.target_columns[0]
        labels = self if inplace else self.copy()
        labels[target_column] = labels[target_column].gt(value)
        labels.target_types[target_column] = 'discrete'

        transform = {'transform': 'threshold', 'value': value}
        labels.transforms.append(transform)

        if not inplace:
            return labels

    def apply_lead(self, value, inplace=False):
        """Shifts the label times earlier for predicting in advance.

        Args:
            value (str) : Time to shift earlier.
            inplace (bool) : Modify labels in place.

        Returns:
            labels (LabelTimes) : Instance of labels. Nothing is returned
                when the labels are modified in place.
        """
        labels = self if inplace else self.copy()
        labels['time'] = labels['time'].sub(pd.Timedelta(value))

        transform = {'transform': 'apply_lead', 'value': value}
        labels.transforms.append(transform)

        if not inplace:
            return labels

    def bin(self, bins, quantiles=False, labels=None, right=True, precision=3):
        """Bin labels into discrete intervals.

        Args:
            bins (int or array): The criteria to bin by.
                As an integer, the value can be the number of equal-width or quantile-based bins.
                If :code:`quantiles` is False, the value is defined as the number of equal-width bins.
                The range is extended by .1% on each side to include the minimum and maximum values.
                If :code:`quantiles` is True, the value is defined as the number of quantiles (e.g. 10 for deciles, 4 for quartiles, etc.)
                As an array, the value can be custom or quantile-based edges.
                If :code:`quantiles` is False, the value is defined as bin edges allowing for non-uniform width. No extension is done.
                If :code:`quantiles` is True, the value is defined as bin edges using an array of quantiles (e.g. [0, .25, .5, .75, 1.] for quartiles)
            quantiles (bool): Determines whether to use a quantile-based discretization function.
            labels (array): Specifies the labels for the returned bins. Must be the same length as the resulting bins.
            right (bool) : Indicates whether bins includes the rightmost edge or not. Does not apply to quantile-based bins.
            precision (int): The precision at which to store and display the bins labels. Default value is 3.

        Returns:
            LabelTimes : Instance of labels.

        Examples:
            These are the target values for the examples.

            >>> data = [226.93, 47.95, 283.46, 31.54]
            >>> lt = LabelTimes({'target': data})
            >>> lt
               target
            0  226.93
            1   47.95
            2  283.46
            3   31.54

            Bin values using equal-widths.

            >>> lt.bin(2)
                        target
            0  (157.5, 283.46]
            1  (31.288, 157.5]
            2  (157.5, 283.46]
            3  (31.288, 157.5]

            Bin values using custom-widths.

            >>> lt.bin([0, 200, 400])
                   target
            0  (200, 400]
            1    (0, 200]
            2  (200, 400]
            3    (0, 200]

            Bin values using infinite edges.

            >>> lt.bin(['-inf', 100, 'inf'])
                      target
            0   (100.0, inf]
            1  (-inf, 100.0]
            2   (100.0, inf]
            3  (-inf, 100.0]

            Bin values using quartiles.

            >>> lt.bin(4, quantiles=True)
                                     target
            0             (137.44, 241.062]
            1              (43.848, 137.44]
            2             (241.062, 283.46]
            3  (31.538999999999998, 43.848]

            Bin values using custom quantiles with precision.

            >>> lt.bin([0, .5, 1], quantiles=True, precision=1)
                       target
            0  (137.4, 283.5]
            1   (31.4, 137.4]
            2  (137.4, 283.5]
            3   (31.4, 137.4]

            Assign labels to bins.

            >>> lt.bin(2, labels=['low', 'high'])
               target
            0    high
            1     low
            2    high
            3     low
        """  # noqa
        self._assert_single_target()
        target_column = self.target_columns[0]
        values = self[target_column].values

        if quantiles:
            values = pd.qcut(values, q=bins, labels=labels, precision=precision)
        else:
            if isinstance(bins, list):
                # The string edges '-inf' and 'inf' become floats. A new list
                # is built so the caller's list is left untouched; the recorded
                # transform below still holds the converted edges.
                bins = [
                    float(edge) if edge in ['-inf', 'inf'] else edge
                    for edge in bins
                ]
            values = pd.cut(values, bins=bins, labels=labels, right=right, precision=precision)

        transform = {
            'transform': 'bin',
            'bins': bins,
            'quantiles': quantiles,
            'labels': labels,
            'right': right,
            'precision': precision,
        }

        lt = self.copy()
        lt[target_column] = values
        lt.transforms.append(transform)
        lt.target_types[target_column] = 'discrete'
        return lt

    def _sample(self, key, value, settings, random_state=None, replace=False):
        """Returns a random sample of labels.

        Args:
            key (str) : Determines the sampling method. Can either be 'n' or 'frac'.
            value (int or float) : Quantity to sample.
            settings (dict) : Transform settings used for sampling. Not used here.
            random_state (int) : Seed for the random number generator.
            replace (bool) : Sample with or without replacement. Default value is False.

        Returns:
            LabelTimes : Random sample of labels.
        """
        # ``super()`` bypasses the public ``sample`` override of this class.
        quantity = {key: value}
        return super().sample(random_state=random_state, replace=replace, **quantity)

    def _sample_per_label(self, key, value, settings, random_state=None, replace=False):
        """Returns a random sample per label.

        Args:
            key (str) : Determines the sampling method. Can either be 'n' or 'frac'.
            value (dict) : Quantity to sample per label.
            settings (dict) : Transform settings used for sampling.
            random_state (int) : Seed for the random number generator.
            replace (bool) : Sample with or without replacement. Default value is False.

        Returns:
            LabelTimes : Random sample per label.
        """
        target_column = self.target_columns[0]
        sample_per_label = []

        # Labels that are absent from the mapping are left out of the sample.
        for label, quantity in value.items():
            subset = self[self[target_column] == label]
            sample = subset._sample(key, quantity, settings, random_state=random_state, replace=replace)
            sample_per_label.append(sample)

        return pd.concat(sample_per_label, axis=0, sort=False)

    def sample(self, n=None, frac=None, random_state=None, replace=False, per_instance=False):
        """Return a random sample of labels.

        Args:
            n (int or dict) : Sample number of labels. A dictionary returns
                the number of samples to each label. Cannot be used with frac.
            frac (float or dict) : Sample fraction of labels. A dictionary returns
                the sample fraction to each label. Cannot be used with n.
            random_state (int) : Seed for the random number generator.
            replace (bool) : Sample with or without replacement. Default value is False.
            per_instance (bool): Whether to apply sampling to each group. Default is False.

        Returns:
            LabelTimes : Random sample of labels.

        Examples:
            Create a label times object.

            >>> entity = [0, 0, 1, 1]
            >>> labels = [True, False, True, False]
            >>> data = {'entity': entity, 'labels': labels}
            >>> lt = LabelTimes(data=data, target_entity='entity', target_columns=['labels'])
            >>> lt
               entity  labels
            0       0    True
            1       0   False
            2       1    True
            3       1   False

            Sample a number of the examples.

            >>> lt.sample(n=3, random_state=0)
               entity  labels
            1       0   False
            2       1    True
            3       1   False

            Sample a fraction of the examples.

            >>> lt.sample(frac=.25, random_state=0)
               entity  labels
            2       1    True

            Sample a number of the examples for specific labels.

            >>> n = {True: 1, False: 1}
            >>> lt.sample(n=n, random_state=0)
               entity  labels
            2       1    True
            3       1   False

            Sample a fraction of the examples for specific labels.

            >>> frac = {True: .5, False: .5}
            >>> lt.sample(frac=frac, random_state=0)
               entity  labels
            2       1    True
            3       1   False

            Sample a number of the examples from each entity group.

            >>> lt.sample(n={True: 1}, per_instance=True, random_state=0)
               entity  labels
            0       0    True
            2       1    True

            Sample a fraction of the examples from each entity group.

            >>> lt.sample(frac=.5, per_instance=True, random_state=0)
               entity  labels
            1       0   False
            3       1   False
        """  # noqa
        self._assert_single_target()

        settings = {
            'transform': 'sample',
            'n': n,
            'frac': frac,
            'random_state': random_state,
            'replace': replace,
            'per_instance': per_instance,
        }

        # ``n`` takes precedence; ``frac`` is only used when ``n`` is falsy.
        key, value = ('n', n) if n else ('frac', frac)
        assert value, "must set value for 'n' or 'frac'"

        # A dictionary means the quantity is given separately for each label.
        per_label = isinstance(value, dict)
        method = '_sample_per_label' if per_label else '_sample'

        def transform(lt):
            sampler = getattr(lt, method)
            return sampler(
                key=key,
                value=value,
                settings=settings,
                random_state=random_state,
                replace=replace,
            )

        if per_instance:
            groupby = self.groupby(self.target_entity, group_keys=False)
            sample = groupby.apply(transform)
        else:
            sample = transform(self)

        # Copy before recording the transform so the list of transforms is
        # not shared with the object that was sampled.
        sample = sample.copy()
        sample.sort_index(inplace=True)
        sample.transforms.append(settings)
        return sample

    def equals(self, other, **kwargs):
        """Determines if two label time objects are the same.

        Args:
            other (LabelTimes) : Other label time object for comparison.
            **kwargs: Keyword arguments to pass to underlying pandas.DataFrame.equals method

        Returns:
            bool : Whether label time objects are the same. Objects that are
                not label times are never the same, even when the data matches.
        """
        if not super().equals(other, **kwargs):
            return False

        # A plain data frame has no settings to compare against; report a
        # mismatch instead of failing on the missing attribute.
        if not isinstance(other, LabelTimes):
            return False

        return self.settings == other.settings

    def _save_settings(self, path):
        """Write the settings in json format to disk.

        The column data types are stored with the settings under ``dtypes``.

        Args:
            path (str) : Directory on disk to write to.
        """
        settings = self.settings
        settings['dtypes'] = self.dtypes.astype('str').to_dict()

        settings_path = os.path.join(path, 'settings.json')
        with open(settings_path, 'w', encoding='utf-8') as stream:
            json.dump(settings, stream)

    def to_csv(self, path, save_settings=True, **kwargs):
        """Write label times in csv format to disk.

        Args:
            path (str) : Location on disk to write to (will be created as a directory).
            save_settings (bool) : Whether to save the settings used to make the label times.
            **kwargs: Keyword arguments to pass to underlying pandas.DataFrame.to_csv method.
                The index is never written, so ``index`` cannot be passed here.
        """
        os.makedirs(path, exist_ok=True)
        file = os.path.join(path, 'data.csv')
        super().to_csv(file, index=False, **kwargs)

        if save_settings:
            self._save_settings(path)

    def to_parquet(self, path, save_settings=True, **kwargs):
        """Write label times in parquet format to disk.

        Args:
            path (str) : Location on disk to write to (will be created as a directory).
            save_settings (bool) : Whether to save the settings used to make the label times.
            **kwargs: Keyword arguments to pass to underlying pandas.DataFrame.to_parquet method.
                ``compression`` defaults to None and ``engine`` to ``'auto'``.
        """
        os.makedirs(path, exist_ok=True)
        file = os.path.join(path, 'data.parquet')

        # Applied as defaults so that a caller who passes either option no
        # longer triggers a duplicate keyword argument error.
        kwargs.setdefault('compression', None)
        kwargs.setdefault('engine', 'auto')
        super().to_parquet(file, **kwargs)

        if save_settings:
            self._save_settings(path)

    def to_pickle(self, path, save_settings=True, **kwargs):
        """Write label times in pickle format to disk.

        Args:
            path (str) : Location on disk to write to (will be created as a directory).
            save_settings (bool) : Whether to save the settings used to make the label times.
            **kwargs: Keyword arguments to pass to underlying pandas.DataFrame.to_pickle method
        """
        os.makedirs(path, exist_ok=True)
        file = os.path.join(path, 'data.pickle')
        super().to_pickle(file, **kwargs)

        if save_settings:
            self._save_settings(path)

    # ----------------------------------------
    # Subclassing Pandas Data Frame
    # ----------------------------------------

    # Attribute names that are carried over to the frames derived from this one.
    _metadata = [
        'search_settings',
        'target_columns',
        'target_entity',
        'target_types',
        'transforms',
    ]

    def __finalize__(self, other, method=None, **kwargs):
        """Propagate metadata from other label times data frames.

        Args:
            other (LabelTimes) : The label times from which to get the attributes from.
            method (str) : A passed method name for optionally taking different types of propagation actions based on this value.

        Returns:
            LabelTimes : This object, with the metadata attributes set.
        """
        if method != 'concat':
            return super().__finalize__(other=other, method=method, **kwargs)

        # For a concatenation, ``other`` holds the objects being joined; the
        # metadata is taken from the first one.
        first = other.objs[0]
        for key in self._metadata:
            setattr(self, key, getattr(first, key, None))
        return self

    @property
    def _constructor(self):
        """The class used by pandas to build the results of data frame operations."""
        return LabelTimes