Source code for pysubgroup.subgroup_description

"""
Created on 28.04.2016

@author: lemmerfn
"""
import copy
import weakref
from abc import ABC, abstractmethod
from functools import total_ordering
from itertools import chain

import numpy as np

import pysubgroup as ps


@total_ordering
class SelectorBase(ABC):
    """Abstract base for selectors that are interned by their ``repr``.

    ``__new__`` builds a temporary instance, lets ``set_descriptions`` fill in
    its descriptions, and returns an equal instance from the weak cache if one
    is already registered. Equality and ordering compare ``repr`` strings and
    ``__hash__`` returns ``self._hash``, so concrete subclasses have to provide
    ``__repr__`` and set ``_hash`` inside ``set_descriptions``.
    """

    # selector cache (weak references, so unused selectors can be collected)
    __refs__ = weakref.WeakSet()

    def __new__(cls, *args, **kwargs):
        """Ensures that each selector only exists once."""

        # create temporary selector
        tmp = super().__new__(cls)
        tmp.set_descriptions(*args, **kwargs)

        # save original arguments
        # NOTE: this is a fix for pickle
        #       so we can call `__getnewargs_ex__` with the right arguments
        # TODO: this may have unintended side effects if args,
        #       kwargs are large or volatile (I don't think we have that yet though)
        tmp.__new_args__ = args, kwargs

        # check if selector is already in cache (__refs__)
        # if so, return cached instance
        if tmp in SelectorBase.__refs__:
            for ref in SelectorBase.__refs__:
                if ref == tmp:
                    return ref
        # if not return
        return tmp

    def __getnewargs_ex__(self):  # pylint: disable=invalid-getnewargs-ex-returned
        """Return the ``(args, kwargs)`` pair that was passed to ``__new__``.

        The stored arguments are kept on the instance. Deleting them after the
        first call made every later pickle/copy of the same (cached) selector
        fail with ``AttributeError``.
        """
        return self.__new_args__

    def __init__(self):
        # add selector to cache
        # TODO: why not do this in `__new__`,
        # then it would be all together in one function?
        SelectorBase.__refs__.add(self)

    def __eq__(self, other):
        """Selectors are equal when their ``repr`` strings are equal."""
        if other is None:
            return False
        return repr(self) == repr(other)

    def __lt__(self, other):
        """Order selectors lexicographically by ``repr``."""
        return repr(self) < repr(other)

    def __hash__(self):
        return self._hash  # pylint: disable=no-member

    @abstractmethod
    def set_descriptions(self, *args, **kwargs):
        """Compute the descriptions of the selector from the constructor arguments.

        Called by ``__new__`` with the same arguments the constructor received.
        """
        pass  # pragma: no-cover


def get_cover_array_and_size(subgroup, data_len=None, data=None):
    """Return ``(cover_arr, size)`` for several kinds of subgroup description.

    Supported inputs, checked in this order:

    * objects with a ``representation`` attribute: returned unchanged, the
      size is read from ``subgroup.size_sg``;
    * ``slice`` objects: returned unchanged, the size is the number of indices
      the slice selects from ``data_len`` (or ``len(data)`` if ``data`` is a
      DataFrame and ``data_len`` is not given);
    * objects exposing ``__array_interface__``: returned unchanged; boolean
      arrays count their non-zero entries, signed/unsigned integer arrays use
      the length of their first axis;
    * anything else: ``subgroup.covers(data)`` is evaluated and its non-zero
      entries are counted.

    Raises:
        ValueError: for a slice when neither ``data_len`` nor a DataFrame is given.
        NotImplementedError: for arrays that are neither boolean nor integer.
    """
    if hasattr(subgroup, "representation"):
        return subgroup, subgroup.size_sg

    if isinstance(subgroup, slice):
        if data_len is None:
            if type(data).__name__ != "DataFrame":
                raise ValueError(
                    "if you pass a slice, you need to pass either data_len or data"
                )
            data_len = len(data)
        # https://stackoverflow.com/questions/36188429/retrieve-length-of-slice-from-slice-object-in-python
        return subgroup, len(range(*subgroup.indices(data_len)))

    if hasattr(subgroup, "__array_interface__"):
        # second character of the typestr is the basic type code
        type_char = subgroup.__array_interface__["typestr"][1]
        if type_char == "b":  # boolean indexing is used
            return subgroup, np.count_nonzero(subgroup)
        if type_char in ("u", "i"):  # integer indexing
            return subgroup, subgroup.__array_interface__["shape"][0]
        raise NotImplementedError(
            f"Currently a typechar of {type_char} is not supported."
        )

    assert type(data).__name__ == "DataFrame"
    cover_arr = subgroup.covers(data)
    return cover_arr, np.count_nonzero(cover_arr)


def get_size(subgroup, data_len=None, data=None):
    """Return the number of instances selected by ``subgroup``.

    Supported inputs, checked in this order:

    * objects with a ``representation`` attribute: ``subgroup.size_sg``;
    * ``slice`` objects: number of indices selected from ``data_len`` (or from
      ``len(data)`` if ``data`` is a DataFrame and ``data_len`` is not given);
    * objects exposing ``__array_interface__``: non-zero count for boolean
      arrays, length of the first axis for signed/unsigned integer arrays;
    * anything else: non-zero count of ``subgroup.covers(data)``.

    Raises:
        ValueError: for a slice when neither ``data_len`` nor a DataFrame is given.
        NotImplementedError: for arrays that are neither boolean nor integer.
    """
    if hasattr(subgroup, "representation"):
        return subgroup.size_sg

    if isinstance(subgroup, slice):
        if data_len is None:
            if type(data).__name__ != "DataFrame":
                raise ValueError(
                    "if you pass a slice, you need to pass either data_len or data"
                )
            data_len = len(data)
        # https://stackoverflow.com/questions/36188429/retrieve-length-of-slice-from-slice-object-in-python
        return len(range(*subgroup.indices(data_len)))

    if hasattr(subgroup, "__array_interface__"):
        # second character of the typestr is the basic type code
        type_char = subgroup.__array_interface__["typestr"][1]
        if type_char == "b":  # boolean indexing is used
            return np.count_nonzero(subgroup)
        if type_char in ("u", "i"):  # integer indexing
            return subgroup.__array_interface__["shape"][0]
        raise NotImplementedError(
            f"Currently a typechar of {type_char} is not supported."
        )

    assert type(data).__name__ == "DataFrame"
    return np.count_nonzero(subgroup.covers(data))


class EqualitySelector(SelectorBase):
    """Selector that covers the rows where an attribute equals a given value.

    A NaN value selects the rows where the attribute is null (see ``covers``).
    """

    def __init__(self, attribute_name, attribute_value, selector_name=None):
        if attribute_name is None:
            raise TypeError("attribute_name must not be None")
        if attribute_value is None:
            raise TypeError("attribute_value must not be None")

        # TODO: this is redundant due to `__new__` and `set_descriptions`
        self._attribute_name = attribute_name
        self._attribute_value = attribute_value
        self._selector_name = selector_name
        self.set_descriptions(
            self._attribute_name, self._attribute_value, self._selector_name
        )

        super().__init__()

    @property
    def attribute_name(self):
        return self._attribute_name

    @property
    def attribute_value(self):
        return self._attribute_value

    def set_descriptions(
        self, attribute_name, attribute_value, selector_name=None
    ):  # pylint: disable=arguments-differ
        """Store hash, query string and display string for the given arguments."""
        self._hash, self._query, self._string = EqualitySelector.compute_descriptions(
            attribute_name, attribute_value, selector_name=selector_name
        )

    @staticmethod
    def _is_nan(value):
        """Return True if ``np.isnan`` accepts ``value`` and reports NaN.

        Values ``np.isnan`` cannot handle (it raises ``TypeError``) are simply
        not NaN; they must not make selector construction crash.
        """
        try:
            return bool(np.isnan(value))
        except TypeError:
            return False

    @classmethod
    def compute_descriptions(cls, attribute_name, attribute_value, selector_name):
        """Return ``(hash, query, string)`` describing the selector.

        ``query`` is what ``__repr__`` returns; ``string`` is what ``__str__``
        returns and equals ``selector_name`` when one is given. The hash is the
        hash of the query.
        """
        name = str(attribute_name)
        if isinstance(attribute_value, (str, bytes)):
            query = name + "==" + "'" + str(attribute_value) + "'"
        elif attribute_value is None:
            query = name + " is None"
        elif cls._is_nan(attribute_value):
            # str() keeps this branch working for non-string attribute names
            query = name + ".isnull()"
        else:
            query = name + "==" + str(attribute_value)
        if selector_name is not None:
            string_ = selector_name
        else:
            string_ = query
        hash_value = hash(query)
        return (hash_value, query, string_)

    def __repr__(self):
        return self._query

    def covers(self, data):
        """Return a boolean array marking the rows of ``data`` that match.

        ``data[attribute_name]`` must offer ``to_numpy()``. A null attribute
        value matches null entries; otherwise entries are compared with ``==``.
        """
        import pandas as pd  # pylint: disable=import-outside-toplevel

        row = data[self.attribute_name].to_numpy()
        if pd.isnull(self.attribute_value):
            return pd.isnull(row)
        return row == self.attribute_value

    def __str__(self, open_brackets="", closing_brackets=""):
        return open_brackets + self._string + closing_brackets

    @property
    def selectors(self):
        return (self,)

    @staticmethod
    def from_str(s):
        """Parse a string of the form ``name==value`` into an EqualitySelector.

        Values in single quotes are kept as strings (``'b'...''`` as bytes), so
        that ``a=='1'`` round-trips to the string ``"1"``. Unquoted values are
        converted to ``int`` or ``float`` when possible and otherwise kept as
        strings. Only the first ``==`` separates name and value, so quoted
        values may contain ``==`` themselves.

        Raises:
            ValueError: if ``s`` contains no ``==`` or the value is empty.
        """
        s = s.strip()
        attribute_name, separator, attribute_value = s.partition("==")
        if not separator or not attribute_value:
            raise ValueError(f"string {s} could not be converted to EqualitySelector")

        quoted = (
            len(attribute_value) >= 2
            and attribute_value[0] == "'"
            and attribute_value[-1] == "'"
        )
        if quoted:
            if attribute_value.startswith("'b'") and attribute_value.endswith("''"):
                attribute_value = str.encode(attribute_value[3:-2])
            else:
                attribute_value = attribute_value[1:-1]
        else:
            # only unquoted values may be numbers
            try:
                attribute_value = int(attribute_value)
            except ValueError:
                try:
                    attribute_value = float(attribute_value)
                except ValueError:
                    pass
        return EqualitySelector(attribute_name, attribute_value)


class NegatedSelector(SelectorBase):
    """Selector that covers exactly the rows its wrapped selector does not cover."""

    def __init__(self, selector):
        # TODO: this is redundant due to `__new__` and `set_descriptions`
        self._selector = selector
        self.set_descriptions(selector)

        super().__init__()

    def covers(self, data_instance):
        """Return the element-wise negation of the wrapped selector's cover."""
        return np.logical_not(self._selector.covers(data_instance))

    def __repr__(self):
        return self._query

    def __str__(self, open_brackets="", closing_brackets=""):
        # the brackets are forwarded to the wrapped selector's ``__str__``
        return "NOT " + self._selector.__str__(open_brackets, closing_brackets)

    def set_descriptions(self, selector):  # pylint: disable=arguments-differ
        """Derive query string and hash from the wrapped selector's ``repr``."""
        self._query = "(not " + repr(selector) + ")"
        # repr(self) returns the query that was just set
        self._hash = hash(repr(self))

    @property
    def attribute_name(self):
        return self._selector.attribute_name

    @property
    def selectors(self):
        return (self,)


# Including the lower bound, excluding the upper_bound
class IntervalSelector(SelectorBase):
    """Selector covering rows with ``lower_bound <= value < upper_bound``."""

    def __init__(self, attribute_name, lower_bound, upper_bound, selector_name=None):
        assert lower_bound < upper_bound
        # TODO: this is redundant due to `__new__` and `set_descriptions`
        self._attribute_name = attribute_name
        self._lower_bound = lower_bound
        self._upper_bound = upper_bound
        self.selector_name = selector_name
        self.set_descriptions(attribute_name, lower_bound, upper_bound, selector_name)

        super().__init__()

    @property
    def attribute_name(self):
        return self._attribute_name

    @property
    def lower_bound(self):
        return self._lower_bound

    @property
    def upper_bound(self):
        return self._upper_bound

    def covers(self, data_instance):
        """Return a boolean array marking the values inside the interval."""
        val = data_instance[self.attribute_name].to_numpy()
        return np.logical_and((val >= self.lower_bound), (val < self.upper_bound))

    def __repr__(self):
        return self._query

    def __hash__(self):
        return self._hash

    def __str__(self, open_brackets="", closing_brackets=""):
        # Same signature as the other selectors, so that callers which forward
        # brackets (e.g. NegatedSelector.__str__) also work for intervals.
        return open_brackets + self._string + closing_brackets

    @classmethod
    def compute_descriptions(
        cls, attribute_name, lower_bound, upper_bound, selector_name=None
    ):
        """Return ``(hash, query, string)`` describing the selector.

        ``query`` uses the bounds unrounded and is what ``__repr__`` returns.
        ``string`` is ``selector_name`` if given, otherwise the description
        with fractional bounds formatted to two decimals.
        """
        if selector_name is None:
            _string = cls.compute_string(
                attribute_name, lower_bound, upper_bound, rounding_digits=2
            )
        else:
            _string = selector_name
        _query = cls.compute_string(
            attribute_name, lower_bound, upper_bound, rounding_digits=None
        )
        _hash = hash(_query)
        return (_hash, _query, _string)

    def set_descriptions(
        self, attribute_name, lower_bound, upper_bound, selector_name=None
    ):  # pylint: disable=arguments-differ
        """Store hash, query string and display string for the given arguments."""
        self._hash, self._query, self._string = IntervalSelector.compute_descriptions(
            attribute_name, lower_bound, upper_bound, selector_name=selector_name
        )

    @classmethod
    def compute_string(cls, attribute_name, lower_bound, upper_bound, rounding_digits):
        """Build the textual description of the interval.

        Bounds with a fractional part are formatted with ``rounding_digits``
        decimals (or with plain ``str`` formatting when it is ``None``);
        other bounds are printed via ``str``. Infinite bounds produce the
        one-sided forms ``name<ub`` / ``name>=lb`` or ``name = anything``.
        """
        if rounding_digits is None:
            formatter = "{}"
        else:
            formatter = "{0:." + str(rounding_digits) + "f}"
        ub = upper_bound
        lb = lower_bound
        # ``x % 1`` is truthy only if x has a fractional part (or is inf/nan)
        if ub % 1:
            ub = formatter.format(ub)
        if lb % 1:
            lb = formatter.format(lb)

        if lower_bound == float("-inf") and upper_bound == float("inf"):
            repre = attribute_name + " = anything"
        elif lower_bound == float("-inf"):
            repre = attribute_name + "<" + str(ub)
        elif upper_bound == float("inf"):
            repre = attribute_name + ">=" + str(lb)
        else:
            repre = attribute_name + ": [" + str(lb) + ":" + str(ub) + "["
        return repre

    @staticmethod
    def _parse_bound(text):
        """Convert ``text`` to ``int`` if possible, otherwise to ``float``."""
        try:
            return int(text)
        except ValueError:
            return float(text)

    @staticmethod
    def from_str(s):
        """Parse a string produced by ``compute_string`` into an IntervalSelector.

        Each bound is parsed on its own, so an integer bound stays an integer
        even if the other bound is fractional (``a: [1:2.5[`` gives ``1`` and
        ``2.5``), which keeps the ``repr`` of the parsed selector identical.

        Raises:
            ValueError: if the string has none of the known forms or a bound
                is not a number.
        """
        s = s.strip()
        if s.endswith(" = anything"):
            return IntervalSelector(
                s[: -len(" = anything")], float("-inf"), float("+inf")
            )
        if "<" in s:
            attribute_name, ub = s.split("<")
            return IntervalSelector(
                attribute_name.strip(),
                float("-inf"),
                IntervalSelector._parse_bound(ub),
            )
        if ">=" in s:
            attribute_name, lb = s.split(">=")
            return IntervalSelector(
                attribute_name.strip(),
                IntervalSelector._parse_bound(lb),
                float("inf"),
            )
        if s.count(":") == 2:
            attribute_name, lb, ub = s.split(":")
            # drop the opening "[" of the lower and the closing "[" of the upper bound
            lb = lb.strip()[1:]
            ub = ub.strip()[:-1]
            return IntervalSelector(
                attribute_name.strip(),
                IntervalSelector._parse_bound(lb),
                IntervalSelector._parse_bound(ub),
            )
        raise ValueError(f"string {s} could not be converted to IntervalSelector")

    @property
    def selectors(self):
        return (self,)


def create_selectors(data, nbins=5, intervals_only=True, ignore=None):
    """Create nominal and numeric selectors for the columns of ``data``.

    The nominal selectors come first in the returned list, followed by the
    numeric ones. Column names listed in ``ignore`` are skipped; ``nbins`` and
    ``intervals_only`` are forwarded to ``create_numeric_selectors``.
    """
    if ignore is None:
        ignore = []
    sels = create_nominal_selectors(data, ignore)
    sels.extend(create_numeric_selectors(data, nbins, intervals_only, ignore=ignore))
    return sels


def create_nominal_selectors(data, ignore=None):
    """Create selectors for every non-numeric column of ``data``.

    Columns are chosen with ``data.select_dtypes(exclude=["number"])``; column
    names contained in ``ignore`` are skipped.
    """
    if ignore is None:
        ignore = []
    nominal_columns = data.select_dtypes(exclude=["number"]).columns.values
    # looked up once and shared by all per-attribute calls
    dtypes = data.dtypes
    nominal_selectors = []
    for attr_name in nominal_columns:
        if attr_name in ignore:
            continue
        nominal_selectors.extend(
            create_nominal_selectors_for_attribute(data, attr_name, dtypes)
        )
    return nominal_selectors


def create_nominal_selectors_for_attribute(data, attribute_name, dtypes=None):
    """Create one EqualitySelector per unique value of ``data[attribute_name]``.

    If the column's dtype (taken from ``dtypes``, or from ``data.dtypes`` when
    not given) equals ``"bool"``, every created selector gets the attribute
    ``is_bool = True``.
    """
    import pandas as pd  # pylint: disable=import-outside-toplevel

    nominal_selectors = [
        EqualitySelector(attribute_name, val)
        for val in pd.unique(data[attribute_name])
    ]
    # setting the is_bool flag for selector
    if dtypes is None:
        dtypes = data.dtypes
    if dtypes[attribute_name] == "bool":
        for s in nominal_selectors:
            s.is_bool = True
    return nominal_selectors


def create_numeric_selectors(
    data, nbins=5, intervals_only=True, weighting_attribute=None, ignore=None
):
    """Create selectors for every numeric column of ``data``.

    Columns are chosen with ``data.select_dtypes(include=["number"])``; column
    names contained in ``ignore`` are skipped. The remaining arguments are
    forwarded to ``create_numeric_selectors_for_attribute``.
    """
    if ignore is None:
        ignore = []  # pragma: no cover
    numeric_columns = data.select_dtypes(include=["number"]).columns.values
    numeric_selectors = []
    for attr_name in numeric_columns:
        if attr_name in ignore:
            continue
        numeric_selectors.extend(
            create_numeric_selectors_for_attribute(
                data, attr_name, nbins, intervals_only, weighting_attribute
            )
        )
    return numeric_selectors


def create_numeric_selectors_for_attribute(
    data, attr_name, nbins=5, intervals_only=True, weighting_attribute=None
):
    """Create selectors for one numeric column of ``data``.

    * If the column contains nulls, a selector for ``attr_name == NaN`` is added.
    * With at most ``nbins`` distinct non-null values, one EqualitySelector
      per value is created.
    * Otherwise cut points are requested from
      ``ps.equal_frequency_discretization``. With ``intervals_only`` they are
      turned into consecutive intervals from ``-inf`` to ``+inf``; without it,
      each cut point ``c`` yields the two open-ended intervals ``[c, inf)``
      and ``[-inf, c)``.
    """
    numeric_selectors = []
    data_not_null = data[data[attr_name].notnull()]

    unique_values = np.unique(data_not_null[attr_name])
    # fewer non-null rows than rows overall means the column contains nulls
    if len(data_not_null.index) < len(data.index):
        numeric_selectors.append(EqualitySelector(attr_name, np.nan))

    if len(unique_values) <= nbins:
        numeric_selectors.extend(
            EqualitySelector(attr_name, val) for val in unique_values
        )
        return numeric_selectors

    cutpoints = ps.equal_frequency_discretization(
        data, attr_name, nbins, weighting_attribute
    )
    if intervals_only:
        old_cutpoint = float("-inf")
        for c in cutpoints:
            numeric_selectors.append(IntervalSelector(attr_name, old_cutpoint, c))
            old_cutpoint = c
        numeric_selectors.append(
            IntervalSelector(attr_name, old_cutpoint, float("inf"))
        )
    else:
        for c in cutpoints:
            numeric_selectors.append(IntervalSelector(attr_name, c, float("inf")))
            numeric_selectors.append(IntervalSelector(attr_name, float("-inf"), c))

    return numeric_selectors


def remove_target_attributes(selectors, target):
    """Return the selectors whose attribute is not one of the target's attributes.

    ``target.get_attributes()`` is evaluated once instead of once per selector.
    """
    target_attributes = target.get_attributes()
    return [sel for sel in selectors if sel.attribute_name not in target_attributes]


##############
# Boolean expressions
##############
class BooleanExpressionBase(ABC):
    """Abstract base for boolean expressions that support ``|`` and ``&``.

    Both operators work on a copy (``copy.copy``, i.e. the subclass's
    ``__copy__``) and delegate to ``append_or`` / ``append_and``; the left
    operand object itself is not rebound.
    """

    def __or__(self, other):
        tmp = copy.copy(self)
        tmp.append_or(other)
        return tmp

    def __and__(self, other):
        tmp = copy.copy(self)
        tmp.append_and(other)
        return tmp

    @abstractmethod
    def append_and(self, to_append):
        """Combine ``to_append`` with this expression by AND, in place."""
        pass

    @abstractmethod
    def append_or(self, to_append):
        """Combine ``to_append`` with this expression by OR, in place."""
        pass

    @abstractmethod
    def __copy__(self):
        """Return the copy that ``|`` and ``&`` modify."""
        pass


@total_ordering
class Conjunction(BooleanExpressionBase):
    """Conjunction (logical AND) of selectors.

    ``repr`` and hash are computed lazily and cached; every method of this
    class that changes the selector list resets the cache.
    """

    def __init__(self, selectors):
        self._repr = None
        self._hash = None
        try:
            self._selectors = list(iter(selectors))
        except TypeError:
            # a single, non-iterable selector
            self._selectors = [selectors]

    def covers(self, instance):
        """Return a boolean array marking the rows covered by all selectors."""
        # empty description ==> return a list of all '1's
        if not self._selectors:
            return np.full(len(instance), True, dtype=bool)
        # non-empty description
        return np.all([sel.covers(instance) for sel in self._selectors], axis=0)

    def __len__(self):
        return len(self._selectors)

    def __str__(self, open_brackets="", closing_brackets="", and_term=" AND "):
        if not self._selectors:
            return "Dataset"
        attrs = sorted(str(sel) for sel in self._selectors)
        return "".join((open_brackets, and_term.join(attrs), closing_brackets))

    def __repr__(self):
        if self._repr is None:
            self._repr = self._compute_repr()
        return self._repr

    def __eq__(self, other):
        return repr(self) == repr(other)

    def __lt__(self, other):
        return repr(self) < repr(other)

    def __hash__(self):
        if self._hash is None:
            self._hash = self._compute_hash()
        return self._hash

    def _compute_repr(self):
        if not self._selectors:
            return "True"
        # sorted, so the order in which selectors were added does not matter
        reprs = sorted(repr(sel) for sel in self._selectors)
        return "".join(("(", " and ".join(reprs), ")"))

    def _compute_hash(self):
        return hash(repr(self))

    def _invalidate_representations(self):
        self._repr = None
        self._hash = None

    def append_and(self, to_append):
        """Add a selector, the selectors of a Conjunction, or an iterable of selectors."""
        if isinstance(to_append, SelectorBase):
            self._selectors.append(to_append)
        elif isinstance(to_append, Conjunction):
            self._selectors.extend(to_append.selectors)
        else:
            self._selectors.extend(to_append)
        self._invalidate_representations()

    def append_or(self, to_append):
        raise RuntimeError(
            "Or operations are not supported by a pure Conjunction. Consider using DNF."
        )

    def pop_and(self):
        """Remove and return the selector that was added last.

        The cached repr/hash are reset as well; otherwise they would still
        describe the conjunction including the removed selector.
        """
        popped = self._selectors.pop()
        self._invalidate_representations()
        return popped

    def pop_or(self):
        raise RuntimeError(
            "Or operations are not supported by a pure Conjunction. Consider using DNF."
        )

    def __copy__(self):
        cls = self.__class__
        result = cls.__new__(cls)
        result.__dict__.update(self.__dict__)
        # own list, so appending to the copy leaves the original untouched
        result._selectors = list(self._selectors)
        return result

    @property
    def depth(self):
        return len(self._selectors)

    @property
    def selectors(self):
        return tuple(chain.from_iterable(sel.selectors for sel in self._selectors))

    @staticmethod
    def from_str(s):
        """Parse a string of selector descriptions joined by ``" AND "``.

        ``"Dataset"`` gives the empty conjunction. Parts containing ``==`` are
        parsed as EqualitySelector, all other parts as IntervalSelector.
        """
        if s.strip() == "Dataset":
            return Conjunction([])
        selectors = []
        for selector_string in s.split(" AND "):
            selector_string = selector_string.strip()
            if "==" in selector_string:
                selectors.append(EqualitySelector.from_str(selector_string))
            else:
                selectors.append(IntervalSelector.from_str(selector_string))
        return Conjunction(selectors)


@total_ordering
class Disjunction(BooleanExpressionBase):
    """Disjunction (logical OR) of selectors or other expressions."""

    def __init__(self, selectors=None):
        if isinstance(selectors, (list, tuple)):
            # Always keep an own list: a tuple has no ``extend``/``append`` (so
            # ``append_or`` would fail) and a shared list would let
            # ``append_or`` modify the caller's list.
            self._selectors = list(selectors)
        elif selectors is None:
            self._selectors = []
        else:
            self._selectors = [selectors]

    def covers(self, instance):
        """Return a boolean array marking the rows covered by any selector."""
        # empty description ==> nothing is covered (all False)
        if not self._selectors:
            return np.full(len(instance), False, dtype=bool)
        # non-empty description
        return np.any([sel.covers(instance) for sel in self._selectors], axis=0)

    def __len__(self):
        return len(self._selectors)

    def __str__(self, open_brackets="", closing_brackets="", or_term=" OR "):
        if not self._selectors:
            return "Empty"  # pragma: no cover
        attrs = sorted(str(sel) for sel in self._selectors)
        return "".join((open_brackets, or_term.join(attrs), closing_brackets))

    def __repr__(self):
        if not self._selectors:
            # NOTE(review): "True" disagrees with ``covers``, which returns all
            # False for an empty disjunction. Kept as is because equality and
            # hashing are derived from this string - confirm before changing.
            return "True"
        reprs = sorted(repr(sel) for sel in self._selectors)
        return "".join(("(", " or ".join(reprs), ")"))

    def __eq__(self, other):
        return repr(self) == repr(other)

    def __lt__(self, other):
        return repr(self) < repr(other)

    def __hash__(self):
        return hash(repr(self))

    def append_and(self, to_append):
        raise RuntimeError(
            "And operations are not supported by a pure Disjunction. "
            "Consider using DNF."
        )

    def append_or(self, to_append):
        """Add the selectors of a Disjunction, an iterable of parts, or a single part."""
        if isinstance(to_append, Disjunction):
            self._selectors.extend(to_append.selectors)
            return
        try:
            self._selectors.extend(to_append)
        except TypeError:
            # not iterable: treat it as a single part
            self._selectors.append(to_append)

    def __copy__(self):
        cls = self.__class__
        result = cls.__new__(cls)
        result.__dict__.update(self.__dict__)
        # shallow copy of the list; the parts themselves are shared
        result._selectors = copy.copy(self._selectors)
        return result

    @property
    def selectors(self):
        return tuple(chain.from_iterable(sel.selectors for sel in self._selectors))


class DNF(Disjunction):
    """Disjunctive normal form: a disjunction whose parts are Conjunctions."""

    def __init__(self, selectors=None):
        if selectors is None:
            selectors = []
        super().__init__([])
        self.append_or(selectors)

    @staticmethod
    def _ensure_pure_conjunction(to_append):
        """Return ``to_append`` as a Conjunction.

        Conjunctions are returned as they are, a single selector is wrapped,
        and an iterable of selectors becomes one Conjunction. The iterable is
        materialised first, so a generator is not used up by the type check
        before the Conjunction is built from it.

        Raises:
            TypeError: if ``to_append`` is neither a selector nor iterable.
            ValueError: if the iterable contains something that is no selector.
        """
        if isinstance(to_append, Conjunction):
            return to_append
        if isinstance(to_append, SelectorBase):
            return Conjunction(to_append)
        members = list(to_append)
        if all(isinstance(sel, SelectorBase) for sel in members):
            return Conjunction(members)
        raise ValueError(
            "DNFs only accept an iterable of Selectors"
        )  # pragma: no cover

    @staticmethod
    def _disjuncts(disjunction):
        """Yield the parts of ``disjunction``, flattening nested disjunctions.

        Conjunctions are yielded as copies: their structure is kept, and the
        receiving DNF cannot modify the conjunctions of the source later on.
        """
        for part in disjunction._selectors:
            if isinstance(part, Disjunction):
                yield from DNF._disjuncts(part)
            elif isinstance(part, Conjunction):
                yield copy.copy(part)
            else:
                yield part

    def append_or(self, to_append):
        """Add one or more conjunctions to the DNF.

        ``to_append`` may be a Disjunction/DNF (its parts are added, with
        conjunctions kept intact), an iterable whose items are each turned
        into a Conjunction, or a single Conjunction/selector.
        """
        if isinstance(to_append, Disjunction):
            # ``to_append.selectors`` would flatten (a AND b) into a, b
            to_append = list(DNF._disjuncts(to_append))
        try:
            it = iter(to_append)
            conjunctions = [DNF._ensure_pure_conjunction(part) for part in it]
        except TypeError:
            conjunctions = DNF._ensure_pure_conjunction(to_append)
        super().append_or(conjunctions)

    def append_and(self, to_append):
        """AND ``to_append`` onto every conjunction of the DNF, in place.

        If the DNF is empty, ``to_append`` becomes its only conjunction.
        """
        if isinstance(to_append, Disjunction):
            raise NotImplementedError(
                "Appending a disjunction to DNF is not implemented"
            )
        conj = DNF._ensure_pure_conjunction(to_append)
        if len(self._selectors) > 0:
            for conjunction in self._selectors:
                conjunction.append_and(conj)
        else:
            self._selectors.append(conj)

    def pop_and(self):
        """Pop the last selector of every conjunction and return it.

        All conjunctions have to yield the same selector. If they do not, the
        popped selectors are appended again and a RuntimeError is raised.
        """
        out_list = [s.pop_and() for s in self._selectors]
        return_val = out_list[0]
        if all(x == return_val for x in out_list):
            return return_val

        # restore the conjunctions before reporting the failure
        for to_append, conj in zip(out_list, self._selectors):
            conj.append_and(to_append)

        raise RuntimeError("pop_and failed as the result was inconsistent")

    def __copy__(self):
        # ``append_and`` modifies the conjunctions in place, so a copy needs
        # its own conjunction objects; otherwise ``dnf & x`` would also change
        # ``dnf`` itself.
        result = super().__copy__()
        result._selectors = [copy.copy(conj) for conj in self._selectors]
        return result
Source code for pysubgroup.subgroup_description

pysubgroup

Navigation

Related Topics