Source code for pysubgroup.model_target

from collections import namedtuple

import numpy as np

import pysubgroup as ps

# Define a named tuple to store regression parameters and subgroup size
beta_tuple = namedtuple("beta_tuple", ["beta", "size_sg"])



[docs]
class EMM_Likelihood(ps.AbstractInterestingnessMeasure):
    """Exceptional Model Mining likelihood-based interestingness measure.

    This class computes the difference in likelihoods between a subgroup model
    and the inverse (complement) model, providing a measure of how exceptional
    the subgroup is with respect to the entire dataset.
    """

    # Define a named tuple to store model parameters and likelihoods
    tpl = namedtuple(
        "EMM_Likelihood",
        ["model_params", "subgroup_likelihood", "inverse_likelihood", "size"],
    )

    def __init__(self, model):
        """Initialize the EMM_Likelihood measure with a given model.

        Parameters:
            model: An instance of a model class that provides fit and likelihood
                   methods.
        """
        self.model = model
        self.has_constant_statistics = False
        self.required_stat_attrs = EMM_Likelihood.tpl._fields
        self.data_size = None


[docs]
    def calculate_constant_statistics(self, data, target):
        """Calculate statistics that remain constant over all subgroups.

        Parameters:
            data: The dataset as a pandas DataFrame.
            target: The target variable (unused in this context).
        """
        self.model.calculate_constant_statistics(data, target)
        self.data_size = len(data)
        self.has_constant_statistics = True



[docs]
    def calculate_statistics(
        self, subgroup, target, data, statistics=None
    ):  # pylint: disable=unused-argument
        """Calculate statistics specific to a subgroup.

        Parameters:
            subgroup: The subgroup description.
            target: The target variable (unused in this context).
            data: The dataset as a pandas DataFrame.
            statistics: Previously calculated statistics (optional).

        Returns:
            An EMM_Likelihood.tpl namedtuple containing model parameters,
            subgroup likelihood, inverse likelihood, and subgroup size.
        """
        cover_arr, sg_size = ps.get_cover_array_and_size(subgroup, self.data_size, data)
        params = self.model.fit(cover_arr, data)
        return self.get_tuple(sg_size, params, cover_arr)



[docs]
    def get_tuple(self, sg_size, params, cover_arr):
        """Compute the likelihoods for the subgroup and its complement.

        Parameters:
            sg_size: Size of the subgroup.
            params: Model parameters obtained from fitting the subgroup.
            cover_arr: Boolean array indicating the instances in the subgroup.

        Returns:
            An EMM_Likelihood.tpl namedtuple with the computed statistics.
        """
        # Compute likelihoods for all data instances
        all_likelihood = self.model.likelihood(
            params, np.ones(self.data_size, dtype=bool)
        )
        # Sum of likelihoods for subgroup instances
        sg_likelihood_sum = np.sum(all_likelihood[cover_arr])
        # Sum of likelihoods for all instances
        total_likelihood_sum = np.sum(all_likelihood)
        # Compute average likelihood for the complement (inverse) subgroup
        dataset_average = np.nan
        if (self.data_size - sg_size) > 0:
            dataset_average = (total_likelihood_sum - sg_likelihood_sum) / (
                self.data_size - sg_size
            )
        # Compute average likelihood for the subgroup
        sg_average = np.nan
        if sg_size > 0:
            sg_average = sg_likelihood_sum / sg_size
        return EMM_Likelihood.tpl(params, sg_average, dataset_average, sg_size)



[docs]
    def evaluate(self, subgroup, target, data, statistics=None):
        """Evaluate the interestingness of a subgroup.

        Parameters:
            subgroup: The subgroup description.
            target: The target variable (unused in this context).
            data: The dataset as a pandas DataFrame.
            statistics: Previously calculated statistics (optional).

        Returns:
            The difference between subgroup likelihood and inverse likelihood.
        """
        statistics = self.ensure_statistics(subgroup, target, data, statistics)
        return statistics.subgroup_likelihood - statistics.inverse_likelihood



[docs]
    def gp_get_params(self, cover_arr, v):
        """Get parameters for GP-Growth algorithm.

        Parameters:
            cover_arr: Boolean array indicating the instances in the subgroup.
            v: Statistics vector from GP-Growth.

        Returns:
            An EMM_Likelihood.tpl namedtuple with the computed statistics.
        """
        params = self.model.gp_get_params(v)
        sg_size = params.size_sg
        return self.get_tuple(sg_size, params, cover_arr)


    @property
    def gp_requires_cover_arr(self):
        """Indicate whether the GP-Growth algorithm requires a cover array.

        Returns:
            True, since the cover array is required.
        """
        return True

    def __getattr__(self, name):
        """Delegate attribute access to the underlying model.

        Parameters:
            name: Name of the attribute.

        Returns:
            The attribute from the model if it exists.
        """
        return getattr(self.model, name)




[docs]
class PolyRegression_ModelClass:
    """Polynomial Regression Model Class for Exceptional Model Mining.

    Provides methods to fit a polynomial regression model to a subgroup and
    compute likelihoods for Exceptional Model Mining.
    """

    def __init__(self, x_name="x", y_name="y", degree=1):
        """Initialize the Polynomial Regression Model.

        Parameters:
            x_name (str): Name of the independent variable in the data.
            y_name (str): Name of the dependent variable in the data.
            degree (int): Degree of the polynomial (currently only degree=1 is
                          supported).
        Raises:
            ValueError: If degree is not equal to 1.
        """
        self.x_name = x_name
        self.y_name = y_name
        if degree != 1:
            raise ValueError("Currently only degree == 1 is supported")
        self.degree = degree
        self.x = None
        self.y = None
        self.has_constant_statistics = True
        super().__init__()


[docs]
    def calculate_constant_statistics(
        self, data, target
    ):  # pylint: disable=unused-argument
        """Calculate statistics that remain constant over all subgroups.

        Parameters:
            data: The dataset as a pandas DataFrame.
            target: The target variable (unused in this context).
        """
        self.x = data[self.x_name].to_numpy()
        self.y = data[self.y_name].to_numpy()
        self.has_constant_statistics = True



[docs]
    @staticmethod
    def gp_merge(u, v):
        """Merge two statistics vectors for the GP-Growth algorithm.

        Parameters:
            u (numpy.ndarray): Left statistics vector.
            v (numpy.ndarray): Right statistics vector.
        """
        v0 = v[0]
        u0 = u[0]
        if v0 == 0 or u0 == 0:
            d = 0
        else:
            d = v0 * u0 / (v0 + u0) * (v[1] / v0 - u[1] / u0) * (v[2] / v0 - u[2] / u0)
        u += v
        u[3] += d



[docs]
    def gp_get_null_vector(self):
        """Get a null vector for initialization in the GP-Growth algorithm.

        Returns:
            numpy.ndarray: Zero-initialized array of size 5.
        """
        return np.zeros(5)



[docs]
    def gp_get_stats(self, row_index):
        """Get statistics for a single row (used in GP-Growth algorithm).

        Parameters:
            row_index (int): Index of the row in the dataset.

        Returns:
            numpy.ndarray: Statistics vector for the given row.
        """
        x = self.x[row_index]
        return np.array([1, x, self.y[row_index], 0, x * x])



[docs]
    def gp_get_params(self, v):
        """Extract model parameters from the statistics vector.

        Parameters:
            v (numpy.ndarray): Statistics vector.

        Returns:
            beta_tuple: Contains regression coefficients and subgroup size.
        """
        size = v[0]
        if size < self.degree:
            return beta_tuple(np.full(self.degree + 1, np.nan), size)
        v1 = v[1]
        # Compute slope and intercept for linear regression
        slope = v[0] * v[3] / (v[0] * v[4] - v1 * v1)
        intercept = v[2] / v[0] - slope * v[1] / v[0]
        return beta_tuple(np.array([slope, intercept]), v[0])



[docs]
    def gp_to_str(self, stats):
        """Convert statistics to a string representation.

        Parameters:
            stats (numpy.ndarray): Statistics vector.

        Returns:
            str: String representation of the statistics.
        """
        return " ".join(map(str, stats))



[docs]
    def gp_size_sg(self, stats):
        """Get the size of the subgroup from the statistics.

        Parameters:
            stats (numpy.ndarray): Statistics vector.

        Returns:
            float: Size of the subgroup.
        """
        return stats[0]


    @property
    def gp_requires_cover_arr(self):
        """Indicate whether the GP-Growth algorithm requires a cover array.

        Returns:
            False, since the cover array is not required.
        """
        return False


[docs]
    def fit(self, subgroup, data=None):
        """Fit the polynomial regression model to the subgroup data.

        Parameters:
            subgroup: The subgroup description.
            data: The dataset as a pandas DataFrame (optional).

        Returns:
            beta_tuple: Contains regression coefficients and subgroup size.
        """
        cover_arr, size = ps.get_cover_array_and_size(subgroup, len(self.x), data)
        if size <= self.degree + 1:
            return beta_tuple(np.full(self.degree + 1, np.nan), size)
        # Fit polynomial regression model to subgroup data
        return beta_tuple(
            np.polyfit(self.x[cover_arr], self.y[cover_arr], deg=self.degree), size
        )



[docs]
    def likelihood(self, stats, sg):
        """Compute the likelihoods for the subgroup instances.

        Parameters:
            stats (beta_tuple): Regression parameters and subgroup size.
            sg (numpy.ndarray): Boolean array indicating subgroup instances.

        Returns:
            numpy.ndarray: Likelihood values for the subgroup instances.
        """
        from scipy.stats import norm  # pylint: disable=import-outside-toplevel

        if any(np.isnan(stats.beta)):
            return np.full(self.x[sg].shape, np.nan)
        # Compute the residuals and evaluate the normal probability density function
        residuals = np.polyval(stats.beta, self.x[sg]) - self.y[sg]
        return norm.pdf(residuals)



[docs]
    def loglikelihood(self, stats, sg):
        """Compute the log-likelihoods for the subgroup instances.

        Parameters:
            stats (beta_tuple): Regression parameters and subgroup size.
            sg (numpy.ndarray): Boolean array indicating subgroup instances.

        Returns:
            numpy.ndarray: Log-likelihood values for the subgroup instances.
        """
        from scipy.stats import norm  # pylint: disable=import-outside-toplevel

        # Compute the residuals and evaluate the normal log-probability density function
        residuals = np.polyval(stats.beta, self.x[sg]) - self.y[sg]
        return norm.logpdf(residuals)
Source code for pysubgroup.model_target

pysubgroup

Navigation

Related Topics