Source code for pysubgroup.visualization

from functools import partial

import numpy as np

import pysubgroup as ps


def plot_sgbars(
    result_df,
    _,
    ylabel="target share",
    title="Discovered Subgroups",
    dynamic_widths=False,
    _suffix="",
):
    """Draw paired bars of target share for each subgroup and its complement.

    One pair of bars is drawn per row of ``result_df``: the subgroup's target
    share next to the target share of its complement.

    Args:
        result_df: Table with the columns ``"target_share_sg"``,
            ``"target_share_complement"`` and ``"relative_size_sg"``. Its
            index supplies the x tick labels.
        _: Unused; kept so the positional signature stays unchanged.
        ylabel: Label of the y axis.
        title: Title of the plot.
        dynamic_widths: If true, the subgroup bar width grows with
            ``relative_size_sg`` and the complement bar takes the remainder of
            the fixed slot width; otherwise both bars share the slot equally.
        _suffix: Unused; kept so the signature stays unchanged.

    Returns:
        The matplotlib figure holding the bar chart.
    """
    from matplotlib import pyplot as plt  # pylint: disable=import-outside-toplevel

    shares_sg = result_df["target_share_sg"]
    shares_compl = result_df["target_share_complement"]
    sg_relative_sizes = result_df["relative_size_sg"]
    n_subgroups = len(result_df)
    x = np.arange(n_subgroups)

    # Every subgroup/complement pair shares one slot of this total width.
    base_width = 0.8
    if dynamic_widths:
        # The 0.02 offset keeps very small subgroups visible.
        width_sg = 0.02 + base_width * sg_relative_sizes
        width_compl = base_width - width_sg
    else:
        width_sg = base_width / 2
        width_compl = base_width / 2

    fig, ax = plt.subplots()
    rects1 = ax.bar(x, shares_sg, width_sg, align="edge")
    rects2 = ax.bar(
        x + width_sg, shares_compl, width_compl, align="edge", color="#61b76f"
    )

    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xticks(x + base_width / 2)
    ax.set_xticklabels(result_df.index, rotation=90)
    if n_subgroups:
        # With no rows there are no bar patches to use as legend handles;
        # indexing the empty containers would raise IndexError.
        ax.legend((rects1[0], rects2[0]), ("subgroup", "complement"))
    # Height scales with the number of subgroups but never collapses to zero.
    fig.set_size_inches(12, max(n_subgroups, 1))
    return fig
def plot_roc(result_df, data, qf=ps.StandardQF(0.5), levels=40, annotate=False):
    """Plot subgroups in ROC space over a contour map of the quality function.

    The background shows ``qf.evaluate`` on a 100 x 100 grid of
    (false positive rate, true positive rate) points, coloured with a
    diverging colormap centred on zero. Each row of ``result_df`` is drawn as
    a black dot at its own false/true positive rate.

    Args:
        result_df: Table with the columns ``"positives_dataset"``,
            ``"positives_sg"`` and ``"size_sg"``; one row per subgroup.
        data: The dataset; only its length is used.
        qf: Quality function whose ``evaluate`` is called as
            ``evaluate(instances_dataset, positives_dataset, size_sg,
            positives_sg)``. The default instance is created once, when the
            function is defined.
        levels: Contour levels forwarded to ``contourf``.
        annotate: If true, label every dot with its index in ``result_df``.

    Returns:
        The matplotlib figure holding the plot.
    """
    from matplotlib import pyplot as plt  # pylint: disable=import-outside-toplevel

    instances_dataset = len(data)
    positives_dataset = np.max(result_df["positives_dataset"])
    negatives_dataset = instances_dataset - positives_dataset

    # Stay strictly inside (0, 1) so the grid avoids the degenerate corners.
    xlist = np.linspace(0.01, 0.99, 100)
    ylist = np.linspace(0.01, 0.99, 100)
    X, Y = np.meshgrid(xlist, ylist)
    # The builtin ``float`` replaces the ``np.float`` alias, which was removed
    # from NumPy and now raises AttributeError.
    f = np.vectorize(
        partial(qf.evaluate, instances_dataset, positives_dataset),
        otypes=[float],
    )
    # Grid coordinates are rates; convert them to subgroup size and positives.
    Z = f(X * negatives_dataset + Y * positives_dataset, Y * positives_dataset)
    # A symmetric colour range keeps zero at the centre of the colormap.
    max_val = np.max([np.max(Z), -np.min(Z)])

    fig, ax = plt.subplots()
    # ``plt.get_cmap`` is used because ``plt.cm.get_cmap`` was removed from
    # recent Matplotlib releases.
    cm = plt.get_cmap("bwr")
    plt.contourf(X, Y, Z, levels, cmap=cm, vmin=-max_val, vmax=max_val)

    for i, sg in result_df.iterrows():
        rel_positives_sg = sg["positives_sg"] / positives_dataset
        rel_negatives_sg = (sg["size_sg"] - sg["positives_sg"]) / negatives_dataset
        # Marker-only format string: the colour comes from the keyword alone,
        # which avoids Matplotlib's redundant-colour warning.
        ax.plot(rel_negatives_sg, rel_positives_sg, "o", color="black")
        if annotate:
            label_margin = 0.01
            ax.annotate(
                str(i),
                (rel_negatives_sg + label_margin, rel_positives_sg + label_margin),
            )

    plt.title("Discovered subgroups")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    return fig
def plot_npspace(result_df, data, annotate=True, fixed_limits=False):
    """Scatter the subgroups by size (x axis) and target share (y axis).

    Args:
        result_df: Table with the columns ``"size_sg"`` and
            ``"target_share_sg"``; one row per subgroup.
        data: The dataset; only its length is used, and only when
            ``fixed_limits`` is true.
        annotate: If true, label every dot with its index in ``result_df``.
        fixed_limits: If true, fix the x axis to ``(0, len(data))`` and the
            y axis to ``(0, 1)``.

    Returns:
        The matplotlib figure holding the plot.
    """
    from matplotlib import pyplot as plt  # pylint: disable=import-outside-toplevel

    fig, ax = plt.subplots()
    for i, sg in result_df.iterrows():
        target_share_sg = sg["target_share_sg"]
        size_sg = sg["size_sg"]
        # Marker-only format string: the colour comes from the keyword alone,
        # which avoids Matplotlib's redundant-colour warning.
        ax.plot(size_sg, target_share_sg, "o", color="black")
        if annotate:
            # Small offset so the label does not sit on top of the marker.
            ax.annotate(str(i), (size_sg + 5, target_share_sg + 0.001))

    if fixed_limits:
        plt.xlim((0, len(data)))
        plt.ylim((0, 1))
    plt.title("Discovered subgroups")
    plt.xlabel("Size of Subgroup")
    plt.ylabel("Target Share Subgroup")
    return fig
def plot_distribution_numeric(sg, data, bins):
    """Overlay the target histogram of one subgroup on that of the whole data.

    Both histograms are normalised (``density=True``) and half transparent.

    Args:
        sg: Subgroup providing ``covers(data)``, ``target.get_attributes()``
            and ``subgroup_description`` (used as the legend label).
        data: Dataset indexed by the coverage mask and the target attributes.
        bins: Bin specification forwarded to ``hist``.

    Returns:
        The matplotlib figure holding both histograms.
    """
    from matplotlib import pyplot as plt  # pylint: disable=import-outside-toplevel

    fig, ax = plt.subplots()

    covered_rows = data[sg.covers(data)]
    target_columns = sg.target.get_attributes()
    series_to_draw = (
        (covered_rows[target_columns].values, str(sg.subgroup_description)),
        (data[target_columns].values, "Overall Data"),
    )
    for values, legend_label in series_to_draw:
        ax.hist(values, bins, alpha=0.5, label=legend_label, density=True)

    ax.legend(loc="upper right")
    return fig
def compare_distributions_numeric(sgs, data, bins):
    """Overlay the normalised target histograms of several subgroups.

    Args:
        sgs: Iterable of subgroups, each providing ``covers(data)``,
            ``target.get_attributes()`` and ``subgroup_description`` (used as
            the legend label).
        data: Dataset indexed by each coverage mask and the target attributes.
        bins: Bin specification forwarded to ``hist``.

    Returns:
        The matplotlib figure holding all histograms.
    """
    from matplotlib import pyplot as plt  # pylint: disable=import-outside-toplevel

    fig, ax = plt.subplots()
    for subgroup in sgs:
        covered_rows = data[subgroup.covers(data)]
        values = covered_rows[subgroup.target.get_attributes()].values
        legend_label = str(subgroup.subgroup_description)
        ax.hist(values, bins, alpha=0.3, label=legend_label, density=True)

    ax.legend(loc="upper right")
    return fig
def similarity_sgs(sgd_results, data, color=True):
    """Build the pairwise overlap matrix of the discovered subgroups.

    Args:
        sgd_results: Iterable of result entries; the element at position 1 of
            each entry is taken as the subgroup.
        data: Dataset passed through to ``ps.overlap``.
        color: If true, return the table styled with a background gradient
            instead of the plain frame.

    Returns:
        A ``pandas.DataFrame`` whose cell ``(i, j)`` is
        ``ps.overlap(sg_i, sg_j, data)``, or its gradient-styled counterpart
        when ``color`` is true.
    """
    import pandas as pd  # pylint:disable=import-outside-toplevel

    subgroups = [entry[1] for entry in sgd_results]

    overlap_rows = []
    for first in subgroups:
        row = []
        for second in subgroups:
            row.append(ps.overlap(first, second, data))
        overlap_rows.append(row)

    overlap_df = pd.DataFrame(overlap_rows)
    if not color:
        return overlap_df
    return overlap_df.style.background_gradient()
def similarity_dendrogram(result, data):
    """Draw a single-linkage dendrogram of the subgroups by overlap.

    The pairwise overlaps from ``similarity_sgs`` are turned into distances
    as ``1 - overlap`` and clustered hierarchically.

    Args:
        result: Result entries, forwarded unchanged to ``similarity_sgs``.
        data: Dataset, forwarded unchanged to ``similarity_sgs``.

    Returns:
        The matplotlib figure holding the dendrogram.
    """
    # pylint: disable=import-outside-toplevel
    from matplotlib import pyplot as plt
    from scipy.cluster.hierarchy import dendrogram, linkage
    from scipy.spatial.distance import squareform

    # pylint: enable=import-outside-toplevel

    fig, _ = plt.subplots()

    overlap_df = similarity_sgs(result, data, color=False)
    distance_matrix = 1 - overlap_df.values
    # linkage expects the condensed (upper-triangular) distance vector.
    condensed = squareform(distance_matrix)
    tree = linkage(condensed, "single")
    dendrogram(tree, labels=overlap_df.index)
    return fig
def supportSetVisualization(result, in_order=True, drop_empty=True):
    """Build a boolean coverage image of subgroups versus data instances.

    Args:
        result: Object exposing ``task.data``, ``results`` (only its length
            is used) and ``to_subgroups``, which is iterated for triples
            whose middle element provides ``covers(data)``.
        in_order: If false, reorder the subgroups by ascending number of
            covered instances; otherwise keep their original order.
        drop_empty: If true, drop instances covered by no subgroup and print
            how many were discarded.

    Returns:
        A boolean array with one row per subgroup and one column per kept
        instance; instances are ordered by descending number of covering
        subgroups.
    """
    dataset = result.task.data
    n_items = len(result.task.data)
    n_subgroups = len(result.results)

    # NOTE(review): ``result.to_subgroups`` is iterated as an attribute, not
    # called -- confirm it is a property/sequence on the result type.
    coverage = np.zeros((n_items, n_subgroups), dtype=bool)
    for column, (_, description, _) in enumerate(result.to_subgroups):
        coverage[:, column] = description.covers(dataset)

    # Most-covered instances first.
    row_order = np.argsort(coverage.sum(axis=1))[::-1]
    image = coverage[row_order, :]

    if not in_order:
        column_order = np.argsort(coverage.sum(axis=0))
        image = image[:, column_order]

    if drop_empty:
        is_covered = image.sum(axis=1) > 0
        n_discarded = n_items - np.count_nonzero(is_covered)
        print(f"Discarding {n_discarded} " "entities that are not covered")
        image = image[is_covered, :]

    return image.T