from functools import partial
import numpy as np
import pysubgroup as ps
def plot_sgbars(
    result_df,
    _,
    ylabel="target share",
    title="Discovered Subgroups",
    dynamic_widths=False,
    _suffix="",
):
    """Draw paired bars comparing each subgroup's target share with its complement.

    For every row of ``result_df`` two adjacent bars are drawn: the value of
    ``target_share_sg`` followed by the value of ``target_share_complement``.

    Args:
        result_df: Table with one row per subgroup. The columns
            ``target_share_sg``, ``target_share_complement`` and
            ``relative_size_sg`` are read; the index is used for tick labels.
        _: Unused positional placeholder.
        ylabel: Label of the y axis.
        title: Title of the axes.
        dynamic_widths: When true, the subgroup bar width grows with
            ``relative_size_sg`` and the complement bar takes the remainder
            of the slot; otherwise both bars are equally wide.
        _suffix: Unused.

    Returns:
        The matplotlib figure containing the bar chart.
    """
    from matplotlib import pyplot as plt  # pylint: disable=import-outside-toplevel

    shares_sg = result_df["target_share_sg"]
    shares_compl = result_df["target_share_complement"]
    sg_relative_sizes = result_df["relative_size_sg"]
    n_subgroups = len(result_df)
    x = np.arange(n_subgroups)

    # Each subgroup owns a slot of this total width, shared by its two bars.
    base_width = 0.8
    if dynamic_widths:
        # The 0.02 offset keeps very small subgroups visible.
        width_sg = 0.02 + base_width * sg_relative_sizes
        width_compl = base_width - width_sg
    else:
        width_sg = base_width / 2
        width_compl = base_width / 2

    fig, ax = plt.subplots()
    rects1 = ax.bar(x, shares_sg, width_sg, align="edge")
    rects2 = ax.bar(
        x + width_sg, shares_compl, width_compl, align="edge", color="#61b76f"
    )
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xticks(x + base_width / 2)
    ax.set_xticklabels(result_df.index, rotation=90)

    # With an empty result there are no bar patches to use as legend handles;
    # indexing rects1[0] would raise IndexError.
    if n_subgroups:
        ax.legend((rects1[0], rects2[0]), ("subgroup", "complement"))

    # One inch of height per subgroup, but never a zero-height figure.
    fig.set_size_inches(12, max(n_subgroups, 1))
    return fig
def plot_roc(result_df, data, qf=ps.StandardQF(0.5), levels=40, annotate=False):
    """Plot discovered subgroups in ROC space over a quality-function contour map.

    The background shows ``qf`` evaluated on a 100 x 100 grid of
    (false-positive rate, true-positive rate) pairs between 0.01 and 0.99.
    Each row of ``result_df`` is drawn on top of it as a black dot.

    Args:
        result_df: Table with one row per subgroup. The columns
            ``positives_dataset``, ``positives_sg`` and ``size_sg`` are read.
        data: The dataset; only ``len(data)`` is used.
        qf: Object whose ``evaluate`` method is called as
            ``evaluate(instances_dataset, positives_dataset, size, positives)``.
        levels: Forwarded to ``contourf`` as its ``levels`` argument.
        annotate: When true, each dot is labelled with its row index.

    Returns:
        The matplotlib figure.
    """
    from matplotlib import pyplot as plt  # pylint: disable=import-outside-toplevel

    instances_dataset = len(data)
    positives_dataset = np.max(result_df["positives_dataset"])
    negatives_dataset = instances_dataset - positives_dataset

    xlist = np.linspace(0.01, 0.99, 100)
    ylist = np.linspace(0.01, 0.99, 100)
    X, Y = np.meshgrid(xlist, ylist)

    # The builtin ``float`` replaces the ``np.float`` alias, which was
    # removed in NumPy 1.24.
    f = np.vectorize(
        partial(qf.evaluate, instances_dataset, positives_dataset), otypes=[float]
    )
    # First argument: subgroup size implied by the (FPR, TPR) pair;
    # second argument: number of positives in that subgroup.
    Z = f(X * negatives_dataset + Y * positives_dataset, Y * positives_dataset)

    # Symmetric colour limits so that zero quality maps to the middle of "bwr".
    max_val = np.max([np.max(Z), -np.min(Z)])

    fig, ax = plt.subplots()
    # Passing the colormap by name avoids ``plt.cm.get_cmap``, which newer
    # Matplotlib releases no longer provide.
    plt.contourf(X, Y, Z, levels, cmap="bwr", vmin=-max_val, vmax=max_val)

    label_margin = 0.01
    for i, sg in result_df.iterrows():
        rel_positives_sg = sg["positives_sg"] / positives_dataset
        rel_negatives_sg = (sg["size_sg"] - sg["positives_sg"]) / negatives_dataset
        # Marker-only format string: the colour comes from the keyword alone,
        # so Matplotlib does not warn about a redundant colour definition.
        ax.plot(rel_negatives_sg, rel_positives_sg, "o", color="black")
        if annotate:
            ax.annotate(
                str(i),
                (rel_negatives_sg + label_margin, rel_positives_sg + label_margin),
            )

    plt.title("Discovered subgroups")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    return fig
def plot_npspace(result_df, data, annotate=True, fixed_limits=False):
    """Scatter the discovered subgroups by size and target share.

    Args:
        result_df: Table with one row per subgroup. The columns
            ``size_sg`` (x axis) and ``target_share_sg`` (y axis) are read.
        data: The dataset; only ``len(data)`` is used, and only when
            ``fixed_limits`` is true.
        annotate: When true, each dot is labelled with its row index.
        fixed_limits: When true, the x axis spans ``0..len(data)`` and the
            y axis spans ``0..1``.

    Returns:
        The matplotlib figure.
    """
    from matplotlib import pyplot as plt  # pylint: disable=import-outside-toplevel

    fig, ax = plt.subplots()
    for i, sg in result_df.iterrows():
        target_share_sg = sg["target_share_sg"]
        size_sg = sg["size_sg"]
        # Marker-only format string: the colour comes from the keyword alone,
        # so Matplotlib does not warn about a redundant colour definition.
        ax.plot(size_sg, target_share_sg, "o", color="black")
        if annotate:
            # Small offset so the label does not sit on top of the dot.
            ax.annotate(str(i), (size_sg + 5, target_share_sg + 0.001))

    if fixed_limits:
        plt.xlim((0, len(data)))
        plt.ylim((0, 1))
    plt.title("Discovered subgroups")
    plt.xlabel("Size of Subgroup")
    plt.ylabel("Target Share Subgroup")
    return fig
def plot_distribution_numeric(sg, data, bins):
    """Overlay a subgroup's target-value histogram on that of the whole data.

    Args:
        sg: Subgroup object; ``sg.covers(data)`` selects its rows,
            ``sg.target.get_attributes()`` names the target column(s) and
            ``sg.subgroup_description`` provides the legend label.
        data: The dataset, indexable by a row mask and by column names.
        bins: Forwarded to ``hist`` as its ``bins`` argument.

    Returns:
        The matplotlib figure holding both density-normalised histograms.
    """
    from matplotlib import pyplot as plt  # pylint: disable=import-outside-toplevel

    fig, ax = plt.subplots()

    covered_rows = data[sg.covers(data)]
    values_in_subgroup = covered_rows[sg.target.get_attributes()].values
    values_overall = data[sg.target.get_attributes()].values

    # Both histograms share transparency and normalisation so they can be
    # compared directly.
    shared_style = {"alpha": 0.5, "density": True}
    ax.hist(
        values_in_subgroup, bins, label=str(sg.subgroup_description), **shared_style
    )
    ax.hist(values_overall, bins, label="Overall Data", **shared_style)
    ax.legend(loc="upper right")
    return fig
def compare_distributions_numeric(sgs, data, bins):
    """Overlay the target-value histograms of several subgroups in one figure.

    Args:
        sgs: Iterable of subgroup objects; for each one ``covers(data)``,
            ``target.get_attributes()`` and ``subgroup_description`` are used.
        data: The dataset, indexable by a row mask and by column names.
        bins: Forwarded to ``hist`` as its ``bins`` argument.

    Returns:
        The matplotlib figure with one semi-transparent, density-normalised
        histogram per subgroup.
    """
    from matplotlib import pyplot as plt  # pylint: disable=import-outside-toplevel

    fig, ax = plt.subplots()
    for subgroup in sgs:
        covered_rows = data[subgroup.covers(data)]
        values = covered_rows[subgroup.target.get_attributes()].values
        ax.hist(
            values,
            bins,
            label=str(subgroup.subgroup_description),
            alpha=0.3,
            density=True,
        )
    ax.legend(loc="upper right")
    return fig
def similarity_sgs(sgd_results, data, color=True):
    """Build the pairwise overlap table of the subgroups in ``sgd_results``.

    Args:
        sgd_results: Iterable of result entries; the element at position 1 of
            each entry is taken as the subgroup.
        data: Dataset handed to ``ps.overlap`` for every pair.
        color: When true, return the table's ``style.background_gradient()``
            instead of the plain DataFrame.

    Returns:
        A DataFrame whose cell ``(i, j)`` is ``ps.overlap`` of subgroups
        ``i`` and ``j``, or its styled counterpart when ``color`` is true.
    """
    import pandas as pd  # pylint:disable=import-outside-toplevel

    subgroups = [entry[1] for entry in sgd_results]

    overlap_rows = []
    for left in subgroups:
        overlap_rows.append([ps.overlap(left, right, data) for right in subgroups])

    overlap_table = pd.DataFrame(overlap_rows)
    if not color:
        return overlap_table
    return overlap_table.style.background_gradient()
def similarity_dendrogram(result, data):
    """Draw a single-linkage dendrogram of the subgroups based on their overlap.

    The overlap table from ``similarity_sgs`` is turned into distances via
    ``1 - overlap`` before clustering.

    Args:
        result: Result entries, forwarded to ``similarity_sgs``.
        data: Dataset, forwarded to ``similarity_sgs``.

    Returns:
        The matplotlib figure containing the dendrogram.
    """
    # pylint: disable=import-outside-toplevel
    from matplotlib import pyplot as plt
    from scipy.cluster.hierarchy import dendrogram, linkage
    from scipy.spatial.distance import squareform

    fig, _ = plt.subplots()
    overlap_table = similarity_sgs(result, data, color=False)

    # squareform turns the square distance matrix into the condensed vector
    # that linkage expects.
    condensed = squareform(1 - overlap_table.values)
    dendrogram(linkage(condensed, "single"), labels=overlap_table.index)
    return fig
def supportSetVisualization(result, in_order=True, drop_empty=True):
    """Build a boolean coverage image: one row per subgroup, one column per entity.

    Args:
        result: Result object providing ``task.data``, ``results`` (its
            length sets the number of subgroups) and ``to_subgroups`` (an
            iterable of 3-tuples whose middle element has ``covers(data)``).
        in_order: When false, subgroups are reordered by ascending number of
            covered entities; when true, their original order is kept.
        drop_empty: When true, entities covered by no subgroup are removed
            and the number removed is printed.

    Returns:
        Boolean array of shape ``(subgroups, entities)`` with the entities
        sorted by descending number of covering subgroups.
    """
    # NOTE(review): ``result.to_subgroups`` is iterated as an attribute, not
    # called — confirm that this matches the result type's API.
    data = result.task.data
    n_entities = len(result.task.data)
    coverage = np.zeros((n_entities, len(result.results)), dtype=bool)
    for column, entry in enumerate(result.to_subgroups):
        _, subgroup, _ = entry
        coverage[:, column] = subgroup.covers(data)

    # Entities covered by the most subgroups come first.
    entity_order = np.argsort(coverage.sum(axis=1))[::-1]
    image = coverage[entity_order, :]

    if not in_order:
        subgroup_order = np.argsort(coverage.sum(axis=0))
        image = image[:, subgroup_order]

    if drop_empty:
        is_covered = image.sum(axis=1) > 0
        n_discarded = n_entities - np.count_nonzero(is_covered)
        print(f"Discarding {n_discarded} " "entities that are not covered")
        image = image[is_covered, :]

    return image.T