Python - XMeans实现

XMeans聚类、画图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
# -*- coding: UTF-8 -*-


import math
from sklearn.cluster import KMeans
import numpy as np


DEBUG_PRINT = False

def cluster_by_xmeans(data, min_k=1, max_k=10, activation_bic_ratio=0.8, init="k-means++", n_init=10, max_iter=300, tol=0.0001):
"""
:param data: list of points
:param min_k, max_k: the min and max clusters for xmeans
:param activation_bic_ratio: the bic threshold for k=1
:param init, n_init, max_iter, tol: see sklearn.cluster.KMeans
:return: KMeans object
"""

if min_k > max_k:
return None

cur_k = min_k
while cur_k < max_k:
if DEBUG_PRINT: print("cur_k = %d" % cur_k)
new_k = 0

kmeans = KMeans(n_clusters=cur_k, init=init, n_init=n_init, max_iter=max_iter, tol=tol).fit(data)
data_predicted = kmeans.predict(data)
cluster_data_map = create_cluster_data_map(cur_k, data, data_predicted)

# 对各个簇进行二分裂尝试
for i in range(cur_k):
cur_cluster_data = cluster_data_map[i]
cur_cluster_center = kmeans.cluster_centers_[i]
R = len(cur_cluster_data)
M = len(cur_cluster_center)

if R >= 2:
if DEBUG_PRINT: print("\tcheck cluster %d of %d" % (i, cur_k))
# 检查当前簇的点是否全相同
equal_flag = True
for j in range(1, R):
if np.linalg.norm(np.array(cur_cluster_data[j]) - np.array(cur_cluster_data[0]) != 0):
equal_flag = False
# 当前簇的点全相同,不做二分裂检查,加速(此处优化是针对:sklearn.cluster.KMeans对完全相同的一组点做2means很慢的问题)
if equal_flag:
if DEBUG_PRINT: print("\t\tequal flag!")
new_k += 1
else:
child_kmeans = KMeans(n_clusters=2, init=init, n_init=n_init, max_iter=max_iter, tol=tol).fit(cur_cluster_data)
cur_cluster_data_predicted = child_kmeans.predict(cur_cluster_data)
child_cluster_data_map = create_cluster_data_map(2, cur_cluster_data, cur_cluster_data_predicted)
if DEBUG_PRINT: print("\t\tnot equal")

if len(child_cluster_data_map[0]) != 0 and len(child_cluster_data_map[1]) != 0:
bic1 = cal_BIC([cur_cluster_center], {0: cur_cluster_data}, R, M)
bic2 = cal_BIC(child_kmeans.cluster_centers_, child_cluster_data_map, R, M)
# 对cur_k的kmeans每个簇,尝试二分裂。
# 如果根据BIC,分裂成功,则一个原簇对应二个新簇。
# 如果根据BIC,分裂不成功,对原簇数量不产生影响,即一个簇仍对应一个簇;
if DEBUG_PRINT: print("\t\tbic1 = %.2f" % bic1)
if DEBUG_PRINT: print("\t\tbic2 = %.2f" % bic2)
if bic1 <= bic2:
new_k += 2
if DEBUG_PRINT:
print("\t\tsplit:")
for d in cur_cluster_data:
print(d)
elif cur_k == 1 and bic2 / bic1 >= activation_bic_ratio:
new_k += 2
if DEBUG_PRINT:
print("\t\tsplit (init):")
for d in cur_cluster_data:
print(d)
else:
new_k += 1
if DEBUG_PRINT:
print("\t\tno split")
for d in cur_cluster_data:
print(d)
else:
# 无法进行2means,也是分裂不成功的场景
new_k += 1
elif R == 1:
if DEBUG_PRINT: print("\tno need check cluster %d of %d, count = 1" % (i, cur_k))
if DEBUG_PRINT: print(cur_cluster_data[0])
# 该簇只有1个元素,无法做2means,只好保留当前簇
new_k += 1
else:
if DEBUG_PRINT: print("\tshould not create this cluster %d of %d, count = 0" % (i, cur_k))
# 该簇为空,意味着cur_k-means不成功,不应聚cur_k了
# 不应考虑本簇,此时new_k不应再自增
if cur_k != min_k:
# 但是注意:若是在cur_k的迭代中,本次cur_k是由上次cur_k-1中某个簇2means决定分裂的,
# 故本次cur_k中该簇也会要分裂,对此场景将new_k自减以抵消影响;
# 若不是在cur_k的迭代中,则为最初场景cur_k == min_k,仅需不考虑本簇即可,即new_k不再自增。
new_k -= 1
# 迭代到cur_k的某个簇(尝试二分裂的过程中)已经达到max_k
if new_k >= max_k:
new_k = max_k
break
# 迭代到cur_k的所有簇,经过二分裂尝试后,仍聚cur_k簇(或甚至不足cur_k簇),即已经无法再向下分裂
if new_k <= cur_k:
if new_k >= min_k:
cur_k = new_k
else:
cur_k = min_k
break
else:
cur_k = new_k

if DEBUG_PRINT: print("cur_k = %d" % cur_k)

return KMeans(n_clusters=cur_k, init=init, n_init=n_init, max_iter=max_iter, tol=tol).fit(data)


def create_cluster_data_map(k, data, data_predicted):
"""
create a dict: {label: [points_of_this_label]}
"""

cluster_data_map = {}
for i in range(k):
cluster_data_map[i] = []
for i in range(len(data)):
cluster_data_map[data_predicted[i]].append(data[i])
return cluster_data_map


def cal_BIC(centers, labelled_data, R, M):
"""
calculate the bayesian information criterion
"""

K = len(centers)
sigma2 = cal_cluster_variance(centers, labelled_data, R, K)
if sigma2 <= 0:
sigma_multiplier = float("-inf")
else:
sigma_multiplier = M * 0.5 * math.log(sigma2)
l = 0.0

for i in range(K):
Rn = len(labelled_data[i])
l += Rn * math.log(Rn) - Rn * math.log(R) - Rn * 0.5 * math.log(2.0 * math.pi) \
- Rn * sigma_multiplier - (Rn - K) * 0.5

bic = l - K * (K * (M + 1.0)) / 2.0 * math.log(R)

return bic


def cal_cluster_variance(centers, labelled_data, R, K):
"""
calculate the variance for a set of clusters
"""

sum = 0
for cur_label in range(K):
cur_center = centers[cur_label]
cur_data = labelled_data[cur_label]
for cur_point in cur_data:
#dist = np.linalg.norm(np.array(cur_point) - np.array(cur_center))
dist2 = np.sum(np.square(np.array(cur_point) - np.array(cur_center)))
sum += dist2

v = sum * 1.0
if R - K > 0:
v = v / (R - K)

return v


import colorsys
def get_colors(color_count):
hues = np.linspace(0, 1, color_count, endpoint=False)
for i in range(color_count):
#hue = i * 1. / color_count
hue = hues[i]
rgb = [int(x * 255.0) for x in colorsys.hsv_to_rgb(hue, 1.0, 1.0)]
yield "#{0:02x}{1:02x}{2:02x}".format(*rgb)


import matplotlib.pyplot as plt
def plot_by_clusters(cluster_data_map, savepath=None):
plt.figure(figsize=(8, 6))

actual_cluster_count = 0
for c in cluster_data_map:
if len(cluster_data_map[c]) > 0:
actual_cluster_count += 1

colors = get_colors(actual_cluster_count)

for c in cluster_data_map:
if len(cluster_data_map[c]) > 0:
col = next(colors)
for d in cluster_data_map[c]:
plt.plot(range(1, len(d) + 1), d, color=col)

plt.title("actually %d clusters" % actual_cluster_count, fontsize=16)
plt.xlabel("time")
plt.ylabel("kpi value")

if savepath is not None:
plt.savefig(savepath)
plt.close()
else:
plt.show()


if __name__ == '__main__':
import time

"""
# 5 clusters
data = [[3.487966, 2.617258], [3.052439, 2.939565], [2.541804, 2.855116],
[2.993872, 2.741651], [2.645149, 2.988544], [3.275658, 2.734759],
[3.4383, 3.126239], [3.149475, 2.774026], [2.808158, 2.987228],
[2.609178, 2.608235], [2.89733, 2.502805], [2.599489, 3.317544],
[3.330802, 3.329103], [2.504006, 3.13618], [2.598317, 2.838451],
[3.468053, -3.205529], [3.473971, -2.881551], [2.915968, -3.146783],
[3.287449, -2.818057], [3.199882, -3.082526], [3.33998, -2.892212],
[2.652254, -3.284869], [3.062117, -3.377962], [3.439667, -2.904208],
[3.131923, -2.860522], [2.770989, -2.585313], [2.678803, -3.25333],
[3.20751, -2.602797], [2.548267, -3.350991], [3.488662, -2.793679],
[-3.419303, -3.460712], [-2.66534, -3.327664], [-2.843735, -2.520843],
[-3.256643, -2.505888], [-2.723389, -3.145575], [-2.819643, -2.775133],
[-3.074187, -3.408583], [-3.286655, -2.857967], [-2.650737, -2.697036],
[-3.152726, -3.137763], [-2.999019, -2.686993], [-2.688528, -2.741138],
[-2.592045, -3.197096], [-3.160052, -2.52554], [-3.301692, -2.993403],
[-2.946691, 3.19919], [-3.408824, 2.648383], [-3.148617, 3.378614],
[-2.563794, 2.702014], [-3.157149, 2.622525], [-2.975722, 2.604931],
[-2.518403, 3.039607], [-3.140483, 3.32115], [-2.619992, 2.968638],
[-2.875432, 3.374492], [-2.555934, 2.997142], [-3.254758, 2.814866],
[-2.947021, 3.04663], [-3.177994, 2.874514], [-3.110401, 3.071354],
[0.11111, 0.111111], [0.11111, -0.111111], [-0.11111, 0.111111],
[-0.11111, -0.11111], [0.234232, 0.34322], [-0.23223, 0.3323]]

# use kmeans directly
time1 = time.time()
kmeans = KMeans(n_clusters=5)
kmeans.fit(data)
print("use kmeans directly".center(80, "-"))
print("cur kmeans model: %s" % kmeans)
print("cur centers: %s" % kmeans.cluster_centers_)
print("labels of data: %s" % kmeans.predict(data))
time2 = time.time()
print("timecost: %0.3f" % (time2 - time1))
print("".center(80, "-"))

print
print

# use xmeans to find the right k
time3 = time.time()
xmeans = cluster_by_xmeans(data, n_init=20)
print("use xmeans to find the right k".center(80, "-"))
print("cur kmeans model: %s" % xmeans)
print("cur centers: %s" % xmeans.cluster_centers_)
print("labels of data: %s" % xmeans.predict(data))
time4 = time.time()
print("timecost: %0.3f" % (time4 - time3))
print("".center(80, "-"))
"""


# another test
"""
data0 = [(1.2, 0, 0), (1, 0, 0), (1.1, 0, 0),
(0, 0.9, 0), (0, 0.8, 0), (0, 1, 0),
(0, 0, 1), (0, 0, 1.3), (0, 0, 1.1)]

data1 = [(1, 0, 0), (1, 0, 0), (1, 0, 0),
(0, 1, 0), (0, 1, 0), (0, 1, 0),
(0, 0, 1), (0, 0, 1), (0, 0, 1)]

data2 = [(1.2, 0, 0), (1, 0, 0), (1.1, 0, 0),
(0, 0.9, 0), (0, 0.8, 0), (0, 1, 0),
(0, 0, 1), (0, 0, 1.3), (0, 0, 1.1), (1, 0, 1)]

time1 = time.time()
xmeans = cluster_by_xmeans(data0)
print("cur kmeans model: %s" % xmeans)
print("cur centers: %s" % xmeans.cluster_centers_)
print("labels of data: %s" % xmeans.predict(data0))
time2 = time.time()
print("timecost: %0.3f" % (time2 - time1))

print
print

time3 = time.time()
xmeans = cluster_by_xmeans(data1)
print("cur kmeans model: %s" % xmeans)
print("cur centers: %s" % xmeans.cluster_centers_)
print("labels of data: %s" % xmeans.predict(data1))
time4 = time.time()
print("timecost: %0.3f" % (time4 - time3))

print
print

time5 = time.time()
xmeans = cluster_by_xmeans(data2)
print("cur kmeans model: %s" % xmeans)
print("cur centers: %s" % xmeans.cluster_centers_)
print("labels of data: %s" % xmeans.predict(data2))
time6 = time.time()
print("timecost: %0.3f" % (time6 - time5))
"""


# plot result
"""
xmeans = cluster_by_xmeans(data, max_k=4)
data_predicted = xmeans.predict(data)
cluster_data_map = create_cluster_data_map(len(xmeans.cluster_centers_), data, data_predicted)
plot_by_clusters(cluster_data_map)
"""