pythonでk-meansとfuzzy c-meansのクラスタリングを実装したので、サンプルコードを置いておきます。動作保証はできませんが、よかったら使ってください。
参考
k-means
import numpy as np def centroid(mem_ship,ps,k): V = [np.array([0.]*len(ps[0])) for i in xrange(k)] count = [0]*k for i in xrange(len(ps)): count[mem_ship[i]] += 1 V[mem_ship[i]] += ps[i] _V = [] for i in xrange(k): if count[i] != 0: _V.append(V[i]/count[i]) return _V def argmin(i,V,ps,k): min_i = 0 min = sum((V[0]-np.array(ps[i]))**2) for j in xrange(1,k): tmp = sum((V[j]-np.array(ps[i]))**2) if tmp < min: min_i = j min = tmp return min_i def update(V,ps,k): new_mem_ship = [None]*len(ps) for i in xrange(len(ps)): new_mem_ship[i] = argmin(i,V,ps,k) return new_mem_ship def k_means(ps,k): while True: mem_ship = list(np.random.randint(0,k,len(ps))) if len(set(mem_ship)) == k: break while True: V = centroid(mem_ship,ps,k) k = len(V) new_mem_ship = update(V,ps,k) if mem_ship == new_mem_ship: break else: mem_ship = new_mem_ship return mem_ship
fuzzy c-means
import numpy as np def update(V,ps,k,m): new_mem_ship = [np.array([None]*k) for i in xrange(len(ps))] for i in xrange(len(ps)): for j in xrange(k): new_mem_ship[i][j] = 1.0/sum((np.linalg.norm(np.array(ps[i])-V[j])/np.linalg.norm(np.array(ps[i])-V[l]))**(2.0/(m-1)) for l in xrange(k)) return new_mem_ship def centroid(mem_ship,ps,k,m): V = [np.array([0.]*len(ps)) for i in xrange(k)] for i in xrange(k): V[i] = sum([mem_ship[j][i]**m*np.array(ps[j]) for j in xrange(len(ps))])/sum([mem_ship[j][i]**m for j in xrange(len(ps))]) return V def c_means(ps,k,m): mem_ship = [np.random.random(k) for i in xrange(len(ps))] mem_ship = [mem_ship[i]/sum(mem_ship[i]) for i in xrange(len(ps))] while True: V = centroid(mem_ship,ps,k,m) new_mem_ship = update(V,ps,k,m) diff = sum([np.linalg.norm(mem_ship[i]-new_mem_ship[i]) for i in xrange(len(ps))]) if diff < 0.0001: break else: mem_ship = new_mem_ship return mem_ship
結果の比較
サンプルコード
import k_means as km import fuzzy_c_means as cm import numpy as np import matplotlib.pyplot as plt from scipy.interpolate import interp1d from scipy.spatial import ConvexHull k = 5 #クラスタ数 m = 1.5 #ファジィさの度合 def plot(ps): x = [i[0] for i in ps] y = [i[1] for i in ps] plt.plot(x, y, "o") def draw_regs(ps,mem_ship,mode): for i in xrange(k): P = [] if mode == 0: for j in xrange(len(ps)): if i == mem_ship[j]: P.append(ps[j]) else: for j in xrange(len(ps)): if mem_ship[j][i] > 0.1: P.append(ps[j]) if P != []: hull = ConvexHull(P) x = [P[l][0] for l in hull.vertices] y = [P[l][1] for l in hull.vertices] orig_len = len(x) x = x[-3:-1] + x + x[1:3] y = y[-3:-1] + y + y[1:3] t = np.arange(len(x)) ti = np.linspace(2, orig_len + 1, 10 * orig_len) xi = interp1d(t, x, kind='cubic')(ti) yi = interp1d(t, y, kind='cubic')(ti) plt.fill(xi, yi, alpha=0.2) if __name__ == '__main__': ps = np.random.rand(300,2) plt.subplot(211) plot(ps) draw_regs(ps,km.k_means(ps,k),0) plt.xlim(-0.1, 1.1) plt.ylim(-0.1, 1.1) plt.subplot(212) plot(ps) draw_regs(ps,cm.c_means(ps,k,m),1) plt.xlim(-0.1, 1.1) plt.ylim(-0.1, 1.1) plt.show()
実行結果
上がk-means、下がfuzzy c-means。
入力データが二次元で、ランダム生成なのであんまり面白くないので、そのうち何か面白いデータを使ってみたいと思います。