記憶力が無い

プログラミングと室内園芸と何か

Pythonでクラスタリング

pythonでk-meansとfuzzy c-meansのクラスタリングを実装したので、サンプルコードを置いておきます。動作保証はできませんが、よかったら使ってください。

参考

k-means

import numpy as np

def centroid(mem_ship,ps,k):
    V = [np.array([0.]*len(ps[0])) for i in xrange(k)]
    count = [0]*k
    for i in xrange(len(ps)):
        count[mem_ship[i]] += 1
        V[mem_ship[i]] += ps[i]
    _V = []
    for i in xrange(k):
        if count[i] != 0:
            _V.append(V[i]/count[i])
    return _V

def argmin(i,V,ps,k):
    min_i = 0
    min = sum((V[0]-np.array(ps[i]))**2)
    for j in xrange(1,k):
        tmp = sum((V[j]-np.array(ps[i]))**2)
        if tmp < min:
            min_i = j
            min = tmp
    return min_i

def update(V,ps,k):
    new_mem_ship  = [None]*len(ps)
    for i in xrange(len(ps)):
        new_mem_ship[i] = argmin(i,V,ps,k)
    return new_mem_ship
            
def k_means(ps,k):
    while True:
        mem_ship = list(np.random.randint(0,k,len(ps)))
        if len(set(mem_ship)) == k:
            break
    while True:
        V = centroid(mem_ship,ps,k)
        k = len(V)
        new_mem_ship = update(V,ps,k)
        if mem_ship == new_mem_ship:
            break
        else:
            mem_ship = new_mem_ship
    return mem_ship

fuzzy c-means

import numpy as np

def update(V,ps,k,m):
    new_mem_ship  = [np.array([None]*k) for i in xrange(len(ps))]
    for i in xrange(len(ps)):
        for j in xrange(k):
            new_mem_ship[i][j] = 1.0/sum((np.linalg.norm(np.array(ps[i])-V[j])/np.linalg.norm(np.array(ps[i])-V[l]))**(2.0/(m-1)) for l in xrange(k))
    return new_mem_ship
            
def centroid(mem_ship,ps,k,m):
    V = [np.array([0.]*len(ps)) for i in xrange(k)]
    for i in xrange(k):
        V[i] = sum([mem_ship[j][i]**m*np.array(ps[j]) for j in xrange(len(ps))])/sum([mem_ship[j][i]**m for j in xrange(len(ps))])
    return V
            
def c_means(ps,k,m):
    mem_ship = [np.random.random(k) for i in xrange(len(ps))]
    mem_ship = [mem_ship[i]/sum(mem_ship[i]) for i in xrange(len(ps))]
    while True:
        V = centroid(mem_ship,ps,k,m)
        new_mem_ship = update(V,ps,k,m)
        diff = sum([np.linalg.norm(mem_ship[i]-new_mem_ship[i]) for i in xrange(len(ps))])
        if diff < 0.0001:
            break
        else:
            mem_ship = new_mem_ship
    return mem_ship

結果の比較

サンプルコード

import k_means as km
import fuzzy_c_means as cm
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
from scipy.spatial import ConvexHull

k = 5 #クラスタ数
m = 1.5 #ファジィさの度合

def plot(ps):
    x = [i[0] for i in ps]
    y = [i[1] for i in ps]
    plt.plot(x, y, "o")

def draw_regs(ps,mem_ship,mode):
    for i in xrange(k):
        P = []
        if mode == 0:
            for j in xrange(len(ps)):
                if i == mem_ship[j]:
                    P.append(ps[j])
        else:
            for j in xrange(len(ps)):
                if mem_ship[j][i] > 0.1:
                    P.append(ps[j])
        if P != []:
            hull = ConvexHull(P)
            x = [P[l][0] for l in hull.vertices]
            y = [P[l][1] for l in hull.vertices]
            orig_len = len(x)
            x = x[-3:-1] + x + x[1:3]
            y = y[-3:-1] + y + y[1:3]
            t = np.arange(len(x))
            ti = np.linspace(2, orig_len + 1, 10 * orig_len)
            xi = interp1d(t, x, kind='cubic')(ti)
            yi = interp1d(t, y, kind='cubic')(ti)
            plt.fill(xi, yi, alpha=0.2)
        
if __name__ == '__main__':
    ps = np.random.rand(300,2)
    plt.subplot(211)
    plot(ps)
    draw_regs(ps,km.k_means(ps,k),0)
    plt.xlim(-0.1, 1.1)
    plt.ylim(-0.1, 1.1)

    plt.subplot(212)
    plot(ps)
    draw_regs(ps,cm.c_means(ps,k,m),1)
    plt.xlim(-0.1, 1.1)
    plt.ylim(-0.1, 1.1)
    
    plt.show()

実行結果

f:id:ttk1:20180115015633p:plain

上がk-means、下がfuzzy c-means。

入力データが二次元で、ランダム生成なのであんまり面白くないので、そのうち何か面白いデータを使ってみたいと思います。

Copyright © 2017 ttk1