From sources-request at octave dot org Wed Sep 14 18:08:05 2005 Subject: Reasonably fast (fuzzy) kmeans clustering including a demo script, might live in statistics From: Jeff Armitstead To: sources at octave dot org Date: Wed, 14 Sep 2005 16:48:11 -0500 This is a multi-part message in MIME format. --------------040801060902090907080301 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Reasonably fast (fuzzy) kmeans clustering including a demo script, might live in statistics. --------------040801060902090907080301 Content-Type: text/plain; name="kmeans.m" Content-Transfer-Encoding: 7bit Content-Disposition: inline; filename="kmeans.m" ## Copyright (C) 2004 Jeff Armitstead ## ## This file is part of Octave. ## ## Octave is free software; you can redistribute it and/or modify it ## under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2, or (at your option) ## any later version. ## ## Octave is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Octave; see the file COPYING. If not, write to the Free ## Software Foundation, 59 Temple Place - Suite 330, Boston, MA ## 02111-1307, USA. ## Author: JPA ## Description: Clustering of data using a (fuzzy) kmeans algorithm ## See the excellent Pattern Classification by Duda, Hart & Stork for details ## of algorithms. ## ## Reasonably fast unsupervised clustering of data ## ## [Cent,L,K,norm_mean,norm_std] = ## kmeans( F, nclust, start_cent, fuzz, elim, norm, metric, verbosity, graph) ## ## INPUTS ## ## F : the test data in feature space ## nclust : the number of clusters to fit ## start_cent : starting centroids, if absent data is randomly initialised to clusters ## fuzz : fuzzy blending ( 0 for non-fuzzy clustering ) ## elim : stopping criterion ## norm : 1=normalise raw data, 0=don't ## metric : 1= L1-norm, 2= L2-norm (Euclidean), ..., Inf = infinity norm. ## verbosity : 1=speak, 0=don't ## graph : 1=graph cluster norms, 0=don't ## ## OUTPUTS ## ## Cent : the cluster centroids ## L : the cluster metrics for each point in feature space ## K : cluster membership matrix ## norm_mean : the normalising means ## norm_std : the normalising std devs function [Cent,L,K,norm_mean,norm_std] = kmeans(F,nclust,start_cent,fuzz,elim,norm,metric,verbosity,graph) if nargin == 2 start_cent=[]; fuzz=0; elim=1e-6; norm=1; metric=2; verbosity=0; graph=0; elseif nargin == 3 fuzz=0; elim=1e-6; norm=1; metric=2; verbosity=0; graph=0; elseif nargin == 4 elim=1e-6; norm=1; metric=2; verbosity=0; graph=0; elseif nargin == 5 norm=1; metric=2; verbosity=0; graph=0; elseif nargin == 6 metric=2; verbosity=0; graph=0; elseif nargin == 7 verbosity=0; graph=0; elseif nargin == 8 graph=0; elseif nargin == 9 ; else error("You must give me at least the \"data\" and no. of clusters"); return end start_day=now(); nobs=size(F,1); nfeats=size(F,2); if (verbosity) fprintf("%d features and %d observations\n",nfeats,nobs); end ## Check inputs for sanity ######################### if (nobs < 2*nclust) error("No. of obs less than 2 * No. of clusters ??"); return end if ((~isempty(start_cent)) & (size(start_cent,1)~=nclust | size(start_cent,2)~=nfeats)) error("Starting Centroid different size to Data ??"); return end if nclust < 2 error("Clusters must be >= 2 ??"); return end if (~isempty(fuzz) & fuzz == 1 ) error("Fuzzy blending must be > 1 ??"); return else fexpon=1/(fuzz-1); end if isempty(fuzz) fuzz=0; end if (isempty(elim) | elim == 0) elim=1e-6; end if isempty(norm) norm=1; end if isempty(metric) metric=2; end if ( metric < 1 ) error ("bad metric exponent"); endif if isempty(verbosity) verbosity=1; end if isempty(graph) graph=0; end ## normalize metrics ########################### if norm==1 norm_mean=mean(F); M=repmat(norm_mean,nobs,1); F=F-M; norm_std=std(F); M=repmat(norm_std,nobs,1); F=F./M; else norm_mean=zeros(1,size(F,2)); norm_std=ones(1,size(F,2)); end K=ones(nobs,nclust); ## if start_cent is empty we randomise if min(size(start_cent))==0 ##initial centroids lowest=min(F); range=max(F)-lowest; for i=1:nclust Cent(i,:)=rand(1,nfeats).*range+lowest; end else Cent=start_cent; end running=1; pass=1; max_pass = 1000; if fuzz==0 ############## Use non-fuzzy kmeans algorithm ############### if verbosity fprintf("Using NON-fuzzy k-means algorithm\n"); end ##initial L2 norms for i=1:nclust M=repmat(Cent(i,:),nobs,1); if ( metric ==2 ) L(:,i)=sqrt(sum(((F-M).^2),2)); elseif ( metric == 1 ) L(:,i)=(sum((abs(F-M)),2)); elseif ( metric == Inf ) L(:,i)=max(abs(F-M)')'; else L(:,i)=(sum(((F-M).^metric),2)).^(1/metric); endif end #####Start the loop while running if verbosity fprintf("Pass no. %d\n",pass); end ## what's moving for i=1:nclust buf="find (("; next=i+1; for j=1:nclust-1 if (next>nclust) next=1; end buf=sprintf("%s L(:,%d)>L(:,%d) ",buf,i,next); next+=1; if ((j+1)~=nclust) buf=sprintf("%s | ",buf); end end buf=sprintf("%s) & K(:,%d)) ",buf,i); eval(["m(i)= length( " buf ");"]); end for i=1:nclust if verbosity fprintf("moving %d from cluster %d\n",m(i),i); end end if(sum(m)==0) running=0; end ##reallocate for i=1:nclust ff=find(K(:,i)); K(ff,i)=zeros(size(ff)); end for i=1:nclust buf="find ("; next=i+1; for j=1:nclust-1 if (next>nclust) next=1; end buf=sprintf("%s L(:,%d) max_pass ) error("exceeded maximum number of iterations"); endif end #while running else ############## Use fuzzy kmeans algorithm ############### if verbosity fprintf("Using fuzzy k-means algorithm\n"); end Cold=Cent; while running if verbosity fprintf("Pass no. %d\n",pass); end ##L2 norms for i=1:nclust M=repmat(Cent(i,:),nobs,1); if ( metric ==2 ) L(:,i)=sqrt(sum(((F-M).^2),2)); elseif ( metric == 1 ) L(:,i)=(sum((abs(F-M)),2)); elseif ( metric == Inf ) L(:,i)=max(abs(F-M)')'; else L(:,i)=(sum(((F-M).^metric),2)).^(1/metric); endif end ## Membership probabilities temp=(1./L).^(fexpon); nmemprob=temp./repmat(sum(temp,2),1,nclust); ##new centroids for i=1:nclust parr=repmat(nmemprob(:,i),1,nfeats); Cent(i,:)=sum((parr.^fexpon).*F)./sum(parr.^fexpon); end ## Stopping crit nm=sqrt(sum(sum((Cent - Cold).^2))); if nm < elim running=0; end Cold=Cent; pass+=1; if ( pass > max_pass ) error("exceeded maximum number of iterations"); endif end #while running ## Calculate the other things K=zeros(nobs,nclust); [m,im]=max(nmemprob'); ##euclidean centroids for i=1:nclust II=find(im==i); if length(II)==1 Cent(i,:)=F(II,:); else Cent(i,:)=mean(F(II,:)); end end ##Euclidean L2 norms for i=1:nclust M=repmat(Cent(i,:),nobs,1); if ( metric ==2 ) L(:,i)=sqrt(sum(((F-M).^2),2)); elseif ( metric == 1 ) L(:,i)=(sum((abs(F-M)),2)); elseif ( metric == Inf ) L(:,i)=max(abs(F-M)')'; else L(:,i)=(sum(((F-M).^metric),2)).^(1/metric); endif end for i=1:nclust II=find(im==i); K(II,i)=ones(size(II))'; end end ##Fuzzy kmeans algorithm if verbosity fprintf('Elapsed time= %f\n',(now()-start_day)*24*60*60); fprintf("Cluster metric norms are:\n"); if ( metric ==2 ) lnm=sqrt(sum(L.^2)); elseif ( metric == 1 ) lnm=sum(abs(L)); elseif ( metric == Inf ) lnm=max(abs(L))'; else lnm=(sum(L.^metric)).^(1/metric); endif for i=1:length(lnm) fprintf("%f\t",lnm(i)); end fprintf("\n"); ##fprintf("\nTotal norm= %f %f\n",norm(lnm),norm(L,"fro")); end if graph ## visualise cluster metrics if (nclust==2) ## easy hold off; xlabel("Norm 1"); ylabel("Norm 2"); plot(L(find(K(:,1)),1),L(find(K(:,1)),2),"r*;Clust 1;"); hold on; plot(L(find(K(:,2)),1),L(find(K(:,2)),2),"b*;Clust 2;"); hold off; end if (nclust==3) ## tricky I=find(K(:,1)); II=find(K(:,2)); III=find(K(:,3)); hold off; polar([2*pi/3 2*pi/3 4*pi/3 4*pi/3 2*pi 2*pi],[0 10 0 10 0 10],"-o"); axis('square'); legend('off'); hold on; ## [theta,r]=cart2pol(L(I,1),L(I,2)); theta=theta/(pi/2)*(2*pi/3); polar(theta,r,"*r"); [theta,r]=cart2pol(L(II,1),L(II,2)); theta=theta/(pi/2)*(2*pi/3); polar(theta,r,"*b"); [theta,r]=cart2pol(L(III,1),L(III,2)); theta=theta/(pi/2)*(2*pi/3); polar(theta,r,"*g"); ## [theta,r]=cart2pol(L(I,2),L(I,3)); theta=theta/(pi/2)*(2*pi/3)+(2*pi/3); polar(theta,r,"*r"); [theta,r]=cart2pol(L(II,2),L(II,3)); theta=theta/(pi/2)*(2*pi/3)+(2*pi/3); polar(theta,r,"*b"); [theta,r]=cart2pol(L(III,2),L(III,3)); theta=theta/(pi/2)*(2*pi/3)+(2*pi/3); polar(theta,r,"*g"); ## [theta,r]=cart2pol(L(I,3),L(I,1)); theta=theta/(pi/2)*(2*pi/3)+(4*pi/3); polar(theta,r,"*r"); [theta,r]=cart2pol(L(II,3),L(II,1)); theta=theta/(pi/2)*(2*pi/3)+(4*pi/3); polar(theta,r,"*b"); [theta,r]=cart2pol(L(III,3),L(III,1)); theta=theta/(pi/2)*(2*pi/3)+(4*pi/3); polar(theta,r,"*g"); ## hold off; end if ( nclust > 3 ) ## .er... fprintf("Can't visualise metrics for nclust > 3.\n"); endif end endfunction --------------040801060902090907080301 Content-Type: text/plain; name="kmeans_demo.m" Content-Transfer-Encoding: 7bit Content-Disposition: inline; filename="kmeans_demo.m" ## Copyright (C) 2004 Jeff Armitstead ## ## This file is part of Octave. ## ## Octave is free software; you can redistribute it and/or modify it ## under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2, or (at your option) ## any later version. ## ## Octave is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Octave; see the file COPYING. If not, write to the Free ## Software Foundation, 59 Temple Place - Suite 330, Boston, MA ## 02111-1307, USA. ## Author: JPA ## Description: Clustering of data using a (fuzzy) kmeans algorithm ## See the excellent Pattern Classification by Duda, Hart & Stork for details ## of algorithms. ## ## Demonstrate kmeans.m using example data from Duda, Hart & Stork w1=[ -5.01 -8.12 -3.68; -5.43 -3.48 -3.54; 1.08 -5.52 1.66; 0.86 -3.78 -4.11; -2.67 0.63 7.39; 4.94 3.29 2.08; -2.51 2.09 -2.59; -2.25 -2.13 -6.94; 5.56 2.86 -2.26; 1.03 -3.33 4.33 ]; w2=[ -0.91 -0.18 -0.05; 1.3 -2.06 -3.53; -7.75 -4.54 -0.95; -5.47 0.5 3.92; 6.14 5.72 -4.85; 3.6 1.26 4.36; 5.37 -4.63 -3.65; 7.18 1.46 -6.66; -7.39 1.17 6.3; -7.5 -6.32 -0.31 ]; w3=[ 5.35 2.26 8.13; 5.12 3.22 -2.66; -1.34 -5.31 -9.87; 4.48 3.42 5.19; 7.11 2.39 9.21; 7.17 4.33 -0.98; 5.75 3.97 6.65; 0.77 0.27 2.41; 0.9 -0.43 -8.71; 3.52 -0.36 6.43 ]; sel = input("Choose data set [1,2,3]: "); buf=sprintf("%d",sel); eval(["myw=w" buf]); ## F : the test data in feature space ## nclust : the number of clusters to fit ## start_cent : starting centroids, if absent data is randomly initialised to clusters ## fuzz : fuzzy blending ( 0 for non-fuzzy clustering ) ## elim : stopping criterion ## norm : 1=normalise raw data, 0=don't ## metric : 1= L1-norm, 2= L2-norm (Euclidean), ..., Inf = infinity norm. ## verbosity : 1=speak, 0=don't ## graph : 1=graph cluster norms, 0=don't [Cent,L,K,norm_mean,norm_std] =kmeans( myw, 3, [], 1.5, 1e-4, 0, 1, 1, 0); I1=find(K(:,1)); I2=find(K(:,2)); I3=find(K(:,3)); title(["Data Set " buf]); hold off if (length(I1) >0) plot3(myw(I1,1),myw(I1,2),myw(I1,3),"+;cluster 1;") endif hold on if (length(I2) >0) plot3(myw(I2,1),myw(I2,2),myw(I2,3),"+g;cluster 2;") endif if (length(I3) >0) plot3(myw(I3,1),myw(I3,2),myw(I3,3),"+b;cluster 3;") endif hold off --------------040801060902090907080301--