實驗報告材料 聚類分析報告
word實驗報告 聚類分析實驗原理:K均值聚類、中心點聚類、系統聚類和EM算法聚類分析技術。實驗題目:用鳶尾花的數據集,進行聚類挖掘分析。實驗要求:探索鳶尾花數據的基本特征,利用不同的聚類挖掘方法,獲得基本結論并簡明解釋。實驗題目-分析報告:data(iris)> rm(list=ls())> gc() used (Mb) gc trigger (Mb) max used (Mb)Ncells 431730 23.1 929718 49.7 607591 32.5Vcells 787605 6.1 8388608 64.0 1592403 12.2> data(iris)> data<-iris> head(data) Sepal.Length Sepal.Width Petal.Length Petal.Width Species1 5.1 3.5 1.4 0.2 setosa2 4.9 3.0 1.4 0.2 setosa3 4.7 3.2 1.3 0.2 setosa4 4.6 3.1 1.5 0.2 setosa5 5.0 3.6 1.4 0.2 setosa6 5.4 3.9 1.7 0.4 setosa#Kmean聚類分析> newiris <- iris> newiris$Species <- NULL> (kc <- kmeans(newiris, 3))K-means clustering with 3 clusters of sizes 62, 50, 38Cluster means: Sepal.Length Sepal.Width Petal.Length Petal.Width1 5.901613 2.748387 4.393548 1.4338712 5.006000 3.428000 1.462000 0.2460003 6.850000 3.073684 5.742105 2.071053Clustering vector: 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 41 2 2 2 2 2 2 2 2 2 2 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 81 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 3 3 3 3 1 3 3 3 3 3 3 1 1 3 3 3 3 1121 3 1 3 1 3 3 1 1 3 3 3 3 3 1 3 3 3 3 1 3 3 3 1 3 3 3 1 3 3 1Within cluster sum of squares by cluster:1 39.82097 15.15100 23.87947 (between_SS / total_SS = 88.4 %)Available components: 1 "cluster" "centers" "totss" "withinss" "tot.withinss"6 "betweenss" "size" "iter" "ifault" > table(iris$Species, kc$cluster) 1 2 3 setosa 0 50 0 versicolor 48 0 2 virginica 14 0 36> plot(newiris[c("Sepal.Length", "Sepal.Width")], col = kc$cluster)> points(kc$centers[,c("Sepal.Length", "Sepal.Width")], col = 1:3, pch = 8, cex=2)#K-Medoids 進行聚類分析> install.packages("cluster")> library(cluster)> iris.pam<-pam(iris,3)> table(iris$Species,iris.pam$clustering) 1 2 3 setosa 50 0 0 versicolor 0 3 47 virginica 0 49 1> layout(matrix(c(1,2),1,2))> plot(iris.pam)> layout(matrix(1))#hc> iris.hc <- hclust( dist(iris[,1:4]))> plot( 
iris.hc, hang = -1)> plclust( iris.hc, labels = FALSE, hang = -1)> re <- rect.hclust(iris.hc, k = 3)> iris.id <- cutree(iris.hc, 3)#利用剪枝函數cutree()參數h控制輸出height=18時的系譜類別> sapply(unique(iris.id),+ function(g)iris$Species[iris.id==g])1 1 setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa12 setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa23 setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa34 setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa45 setosa setosa setosa setosa setosa setosaLevels: setosa versicolor virginica2 1 versicolor versicolor versicolor versicolor versicolor versicolor versicolor 8 versicolor versicolor versicolor versicolor versicolor versicolor versicolor15 versicolor versicolor versicolor versicolor versicolor versicolor versicolor22 versicolor versicolor virginica virginica virginica virginica virginica 29 virginica virginica virginica virginica virginica virginica virginica 36 virginica virginica virginica virginica virginica virginica virginica 43 virginica virginica virginica virginica virginica virginica virginica 50 virginica virginica virginica virginica virginica virginica virginica 57 virginica virginica virginica virginica virginica virginica virginica 64 virginica virginica virginica virginica virginica virginica virginica 71 virginica virginica Levels: setosa versicolor virginica3 1 versicolor versicolor versicolor versicolor versicolor versicolor versicolor 8 versicolor versicolor versicolor versicolor versicolor versicolor versicolor15 versicolor versicolor versicolor versicolor versicolor versicolor versicolor22 versicolor versicolor versicolor versicolor versicolor versicolor virginica Levels: setosa versicolor virginica> plot(iris.hc)> rect.hclust(iris.hc,k=4,border="light grey")#用淺灰色矩形框出4分類聚類結果> rect.hclust(iris.hc,k=3,border="dark grey")#用深灰色矩形框出3分類聚類結果> rect.hclust(iris.hc,k=7,which=c(2,6),border="dark 
grey")# DBSCAN #基于密度的聚類> install.packages("fpc")> library(fpc)> ds1=dbscan(iris[,1:4],eps=1,MinPts=5)#半徑參數為1,密度閾值為5> ds1dbscan Pts=150 MinPts=5 eps=1 1 2border 0 1seed 50 99total 50 100> ds2=dbscan(iris[,1:4],eps=4,MinPts=5)> ds3=dbscan(iris[,1:4],eps=4,MinPts=2)> ds4=dbscan(iris[,1:4],eps=8,MinPts=2)> par(mfcol=c(2,2))> plot(ds1,iris[,1:4],main="1: MinPts=5 eps=1")> plot(ds3,iris[,1:4],main="3: MinPts=2 eps=4")> plot(ds2,iris[,1:4],main="2: MinPts=5 eps=4")> plot(ds4,iris[,1:4],main="4: MinPts=2 eps=8")> d=dist(iris[,1:4])#計算數據集的距離矩陣d> max(d);min(d)#計算數據集樣本的距離的最值1 7.0851961 0> install.packages("ggplot2")> library(ggplot2)> interval=cut_interval(d,30)> table(interval)interval 0,0.236 (0.236,0.472 (0.472,0.709 (0.709,0.945 (0.945,1.18 (1.18,1.42 88 585 876 891 831 688 (1.42,1.65 (1.65,1.89 (1.89,2.13 (2.13,2.36 (2.36,2.6 (2.6,2.83 543 369 379 339 335 406 (2.83,3.07 (3.07,3.31 (3.31,3.54 (3.54,3.78 (3.78,4.01 (4.01,4.25 458 459 465 480 468 505 (4.25,4.49 (4.49,4.72 (4.72,4.96 (4.96,5.2 (5.2,5.43 (5.43,5.67 349 385 321 291 187 (5.67,5.9 (5.9,6.14 (6.14,6.38 (6.38,6.61 (6.61,6.85 (6.85,7.09 97 92 78 50 18 4 > which.max(table(interval))(0.709,0.945 4 > for(i in 3:5){+ for(j in 1:10){+ ds=dbscan(iris[,1:4],eps=i,MinPts=j)+ print(ds)+ }+ }dbscan Pts=150 MinPts=1 eps=3 1seed 150total 150dbscan Pts=150 MinPts=2 eps=3 1seed 150total 150dbscan Pts=150 MinPts=3 eps=3 1seed 150total 150dbscan Pts=150 MinPts=4 eps=3 1seed 150total 150dbscan Pts=150 MinPts=5 eps=3 1seed 150total 150dbscan Pts=150 MinPts=6 eps=3 1seed 150total 150dbscan Pts=150 MinPts=7 eps=3 1seed 150total 150dbscan Pts=150 MinPts=8 eps=3 1seed 150total 150dbscan Pts=150 MinPts=9 eps=3 1seed 150total 150dbscan Pts=150 MinPts=10 eps=3 1seed 150total 150dbscan Pts=150 MinPts=1 eps=4 1seed 150total 150dbscan Pts=150 MinPts=2 eps=4 1seed 150total 150dbscan Pts=150 MinPts=3 eps=4 1seed 150total 150dbscan Pts=150 MinPts=4 eps=4 1seed 150total 150dbscan Pts=150 MinPts=5 eps=4 1seed 150total 150dbscan Pts=150 
MinPts=6 eps=4 1seed 150total 150dbscan Pts=150 MinPts=7 eps=4 1seed 150total 150dbscan Pts=150 MinPts=8 eps=4 1seed 150total 150dbscan Pts=150 MinPts=9 eps=4 1seed 150total 150dbscan Pts=150 MinPts=10 eps=4 1seed 150total 150dbscan Pts=150 MinPts=1 eps=5 1seed 150total 150dbscan Pts=150 MinPts=2 eps=5 1seed 150total 150dbscan Pts=150 MinPts=3 eps=5 1seed 150total 150dbscan Pts=150 MinPts=4 eps=5 1seed 150total 150dbscan Pts=150 MinPts=5 eps=5 1seed 150total 150dbscan Pts=150 MinPts=6 eps=5 1seed 150total 150dbscan Pts=150 MinPts=7 eps=5 1seed 150total 150dbscan Pts=150 MinPts=8 eps=5 1seed 150total 150dbscan Pts=150 MinPts=9 eps=5 1seed 150total 150dbscan Pts=150 MinPts=10 eps=5 1seed 150total 150#30次dbscan的聚類結果> ds5=dbscan(iris[,1:4],eps=3,MinPts=2)> ds6=dbscan(iris[,1:4],eps=4,MinPts=5)> ds7=dbscan(iris[,1:4],eps=5,MinPts=9)> par(mfcol=c(1,3))> plot(ds5,iris[,1:4],main="1: MinPts=2 eps=3")> plot(ds6,iris[,1:4],main="3: MinPts=5 eps=4")> plot(ds7,iris[,1:4],main="2: MinPts=9 eps=5")# EM 期望最大化聚類> install.packages("mclust")> library(mclust)> fit_EM=Mclust(iris[,1:4])fitting . 
|=| 100%> summary(fit_EM)- Gaussian finite mixture model fitted by EM algorithm - Mclust VEV (ellipsoidal, equal shape) model with 2 components: log.likelihood n df BIC ICL -215.726 150 26 -561.7285 -561.7289Clustering table: 1 2 50 100 > summary(fit_EM,parameters=TRUE)- Gaussian finite mixture model fitted by EM algorithm - Mclust VEV (ellipsoidal, equal shape) model with 2 components: log.likelihood n df BIC ICL -215.726 150 26 -561.7285 -561.7289Clustering table: 1 2 50 100 Mixing probabilities: 1 2 0.3333319 0.6666681 Means: ,1 ,2Sepal.Length 5.0060022 6.261996Sepal.Width 3.4280049 2.871999Petal.Length 1.4620007 4.905992Petal.Width 0.2459998 1.675997Variances:,1 Sepal.Length Sepal.Width Petal.Length Petal.WidthSepal.Length 0.15065114 0.13080115 0.02084463 0.01309107Sepal.Width 0.13080115 0.17604529 0.01603245 0.01221458Petal.Length 0.02084463 0.01603245 0.02808260 0.00601568Petal.Width 0.01309107 0.01221458 0.00601568 0.01042365,2 Sepal.Length Sepal.Width Petal.Length Petal.WidthSepal.Length 0.4000438 0.10865444 0.3994018 0.14368256Sepal.Width 0.1086544 0.10928077 0.1238904 0.07284384Petal.Length 0.3994018 0.12389040 0.6109024 0.25738990Petal.Width 0.1436826 0.07284384 0.2573899 0.16808182> plot(fit_EM)#對EM聚類結果作圖Model-based clustering plots: 1: BIC2: classification3: uncertainty4: densitySelection:(下面顯示選項)#選1#選2#選3#選4Selection: 0> iris_BIC=mclustBIC(iris[,1:4])fitting . 
|=| 100%> iris_BICsum=summary(iris_BIC,data=iris[,1:4])> iris_BICsum #獲取數據集iris在各模型和類別數下的BIC值Best BIC values: VEV,2 VEV,3 VVV,2BIC -561.7285 -562.5522369 -574.01783BIC diff 0.0000 -0.8237748 -12.28937Classification table for model (VEV,2): 1 2 50 100 > iris_BICBayesian Information Criterion (BIC): EII VII EEI VEI EVI VVI EEE1 -1804.0854 -1804.0854 -1522.1202 -1522.1202 -1522.1202 -1522.1202 -829.97822 -1123.4117 -1012.2352 -1042.9679 -956.2823 -1007.3082 -857.5515 -688.09723 -878.7650 -853.8144 -813.0504 -779.1566 -797.8342 -744.6382 -632.96474 -893.6140 -812.6048 -827.4036 -748.4529 -837.5452 -751.0198 -646.02585 -782.6441 -742.6083 -741.9185 -688.3463 -766.8158 -711.4502 -604.81316 -715.7136 -705.7811 -693.7908 -676.1697 -774.0673 -707.2901 -609.85437 -731.8821 -698.5413 -713.1823 -680.7377 -813.5220 -766.6500 -632.49478 -725.0805 -701.4806 -691.4133 -679.4640 -740.4068 -764.1969 -639.26409 -694.5205 -700.0276 -696.2607 -702.0143 -767.8044 -755.8290 -653.0878 EVE VEE VVE EEV VEV EVV VVV1 -829.9782 -829.9782 -829.9782 -829.9782 -829.9782 -829.9782 -829.97822 -657.2263 -656.3270 -605.1841 -644.5997 -561.7285 -658.3306 -574.01783 -666.5491 -605.3982 -636.4259 -644.7810 -562.5522 -656.0359 -580.83964 -705.5435 -604.8371 -639.7078 -699.8684 -602.0104 -725.2925 -630.60005 -723.7199 NA -632.2056 -652.2959 -634.2890 NA -676.60616 -661.9497 -609.5584 -664.8224 -664.4537 -679.5116 NA -754.79387 -699.5102 NA -690.6108 -709.9530 -704.7699 -809.8276 -806.92778 -700.4277 -654.8237 -709.9392 -735.4463 -712.8788 -831.7520 -830.63739 -729.6651 NA -734.2997 -758.9348 -748.8237 -882.4391 -883.6931Top 3 models based on the BIC criterion: VEV,2 VEV,3 VVV,2 -561.7285 -562.5522 -574.0178 > par(mfcol=c(1,1))> plot(iris_BIC,G=1:7,col="yellow")> mclust2Dplot(iris[,1:2], + classification=iris_BICsum$classification,+ parameters=iris_BICsum$parameters,col="yellow")> iris_Dens=densityMclust(iris[,1:2])# 對每一個樣本進行密度估計fitting . 
|=| 100%> iris_Dens'densityMclust' model object: (VEV,2) Available components: 1 "call" "data" "modelName" "n" 5 "d" "G" "BIC" "bic" 9 "loglik" "df" "hypvol" "parameters" 13 "z" "classification" "uncertainty" "density" > plot(iris_Dens,iris[,1:2],col="yellow",nlevels=55) #輸入1或2Model-based density estimation plots: 1: BIC2: densitySelection:(下面顯示選項)#選1#選2Selection: 0> plot(iris_Dens,type = "persp",col = grey(0.8))Model-based density estimation plots: 1: BIC2: densitySelection:(下面顯示選項)#選1#選2Selection: 0 28 / 28