"""Adaptive DBSCAN: derive eps / MinPts candidates from the data itself.

Original question:
    I am doing the DBSCAN clustering in python. I want to achieve an adaptive
    way to return the number of clusters by self calculating its eps and
    Minpts parameters. Below is my code.

Review notes on the original script (all addressed below):
  * ``dist`` compared only the first two coordinates, while the script
    clusters 13 columns. The eps candidates were therefore derived from 2-D
    distances but handed to DBSCAN, which measures distance over every
    column -- a likely reason every point ended up labelled as noise (-1).
  * ``returnMinptsCandidate`` divided by the global ``dataSet`` instead of
    using its own ``DistMatrix`` argument.
  * ``max(labels_)`` is the highest label, not the number of clusters; it is
    -1 when every point is noise and ``k - 1`` when there are ``k`` clusters.
  * ``min_samples`` was passed as a float although DBSCAN documents an int.
"""

import copy  # kept from the original import list; no longer used below
import math

import numpy as np
import pandas as pd


def loadDataSet(fileName, splitChar='\t'):
    """Read a delimited text file of numbers into a list of float rows.

    Args:
        fileName: Path of the text file to read.
        splitChar: Field separator used on every line (tab by default).

    Returns:
        A list with one list of floats per non-blank line.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    dataSet = []
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line would become [''] and make float() fail.
                continue
            dataSet.append([float(field) for field in stripped.split(splitChar)])
    return dataSet


def dist(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``.

    Every coordinate takes part in the distance (the original version only
    looked at indices 0 and 1). If the points differ in length, the extra
    coordinates of the longer one are ignored because ``zip`` stops early.
    """
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))


def returnDk(matrix, k):
    """Return column ``k`` of ``matrix`` as a list (one value per row)."""
    return [row[k] for row in matrix]


def returnDkAverage(Dk):
    """Return the arithmetic mean of ``Dk``.

    Raises:
        ZeroDivisionError: If ``Dk`` is empty.
    """
    return sum(Dk) / len(Dk)


def CalculateDistMatrix(dataset):
    """Return the full pairwise Euclidean distance matrix of ``dataset``.

    Args:
        dataset: 2-D array-like, one row per point, one column per feature.

    Returns:
        A list of ``n`` lists of ``n`` floats where entry ``[i][j]`` is the
        distance between point ``i`` and point ``j``. An empty dataset
        yields ``[]``.

    Raises:
        ValueError: If ``dataset`` is not two-dimensional.
    """
    points = np.asarray(dataset, dtype=float)
    if points.size == 0:
        return []
    if points.ndim != 2:
        raise ValueError("dataset must be a 2-D collection of points")
    # One vectorised row at a time: avoids n*n Python-level dist() calls while
    # only ever holding a single (n, d) temporary in memory.
    return [
        np.sqrt(((points - point) ** 2).sum(axis=1)).tolist()
        for point in points
    ]


def returnEpsCandidate(dataSet, DistMatrix=None):
    """Return the eps candidates: the mean k-th nearest-neighbour distance.

    For every ``k`` in ``1 .. n-1`` the candidate is the average, over all
    points, of the distance to the k-th closest other point (column ``k`` of
    the row-wise sorted distance matrix; column 0 is the point itself).

    Args:
        dataSet: 2-D array-like of points; used to build the distance matrix
            when ``DistMatrix`` is not supplied.
        DistMatrix: Optional precomputed result of ``CalculateDistMatrix`` for
            the same data, so callers that already have it avoid recomputing.

    Returns:
        A list of ``n - 1`` floats (empty when there are fewer than 2 points).
    """
    if DistMatrix is None:
        DistMatrix = CalculateDistMatrix(dataSet)
    if len(DistMatrix) < 2:
        return []
    # np.sort returns a copy, so the caller's matrix is left untouched.
    sorted_rows = np.sort(np.asarray(DistMatrix, dtype=float), axis=1)
    return sorted_rows[:, 1:].mean(axis=0).tolist()


def returnMinptsCandidate(DistMatrix, EpsCandidate):
    """Return, for each eps candidate, the mean eps-neighbourhood size.

    The neighbourhood size of a point is the number of matrix entries in its
    row that are ``<= eps``; this includes the point itself (distance 0).

    Args:
        DistMatrix: Square pairwise distance matrix (list of lists or array).
        EpsCandidate: Sequence of eps values to evaluate.

    Returns:
        A list of floats, one per eps candidate.

    Raises:
        ValueError: If eps candidates are given but ``DistMatrix`` is empty.
    """
    if len(EpsCandidate) == 0:
        return []
    n_points = len(DistMatrix)
    if n_points == 0:
        raise ValueError("DistMatrix must contain at least one point")
    # Sort all distances once; searchsorted(side="right") then gives the number
    # of entries <= eps for every candidate without rescanning the matrix.
    flat_sorted = np.sort(np.asarray(DistMatrix, dtype=float), axis=None)
    counts = np.searchsorted(
        flat_sorted, np.asarray(EpsCandidate, dtype=float), side="right"
    )
    return (counts / n_points).tolist()


def returnClusterNumberList(dataset, EpsCandidate, MinptsCandidate):
    """Run DBSCAN once per (eps, MinPts) pair and count the clusters found.

    Args:
        dataset: 2-D array-like of points to cluster.
        EpsCandidate: eps value for each run.
        MinptsCandidate: MinPts value for each run (may be fractional).

    Returns:
        A list of ints, one per parameter pair: the number of distinct
        non-noise labels. A run in which every point is noise gives 0.
    """
    # Imported here so the NumPy-only helpers above stay importable on systems
    # without scikit-learn installed.
    from sklearn.cluster import DBSCAN

    np_dataset = np.array(dataset)
    ClusterNumberList = []
    for eps, minpts in zip(EpsCandidate, MinptsCandidate):
        # min_samples is documented as an int. Rounding up keeps the same
        # "neighbour count >= threshold" outcome as the fractional mean would.
        min_samples = max(1, math.ceil(minpts))
        clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(np_dataset)
        # Label -1 marks noise; count only real cluster labels.
        cluster_labels = set(clustering.labels_.tolist()) - {-1}
        ClusterNumberList.append(len(cluster_labels))
    return ClusterNumberList


def main():
    """Load the MFCC feature table and print the candidate lists."""
    data = pd.read_csv('/Users/Desktop/Mic/recorder_test1/New folder/MFCCresultsforclustering/MFCCresultsforclustering.csv')
    dataSet = data.iloc[:, 0:13].values
    # Build the distance matrix once and share it between both candidate steps.
    DistMatrix = CalculateDistMatrix(dataSet)
    EpsCandidate = returnEpsCandidate(dataSet, DistMatrix)
    MinptsCandidate = returnMinptsCandidate(DistMatrix, EpsCandidate)
    ClusterNumberList = returnClusterNumberList(dataSet, EpsCandidate, MinptsCandidate)
    print(EpsCandidate)
    print(MinptsCandidate)
    print('cluster number list is')
    print(ClusterNumberList)


if __name__ == '__main__':
    main()

# Original question (continued):
#   However, the output with the loading data set is all [-1]s. I am wondering
#   where is the mistake. Am I right for this general direction? If not, how
#   can I achieve the adaptive DBSCAN clustering?
# ... View more