본문 바로가기
인공지능/Machine Learning

[ML] k-NEAREST NEIGHBORS 예제

by 유일리 2022. 11. 15.

dataset

https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data

 

Breast Cancer Wisconsin (Diagnostic) Data Set

Predict whether the cancer is benign or malignant

www.kaggle.com

data.csv
0.12MB

import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Load the Breast Cancer Wisconsin (Diagnostic) CSV.
df = pd.read_csv('/content/data.csv')

# Drop the two columns that are not used as features or label.
del df['id']
del df['Unnamed: 32']

df.head()

# Separate the feature columns from the 'diagnosis' label column.
features = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# Split BEFORE scaling so that the scaler never sees the held-out rows.
# Fitting StandardScaler on the full dataset leaks test-set mean/variance
# into preprocessing and makes the evaluation below optimistic.
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.3, random_state=10, shuffle=True)

# Learn mean/std from the training rows only, then apply the same
# transformation to both partitions.
scaler = StandardScaler()
scaler.fit(features_train)
X_train = scaler.transform(features_train)
X_test = scaler.transform(features_test)

# Kept for backward compatibility with code that expects the full scaled
# matrix; it is produced with the training-set statistics.
scaled_features = scaler.transform(features)
X = scaled_features

# Baseline k-NN classifier with 5 neighbours.
model = KNeighborsClassifier(n_neighbors=5)

model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Report performance on the held-out 30% of the data.
print(confusion_matrix(y_test, model_predict))
print(classification_report(y_test, model_predict))

from sklearn.model_selection import cross_val_score

# Candidate neighbourhood sizes: the odd values 1, 3, ..., 49.
neighbors = list(range(1, 51, 2))
cv_scores = []

# Mean 10-fold cross-validated accuracy on the training split for each k.
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(
        knn, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

# Misclassification error is the complement of accuracy.
MSE = [1 - accuracy for accuracy in cv_scores]

# Determine the best k: the first candidate with the lowest error.
lowest_error = min(MSE)
optimal_k = neighbors[MSE.index(lowest_error)]
print('The optimal number of neighbors is %d' % optimal_k)

import matplotlib.pyplot as plt

# Plot misclassification error versus k.
plt.figure(figsize=(10, 6))
plt.plot(neighbors, MSE)
plt.xlabel('Number of neighbors')
plt.ylabel('Misclassification Error')
plt.show()

# Raw (unscaled) measurements for one new sample, one value per feature
# column, in the same order as the feature columns of `df`.
new_patient = [
    11.76,
    18.14,
    75,
    431.1,
    0.09968,
    0.05914,
    0.02685,
    0.03515,
    0.1619,
    0.06287,
    0.645,
    2.105,
    4.138,
    49.11,
    # NOTE(review): 30 looks out of scale next to its neighbours - confirm
    # this value against the intended column.
    30,
    0.01721,
    0.05616,
    0.1091,
    0.0207,
    0.002758,
    13.36,
    23.39,
    85.1,
    553.6,
    0.1137,
    0.07974,
    0.0612,
    0.0716,
    0.1978,
    0.06915
]

# The model was trained on standardized features, so the new sample must go
# through the same fitted scaler before prediction; feeding raw values would
# place it in a different feature space and make the prediction meaningless.
# Wrapping the values in a DataFrame with the feature column names matches
# what the scaler was fitted on.
feature_names = df.columns.drop('diagnosis')
new_patient_frame = pd.DataFrame([new_patient], columns=feature_names)
new_patient_scaled = scaler.transform(new_patient_frame)

new_pred = model.predict(new_patient_scaled)
new_pred

# Print a copy-pasteable template for the `new_patient` list: one placeholder
# value per feature column, annotated with the column name.
# The label column 'diagnosis' is skipped because it is not an input feature.
cols = [col for col in df.columns.tolist() if col != 'diagnosis']
print("new_patient = [")
for item in cols:
    # "0" is a literal placeholder digit; the original "\0" escape emitted a
    # NUL character instead.
    print("    0,  # " + item)
print("]")

https://github.com/erica00j/machinelearning/blob/main/final_Cancer.ipynb

댓글