dataset
https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data
Breast Cancer Wisconsin (Diagnostic) Data Set
Predict whether the cancer is benign or malignant
www.kaggle.com
data.csv
0.12MB
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
# Load the data set and discard the two columns that are not used as model
# inputs: the 'id' column and the 'Unnamed: 32' column.
df = pd.read_csv('/content/data.csv').drop(columns=['id', 'Unnamed: 32'])
# Notebook-style preview of the first rows.
df.head()
# Separate the predictor columns from the target label.
features = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# Split BEFORE scaling: fitting the scaler on the full data set would let the
# held-out rows influence the mean/std used for training (data leakage) and
# make the evaluation below optimistic. Same test_size/random_state as before,
# so the row partition itself is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, y, test_size=0.3, random_state=10, shuffle=True)

# Learn the standardisation parameters from the training rows only, then apply
# that same transformation to both partitions.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Preserve the original `scaled_features` / `X` names: the full data set,
# scaled with the training-set parameters.
scaled_features = scaler.transform(features)
X = scaled_features

# Baseline k-NN classifier with k=5, evaluated on the held-out 30 %.
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)
model_predict = model.predict(X_test)
print(confusion_matrix(y_test, model_predict))
print(classification_report(y_test, model_predict))

from sklearn.model_selection import cross_val_score

# Candidate values of k: the odd numbers 1, 3, ..., 49.
neighbors = list(range(1, 51, 2))
cv_scores = []

# Perform 10-fold cross-validation on the training split for every candidate k
# and record the mean accuracy across the folds.
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(
        knn, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

# Misclassification error = 1 - mean accuracy. Despite its name, `MSE` is not
# a mean squared error; the name is kept because the plot uses it.
MSE = [1 - x for x in cv_scores]

# Determine the best k: the first candidate with the lowest error.
optimal_k = neighbors[MSE.index(min(MSE))]
print('The optimal number of neighbors is %d' % optimal_k)

import matplotlib.pyplot as plt

# Plot misclassification error versus k on a 10x6 inch figure.
_, error_axes = plt.subplots(figsize=(10, 6))
error_axes.plot(neighbors, MSE)
error_axes.set_xlabel('Number of neighbors')
error_axes.set_ylabel('Misclassification Error')
plt.show()

# Raw (unscaled) measurements for one new patient. The 30 values must be in
# the same order as the feature columns of `df` (every column except
# 'diagnosis').
new_patient = [
    11.76,
    18.14,
    75,
    431.1,
    0.09968,
    0.05914,
    0.02685,
    0.03515,
    0.1619,
    0.06287,
    0.645,
    2.105,
    4.138,
    49.11,
    30,  # NOTE(review): far larger than the fractional values that follow - confirm this entry
    0.01721,
    0.05616,
    0.1091,
    0.0207,
    0.002758,
    13.36,
    23.39,
    85.1,
    553.6,
    0.1137,
    0.07974,
    0.0612,
    0.0716,
    0.1978,
    0.06915,
]

# The classifier was trained on standardised features, so the new sample has
# to pass through the same fitted scaler before prediction; feeding the raw
# values would place it on a different scale from the training data.
# Wrapping the sample in a DataFrame gives the scaler the column names it was
# fitted with.
feature_columns = df.columns.drop('diagnosis')
new_patient_frame = pd.DataFrame([new_patient], columns=feature_columns)
new_pred = model.predict(scaler.transform(new_patient_frame))
new_pred

# Print a copy-paste template for a `new_patient` list: one `0` placeholder
# per feature column, followed by the column name as a comment.
cols = df.columns.tolist()
print("new_patient = [")
for item in cols:
    # 'diagnosis' is the label being predicted, not an input feature.
    if item == 'diagnosis':
        continue
    print("0, " + "#" + item)
print("]")

https://github.com/erica00j/machinelearning/blob/main/final_Cancer.ipynb
'인공지능 > Machine Learning' 카테고리의 다른 글
[ML] Tree Based Learning Algorithms - Random Forests (0) | 2022.11.30 |
---|---|
[ML] Tree Based Learning Algorithms - Decision Trees (0) | 2022.11.30 |
[ML] k-NEAREST NEIGHBORS (k-최근접 이웃 알고리즘) (0) | 2022.11.15 |
[ML] Bias & Variance (0) | 2022.11.15 |
[ML] Support Vector Machines, SVM (0) | 2022.11.15 |
댓글