This notebook implements K-Nearest Neighbors (KNN) and a Support Vector Machine (SVM) classifier on the Wisconsin Breast Cancer dataset.
Many thanks to the authors of the data preprocessing tutorial at http://www.cse.msu.edu/~ptan/dmbook/tutorials/tutorial4/tutorial4.html, which the preprocessing steps here follow.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
names = ['id', 'clump_thickness', 'uniform_cell_size', 'uniform_cell_shape',
         'marginal_adhesion', 'single_epithelial_size', 'bare_nuclei',
         'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class']
df = pd.read_csv(url, names=names)
df.tail(10)
According to the data description at https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original), missing values are encoded as '?'.
# Preprocess the data: mark '?' entries as missing
df = df.replace('?', np.nan)
print('Number of instances = %d' % (df.shape[0]))
print('Number of attributes = %d' % (df.shape[1]))
print('Number of missing values:')
for col in df.columns:
    print('\t%s: %d' % (col, df[col].isna().sum()))
Note that only bare_nuclei contains missing values.
# bare_nuclei is read in as strings because of the '?' entries, so convert it to
# numeric before imputing the missing values with the column median
df['bare_nuclei'] = pd.to_numeric(df['bare_nuclei'])
df['bare_nuclei'] = df['bare_nuclei'].fillna(df['bare_nuclei'].median())
print('Number of missing values:')
for col in df.columns:
    print('\t%s: %d' % (col, df[col].isna().sum()))
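As an aside, the same median imputation can be expressed with scikit-learn's SimpleImputer; a minimal equivalent sketch (the imputation has already been applied above, so this is for illustration only):
from sklearn.impute import SimpleImputer
# Median imputation of bare_nuclei, equivalent to the fillna call above
imputer = SimpleImputer(strategy='median')
df[['bare_nuclei']] = imputer.fit_transform(df[['bare_nuclei']])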
Boxplots are effective in identifying outliers.
# Describe the dataset
df.info()
Note that every column is now numeric; bare_nuclei was the only column originally stored as string objects (because of the '?' entries), which is why it was converted with pd.to_numeric above.
%matplotlib inline
df_new = df.drop(['class', 'id'], axis=1)
df_new.boxplot(figsize=(20, 3))
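A boxplot flags points that lie more than 1.5 times the interquartile range beyond the quartiles; as a sketch, the same rule can be applied per column to count the flagged values:
# Count values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for each attribute
q1, q3 = df_new.quantile(0.25), df_new.quantile(0.75)
iqr = q3 - q1
outliers = ((df_new < q1 - 1.5 * iqr) | (df_new > q3 + 1.5 * iqr)).sum()
print(outliers)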
The plots suggest that the columns marginal_adhesion, single_epithelial_size, bland_chromatin, normal_nucleoli, and mitoses contain outliers.
To deal with the outliers, we can compute z-scores for each attribute and remove the instances with a z-score greater than 3 or less than -3.
# Standardize each attribute: z = (x - mean) / std
df_std = (df_new - df_new.mean()) / df_new.std()
# Keep only the rows where all 9 attributes have a z-score in (-3, 3]
df = df.loc[((df_std > -3) & (df_std <= 3)).all(axis=1), :]
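The same filter can also be written with scipy.stats.zscore; a minimal sketch for illustration only (not run against the already-filtered df):
from scipy import stats
# ddof=1 matches the sample standard deviation used by pandas' std()
z = np.abs(stats.zscore(df_new, ddof=1))
mask = (z <= 3).all(axis=1)  # True for rows with no attribute beyond 3 standard deviations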
Now we can check whether the data contains duplicate rows.
# duplicated() returns a Boolean Series indicating whether each row is a duplicate of an earlier row in the table
dups = df.duplicated()
print('Number of duplicate rows = %d' % (dups.sum()))
df = df.drop_duplicates()
print('Number of rows after discarding duplicates = %d' % (df.shape[0]))
df = df.drop('id', axis=1)
# Create scatter plot matrix
scatter_matrix(df, figsize = (18,18))
plt.show()
# Create the feature matrix X and target vector y for training
X = np.array(df.drop(['class'], axis=1))
y = np.array(df['class'])
# Testing options
seed = 8
scoring = 'accuracy'
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
# Define models to train
models = []
models.append(('KNN', KNeighborsClassifier(n_neighbors = 5)))
models.append(('SVM', SVC()))
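Both KNN and SVM are sensitive to feature scale; as an optional sketch (not part of the original workflow), each model could be wrapped in a Pipeline with StandardScaler so that scaling is fitted inside every cross-validation fold, and the pair evaluated with the same loop below:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Hypothetical scaled variants of the two models
scaled_models = [
    ('KNN (scaled)', make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))),
    ('SVM (scaled)', make_pipeline(StandardScaler(), SVC())),
]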
# Evaluate each model in turn with 10-fold cross-validation on the training set
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Make predictions on the validation dataset
for name, model in models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(name)
    print(accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))
# Accuracy  - ratio of correctly predicted observations to the total observations.
# Precision - ratio of correctly predicted positive observations to all predicted positive observations (penalizes false positives).
# Recall (sensitivity) - ratio of correctly predicted positive observations to all observations actually in the positive class (penalizes false negatives).
# F1 score  - harmonic mean of precision and recall, so it takes both false positives and false negatives into account.
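To make these definitions concrete, here is a small worked example with hypothetical confusion-matrix counts (not taken from this dataset):
# Hypothetical counts for the positive class: true/false positives and negatives
tp, fp, fn, tn = 40, 3, 2, 95
precision = tp / (tp + fp)                          # 40/43 ≈ 0.930
recall = tp / (tp + fn)                             # 40/42 ≈ 0.952
f1 = 2 * precision * recall / (precision + recall)  # ≈ 0.941
accuracy = (tp + tn) / (tp + fp + fn + tn)          # 135/140 ≈ 0.964
print(precision, recall, f1, accuracy)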
# Train a fresh SVM and score it on the held-out test set
clf = SVC()
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)
# Predict the class of a single new sample (2 = benign, 4 = malignant)
example_measures = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1]])
prediction = clf.predict(example_measures)
print(prediction)
We see that both the KNN and support vector machine classifiers perform very well on this task!