import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import norm, skew
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
%config InlineBackend.figure_format = 'retina'
# Model Accuracies
ml_accuracies = dict()
# Colors
colors = ['lightcoral',
'brown',
'lightseagreen',
'maroon',
'deeppink',
'darkorange',
'royalblue',
'darkviolet',
'gold',
'crimson',
'lightsteelblue',
'salmon',
'mediumseagreen',
'olivedrab',
'blue',
'limegreen',
'slateblue',
'red',
'steelblue',
'teal',
'peru',
'dimgray',
'violet',
'cyan']
1 - Data Meaning
Column | Meaning |
---|---|
Age | The age of the patient. (Numeric) |
Gender | The gender of the patient. (Categorical) |
Air Pollution | The level of air pollution exposure of the patient. (Categorical) |
Alcohol use | The level of alcohol use of the patient. (Categorical) |
Dust Allergy | The level of dust allergy of the patient. (Categorical) |
OccuPational Hazards | The level of occupational hazards of the patient. (Categorical) |
Genetic Risk | The level of genetic risk of the patient. (Categorical) |
chronic Lung Disease | The level of chronic lung disease of the patient. (Categorical) |
Balanced Diet | The level of balanced diet of the patient. (Categorical) |
Obesity | The level of obesity of the patient. (Categorical) |
Smoking | The level of smoking of the patient. (Categorical) |
Passive Smoker | The level of passive smoker of the patient. (Categorical) |
Chest Pain | The level of chest pain of the patient. (Categorical) |
Coughing of Blood | The level of coughing of blood of the patient. (Categorical) |
Fatigue | The level of fatigue of the patient. (Categorical) |
Weight Loss | The level of weight loss of the patient. (Categorical) |
Shortness of Breath | The level of shortness of breath of the patient. (Categorical) |
Wheezing | The level of wheezing of the patient. (Categorical) |
Swallowing Difficulty | The level of swallowing difficulty of the patient. (Categorical) |
Clubbing of Finger Nails | The level of clubbing of finger nails of the patient. (Categorical) |
Frequent Cold | The level of frequent cold of the patient. (Categorical) |
Dry Cough | The level of dry cough of the patient. (Categorical) |
Snoring | The level of snoring of the patient. (Categorical) |
Level | The severity level of the patient's cancer: low, medium or high. (Categorical) |

In this notebook, after exploring the data, I will aim to develop a predictive method for the severity level of the cancer. By analyzing this data, we'll gain insight into what contributes to lung cancer and how best to prevent it as individuals.

2 - Importing and Initial Analysis
df = pd.read_csv("/content/cancer patient data sets.csv", index_col='index')
# The index column now identifies each patient; dropping Patient Id removes any prediction bias a chronological identifier could induce.
df.drop("Patient Id", axis=1, inplace=True)
# Clean the column names so they are consistent and easy to work with.
df.rename(columns=str.lower, inplace=True)
df.rename(columns={col: col.replace(" ", "_") for col in df.columns}, inplace=True)
# View the first 5 rows of the database.
df.head(5)
age | gender | air_pollution | alcohol_use | dust_allergy | occupational_hazards | genetic_risk | chronic_lung_disease | balanced_diet | obesity | ... | fatigue | weight_loss | shortness_of_breath | wheezing | swallowing_difficulty | clubbing_of_finger_nails | frequent_cold | dry_cough | snoring | level | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||||||||
0 | 33 | 1 | 2 | 4 | 5 | 4 | 3 | 2 | 2 | 4 | ... | 3 | 4 | 2 | 2 | 3 | 1 | 2 | 3 | 4 | Low |
1 | 17 | 1 | 3 | 1 | 5 | 3 | 4 | 2 | 2 | 2 | ... | 1 | 3 | 7 | 8 | 6 | 2 | 1 | 7 | 2 | Medium |
2 | 35 | 1 | 4 | 5 | 6 | 5 | 5 | 4 | 6 | 7 | ... | 8 | 7 | 9 | 2 | 1 | 4 | 6 | 7 | 2 | High |
3 | 37 | 1 | 7 | 7 | 7 | 7 | 6 | 7 | 7 | 7 | ... | 4 | 2 | 3 | 1 | 4 | 5 | 6 | 7 | 5 | High |
4 | 46 | 1 | 6 | 8 | 7 | 7 | 7 | 6 | 7 | 7 | ... | 3 | 2 | 4 | 1 | 4 | 2 | 4 | 2 | 3 | High |
5 rows × 24 columns
# To understand the data further, we need to identify the data types and check for any corrupt or missing fields.
print(df.info())
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   age                       1000 non-null   int64
 1   gender                    1000 non-null   int64
 2   air_pollution             1000 non-null   int64
 3   alcohol_use               1000 non-null   int64
 4   dust_allergy              1000 non-null   int64
 5   occupational_hazards      1000 non-null   int64
 6   genetic_risk              1000 non-null   int64
 7   chronic_lung_disease      1000 non-null   int64
 8   balanced_diet             1000 non-null   int64
 9   obesity                   1000 non-null   int64
 10  smoking                   1000 non-null   int64
 11  passive_smoker            1000 non-null   int64
 12  chest_pain                1000 non-null   int64
 13  coughing_of_blood         1000 non-null   int64
 14  fatigue                   1000 non-null   int64
 15  weight_loss               1000 non-null   int64
 16  shortness_of_breath       1000 non-null   int64
 17  wheezing                  1000 non-null   int64
 18  swallowing_difficulty     1000 non-null   int64
 19  clubbing_of_finger_nails  1000 non-null   int64
 20  frequent_cold             1000 non-null   int64
 21  dry_cough                 1000 non-null   int64
 22  snoring                   1000 non-null   int64
 23  level                     1000 non-null   object
dtypes: int64(23), object(1)
memory usage: 195.3+ KB
None
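To complement df.info(), here is a quick sketch (over the same DataFrame) that counts missing values explicitly:
# Quick sketch: explicitly confirm there are no missing values in any column.
print(df.isnull().sum().sum(), 'missing values in total')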
# What levels did the researchers categorize in this dataset?
print('Cancer Levels: ', df['level'].unique())
# Encode the ordinal target: map the severity labels to integers.
mapping = {'High': 2, 'Medium': 1, 'Low': 0}
df["level"] = df["level"].replace(mapping)
print('Cancer Levels: ', df['level'].unique())
Cancer Levels:  ['Low' 'Medium' 'High']
Cancer Levels:  [0 1 2]
'''
Applying a horizontal background gradient to the transposed descriptive statistics of the DataFrame,
rounded to three decimal places. This enhances readability and visual interpretation,
allowing for an immediate, intuitive understanding of the data's distribution and variability across different measures.
'''
round(df.describe().iloc[1:, ].T, 3).style.background_gradient(axis=1)
mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|
age | 37.174000 | 12.005000 | 14.000000 | 27.750000 | 36.000000 | 45.000000 | 73.000000 |
gender | 1.402000 | 0.491000 | 1.000000 | 1.000000 | 1.000000 | 2.000000 | 2.000000 |
air_pollution | 3.840000 | 2.030000 | 1.000000 | 2.000000 | 3.000000 | 6.000000 | 8.000000 |
alcohol_use | 4.563000 | 2.620000 | 1.000000 | 2.000000 | 5.000000 | 7.000000 | 8.000000 |
dust_allergy | 5.165000 | 1.981000 | 1.000000 | 4.000000 | 6.000000 | 7.000000 | 8.000000 |
occupational_hazards | 4.840000 | 2.108000 | 1.000000 | 3.000000 | 5.000000 | 7.000000 | 8.000000 |
genetic_risk | 4.580000 | 2.127000 | 1.000000 | 2.000000 | 5.000000 | 7.000000 | 7.000000 |
chronic_lung_disease | 4.380000 | 1.849000 | 1.000000 | 3.000000 | 4.000000 | 6.000000 | 7.000000 |
balanced_diet | 4.491000 | 2.136000 | 1.000000 | 2.000000 | 4.000000 | 7.000000 | 7.000000 |
obesity | 4.465000 | 2.125000 | 1.000000 | 3.000000 | 4.000000 | 7.000000 | 7.000000 |
smoking | 3.948000 | 2.496000 | 1.000000 | 2.000000 | 3.000000 | 7.000000 | 8.000000 |
passive_smoker | 4.195000 | 2.312000 | 1.000000 | 2.000000 | 4.000000 | 7.000000 | 8.000000 |
chest_pain | 4.438000 | 2.280000 | 1.000000 | 2.000000 | 4.000000 | 7.000000 | 9.000000 |
coughing_of_blood | 4.859000 | 2.428000 | 1.000000 | 3.000000 | 4.000000 | 7.000000 | 9.000000 |
fatigue | 3.856000 | 2.245000 | 1.000000 | 2.000000 | 3.000000 | 5.000000 | 9.000000 |
weight_loss | 3.855000 | 2.207000 | 1.000000 | 2.000000 | 3.000000 | 6.000000 | 8.000000 |
shortness_of_breath | 4.240000 | 2.285000 | 1.000000 | 2.000000 | 4.000000 | 6.000000 | 9.000000 |
wheezing | 3.777000 | 2.042000 | 1.000000 | 2.000000 | 4.000000 | 5.000000 | 8.000000 |
swallowing_difficulty | 3.746000 | 2.270000 | 1.000000 | 2.000000 | 4.000000 | 5.000000 | 8.000000 |
clubbing_of_finger_nails | 3.923000 | 2.388000 | 1.000000 | 2.000000 | 4.000000 | 5.000000 | 9.000000 |
frequent_cold | 3.536000 | 1.833000 | 1.000000 | 2.000000 | 3.000000 | 5.000000 | 7.000000 |
dry_cough | 3.853000 | 2.039000 | 1.000000 | 2.000000 | 4.000000 | 6.000000 | 7.000000 |
snoring | 2.926000 | 1.475000 | 1.000000 | 2.000000 | 3.000000 | 4.000000 | 7.000000 |
level | 1.062000 | 0.815000 | 0.000000 | 0.000000 | 1.000000 | 2.000000 | 2.000000 |
# As we are using a supervised machine learning method, the features (X) and target (y) must be identified correctly.
X = df.drop(columns='level')
y = df.level
display(X.head(), y[:5])
age | gender | air_pollution | alcohol_use | dust_allergy | occupational_hazards | genetic_risk | chronic_lung_disease | balanced_diet | obesity | ... | coughing_of_blood | fatigue | weight_loss | shortness_of_breath | wheezing | swallowing_difficulty | clubbing_of_finger_nails | frequent_cold | dry_cough | snoring | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||||||||
0 | 33 | 1 | 2 | 4 | 5 | 4 | 3 | 2 | 2 | 4 | ... | 4 | 3 | 4 | 2 | 2 | 3 | 1 | 2 | 3 | 4 |
1 | 17 | 1 | 3 | 1 | 5 | 3 | 4 | 2 | 2 | 2 | ... | 3 | 1 | 3 | 7 | 8 | 6 | 2 | 1 | 7 | 2 |
2 | 35 | 1 | 4 | 5 | 6 | 5 | 5 | 4 | 6 | 7 | ... | 8 | 8 | 7 | 9 | 2 | 1 | 4 | 6 | 7 | 2 |
3 | 37 | 1 | 7 | 7 | 7 | 7 | 6 | 7 | 7 | 7 | ... | 8 | 4 | 2 | 3 | 1 | 4 | 5 | 6 | 7 | 5 |
4 | 46 | 1 | 6 | 8 | 7 | 7 | 7 | 6 | 7 | 7 | ... | 9 | 3 | 2 | 4 | 1 | 4 | 2 | 4 | 2 | 3 |
5 rows × 23 columns
index
0    0
1    1
2    2
3    2
4    2
Name: level, dtype: int64
One of the most important considerations when developing predictive systems is maintaining a balanced input dataset, so as not to bias the model towards a certain prediction when one output class is over-represented in the training samples. For example, if we trained only on High-level cancer data, we could introduce a false bias that everyone who smokes will have High-level cancer, which may not be true.
To identify whether we need to over-represent (oversample) a certain level, we first visualize the class distribution; a rebalancing sketch follows the chart below.
plt.figure(figsize=(10, 8))
plt.title('Training Data Output Labels', fontsize=20)
counts = df.level.value_counts()  # sorted by frequency; order may differ from mapping order
inv_mapping = {v: k for k, v in mapping.items()}
plt.pie(counts,
labels=[inv_mapping[lvl] for lvl in counts.index],  # labels derived from the counts' own order
colors=['#97C1A9', '#FFB8B1', '#55CBCD'],
autopct=lambda p: '{:.2f}%\n{:,.0f}'.format(p, p * counts.sum() / 100),
explode=tuple(0.01 for _ in range(3)),
textprops={'fontsize': 20}
)
plt.show()
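The chart shows the three severity levels are reasonably balanced, so no resampling is needed here. Had they been skewed, a minimal oversampling sketch along these lines (illustrative only, using sklearn.utils.resample) could equalize the class counts:
from sklearn.utils import resample
# Illustrative sketch: upsample every class to the size of the largest one.
largest = df['level'].value_counts().max()
balanced = pd.concat([
    resample(group, replace=True, n_samples=largest, random_state=40)
    for _, group in df.groupby('level')
])
print(balanced['level'].value_counts())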
As part of data visualization, a common choice is the seaborn heatmap, which lets us observe the correlation matrix between all variables at once in a single graph.
The heatmap lets us note some useful observations that are insightful, yet need further exploration.
plt.figure(figsize=(20,15))
sns.heatmap(df.corr(), annot=True, cmap=plt.cm.PuBu)
plt.show()
We can see that:
- Diagonal: The diagonal shows a perfect positive correlation (1.0), since it represents each variable's correlation with itself.
- Age: Age appears to have a moderate positive correlation with factors like 'air_pollution', 'alcohol_use', and 'chronic_lung_disease', suggesting that as age increases, so do these factors.
- Gender: Gender has some negative correlation with 'dust_allergy' and 'balanced_diet' and a mild positive correlation with 'genetic_risk'.
- Lifestyle Factors: 'Smoking' and 'passive_smoker' are moderately positively correlated, which is expected as they are related behaviors. 'Smoking' also has a mild positive correlation with 'alcohol_use'.
- Symptoms and Conditions: 'Chest_pain', 'coughing_of_blood', 'fatigue', and 'weight_loss' are all strongly correlated with each other, which might indicate they often occur together in patients.
- Negative Correlations: Several negative correlations are present, such as between 'gender' and 'dust_allergy', though these are relatively weak.
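To back up these eyeballed observations, here is a small sketch that ranks the strongest off-diagonal correlations programmatically (it assumes the numeric df built above):
# Sketch: rank pairwise correlations by absolute strength, excluding the diagonal.
corr = df.corr()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
print(upper.stack().sort_values(key=abs, ascending=False).head(10))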
Next are violin plots, which show the range, distribution, quartiles, and median of each feature in a single, very illustrative plot.
fig, ax = plt.subplots(ncols=4, nrows=6, figsize=(20, 20))
ax = ax.flatten()
# Map the integer levels back to their labels for readable x-axis ticks.
level_labels = df['level'].replace({v: k for k, v in mapping.items()})
for i, col in enumerate(df.columns):
    sns.violinplot(x=level_labels, y=df[col], palette='tab20c', ax=ax[i])
    ax[i].set_title(col.title())
plt.tight_layout(pad=0.2, w_pad=0.2, h_pad=2.5)
These are some distribution plots, depicting the mean and standard deviation of each feature.
fig, ax = plt.subplots(ncols=8, nrows=3, figsize=(20, 10))
ax = ax.flatten()
for i, (k, v) in enumerate(df.items()):
    # Fit a normal distribution to annotate each histogram with its mean and std.
    mu, sigma = norm.fit(v)
    sns.histplot(v,
                 color=colors[i],
                 kde=True,
                 bins=25,
                 ax=ax[i],
                 label=fr'$\mu={mu:.1f}$' + '\n' + fr'$\sigma={sigma:.1f}$')
    ax[i].set_title(k)
    ax[i].legend()
plt.tight_layout(pad=0.2, w_pad=0.2, h_pad=2.5)
plt.show()
To build the model, we will not split the data into three parts with a designated validation set; rather, we will use a two-way split into Training and Testing sets. Over multiple instances of randomly partitioning the data, selecting a random subset of features and/or samples each time, the training data lets the model build its own logic for categorizing the information it is given in that instance, which we then use as a weak classifier called a tree. With multiple trees built this way, a "Random Forest" classifier emerges.
Some parameters we can control are how deep each tree grows, how wide it spreads, and how many trees we would like. In this notebook we will only explore the number of trees (a fuller tuning sketch follows below). The reason we are not performing complex hyperparameter tuning here is that, as some of you might guess, 1000 people are not an accurate representation of a disease that affects millions.
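Purely as an illustration of what fuller tuning could look like (a sketch, not run here; GridSearchCV is sklearn's standard grid search, and the parameter ranges are assumptions), we could sweep tree depth, feature width and tree count together:
from sklearn.model_selection import GridSearchCV
# Sketch: exhaustive grid over the three knobs mentioned above (assumed ranges).
param_grid = {
    'n_estimators': [3, 10, 50],     # how many trees
    'max_depth': [3, 5, None],       # how deep each tree may grow
    'max_features': ['sqrt', None],  # how many features each split considers
}
search = GridSearchCV(RandomForestClassifier(random_state=40),
                      param_grid, cv=5, scoring='accuracy')
# search.fit(X_train, y_train)
# print(search.best_params_, search.best_score_)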
The split will be as follows: 70% of the data will be used for training, and we will test the model's accuracy on the remaining 30%.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)
print(f'Shapes - X Training: {X_train.shape} and X Testing {X_test.shape}')
print(f'Shapes - Y Training: {y_train.shape} and Y Testing {y_test.shape}')
print(f'\nTraining output counts\n{y_train.value_counts()}')
Shapes - X Training: (700, 23) and X Testing (300, 23)
Shapes - Y Training: (700,) and Y Testing (300,)

Training output counts
2    248
1    233
0    219
Name: level, dtype: int64
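Given the earlier discussion of class balance, one could also stratify the split so the class proportions are preserved exactly in both halves (a sketch of the same call with one extra argument):
# Sketch: stratified variant of the same split; preserves class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=40, stratify=y)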
def CM(y_test, y_pred_test, col_names):
    from sklearn.metrics import confusion_matrix
    # Form the confusion matrix from true vs. predicted labels
    cm = confusion_matrix(y_test, y_pred_test)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, annot_kws={'size': 15}, fmt=".0f",
                cmap=plt.cm.Blues, linewidths=5)
    # Class labels on the plot axes
    tick_marks = np.arange(len(col_names))
    plt.xticks(tick_marks + 0.5, col_names)
    plt.yticks(tick_marks + 0.5, col_names, rotation=0)
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title('Confusion Matrix for Random Forest Model')
    plt.show()
def random_forest_n_best(X_train, y_train, X_test, y_test, n_list):
    # Train a forest for each candidate n_estimators and plot the test accuracy.
    # (RandomForestClassifier and accuracy_score are already imported above.)
    scores = []
    for n in n_list:
        RF = RandomForestClassifier(n_estimators=n, random_state=40)
        RF.fit(X_train, y_train)
        RF_pred = RF.predict(X_test)
        scores.append(accuracy_score(y_test, RF_pred))
    plt.plot(n_list, scores)
    plt.xlabel('Value of n_estimators for Random Forest Classifier')
    plt.ylabel('Testing Accuracy')
    plt.grid(alpha=0.1)
    plt.show()
random_forest_n_best(X_train, y_train, X_test, y_test, n_list=np.arange(1,20,1))
# Define model and set random_state
RF = RandomForestClassifier(n_estimators=3, random_state=40)
# fitting model
RF.fit(X_train, y_train)
# predicting with model
RF_pred = RF.predict(X_test)
pd.Series(RF_pred).value_counts()
2    117
1     99
0     84
dtype: int64
CM(y_test, RF_pred, col_names=['Low', 'Medium', 'High'])
We will now plot each individual tree in the random forest.
from sklearn import tree
n_trees = len(RF.estimators_)
cn = ['Low', 'Medium', 'High']
fig, ax = plt.subplots(n_trees, 1, figsize=(30, 10 * n_trees))
ax = np.atleast_1d(ax)  # plt.subplots returns a bare Axes when n_trees == 1
for i, estimator in enumerate(RF.estimators_):
    tree.plot_tree(estimator,
                   feature_names=X.columns,
                   class_names=cn,
                   filled=True,
                   fontsize=11,
                   ax=ax[i])
plt.tight_layout(h_pad=-10)
plt.show()
# View the classification report for test data and predictions
ml_accuracies['Random Forest'] = accuracy_score(y_test, RF_pred)
print(classification_report(y_test, RF_pred))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        84
           1       1.00      1.00      1.00        99
           2       1.00      1.00      1.00       117

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300
The model achieved 100% accuracy. This result should not be taken at face value: the dataset is small, and no official source has confirmed its provenance on https://kaggle.com, so it may well be fabricated. Nevertheless, this notebook demonstrates how a classifier is explored, a workflow that can later be developed into a pipeline for big data.
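As a closing sketch (assuming the fitted RF model above), we can also inspect which features drive the forest's predictions through its impurity-based importances:
# Sketch: rank the features by the fitted forest's impurity-based importances.
importances = pd.Series(RF.feature_importances_, index=X.columns)
importances.sort_values().plot(kind='barh', figsize=(10, 8),
                               title='Random Forest Feature Importances')
plt.show()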
Thank you, and if you found this useful, please let me know! 👍🏼