Kaggle Titanic: Survival Prediction
This post is a record of my learning ML with Kaggle. For the competition link, click here.
The model is a Random Forest Classifier that reaches 73.68% accuracy.
Intro
Let's take a glance at the notebook. Here we go! We need the following packages.
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
# Modelling Algorithms -> I finally select Random Forest
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Modelling Helpers
from sklearn.preprocessing import Normalizer, scale
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import RFECV
# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# Configure Visualization
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 8, 6
In addition, we have some helper methods.
# Helper Methods from Kaggle
def plot_histograms(df, variables, n_rows, n_cols):
    """Draw one 10-bin histogram per entry of `variables` on a subplot grid.

    The subplots are laid out on an `n_rows` x `n_cols` grid in the order the
    variables are given. Each subplot is titled with the column's skew rounded
    to the nearest integer, and its tick labels are hidden.
    """
    figure = plt.figure(figsize=(16, 12))
    for position, column in enumerate(variables, start=1):
        axis = figure.add_subplot(n_rows, n_cols, position)
        series = df[column]
        series.hist(bins=10, ax=axis)
        # round() without ndigits yields an int, so the title shows a whole number
        skew = round(float(series.skew()))
        axis.set_title('Skew: ' + str(skew))
        axis.set_xticklabels([], visible=False)
        axis.set_yticklabels([], visible=False)
    figure.tight_layout()  # Improves appearance a bit.
    plt.show()
def plot_distribution(df, var, target, **kwargs):
    """Plot shaded KDE curves of `var`, coloured by the values of `target`.

    Optional keyword arguments `row` and `col` are forwarded to the FacetGrid
    to split the plot into facets; both default to None. The x-axis runs from
    0 to the maximum of `df[var]`.
    """
    grid = sns.FacetGrid(
        df,
        hue=target,
        aspect=4,
        row=kwargs.get('row'),
        col=kwargs.get('col'),
    )
    grid.map(sns.kdeplot, var, shade=True)
    upper = df[var].max()
    grid.set(xlim=(0, upper))
    grid.add_legend()
def plot_categories(df, cat, target, **kwargs):
    """Draw a bar plot of `target` against the categories of `cat`.

    Optional keyword arguments `row` and `col` are forwarded to the FacetGrid
    to split the plot into facets; both default to None.
    """
    grid = sns.FacetGrid(df, row=kwargs.get('row'), col=kwargs.get('col'))
    grid.map(sns.barplot, cat, target)
    grid.add_legend()
def plot_correlation_map(df):
    """Draw an annotated heatmap of the pairwise correlations in `df`.

    Only numeric and boolean columns take part in the correlation.
    """
    # Correlate the frame that was passed in. The previous version ignored
    # `df` and always read the global `titanic`.
    # Select numeric/bool columns explicitly: recent pandas versions raise on
    # string columns instead of silently dropping them in corr().
    numeric = df.select_dtypes(include=['number', 'bool'])
    corr = numeric.corr()
    _, ax = plt.subplots(figsize=(12, 10))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(
        corr,
        cmap=cmap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        annot_kws={'fontsize': 12},
    )
def describe_more(df):
    """Summarise every column of `df` by its number of levels and its dtype.

    Returns a DataFrame with the columns 'Variable' (column name), 'Levels'
    (number of entries in the column's value counts) and 'Datatype', sorted
    in ascending order of 'Levels'.
    """
    names = []
    level_counts = []
    dtypes = []
    for column in df:
        names.append(column)
        # Series.value_counts() replaces the deprecated top-level
        # pd.value_counts() and returns the same counts.
        level_counts.append(len(df[column].value_counts()))
        dtypes.append(df[column].dtypes)
    levels = pd.DataFrame(
        {'Variable': names, 'Levels': level_counts, 'Datatype': dtypes}
    )
    return levels.sort_values(by='Levels')
def plot_variable_importance(X, y):
    """Fit a decision tree (random_state=99) on (X, y) and plot its importances.

    The plotting and score printing are delegated to plot_model_var_imp.
    """
    classifier = DecisionTreeClassifier(random_state=99)
    classifier.fit(X, y)
    plot_model_var_imp(classifier, X, y)
def plot_model_var_imp(model, X, y):
    """Plot the ten largest feature importances of `model` and print its score.

    `model` must be fitted and expose `feature_importances_` and `score`;
    `X` must have a `columns` attribute naming the features.
    """
    imp = pd.DataFrame(
        model.feature_importances_,
        columns=['Importance'],
        index=X.columns,
    )
    imp = imp.sort_values(['Importance'], ascending=True)
    # After an ascending sort the most important features sit at the end, so
    # take the tail. The previous `imp[:10]` plotted the ten LEAST important
    # features whenever there were more than ten columns.
    imp.tail(10).plot(kind='barh')
    print(model.score(X, y))
We use the Kaggle API to download the data:
kaggle competitions download -c titanic
Loading the data:
# Loading Data
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# way to stack the two frames with a fresh 0..n-1 index.
full = pd.concat([train, test], ignore_index=True)
# The training rows come first in `full`, so slice by the training length
# instead of hard-coding the row count.
titanic = full[:len(train)]
del train, test
print('Datasets:', 'full:', full.shape, 'titanic:', titanic.shape)
# Datasets: full: (1309, 12) titanic: (891, 12)
Taking a glance at the data head
# Preview the first rows of `titanic`
titanic.head()
Data Visualization
# Plot the correlation heatmap
plot_correlation_map(titanic)
# Plot survival rate by Embarked
plot_categories(titanic, cat='Embarked', target='Survived')
# Plot survival rate by Sex
plot_categories(titanic, cat='Sex', target='Survived')
# Plot survival rate by Pclass
plot_categories(titanic, cat='Pclass', target='Survived')
# Plot survival rate by SibSp (no. of Siblings / Spouses)
plot_categories(titanic, cat='SibSp', target='Survived')
Data Preparation
pandas.get_dummies()
This method one-hot encodes every attribute in the data. Here is an example that shows how it works:
# pd.get_dummies() function usage
# one-hot encodes every attribute in the data
df = pd.DataFrame(
    {
        'height': ['tall', 'short', 'short'],
        'weight': ['thin', 'thin', 'overweighted'],
    }
)
pd.get_dummies(df)
ID | height_short | height_tall | weight_overweighted | weight_thin |
---|---|---|---|---|
0 | 0 | 1 | 0 | 1 |
1 | 1 | 0 | 0 | 1 |
2 | 1 | 0 | 1 | 0 |
To get a cleaner dataset, we can transform sex into 1 or 0.
# Transform sex -> 0 / 1
# 'male' becomes 1; every other value becomes 0
sex = pd.Series(np.where(full.Sex == 'male', 1, 0), name='Sex')
Make dummies for embarked gate
# Create a new variable for every unique value of Embarked (one-hot encoding)
# Embarked gate = [C, Q, S]
embarked = pd.get_dummies(full.Embarked, prefix='Embarked')
# Same operation for Pclass
pclass = pd.get_dummies(full.Pclass, prefix='Pclass')
Data Cleaning
Most machine learning algorithms require all variables to have values in order to train the model. The simplest method is to fill missing values with the average of the variable across all observations. Note that the code below computes that average over the combined train and test data (`full`), not over the training set alone.
# Create Dataset
# Fill missing values of Age and Fare with each column's mean over `full`
imputed = pd.DataFrame({
    column: full[column].fillna(full[column].mean())
    for column in ('Age', 'Fare')
})
imputed.head()
ID | Age | Fare |
---|---|---|
0 | 22.0 | 7.2500 |
1 | 38.0 | 71.2833 |
2 | 26.0 | 7.9250 |
3 | 35.0 | 53.1000 |
4 | 35.0 | 8.0500 |
Feature Engineering
There are many titles for the passengers in Titanic, but we can simplify them into several categories, this needs a helper method and mapping dictionary to achieve.
# Extract title from passenger names
# Titles reflect social status and may predict survival probability
title = pd.DataFrame()
# Take the piece after the first comma, keep the text before its first
# period, and strip surrounding whitespace
title['Title'] = full['Name'].map(\
lambda name: name.split(',')[1]\
.split('.')[0].strip()
)
# Simplified titles: maps each raw title to one of six categories
Title_Dictionary = {
    raw_title: category
    for category, raw_titles in (
        ("Officer", ("Capt", "Col", "Major", "Dr", "Rev")),
        ("Royalty", ("Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady")),
        ("Mrs", ("Mme", "Ms", "Mrs")),
        ("Miss", ("Mlle", "Miss")),
        ("Mr", ("Mr",)),
        ("Master", ("Master",)),
    )
    for raw_title in raw_titles
}
To apply the cleaned title
# Replace each raw title with its simplified category from Title_Dictionary
title['Title'] = title.Title.map(Title_Dictionary)
# One-hot encode the simplified titles
title = pd.get_dummies(title.Title)
title.head()
ID | Master | Miss | Mr | Mrs | Officer | Royalty |
---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 0 | 1 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | 0 | 0 | 1 | 0 | 0 | 0 |
Now we optimize the Cabin, using U for unknown cabins (data missing).
# Extract cabin category information from cabin no.
# Missing cabins become 'U' (unknown); every cabin is then reduced to its
# first character
cabin = pd.DataFrame({'Cabin': full.Cabin.fillna('U').map(lambda c: c[0])})
# one hot encoding
cabin = pd.get_dummies(cabin['Cabin'], prefix='Cabin')
cabin.head()
ID | Cabin_A | Cabin_B | Cabin_C | Cabin_D | Cabin_E | Cabin_F | Cabin_G | Cabin_T | Cabin_U |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
3 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
The following is for extracting ticket class from each ticket no.
# Extract ticket class from ticket number
# Ticket clean Algorithms
def cleanTicket(ticket):
    """Return the first non-numeric token of a ticket string, or 'XXX'.

    Periods and slashes are removed first, then the string is split on
    whitespace. The first token that is not made up solely of digits is
    returned; if there is no such token the result is 'XXX'.
    """
    stripped = ticket.replace('.', '').replace('/', '')
    for token in stripped.split():
        if not token.isdigit():
            return token
    return 'XXX'
# Reduce every ticket to its cleaned prefix, then one-hot encode the prefixes
ticket = pd.DataFrame()
ticket['Ticket'] = full['Ticket'].map(cleanTicket)
ticket = pd.get_dummies(ticket['Ticket'], prefix='Ticket')
ticket.head()
ID | Ticket_A | Ticket_A4 | Ticket_A5 | Ticket_AQ3 | Ticket_AQ4 | Ticket_AS | Ticket_C | Ticket_CA | Ticket_CASOTON | Ticket_FC | ... | Ticket_SOTONO2 | Ticket_SOTONOQ | Ticket_SP | Ticket_STONO | Ticket_STONO2 | Ticket_STONOQ | Ticket_SWPP | Ticket_WC | Ticket_WEP | Ticket_XXX |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
We introduce a new attribute called family size, which is derived from the parent/children and sibling/spouse attributes.
# Create family size and category for family size
# FamilySize counts the passenger plus parents/children and siblings/spouses
family = pd.DataFrame({'FamilySize': full['Parch'] + full['SibSp'] + 1})
# Flag columns: single (size 1), small (sizes 2-4) and large (size 5 or more)
family = family.assign(
    Family_Single=lambda frame: frame['FamilySize'].map(lambda s: int(s == 1)),
    Family_Small=lambda frame: frame['FamilySize'].map(lambda s: int(2 <= s <= 4)),
    Family_Large=lambda frame: frame['FamilySize'].map(lambda s: int(5 <= s)),
)
family.head()
ID | FamilySize | Family_Single | Family_Small | Family_Large |
---|---|---|---|---|
0 | 2 | 0 | 1 | 0 |
1 | 2 | 0 | 1 | 0 |
2 | 1 | 1 | 0 | 0 |
3 | 2 | 0 | 1 | 0 |
4 | 1 | 1 | 0 | 0 |
Data Modelling
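With the cleaned data, these are the feature sets prepared above. Note that the model below is trained on only four of them (imputed, embarked, cabin and sex), as the `pd.concat` call shows.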
- imputed
- embarked
- pclass
- sex
- family
- cabin
- ticket
# Combine the selected feature frames column-wise into the model input
full_X = pd.concat([imputed, embarked, cabin, sex], axis=1)
full_X.head()
ID | Age | Fare | Embarked_C | Embarked_Q | Embarked_S | Cabin_A | Cabin_B | Cabin_C | Cabin_D | Cabin_E | Cabin_F | Cabin_G | Cabin_T | Cabin_U | Sex |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22.0 | 7.2500 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
1 | 38.0 | 71.2833 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 26.0 | 7.9250 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 35.0 | 53.1000 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 35.0 | 8.0500 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
That's it, and now we can set the train and test datasets
# The first 891 rows of full_X are the labelled rows; the rest is the test set
train_valid_X = full_X.iloc[:891]
train_valid_y = titanic.Survived
test_X = full_X.iloc[891:]
# Keep 70% of the labelled rows for training and the remainder for validation
train_X, valid_X, train_y, valid_y = train_test_split(
    train_valid_X, train_valid_y, train_size=.7
)
print(*(
    part.shape
    for part in (full_X, train_X, valid_X, train_y, valid_y, test_X)
))
# (1309, 15) (623, 15) (268, 15) (623,) (268,) (418, 15)
plot_variable_importance(train_X, train_y)
I also tried several other classifiers; in the end, I selected the random forest for this classification task.
# Random Forest Classifier (the model that is fitted below)
selected_model = RandomForestClassifier(n_estimators=100)
# The alternatives below each rebind `model`, so only the last assignment
# (LogisticRegression) stays bound; they are listed for comparison
# Support Vector Machines
model = SVC()
# Gradient Boosting Classifier
model = GradientBoostingClassifier()
# K-nearest neighbors
model = KNeighborsClassifier(n_neighbors=3)
# Gaussian Naive Bayes
model = GaussianNB()
# Logistic Regression
model = LogisticRegression()
Then we can fit (train the model)
# Train the selected random forest on the training split
selected_model.fit(train_X, train_y)
And take a look at the accuracy
# Print the model's score on the training split and on the validation split
print("Train:", selected_model.score(train_X, train_y),
"\nValid:", selected_model.score(valid_X, valid_y))
# Train: 0.9341894060995185
# Valid: 0.7761194029850746
Deploy to Kaggle
Deployment to Kaggle
Deployment in this context means publishing the resulting prediction from the model to the Kaggle leaderboard. To do this do the following:
- Select the cell below and run it by pressing the play button.
- Press the Publish button in the top right corner.
- Select Output on the notebook menubar.
- Select the result dataset and press the Submit to Competition button.
# Predict survival for the unlabelled test rows
test_Y = selected_model.predict(test_X)
# Rows from position 891 onward in `full` are the ones loaded from test.csv
passenger_id = full[891:].PassengerId
# Build the submission frame with both columns stored as int32
test = pd.DataFrame({
'PassengerId': passenger_id,
'Survived': test_Y
}, dtype='int32')
print(test.shape)
print(test.head())
# Write the submission without the index column
test.to_csv('submission.csv', index=False)
# output
(418, 2)
PassengerId Survived
891 892 0
892 893 1
893 894 0
894 895 0
895 896 1
Here is the submission file:
!ls
Kaggle-Titanic.ipynb submission.csv
data titanic.zip
Something in the end
Kaggle Titanic survival prediction is a straightforward competition, and it is a good way to get your hands dirty and exercise your ML skills.