Kaggle Titanic: Survival prediction

This post is a record of my learning ML with Kaggle. For the competition, click here.

This is a 73.68% accurate model with Random Forest Classifier.

Intro

Let's take a glance at the notebook, here we go! We need the following packages.

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Modelling Algorithms -> I finally select Random Forest
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Modelling Helpers
from sklearn.preprocessing import Normalizer, scale
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import RFECV

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure Visualization
%matplotlib inline
# Use matplotlib's built-in 'ggplot' style sheet for figures
mpl.style.use('ggplot')
# Apply seaborn's 'white' axes style
sns.set_style('white')
# Default figure size: 8 wide by 6 high (matplotlib measures this in inches)
pylab.rcParams['figure.figsize'] = 8, 6

Plus, we have some helper methods.

# Helper Methods from Kaggle
def plot_histograms(df, variables, n_rows, n_cols):
    """Draw one 10-bin histogram per variable on an n_rows x n_cols grid.

    Each subplot is titled with the variable's skewness rounded to the
    nearest integer, and its x/y tick labels are hidden.

    Args:
        df: DataFrame holding the columns to plot.
        variables: iterable of column names in `df`.
        n_rows: number of subplot rows.
        n_cols: number of subplot columns.
    """
    figure = plt.figure(figsize=(16, 12))
    for position, column in enumerate(variables, start=1):
        axis = figure.add_subplot(n_rows, n_cols, position)
        df[column].hist(bins=10, ax=axis)
        # round() without ndigits yields an int, so the title shows e.g. "Skew: 2"
        skewness = round(float(df[column].skew()))
        axis.set_title('Skew: ' + str(skewness))
        axis.set_xticklabels([], visible=False)
        axis.set_yticklabels([], visible=False)
    figure.tight_layout()  # keeps neighbouring subplots from overlapping
    plt.show()

def plot_distribution(df, var, target, **kwargs):
    """Plot shaded kernel-density curves of `var`, coloured by `target`.

    Args:
        df: DataFrame containing `var` and `target`.
        var: name of the column whose distribution is drawn.
        target: name of the column used as the hue.
        **kwargs: optional `row` and `col` column names used to facet the
            grid; any other keyword is ignored.
    """
    grid = sns.FacetGrid(
        df,
        hue=target,
        aspect=4,
        row=kwargs.get('row'),
        col=kwargs.get('col'),
    )
    grid.map(sns.kdeplot, var, shade=True)
    # x axis runs from 0 up to the largest observed value of `var`
    upper_bound = df[var].max()
    grid.set(xlim=(0, upper_bound))
    grid.add_legend()

def plot_categories(df, cat, target, **kwargs):
    """Bar-plot `target` against the categories of `cat`.

    Args:
        df: DataFrame containing `cat` and `target`.
        cat: name of the categorical column placed on the x axis.
        target: name of the column passed to `sns.barplot` as the y value.
        **kwargs: optional `row` and `col` column names used to facet the
            grid; any other keyword is ignored.
    """
    grid = sns.FacetGrid(df, row=kwargs.get('row'), col=kwargs.get('col'))
    grid.map(sns.barplot, cat, target)
    grid.add_legend()

def plot_correlation_map(df):
    """Draw an annotated heatmap of the pairwise correlations in `df`.

    Args:
        df: DataFrame to correlate. Only its numeric and boolean columns
            take part; other columns (names, tickets, ...) are left out.
    """
    # Correlate the frame that was passed in. The previous version read the
    # module-level `titanic` and silently ignored the `df` argument.
    # Selecting numeric/bool columns first keeps the columns older pandas
    # correlated implicitly and avoids the error newer pandas raises when
    # corr() meets string columns.
    numeric = df.select_dtypes(include=['number', 'bool'])
    corr = numeric.corr()
    _, ax = plt.subplots(figsize=(12, 10))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    _ = sns.heatmap(
        corr,
        cmap=cmap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        annot_kws={'fontsize': 12},
    )

def describe_more(df):
    """Summarise each column of `df` by distinct-value count and dtype.

    Args:
        df: DataFrame to describe.

    Returns:
        A DataFrame with one row per column of `df` and the columns
        'Variable' (column name), 'Levels' (number of distinct non-null
        values) and 'Datatype' (the column's dtype), sorted by 'Levels'
        in ascending order. The index keeps each column's original position.
    """
    columns = list(df.columns)
    levels = pd.DataFrame({
        'Variable': columns,
        # Series.nunique() counts distinct non-null values, exactly what
        # len(pd.value_counts(...)) did, without the deprecated top-level call
        'Levels': [df[name].nunique() for name in columns],
        'Datatype': [df[name].dtype for name in columns],
    })
    return levels.sort_values(by='Levels')

def plot_variable_importance(X, y):
    """Fit a decision tree on (X, y) and plot its feature importances.

    The tree uses a fixed random_state of 99; plotting and score printing
    are delegated to `plot_model_var_imp`.
    """
    fitted_tree = DecisionTreeClassifier(random_state=99).fit(X, y)
    plot_model_var_imp(fitted_tree, X, y)
    
def plot_model_var_imp(model, X, y):
    """Plot the ten largest feature importances of a fitted model.

    Args:
        model: fitted estimator exposing `feature_importances_` and `score`.
        X: DataFrame of features; its column names label the bars.
        y: labels matching the rows of `X`.

    Also prints `model.score(X, y)`.
    """
    imp = pd.DataFrame(
        model.feature_importances_,
        columns=['Importance'],
        index=X.columns,
    )
    imp = imp.sort_values(['Importance'], ascending=True)
    # After an ascending sort the most important features sit at the END of
    # the frame. The old `imp[:10]` therefore showed the ten LEAST important
    # features whenever X had more than ten columns.
    imp.tail(10).plot(kind='barh')
    print(model.score(X, y))

And we use the Kaggle API to download the data:

kaggle competitions download -c titanic

loading the data

# Loading Data
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# Stack train on top of test with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and yields the same combined frame.
full = pd.concat([train, test], ignore_index=True)
# The labelled training rows come first in `full`, so slice by their count
# rather than a hard-coded 891.
titanic = full[:len(train)]

del train, test

print('Datasets:', 'full:', full.shape, 'titanic:', titanic.shape)
# Datasets: full: (1309, 12) titanic: (891, 12)

Taking a glance at the data head

titanic.head()

Data Visualization

# Heatmap of the pairwise correlations between the columns
plot_correlation_map(titanic)

# Plot survival rate by Embarked
plot_categories(titanic, cat='Embarked', target='Survived')

# Plot survival rate by Sex
plot_categories(titanic, cat='Sex', target='Survived')

# Plot survival rate by Pclass
plot_categories(titanic, cat='Pclass', target='Survived')

# Plot survival rate by SibSp (no. of Siblings / Spouses)
plot_categories(titanic, cat='SibSp', target='Survived')

Data Preparation

The pandas.get_dummies() method one-hot encodes every categorical attribute in the data. Here is an example to show how it works:

# Demonstration of pd.get_dummies():
# every distinct value of every column becomes its own indicator column
df = pd.DataFrame(
    [
        ['tall', 'thin'],
        ['short', 'thin'],
        ['short', 'overweighted'],
    ],
    columns=['height', 'weight'],
)
pd.get_dummies(df)

ID	height_short	height_tall	weight_overweighted	weight_thin
0	0	1	0	1
1	1	0	0	1
2	1	0	1	0

To have a cleaner dataset, we can transform sex into 1 or 0:

# Encode Sex as an integer flag: 1 for 'male', 0 for anything else
sex = pd.Series(
    np.where(full['Sex'] == 'male', 1, 0),
    name='Sex',
)

Make dummies for embarked gate

# One-hot encode the port of embarkation: one indicator column per
# distinct Embarked value (C, Q, S), each named 'Embarked_<value>'
embarked = pd.get_dummies(full['Embarked'], prefix='Embarked')

# Likewise for the passenger class, giving 'Pclass_<value>' columns
pclass = pd.get_dummies(full['Pclass'], prefix='Pclass')

Data Cleaning

Most machine learning algorithms require all variables to have values to use for training the model. The simplest method is to fill missing values with the average of the variable across all observations; the code below takes that average over the full dataset (training and test rows combined).

# Build the imputed dataset in one step: missing Age and Fare values are
# replaced by the mean of the respective column computed over `full`
imputed = pd.DataFrame({
    'Age': full['Age'].fillna(full['Age'].mean()),
    'Fare': full['Fare'].fillna(full['Fare'].mean()),
})
imputed.head()

ID	Age	Fare
0	22.0	7.2500
1	38.0	71.2833
2	26.0	7.9250
3	35.0	53.1000
4	35.0	8.0500

Feature Engineering

There are many titles for the passengers on the Titanic, but we can simplify them into several categories. This needs a helper function and a mapping dictionary.

# Extract the title from passenger names: the text between the first comma
# and the following period, e.g. "Braund, Mr. Owen Harris" -> "Mr".
# Titles reflect social status and may predict survival probability.
title = pd.DataFrame()
title['Title'] = full['Name'].map(
    lambda full_name: full_name.split(',')[1].split('.')[0].strip()
)

# Maps each raw title found in the names to one of six simplified
# categories: Officer, Royalty, Mr, Mrs, Miss, Master
Title_Dictionary = {
    # military, medical and clerical titles
    'Capt': 'Officer',
    'Col': 'Officer',
    'Major': 'Officer',
    'Dr': 'Officer',
    'Rev': 'Officer',
    # nobility
    'Jonkheer': 'Royalty',
    'Don': 'Royalty',
    'Sir': 'Royalty',
    'the Countess': 'Royalty',
    'Dona': 'Royalty',
    # French forms and "Ms" folded into the common titles
    'Mme': 'Mrs',
    'Mlle': 'Miss',
    'Ms': 'Mrs',
    # common titles map to themselves
    'Mr': 'Mr',
    'Mrs': 'Mrs',
    'Miss': 'Miss',
    'Master': 'Master',
    'Lady': 'Royalty',
}

To apply the cleaned title

# Collapse the raw titles into their simplified category, then one-hot
# encode the categories (one indicator column per category)
title['Title'] = title['Title'].map(Title_Dictionary)
title = pd.get_dummies(title['Title'])
title.head()

ID	Miss	Mr	Mrs
0	0	1	0
1	0	0	1
2	1	0	0
3	0	0	1
4	0	1	0

Now we simplify the Cabin attribute, using U for unknown cabins (missing data).

# Reduce each cabin number to its first letter; passengers without a
# recorded cabin get 'U' (Unknown)
cabin = pd.DataFrame()
cabin['Cabin'] = full['Cabin'].fillna('U').map(lambda code: code[0])

# One indicator column per cabin letter, named 'Cabin_<letter>'
cabin = pd.get_dummies(cabin['Cabin'], prefix='Cabin')
cabin.head()

ID	Cabin_C	Cabin_U
0	0	1
1	1	0
2	0	1
3	1	0
4	0	1

The following is for extracting ticket class from each ticket no.

# Extract ticket class from ticket number

# Ticket clean Algorithms
def cleanTicket(ticket):
    """Return the ticket's textual prefix, or 'XXX' when it has none.

    Periods and slashes are removed, the remainder is split on whitespace,
    and the first piece that is not made up solely of digits is returned.

    Args:
        ticket: raw ticket string, e.g. 'STON/O2. 3101282'.

    Returns:
        The first non-numeric piece (e.g. 'STONO2'), or 'XXX' if every
        piece is numeric or the ticket is empty.
    """
    without_punctuation = ticket.replace('.', '').replace('/', '')
    # split() already discards surrounding whitespace from every piece
    for piece in without_punctuation.split():
        if not piece.isdigit():
            return piece
    return 'XXX'


# Reduce every ticket to its prefix with cleanTicket, then one-hot encode
# the prefixes into 'Ticket_<prefix>' indicator columns
ticket = pd.DataFrame({'Ticket': full['Ticket'].map(cleanTicket)})
ticket = pd.get_dummies(ticket['Ticket'], prefix='Ticket')

ticket.head()

ID	Ticket_A5	...	Ticket_STONO2	Ticket_XXX
0	1	...	0	0
1	0	...	0	0
2	0	...	1	0
3	0	...	0	1
4	0	...	0	1

We introduce a new attribute called family size, which is derived from the parent/children and sibling/spouse attributes.

# Family size = parents/children + siblings/spouses + the passenger,
# plus three 0/1 flags that bucket the size
family = pd.DataFrame()

family['FamilySize'] = full['Parch'] + full['SibSp'] + 1

# Single: exactly 1; Small: 2 to 4 inclusive; Large: 5 or more
family['Family_Single'] = (family['FamilySize'] == 1).astype('int64')
family['Family_Small'] = family['FamilySize'].between(2, 4).astype('int64')
family['Family_Large'] = (family['FamilySize'] >= 5).astype('int64')

family.head()

ID	FamilySize	Family_Single	Family_Small
0	2	0	1
1	2	0	1
2	1	1	0
3	2	0	1
4	1	1	0

Data Modelling

With the cleaned data, the following variables are available to select from (the model below uses imputed, embarked, cabin, and sex).

imputed
embarked
pclass
sex
family
cabin
ticket

# Join the chosen feature frames side by side (column-wise) into the
# design matrix; rows stay aligned on the shared index
full_X = pd.concat(
    [imputed, embarked, cabin, sex],
    axis=1,
)
full_X.head()

ID	Age	Fare	Embarked_C	Embarked_S	Cabin_C	Cabin_U	Sex
0	22.0	7.2500	0	1	0	1	1
1	38.0	71.2833	1	0	1	0	0
2	26.0	7.9250	0	1	0	1	0
3	35.0	53.1000	0	1	1	0	0
4	35.0	8.0500	0	1	0	1	1

That's it, and now we can set the train and test datasets

# Rows of full_X line up with `full`: the labelled training passengers come
# first, so len(titanic) marks the train/test boundary instead of a
# hard-coded 891.
train_valid_X = full_X[:len(titanic)]
train_valid_y = titanic.Survived
test_X = full_X[len(titanic):]

# Hold out 30% of the labelled rows for validation. The fixed random_state
# makes the split, and therefore the reported scores, repeatable between runs.
train_X, valid_X, train_y, valid_y = train_test_split(
    train_valid_X,
    train_valid_y,
    train_size=.7,
    random_state=42,
)

print(
    full_X.shape,
    train_X.shape,
    valid_X.shape,
    train_y.shape,
    valid_y.shape,
    test_X.shape
)

# (1309, 15) (623, 15) (268, 15) (623,) (268,) (418, 15)

plot_variable_importance(train_X, train_y)

I also tried several other classifiers; in the end, I chose the random forest for this classification.

# Random Forest Classifier -- the model that is fitted and scored below
selected_model = RandomForestClassifier(n_estimators=100)

# The remaining assignments are the alternatives that were tried. Each one
# rebinds the same name `model`, so only the last (LogisticRegression) stays
# bound; the code shown here goes on to use `selected_model`, not `model`.

# Support Vector Machines
model = SVC()

# Gradient Boosting Classifier
model = GradientBoostingClassifier()

# K-nearest neighbors
model = KNeighborsClassifier(n_neighbors=3)

# Gaussian Naive Bayes
model = GaussianNB()

# Logistic Regression
model = LogisticRegression()

Then we can fit (train the model)

selected_model.fit(train_X, train_y)

And take a look at the accuracy

# Score the fitted model on the training split and on the held-out
# validation split, printed on two lines
print("Train:", selected_model.score(train_X, train_y),
      "\nValid:", selected_model.score(valid_X, valid_y))

# Train: 0.9341894060995185 
# Valid: 0.7761194029850746

Deploy to Kaggle

Deployment to Kaggle

Deployment in this context means publishing the resulting prediction from the model to the Kaggle leaderboard. To do this do the following:

select the cell below and run it by pressing the play button.
Press the Publish button in the top right corner.
Select Output on the notebook menubar
Select the result dataset and press Submit to Competition button

# Predict for the unlabelled rows and pair each prediction with the
# PassengerId of the same row (rows from position 891 onward in `full`)
test_Y = selected_model.predict(test_X)
passenger_id = full.iloc[891:]['PassengerId']

# Both columns are stored as int32 in the submission frame
test = pd.DataFrame(
    {'PassengerId': passenger_id, 'Survived': test_Y},
    dtype='int32',
)
print(test.shape)
print(test.head())

# Write the submission without the index column
test.to_csv('submission.csv', index=False)

# output
(418, 2)
     PassengerId  Survived
891          892         0
892          893         1
893          894         0
894          895         0
895          896         1

here is the submission file

!ls

Kaggle-Titanic.ipynb submission.csv
data                 titanic.zip

Something in the end

Kaggle Titanic survival prediction is a straightforward competition, and it is good for getting your hands dirty and exercising your ML skills.

ID	Ticket_A5	...	Ticket_STONO2	Ticket_XXX
0	1	...	0	0
1	0	...	0	0
2	0	...	1	0
3	0	...	0	1
4	0	...	0	1

ID	Ticket_A5	...	Ticket_STONO2	Ticket_XXX
0	1	...	0	0
1	0	...	0	0
2	0	...	1	0
3	0	...	0	1
4	0	...	0	1

ID	Ticket_A5	...	Ticket_STONO2	Ticket_XXX
0	1	...	0	0
1	0	...	0	0
2	0	...	1	0
3	0	...	0	1
4	0	...	0	1