Using scikit-learn to tackle the Titanic Kaggle Competition
Motivation
This page contains walkthrough information written in python for the Titanic: Machine Learning from Disaster TitanicCode
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
In [2]:
# deterministic random data
np.random.seed(42)
In [3]:
#Loading the training data
train = pd.read_csv("train.csv", index_col = "PassengerId")
In [4]:
train.info()
#Some info are missing (age:714, cabin:204, embarked:889)
In [5]:
train.hist(bins=50, figsize=(20,15))
plt.show()
In [6]:
#Although there was some element of luck involved in surviving the sinking,
#some groups of people were more likely to survive than others, such as women,
#children, and the upper-class. https://www.kaggle.com/c/titanic
#adults
train["age_cat"] = 1
train["age_cat"].where(train["Age"] > 12, 0, inplace=True)
train["age_cat"].where(train["Age"] < 60, 2, inplace=True)
train.groupby("age_cat").count()
Out[6]:
In [7]:
train[train["Survived"] == 1].hist(bins=50, figsize=(20,15))
plt.show()
In [8]:
train[train["Survived"] == 0].hist(bins=50, figsize=(20,15))
plt.show()
In [9]:
train[train["Survived"]==1].groupby("age_cat").count()["Survived"]/train[train["Survived"]==0].groupby("age_cat").count()["Survived"]
Out[9]:
In [10]:
train[train["Survived"]==1].groupby("Sex").count()["Survived"]/train[train["Survived"]==0].groupby("Sex").count()["Survived"]
Out[10]:
In [11]:
attributes = ["Age", "age_cat", "Fare", "Pclass", "Survived"]
##train[attributes].plot(kind="scatter", x="age_cat", y="Survived", alpha=0.1, figsize=(12, 8))
scatter_matrix(train[attributes], figsize=(12, 8))
plt.show()
In [12]:
train.groupby("Sex").count()
Out[12]:
In [13]:
train.groupby("Survived").describe()
Out[13]:
In [14]:
train.groupby("Survived").hist(bins=25,figsize=(12, 8))
plt.show()
In [15]:
train[["Survived","Embarked"]].groupby("Embarked").hist(bins=3,figsize=(12, 8))
plt.show()
Data Cleaning/Scaling¶
preparing the data to train the model
In [16]:
dropped_features = ["Cabin", "Embarked", "Name", "Ticket", "age_cat"]
In [17]:
train_encoded = train.drop(dropped_features, 1)
train_encoded = pd.get_dummies(train_encoded)
age_median = train_encoded["Age"].median()
train_encoded["Age"].fillna(age_median, inplace=True)
In [18]:
train_encoded.describe()
Out[18]:
In [19]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
train_index, test_index = next(split.split(train, train["Sex"]))
In [20]:
X_train_set, y_train_set = train_encoded.iloc[train_index], train_encoded[["Survived"]].iloc[train_index]
X_test_set, y_test_set = train_encoded.iloc[test_index], train_encoded[["Survived"]].iloc[test_index]
In [21]:
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from sklearn.feature_selection import RFECV
def display_scores(scores):
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard deviation:", scores.std())
def test_models(X, y):
sgd_clf = SGDClassifier(random_state=42)
rfecv = RFECV(estimator=sgd_clf, cv=5, scoring='f1')
scores = cross_val_score(sgd_clf, rfecv.fit_transform(X, y), y, scoring="f1", cv=5)
print("\nSGDClassifier - features:%d" % rfecv.n_features_)
display_scores(scores)
forest_clf = RandomForestClassifier(random_state=42)
rfecv = RFECV(estimator=forest_clf, cv=5, scoring='f1')
scores = cross_val_score(forest_clf, rfecv.fit_transform(X, y), y, scoring="f1", cv=5)
print("\nRandomForestClassifier:%d" % rfecv.n_features_)
display_scores(scores)
log_reg = LogisticRegression(random_state=42)
rfecv = RFECV(estimator=log_reg, cv=5, scoring='f1')
scores = cross_val_score(log_reg, rfecv.fit_transform(X, y), y, scoring="f1", cv=5)
print("\nLogisticRegression:%d" % rfecv.n_features_)
display_scores(scores)
softmax_reg = LogisticRegression(solver="lbfgs", C=5, random_state=42)
rfecv = RFECV(estimator=softmax_reg, cv=5, scoring='f1')
scores = cross_val_score(softmax_reg, rfecv.fit_transform(X, y), y, scoring="f1", cv=5)
print("\nsoftmax_reg-LogisticRegression:%d" % rfecv.n_features_)
display_scores(scores)
xgb_clf = xgb.XGBClassifier(seed = 42)
rfecv = RFECV(estimator=xgb_clf, cv=5, scoring='f1')
scores = cross_val_score(xgb_clf, rfecv.fit_transform(X, y), y, scoring="f1", cv=5)
print("\nXGBClassifier:%d" % rfecv.n_features_)
display_scores(scores)
neu_clf = MLPClassifier(random_state=42)
scores = cross_val_score(neu_clf, X, y, scoring="f1", cv=5)
print("\nMLPClassifier")
display_scores(scores)
scaler = StandardScaler()
In [22]:
train_prepared= scaler.fit_transform(X_train_set.drop("Survived", 1))
test_models(train_prepared, y_train_set["Survived"])
XGBClassifier has the best results.¶
Time to run some GridSearchCV to find the hyperparameters
In [23]:
xgb_clf = xgb.XGBClassifier(seed = 42)
rfecv = RFECV(estimator=xgb_clf, cv=5, scoring='f1')
rfecv.fit(train_prepared, y_train_set["Survived"])
xgb_clf.fit(rfecv.transform(train_prepared), y_train_set["Survived"])
f1_score(xgb_clf.predict(rfecv.transform(train_prepared)), y_train_set["Survived"])
Out[23]:
In [24]:
f1_score(xgb_clf.predict(rfecv.transform(scaler.fit_transform(X_test_set.drop("Survived", 1)))), y_test_set["Survived"])
Out[24]:
In [25]:
#train_prepared= scaler.fit_transform(train_encoded.drop("Survived", 1))
#xgb_clf.fit(rfecv.transform(train_prepared), train_encoded["Survived"])
#f1_score(xgb_clf.predict(rfecv.transform(train_prepared)), train_encoded["Survived"])
In [26]:
#once a model is build, your kaggle submission can be build - 0.77990
test = pd.read_csv("test.csv", index_col = "PassengerId")
t_encoded = pd.get_dummies(test.drop(["Cabin", "Embarked", "Name", "Ticket"], 1))
age_median = t_encoded["Age"].median()
t_encoded["Age"].fillna(age_median, inplace=True)
fare_median = t_encoded["Fare"].median()
t_encoded["Fare"]=fare_median
test["Survived"] = xgb_clf.predict(rfecv.transform(scaler.fit_transform(t_encoded)))
test['Survived'].to_csv("result_2.csv")
Setting some hyperparameters¶
work in progress...
In [ ]:
from sklearn.model_selection import GridSearchCV
In [ ]:
parameters = {
'max_depth':range(3,10,1),
'min_child_weight':range(1,6,1),
'learning_rate': [0.1, 0.15, 0.2, 0.25],
'n_estimators':range(100,200,30)
}
grid_search = GridSearchCV(xgb.XGBClassifier(seed = 42), parameters, cv=5,scoring='f1')
grid_search.fit(poly_features.fit_transform(train_prepared), strat_train_set["Survived"])
clf = grid_search.best_estimator_
Conclusion
This model scores 0.77990, but some hyperparameters have to be test before concluded that XGBoost gave its best.
Author
Matheus Cunha (@mathcunha) works as Solutions Architect at SEFAZ-CE. He holds a B.Sc. in Computer Science from Federal University of Bahia, Brazil; a M.Sc. and a Ph.D in Applied Informatics from University of Fortaleza, Brazil. His main research areas are distributed systems and cloud computing.
Comments
Post a Comment