The variables that could directly affect the survival rate, according to the data table, are 'Sex', 'Pclass', and 'Age'. The following report focuses on how sex relates to survival.
a. Survival rate factors:
b. Analysis of passenger information to see how other variables relate to sex
%matplotlib inline
import numpy as np # imports a fast numerical programming library
import scipy as sp #imports stats functions, amongst other things
import matplotlib as mpl # this actually imports matplotlib
import matplotlib.cm as cm #allows us easy access to colormaps
import matplotlib.pyplot as plt #sets up plotting under plt
import pandas as pd #lets us handle data as dataframes
# Configure pandas table display: console width of 500 characters, a
# 100-column display limit, and HTML table rendering in notebooks.
pd.options.display.width = 500
pd.options.display.max_columns = 100
pd.options.display.notebook_repr_html = True
import seaborn as sns #sets up styles and gives us more plotting options
# Read the passenger records from the CSV file into a DataFrame.
filename = "titanic-data.csv"
titanic_df = pd.read_csv(filename)

# First look at the data: leading rows, per-column dtypes, and the
# summary produced by info().
titanic_df.head()
titanic_df.dtypes
titanic_df.info()
# Count the passengers in every (Survived, Sex) combination.
# groupby().size() returns a Series; reset_index(name='count') flattens it
# into a DataFrame with 'Survived', 'Sex' and 'count' columns.
survival_groupby_sex = (
    titanic_df.groupby(['Survived', 'Sex'])
    .size()
    .reset_index(name='count')
)
survival_groupby_sex

# Sanity check: the grouped counts should add up to the number of rows that
# have a 'Sex' value. print(...) with a single argument behaves the same on
# Python 2 and Python 3.
print(titanic_df['Sex'].count())
print(survival_groupby_sex['count'].sum())

# Totals used by the later rate calculations: all passengers, and passengers
# per sex.
total_people = survival_groupby_sex['count'].sum()
total_people_by_sex = survival_groupby_sex.groupby('Sex')['count'].sum()
total_people_by_sex
# Calculate the survival rate of all passengers.
# Sum only the 'count' column so the text column 'Sex' is never aggregated.
total_survival = (
    survival_groupby_sex.groupby('Survived')['count'].sum().reset_index()
)
# Pick the survivors by the Survived == 1 label instead of by row position.
survived_total = total_survival.loc[total_survival['Survived'] == 1, 'count'].sum()
total_survival_rate = survived_total / float(total_people)
# Build one string before printing: print with a parenthesised pair shows a
# tuple on Python 2.
print('ratio of survived people out of all passengers: {0}'.format(total_survival_rate))
# Calculate the survival rate for each sex from the data instead of from
# hand-typed counts, so the figures stay correct if the input file changes.
# NOTE(review): the mean equals the survival rate only if 'Survived' is coded
# 1 = survived, 0 = did not survive - confirm against the data dictionary.
survival_rate_by_sex = titanic_df.groupby('Sex')['Survived'].mean()
female_survival_rate = float(survival_rate_by_sex['female'])
male_survival_rate = float(survival_rate_by_sex['male'])
print(female_survival_rate)
print(male_survival_rate)
# Plot a stacked column chart comparing the number of females and males among
# survived and non-survived passengers.
# The counts are cross-tabulated from the data rather than typed in; columns
# are ordered [survived, non-survived] to match the x tick labels.
# NOTE(review): assumes 'Survived' is coded 1 = survived, 0 = did not survive.
sex_by_outcome = pd.crosstab(titanic_df['Sex'], titanic_df['Survived']).reindex(
    columns=[1, 0], fill_value=0)
female = sex_by_outcome.loc['female'].tolist()
male = sex_by_outcome.loc['male'].tolist()
ind = np.arange(2)
width = 0.4
fig, ax = plt.subplots()
# align='center' puts each bar on its tick whatever the matplotlib default
# is, so no manual half-width offset is needed for the ticks.
rects1 = ax.bar(ind, female, width, color='#e9967a', align='center')
rects2 = ax.bar(ind, male, width, bottom=female, color='#4169e1', align='center')
ax.set_ylabel('Number of People')
ax.set_xticks(ind)
ax.set_xticklabels(('Survived', 'Non_survived'))
ax.set_yticks(np.arange(0, 700, 100))
ax.legend((rects1[0], rects2[0]), ('female', 'male'), loc=2)
ax.set_title("Counts of Genders by Survivors and Non-Survivors")
# Plot stacked bars comparing survived vs. non-survived for all passengers,
# for females only and for males only.
# The counts come from the data: the earlier hand-typed overall non-survivor
# count (529) did not match the per-sex counts (81 + 468 = 549).
# NOTE(review): assumes 'Survived' is coded 1 = survived, 0 = did not survive.
outcome_by_sex = pd.crosstab(titanic_df['Sex'], titanic_df['Survived']).reindex(
    columns=[1, 0], fill_value=0)
survived = [
    outcome_by_sex[1].sum(),
    outcome_by_sex.loc['female', 1],
    outcome_by_sex.loc['male', 1],
]
non_survived = [
    outcome_by_sex[0].sum(),
    outcome_by_sex.loc['female', 0],
    outcome_by_sex.loc['male', 0],
]
ind = np.arange(3)
width = 0.4
fig, ax = plt.subplots()
# align='center' puts each bar on its tick whatever the matplotlib default
# is, so no manual half-width offset is needed for the ticks.
rects1 = ax.bar(ind, survived, width, color='#9370db', align='center')
rects2 = ax.bar(ind, non_survived, width, bottom=survived, color='#696969', align='center')
ax.set_ylabel('Number of People')
ax.set_xticks(ind)
ax.set_xticklabels(('total_people', 'female', 'male'))
ax.set_yticks(np.arange(0, 1000, 100))
ax.legend((rects1[0], rects2[0]), ('Survived', 'Non-survived'))
ax.set_title('Counts of Survivors and Non-Survivors for All Passengers, and Each Gender (Percentages are Survival Rates)')
# One percentage label per bar, in the same order as the bars.
labels = [
    '{percent:.2%}'.format(percent=rate)
    for rate in (total_survival_rate, female_survival_rate, male_survival_rate)
]
for rect, label in zip(rects1, labels):
    height = rect.get_height()
    # Write the survival rate halfway up the "survived" segment.
    ax.text(rect.get_x() + rect.get_width() / 2, height / 2, label, ha='center', va='bottom')
# Ages of the female and of the male passengers, leaving out passengers
# whose age is missing.
female_rows = titanic_df['Sex'] == 'female'
male_rows = titanic_df['Sex'] == 'male'
age_by_female = titanic_df.loc[female_rows, 'Age'].dropna()
age_by_male = titanic_df.loc[male_rows, 'Age'].dropna()
# Plot the available ages as a box chart with one box per sex.
data = [age_by_female, age_by_male]
fig = plt.figure(1, figsize=(9, 6))
ax = fig.add_subplot(111)
# patch_artist=True draws the boxes as patches so they can take a fill colour.
bp = ax.boxplot(data, patch_artist=True)
for box in bp['boxes']:
    # Outline colour and width.
    box.set(color='#7570b3', linewidth=2)
    # Fill colour. set_facecolor has to be called: assigning a value to it
    # only overwrites the method on the object and leaves the box unchanged.
    box.set_facecolor('#1b9e77')
# Colour and line width of the whiskers.
for whisker in bp['whiskers']:
    whisker.set(color='#7570b3', linewidth=2)
# Colour and line width of the caps.
for cap in bp['caps']:
    cap.set(color='#7570b3', linewidth=2)
# Colour and line width of the medians.
for median in bp['medians']:
    median.set(color='#b2df8a', linewidth=2)
# Marker style, colour and transparency of the fliers.
for flier in bp['fliers']:
    flier.set(marker='o', color='#e7298a', alpha=0.5)
ax.set_xticklabels(['Female', 'Male'], fontsize=14)
ax.set_ylabel('Age')
ax.set_yticks(np.arange(0, 100, 10))
ax.set_xlabel('Gender')
ax.set_title('Box Chart for Age by Gender')
# Plot histograms of the female and male ages to compare their distributions.
# Select the style before drawing: a style only affects artists created
# after it is applied.
plt.style.use('seaborn-deep')
# Give the histogram its own figure number; figure 1 already holds the box
# chart, and reusing the number would draw on top of it.
fig2 = plt.figure(2, figsize=(9, 6))
ax2 = fig2.add_subplot(111)
hist = ax2.hist(data, bins=15, label=['Female', 'Male'])
ax2.legend(loc='upper right')
ax2.set_xlabel('Age')
ax2.set_ylabel('Number of passengers')
ax2.set_title('Histogram of Age by Gender')
# Count the passengers in every (Sex, Pclass) combination.
sex_by_class = titanic_df.groupby(['Sex', 'Pclass']).size().reset_index(name='count')
sex_by_class
# Plot females and males as a stacked bar for each class.
# Reshape to one row per class and one column per sex so each series is
# picked by label instead of by row position; a combination that does not
# occur counts as 0.
class_by_sex = sex_by_class.set_index(['Pclass', 'Sex'])['count'].unstack().fillna(0)
class_female = class_by_sex['female'].values
class_male = class_by_sex['male'].values
ind = np.arange(len(class_by_sex))
width = 0.4
fig, ax = plt.subplots()
# align='center' puts each bar on its tick whatever the matplotlib default
# is, so no manual half-width offset is needed for the ticks.
rects1 = ax.bar(ind, class_female, width, color='#e9967a', align='center')
rects2 = ax.bar(ind, class_male, width, bottom=class_female, color='#4169e1', align='center')
ax.set_ylabel('Number of People')
ax.set_xticks(ind)
# Tick labels follow the classes actually present, e.g. 'Class1'.
ax.set_xticklabels(['Class{0}'.format(pclass) for pclass in class_by_sex.index])
ax.set_yticks(np.arange(0, 600, 100))
ax.legend((rects1[0], rects2[0]), ('Female', 'Male'), loc=2)
ax.set_xlabel('Passenger Class')
ax.set_title('Count of Passenger Gender by Class')
# Count the passengers in every (Survived, Pclass) combination.
survival_groupby_class = (
    titanic_df.groupby(['Survived', 'Pclass']).size().reset_index(name='count')
)
survival_groupby_class
# Calculate the survival rate for each class.
# Both series are indexed by Pclass, so the division pairs survivors and
# totals by class label; a class without survivors gets a rate of 0.
survivors_by_class = (
    survival_groupby_class[survival_groupby_class['Survived'] == 1]
    .groupby('Pclass')['count'].sum()
)
total_by_class = survival_groupby_class.groupby('Pclass')['count'].sum()
print(survivors_by_class.reset_index())
print(total_by_class.reset_index())
survival_rate_calculation = (
    survivors_by_class.reindex(total_by_class.index, fill_value=0)
    / total_by_class.astype(float)
).reset_index()
# One rate per class, in ascending Pclass order.
survival_rate = survival_rate_calculation['count']
# Plot survived and non-survived passengers as a stacked bar for each class.
# Series are selected by the Survived == 1 label and aligned on Pclass rather
# than sliced by row position; non-survivors are the remainder of each class.
class_total = survival_groupby_class.groupby('Pclass')['count'].sum()
class_survival = (
    survival_groupby_class[survival_groupby_class['Survived'] == 1]
    .groupby('Pclass')['count'].sum()
    .reindex(class_total.index, fill_value=0)
)
class_non_survival = class_total - class_survival
ind = np.arange(len(class_total))
width = 0.4
fig, ax = plt.subplots()
# align='center' puts each bar on its tick whatever the matplotlib default
# is, so no manual half-width offset is needed for the ticks.
rects1 = ax.bar(ind, class_survival.values, width, color='#9370db', align='center')
rects2 = ax.bar(ind, class_non_survival.values, width,
                bottom=class_survival.values, color='#696969', align='center')
ax.set_ylabel('Number of People')
ax.set_xticks(ind)
# Tick labels follow the classes actually present, e.g. 'Class1'.
ax.set_xticklabels(['Class{0}'.format(pclass) for pclass in class_total.index])
ax.set_yticks(np.arange(0, 600, 100))
ax.legend((rects1[0], rects2[0]), ('Survived', 'Non_survived'), loc=2)
ax.set_xlabel('Passenger Class')
ax.set_title('Counts of Survivors and Non-Survivors by Passenger Class (Percentages are Survival Rates)')
# One percentage label per class, in the same order as the bars.
labels = ['{percent:.2%}'.format(percent=number) for number in survival_rate]
for rect, label in zip(rects1, labels):
    height = rect.get_height()
    # Write the survival rate halfway up the "survived" segment.
    ax.text(rect.get_x() + rect.get_width() / 2, height / 2, label, ha='center', va='bottom')