In [1]:
import pandas as pd
In [2]:
data = pd.read_csv('train.csv')
In [3]:
data.head()
Out [3]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [5]:
data.tail(3)
Out [5]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.45 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.00 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.75 NaN Q
In [7]:
data.shape
Out [7]:
(891, 12)
In [8]:
data.info
Out [8]:
<bound method DataFrame.info of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ...    ...   
886                              Montvila, Rev. Juozas    male  27.0      0   
887                       Graham, Miss. Margaret Edith  female  19.0      0   
888           Johnston, Miss. Catherine Helen "Carrie"  female   NaN      1   
889                              Behr, Mr. Karl Howell    male  26.0      0   
890                                Dooley, Mr. Patrick    male  32.0      0   

     Parch            Ticket     Fare Cabin Embarked  
0        0         A/5 21171   7.2500   NaN        S  
1        0          PC 17599  71.2833   C85        C  
2        0  STON/O2. 3101282   7.9250   NaN        S  
3        0            113803  53.1000  C123        S  
4        0            373450   8.0500   NaN        S  
..     ...               ...      ...   ...      ...  
886      0            211536  13.0000   NaN        S  
887      0            112053  30.0000   B42        S  
888      2        W./C. 6607  23.4500   NaN        S  
889      0            111369  30.0000  C148        C  
890      0            370376   7.7500   NaN        Q  

[891 rows x 12 columns]>
In [11]:
data.describe(include='all')
Out [11]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
count 891.000000 891.000000 891.000000 891 891 714.000000 891.000000 891.000000 891 891.000000 204 889
unique NaN NaN NaN 891 2 NaN NaN NaN 681 NaN 147 3
top NaN NaN NaN Braund, Mr. Owen Harris male NaN NaN NaN 347082 NaN B96 B98 S
freq NaN NaN NaN 1 577 NaN NaN NaN 7 NaN 4 644
mean 446.000000 0.383838 2.308642 NaN NaN 29.699118 0.523008 0.381594 NaN 32.204208 NaN NaN
std 257.353842 0.486592 0.836071 NaN NaN 14.526497 1.102743 0.806057 NaN 49.693429 NaN NaN
min 1.000000 0.000000 1.000000 NaN NaN 0.420000 0.000000 0.000000 NaN 0.000000 NaN NaN
25% 223.500000 0.000000 2.000000 NaN NaN 20.125000 0.000000 0.000000 NaN 7.910400 NaN NaN
50% 446.000000 0.000000 3.000000 NaN NaN 28.000000 0.000000 0.000000 NaN 14.454200 NaN NaN
75% 668.500000 1.000000 3.000000 NaN NaN 38.000000 1.000000 0.000000 NaN 31.000000 NaN NaN
max 891.000000 1.000000 3.000000 NaN NaN 80.000000 8.000000 6.000000 NaN 512.329200 NaN NaN
In [13]:
data[data['Sex'] == 'male']
Out [13]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
883 884 0 2 Banfield, Mr. Frederick James male 28.0 0 0 C.A./SOTON 34068 10.5000 NaN S
884 885 0 3 Sutehall, Mr. Henry Jr male 25.0 0 0 SOTON/OQ 392076 7.0500 NaN S
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

577 rows × 12 columns

In [14]:
sum(data['Sex'] == 'male')
Out [14]:
577
In [16]:
#deaths
sum(data['Survived'] == 0)
Out [16]:
549
In [20]:
data.isnull().sum()
Out [20]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
In [24]:
import matplotlib.pyplot as pp
In [26]:
per_missing = data.isnull().sum() * 100 / len(data)
per_missing
Out [26]:
PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64
In [34]:
data['Sex'].unique()
Out [34]:
array(['male', 'female'], dtype=object)
In [ ]: