The Ministry wants to know in advance which pumps are most likely to be non-functional, so that it can triage its repair efforts. It asks for help in coming up with a repair-and-replace strategy that minimizes time and cost while optimizing water access. The final analysis should be presented in a presentation-style format with slides, as if it were to be shared with a real client.

See GitHub for the full Jupyter notebook: LINK

Background information

1) Preparations

# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
# Load Data
test_vals = pd.read_csv("test_set_values.csv")
train_vals = pd.read_csv("training_set_values.csv")
train_labs = pd.read_csv("training_set_labels.csv")

train = pd.merge(train_vals, train_labs, on='id')

2) Preprocessing/feature selection

train_labs.status_group.hist(grid=False, bins=3, color='red', edgecolor='black')
plt.title('Distribution of water pump states')

train_labs[['status_group', 'id']].groupby('status_group').count()
Figure 1: Status groups and their sizes
# check for features with null values
train.apply(lambda x: sum(x.isnull()))
id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_group                0
quantity                     0
quantity_group               0
source                       0
source_type                  0
source_class                 0
waterpoint_type              0
waterpoint_type_group        0
status_group                 0
dtype: int64
# To save time I am liberal about removing features: if a feature with many missing values also has
# many unique values, I will remove it from the analysis.
print('Number of unique values for the features that contain missing values')
print('Unique funders: ', len(train.funder.value_counts()))
# conclusion: too many unique funders, so let's drop this feature

print('Unique villages: ', len(train.subvillage.value_counts()))
# conclusion: too many unique villages, so let's drop this feature

print('Unique installer: ', len(train.installer.value_counts()))
# conclusion: too many unique installers, so let's drop this feature

print('Unique scheme_name: ', len(train.scheme_name.value_counts()))
# conclusion: too many unique scheme_names, so let's drop this feature

print('---')
print('Unique public_meeting: ', len(train.public_meeting.value_counts()))
# maybe useful to keep

print('Unique permit: ', len(train.permit.value_counts()))
# maybe useful to keep

print('Unique scheme_management: ', len(train.scheme_management.value_counts()))
# maybe useful to keep
Number of unique values for the features that contain missing values
Unique funders:  1897
Unique villages:  19287
Unique installer:  2145
Unique scheme_name:  2696
---
Unique public_meeting:  2
Unique permit:  2
Unique scheme_management:  12
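The comments above apply the heuristic by hand; a small helper could automate the same check (a sketch: flag_droppable and the max_unique threshold are hypothetical names, not part of the original notebook).

# Hypothetical helper: list columns that both contain nulls and have many
# unique values, i.e. the columns the heuristic above marks for removal.
def flag_droppable(df, max_unique=50):
    nulls = df.isnull().sum()
    uniques = df.nunique()
    return [c for c in df.columns if nulls[c] > 0 and uniques[c] > max_unique]

print(flag_droppable(train))  # should list funder, subvillage, installer, scheme_name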
# fixing public_meeting: fill the null values as 'Unknown'
train.public_meeting.value_counts()
True     38852
False    17492
Name: public_meeting, dtype: int64
train.public_meeting = train.public_meeting.fillna('Unknown')
# do the same for permit and fill the nulls as 'Unknown'
train.permit = train.permit.fillna('Unknown')
# Fixing scheme management
train.scheme_management.value_counts()
VWC                 36793
WUG                  5206
Water authority      3153
WUA                  2883
Water Board          2748
Parastatal           1680
Private operator     1063
Company              1061
Other                 766
SWC                    97
Trust                  72
None                    1
Name: scheme_management, dtype: int64
# I will create a function that simplifies the groups in this feature.
# Only the top 8 groups stay; all others (including missing values) become 'other'.

def grouping_func(row):
    '''Keep the top 8 values; map everything else to 'other'.'''
    if row['scheme_management']=='VWC':
        return 'vwc'
    elif row['scheme_management']=='WUG':
        return 'wug'
    elif row['scheme_management']=='Water authority':
        return 'wtr_auth'
    elif row['scheme_management']=='WUA':
        return 'wua'
    elif row['scheme_management']=='Water Board':
        return 'wtr_brd'
    elif row['scheme_management']=='Parastatal':
        return 'Parastatal'
    elif row['scheme_management']=='Private operator':
        return 'priv_op'
    elif row['scheme_management']=='Company':
        return 'comp'
    else:
        return 'other'

train['scheme_management'] = train.apply(lambda row: grouping_func(row), axis=1)
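For reference, the same top-8 grouping can be written more compactly (a sketch, shown on the untouched train_vals copy; unlike grouping_func it keeps the original category spellings rather than the shortened labels, but it likewise maps missing values to 'other'):

# Keep the 8 most frequent categories; everything else (including NaN) becomes 'other'.
top8 = train_vals.scheme_management.value_counts().nlargest(8).index
scheme_simplified = train_vals.scheme_management.where(train_vals.scheme_management.isin(top8), 'other')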
train = train.drop(columns = ['scheme_name', 'subvillage', 'installer', 'funder'])
# train.apply(lambda x: sum(x.isnull()))
# No null values in the dataset anymore. 
# The next step in data cleaning/feature selection is reviewing the string data

string_cols = train.select_dtypes(include = ['object'])
string_cols.apply(lambda x: len(x.unique()))
date_recorded              356
wpt_name                 37400
basin                        9
region                      21
lga                        125
ward                      2092
public_meeting               3
recorded_by                  1
scheme_management            9
permit                       3
extraction_type             18
extraction_type_group       13
extraction_type_class        7
management                  12
management_group             5
payment                      7
payment_type                 7
water_quality                8
quality_group                6
quantity                     5
quantity_group               5
source                      10
source_type                  7
source_class                 3
waterpoint_type              7
waterpoint_type_group        6
status_group                 3
dtype: int64
# date_recorded review
train.date_recorded.describe()
count          59400
unique           356
top       2011-03-15
freq             572
Name: date_recorded, dtype: object
# date_recorded might be very useful, since time could have an effect on the functionality of the water pump.
# First transform the data to datetime.
# The next step is to rename the column to 'days_since_recorded': take the most recent date and count back from there.

train['date_recorded'] = pd.to_datetime(train['date_recorded'])
train.date_recorded.describe()
count                   59400
unique                    356
top       2011-03-15 00:00:00
freq                      572
first     2002-10-14 00:00:00
last      2013-12-03 00:00:00
Name: date_recorded, dtype: object
# The most recent date is 2013-12-03. Subtract each date from this point to obtain a
# 'days_since_recorded' column.

train.date_recorded = pd.Timestamp(2013, 12, 3) - train.date_recorded
train.columns = ['days_since_recorded' if x == 'date_recorded' else x for x in train.columns]
train.days_since_recorded = train.days_since_recorded.dt.days
# train.days_since_recorded.describe()
# fixing wpt_name
train.wpt_name.value_counts()

# I will drop this feature (later on); it has too many unique values
# df = df.drop('wpt_name', axis=1)
none                    3563
Shuleni                 1748
Zahanati                 830
Msikitini                535
Kanisani                 323
                        ... 
Abiudi Nyarandu            1
Kwa Mzee Kirosa            1
Kwa Mapogo                 1
Kwa Kondiki Madukani       1
Kwakafilika                1
Name: wpt_name, Length: 37400, dtype: int64
# Fixing region
train.region.value_counts()

# The features region, lga and ward all contain geographic data.
# They could be highly correlated with one another, so it is better to remove them.
Iringa           5294
Shinyanga        4982
Mbeya            4639
Kilimanjaro      4379
Morogoro         4006
Arusha           3350
Kagera           3316
Mwanza           3102
Kigoma           2816
Ruvuma           2640
Pwani            2635
Tanga            2547
Dodoma           2201
Singida          2093
Mara             1969
Tabora           1959
Rukwa            1808
Mtwara           1730
Manyara          1583
Lindi            1546
Dar es Salaam     805
Name: region, dtype: int64
train.recorded_by.value_counts()
train.extraction_type.value_counts()
train.extraction_type_group.value_counts()
train.management.value_counts()
train.management_group.value_counts()
train.payment_type.value_counts()
# payment and payment_type contain the same info; I drop payment_type and keep payment.

train.quality_group.value_counts()
# water_quality and quality_group contain the same info; I drop quality_group and keep water_quality.

train.quantity_group.value_counts()
# quantity and quantity_group contain the same info; I drop quantity_group and keep quantity.

train.source_type.value_counts()
# source and source_type contain the same info; I drop source_type and keep source.
spring                  17021
shallow well            16824
borehole                11949
river/lake              10377
rainwater harvesting     2295
dam                       656
other                     278
Name: source_type, dtype: int64
# All the features evaluated in the previous steps are dropped here.
train = train.drop(['wpt_name', 'region', 'lga', 'ward', 'recorded_by', 'extraction_type', 'extraction_type_group'
             ,'management', 'management_group', 'payment_type', 'quality_group', 'quantity_group'
             , 'source_type'], axis=1)

# The following features all relate to geographic data and on initial inspection seem safe to remove.
# They are unlikely to add predictive power and could be highly correlated.
train = train.drop(['gps_height', 'region_code', 'district_code',
             'num_private', 'id'], axis=1)
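That claim can be spot-checked (a sketch, run against the untouched train_vals copy, since train no longer carries these columns):

# Illustrative only: pairwise correlation of the numeric geo columns.
print(train_vals[['gps_height', 'region_code', 'district_code']].corr())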

# I could remove construction_year to speed things up, but it might have significant predictive ability.
# df.construction_year.value_counts()
# Instead of individual years, we can group them into decades.
def group_year_in_decades(row):
    if row['construction_year'] >= 1960 and row['construction_year'] < 1970:
        return '60s'
    elif row['construction_year'] >= 1970 and row['construction_year'] < 1980:
        return '70s'
    elif row['construction_year'] >= 1980 and row['construction_year'] < 1990:
        return '80s'
    elif row['construction_year'] >= 1990 and row['construction_year'] < 2000:
        return '90s'
    elif row['construction_year'] >= 2000 and row['construction_year'] < 2010:
        return '00s'
    elif row['construction_year'] >= 2010:
        return '10s'
    else:
        return 'unknown'
    
train['construction_year'] = train.apply(lambda row: group_year_in_decades(row), axis=1)
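The row-wise function works, but pd.cut can do the same binning in a few lines (a sketch, shown on the untouched train_vals copy since train's column has just been overwritten; years of 0 encode missing values, fall outside the bins, and end up as 'unknown'):

# Right-inclusive bins: (1959, 1969] -> '60s', ..., (2009, 2100] -> '10s'.
bins = [1959, 1969, 1979, 1989, 1999, 2009, 2100]
labels = ['60s', '70s', '80s', '90s', '00s', '10s']
decades = pd.cut(train_vals.construction_year, bins=bins, labels=labels)
decades = decades.cat.add_categories('unknown').fillna('unknown')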

# final table
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             59400 non-null  float64
 1   days_since_recorded    59400 non-null  int64  
 2   longitude              59400 non-null  float64
 3   latitude               59400 non-null  float64
 4   basin                  59400 non-null  object 
 5   population             59400 non-null  int64  
 6   public_meeting         59400 non-null  object 
 7   scheme_management      59400 non-null  object 
 8   permit                 59400 non-null  object 
 9   construction_year      59400 non-null  object 
 10  extraction_type_class  59400 non-null  object 
 11  payment                59400 non-null  object 
 12  water_quality          59400 non-null  object 
 13  quantity               59400 non-null  object 
 14  source                 59400 non-null  object 
 15  source_class           59400 non-null  object 
 16  waterpoint_type        59400 non-null  object 
 17  waterpoint_type_group  59400 non-null  object 
 18  status_group           59400 non-null  object 
dtypes: float64(3), int64(2), object(14)
memory usage: 9.1+ MB
# The last step is to apply the same transformations to the test set.
test_vals = test_vals.drop(['gps_height', 'region_code', 'district_code',
             'num_private', 'id','scheme_name', 'subvillage', 'installer', 'funder','wpt_name', 
            'region', 'lga', 'ward', 'recorded_by', 'extraction_type', 'extraction_type_group',
            'management', 'management_group', 'payment_type', 'quality_group', 'quantity_group'
             , 'source_type'], axis=1)

test_vals.date_recorded = pd.Timestamp(2013, 12, 3) - pd.to_datetime(test_vals.date_recorded)
test_vals.columns = ['days_since_recorded' if x == 'date_recorded' else x for x in test_vals.columns]
test_vals.days_since_recorded = test_vals.days_since_recorded.dt.days

test_vals.permit = test_vals.permit.fillna('Unknown')
test_vals.public_meeting = test_vals.public_meeting.fillna('Unknown')

test_vals['scheme_management'] = test_vals.apply(lambda row: grouping_func(row), axis=1)
test_vals['construction_year'] = test_vals.apply(lambda row: group_year_in_decades(row), axis=1)
test_vals.shape
(14850, 18)
train.shape
# Both tables now match (the train table has one extra column, the label describing the functionality of the water pump).
(59400, 19)

3) Data analysis (Gradient Boosting Classifier)

# importing libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
# giving the test set a clearer name
test = test_vals
# train.columns

# Get dummy columns for the categorical columns and shuffle the data.
dummy_cols = ['basin', 'public_meeting', 'scheme_management', 'permit',
              'construction_year', 'extraction_type_class', 'payment', 'water_quality',
              'quantity', 'source', 'source_class', 'waterpoint_type',
             'waterpoint_type_group']

train = pd.get_dummies(train, columns = dummy_cols)
train = train.sample(frac=1).reset_index(drop=True)
test = pd.get_dummies(test, columns = dummy_cols)
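One caveat worth guarding against here (a sketch): pd.get_dummies on train and test separately produces mismatched columns whenever a category occurs in only one of the two sets. Reindexing the test set against the train feature columns fixes both the column set and the order, filling never-seen categories with 0:

# Align the dummy-encoded test set with the train features (status_group excluded).
feature_cols = train.columns.drop('status_group')
test = test.reindex(columns=feature_cols, fill_value=0)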
# Let's split the train set into train and validation sets. Also remove the target.

target = train.status_group
features = train.drop('status_group', axis=1)
X_train, X_val, y_train, y_val = train_test_split(features, target, train_size=0.8)
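Because the classes are imbalanced (see Figure 1), stratifying the split keeps the class shares equal across train and validation; a sketch (stratify and the fixed random_state are additions, not part of the original call):

X_train, X_val, y_train, y_val = train_test_split(features, target, train_size=0.8,
                                                  stratify=target, random_state=42)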
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
param_grid = {'learning_rate': [0.1],
                'max_depth': [8],
                'min_samples_leaf': [40],
                'max_features': [1.0],
                'n_estimators': [20]}                      
estimator = GridSearchCV(estimator=GradientBoostingClassifier(),
                            param_grid=param_grid,
                            n_jobs=-1)
estimator.fit(X_train, y_train)
best_params = estimator.best_params_

print(best_params)
validation_accuracy = estimator.score(X_val, y_val)
print('Validation accuracy: ', validation_accuracy)
{'learning_rate': 0.1, 'max_depth': 8, 'max_features': 1.0, 'min_samples_leaf': 40, 'n_estimators': 20}
Validation accuracy:  0.7653198653198653
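Note that the grid above holds a single value per parameter, so GridSearchCV fits one fixed model rather than comparing settings. A wider grid would enable a real search (a sketch; the values are illustrative, not tuned, and each extra value multiplies the fit time):

param_grid = {'learning_rate': [0.05, 0.1],
              'max_depth': [4, 8],
              'min_samples_leaf': [20, 40],
              'max_features': [0.5, 1.0],
              'n_estimators': [20, 100]}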

4) Data visualization

# prepare data for visualization: attach the predicted status to the test set
# in its own column instead of overwriting an existing one
status_group_predict = estimator.predict(test)
test_vis = test.copy()
test_vis['status_group'] = status_group_predict
df = test_vis[['status_group', 'longitude', 'latitude', 'population']]
df.head(5)
status_group	longitude	latitude	population
0	non functional	35.290799	-4.059696	321
1	functional	36.656709	-3.309214	300
2	functional	34.767863	-5.004344	500
3	non functional	38.058046	-9.418672	250
4	functional	35.006123	-10.950412	60
# prepare for folium
import folium

tanzania_latitude = -6.424484
tanzania_longitude = 35.035288

non_functional_pumps = df[df['status_group'] == 'non functional']
need_repair_pumps = df[df['status_group'] == 'functional needs repair']

# keep only pumps that serve an above-average population
avg_population = non_functional_pumps.population.mean()  # roughly 170
non_functional_pumps = non_functional_pumps[non_functional_pumps['population'] > avg_population]
need_repair_pumps = need_repair_pumps[need_repair_pumps['population'] > avg_population]

tanzania_map = folium.Map(location=[tanzania_latitude, tanzania_longitude], zoom_start=6)
   
for lat, lng, label in zip(non_functional_pumps.latitude, non_functional_pumps.longitude, non_functional_pumps.population):
    folium.CircleMarker([lat, lng], radius=5, color='red', popup=str(label),
                        fill=True, fill_color='red', fill_opacity=0.6).add_to(tanzania_map)

for lat, lng, label in zip(need_repair_pumps.latitude, need_repair_pumps.longitude, need_repair_pumps.population):
    folium.CircleMarker([lat, lng], radius=5, color='blue', popup=str(label),
                        fill=True, fill_color='blue', fill_opacity=0.6).add_to(tanzania_map)
    
tanzania_map
train_vals.head()
Figure 2: see the GitHub page for the full table
train_vals.quality_group.hist(grid=False, bins=6, color='red', edgecolor='black')
plt.title('Distribution of water pump water quality groups')
train2 = pd.merge(train_vals, train_labs, on='id')
pd.set_option('display.max_columns', None)

train2.head()
data_plot = pd.crosstab(index=train2['quality_group'], columns=train2['status_group'])
print(data_plot)
data_plot.plot.bar(figsize=(10,6))
data_plot2 = pd.crosstab(index=train2['extraction_type_class'], columns=train2['status_group'])
print(data_plot2)
data_plot2.plot.bar(figsize=(10,6))
data_plot3 = pd.crosstab(index=train2['quantity'], columns=train2['status_group'])
print(data_plot3)
data_plot3.plot.bar(figsize=(10,6))
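Because the categories differ greatly in size, row-normalizing the crosstab makes the status shares directly comparable across groups; a sketch:

share = pd.crosstab(index=train2['quantity'], columns=train2['status_group'], normalize='index')
share.plot.bar(stacked=True, figsize=(10,6))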
