The Ministry wants to know in advance which pumps are most likely to be non-functional, so that they can triage their repair efforts. They ask for help in coming up with a repair-and-replace strategy that minimizes time and cost and optimizes water access. The final analysis should be presented in a presentation-style format with slides, as if it were to be shared with a real client.
See GitHub for the full Jupyter notebook: LINK
Background information
- https://www.drivendata.org/competitions/7/pump-it-up-data-mining-the-water-table/
- https://ednafernandes.medium.com/pump-it-up-data-mining-the-water-table-7d7a5d2ebb38
1) Preparations
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load Data
test_vals = pd.read_csv("test_set_values.csv")
train_vals = pd.read_csv("training_set_values.csv")
train_labs = pd.read_csv("training_set_labels.csv")
train = pd.merge(train_vals, train_labs, on='id')
2) Preprocessing/feature selection
train_labs.status_group.hist(grid=False, bins=3, color='red', edgecolor='black')
plt.title('Distribution of water pump states')
train_labs[['status_group', 'id']].groupby('status_group').count()
# check for features with null values
train.apply(lambda x: sum(x.isnull()))
id 0
amount_tsh 0
date_recorded 0
funder 3635
gps_height 0
installer 3655
longitude 0
latitude 0
wpt_name 0
num_private 0
basin 0
subvillage 371
region 0
region_code 0
district_code 0
lga 0
ward 0
population 0
public_meeting 3334
recorded_by 0
scheme_management 3877
scheme_name 28166
permit 3056
construction_year 0
extraction_type 0
extraction_type_group 0
extraction_type_class 0
management 0
management_group 0
payment 0
payment_type 0
water_quality 0
quality_group 0
quantity 0
quantity_group 0
source 0
source_type 0
source_class 0
waterpoint_type 0
waterpoint_type_group 0
status_group 0
dtype: int64
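The counts above are absolute. As a complementary view (not in the original notebook), the missing-value share per feature makes the severity easier to judge, e.g. scheme_name is missing for roughly half of the rows:
# Fraction of missing values per feature, largest first
train.isnull().mean().sort_values(ascending=False).head(8)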
# To save time I am liberal when it comes to removing features: if a feature with many missing values also has
# many unique values, I will remove it from the analysis.
print('Number of unique values for the features which contain missing values')
print('Unique funders: ', len(train.funder.value_counts()))
# conclusion: too many unique funders, so let's drop this feature
print('Unique villages: ', len(train.subvillage.value_counts()))
# conclusion: too many unique villages, so let's drop this feature
print('Unique installer: ', len(train.installer.value_counts()))
# conclusion: too many unique installers, so let's drop this feature
print('Unique scheme_name: ', len(train.scheme_name.value_counts()))
# conclusion: too many unique scheme names, so let's drop this feature
print('---')
print('Unique public_meeting: ', len(train.public_meeting.value_counts()))
# maybe useful to keep
print('Unique permit: ', len(train.permit.value_counts()))
# maybe useful to keep
print('Unique scheme_management: ', len(train.scheme_management.value_counts()))
# maybe useful to keep
Number of unique values for the features which contain missing values
Unique funders: 1897
Unique villages: 19287
Unique installer: 2145
Unique scheme_name: 2696
---
Unique public_meeting: 2
Unique permit: 2
Unique scheme_management: 12
# fixing public_meeting: fill the null values with 'Unknown'
train.public_meeting = train.public_meeting.fillna('Unknown')
# fixing permit
train.permit.value_counts()
True     38852
False    17492
Name: permit, dtype: int64
# do the same as for public_meeting and fill the null values with 'Unknown'.
train.permit = train.permit.fillna('Unknown')
# Fixing scheme management
train.scheme_management.value_counts()
VWC 36793
WUG 5206
Water authority 3153
WUA 2883
Water Board 2748
Parastatal 1680
Private operator 1063
Company 1061
Other 766
SWC 97
Trust 72
None 1
Name: scheme_management, dtype: int64
# I will create a function which will simplify the groups in this feature.
# Only the top 8 groups will stay, all others will become other.
def grouping_func(row):
    '''Keep the top 8 values; map everything else to 'other'.'''
    if row['scheme_management'] == 'VWC':
        return 'vwc'
    elif row['scheme_management'] == 'WUG':
        return 'wug'
    elif row['scheme_management'] == 'Water authority':
        return 'wtr_auth'
    elif row['scheme_management'] == 'WUA':
        return 'wua'
    elif row['scheme_management'] == 'Water Board':
        return 'wtr_brd'
    elif row['scheme_management'] == 'Parastatal':
        return 'Parastatal'
    elif row['scheme_management'] == 'Private operator':
        return 'priv_op'
    elif row['scheme_management'] == 'Company':
        return 'comp'
    else:
        return 'other'
train['scheme_management'] = train.apply(lambda row: grouping_func(row), axis=1)
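As a side note, the same top-8 grouping can be written more compactly with value_counts and Series.where. A sketch of this alternative (it keeps the original category names instead of the abbreviations, and would replace the function above rather than run after it):
# Compact alternative: keep the 8 most frequent categories, map everything else (incl. nulls) to 'other'
top8 = train.scheme_management.value_counts().nlargest(8).index
train['scheme_management'] = train.scheme_management.where(train.scheme_management.isin(top8), 'other')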
train = train.drop(columns = ['scheme_name', 'subvillage', 'installer', 'funder'])
# train.apply(lambda x: sum(x.isnull()))
# No null values in the dataset anymore.
# Next step in data cleaning/ feature selection is reviewing string data
string_cols = train.select_dtypes(include = ['object'])
string_cols.apply(lambda x: len(x.unique()))
date_recorded 356
wpt_name 37400
basin 9
region 21
lga 125
ward 2092
public_meeting 3
recorded_by 1
scheme_management 9
permit 3
extraction_type 18
extraction_type_group 13
extraction_type_class 7
management 12
management_group 5
payment 7
payment_type 7
water_quality 8
quality_group 6
quantity 5
quantity_group 5
source 10
source_type 7
source_class 3
waterpoint_type 7
waterpoint_type_group 6
status_group 3
dtype: int64
# date_recorded review
train.date_recorded.describe()
count 59400
unique 356
top 2011-03-15
freq 572
Name: date_recorded, dtype: object
# date_recorded might be very useful, since time could have an effect on the functionality of the water pump.
# First transform the data to datetime.
# The next step is to change the column to 'days_since_recorded': take the most recent date and count back from there.
train['date_recorded'] = pd.to_datetime(train['date_recorded'])
train.date_recorded.describe()
count 59400
unique 356
top 2011-03-15 00:00:00
freq 572
first 2002-10-14 00:00:00
last 2013-12-03 00:00:00
Name: date_recorded, dtype: object
# The most recent date is 2013-12-03. Subtract each date from this point to obtain a
# 'days_since_recorded' column.
train.date_recorded = pd.Timestamp(2013, 12, 3) - pd.to_datetime(train.date_recorded)
train.columns = ['days_since_recorded' if x=='date_recorded' else x for x in train.columns]
train.days_since_recorded = train.days_since_recorded.dt.days
# train.days_since_recorded.describe()
# fixing wpt_name
train.wpt_name.value_counts()
# I will drop this feature later on; it has too many unique values
# train = train.drop('wpt_name', axis=1)
none 3563
Shuleni 1748
Zahanati 830
Msikitini 535
Kanisani 323
...
Abiudi Nyarandu 1
Kwa Mzee Kirosa 1
Kwa Mapogo 1
Kwa Kondiki Madukani 1
Kwakafilika 1
Name: wpt_name, Length: 37400, dtype: int64
# Fixing region
train.region.value_counts()
# The features region, lga and ward all contain geographic data.
# They are likely highly correlated with the coordinates, so it is better to remove them.
Iringa 5294
Shinyanga 4982
Mbeya 4639
Kilimanjaro 4379
Morogoro 4006
Arusha 3350
Kagera 3316
Mwanza 3102
Kigoma 2816
Ruvuma 2640
Pwani 2635
Tanga 2547
Dodoma 2201
Singida 2093
Mara 1969
Tabora 1959
Rukwa 1808
Mtwara 1730
Manyara 1583
Lindi 1546
Dar es Salaam 805
Name: region, dtype: int64
train.recorded_by.value_counts()
train.extraction_type.value_counts()
train.extraction_type_group.value_counts()
train.management.value_counts()
train.management_group.value_counts()
train.payment_type.value_counts()
# payment and payment_type contain the same info; I drop payment_type and keep payment.
train.quality_group.value_counts()
# water_quality and quality_group contain the same info; I drop quality_group and keep water_quality.
train.quantity_group.value_counts()
# quantity and quantity_group contain the same info; I drop quantity_group and keep quantity.
train.source_type.value_counts()
# source and source_type contain the same info, I drop source_type and keep source.
spring 17021
shallow well 16824
borehole 11949
river/lake 10377
rainwater harvesting 2295
dam 656
other 278
Name: source_type, dtype: int64
# All the features evaluated in the previous steps are dropped here.
train = train.drop(['wpt_name', 'region', 'lga', 'ward', 'recorded_by', 'extraction_type', 'extraction_type_group'
,'management', 'management_group', 'payment_type', 'quality_group', 'quantity_group'
, 'source_type'], axis=1)
# The following features all relate to geographic data, which on initial inspection seems safe to remove.
# They are unlikely to add predictive power and could be highly correlated with the coordinates.
train = train.drop(['gps_height', 'region_code', 'district_code',
'num_private', 'id'], axis=1)
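The "highly correlated" claim can be backed with a quick correlation check on the raw training values; a sketch (not part of the original notebook; it uses train_vals because train no longer holds these columns):
# How strongly do the dropped geo columns track the coordinates?
geo_cols = ['gps_height', 'region_code', 'district_code', 'longitude', 'latitude']
print(train_vals[geo_cols].corr().round(2))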
# I could remove construction_year to speed things up, but it might have significant predictive ability.
# train.construction_year.value_counts()
# Instead of individual years, we can group them into decades.
def group_year_in_decades(row):
    if row['construction_year'] >= 1960 and row['construction_year'] < 1970:
        return '60s'
    elif row['construction_year'] >= 1970 and row['construction_year'] < 1980:
        return '70s'
    elif row['construction_year'] >= 1980 and row['construction_year'] < 1990:
        return '80s'
    elif row['construction_year'] >= 1990 and row['construction_year'] < 2000:
        return '90s'
    elif row['construction_year'] >= 2000 and row['construction_year'] < 2010:
        return '00s'
    elif row['construction_year'] >= 2010:
        return '10s'
    else:
        return 'unknown'
train['construction_year'] = train.apply(lambda row: group_year_in_decades(row), axis=1)
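For reference, the same decade grouping can be done in a vectorized way with pd.cut; a sketch on the raw years (train's construction_year has already been converted at this point, so this uses train_vals):
# Bin raw years into decades; 0 and anything before 1960 fall into the 'unknown' bin
bins = [-1, 1959, 1969, 1979, 1989, 1999, 2009, 3000]
labels = ['unknown', '60s', '70s', '80s', '90s', '00s', '10s']
decades = pd.cut(train_vals['construction_year'], bins=bins, labels=labels)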
# final table
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 19 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 amount_tsh 59400 non-null float64
1 days_since_recorded 59400 non-null int64
2 longitude 59400 non-null float64
3 latitude 59400 non-null float64
4 basin 59400 non-null object
5 population 59400 non-null int64
6 public_meeting 59400 non-null object
7 scheme_management 59400 non-null object
8 permit 59400 non-null object
9 construction_year 59400 non-null object
10 extraction_type_class 59400 non-null object
11 payment 59400 non-null object
12 water_quality 59400 non-null object
13 quantity 59400 non-null object
14 source 59400 non-null object
15 source_class 59400 non-null object
16 waterpoint_type 59400 non-null object
17 waterpoint_type_group 59400 non-null object
18 status_group 59400 non-null object
dtypes: float64(3), int64(2), object(14)
memory usage: 9.1+ MB
# The last step is to apply the same preprocessing to the test set.
test_vals = test_vals.drop(['gps_height', 'region_code', 'district_code',
'num_private', 'id','scheme_name', 'subvillage', 'installer', 'funder','wpt_name',
'region', 'lga', 'ward', 'recorded_by', 'extraction_type', 'extraction_type_group',
'management', 'management_group', 'payment_type', 'quality_group', 'quantity_group'
, 'source_type'], axis=1)
test_vals.date_recorded = pd.Timestamp(2013, 12, 3) - pd.to_datetime(test_vals.date_recorded)
test_vals.columns = ['days_since_recorded' if x=='date_recorded' else x for x in test_vals.columns]
test_vals.days_since_recorded = test_vals.days_since_recorded.dt.days
test_vals.permit = test_vals.permit.fillna('Unknown')
test_vals.public_meeting = test_vals.public_meeting.fillna('Unknown')
test_vals['scheme_management'] = test_vals.apply(lambda row: grouping_func(row), axis=1)
test_vals['construction_year'] = test_vals.apply(lambda row: group_year_in_decades(row), axis=1)
test_vals.shape
(14850, 18)
train.shape
# Both tables now have the same columns (the train table has one extra column with the label for the functionality of the water pump).
(59400, 19)
3) Data analysis (Gradient Boosting Classifier)
# importing libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
# giving the test set a clearer name
test = test_vals
# train.columns
# Get dummy columns for the categorical columns and shuffle the data.
dummy_cols = ['basin', 'public_meeting', 'scheme_management', 'permit',
'construction_year', 'extraction_type_class', 'payment', 'water_quality',
'quantity', 'source', 'source_class', 'waterpoint_type',
'waterpoint_type_group']
train = pd.get_dummies(train, columns = dummy_cols)
train = train.sample(frac=1).reset_index(drop=True)
test = pd.get_dummies(test, columns = dummy_cols)
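One pitfall of calling get_dummies separately on train and test is that a category present in only one of the two sets leads to mismatched columns, which breaks predict later. A defensive alignment step (a sketch, not in the original notebook) keeps the test columns identical to the training features:
# Align the test dummy columns with the training features: add missing columns as 0,
# drop unseen ones, and keep the same ordering.
feature_cols = train.drop('status_group', axis=1).columns
test = test.reindex(columns=feature_cols, fill_value=0)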
# Let's split the train set into train and validation sets. Also remove the target.
target = train.status_group
features = train.drop('status_group', axis=1)
X_train, X_val, y_train, y_val = train_test_split(features, target, train_size=0.8)
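Because the three status classes are imbalanced, a stratified split keeps their proportions identical in the train and validation sets; an optional variant of the split above:
X_train, X_val, y_train, y_val = train_test_split(features, target, train_size=0.8, stratify=target, random_state=42)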
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
param_grid = {'learning_rate': [0.1],
'max_depth': [8],
'min_samples_leaf': [40],
'max_features': [1.0],
'n_estimators': [20]}
estimator = GridSearchCV(estimator=GradientBoostingClassifier(),
param_grid=param_grid,
n_jobs=-1)
estimator.fit(X_train, y_train)
best_params = estimator.best_params_
print (best_params)
validation_accuracy = estimator.score(X_val, y_val)
print('Validation accuracy: ', validation_accuracy)
{'learning_rate': 0.1, 'max_depth': 8, 'max_features': 1.0, 'min_samples_leaf': 40, 'n_estimators': 20}
Validation accuracy: 0.7653198653198653
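Note that the grid above contains a single value per hyperparameter, so GridSearchCV effectively just cross-validates one configuration. To run an actual search one could widen the grid, for example (illustrative values only; training time grows with the number of combinations):
param_grid = {'learning_rate': [0.05, 0.1],
              'max_depth': [4, 8],
              'min_samples_leaf': [20, 40],
              'max_features': [0.5, 1.0],
              'n_estimators': [50, 100]}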
4) Data visualization
# prepare data for visualization: predict the status of each pump in the test set
status_group_predict = pd.DataFrame(estimator.predict(test))
test_vis = test
# shortcut: store the predicted label in the (no longer needed) amount_tsh column
test_vis['amount_tsh'] = status_group_predict
df = test_vis[['amount_tsh', 'longitude', 'latitude', 'population']]
df.head(5)
amount_tsh longitude latitude population
0 non functional 35.290799 -4.059696 321
1 functional 36.656709 -3.309214 300
2 functional 34.767863 -5.004344 500
3 non functional 38.058046 -9.418672 250
4 functional 35.006123 -10.950412 60
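Overwriting amount_tsh with the predicted label works, but is easy to misread. An equivalent, clearer variant would keep the prediction in its own column (the later filters would then refer to 'prediction' instead of 'amount_tsh'):
test_vis = test.copy()
test_vis['prediction'] = estimator.predict(test)
df = test_vis[['prediction', 'longitude', 'latitude', 'population']]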
# prepare for folium
from sklearn.cluster import KMeans
import folium
tanzania_latitude = -6.424484
tanzania_longitude = 35.035288
non_functional_pumps = df[df['amount_tsh'].str.contains('non functional')]
need_repair_pumps = df[df['amount_tsh'].str.contains('functional needs repair')]
# remove non-functional pumps serving less than the average population
non_functional_pumps.population.mean()
# the computed mean is used as the cut-off (roughly 170 people)
non_functional_pumps = non_functional_pumps[non_functional_pumps['population'] > 170]
need_repair_pumps = need_repair_pumps[need_repair_pumps['population'] > 170]
tanzania_map = folium.Map(location=[tanzania_latitude, tanzania_longitude], zoom_start=6)
# red markers: predicted non-functional pumps (popup shows the population served)
for lat, lng, label in zip(non_functional_pumps.latitude, non_functional_pumps.longitude, non_functional_pumps.population):
    folium.CircleMarker([lat, lng], radius=5, color='red', popup=str(label), fill=True,
                        fill_color='red', fill_opacity=0.6).add_to(tanzania_map)
# blue markers: predicted functional-needs-repair pumps
for lat, lng, label in zip(need_repair_pumps.latitude, need_repair_pumps.longitude, need_repair_pumps.population):
    folium.CircleMarker([lat, lng], radius=5, color='blue', popup=str(label), fill=True,
                        fill_color='blue', fill_opacity=0.6).add_to(tanzania_map)
tanzania_map
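Since the final deliverable is a slide deck, the interactive map can be exported to a standalone HTML file and screenshotted for the slides; folium supports this directly (file name chosen here for illustration):
# Save the interactive map for embedding or screenshots
tanzania_map.save('tanzania_pump_map.html')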
train_vals.head()
train_vals.quality_group.hist(grid=False, bins=6, color='red', edgecolor='black')
plt.title('Distribution of water pump water quality groups')
train2 = pd.merge(train_vals, train_labs, on='id')
pd.set_option('display.max_columns', None)
train2.head()
data_plot2 = pd.crosstab(index=train2['extraction_type_class'], columns=train2['status_group'])
print(data_plot2)
data_plot2.plot.bar(figsize=(10,6))
data_plot3 = pd.crosstab(index=train2['quantity'], columns=train2['status_group'])
print(data_plot3)
data_plot3.plot.bar(figsize=(10,6))
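The raw counts in these crosstabs are dominated by how common each category is. A small addition (not in the original notebook): row-normalized proportions make the functional / non-functional split easier to compare across categories.
# Share of each status within every quantity level (rows sum to 1)
prop = pd.crosstab(index=train2['quantity'], columns=train2['status_group'], normalize='index')
prop.plot.bar(stacked=True, figsize=(10,6))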