3. Supervised learning model development

Written by Men Vuthy, 2022

Import modules

[1]:

import os
import pandas as pd
import numpy as np
np.random.seed(0)

import rasterio
import geopandas as gpd

# Import scikit-learn modules
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc
rc('text', usetex=True)

[2]:

# Read the classified data of each river (one CSV file per river)
kano_csv_path = 'data/kano_river/out_img/classified/kano_classified.csv'
yoshii_csv_path = 'data/yoshii_river/out_img/classified/yoshii_classified.csv'

Kano_classified = pd.read_csv(kano_csv_path)
Yoshii_classified = pd.read_csv(yoshii_csv_path)

[3]:

# Stack the rows of both rivers into one dataframe; ignore_index=True
# discards the per-file indices and renumbers the rows from 0
river_frames = [Kano_classified, Yoshii_classified]
classified_df = pd.concat(river_frames, ignore_index=True)

[4]:

# Flag roughly 75% of the rows for training; the remaining rows form the test set.
# A dedicated RandomState is used instead of the global NumPy stream so that
# re-running this cell (or running cells out of order) always reproduces the
# same split. RandomState(0) uses the same legacy generator as
# np.random.seed(0), so on a fresh top-to-bottom run the flags should match
# those drawn from the global stream -- assuming nothing else consumed it first.
TRAIN_FRACTION = .75
split_rng = np.random.RandomState(0)
classified_df['train'] = split_rng.uniform(0, 1, len(classified_df)) <= TRAIN_FRACTION
classified_df.head()

[4]:

	B1	G1	R1	NIR1	NDVI1	NDWI1	BSI1	B2	G2	R2	...	BSI3	B4	G4	R4	NIR4	NDVI4	NDWI4	BSI4	label	train
0	890	992	995	1965	0.242947	-0.214166	0.303773	860	912	819	...	0.325482	655	745	768	1148	0.207700	-0.172238	0.307462	6	True
1	900	990	990	2008	0.255461	-0.225427	0.306076	882	936	865	...	0.332129	652	722	735	1072	0.195912	-0.154165	0.310903	6	True
2	901	972	958	1862	0.235430	-0.198141	0.307286	877	915	854	...	0.329432	636	709	733	983	0.155239	-0.120563	0.312732	6	True
3	801	889	863	1576	0.205786	-0.160642	0.297341	846	857	799	...	0.319362	619	683	705	874	0.116680	-0.080891	0.314762	6	True
4	842	893	896	1301	0.093848	-0.063836	0.315164	821	811	749	...	0.313164	572	642	652	739	0.072273	-0.028123	0.307235	6	True

5 rows × 30 columns

[5]:

# Split the rows into training and testing dataframes using the boolean
# 'train' flag directly as a mask (no comparison against True/False needed)
is_train = classified_df['train']
train, test = classified_df[is_train], classified_df[~is_train]

# Sanity check: every row must land in exactly one of the two subsets
assert len(train) + len(test) == len(classified_df)

# show the number of observations for test and train dataframe
print('Training data:', len(train))
print('Testing data:', len(test))

Training data: 1383093
Testing data: 462144

[6]:

# Feature columns are every column except the target ('label') and the
# train/test flag ('train'). Selecting by name rather than by position
# keeps the list correct if columns are added or reordered upstream;
# Index.drop raises a KeyError if either name is missing.
features = classified_df.columns.drop(['label', 'train'])

features

[6]:

Index(['B1', 'G1', 'R1', 'NIR1', 'NDVI1', 'NDWI1', 'BSI1', 'B2', 'G2', 'R2',
       'NIR2', 'NDVI2', 'NDWI2', 'BSI2', 'B3', 'G3', 'R3', 'NIR3', 'NDVI3',
       'NDWI3', 'BSI3', 'B4', 'G4', 'R4', 'NIR4', 'NDVI4', 'NDWI4', 'BSI4'],
      dtype='object')

[7]:

# The land-use labels are already numeric, so they are used as-is
# (no factorizing step is required)
classes = train.loc[:, 'label']

[8]:

# Random forest settings: 150 trees, out-of-bag scoring enabled, fixed seed
forest_params = dict(n_estimators=150, oob_score=True, random_state=0)

# Build the classifier and train it on the feature columns of the training rows
clf = RandomForestClassifier(**forest_params).fit(train[features], classes)

[9]:

# Persist the fitted classifier to disk
model_file = "./random_forest.joblib"
joblib.dump(clf, model_file)

# Read it back; the loaded object is ready to use without constructing
# a classifier first
rfc_model = joblib.load(model_file)

[10]:

# Print the importance the fitted forest assigns to each feature column,
# pairing feature names with importances in order
importance_line = 'Band {b} importance: {imp}'
for band_name, score in zip(features, rfc_model.feature_importances_):
    print(importance_line.format(b=band_name, imp=score))

Band B1 importance: 0.022640564247320704
Band G1 importance: 0.04775373986536404
Band R1 importance: 0.03305181014428356
Band NIR1 importance: 0.08998159842070703
Band NDVI1 importance: 0.021422512253227544
Band NDWI1 importance: 0.0251564608437582
Band BSI1 importance: 0.016150975502086287
Band B2 importance: 0.020143180151256875
Band G2 importance: 0.02750993202560692
Band R2 importance: 0.03344753939644612
Band NIR2 importance: 0.11919608698774228
Band NDVI2 importance: 0.027123799062126603
Band NDWI2 importance: 0.03763494172800262
Band BSI2 importance: 0.017902071552508066
Band B3 importance: 0.020224980301024903
Band G3 importance: 0.022142024296447817
Band R3 importance: 0.027312165405646797
Band NIR3 importance: 0.07469120978582319
Band NDVI3 importance: 0.025987768747866326
Band NDWI3 importance: 0.028268420759686608
Band BSI3 importance: 0.016441553188424125
Band B4 importance: 0.022289771094910774
Band G4 importance: 0.031409771837951336
Band R4 importance: 0.04344981684834436
Band NIR4 importance: 0.06311245118705901
Band NDVI4 importance: 0.028885612361275136
Band NDWI4 importance: 0.041524950168692044
Band BSI4 importance: 0.015144291836410532

[11]:

# Report the out-of-bag accuracy estimate as a percentage
oob_percent = rfc_model.oob_score_ * 100
print('Our OOB prediction of accuracy is: {oob}%'.format(oob=oob_percent))

Our OOB prediction of accuracy is: 88.7774719415108%

Predicting test dataset

[12]:

# Predict a label for every test row from its feature columns
test_features = test[features]
preds = rfc_model.predict(test_features)

[13]:

# Accuracy of the predictions on the held-out test rows, as a percentage
# (a single train/test split, not k-fold cross-validation)
test_accuracy = accuracy_score(test['label'], preds)
print('Our classification accuracy is: {cv}%'.format(cv=test_accuracy * 100))

Our classification accuracy is: 88.93137203988367%

Visualizing confusion matrix

[14]:

# Confusion matrix of the test labels against the predictions
Matrix = confusion_matrix(test['label'], preds)

# Row-normalised version: each entry divided by the sum of its row;
# keepdims=True keeps the sums as a column so they broadcast across each row
row_totals = Matrix.sum(axis=1, keepdims=True)
matrix = Matrix.astype(float) / row_totals

[15]:

# Build the heatmap of the un-normalised confusion matrix (Matrix).
# fmt='d' prints each count as a full integer; seaborn's default annotation
# format ('.2g') would abbreviate large counts to two significant digits.
plt.figure(figsize=(15,5))
sns.heatmap(Matrix, annot=True, fmt='d', annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2)

# Add labels to the plot; the 0.5 offset puts each tick at the centre of a cell
class_names = ['1', '2', '3', '4', '5', '6', '7']
tick_centers = np.arange(len(class_names)) + 0.5
plt.xticks(tick_centers, class_names, rotation=25, fontsize=10)
plt.yticks(tick_centers, class_names, rotation=0, fontsize=10)
plt.xlabel('Predicted label', fontsize=12)
plt.ylabel('True label', fontsize=12)
plt.title('Confusion Matrix for Random Forest Model')
plt.show()

../../../../_images/Content_Project_2022_kano-and-yoshii-river_3-RFC_model-_Kano&Yoshii_River_19_0.png

[16]:

# Build the heatmap of the row-normalised confusion matrix (matrix)
plt.figure(figsize=(15,5))
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2)

# Add labels to the plot; the 0.5 offset puts each tick at the centre of a cell
class_names = ['1', '2', '3', '4', '5', '6', '7']
tick_centers = np.arange(len(class_names)) + 0.5
plt.xticks(tick_centers, class_names, rotation=25, fontsize=10)
plt.yticks(tick_centers, class_names, rotation=0, fontsize=10)
plt.xlabel('Predicted label', fontsize=12)
plt.ylabel('True label', fontsize=12)

# Take the accuracy from the predictions instead of hard-coding it, so the
# title stays correct if the model or the split changes.
# '%' has to reach matplotlib as '\%' because text is rendered with LaTeX
# (usetex=True); the backslash is written as '\\' since '\%' is not a valid
# Python escape sequence and triggers a warning.
accuracy_pct = accuracy_score(test['label'], preds) * 100
plt.title('Confusion Matrix for Random Forest Model\n'
          '75\\% training and 25\\% testing\n'
          'Accuracy score is {acc:.2f}\\%'.format(acc=accuracy_pct))
# plt.savefig('confusion-matrix.png', dpi=300)
plt.show()

../../../../_images/Content_Project_2022_kano-and-yoshii-river_3-RFC_model-_Kano&Yoshii_River_20_0.png