6. KMeans Clustering and Results

Written by Men Vuthy, 2021


Import packages

[1]:
import os
import rasterio
import rasterio.mask
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# sklearn & Scipy Libraries
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score

import fiona

Input Feature Data

[2]:
# Reading dataset
NN_NDVI = pd.read_csv('output/3/no_noise_data/no_noise_ndvi.csv')

# Data manipulation
Input_DF = NN_NDVI.iloc[4:,:].T
Input_DF
[2]:
4 5 6 7 8 9 10 11 12 13 ... 59 60 61 62 63 64 65 66 67 68
464 0.51950 0.57864 0.63714 0.64732 0.67500 0.67774 0.68980 0.69454 0.69234 0.67418 ... 0.66744 0.65592 0.61508 0.53592 0.49996 0.49064 0.52672 0.50306 0.58068 0.62020
465 0.53694 0.54528 0.59784 0.63970 0.65370 0.65422 0.68336 0.69858 0.65140 0.63988 ... 0.59776 0.47894 0.32178 0.20738 0.12034 0.03210 0.13908 0.17912 0.24692 0.30328
466 0.55552 0.58060 0.64768 0.68540 0.67414 0.67482 0.69040 0.69288 0.63874 0.59288 ... 0.59052 0.41846 0.25602 0.13288 0.04092 -0.03880 0.11154 0.11886 0.20148 0.26276
467 0.55560 0.63836 0.66420 0.70210 0.69084 0.71550 0.72062 0.72310 0.68032 0.60178 ... 0.51830 0.37740 0.24368 0.16522 0.09862 0.03156 0.12450 0.13378 0.21640 0.29612
1215 0.70558 0.74582 0.74244 0.73444 0.75022 0.73344 0.73354 0.73058 0.73094 0.71617 ... 0.67193 0.62406 0.58591 0.49347 0.54397 0.51235 0.55181 0.59130 0.67986 0.69842
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
391324 0.80152 0.79928 0.79852 0.80118 0.79846 0.76344 0.76540 0.77446 0.76666 0.65752 ... 0.65426 0.57752 0.62126 0.56004 0.48050 0.53128 0.56300 0.50778 0.57666 0.69396
391325 0.82454 0.82177 0.82139 0.82083 0.76687 0.75187 0.74798 0.75704 0.72350 0.75358 ... 0.61884 0.61610 0.65594 0.61808 0.45692 0.53786 0.42998 0.28714 0.36382 0.53096
392074 0.83522 0.82322 0.82546 0.82742 0.82954 0.81118 0.73832 0.74184 0.70344 0.60238 ... 0.61174 0.61277 0.69493 0.66797 0.65902 0.75882 0.74525 0.74403 0.76445 0.77786
392075 0.83498 0.83440 0.83656 0.83886 0.84306 0.82860 0.74522 0.74880 0.72850 0.61764 ... 0.54754 0.46808 0.53508 0.50970 0.46386 0.57238 0.64828 0.58970 0.61062 0.73332
392076 0.80760 0.80264 0.80284 0.79794 0.79396 0.74928 0.75226 0.75998 0.71980 0.61102 ... 0.54766 0.49084 0.55458 0.57926 0.55106 0.59722 0.56852 0.47848 0.50222 0.62462

192289 rows × 65 columns

[3]:
# Set X as input feature data
X_KMeans = Input_DF

Elbow Method

Elbow curve method is used to identify the ideal number of clusters.

[4]:
# Let's use the elbow curve method to identify the ideal number of clusters
Error = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i).fit(X_KMeans)
    kmeans.fit(X_KMeans)
    Error.append(kmeans.inertia_)
[5]:
plt.plot(range(1, 11), Error)
plt.title('Elbow method')
plt.xlabel('No of clusters')
plt.ylabel('Error')
plt.show()
../../../../_images/Content_Project_2021_paddy-area-classification_6._kmeans-clustering_9_0.png

Based on the curve, 6 clusters are used.

Classification

[6]:
# Apply KMeans clustering
kmeans = KMeans(n_clusters=6, init='k-means++', max_iter=500, random_state=42)

Y_KMeans = kmeans.fit(X_KMeans)
[7]:
# Assign label
X_group = X_KMeans.copy()
X_group['cluster_id'] = kmeans.labels_
X_group.head()
[7]:
4 5 6 7 8 9 10 11 12 13 ... 60 61 62 63 64 65 66 67 68 cluster_id
464 0.51950 0.57864 0.63714 0.64732 0.67500 0.67774 0.68980 0.69454 0.69234 0.67418 ... 0.65592 0.61508 0.53592 0.49996 0.49064 0.52672 0.50306 0.58068 0.62020 0
465 0.53694 0.54528 0.59784 0.63970 0.65370 0.65422 0.68336 0.69858 0.65140 0.63988 ... 0.47894 0.32178 0.20738 0.12034 0.03210 0.13908 0.17912 0.24692 0.30328 2
466 0.55552 0.58060 0.64768 0.68540 0.67414 0.67482 0.69040 0.69288 0.63874 0.59288 ... 0.41846 0.25602 0.13288 0.04092 -0.03880 0.11154 0.11886 0.20148 0.26276 2
467 0.55560 0.63836 0.66420 0.70210 0.69084 0.71550 0.72062 0.72310 0.68032 0.60178 ... 0.37740 0.24368 0.16522 0.09862 0.03156 0.12450 0.13378 0.21640 0.29612 2
1215 0.70558 0.74582 0.74244 0.73444 0.75022 0.73344 0.73354 0.73058 0.73094 0.71617 ... 0.62406 0.58591 0.49347 0.54397 0.51235 0.55181 0.59130 0.67986 0.69842 1

5 rows × 66 columns

[8]:
# Checking how many data points are there in each cluster
X_group['cluster_id'].value_counts()
[8]:
5    51797
4    41958
2    30122
1    28829
0    25071
3    14512
Name: cluster_id, dtype: int64
[9]:
# Scatter plot
sns.scatterplot(x=10, y = 11, hue = 'cluster_id', s=1, data = X_group, palette="deep");
../../../../_images/Content_Project_2021_paddy-area-classification_6._kmeans-clustering_15_0.png

Validation

[10]:
# Read validation data
DF_Validation = pd.read_csv('output/4/validation_data/validation_paddy.csv')

X_Validation = DF_Validation.T

[11]:
# Y true
Y_Validation = np.full((250,), 2, dtype='int32')
Y_Validation
[11]:
array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2])
[12]:
# Y prediction
Y_Prediction = kmeans.predict(X_Validation)
Y_Prediction
[12]:
array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2])

Validation Score

[13]:
# k-means performance
print("Validation score for paddy fields:", accuracy_score(Y_Validation, Y_Prediction))
Validation score for paddy fields: 0.996

Create Raster of Land Cover

[14]:
# Add one image for projection and shape reference
raster = rasterio.open("input/ndvi_2011/2011_07_28.tif")
[15]:
clust_kmean = pd.DataFrame()
clust_kmean['id'] = X_KMeans.index
clust_kmean['class'] = kmeans.labels_
[16]:
# Check the shape
raster.read().shape
[16]:
(1, 521, 753)
[17]:
# Re-arrange cluster range
indx = []
for i in range(0,392313):
    indx.append(i)

Index = pd.DataFrame()
Index['id'] = indx

df1 = Index.set_index('id')
df2 = clust_kmean.set_index('id')
df2 = clust_kmean.set_index(df2.index.astype('int64')).drop(columns=['id'])
mask = df2.index.isin(df1.index)
df1['cluster'] = df2.loc[mask, 'class']
[18]:
# Reshape the cluster array
array = np.array(df1['cluster'])
n_array = array.reshape(raster.read().shape)
[19]:
# Data dir
data_dir = "output/5/land_cover"

# Output raster
out_tif = os.path.join(data_dir, "land_cover.tif")

# Copy the metadata
out_meta = raster.meta.copy()
out_meta

# Update the metadata
out_meta.update({'driver': 'GTiff',
                 'dtype': 'float32',
                 'nodata': None,
                 'width': raster.shape[1],
                 'height': raster.shape[0],
                 'crs': raster.crs,
                 'count':1,
                 'transform': raster.transform
                })
[20]:
with rasterio.open(out_tif, "w", **out_meta) as dest:
    dest.write(n_array.astype(np.float32))

Create Raster of Paddy

[21]:
Paddy_DF = X_group.loc[X_group['cluster_id'] == 2]
[22]:
Land_cover = rasterio.open('output/5/land_cover/land_cover.tif').read()

Paddy = np.where(Land_cover != 2, np.nan, Land_cover)
[23]:
# Data dir
data_dir = "output/5/paddy_area"

# Output raster
out_tif = os.path.join(data_dir, "paddy_area.tif")

# Copy the metadata
out_meta = raster.meta.copy()
out_meta

# Update the metadata
out_meta.update({'driver': 'GTiff',
                 'dtype': 'float32',
                 'nodata': None,
                 'width': raster.shape[1],
                 'height': raster.shape[0],
                 'crs': raster.crs,
                 'count':1,
                 'transform': raster.transform
                })
[24]:
with rasterio.open(out_tif, "w", **out_meta) as dest:
    dest.write(Paddy.astype(np.float32))

Mask non-paddy area

[25]:
# Read Shape file
with fiona.open("input/non_paddy_area/non_stu_area.shp", "r") as shapefile:
    shapes = [feature["geometry"] for feature in shapefile]

# Mask area
raster_fp = "output/5/paddy_area/paddy_area.tif"
with rasterio.open(raster_fp) as src:
    out_image, out_transform = rasterio.mask.mask(src, shapes, crop=False)
    out_meta = src.meta

out_image = np.where(out_image == 0, np.nan, out_image)

# Save clipped imagery
out_meta.update({"driver": "GTiff",
                 "height": out_image.shape[1],
                 "width": out_image.shape[2],
                 "transform": out_transform})

out_fp = "output/5/paddy_area/paddy_area.tif"

with rasterio.open(out_fp, "w", **out_meta) as dest:
    dest.write(out_image)

Visualize result of paddy area

[26]:
from IPython.display import Image
Image(filename="images/paddy-area-result.png")
[26]:
../../../../_images/Content_Project_2021_paddy-area-classification_6._kmeans-clustering_38_0.png
[27]:
Image(filename="images/paddy-area-result-satellite.png")
[27]:
../../../../_images/Content_Project_2021_paddy-area-classification_6._kmeans-clustering_39_0.png