Classify the iris dataset with a random forest classifier

Written by: Men Vuthy, 2022

Objective¶

Classify the iris dataset using a supervised learning method: the random forest classifier.

Code¶

Import necessary modules

[1]:

from sklearn.datasets import load_iris

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import pandas as pd
import numpy as np
np.random.seed(0)

import matplotlib.pyplot as plt
import seaborn as sns

[2]:

# Load the iris dataset. The cells below read the feature matrix (iris.data),
# the integer class codes (iris.target) and the matching iris.feature_names /
# iris.target_names from this object.
iris = load_iris()

[3]:

# Put the four measurement columns into a DataFrame, labelled with the
# feature names that ship with the dataset
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head()

[3]:

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

[4]:

# Attach the species as a categorical column: iris.target supplies the integer
# codes and iris.target_names the category labels they index into
df['species'] = pd.Categorical.from_codes(codes=iris.target,
                                          categories=iris.target_names)

df.head()

[4]:

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa

[5]:

# Create Test and Train Data
# Flag roughly 75% of the rows as training data. A dedicated, locally seeded
# generator makes this cell idempotent: re-running it always reproduces the
# same split instead of depending on how far the global NumPy stream (seeded
# once in the import cell) has advanced. RandomState(0) yields the same draws
# as the first np.random.uniform call after np.random.seed(0), so the split
# itself is unchanged.
TRAIN_FRACTION = 0.75
split_rng = np.random.RandomState(0)
df['is_train'] = split_rng.uniform(0, 1, len(df)) <= TRAIN_FRACTION

df.head()

[5]:

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	species	is_train
0	5.1	3.5	1.4	0.2	setosa	True
1	4.9	3.0	1.4	0.2	setosa	True
2	4.7	3.2	1.3	0.2	setosa	True
3	4.6	3.1	1.5	0.2	setosa	True
4	5.0	3.6	1.4	0.2	setosa	True

[6]:

# Split the rows into training and test frames using the boolean flag column
in_training_set = df['is_train']
train = df[in_training_set]
test = df[~in_training_set]

# Report how many observations landed in each subset
print('Training data:', len(train))
print('Testing data:', len(test))

Training data: 118
Testing data: 32

[7]:

# Names of the feature columns, taken from the dataset metadata rather than a
# positional slice (df.columns[:4]), so the selection does not silently change
# if columns are ever added to or reordered in df.
features = pd.Index(iris.feature_names)

features

[7]:

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

[8]:

# Encode each species as its integer category code.
# pd.factorize numbers labels by order of first appearance in the training
# rows, which only coincides with the positions in iris.target_names while the
# rows stay sorted by species and every species is present in the training
# split. The categorical codes are tied to the category order itself (set from
# iris.target_names when the column was created), so the later lookup
# iris.target_names[prediction] stays valid for any split.
# dtype=np.intp keeps the same integer dtype that pd.factorize returned.
y = train['species'].cat.codes.to_numpy(dtype=np.intp)

y

[8]:

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2])

Train Classifier

[9]:

# Instantiate the random forest with a fixed random_state and n_jobs=-1
clf = RandomForestClassifier(random_state=0, n_jobs=-1)

# Fit on the training feature columns and the integer species codes
clf.fit(X=train[features], y=y)

[9]:

RandomForestClassifier(n_jobs=-1, random_state=0)

Test Classifier

[10]:

# Apply the trained classifier to the feature columns of the held-out test
# rows; the result is one integer class code per test observation
clf.predict(test[features])

[10]:

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

View Probability

[11]:

# View the predicted class probabilities for test observations 10 through 19
# (the slice [10:20] skips the first ten rows; it is not the first ten)
clf.predict_proba(test[features])[10:20]

[11]:

array([[1.  , 0.  , 0.  ],
       [0.99, 0.01, 0.  ],
       [1.  , 0.  , 0.  ],
       [0.  , 0.67, 0.33],
       [0.  , 1.  , 0.  ],
       [0.  , 0.82, 0.18],
       [0.  , 0.03, 0.97],
       [0.  , 0.42, 0.58],
       [0.  , 0.99, 0.01],
       [0.  , 0.96, 0.04]])

[12]:

# Map each predicted class code to the corresponding plant species name
predicted_codes = clf.predict(test[features])
preds = iris.target_names[predicted_codes]

preds[:5]

[12]:

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='<U10')

[13]:

# View the actual species for the first five test observations
test['species'].head()

[13]:

7     setosa
8     setosa
10    setosa
13    setosa
17    setosa
Name: species, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

[14]:

# Cross-tabulate actual species (rows) against predicted species (columns)
pd.crosstab(
    index=test['species'],
    columns=preds,
    rownames=['Actual Species'],
    colnames=['Predicted Species'],
)

[14]:

Predicted Species	setosa	versicolor	virginica
Actual Species
setosa	13	0	0
versicolor	0	5	2
virginica	0	0	12

[15]:

# Accuracy of the predicted species names against the actual test labels
accuracy_score(y_true=test['species'], y_pred=preds)

[15]:

0.9375

[16]:

# Row-normalised confusion matrix: each row (true species) is divided by its
# total, so entries are the fraction of that species assigned to each class.
# Letting scikit-learn normalise (normalize='true') replaces the manual
# division, which yields NaN rows and a runtime warning whenever a species has
# no samples in the test split. Passing labels pins the row/column order to
# iris.target_names instead of relying on the sorted order of the labels seen.
matrix = confusion_matrix(test['species'], preds,
                          labels=iris.target_names, normalize='true')
matrix

[16]:

array([[1.        , 0.        , 0.        ],
       [0.        , 0.71428571, 0.28571429],
       [0.        , 0.        , 1.        ]])

[17]:

# Build the plot
plt.figure(figsize=(10,5))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2)

# Add labels to the plot.
# The heatmap draws cell i between coordinates i and i + 1, so tick labels
# must sit at i + 0.5 on BOTH axes to be centred on their cells. The x ticks
# were previously placed at 0, 1, 2 (the cell edges) while only the y ticks
# used the +0.5 offset, leaving the x labels shifted half a cell to the left.
class_names = ['setosa', 'versicolor', 'virginica']
cell_centers = np.arange(len(class_names)) + 0.5
plt.xticks(cell_centers, class_names, rotation=25)
plt.yticks(cell_centers, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Random Forest Model')
plt.show()

../../../_images/Content_Documentation_geo-python_Random-forest-on-iris_25_0.png

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2