Classify iris dataset with random forest classifier

Written by: Men Vuthy, 2022


Objective

  • Classify the iris dataset using a supervised learning method with a random forest classifier

Code

Import necessary modules

[1]:
from sklearn.datasets import load_iris

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import pandas as pd
import numpy as np
# Seed NumPy's global random generator so that any draws made through
# np.random in this notebook are reproducible across runs.
np.random.seed(0)

import matplotlib.pyplot as plt
import seaborn as sns
[2]:
# Load the iris dataset; the returned object exposes the attributes used below
# (data, feature_names, target, target_names).
iris = load_iris()
[3]:
# Build a dataframe holding the feature measurements, one column per feature
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head(n=5)
[3]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
[4]:
# Attach the species name of every sample as a categorical column
species_labels = pd.Categorical.from_codes(codes=iris.target, categories=iris.target_names)
df['species'] = species_labels

df.head(n=5)
[4]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
[5]:
# Create Test and Train Data
# Draw the split from a dedicated, locally seeded generator instead of the
# global np.random state: the global state advances on every call, so simply
# re-running this cell would silently produce a different split. A fresh
# RandomState(0) yields the same stream as the global generator does right
# after np.random.seed(0), so the split matches a fresh top-to-bottom run.
TRAIN_FRACTION = 0.75  # expected share of rows flagged as training data
split_rng = np.random.RandomState(0)
df['is_train'] = split_rng.uniform(0, 1, len(df)) <= TRAIN_FRACTION

df.head()
[5]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) species is_train
0 5.1 3.5 1.4 0.2 setosa True
1 4.9 3.0 1.4 0.2 setosa True
2 4.7 3.2 1.3 0.2 setosa True
3 4.6 3.1 1.5 0.2 setosa True
4 5.0 3.6 1.4 0.2 setosa True
[6]:
# Split the rows into training and test dataframes using the boolean flag
train_mask = df['is_train']
train = df[train_mask]
test = df[~train_mask]

# Report how many observations landed in each subset
print('Training data:', len(train))
print('Testing data:', len(test))
Training data: 118
Testing data: 32
[7]:
# Collect the names of the feature columns (the leading columns of df)
n_features = len(iris.feature_names)
features = df.columns[:n_features]

features
[7]:
Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')
[8]:
# Converting each species name into digits
# Use the categorical codes rather than pd.factorize: factorize numbers the
# classes in order of first appearance in `train`, whereas the predictions are
# later mapped back to names by indexing iris.target_names. The codes of the
# 'species' categorical follow the category order it was built with, so they
# stay aligned with iris.target_names regardless of row order.
# np.intp matches the platform integer dtype of factorize's codes.
y = train['species'].cat.codes.to_numpy(dtype=np.intp)

y
[8]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2])

Train Classifier

[9]:
# Set up the random forest (n_jobs=-1 for parallel work, fixed random_state)
clf = RandomForestClassifier(random_state=0, n_jobs=-1)

# Fit the forest on the training features and the integer class labels
clf.fit(X=train[features], y=y)
[9]:
RandomForestClassifier(n_jobs=-1, random_state=0)

Test Classifier

[10]:
# Predict the class code of every observation in the test set
test_features = test[features]
clf.predict(test_features)
[10]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

View Probability

[11]:
# Viewing the predicted class probabilities for test observations 10 through 19
# (the slice [10:20] selects ten rows, but not the first ten)
clf.predict_proba(test[features])[10:20]
[11]:
array([[1.  , 0.  , 0.  ],
       [0.99, 0.01, 0.  ],
       [1.  , 0.  , 0.  ],
       [0.  , 0.67, 0.33],
       [0.  , 1.  , 0.  ],
       [0.  , 0.82, 0.18],
       [0.  , 0.03, 0.97],
       [0.  , 0.42, 0.58],
       [0.  , 0.99, 0.01],
       [0.  , 0.96, 0.04]])
[12]:
# Translate the predicted class codes into the corresponding species names
predicted_codes = clf.predict(test[features])
preds = iris.target_names[predicted_codes]

preds[:5]
[12]:
array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='<U10')
[13]:
# Viewing the actual species of the first five test observations
test['species'].head()
[13]:
7     setosa
8     setosa
10    setosa
13    setosa
17    setosa
Name: species, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']
[14]:
# Cross-tabulate actual against predicted species (confusion matrix as counts)
pd.crosstab(
    index=test['species'],
    columns=preds,
    rownames=['Actual Species'],
    colnames=['Predicted Species'],
)
[14]:
Predicted Species setosa versicolor virginica
Actual Species
setosa 13 0 0
versicolor 0 5 2
virginica 0 0 12
[15]:
# Fraction of test observations whose species was predicted correctly
accuracy_score(y_true=test['species'], y_pred=preds)
[15]:
0.9375
[16]:
# Get the confusion matrix with every row normalised to sum to 1, so entry
# (i, j) is the share of true class i that was predicted as class j.
# normalize='true' lets scikit-learn do the row normalisation, avoiding the
# manual division that produces NaN when a class has no test samples.
# labels=... pins the row/column order to iris.target_names instead of relying
# on the implicit sorted order of the labels that happen to be present.
matrix = confusion_matrix(test['species'], preds,
                          labels=iris.target_names, normalize='true')
matrix
[16]:
array([[1.        , 0.        , 0.        ],
       [0.        , 0.71428571, 0.28571429],
       [0.        , 0.        , 1.        ]])
[17]:
# Build the plot
plt.figure(figsize=(10,5))
sns.set(font_scale=1.4)

# Hand the class names to seaborn so the tick labels sit at the centre of each
# cell on both axes; manually placed x ticks at 0, 1, 2 land on the cell edges
# and leave the column labels misaligned with their columns.
class_names = list(iris.target_names)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2,
            xticklabels=class_names, yticklabels=class_names)

# Add labels to the plot (only the rotation of the existing ticks is changed)
plt.xticks(rotation=25)
plt.yticks(rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Random Forest Model')
plt.show()


../../../_images/Content_Documentation_geo-python_Random-forest-on-iris_25_0.png