It is a well-known fact, and something we have already mentioned, that 1-layer neural networks cannot model the XOR function. 1-layer neural nets can only classify linearly separable sets; however, as we have seen, the Universal Approximation Theorem states that a 2-layer network can approximate any function, given a complex enough architecture.

We will now create a neural network with two neurons in the hidden layer and we will show how this can model the XOR function. However, we will write code that will allow the reader to simply modify it to allow for any number of layers and neurons in each layer, so that the reader can try simulating different scenarios. We are also going to use the hyperbolic tangent as the activity function for this network. To train the network, we will implement the back-propagation algorithm discussed earlier.

In addition, if you are interested in the mathematical derivation of this implementation, please see my other post.

Define the Neural Network Class

We will need to import some libraries first.

import numpy
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

# The following code hides warnings to make this notebook clearer.
import warnings
warnings.filterwarnings('ignore')

Next we define our activity function and its derivative (we use tanh(x) in this example):

def tanh(x):
    """Hyperbolic tangent activation function.

    Delegates to numpy.tanh, which is numerically stable for large |x|.
    The original closed form (1 - e^{-2x}) / (1 + e^{-2x}) overflows in
    exp(-2x) for large negative x and returns nan instead of -1.
    """
    return numpy.tanh(x)

def tanh_derivative(x):
    """Derivative of tanh expressed through its output value.

    If x = tanh(z), then d/dz tanh(z) = 1 - tanh(z)^2 = (1 + x)(1 - x).
    """
    one_plus = 1 + x
    one_minus = 1 - x
    return one_plus * one_minus

Next we define the NeuralNetwork class:

class NeuralNetwork:
    #########
    # A fully-connected feed-forward neural network trained with stochastic
    # (one-random-sample-per-epoch) back-propagation.
    #
    # parameters
    # ----------
    # self:      the class object itself
    # net_arch:  consists of a list of integers, indicating
    #            the number of neurons in each layer, i.e. the network architecture
    #########
    def __init__(self, net_arch):
        # Fixed seed so repeated runs are reproducible.
        numpy.random.seed(0)

        # Use numpy.tanh as the activity function: it is numerically stable
        # for large |x| (the closed form (1-e^{-2x})/(1+e^{-2x}) overflows).
        # Its derivative, written in terms of the activation y = tanh(x),
        # is 1 - y**2, which also removes the dependence on module-level
        # helper functions.
        self.activity = numpy.tanh
        self.activity_derivative = lambda y: 1.0 - y * y
        self.layers = len(net_arch)
        self.steps_per_epoch = 1
        self.arch = net_arch
        self.weights = []

        # Random initialization with range of weight values (-1,1).
        # Each weight matrix gets one extra input row so the bias unit
        # (the always-on "1" prepended to each layer's output) is weighted too.
        for layer in range(self.layers - 1):
            w = 2*numpy.random.rand(net_arch[layer] + 1, net_arch[layer+1]) - 1
            self.weights.append(w)

    def _forward_prop(self, x):
        # x is a one-element list holding the bias-augmented input vector;
        # y accumulates the bias-augmented activations of every layer.
        y = x

        for i in range(len(self.weights)-1):
            activation = numpy.dot(y[i], self.weights[i])
            activity = self.activity(activation)

            # add the bias for the next layer
            activity = numpy.concatenate((numpy.ones(1), numpy.array(activity)))
            y.append(activity)

        # last layer: no bias unit is appended to the network output
        activation = numpy.dot(y[-1], self.weights[-1])
        activity = self.activity(activation)
        y.append(activity)

        return y

    def _back_prop(self, y, target, learning_rate):
        # Delta of the output layer. Note delta_vec holds the *negative*
        # gradient, which is why the update below uses "+=".
        error = target - y[-1]
        delta_vec = [error * self.activity_derivative(y[-1])]

        # we need to begin from the back, from the next to last layer;
        # weights[i][1:] drops the bias row (no delta flows back to the bias).
        for i in range(self.layers-2, 0, -1):
            error = delta_vec[-1].dot(self.weights[i][1:].T)
            error = error*self.activity_derivative(y[i][1:])
            delta_vec.append(error)

        # Now we need to set the values from back to front
        delta_vec.reverse()

        # Finally, we adjust the weights, using the backpropagation rules
        for i in range(len(self.weights)):
            layer = y[i].reshape(1, self.arch[i]+1)
            delta = delta_vec[i].reshape(1, self.arch[i+1])
            self.weights[i] += learning_rate*layer.T.dot(delta)

    #########
    # Train the network with plain stochastic gradient descent: each epoch
    # picks one random training sample and runs one forward/backward pass.
    #
    # parameters
    # ----------
    # self:    the class object itself
    # data:    the set of all possible pairs of booleans True or False indicated by the integers 1 or 0
    # labels:  the result of the logical operation 'xor' on each of those input pairs
    # learning_rate: step size of the gradient-descent weight update
    # epochs:  number of single-sample training iterations
    #########
    def fit(self, data, labels, learning_rate=0.1, epochs=100):

        # Add bias units to the input layer -
        # add a "1" to the input data (the always-on bias neuron)
        ones = numpy.ones((1, data.shape[0]))
        Z = numpy.concatenate((ones.T, data), axis=1)

        for k in range(epochs):
            if (k+1) % 10000 == 0:
                print('epochs: {}'.format(k+1))

            # BUG FIX: sample from the 'data' argument, not the global 'X'.
            # The original read the module-level variable X, so fit() broke
            # (or silently trained on the wrong array) whenever the caller's
            # training set was named anything else.
            sample = numpy.random.randint(data.shape[0])

            # We will now go ahead and set up our feed-forward propagation:
            x = [Z[sample]]
            y = self._forward_prop(x)

            # Now we do our back-propagation of the error to adjust the weights:
            target = labels[sample]
            self._back_prop(y, target, learning_rate)

    #########
    # the predict function is used to check the prediction result of
    # this neural network.
    #
    # parameters
    # ----------
    # self:   the class object itself
    # x:      single input data (without the bias unit; it is prepended here)
    #########
    def predict_single_data(self, x):
        val = numpy.concatenate((numpy.ones(1).T, numpy.array(x)))
        for i in range(0, len(self.weights)):
            val = self.activity(numpy.dot(val, self.weights[i]))
            val = numpy.concatenate((numpy.ones(1).T, numpy.array(val)))
        # val[0] is the bias unit, so val[1] is the first output neuron.
        # NOTE(review): this returns only one output value — it assumes a
        # single-neuron output layer (arch[-1] == 1).
        return val[1]

    #########
    # the predict function is used to check the prediction result of
    # this neural network.
    #
    # parameters
    # ----------
    # self:   the class object itself
    # X:      the input data array (one sample per row)
    #########
    def predict(self, X):
        # Stack the per-sample predictions into an (n_samples, n_outputs)
        # array; like predict_single_data, this assumes arch[-1] == 1.
        Y = numpy.array([]).reshape(0, self.arch[-1])
        for x in X:
            y = numpy.array([[self.predict_single_data(x)]])
            Y = numpy.vstack((Y,y))
        return Y

Check the Performance of This Neural Network

Now we can check if this Neural Network can actually learn the XOR rule, which is

  y=0 y=1
x=0 0 1
x=1 1 0
# Seed the global RNG for reproducibility (note: NeuralNetwork.__init__
# also calls numpy.random.seed(0), so construction below re-seeds anyway).
numpy.random.seed(0)

# Initialize the NeuralNetwork with
# 2 input neurons
# 2 hidden neurons
# 1 output neuron
nn = NeuralNetwork([2,2,1])

# Set the input data: all four boolean input pairs
X = numpy.array([[0, 0], [0, 1],
                [1, 0], [1, 1]])

# Set the labels, the correct results for the xor operation
y = numpy.array([0, 1, 
                 1, 0])

# Call the fit function and train the network for a chosen number of epochs
nn.fit(X, y, epochs=100000)

# Show the prediction results: each should be close to the XOR label above
print("Final prediction")
for s in X:
    print(s, nn.predict_single_data(s))

Final prediction
[0 0] 0.0030321736925
[0 1] 0.996386076136
[1 0] 0.995903456394
[1 1] 0.000638644921757

The reader can slightly modify the code we created in the plot_decision_regions function defined in the appendix of this article and see how different neural networks separate different regions depending on the architecture chosen.

numpy.random.seed(0)
nn = NeuralNetwork([2,2,1])
nn.fit(X, y, epochs=10)
plot_decision_regions(X, y, nn)
plt.xlabel('x-axis')
plt.ylabel('y-axis')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

Different neural network architectures (for example, implementing a network with a different number of neurons in the hidden layer, or with more than just one hidden layer) may produce a different separating region.

For example, ([2,4,3,1]) will represent a 3-layer neural network, with four neurons in the first hidden layer and three neurons in the second hidden layer, and choosing it will give the following figure:

numpy.random.seed(0)
nn = NeuralNetwork([2,4,3,1])
nn.fit(X, y, epochs=10)
plot_decision_regions(X, y, nn)
plt.xlabel('x-axis')
plt.ylabel('y-axis')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

While choosing nn = NeuralNetwork([2,4,1]), for example, would produce the following:

numpy.random.seed(0)
nn = NeuralNetwork([2,4,1])
nn.fit(X, y, epochs=10)
plot_decision_regions(X, y, nn)
plt.xlabel('x-axis')
plt.ylabel('y-axis')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

Code Review

Let’s review the code in detail.

A. sigmoid function

In this implementation, the sigmoid function can also be used for activation. According to Wikipedia, a sigmoid function is a mathematical function having a characteristic “S”-shaped curve or sigmoid curve.

Often, sigmoid function refers to the special case of the logistic function shown in the figure above and defined by the formula

\[g(x) = \frac{1}{1+e^{-x}} = \frac{e^x}{e^x+1}\]

which can be written in python code with numpy library as follows

def sigmoid(x):
    """Logistic sigmoid activation: g(x) = 1 / (1 + e^{-x})."""
    exp_neg = numpy.exp(-x)
    return 1 / (1 + exp_neg)

Then, to take the derivative in the process of back propagation, we need to do differentiation of logistic function.

Suppose the output of a neuron (after activation) is $y = g(x) = (1+e^{-x})^{-1}$ where $x$ is the net input to this neuron, then the differentiation of logistic function is

\(g'(x) =-(1+\exp(-x))^{-2}\exp(-x)(-1)\)\(=g(x)\frac{\exp(-x)}{1+\exp(-x)} =g(x)\frac{1+\exp(-x)-1}{1+\exp(-x)}\)\(=g(x)(1-g(x))\)

So when we take the partial derivative $\partial y / \partial x=y(1-y)$, we can use the following python function

def sigmoid_derivative(y):
    """Derivative of the logistic function written via its output.

    If y = g(x) = 1/(1+e^{-x}), then g'(x) = y * (1 - y).
    """
    slope = 1 - y
    return y * slope

B. neural network

We devised a class named NeuralNetwork that is capable of learning the XOR function. The NeuralNetwork consists of the following 3 parts:

  1. initialization
  2. fit
  3. predict
class NeuralNetwork:

    #########
    # parameters
    # ----------
    # self:      the class object itself
    # net_arch:  consists of a list of integers, indicating
    #            the number of neurons in each layer, 
    #            i.e. [2,2,1] (two neurons for the input layer, 
    #                          two neurons for the first and the only hidden layer, 
    #                          and one neuron for the output layer)
    #########
    def __init__(self, net_arch):
        # Initialized the weights, making sure we also initialize the weights
        # for the biases that we will add later.
        # Afterwards, we do random initialization with range of weight values (-1,1)

    def _forward_prop(self, x):
        # feed-forward
    
    def _back_prop(self, y, target, learning_rate):
        # adjust the weights using the backpropagation rules

    #########
    # parameters
    # ----------
    # self:    the class object itself
    # data:    the set of all possible pairs of booleans True or False indicated by 
    #          the integers 1 or 0
    # labels:  the result of the logical operation 'xor' on each of those input pairs
    #########
    def fit(self, X, labels, learning_rate=0.1, epochs=100):
        # Add bias units to the input layer
        for k in range(epochs):
            # Set up our feed-forward propagation
            # And then do our back-propagation of the error to adjust the weights

    #########
    # parameters
    # ----------
    # self:   the class object itself
    # X:      the input data array
    #########
    def predict(self, X):
        # Do prediction with the given data X and the pre-trained weights

In the initialization part, we create a list of arrays for the weights. That is, given $k$ layers (the $1^{st}$ layer is the input layer and the $k^{th}$ layer is the output layer) and $n_j$ units in the $j^{th}$ layer, we have

\[self.weights = [\Theta^{(1)}~\Theta^{(2)}~...~\Theta^{(k-1)}]\] \[where~\Theta^{(j)}=\begin{bmatrix} \Theta_{1,1}^{(j)} & \Theta_{1,2}^{(j)} & \Theta_{1,3}^{(j)} & \dots & \Theta_{1,n_{j+1}}^{(j)} \\ \Theta_{2,1}^{(j)} & \Theta_{2,2}^{(j)} & \Theta_{2,3}^{(j)} & \dots & \Theta_{2,n_{j+1}}^{(j)} \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ \Theta_{(n_j+1),1}^{(j)} & \Theta_{(n_j+1),2}^{(j)} & \Theta_{(n_j+1),3}^{(j)} & \dots & \Theta_{(n_j+1),n_{j+1}}^{(j)} \end{bmatrix}\]
# Random initialization with range of weight values (-1,1)
self.weights = []
for layer in range(self.layers - 1):
    w = 2*numpy.random.rand(net_arch[layer] + 1, net_arch[layer+1]) - 1
    self.weights.append(w)

Note that a bias unit is added to each hidden layer and a “1” will be added to the input layer. That’s why the dimension of weight matrix is $(n_j+1) \times n_{j+1}$ instead of $n_j \times n_{j+1}$.

The fit part will train our network. For each epoch, we sample a training data and then do forward propagation and back propagation with this input.

for k in range(epochs):
    sample = numpy.random.randint(X.shape[0])

    # We will now go ahead and set up our feed-forward propagation:
    x = [Z[sample]]
    y = self._forward_prop(x)

    # Now we do our back-propagation of the error to adjust the weights:
    target = labels[sample]
    self._back_prop(y, target, learning_rate)

Forward propagation propagates the sampled input data forward through the network to generate the output value.

According to the generated output value, back propagation calculates the cost (error term) and do the propagation of the output activations back through the network using the training pattern target in order to generate the deltas (the difference between the targeted and actual output values) of all output and hidden neurons. With these deltas, we can get the gradients of the weights and use these gradients to update the original weights.

C. backpropagation

Use the neural network shown in Figure 1 as an example, the final output of the model would be

\[h_{\Theta}(x) = a_1^{(3)} = g(z_1^{(3)}) = g(\Theta_{0,1}^{(2)}a_0^{(2)}+\Theta_{1,1}^{(2)}a_1^{(2)}+\Theta_{2,1}^{(2)}a_2^{(2)})\]
  • $\Theta^{(j)}$ is the matrix of weights mapping from layer $j$ to layer $(j+1)$
  • $a_i^{(j)}$ is the activation of unit $i$ in layer $j$
  • $z_i^{(j)}$ is the net input to the unit $i$ in layer $j$
  • $g$ is sigmoid function that refers to the special case of the logistic function
  • $x$ is the input vector $[x_0~x_1~x_2]^T$.

To update the weights with gradient descent method, we need to calculate the gradients. Ultimately, this means computing the partial derivatives $\partial err / \partial a_1^{(3)}$ given the error term $E_{total}$ defined as $E_{total} = (1/2)(y - a_1^{(3)})^2$, which is the loss between the actual label $y$ and the prediction $a_1^{(3)}$.

Note that with chain rule, the partial derivative of $E_{total}$ with respect to $\Theta_{2,1}^{(2)}$ is only related to the error term and the output values $a_2^{(2)}$ and $a_1^{(3)}$.

\[\frac{\partial~E_{total}}{\partial~\Theta_{2,1}^{(2)}} = \frac{\partial~E_{total}}{\partial~a_1^{(3)}} \times \frac{\partial~a_1^{(3)}}{\partial~\Theta_{2,1}^{(2)}} = \frac{\partial~E_{total}}{\partial~a_1^{(3)}} \times \Bigg(\frac{\partial~g(z_1^{(3)})}{\partial~z_1^{(3)}} \times \frac{\partial~z_1^{(3)}}{\partial~\Theta_{2,1}^{(2)}}\Bigg)\] \[= \Bigg(-(y - a_1^{(3)})\Bigg) \times \Bigg(\Big(a_1^{(3)} * (1-a_1^{(3)})\Big) \times a_2^{(2)}\Bigg)\]

Furthermore, the partial derivative of $E_{total}$ with respect to $\Theta_{2,1}^{(1)}$ can be calculated with the same regards as follows.

\[\frac{\partial~E_{total}}{\partial~\Theta_{2,1}^{(1)}} = \frac{\partial~E_{total}}{\partial~a_2^{(2)}} \times \frac{\partial~a_2^{(2)}}{\partial~\Theta_{2,1}^{(1)}} = \frac{\partial~E_{total}}{\partial~a_2^{(2)}} \times \Bigg(\frac{\partial~g(z_2^{(2)})}{\partial~z_2^{(2)}} \times \frac{\partial~z_2^{(2)}}{\partial~\Theta_{2,1}^{(1)}}\Bigg)\] \[where~\frac{\partial~E_{total}}{\partial~a_i^{(j)}}=\sum_{k}\frac{\partial~E_{a_{k}^{(j+1)}}}{\partial~a_i^{(j)}}=\sum_{k} \frac{\partial~E_{a_{k}^{(j+1)}}}{\partial~z_k^{(j+1)}} \times \Theta_{k,i}^{(j)}\]

In conclusion, the back propagation process can be divided into 2 steps:

Step 1. Generate the deltas (the difference between the targeted and actual output values) of all output and hidden neurons

First, we need to calculate the partial derivative of the total error with respect to the net input values of the neuron(s) in the output layer.

Recall that we have calculated the partial derivative of the total error $E_{total}$ with respect to $z_1^{(3)}$, which is the net input to the neuron in the output layer in the case we discuss above.

\[E_{z_1^{(3)}} = \frac{\partial~E_{total}}{\partial~z_1^{(3)}} = \frac{\partial~E_{total}}{\partial~a_1^{(3)}} \times \frac{\partial~g(z_1^{(3)})}{\partial~z_1^{(3)}} = \Bigg(-(target - a_1^{(3)})\Bigg) \times \Bigg(a_1^{(3)} * (1-a_1^{(3)})\Bigg)\]

This can be written in a general form as

delta_vec = [(target - y[-1]) * self.activity_derivative(y[-1])]

where $y[j] = [a_{0}^{(j)}~a_{1}^{(j)}~…]$ is a vector representing the output values of layer $j$ and the delta we compute here is actually the negative gradient.

Afterwards, we calculate the deltas for neurons in the remaining layers.

For the remaining layers, given $\Theta_{pq}^{(j)}$ as the weight maps from the $p^{th}$ unit of layer $j$ to the $q^{th}$ unit of layer $(j+1)$, we have

\[E_{z_i^{(j)}} = \frac{\partial~E_{total}}{\partial~z_{i}^{(j)}} = \frac{\partial~E_{total}}{\partial~a_{i}^{(j)}} \times \frac{\partial~a_{i}^{(j)}}{\partial~z_{i}^{(j)}} = \Bigg(\sum_{k}\frac{\partial~E_{a_{k}^{(j+1)}}}{\partial~a_i^{(j)}} \Bigg) \times \frac{\partial~a_{i}^{(j)}}{\partial~z_{i}^{(j)}}\] \[=\Bigg(\sum_{k} \frac{\partial~E_{a_{k}^{(j+1)}}}{\partial~z_k^{(j+1)}} \times \frac{\partial~z_k^{(j+1)}}{\partial~a_{i}^{(j)}} \Bigg) \times \frac{\partial~a_{i}^{(j)}}{\partial~z_{i}^{(j)}} =\Bigg(\sum_{k} E_{z_k^{(j+1)}} \times \Theta_{i,k}^{(j)} \Bigg) \times \frac{\partial~a_{i}^{(j)}}{\partial~z_{i}^{(j)}}\]

As a result, when we consider the matrix representation of weights,

\[self.weights[j]=\Theta^{(j)}=\begin{bmatrix} \Theta_{1,1}^{(j)} & \Theta_{1,2}^{(j)} & \Theta_{1,3}^{(j)} & \dots & \Theta_{1,n_{j+1}}^{(j)} \\ \Theta_{2,1}^{(j)} & \Theta_{2,2}^{(j)} & \Theta_{2,3}^{(j)} & \dots & \Theta_{2,n_{j+1}}^{(j)} \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ \Theta_{(n_j+1),1}^{(j)} & \Theta_{(n_j+1),2}^{(j)} & \Theta_{(n_j+1),3}^{(j)} & \dots & \Theta_{(n_j+1),n_{j+1}}^{(j)} \end{bmatrix}\]

we can calculate the gradient of weights layer-by-layer from the last hidden layer to the input layer with the code below.

# we need to begin from the back, from the next to last layer
for j in range(self.layers-2, 0, -1):
    error = delta_vec[-1].dot(self.weights[j][1:].T)   # Line 3
    error = error*self.activity_derivative(y[j][1:])   # Line 4
    delta_vec.append(error)

# Now we need to set the values from back to front
delta_vec.reverse()

Note that for a certain layer $j$, the inner product generated by Line 3 of the code above represents

\[\bigg[\frac{\partial~E_{total}}{\partial~z_{2}^{(j)}}~\frac{\partial~E_{total}}{\partial~z_{3}^{(j)}}~\frac{\partial~E_{total}}{\partial~z_{4}^{(j)}}~...\frac{\partial~E_{total}}{\partial~z_{(n_j+1)}^{(j)}}\bigg]\] \[=[E_{z_2^{(j+1)}}~E_{z_3^{(j+1)}}~E_{z_4^{(j+1)}}~...~E_{z_{(n_{j+1}+1)}^{(j+1)}}] \cdot \begin{bmatrix} \Theta_{2,1}^{(j)} & \Theta_{2,2}^{(j)} & \Theta_{2,3}^{(j)} & \dots & \Theta_{2,n_{j+1}}^{(j)} \\ \Theta_{3,1}^{(j)} & \Theta_{3,2}^{(j)} & \Theta_{3,3}^{(j)} & \dots & \Theta_{3,n_{j+1}}^{(j)} \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ \Theta_{(n_j+1),1}^{(j)} & \Theta_{(n_j+1),2}^{(j)} & \Theta_{(n_j+1),3}^{(j)} & \dots & \Theta_{(n_j+1),n_{j+1}}^{(j)} \end{bmatrix}\]

And in Line 4 we generate delta_vec[j] with

\[delta\_vec[j] =\bigg[E_{z_2^{(j)}}~E_{z_3^{(j)}}~E_{z_4^{(j)}}~...~E_{z_{n_j+1}^{(j)}}\bigg]\] \[= \bigg[\frac{\partial~E_{total}}{\partial~z_{2}^{(j)}}~\frac{\partial~E_{total}}{\partial~z_{3}^{(j)}}~\frac{\partial~E_{total}}{\partial~z_{4}^{(j)}}~...\frac{\partial~E_{total}}{\partial~z_{(n_j+1)}^{(j)}}\bigg] * \bigg[\frac{\partial~a_{2}^{(j)}}{\partial~z_{2}^{(j)}}~\frac{\partial~a_{3}^{(j)}}{\partial~z_{3}^{(j)}}~\frac{\partial~a_{4}^{(j)}}{\partial~z_{4}^{(j)}}~...\frac{\partial~a_{n_j+1}^{(j)}}{\partial~z_{n_j+1}^{(j)}}\bigg]\]

Step 2. Adjust the weights using gradient descent

Given $\Theta_{pq}^{(j)}$ as the weight maps from the $p^{th}$ unit of layer $j$ to the $q^{th}$ unit of layer $(j+1)$, the gradient $g$ of weight $\Theta_{pq}^{(j)}$ can be written as

\[g_{\Theta_{pq}^{(j)}} = \frac{\partial~E_{total}}{\partial~\Theta_{pq}^{(j)}} = \frac{\partial~E_{total}}{\partial~z_{q}^{(j+1)}} \times \frac{\partial~z_{q}^{(j+1)}}{\partial~\Theta_{pq}^{(j)}} = E_{z_q^{(j+1)}} \times a_{p}^{(j)}\]

with the fact that $E_{z_q^{(j+1)}}$ for all units have been calculated in the previous step. Next, the weights would be updated according to the following rule

\[w \leftarrow w - \eta * \frac{\partial~E_{total}}{\partial~w}\]

($w$: weight; $\eta$: learning rate)

# Finally, we adjust the weights, using the backpropagation rules
for j in range(len(self.weights)):
    layer = y[j].reshape(1, self.arch[j]+1)
    delta = delta_vec[j].reshape(1, self.arch[j+1])
    self.weights[j] += learning_rate*layer.T.dot(delta)

For a certain layer $j$, the layer.T.dot(delta) representation in the last line of the code above can be illustrated as

\[\begin{bmatrix} a_1^{(j)} \\ a_2^{(j)} \\ a_3^{(j)} \\ \vdots \\ a_{(n_j+1)}^{(j)} \end{bmatrix} \cdot \begin{bmatrix} E_{z_2^{(j+1)}} & E_{z_3^{(j+1)}} & E_{z_4^{(j+1)}} & \dots & E_{z_{(n_{j+1}+1)}^{(j+1)}} \end{bmatrix}\]

Appendix

The self-defined plot functions are written here.

import numpy as np
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw the classifier's decision surface over the 2-D input space and
    scatter the labelled samples on top (optionally highlighting a test subset).
    """
    # one marker / colour per class label
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    class_values = np.unique(y)
    cmap = ListedColormap(colors[:len(class_values)])

    # evaluate the classifier on a dense grid covering the data (+/- 1 margin)
    x1_lo, x1_hi = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_lo, x2_hi = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_lo, x1_hi, resolution),
                           np.arange(x2_lo, x2_hi, resolution))
    grid = np.array([xx1.ravel(), xx2.ravel()]).T
    Z = classifier.predict(grid).reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # scatter the training samples, one class at a time
    for idx, cl in enumerate(class_values):
        mask = (y == cl)
        plt.scatter(x=X[mask, 0], y=X[mask, 1],
                    alpha=0.8, c=cmap(idx),
                    marker=markers[idx], label=cl)

    # highlight the test samples, if any indices were given
    if test_idx:
        X_test, y_test = X[test_idx, :], y[test_idx]

        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    c='',
                    alpha=1.0,
                    linewidths=1,
                    marker='o',
                    s=55, label='test set')

References