Homework 3: Optimization in Neural Networks¶

In this assignment, you will design and implement a deep neural network (DNN) to solve a binary classification problem. This task involves building a flexible DNN architecture where both the width (number of neurons per layer) and depth (number of layers) can be customized. Additionally, you will implement and test various advanced optimization algorithms to understand how they impact the training process and model performance, such as

  • Momentum,
  • RMSProp,
  • Adam.

0 - Packages¶

Let's first import the necessary libraries:

  • numpy is the fundamental package for scientific computing with Python.
  • matplotlib is a library for plotting graphs in Python.
In [ ]:
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(1) # set a seed so that the results are consistent

1 - Build up Deep Neural Network¶

1.1 - Initialization¶

We will initialize the parameters of a DNN using a given layer_dims array, which specifies the size of each layer, including the input size n_x and output size n_y. To break symmetry, we use random initialization for the weights and zero initialization for the biases.

Given a weight matrix $W^{\ell}\in \mathbb{R}^{n^{\ell}\times n^{\ell-1}}$, each entry is randomly initialized using an i.i.d. Gaussian distribution: $$ W^{\ell}_{ij}\sim \mathcal{N}(0,1/n^{\ell-1}) $$

Exercise 1:

  1. The method initialize_dnn_parameters takes the layer_dims array as input
  2. Loop through the array, using stdv * np.random.randn(a, b) + mu to randomly initialize the weights parameters['W' + str(l)] and np.zeros((a,1)) to initialize the biases parameters['b' + str(l)]
  3. Return parameters as a dictionary containing all weights and biases.
In [ ]:
# Initialize weights and biases
def initialize_dnn_parameters(layer_dims):
    L = len(layer_dims) # number of layers, including the input layer
    parameters = {}
    for l in range(1, L):
        ### Code here ### (~ 1 lines of code)
        mu, stdv = 0.0, np.sqrt(1.0 / layer_dims[l-1])  # N(0, 1/n^{l-1}) per the rule above
        parameters['W' + str(l)] = stdv * np.random.randn(layer_dims[l], layer_dims[l-1]) + mu
        ### Code here ### (~ 1 lines of code)
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
    return parameters
In [ ]:
# Test initialization
n_x = 5
n_h = 4
n_y = 3
layer_dims = [n_x, n_h, n_y]
parameters = initialize_dnn_parameters(layer_dims)
for l in range(1, len(layer_dims)):
    print(f"W{l} = \n{parameters['W' + str(l)]}")
    print(f"b{l} = \n{parameters['b' + str(l)]}")
W1 = 
[[ 0.72642933 -0.27358579 -0.23620559 -0.47984616  0.38702206]
 [-1.0292794   0.78030354 -0.34042208  0.14267862 -0.11152182]
 [ 0.65387455 -0.92132293 -0.14418936 -0.17175433  0.50703711]
 [-0.49188633 -0.07711224 -0.39259022  0.01887856  0.26064289]]
b1 = 
[[0.]
 [0.]
 [0.]
 [0.]]
W2 = 
[[-0.55030959  0.57236185  0.45079536  0.25124717]
 [ 0.45042797 -0.34186393 -0.06144511 -0.46788472]
 [-0.13394404  0.26517773 -0.34583038 -0.19837676]]
b2 = 
[[0.]
 [0.]
 [0.]]

1.2 - Activation Functions and Their Derivatives¶

In the previous assignment, we implemented the sigmoid activation function. In this assignment, we will focus on the ReLU activation: $$ f(x) = \max\{0,x\} $$ whose derivative is given by $$ f^{\prime}(x) = \begin{cases} 1, & x\geq 0\\ 0, & x < 0 \end{cases} $$

Exercise 2:

  1. We will implement each activation function as a class that provides both evaluation in forward() and its derivative in derivative()
  2. Implement the forward() method of the ReLU class so that forward() is automatically evaluated when a ReLU instance is called.
  3. Implement its derivative() to compute the gradient during backpropagation.
In [ ]:
# Define Sigmoid activation function class
class Sigmoid:
    def __init__(self):
        pass

    def forward(self, x):
        return 1 / (1 + np.exp(-x))

    def derivative(self, x):
        return self.forward(x) * (1 - self.forward(x))

    def __call__(self, x):
        return self.forward(x)

# Define ReLU activation function class
class ReLU:
    def __init__(self):
        pass

    def forward(self, x):
        ### Code here ### (~ 1 lines of code)

        ### Code here ### (~ 1 lines of code)

    def derivative(self, x):
        ### Code here ### (~ 1 lines of code)

        ### Code here ### (~ 1 lines of code)

    def __call__(self, x):
        return self.forward(x)
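For reference, one possible completion of the two methods, consistent with the printed test output below and with the convention f'(0) = 1 used above (a sketch, not the only valid answer):

class ReLU:
    def forward(self, x):
        return np.maximum(0, x)        # elementwise max{0, x}

    def derivative(self, x):
        return (x >= 0).astype(int)    # indicator of x >= 0; f'(0) = 1 by convention

    def __call__(self, x):
        return self.forward(x)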
In [ ]:
# Test the forward and backward (derivative) passes of the ReLU activation
x = np.random.randn(10)
print(f"x = \n{x}")
print(f"ReLU(x) = \n{ReLU()(x)}")
print(f"ReLU'(x) = \n{ReLU().derivative(x)}")
x = 
[-0.6871727  -0.84520564 -0.67124613 -0.0126646  -1.11731035  0.2344157
  1.65980218  0.74204416 -0.19183555 -0.88762896]
ReLU(x) = 
[0.         0.         0.         0.         0.         0.2344157
 1.65980218 0.74204416 0.         0.        ]
ReLU'(x) = 
[0 0 0 0 0 1 1 1 0 0]

1.3 - Forward Propagation¶

With $A^{0}=X$ and $\hat{Y}=A^{L}$, the forward propagation becomes \begin{align*} &Z^{\ell} = W^{\ell} A^{\ell-1} + b^{\ell},&\forall \ell\in [L]\\ &A^{\ell} =\phi(Z^{\ell}), & \forall \ell\in [L] \end{align*}

Exercise 3:

  1. Implement a layer of the DNN that takes the previous output A_prev, weights W, bias b, and activation act as inputs
  2. The layer should perform a linear transform followed by a nonlinear activation
  3. Return both the pre-activation Z and the activation A
In [ ]:
def layer(A_prev, W, b, act=ReLU()):
  ### Code here ### (~ 1 lines of code)

  ### Code here ### (~ 1 lines of code)
  A = act(Z)
  return A, Z
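One way to complete the blank, assuming the bias b broadcasts across the columns (samples) of A_prev (a sketch):

def layer(A_prev, W, b, act=ReLU()):
    Z = W @ A_prev + b   # linear transform; b broadcasts over the m samples
    A = act(Z)           # nonlinear activation
    return A, Z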
In [ ]:
# Testing the nonlinear layer
num_samples = 5
X = np.random.randn(n_x, num_samples)
Y = np.random.randn(n_y, num_samples)

A_prev = X
W1 = parameters['W1']
b1 = parameters['b1']
A1, Z1 = layer(A_prev, W1, b1)
print(f"Z1 = \n{Z1}")
print(f"A1 = \n{A1}")
Z1 = 
[[-1.09575907  1.0266851  -0.31827206 -0.3506713  -0.12633385]
 [ 2.87245507 -1.45533316  0.57223289  0.59096241 -0.70441952]
 [-2.15866841  0.8461243  -0.40963115 -0.68640041  0.68699072]
 [ 0.80537242 -0.77691677  0.15346325  0.02594094 -0.07751913]]
A1 = 
[[0.         1.0266851  0.         0.         0.        ]
 [2.87245507 0.         0.57223289 0.59096241 0.        ]
 [0.         0.8461243  0.         0.         0.68699072]
 [0.80537242 0.         0.15346325 0.02594094 0.        ]]

Recall the forward propagation: $$ \begin{align*} &Z^{\ell} = W^{\ell} A^{\ell-1} + b^{\ell},&\forall \ell\in [L]\\ &A^{\ell} =\phi(Z^{\ell}), & \forall \ell\in [L] \end{align*} $$ Using the layer function, DNNs can be built by stacking layers:

layer_1 -> layer_2 -> ... -> layer_L

Exercise 4: In this exercise, you will stack multiple layer operations to build a DNN by implementing the forward_propagation() function.

  1. The function forward_propagation() takes X, parameters, and act as inputs
  2. Calculate the number of layers L=len(parameters)//2, since each layer has two parameters: weights and biases
  3. For each layer, retrieve the weights and biases from parameters and apply the layer() function to propagate forward.
  4. Store the intermediate variables Zl and Al in caches for later use in backpropagation.
  5. At the end, return the final output AL and the caches
In [ ]:
def forward_propagation(X, parameters, act=ReLU()):
  L = len(parameters) // 2 # num of layers
  A = X # Initialize A0 with X
  caches = {}
  caches['A0'] = A # Cache the initial activation

  for l in range(1, L+1): # including the output layer
    A_prev = A
    W = parameters['W' + str(l)]
    b = parameters['b' + str(l)]
    ### Code here ### (~ 1 lines of code)

    ### Code here ### (~ 1 lines of code)
    caches['Z' + str(l)] = Z
    caches['A' + str(l)] = A

  return A, caches
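One possible body for the blank inside the loop simply calls the layer() function defined above (a sketch):

# Inside the loop of forward_propagation():
A, Z = layer(A_prev, W, b, act)   # propagate through layer l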
In [ ]:
# Test forward propagation
A, caches = forward_propagation(X, parameters)

L = len(parameters) // 2
for l in range(1, L+1):
    print(f"Z{l} = \n{caches['Z' + str(l)]}")
Z1 = 
[[-1.09575907  1.0266851  -0.31827206 -0.3506713  -0.12633385]
 [ 2.87245507 -1.45533316  0.57223289  0.59096241 -0.70441952]
 [-2.15866841  0.8461243  -0.40963115 -0.68640041  0.68699072]
 [ 0.80537242 -0.77691677  0.15346325  0.02594094 -0.07751913]]
Z2 = 
[[ 1.84643125 -0.18356575  0.36608149  0.34476193  0.30969223]
 [-1.35881023  0.41045749 -0.2674289  -0.2141661  -0.04221222]
 [ 0.60194395 -0.43013384  0.12129988  0.15156399 -0.23758226]]

1.4 - Compute the Cost¶

With the output estimate AL from forward propagation, we compute the cost using the square loss: $$ L(\theta)=\frac{1}{2m} \sum_{i=1}^{m} (a^{L}_i-y_i)^2 = \frac{1}{2m}\|A^{L}-Y\|^2 $$ The compute_cost() method was implemented in the previous programming assignment and is copied here.

In [ ]:
def compute_cost(A, Y):
    m = Y.shape[1]
    cost = np.sum((A - Y) ** 2) / (2 * m)
    return cost
In [ ]:
# Test the cost
print(f"cost = {compute_cost(A, Y)}")
cost = 1.7072853869205002

1.5 - Backpropagation¶

Using the caches computed during forward_propagation(), we can compute the gradients using backpropagation.

The gradient of the cost with respect to $Z^{L}$ is given by: $$d Z^{L} = \frac{1}{m}( A^{L} - Y) \odot \phi^{\prime}(Z^{L}).$$ Backpropagation then computes the gradients via $$ \begin{align*} &d W^{\ell} = d Z^{\ell} \, A^{(\ell-1) \top},&&\forall \ell\in [L]\\ &d b^{\ell} = d Z^{\ell} \, e,&&\forall \ell\in [L]\\ &d Z^{\ell-1} =\phi^{\prime}( Z^{\ell-1}) \odot \left[ W^{(\ell)\top} d Z^{\ell}\right], &&\forall \ell\in \{2,3, ..., L\}\\ \end{align*} $$ where $e$ is the all-ones vector, so $d Z^{\ell} e$ sums over the samples. Note that since $dZ^{0}$ is never needed, backpropagation does not compute $dZ^{\ell-1}$ for the first layer ($\ell=1$).

Exercise 5: Implement back_propagation()

  1. The function back_propagation() takes the data Y, parameters, and caches as inputs
  2. For each hidden layer, retrieve weights and biases from parameters, and intermediate variables Zl and Al from caches
  3. Compute the gradients dWl, dbl, using the formulas provided.
  4. Return the gradients in a variable grads
In [ ]:
def back_propagation(Y, parameters, caches, act_derivative=ReLU().derivative):
    L = len(parameters) // 2 # num of layers in DNN
    m = Y.shape[1]           # num of training samples
    grads = {}

    ZL = caches['Z' + str(L)]
    AL = caches['A' + str(L)]
    dZ = (AL - Y) / m * act_derivative(ZL)  # Gradient of the loss w.r.t. ZL

    for l in reversed(range(1, L+1)):
        # Compute the gradients using dZ and A_prev
        A_prev = caches['A' + str(l-1)]
        ### Code here ### (~ 2 lines of code)


        ### Code here ### (~ 2 lines of code)

        # Store the computed gradients
        grads['dW' + str(l)] = dW
        grads['db' + str(l)] = db

        # Backpropagate dZ
        W = parameters['W' + str(l)]
        if l > 1:
            Z_prev = caches['Z' + str(l - 1)]
            dZ = W.T @ dZ * act_derivative(Z_prev)

    return grads
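For reference, the blanks follow directly from the formulas above; the 1/m factor is already folded into dZ, and multiplying by the all-ones vector e amounts to summing over the sample axis (a sketch):

# Inside the loop of back_propagation():
dW = dZ @ A_prev.T                       # dW^l = dZ^l A^{(l-1)T}
db = np.sum(dZ, axis=1, keepdims=True)   # db^l = dZ^l e (sum over samples)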
In [ ]:
# Test the backpropagation()
grads = back_propagation(Y, parameters, caches)
L = len(parameters) // 2
for l in range(1, L+1):
    print(f"dW{l} = \n{grads['dW' + str(l)]}")
    print(f"db{l} = \n{grads['db' + str(l)]}")
dW1 = 
[[-0.07100506 -0.00504114  0.01465629 -0.0119815   0.01250615]
 [-0.1535268   0.2243261   0.06768908  0.00272228  0.07802137]
 [ 0.02370006 -0.02516877  0.05958518  0.09359966  0.0813598 ]
 [-0.06098153  0.1461884  -0.00877939  0.06790589  0.06303178]]
db1 = 
[[-0.04195389]
 [ 0.41779308]
 [ 0.07912688]
 [ 0.21442471]]
dW2 = 
[[ 0.          0.60078972  0.11186381  0.11598846]
 [-0.0956278   0.         -0.07880995  0.        ]
 [ 0.         -0.15898858  0.         -0.05413308]]
db2 = 
[[ 0.93844522]
 [-0.09314229]
 [-0.09856997]]

1.6 - Define Deep Neural Networks¶

Having implemented initialize_dnn_parameters(), forward_propagation(), and back_propagation(), we will now combine everything and define the NeuralNetwork class.

Exercise 6:

  1. To define the NeuralNetwork class, we take the input size n_x, output size n_y, width n_h, depth depth, and activation function act as inputs.
  2. Store these hyperparameters n_x, n_y, n_h, depth, act as attributes of the class
  3. Next, initialize() sets up the DNN by using initialize_dnn_parameters() to randomly initialize the network's self.parameters
  4. When the DNN is called, it automatically runs forward() using the forward_propagation() method and stores the intermediate pre-activations and activations in self.caches.
  5. The DNN also implements a backward() method using back_propagation() to compute the grads and store them in self.grads for use in training

Note: for simplicity, we assume each hidden layer has the same width n_h, but in practice the width can vary based on the learning task

In [ ]:
# Define the Neural Network class
class NeuralNetwork:
    def __init__(self, n_x, n_y, n_h, depth, act=ReLU()):
        self.n_x = n_x
        self.n_y = n_y
        self.n_h = n_h
        self.depth = depth
        self.act = act
        self.initialize()

    def initialize(self):
        layer_dims = [self.n_x] + [self.n_h] * (self.depth-1) + [self.n_y]
        ### Code here ### (~ 1 lines of code)

        ### Code here ### (~ 1 lines of code)

    def forward(self, X):
        self.caches = {}
        output, caches = forward_propagation(X, self.parameters, self.act)
        self.caches = caches
        return output

    def backward(self, Y):
        self.grads = {}
        ### Code here ### (~ 1 lines of code)

        ### Code here ### (~ 1 lines of code)
        self.grads = grads

    def __call__(self, X):
        return self.forward(X)
In [ ]:
# Test NeuralNetwork and its forward()
network = NeuralNetwork(n_x, n_y, n_h, depth=3, act=Sigmoid())
for l in range(1, network.depth+1):
    print(f"W{l} = \n{network.parameters['W' + str(l)]}")
    print(f"b{l} = \n{network.parameters['b' + str(l)]}")

A = network(X)
print(f"A = \n{A}")
W1 = 
[[ 0.08343279  0.18338067  0.08868233  0.05322228 -0.29992929]
 [ 0.16885166  0.05448013  0.50512056  0.53617238  0.08280447]
 [-0.16783253 -0.28564892  0.18939243  0.03458753 -0.15377604]
 [ 0.01949711 -0.27727281  0.31216942 -0.19996197  0.54761649]]
b1 = 
[[0.]
 [0.]
 [0.]
 [0.]]
W2 = 
[[ 0.20174582  0.29678926 -0.54745592  0.08469122]
 [ 0.37027823 -0.4768503  -0.13310925  0.01630727]
 [-0.68655866  0.1575797   0.42308032 -0.42975797]
 [ 0.17527299 -0.65614171 -0.01934775 -0.80788618]]
b2 = 
[[0.]
 [0.]
 [0.]
 [0.]]
W3 = 
[[ 0.56070885  0.20445027 -0.01230848 -0.38758081]
 [ 0.63687797  0.98355087 -0.92899093  0.61808202]
 [ 0.81382538  0.16900585 -0.59963402  0.43167266]]
b3 = 
[[0.]
 [0.]
 [0.]]
A = 
[[0.5594009  0.56163771 0.56270777 0.55963543 0.56721295]
 [0.65571946 0.64975006 0.64269171 0.64587559 0.62735589]
 [0.60243733 0.5987133  0.59506056 0.59312079 0.58729086]]
In [ ]:
# Test the backward() method in NeuralNetwork
network.backward(Y)

for l in range(1, network.depth+1):
    print(f"dW{l} = \n{network.grads['dW' + str(l)]}")
    print(f"db{l} = \n{network.grads['db' + str(l)]}")
dW1 = 
[[-0.00819391  0.00434696  0.01032395 -0.00024787  0.00653069]
 [ 0.00220476 -0.00447719 -0.00122303  0.00096706 -0.00096016]
 [ 0.00773704 -0.00218865 -0.01071998 -0.00014516 -0.00656312]
 [-0.0032138  -0.00055366  0.00339891  0.00027155  0.00214983]]
db1 = 
[[ 0.01889197]
 [-0.00765741]
 [-0.0164768 ]
 [ 0.00279429]]
dW2 = 
[[ 0.02987019  0.03644857  0.03402445  0.03874588]
 [ 0.02737154  0.03083942  0.02911464  0.03225519]
 [-0.02720202 -0.03051821 -0.02837608 -0.03141728]
 [ 0.01224843  0.0119922   0.01098373  0.01152867]]
db2 = 
[[ 0.0630978 ]
 [ 0.05643017]
 [-0.05552861]
 [ 0.02358812]]
dW3 = 
[[0.06034881 0.053968   0.05274028 0.03386175]
 [0.09777237 0.09016585 0.08312728 0.0634621 ]
 [0.03890758 0.03489111 0.03297021 0.02320944]]
db3 = 
[[0.1204913 ]
 [0.19279404]
 [0.07642293]]

2 - Optimization Algorithms¶

In this section, we will implement several optimization algorithms that improve the efficiency and effectiveness of training deep neural networks. Using the class-based approach, we will define various optimizers, such as Gradient Descent, Momentum, RMSProp, and Adam, which will be used to update the network’s weights. Each optimizer class will encapsulate the update rules specific to the algorithm, allowing for flexible and modular integration with the neural network during training.

2.1 - Gradient Descent¶

Gradient descent updates the parameters using the following rule: $$ \theta \leftarrow \theta - \eta\, d\theta $$ where $\eta>0$ is the learning rate.

Exercise 7: Implement gradient_descent_step()

  1. The function takes parameters, grads, and learning_rate as inputs
  2. For each layer, retrieve the weights and biases from parameters and the gradients from grads
  3. Update the weights and biases using the gradient descent rule.
  4. Store the updated weights and biases back into parameters
In [ ]:
def gradient_descent_step(parameters, grads, learning_rate):
    L = len(parameters) // 2
    for l in range(1, L+1):
      W, b = parameters['W' + str(l)], parameters['b' + str(l)]
      dW, db = grads['dW' + str(l)], grads['db' + str(l)]

      ### Code here ### (~ 1 lines of code)

      ### Code here ### (~ 1 lines of code)
      b -= learning_rate * db

      parameters['W' + str(l)] = W
      parameters['b' + str(l)] = b
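The missing weight update mirrors the bias update already shown (a sketch):

# Inside the loop of gradient_descent_step():
W -= learning_rate * dW   # theta <- theta - eta * d(theta)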
In [ ]:
gradient_descent_step(network.parameters, network.grads, learning_rate=0.01)

for l in range(1, network.depth+1):
    print(f"W{l} = \n{network.parameters['W' + str(l)]}")
    print(f"b{l} = \n{network.parameters['b' + str(l)]}")
W1 = 
[[ 0.08351473  0.1833372   0.08857909  0.05322476 -0.2999946 ]
 [ 0.16882961  0.0545249   0.50513279  0.53616271  0.08281407]
 [-0.1679099  -0.28562704  0.18949963  0.03458898 -0.15371041]
 [ 0.01952925 -0.27726727  0.31213543 -0.19996469  0.547595  ]]
b1 = 
[[-1.88919658e-04]
 [ 7.65740929e-05]
 [ 1.64767984e-04]
 [-2.79428845e-05]]
W2 = 
[[ 0.20144712  0.29642478 -0.54779617  0.08430376]
 [ 0.37000451 -0.4771587  -0.1334004   0.01598472]
 [-0.68628664  0.15788488  0.42336408 -0.4294438 ]
 [ 0.17515051 -0.65626163 -0.01945759 -0.80800146]]
b2 = 
[[-0.00063098]
 [-0.0005643 ]
 [ 0.00055529]
 [-0.00023588]]
W3 = 
[[ 0.56010537  0.20391059 -0.01283588 -0.38791943]
 [ 0.63590024  0.98264922 -0.92982221  0.61744739]
 [ 0.8134363   0.16865694 -0.59996372  0.43144056]]
b3 = 
[[-0.00120491]
 [-0.00192794]
 [-0.00076423]]

Next, we can test gradient_descent_step() by creating a training loop that applies it iteratively to update network.parameters over multiple iterations.

In [ ]:
learning_rate = 0.1
max_iter = 10
losses = []
for i in range(max_iter):
    A = network(X)
    network.backward(Y)
    losses.append(compute_cost(A, Y))
    gradient_descent_step(network.parameters, network.grads, learning_rate)

losses = np.array(losses)
print(f"Losses: \n{losses}")
Losses: 
[2.24437118 2.23186699 2.21949885 2.20728322 2.19523579 2.18337136
 2.17170374 2.16024557 2.14900828 2.13800201]

2.2 - Gradient Descent with Momentum¶

Gradient descent with momentum follows the update rule: $$ \begin{align} v^{+} =& \beta v + (1-\beta) \nabla L(w)\\ w^{+} =& w - \eta v^{+} \end{align} $$ where $\beta$ is the momentum factor and $\eta$ is the learning rate.

It’s important to note that we cannot implement gradient_descent_with_momentum_step() by simply passing network.parameters and network.grads: momentum-based methods rely on historical information, which must be stored across iterations. This requirement also applies to methods like RMSProp and Adam. Therefore, we create an Optimizer class, which holds internal variables to store the necessary history and can be extended to other optimizers.

Steps:

  1. The Optimizer class takes a neural network network and a learning_rate as inputs for initialization
  2. It should store the network network and learning_rate as internal variables
  3. It also has a method called step() that applies the optimizer-specific update
In [ ]:
class Optimizer:
    def __init__(self, network, learning_rate):
        self.network = network  # The entire neural network is passed in
        self.learning_rate = learning_rate

    def step(self):
        raise NotImplementedError("Step method must be implemented in a subclass")

Using this base class Optimizer, we can extend it to other optimizers by overriding the step() method. Let us implement GradientDescent as an example.

Exercise 8:

  1. Inherit GradientDescent from Optimizer class by using class GradientDescent(Optimizer)
  2. Since GradientDescent requires no additional inputs, we can omit the __init__() method
  3. Implement the step() using the gradient_descent_step() function.
In [ ]:
class GradientDescent(Optimizer):
    # No need to redefine __init__(); it is inherited from Optimizer

    def step(self):
        ### Code here ### (~ 1 lines of code)

        ### Code here ### (~ 1 lines of code)
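A possible completion of step(), reusing gradient_descent_step() with the stored network and learning rate (a sketch):

class GradientDescent(Optimizer):
    # __init__() is inherited from Optimizer
    def step(self):
        gradient_descent_step(self.network.parameters, self.network.grads, self.learning_rate)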
In [ ]:
# Test the `GradientDescent` optimizer by using the previous training loop
optimizer = GradientDescent(network, learning_rate = 0.1)

max_iter = 10
losses = []
for i in range(max_iter):
    A = network(X)
    network.backward(Y)
    losses.append(compute_cost(A, Y))
    optimizer.step()

losses = np.array(losses)
print(f"Losses: \n{losses}")
Losses: 
[2.12723555 2.11671634 2.10645047 2.09644267 2.0866964  2.07721384
 2.06799602 2.05904287 2.05035331 2.04192534]

Recall the gradient descent with momentum update rule: $$ \begin{align} v^{+} =& \beta v + (1-\beta) \nabla L(w)\\ w^{+} =& w - \eta v^{+} \end{align} $$ where $\beta$ is the momentum factor and $\eta$ is the learning rate.

Using the same strategy, we can implement the GradientDescentWithMomentum optimizer.

Exercise 9

  1. Unlike plain gradient descent, GradientDescentWithMomentum requires an additional input: the momentum factor beta. So, we will redefine the __init__() method to include it.
  2. Implement the step() method using the gradient descent with momentum formula
In [ ]:
class GradientDescentWithMomentum(Optimizer):
    def __init__(self, network, learning_rate, beta=0.9):
        super().__init__(network, learning_rate)
        self.beta = beta
        self.velocities = {}

    def step(self):
        for key in self.network.parameters.keys():
            # Get corresponding gradient key: W1 -> dW1, b1 -> db1
            grad_key = 'd' + key

            # Get the parameter value and gradient value
            param = self.network.parameters[key]
            grad = self.network.grads[grad_key]

            # Initialize velocity if not present
            if key not in self.velocities:
                self.velocities[key] = np.zeros_like(param)

            # Get the previous velocity for this parameter
            velocity = self.velocities[key]

            # Update the velocity and parameter using the momentum formula
            ### Code here ### (~ 2 lines of code)


            ### Code here ### (~ 2 lines of code)

            # Store the updated velocity and parameter back into the neural network
            self.velocities[key] = velocity
            self.network.parameters[key] = param
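The two blanks follow the momentum formulas directly (a sketch):

# Inside the loop of GradientDescentWithMomentum.step():
velocity = self.beta * velocity + (1 - self.beta) * grad   # v+ = beta v + (1 - beta) g
param = param - self.learning_rate * velocity              # w+ = w - eta v+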
In [ ]:
optimizer = GradientDescentWithMomentum(network, learning_rate = 0.1, beta=0.8)

max_iter = 10
losses = []
for i in range(max_iter):
    A = network(X)
    network.backward(Y)
    losses.append(compute_cost(A, Y))
    optimizer.step()

losses = np.array(losses)
print(f"Losses: \n{losses}")
Losses: 
[2.03375613 2.03216316 2.02931359 2.02548826 2.02091931 2.01579704
 2.01027639 2.00448266 1.99851656 1.99245847]

2.3 - RMSProp¶

As introduced in the lectures, we can also scale the gradient coordinates to obtain faster convergence. One commonly used method is RMSProp. The update rule is given by

$$ \begin{align} s^{+} =& \beta s + (1-\beta) g^2\\ w^{+} =& w - \eta g/ \sqrt{s^{+}} \end{align} $$ where $g$ denotes the gradient and $\beta$ is the scaling factor.

Using the same strategy we can implement the RMSProp optimizer.

Exercise 10

  1. Like GradientDescentWithMomentum, RMSProp requires an additional input, the scaling factor beta, so we again redefine the __init__() method
  2. Implement the step() method using the RMSProp formula
In [ ]:
class RMSProp(Optimizer):
    def __init__(self, network, learning_rate, beta=0.9):
        super().__init__(network, learning_rate)
        self.beta = beta
        self.squared_gradients = {}

    def step(self):
        for key in self.network.parameters.keys():
            # Get corresponding gradient key: W1 -> dW1, b1 -> db1
            grad_key = 'd' + key

            # Get the parameter value and gradient value
            param = self.network.parameters[key]
            grad = self.network.grads[grad_key]

            # Initialize running average of squared gradients if not present
            if key not in self.squared_gradients:
                self.squared_gradients[key] = np.zeros_like(param)

            # Get the previous squared_gradient for this parameter
            squared_gradient = self.squared_gradients[key]

            # Update the squared_gradient and parameter using the RMSProp formula
            ### Code here ### (~ 2 lines of code)


            ### Code here ### (~ 2 lines of code)

            # Store the updated parameter back into the neural network
            self.squared_gradients[key] = squared_gradient
            self.network.parameters[key] = param
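The blanks follow the RMSProp formulas; note that implementations often add a small constant inside the square root for numerical stability, which the update rule above omits (a sketch):

# Inside the loop of RMSProp.step():
squared_gradient = self.beta * squared_gradient + (1 - self.beta) * grad ** 2   # s+ = beta s + (1 - beta) g^2
param = param - self.learning_rate * grad / np.sqrt(squared_gradient + 1e-8)    # 1e-8 is an assumed stability constant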
In [ ]:
# Test RMSProp
optimizer = RMSProp(network, learning_rate = 0.1, beta=0.8)

max_iter = 10
losses = []
for i in range(max_iter):
    A = network(X)
    network.backward(Y)
    losses.append(compute_cost(A, Y))
    optimizer.step()

losses = np.array(losses)
print(f"Losses: \n{losses}")
Losses: 
[1.98637207 1.81473496 1.77935884 1.75574723 1.73123266 1.69885805
 1.65468703 1.59739508 1.52794071 1.45044982]

2.4 - Adam¶

Using the same strategy we can implement the Adam optimizer. The update rules for Adam are: $$ \begin{align} v^+ = & \beta_1 v + (1-\beta_1) g\\ s^{+} =& \beta_2 s + (1-\beta_2) g^2\\ w^{+} =& w - \eta v^+/ \sqrt{s^{+} + \epsilon } \end{align} $$ where $\beta_1$ and $\beta_2$ are the momentum and scaling factors, respectively, and $\epsilon$ is a small constant for numerical stability.

Exercise 11

  1. The Adam optimizer requires two additional factors, beta1 and beta2, so we again redefine the __init__() method
  2. Implement the step() method using the Adam formulas
In [ ]:
class Adam(Optimizer):
    def __init__(self, network, learning_rate, beta1=0.9, beta2=0.999):
        super().__init__(network, learning_rate)
        self.beta1 = beta1
        self.beta2 = beta2
        self.velocities = {}
        self.squared_gradients = {}

    def step(self):
        for key in self.network.parameters.keys():
            # Get corresponding gradient key: W1 -> dW1, b1 -> db1
            grad_key = 'd' + key

            # Get the parameter value and gradient value
            param = self.network.parameters[key]
            grad = self.network.grads[grad_key]

            # Initialize velocity and running average of squared gradients if not present
            if key not in self.velocities:
              self.velocities[key] = np.zeros_like(param)

            if key not in self.squared_gradients:
              self.squared_gradients[key] = np.zeros_like(param)

            # Get the previous velocity and squared_gradient for this parameter
            velocity = self.velocities[key]
            squared_gradient = self.squared_gradients[key]

            # Update the velocity, squared_gradient, and parameter using the Adam formulas
            ### Code here ### (~ 3 lines of code)



            ### Code here ### (~ 3 lines of code)

            # Store the updated parameter back into the neural network
            self.velocities[key] = velocity
            self.squared_gradients[key] = squared_gradient
            self.network.parameters[key] = param
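The three blanks follow the Adam formulas above; like those formulas, this version omits bias correction, and eps is an assumed small stability constant (a sketch):

# Inside the loop of Adam.step():
eps = 1e-8  # assumed small constant for numerical stability
velocity = self.beta1 * velocity + (1 - self.beta1) * grad                        # v+
squared_gradient = self.beta2 * squared_gradient + (1 - self.beta2) * grad ** 2   # s+
param = param - self.learning_rate * velocity / np.sqrt(squared_gradient + eps)   # w+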
In [ ]:
# Test the Adam optimizer using the training loop
optimizer = Adam(network, learning_rate = 0.1, beta1=0.8, beta2=0.9)

max_iter = 10
losses = []
for i in range(max_iter):
    A = network(X)
    network.backward(Y)
    losses.append(compute_cost(A, Y))
    optimizer.step()

losses = np.array(losses)
print(f"Losses: \n{losses}")
Losses: 
[1.3747617  1.3401749  1.30033804 1.26212976 1.22939172 1.20369176
 1.18448922 1.17062461 1.16050343 1.15281649]

3 - Training with Optimizers¶

We will build a general train() function that takes a model, an optimizer, along with X_train and Y_train, and returns the losses. We can then plot the losses to observe the convergence behavior.

Additionally, we will test the trained model on X_test and Y_test. To avoid duplicating code, we will create a train_loop() function that takes model, optimizer, X, Y, and a boolean flag evaluate.

  • If evaluate=False, the train_loop() performs a training step: the model runs backward() to compute gradients and optimizer.step() to update the parameters.
  • If evaluate=True, the train_loop() evaluates the model on X and Y without performing backpropagation or updating the parameters.
In [ ]:
def train_loop(model, optimizer, X, Y, evaluate=False, print_cost=False):
    outputs = model(X)
    loss = compute_cost(outputs, Y)
    if not evaluate:
        model.backward(Y)
        optimizer.step()

    return loss
In [ ]:
def train(model, optimizer, X_train, Y_train, X_test, Y_test, num_iterations=10, print_cost=False):
    train_losses = []
    test_losses = []

    for i in range(num_iterations):
        # Perform training and evaluation every iteration
        test_loss = train_loop(model, optimizer, X_test, Y_test, evaluate=True, print_cost=print_cost)
        train_loss = train_loop(model, optimizer, X_train, Y_train, evaluate=False, print_cost=print_cost)

        # Store both train and test losses
        train_losses.append(train_loss)
        test_losses.append(test_loss)

        # Optionally print the loss values
        if print_cost and i % 10 == 0:
            print(f"Train Loss at {i}: {train_loss}; Test Loss at {i}: {test_loss}")

    return np.array(train_losses), np.array(test_losses)
In [ ]:
network = NeuralNetwork(n_x, n_y, n_h, depth=3)
optimizer = Adam(network, learning_rate = 0.1, beta1=0.8, beta2=0.9)
train_losses, test_losses = train(network, optimizer, X, Y, X, Y, num_iterations=10)
print(f"Train Losses: \n{train_losses}")
print(f"Test Losses: \n{test_losses}")
Train Losses: 
[1.8452532  1.80712141 1.74078336 1.75232359 1.72123482 1.7096015
 1.70552052 1.68568429 1.5995576  1.52332713]
Test Losses: 
[1.8452532  1.80712141 1.74078336 1.75232359 1.72123482 1.7096015
 1.70552052 1.68568429 1.5995576  1.52332713]

3.1 - Load Image Dataset: MNIST¶

Let us test your DNN model on the MNIST dataset, which contains images of digits 0 through 9. For simplicity, we will select only the digits 0 and 1 for binary classification.

In [ ]:
import tensorflow as tf
import numpy as np

# Load the MNIST dataset
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()

# Flatten the 28x28 images into vectors of 784 elements and normalize to [0, 1]
X_train = X_train.reshape(X_train.shape[0], -1).T / 255.0  # Transpose to (in_features, num_samples)
X_test = X_test.reshape(X_test.shape[0], -1).T / 255.0     # Transpose to (in_features, num_samples)

# Select only the samples of class '0' and '1' for binary classification
train_filter = (y_train == 0) | (y_train == 1)
test_filter = (y_test == 0) | (y_test == 1)

X_train_binary = X_train[:, train_filter]
y_train_binary = y_train[train_filter].reshape(1, -1)  # Reshape to (1, num_samples)

X_test_binary = X_test[:, test_filter]
y_test_binary = y_test[test_filter].reshape(1, -1)  # Reshape to (1, num_samples)

# Verify the shapes
print(f"Training data shape: {X_train_binary.shape}")  # Should be (784, num_samples)
print(f"Training labels shape: {y_train_binary.shape}")  # Should be (1, num_samples)
print(f"Testing data shape: {X_test_binary.shape}")  # Should be (784, num_samples)
print(f"Testing labels shape: {y_test_binary.shape}")  # Should be (1, num_samples)

# Print out some example labels to verify
print("Training labels:", np.unique(y_train_binary))
print("Testing labels:", np.unique(y_test_binary))
Training data shape: (784, 12665)
Training labels shape: (1, 12665)
Testing data shape: (784, 2115)
Testing labels shape: (1, 2115)
Training labels: [0 1]
Testing labels: [0 1]

3.2 Optimizer Comparison¶

In this exercise, we will test different optimization algorithms on a simple neural network trained for binary classification using the MNIST dataset (with digits ‘0’ and ‘1’). The code initializes a neural network with a specified number of input neurons, hidden neurons, output neurons, and depth.

We then experiment with different optimization algorithms, including Gradient Descent and Momentum, to observe their effects on the model’s performance. The code runs each optimizer for a specified number of iterations and records the training and test losses.

The results are plotted to compare how each optimizer influences the convergence of the model. This will help you understand the differences in performance between the optimizers and how they affect the model’s ability to generalize to new data.

In [ ]:
# Initialize neural network and optimizer settings
input_size = X_train_binary.shape[0]
hidden_size = 64
output_size = 1
depth = 3
num_iterations = 100

# Dictionary of optimizers and their settings
optimizers = {
    "Gradient Descent": lambda net: GradientDescent(net, learning_rate=0.1),
    # "Momentum": lambda net: GradientDescentWithMomentum(net, learning_rate=0.9, beta=0.9),
    "RMSProp": lambda net: RMSProp(net, learning_rate=0.001, beta=0.9),
    # "Adam": lambda net: Adam(net, learning_rate=0.01, beta1=0.9, beta2=0.99)
}

# Define colors for each optimizer
colors = {
    "Gradient Descent": 'b',
    # "Momentum": 'k',
    "RMSProp": 'r',
    # "Adam": 'c'
}

# Dictionary to store losses for each optimizer
train_losses_dict = {}
test_losses_dict = {}

# Loop through each optimizer, reinitialize the network, and train
for opt_name, optimizer_fn in optimizers.items():
    print(f"Training with {opt_name} ...")
    np.random.seed(1)
    mnist_net = NeuralNetwork(input_size, output_size, hidden_size, depth)
    optimizer = optimizer_fn(mnist_net)
    train_losses, test_losses = train(mnist_net, optimizer, X_train_binary, y_train_binary, X_test_binary, y_test_binary, num_iterations=num_iterations)
    train_losses_dict[opt_name] = train_losses
    test_losses_dict[opt_name] = test_losses

    print(f"Train Losses: {train_losses[-1]:.4f}, Test Losses: {test_losses[-1]:.4f}")



# Plot both training and test losses in the same figure with the same color for each optimizer
plt.figure(figsize=(10, 6))
for opt_name in optimizers.keys():
    color = colors[opt_name]
    plt.plot(train_losses_dict[opt_name], label=f'{opt_name} Train Loss', linestyle='-', color=color)
    plt.plot(test_losses_dict[opt_name], label=f'{opt_name} Test Loss', linestyle='--', color=color)

plt.title("Training and Test Loss Comparison")
plt.xlabel("Iterations")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)
plt.show()
Training with Gradient Descent ...
Train Losses: 0.0033, Test Losses: 0.0029
Training with RMSProp ...
Train Losses: 0.0035, Test Losses: 0.0041
[Figure: Training and Test Loss Comparison plot produced by the cell above, showing train (solid) and test (dashed) loss curves for each optimizer]

Congratulations¶

on completing the assignment! You’ve successfully implemented a flexible deep neural network with customizable width and depth, along with advanced optimization algorithms such as Gradient Descent, Momentum, RMSProp, and Adam. By exploring these optimizers, you’ve gained valuable insights into their impact on training efficiency and performance. Great work, and keep pushing forward as you continue to master deep learning techniques!
