def parse_mnist(image_filename, label_filename):
    """ Read images and labels files in MNIST format.  See this page:
    http://yann.lecun.com/exdb/mnist/ for a description of the file format.

    Args:
        image_filename (str): name of gzipped images file in MNIST format
        label_filename (str): name of gzipped labels file in MNIST format

    Returns:
        Tuple (X,y):
            X (numpy.ndarray[np.float32]): 2D numpy array containing the loaded
                data.  The dimensionality of the data should be
                (num_examples x input_dim) where 'input_dim' is the full
                dimension of the data, e.g., since MNIST images are 28x28, it
                will be 784.  Values should be of type np.float32, and the data
                should be normalized to have a minimum value of 0.0 and a
                maximum value of 1.0 (i.e., scale original values of 0 to 0.0
                and 255 to 1.0).

            y (numpy.ndarray[dtype=np.uint8]): 1D numpy array containing the
                labels of the examples.  Values should be of type np.uint8 and
                for MNIST will contain the values 0-9.
    """
    ### BEGIN YOUR CODE
    with gzip.open(image_filename, "rb") as image, gzip.open(label_filename, "rb") as label:
        # Images file: big-endian header (magic, count, rows, cols), then raw pixel bytes.
        image_magic, image_number, image_rows, image_cols = struct.unpack(">IIII", image.read(16))
        image_pixels = struct.unpack("B" * (image_number * image_rows * image_cols), image.read())
        image_pixels_array = np.array(image_pixels, dtype=np.float32).reshape(image_number, image_rows * image_cols) / 255.0

        # Labels file: big-endian header (magic, count), then one byte per label.
        label_magic, label_number = struct.unpack(">II", label.read(8))
        label_values = struct.unpack("B" * label_number, label.read())
        label_values_array = np.array(label_values, dtype=np.uint8)

    return image_pixels_array, label_values_array
    ### END YOUR CODE
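A quick usage sketch: the gzip paths below are placeholders for wherever the MNIST archives are stored, and the function itself assumes gzip, struct, and numpy (as np) are imported at module level.

X_tr, y_tr = parse_mnist("data/train-images-idx3-ubyte.gz",
                         "data/train-labels-idx1-ubyte.gz")
print(X_tr.shape, X_tr.dtype)   # (60000, 784) float32
print(X_tr.min(), X_tr.max())   # 0.0 1.0
print(y_tr.shape, y_tr.dtype)   # (60000,) uint8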
def softmax_loss(Z, y):
    """ Return softmax loss.  Note that for the purposes of this assignment,
    you don't need to worry about "nicely" scaling the numerical properties
    of the log-sum-exp computation, but can just compute this directly.

    Args:
        Z (np.ndarray[np.float32]): 2D numpy array of shape
            (batch_size, num_classes), containing the logit predictions for
            each class.
        y (np.ndarray[np.uint8]): 1D numpy array of shape (batch_size, )
            containing the true label of each example.

    Returns:
        Average softmax loss over the sample.
    """
    ### BEGIN YOUR CODE
    batch_size = y.shape[0]
    Z_softmax = np.exp(Z) / np.sum(np.exp(Z), axis=1, keepdims=True)

    # Two equivalent formulations; the first uses the normalized probabilities above,
    # the second works directly from the logits.
    # cross_entropy = -np.log(Z_softmax[np.arange(batch_size), y]).mean()
    cross_entropy = (-Z[np.arange(batch_size), y] + np.log(np.sum(np.exp(Z), axis=1, keepdims=False))).mean()
    return cross_entropy
    ### END YOUR CODE
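The docstring points out that the log-sum-exp does not need to be numerically stabilized here. For reference only, a sketch of the stabilized variant (softmax_loss_stable is an illustrative name, not part of the assignment): subtracting each row's maximum before exponentiating leaves the loss unchanged but avoids overflow for large logits.

import numpy as np

def softmax_loss_stable(Z, y):
    # Shift each row by its max; the log-sum-exp is invariant to this shift.
    Z_shift = Z - Z.max(axis=1, keepdims=True)
    log_sum_exp = np.log(np.exp(Z_shift).sum(axis=1))
    return (log_sum_exp - Z_shift[np.arange(y.shape[0]), y]).mean()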
def softmax_regression_epoch(X, y, theta, lr=0.1, batch=100):
    """ Run a single epoch of SGD for softmax regression on the data, using
    the step size lr and specified batch size.  This function should modify
    the theta matrix in place, and you should iterate through batches in X
    _without_ randomizing the order.

    Args:
        X (np.ndarray[np.float32]): 2D input array of size
            (num_examples x input_dim).
        y (np.ndarray[np.uint8]): 1D class label array of size (num_examples,)
        theta (np.ndarray[np.float32]): 2D array of softmax regression
            parameters, of shape (input_dim, num_classes)
        lr (float): step size (learning rate) for SGD
        batch (int): size of SGD minibatch

    Returns:
        None
    """
    ### BEGIN YOUR CODE
    num_examples, num_classes = X.shape[0], theta.shape[1]
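    # The rest of the minibatch loop is not shown above; the following is only a
    # sketch of one way to complete it (not necessarily the original solution).
    # It assumes num_examples is a multiple of batch and uses the gradient
    #   X_b.T @ (softmax(X_b @ theta) - I_y) / batch.
    for start in range(0, num_examples, batch):
        X_b = X[start:start + batch]          # (batch, input_dim)
        y_b = y[start:start + batch]          # (batch,)

        logits = X_b @ theta                  # (batch, num_classes)
        Z = np.exp(logits)
        Z /= Z.sum(axis=1, keepdims=True)     # softmax probabilities

        I_y = np.zeros((y_b.shape[0], num_classes), dtype=np.float32)
        I_y[np.arange(y_b.shape[0]), y_b] = 1.0

        grad = X_b.T @ (Z - I_y) / y_b.shape[0]
        theta -= lr * grad                    # in-place update of theta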
def nn_epoch(X, y, W1, W2, lr=0.1, batch=100):
    """ Run a single epoch of SGD for a two-layer neural network defined by the
    weights W1 and W2 (with no bias terms):
        logits = ReLU(X * W1) * W2
    The function should use the step size lr, and the specified batch size (and
    again, without randomizing the order of X).  It should modify the W1 and W2
    matrices in place.

    Args:
        X (np.ndarray[np.float32]): 2D input array of size
            (num_examples x input_dim).
        y (np.ndarray[np.uint8]): 1D class label array of size (num_examples,)
        W1 (np.ndarray[np.float32]): 2D array of first layer weights, of shape
            (input_dim, hidden_dim)
        W2 (np.ndarray[np.float32]): 2D array of second layer weights, of shape
            (hidden_dim, num_classes)
        lr (float): step size (learning rate) for SGD
        batch (int): size of SGD minibatch

    Returns:
        None
    """
    ### BEGIN YOUR CODE
    num_examples, num_classes = X.shape[0], W2.shape[1]
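    # The rest of the loop is again only a sketch (not necessarily the original
    # solution), assuming num_examples is a multiple of batch.  Backprop for
    # logits = ReLU(X W1) W2 with softmax cross-entropy:
    #   G2 = softmax(logits) - I_y
    #   G1 = (G2 @ W2.T) * 1{Z1 > 0}
    #   dW1 = X_b.T @ G1 / batch,   dW2 = Z1.T @ G2 / batch
    for start in range(0, num_examples, batch):
        X_b = X[start:start + batch]
        y_b = y[start:start + batch]
        b = y_b.shape[0]

        Z1 = np.maximum(X_b @ W1, 0)               # ReLU hidden activations
        logits = Z1 @ W2
        S = np.exp(logits)
        S /= S.sum(axis=1, keepdims=True)          # softmax probabilities

        I_y = np.zeros((b, num_classes), dtype=np.float32)
        I_y[np.arange(b), y_b] = 1.0

        G2 = S - I_y                               # gradient at the logits
        G1 = (G2 @ W2.T) * (Z1 > 0)                # push back through the ReLU

        W2 -= lr * (Z1.T @ G2) / b                 # in-place updates
        W1 -= lr * (X_b.T @ G1) / b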
void softmax_regression_epoch_cpp(const float *X, const unsigned char *y,
                                  float *theta, size_t m, size_t n, size_t k,
                                  float lr, size_t batch)
{
    /**
     * A C++ version of the softmax regression epoch code.  This should run a
     * single epoch over the data defined by X and y (and sizes m,n,k), and
     * modify theta in place.  Your function will probably want to allocate
     * (and then delete) some helper arrays to store the logits and gradients.
     *
     * Args:
     *     X (const float *): pointer to X data, of size m*n, stored in row
     *          major (C) format
     *     y (const unsigned char *): pointer to y data, of size m
     *     theta (float *): pointer to theta data, of size n*k, stored in row
     *          major (C) format
     *     m (size_t): number of examples
     *     n (size_t): input dimension
     *     k (size_t): number of classes
     *     lr (float): learning rate / SGD step size
     *     batch (size_t): SGD minibatch size
     *
     * Returns:
     *     (None)
     */
    /// BEGIN YOUR CODE
    // gradient_theta = X.T @ (Z - I_y)
    for (size_t step = 0; step < m; step += batch) {
        // Copy the current minibatch of images and labels.
        float *image = new float[batch * n];
        unsigned char *label = new unsigned char[batch];
        size_t start_idx_X = step * n;
        size_t start_idx_y = step;
        for (size_t i = 0; i < batch * n; i++) {
            image[i] = X[start_idx_X + i];
        }
        for (size_t i = 0; i < batch; i++) {
            label[i] = y[start_idx_y + i];
        }

        // Z = image @ theta    (Z: batch x k, image: batch x n, theta: n x k)
        float *Z = new float[batch * k];
        for (size_t ib = 0; ib < batch; ib++) {
            for (size_t ik = 0; ik < k; ik++) {
                Z[ib * k + ik] = 0;
                for (size_t in = 0; in < n; in++) {
                    Z[ib * k + ik] += image[ib * n + in] * theta[in * k + ik];
                }
            }
        }

        // Row-wise softmax of Z
        for (size_t ib = 0; ib < batch; ib++) {
            float sum = 0;
            for (size_t ik = 0; ik < k; ik++) {
                Z[ib * k + ik] = std::exp(Z[ib * k + ik]);
                sum += Z[ib * k + ik];
            }
            for (size_t ik = 0; ik < k; ik++) {
                Z[ib * k + ik] = Z[ib * k + ik] / sum;
            }
        }

        // Z - I_y
        for (size_t ib = 0; ib < batch; ib++) {
            size_t label_idx = label[ib];
            Z[ib * k + label_idx] -= 1.0;
        }

        // Gradient step: theta -= lr * image.T @ (Z - I_y) / batch
        for (size_t in = 0; in < n; in++) {
            for (size_t ik = 0; ik < k; ik++) {
                float gradient_n_k = 0.0;
                for (size_t ib = 0; ib < batch; ib++) {
                    gradient_n_k += image[ib * n + in] * Z[ib * k + ik];
                }
                gradient_n_k /= batch;
                theta[in * k + ik] -= lr * gradient_n_k;
            }
        }

        delete[] image;
        delete[] label;
        delete[] Z;
    }
    /// END YOUR CODE
}
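Once the C++ routine is compiled into a Python extension, it can be sanity-checked against the NumPy version. The sketch below assumes a pybind11 wrapper exposing the function under a module named simple_ml_ext; both the module name and the random test sizes are assumptions, not something defined above.

import numpy as np
from simple_ml_ext import softmax_regression_epoch_cpp  # assumed module name

np.random.seed(0)
X = np.random.randn(500, 5).astype(np.float32)
y = np.random.randint(3, size=500).astype(np.uint8)

theta_np = np.zeros((5, 3), dtype=np.float32)
theta_cpp = np.zeros((5, 3), dtype=np.float32)

softmax_regression_epoch(X, y, theta_np, lr=1.0, batch=100)
softmax_regression_epoch_cpp(X, y, theta_cpp, 1.0, 100)

# The two implementations should agree up to float32 rounding.
np.testing.assert_allclose(theta_np, theta_cpp, rtol=1e-4, atol=1e-4)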