pytorch - nn

Lecture 18

Dr. Colin Rundel

Odds & Ends

Torch models

Implementation details:

  • Models are implemented as a class inheriting from torch.nn.Module

  • Must implement the constructor and a forward() method

    • __init__() should call the parent constructor via super()

      • Use torch.nn.Parameter() to indicate model parameters
    • forward() should implement the model - combine the constants and parameters and return the predictions

Fitting procedure:

  • For each iteration of the solver:

    • Get current predictions via a call to forward() or equivalent.

    • Calculate a (scalar) loss or equivalent.

    • Call the backward() method on the loss.

    • Use a built-in optimizer (step() and then zero_grad() if necessary) - see the sketch below.
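Schematically, one iteration of that loop looks like the sketch below (model, opt, loss_fn, X, y, and n_iter are placeholders assumed to be constructed elsewhere):

# Schematic training loop - all names are stand-ins for objects built elsewhere
for i in range(n_iter):
    y_hat = model(X)          # current predictions via forward()
    loss = loss_fn(y_hat, y)  # reduce to a scalar loss
    loss.backward()           # accumulate gradients on the parameters
    opt.step()                # update the parameters
    opt.zero_grad()           # reset gradients for the next iteration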

From last time

import torch

class Model(torch.nn.Module):
    def __init__(self, X, y, beta=None):
        super().__init__()
        self.X = X
        self.y = y
        if beta is None:
          beta = torch.zeros(X.shape[1])
        beta.requires_grad = True
        self.beta = torch.nn.Parameter(beta)
        
    def forward(self, X):
        return X @ self.beta
    
    def fit(self, opt, n=1000, loss_fn = torch.nn.MSELoss()):
      losses = []
      for i in range(n):
          loss = loss_fn(
            self(self.X).squeeze(), 
            self.y.squeeze()
          )
          loss.backward()
          opt.step()
          opt.zero_grad()
          losses.append(loss.item())
      
      return losses
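Since fit() takes the optimizer as an argument, the optimizer must be constructed from the model's parameters first. Usage looks something like this sketch (the X and y tensors and the learning rate are placeholders):

m = Model(X, y)
opt = torch.optim.SGD(m.parameters(), lr=0.001)  # any torch.optim optimizer works
losses = m.fit(opt, n=1000)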

What is self(self.X)?

This is (mostly) just shorthand for calling self.forward(self.X) to generate the output tensors from the current value(s) of the parameters ("mostly" because __call__() also runs any hooks registered on the module before and after forward()).

This is done via the __call__() method of the torch.nn.Module class. __call__() allows instances of Python classes to be invoked like functions.


class greet:
  def __init__(self, greeting):
    self.greeting = greeting
  def __call__(self, name):
    return self.greeting + " " + name
hello = greet("Hello")
hello("Jane")
'Hello Jane'
gm = greet("Good morning")
gm("Bob")
'Good morning Bob'

MNIST & Logistic models

MNIST handwritten digits - simplified

from sklearn.datasets import load_digits
digits = load_digits()
X = digits.data
X.shape
(1797, 64)
X[0:2]
array([[ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,
         0.,  0., 13., 15., 10., 15.,  5.,  0.,
         0.,  3., 15.,  2.,  0., 11.,  8.,  0.,
         0.,  4., 12.,  0.,  0.,  8.,  8.,  0.,
         0.,  5.,  8.,  0.,  0.,  9.,  8.,  0.,
         0.,  4., 11.,  0.,  1., 12.,  7.,  0.,
         0.,  2., 14.,  5., 10., 12.,  0.,  0.,
         0.,  0.,  6., 13., 10.,  0.,  0.,  0.],
       [ 0.,  0.,  0., 12., 13.,  5.,  0.,  0.,
         0.,  0.,  0., 11., 16.,  9.,  0.,  0.,
         0.,  0.,  3., 15., 16.,  6.,  0.,  0.,
         0.,  7., 15., 16., 16.,  2.,  0.,  0.,
         0.,  0.,  1., 16., 16.,  3.,  0.,  0.,
         0.,  0.,  1., 16., 16.,  6.,  0.,  0.,
         0.,  0.,  1., 16., 16.,  6.,  0.,  0.,
         0.,  0.,  0., 11., 16., 10.,  0.,  0.]])
y = digits.target
y.shape
(1797,)
y[0:10]
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

Example digits
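The slide here shows a grid of example digit images; something similar can be reproduced with matplotlib (a sketch, not the original figure code, assuming the digits object from above):

import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 5, figsize=(10, 4))
for ax, img, label in zip(axes.flat, digits.images, digits.target):
    ax.imshow(img, cmap="gray_r")  # each digit is an 8x8 array of pixel intensities
    ax.set_title(label)
    ax.axis("off")
plt.show()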

Test/train split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, shuffle=True, random_state=1234
)
X_train.shape
(1437, 64)
y_train.shape
(1437,)
X_test.shape
(360, 64)
y_test.shape
(360,)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
lr = LogisticRegression(
  penalty=None
).fit(
  X_train, y_train
)
accuracy_score(y_train, lr.predict(X_train))
1.0
accuracy_score(y_test, lr.predict(X_test))
0.9583333333333334

As Torch tensors

X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train)
X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test)
X_train.shape
torch.Size([1437, 64])
y_train.shape
torch.Size([1437])
X_test.shape
torch.Size([360, 64])
y_test.shape
torch.Size([360])
X_train.dtype
torch.float32
y_train.dtype
torch.int64
X_test.dtype
torch.float32
y_test.dtype
torch.int64
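These dtypes matter: the model parameters defined below default to float32, so the features need to be floats, while torch.nn.CrossEntropyLoss() expects the targets to be int64 class indices. If the inputs arrived with other dtypes, casts along these lines would be needed (a sketch):

X_train = X_train.float()  # features must match the float32 parameters
y_train = y_train.long()   # labels must be int64 for CrossEntropyLoss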

PyTorch Model

class mnist_model(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.beta = torch.nn.Parameter(
          torch.randn(input_dim, output_dim, requires_grad=True)  
        )
        self.intercept = torch.nn.Parameter(
          torch.randn(output_dim, requires_grad=True)  
        )
        
    def forward(self, X):
        return (X @ self.beta + self.intercept).squeeze()
    
    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000):
      opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9) 
      losses = []
      
      for i in range(n):
          opt.zero_grad()
          loss = torch.nn.CrossEntropyLoss()(self(X_train), y_train)
          loss.backward()
          opt.step()
          
          losses.append(loss.item())
      
      return losses

Cross entropy loss

model = mnist_model(64, 10)
l = model.fit(X_train, y_train, X_test, y_test)

From the PyTorch documentation:

\[ \ell(x, y) = L = \{l_1, \ldots, l_N\}^{\top}, \quad l_n = -w_{y_n} \log \frac{\exp(x_{n, y_n})}{\sum_{c=1}^C \exp(x_{n, c})} \]

\[ \ell(x, y) = \begin{cases} \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n} \cdot 1\{y_n \neq \text{ignore\_index}\}} \, l_n, & \text{if reduction} = \text{'mean'} \\ \sum_{n=1}^N l_n, & \text{if reduction} = \text{'sum'} \end{cases} \]
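With the default reduction='mean' and unit weights this is just the average negative log-softmax of the true class. A small sketch checking that equivalence directly (the tensors here are arbitrary examples):

x = torch.randn(5, 3)              # 5 observations, 3 classes (unnormalized scores)
y = torch.tensor([0, 2, 1, 0, 2])  # true class indices

manual  = -torch.log_softmax(x, dim=1)[torch.arange(5), y].mean()
builtin = torch.nn.CrossEntropyLoss()(x, y)
torch.isclose(manual, builtin)     # tensor(True)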

Out-of-sample accuracy

model(X_test)
tensor([[ -49.9200,  -67.9009,  -67.7915,  -25.4841,    5.6110,
          -25.7693,  -51.8763,   25.9151,  -16.4802,  -62.9280],
        [ -37.2071,   60.9700,   -1.3581,   -5.8147,  -32.9665,
          -43.7874,  -36.7363,  -27.0210,  -26.2828,   58.2118],
        [ -79.5292,  -45.3445,  -49.9198,  -25.6674,  -21.3866,
          -78.1699,  -54.0964,   44.1203,  -19.2856,  -51.4893],
        ...,
        [  57.6947,  -18.9365,  -16.4728,  -38.6695,  -47.4915,
           -1.5065,  -37.4565,  -36.2351,  -10.2007,   14.2575],
        [ -55.3503,   -6.3520,  -45.7122,  -53.1162,    3.1417,
          -29.9937,  -38.4711,   58.1723,  -44.1398,   16.8157],
        [ -56.4888,  -15.6005,  -12.4646,   45.8520,  -71.5193,
           -0.5924,  -33.1525,  -25.6992,    8.2602,  -23.6719]],
       grad_fn=<SqueezeBackward0>)
val, index = torch.max(model(X_test), dim=1)
index
tensor([7, 1, 7, 6, 0, 2, 4, 3, 6, 3, 7, 8, 7,
        9, 4, 3, 8, 7, 8, 4, 0, 3, 9, 1, 3, 6,
        6, 0, 5, 4, 1, 6, 1, 2, 3, 2, 7, 6, 4,
        8, 6, 4, 4, 0, 9, 1, 8, 5, 4, 4, 4, 1,
        7, 6, 3, 2, 9, 9, 9, 0, 9, 3, 1, 8, 8,
        8, 3, 9, 1, 3, 9, 5, 9, 5, 2, 1, 9, 2,
        1, 3, 8, 7, 3, 3, 8, 7, 7, 5, 8, 2, 1,
        1, 9, 1, 6, 4, 5, 2, 2, 4, 5, 4, 7, 6,
        5, 8, 2, 4, 1, 0, 7, 6, 1, 2, 9, 5, 2,
        5, 0, 3, 2, 7, 6, 0, 9, 2, 1, 1, 6, 7,
        6, 2, 7, 4, 7, 5, 0, 9, 1, 0, 5, 6, 7,
        6, 3, 8, 3, 2, 0, 4, 4, 9, 5, 4, 6, 1,
        1, 1, 6, 1, 7, 9, 0, 7, 9, 5, 4, 1, 3,
        8, 6, 4, 7, 1, 5, 7, 4, 7, 4, 5, 8, 2,
        1, 1, 4, 4, 3, 5, 5, 9, 4, 5, 5, 9, 3,
        9, 3, 1, 2, 0, 8, 2, 9, 3, 2, 4, 6, 8,
        3, 5, 1, 0, 8, 1, 8, 5, 6, 8, 7, 1, 8,
        3, 4, 9, 7, 0, 5, 5, 6, 1, 3, 0, 5, 8,
        2, 0, 9, 8, 6, 7, 8, 4, 1, 0, 5, 1, 5,
        1, 6, 4, 7, 1, 2, 6, 4, 4, 6, 3, 2, 3,
        2, 6, 5, 2, 9, 4, 7, 0, 1, 0, 4, 3, 1,
        2, 7, 9, 8, 5, 9, 5, 7, 0, 4, 8, 4, 9,
        4, 0, 7, 7, 8, 5, 3, 5, 3, 8, 7, 5, 5,
        2, 7, 0, 8, 9, 1, 7, 9, 8, 5, 0, 5, 0,
        8, 7, 6, 9, 5, 5, 9, 6, 1, 2, 3, 9, 6,
        3, 2, 9, 3, 4, 3, 4, 1, 8, 1, 8, 5, 0,
        9, 2, 7, 2, 3, 5, 2, 6, 3, 4, 1, 5, 0,
        5, 7, 6, 3, 2, 5, 0, 7, 3])
(index == y_test).sum()
tensor(333)
(index == y_test).sum() / len(y_test)
tensor(0.9250)

Calculating Accuracy

class mnist_model(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.beta = torch.nn.Parameter(
          torch.randn(input_dim, output_dim, requires_grad=True)  
        )
        self.intercept = torch.nn.Parameter(
          torch.randn(output_dim, requires_grad=True)  
        )
        
    def forward(self, X):
        return (X @ self.beta + self.intercept).squeeze()
    
    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000, acc_step=10):
      opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9) 
      losses, train_acc, test_acc = [], [], []
      
      for i in range(n):
          opt.zero_grad()
          loss = torch.nn.CrossEntropyLoss()(self(X_train), y_train)
          loss.backward()
          opt.step()
          losses.append(loss.item())
          
          if (i+1) % acc_step == 0:
            val, train_pred = torch.max(self(X_train), dim=1)
            val, test_pred  = torch.max(self(X_test), dim=1)
            
            train_acc.append( (train_pred == y_train).sum() / len(y_train) )
            test_acc.append( (test_pred == y_test).sum() / len(y_test) )
            
      return (losses, train_acc, test_acc)

Performance

loss, train_acc, test_acc = mnist_model(
  64, 10
).fit(
  X_train, y_train, X_test, y_test, acc_step=10, n=3000
)

NN Layers

class mnist_nn_model(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)
        
    def forward(self, X):
        return self.linear(X)
    
    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000, acc_step=10):
      opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9) 
      losses, train_acc, test_acc = [], [], []
      
      for i in range(n):
          opt.zero_grad()
          loss = torch.nn.CrossEntropyLoss()(self(X_train), y_train)
          loss.backward()
          opt.step()
          losses.append(loss.item())
          
          if (i+1) % acc_step == 0:
            val, train_pred = torch.max(self(X_train), dim=1)
            val, test_pred  = torch.max(self(X_test), dim=1)
            
            train_acc.append( (train_pred == y_train).sum() / len(y_train) )
            test_acc.append( (test_pred == y_test).sum() / len(y_test) )
            
      return (losses, train_acc, test_acc)

NN linear layer

Applies a linear transform to the incoming data (\(X\)): \[y = X A^T+b\]

X.shape
(1797, 64)
model = mnist_nn_model(64, 10)
model.parameters()
<generator object Module.parameters at 0x17bf99c40>
list(model.parameters())[0].shape  # A - weights (betas)
torch.Size([10, 64])
list(model.parameters())[1].shape  # b - bias
torch.Size([10])
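The layer is exactly the affine map above, which can be checked against the extracted weight and bias (a quick sketch):

A, b = model.parameters()               # weights (10, 64) and bias (10,)
manual = X_train @ A.T + b              # y = X A^T + b
torch.allclose(manual, model(X_train))  # True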

Performance

loss, train_acc, test_acc = model.fit(X_train, y_train, X_test, y_test, n=1000)
train_acc[-5:]
[tensor(0.9882), tensor(0.9889), tensor(0.9889), tensor(0.9896), tensor(0.9896)]
test_acc[-5:]
[tensor(0.9667), tensor(0.9667), tensor(0.9667), tensor(0.9667), tensor(0.9667)]

Feedforward Neural Network

FNN Model

class mnist_fnn_model(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, nl_step = torch.nn.ReLU(), seed=1234):
        super().__init__()
        self.l1 = torch.nn.Linear(input_dim, hidden_dim)
        self.nl = nl_step
        self.l2 = torch.nn.Linear(hidden_dim, output_dim)
        
    def forward(self, X):
        out = self.l1(X)
        out = self.nl(out)
        out = self.l2(out)
        return out
    
    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000, acc_step=10):
      opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9) 
      losses, train_acc, test_acc = [], [], []
      
      for i in range(n):
          opt.zero_grad()
          loss = torch.nn.CrossEntropyLoss()(self(X_train), y_train)
          loss.backward()
          opt.step()
          
          losses.append(loss.item())
          
          if (i+1) % acc_step == 0:
            val, train_pred = torch.max(self(X_train), dim=1)
            val, test_pred  = torch.max(self(X_test), dim=1)
            
            train_acc.append( (train_pred == y_train).sum().item() / len(y_train) )
            test_acc.append( (test_pred == y_test).sum().item() / len(y_test) )
            
      return (losses, train_acc, test_acc)

Non-linear activation functions

\[\text{Tanh}(x) = \frac{\exp(x)-\exp(-x)}{\exp(x) + \exp(-x)}\]

\[\text{ReLU}(x) = \max(0,x)\]
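Both are applied elementwise, so evaluating them on a few values makes the difference concrete:

z = torch.tensor([-2., -0.5, 0., 0.5, 2.])
torch.nn.ReLU()(z)  # tensor([0.0000, 0.0000, 0.0000, 0.5000, 2.0000])
torch.nn.Tanh()(z)  # tensor([-0.9640, -0.4621,  0.0000,  0.4621,  0.9640])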

Model parameters

model = mnist_fnn_model(64,64,10)
len(list(model.parameters()))
4
for i, p in enumerate(model.parameters()):
  print("Param", i, p.shape)
Param 0 torch.Size([64, 64])
Param 1 torch.Size([64])
Param 2 torch.Size([10, 64])
Param 3 torch.Size([10])
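The attribute names from __init__() carry through to named_parameters(), which makes it easier to see which tensor belongs to which layer:

for name, p in model.named_parameters():
  print(name, p.shape)
l1.weight torch.Size([64, 64])
l1.bias torch.Size([64])
l2.weight torch.Size([10, 64])
l2.bias torch.Size([10])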

Performance - ReLU

loss, train_acc, test_acc = mnist_fnn_model(64,64,10).fit(
  X_train, y_train, X_test, y_test, n=2000
)
train_acc[-5:]
[0.9986082115518441, 0.9986082115518441, 0.9986082115518441, 0.9986082115518441, 0.9986082115518441]
test_acc[-5:]
[0.9638888888888889, 0.9638888888888889, 0.9638888888888889, 0.9638888888888889, 0.9638888888888889]

Performance - Tanh

loss, train_acc, test_acc = mnist_fnn_model(64,64,10, nl_step=torch.nn.Tanh()).fit(
  X_train, y_train, X_test, y_test, n=2000
)
train_acc[-5:]
[0.9951287404314544, 0.9958246346555324, 0.9958246346555324, 0.9958246346555324, 0.9958246346555324]
test_acc[-5:]
[0.9722222222222222, 0.9722222222222222, 0.9722222222222222, 0.9722222222222222, 0.9722222222222222]

Adding another layer

class mnist_fnn2_model(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, nl_step = torch.nn.ReLU(), seed=1234):
        super().__init__()
        self.l1 = torch.nn.Linear(input_dim, hidden_dim)
        self.nl1 = nl_step
        self.l2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.nl2 = nl_step
        self.l3 = torch.nn.Linear(hidden_dim, output_dim)
        
    def forward(self, X):
        out = self.l1(X)
        out = self.nl1(out)
        out = self.l2(out)
        out = self.nl2(out)
        out = self.l3(out)
        return out
    
    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000, acc_step=10):
      loss_fn = torch.nn.CrossEntropyLoss()
      opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9) 
      losses, train_acc, test_acc = [], [], []
      
      for i in range(n):
          opt.zero_grad()
          loss = loss_fn(self(X_train), y_train)
          loss.backward()
          opt.step()
          
          losses.append(loss.item())
          
          if (i+1) % acc_step == 0:
            val, train_pred = torch.max(self(X_train), dim=1)
            val, test_pred  = torch.max(self(X_test), dim=1)
            
            train_acc.append( (train_pred == y_train).sum().item() / len(y_train) )
            test_acc.append( (test_pred == y_test).sum().item() / len(y_test) )
            
      return (losses, train_acc, test_acc)

Performance - ReLU

loss, train_acc, test_acc = mnist_fnn2_model(
  64,64,10, nl_step=torch.nn.ReLU()
).fit(
  X_train, y_train, X_test, y_test, n=1000
)
train_acc[-5:]
[0.988169798190675, 0.988169798190675, 0.988169798190675, 0.988169798190675, 0.988865692414753]
test_acc[-5:]
[0.9638888888888889, 0.9638888888888889, 0.9638888888888889, 0.9638888888888889, 0.9638888888888889]

Performance - Tanh

loss, train_acc, test_acc = mnist_fnn2_model(
  64,64,10, nl_step=torch.nn.Tanh()
).fit(
  X_train, y_train, X_test, y_test, n=1000
)
train_acc[-5:]
[0.9798190675017397, 0.9798190675017397, 0.9798190675017397, 0.9805149617258176, 0.9805149617258176]
test_acc[-5:]
[0.9611111111111111, 0.9611111111111111, 0.9611111111111111, 0.9611111111111111, 0.9611111111111111]

Convolutional NN

2d convolutions

nn.Conv2d()

cv = torch.nn.Conv2d(
  in_channels=1, out_channels=4, 
  kernel_size=3, 
  stride=1, padding=1
)
list(cv.parameters())[0] # kernel weights
Parameter containing:
tensor([[[[-0.0916, -0.1771,  0.2714],
          [ 0.2181, -0.3278, -0.1257],
          [-0.2952, -0.3100, -0.0557]]],

        [[[-0.2708, -0.1664,  0.2959],
          [-0.2933, -0.0739,  0.2642],
          [ 0.3022, -0.2897, -0.1198]]],

        [[[-0.1436, -0.2614, -0.3112],
          [-0.1443,  0.0939,  0.2256],
          [ 0.1558, -0.2530,  0.1756]]],

        [[[ 0.2582, -0.1870,  0.0828],
          [ 0.1618,  0.1541,  0.1748],
          [ 0.0206, -0.2393,  0.0881]]]],
       requires_grad=True)
list(cv.parameters())[1] # biases
Parameter containing:
tensor([-0.1378, -0.0058,  0.1867, -0.2046],
       requires_grad=True)

Applying Conv2d()

X_train[[0]]
tensor([[ 0.,  0.,  0., 10., 11.,  0.,  0.,  0.,
          0.,  0.,  9., 16.,  6.,  0.,  0.,  0.,
          0.,  0., 15., 13.,  0.,  0.,  0.,  0.,
          0.,  0., 14., 10.,  0.,  0.,  0.,  0.,
          0.,  1., 15., 12.,  8.,  2.,  0.,  0.,
          0.,  0., 12., 16., 16., 16., 10.,  1.,
          0.,  0.,  7., 16., 12., 12., 16.,  4.,
          0.,  0.,  0.,  9., 15., 12.,  5.,  0.]])
X_train[[0]].shape
torch.Size([1, 64])
cv(X_train[[0]])
RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [1, 64]
X_train[[0]].view(1,8,8)
tensor([[[ 0.,  0.,  0., 10., 11.,  0.,  0.,  0.],
         [ 0.,  0.,  9., 16.,  6.,  0.,  0.,  0.],
         [ 0.,  0., 15., 13.,  0.,  0.,  0.,  0.],
         [ 0.,  0., 14., 10.,  0.,  0.,  0.,  0.],
         [ 0.,  1., 15., 12.,  8.,  2.,  0.,  0.],
         [ 0.,  0., 12., 16., 16., 16., 10.,  1.],
         [ 0.,  0.,  7., 16., 12., 12., 16.,  4.],
         [ 0.,  0.,  0.,  9., 15., 12.,  5.,  0.]]])
cv(X_train[[0]].view(1,8,8))
tensor([[[-1.3784e-01, -6.3952e-01, -5.0761e+00, -1.2749e+01,
          -8.1465e+00,  4.8966e-01, -1.3784e-01, -1.3784e-01],
         [-1.3784e-01, -2.1049e+00, -7.7594e+00, -1.1418e+01,
          -5.3177e+00,  1.6247e-01, -1.3784e-01, -1.3784e-01],
         [-1.3784e-01, -3.6071e-01, -8.8376e+00, -1.0391e+01,
          -2.7838e+00, -6.8770e-01, -1.3784e-01, -1.3784e-01],
         [-1.9358e-01,  1.0275e+00, -1.0726e+01, -1.2634e+01,
          -5.2823e+00, -3.1194e+00, -7.2824e-01, -1.3784e-01],
         [-2.6349e-01,  7.7982e-01, -1.0722e+01, -1.4254e+01,
          -1.1886e+01, -9.2893e+00, -7.5805e+00, -3.3998e+00],
         [ 1.3354e-01,  1.8576e+00, -8.6353e+00, -1.3800e+01,
          -1.4990e+01, -1.2392e+01, -8.9611e+00, -4.2482e+00],
         [-1.3784e-01,  2.2391e+00, -2.7277e+00, -8.5814e+00,
          -1.0024e+01, -1.3478e+01, -1.1327e+01, -5.2966e-01],
         [-1.3784e-01,  1.7618e+00,  1.8338e+00, -5.1914e+00,
          -4.9353e+00, -3.1174e-01, -2.0076e+00, -1.2221e+00]],

        [[-5.8134e-03, -1.0837e+00, -1.8867e+00, -4.7128e-01,
          -6.5383e-01, -1.4187e+00, -5.8134e-03, -5.8134e-03],
         [-5.8134e-03,  5.7591e-01,  6.1430e-01,  1.1665e-01,
          -5.7514e+00, -4.7446e+00, -5.8134e-03, -5.8134e-03],
         [-5.8134e-03,  4.9442e+00,  3.0528e-01, -7.3554e+00,
          -6.1281e+00, -1.6308e+00, -5.8134e-03, -5.8134e-03],
         [-1.2558e-01,  6.0459e+00, -2.5267e+00, -1.0977e+01,
          -5.3900e+00,  1.8324e+00,  5.9858e-01, -5.8134e-03],
         [ 2.5843e-01,  6.5894e+00, -2.9988e+00, -1.1558e+01,
          -8.0117e+00, -3.4971e+00,  1.2262e+00,  2.7265e+00],
         [ 2.9008e-01,  6.5988e+00,  1.7594e-01, -8.1275e+00,
          -5.7194e+00, -7.5028e+00, -7.2017e+00,  6.6403e-01],
         [-5.8134e-03,  5.3947e+00,  5.3649e+00, -5.6509e+00,
          -7.7373e+00, -3.7621e+00, -7.1731e+00, -6.3574e+00],
         [-5.8134e-03,  2.0655e+00,  5.9420e+00,  2.2858e+00,
          -3.3614e+00, -4.4823e+00, -8.6231e+00, -6.4711e+00]],

        [[ 1.8672e-01,  1.7670e+00,  2.9756e+00,  2.0153e+00,
           7.5137e-01, -4.6553e-01,  1.8672e-01,  1.8672e-01],
         [ 1.8672e-01,  4.8512e+00,  1.7939e-02, -5.2445e+00,
          -3.8446e+00, -2.2588e+00,  1.8672e-01,  1.8672e-01],
         [ 1.8672e-01,  3.2290e+00, -4.5890e+00, -8.4476e+00,
          -3.9972e+00, -6.7504e-01,  1.8672e-01,  1.8672e-01],
         [ 3.6230e-01,  1.0590e+00, -5.7406e+00, -5.7411e+00,
          -2.9263e+00,  9.2711e-01,  4.9831e-01,  1.8672e-01],
         [ 4.1235e-01,  1.4159e+00, -2.8394e+00, -3.0394e+00,
          -5.2429e-01, -5.7892e-01,  3.6640e-02,  1.4917e+00],
         [-1.2444e-01, -8.0537e-01, -1.8366e+00, -5.0630e+00,
           1.1741e-01,  1.6079e+00, -2.7205e+00,  3.1863e-01],
         [ 1.8672e-01, -1.9678e+00, -2.0810e+00, -7.1410e+00,
          -1.0032e+01, -6.2207e+00, -3.7585e+00, -2.6649e+00],
         [ 1.8672e-01, -1.9914e+00, -4.5908e+00, -4.5054e+00,
          -6.1647e+00, -9.5615e+00, -8.2256e+00, -3.8782e+00]],

        [[-2.0463e-01,  5.8855e-01,  8.0009e-01,  1.4583e-01,
           2.0033e+00,  1.6988e+00, -2.0463e-01, -2.0463e-01],
         [-2.0463e-01,  2.6904e+00,  2.3645e+00,  1.0069e+00,
           4.1017e+00,  3.6058e+00, -2.0463e-01, -2.0463e-01],
         [-2.0463e-01,  4.3965e+00,  1.5540e+00,  1.9510e+00,
           5.1134e+00,  1.3443e+00, -2.0463e-01, -2.0463e-01],
         [-1.1650e-01,  4.5676e+00, -5.3724e-01,  3.1868e+00,
           3.2790e+00, -5.1812e-01, -1.6338e-01, -2.0463e-01],
         [-2.9845e-02,  4.7885e+00,  1.1164e+00,  5.0441e+00,
           3.8132e+00, -1.2187e+00, -1.8554e+00, -2.3762e-01],
         [-1.2180e-01,  3.5652e+00,  2.6244e+00,  6.6648e+00,
           7.9310e+00,  7.0759e+00,  1.3885e+00,  9.4040e-01],
         [-2.0463e-01,  2.0129e+00,  3.5457e+00,  6.0918e+00,
           6.4496e+00,  6.2292e+00,  6.2969e+00,  5.4982e+00],
         [-2.0463e-01,  3.7519e-01,  1.3849e+00,  3.6138e+00,
           8.5417e+00,  7.1252e+00,  2.9450e+00,  3.9868e+00]]],
       grad_fn=<SqueezeBackward1>)
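With kernel_size=3, stride=1, and padding=1 the 8x8 spatial dimensions are preserved, so the single input channel becomes 4 feature maps of the same size:

cv(X_train[[0]].view(1,8,8)).shape
torch.Size([4, 8, 8])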

Pooling

x = torch.tensor(
  [[[0,0,0,0],
    [0,1,2,0],
    [0,3,4,0],
    [0,0,0,0]]],
  dtype=torch.float
)
x.shape
torch.Size([1, 4, 4])
torch.nn.MaxPool2d(
  kernel_size=2, stride=1
)(x)
tensor([[[1., 2., 2.],
         [3., 4., 4.],
         [3., 4., 4.]]])
torch.nn.MaxPool2d(
  kernel_size=3, stride=1, padding=1
)(x)
tensor([[[1., 2., 2., 2.],
         [3., 4., 4., 4.],
         [3., 4., 4., 4.],
         [3., 4., 4., 4.]]])
torch.nn.AvgPool2d(
  kernel_size=2
)(x)
tensor([[[0.2500, 0.5000],
         [0.7500, 1.0000]]])
torch.nn.AvgPool2d(
  kernel_size=2, padding=1
)(x)
tensor([[[0.0000, 0.0000, 0.0000],
         [0.0000, 2.5000, 0.0000],
         [0.0000, 0.0000, 0.0000]]])
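The output sizes above all follow the usual pooling size formula; for a square input of side \(H\) with kernel size \(k\), stride \(s\), and padding \(p\):

\[ H_{\text{out}} = \left\lfloor \frac{H + 2p - k}{s} \right\rfloor + 1 \]

e.g. \(\lfloor (4 - 2)/1 \rfloor + 1 = 3\) for the first MaxPool2d example, and \(\lfloor (4 - 2)/2 \rfloor + 1 = 2\) for AvgPool2d(kernel_size=2), since the stride defaults to the kernel size.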

Convolutional model

class mnist_conv_model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.cnn  = torch.nn.Conv2d(
          in_channels=1, out_channels=8,
          kernel_size=3, stride=1, padding=1
        )
        self.relu = torch.nn.ReLU()
        self.pool = torch.nn.MaxPool2d(kernel_size=2)
        self.lin  = torch.nn.Linear(8 * 4 * 4, 10)
        
    def forward(self, X):
        out = self.cnn(X.view(-1, 1, 8, 8))
        out = self.relu(out)
        out = self.pool(out)
        out = self.lin(out.view(-1, 8 * 4 * 4))
        return out
    
    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000, acc_step=10):
      loss_fn = torch.nn.CrossEntropyLoss()
      opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9) 
      losses, train_acc, test_acc = [], [], []
      
      for i in range(n):
          opt.zero_grad()
          loss = loss_fn(self(X_train), y_train)
          loss.backward()
          opt.step()
          
          losses.append(loss.item())
          
          if (i+1) % acc_step == 0:
            val, train_pred = torch.max(self(X_train), dim=1)
            val, test_pred  = torch.max(self(X_test), dim=1)
            
            train_acc.append( (train_pred == y_train).sum().item() / len(y_train) )
            test_acc.append( (test_pred == y_test).sum().item() / len(y_test) )
            
      return (losses, train_acc, test_acc)

Performance

loss, train_acc, test_acc = mnist_conv_model().fit(
  X_train, y_train, X_test, y_test, n=1000
)
train_acc[-5:]
[0.9916492693110647, 0.9923451635351427, 0.9923451635351427, 0.9923451635351427, 0.9923451635351427]
test_acc[-5:]
[0.9666666666666667, 0.9666666666666667, 0.9638888888888889, 0.9638888888888889, 0.9638888888888889]

Organizing models

class mnist_conv_model2(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = torch.nn.Sequential(
          torch.nn.Unflatten(1, (1,8,8)),
          torch.nn.Conv2d(
            in_channels=1, out_channels=8,
            kernel_size=3, stride=1, padding=1
          ),
          torch.nn.ReLU(),
          torch.nn.MaxPool2d(kernel_size=2),
          torch.nn.Flatten(),
          torch.nn.Linear(8 * 4 * 4, 10)
        )
        
    def forward(self, X):
        return self.model(X)
    
    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000, acc_step=10):
      opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9) 
      losses, train_acc, test_acc = [], [], []
      
      for i in range(n):
          opt.zero_grad()
          loss = torch.nn.CrossEntropyLoss()(self(X_train), y_train)
          loss.backward()
          opt.step()
          
          losses.append(loss.item())
          
          if (i+1) % acc_step == 0:
            val, train_pred = torch.max(self(X_train), dim=1)
            val, test_pred  = torch.max(self(X_test), dim=1)
            
            train_acc.append( (train_pred == y_train).sum() / len(y_train) )
            test_acc.append( (test_pred == y_test).sum() / len(y_test) )
            
      return (losses, train_acc, test_acc)
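A side benefit of torch.nn.Sequential is introspection - the container prints as a readable list of layers and supports indexing, so individual layers are easy to get at (a quick sketch):

m = mnist_conv_model2()
print(m)                 # nested repr, one line per layer
m.model[1]               # Sequential is indexable - here the Conv2d layer
m.model[1].weight.shape  # torch.Size([8, 1, 3, 3])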

A bit more on non-linear activation layers

Non-linear functions

import pandas as pd

df = pd.read_csv("data/gp.csv")
X = torch.tensor(df["x"], dtype=torch.float32).reshape(-1,1)
y = torch.tensor(df["y"], dtype=torch.float32)

Linear regression

class lin_reg(torch.nn.Module):
    def __init__(self, X):
        super().__init__()
        self.n = X.shape[0]
        self.p = X.shape[1]
        self.model = torch.nn.Sequential(
          torch.nn.Linear(self.p, self.p)
        )
    
    def forward(self, X):
        return self.model(X)
    
    def fit(self, X, y, n=1000):
      losses = []
      opt = torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
      for i in range(n):
          loss = torch.nn.MSELoss()(self(X).squeeze(), y)
          loss.backward()
          opt.step()
          opt.zero_grad()
          losses.append(loss.item())
      
      return losses

Model results

m1 = lin_reg(X)
loss = m1.fit(X,y, n=2000)

Training loss:

Predictions

Double linear regression

class dbl_lin_reg(torch.nn.Module):
    def __init__(self, X, hidden_dim=10):
        super().__init__()
        self.n = X.shape[0]
        self.p = X.shape[1]
        self.model = torch.nn.Sequential(
          torch.nn.Linear(self.p, hidden_dim),
          torch.nn.Linear(hidden_dim, 1)
        )
    
    def forward(self, X):
        return self.model(X)
    
    def fit(self, X, y, n=1000):
      losses = []
      opt = torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
      for i in range(n):
          loss = torch.nn.MSELoss()(self(X).squeeze(), y)
          loss.backward()
          opt.step()
          opt.zero_grad()
          losses.append(loss.item())
      
      return losses
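Note that stacking two linear layers with no non-linearity between them is still just an affine map, since \(W_2(W_1 x + b_1) + b_2 = (W_2 W_1) x + (W_2 b_1 + b_2)\) - so this model cannot fit anything a single Linear layer could not. A quick sketch verifying the collapse:

l1 = torch.nn.Linear(1, 10)
l2 = torch.nn.Linear(10, 1)

W = l2.weight @ l1.weight          # combined slope,     shape (1, 1)
b = l2.weight @ l1.bias + l2.bias  # combined intercept, shape (1,)

x = torch.randn(5, 1)
torch.allclose(l2(l1(x)), x @ W.T + b)  # True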

Model results

m2 = dbl_lin_reg(X, hidden_dim=10)
loss = m2.fit(X,y, n=2000)

Training loss:

Predictions

Non-linear regression w/ ReLU

class lin_reg_relu(torch.nn.Module):
    def __init__(self, X, hidden_dim=100):
        super().__init__()
        self.n = X.shape[0]
        self.p = X.shape[1]
        self.model = torch.nn.Sequential(
          torch.nn.Linear(self.p, hidden_dim),
          torch.nn.ReLU(),
          torch.nn.Linear(hidden_dim, 1)
        )
    
    def forward(self, X):
        return self.model(X)
    
    def fit(self, X, y, n=1000):
      losses = []
      opt = torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
      for i in range(n):
          loss = torch.nn.MSELoss()(self(X).squeeze(), y)
          loss.backward()
          opt.step()
          opt.zero_grad()
          losses.append(loss.item())
      
      return losses

Model results

Hidden dimensions

Non-linear regression w/ Tanh

class lin_reg_tanh(torch.nn.Module):
    def __init__(self, X, hidden_dim=10):
        super().__init__()
        self.n = X.shape[0]
        self.p = X.shape[1]
        self.model = torch.nn.Sequential(
          torch.nn.Linear(self.p, hidden_dim),
          torch.nn.Tanh(),
          torch.nn.Linear(hidden_dim, 1)
        )
    
    def forward(self, X):
        return self.model(X)
    
    def fit(self, X, y, n=1000):
      losses = []
      opt = torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
      for i in range(n):
          loss = torch.nn.MSELoss()(self(X).squeeze(), y)
          loss.backward()
          opt.step()
          opt.zero_grad()
          losses.append(loss.item())
      
      return losses

Tanh & hidden dimension

Three layers

class three_layers(torch.nn.Module):
    def __init__(self, X, hidden_dim=100):
        super().__init__()
        self.n = X.shape[0]
        self.p = X.shape[1]
        self.model = torch.nn.Sequential(
          torch.nn.Linear(self.p, hidden_dim),
          torch.nn.ReLU(),
          torch.nn.Linear(hidden_dim, hidden_dim),
          torch.nn.ReLU(),
          torch.nn.Linear(hidden_dim, 1)
        )
    
    def forward(self, X):
        return self.model(X)
    
    def fit(self, X, y, n=1000):
      losses = []
      opt = torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
      for i in range(n):
          loss = torch.nn.MSELoss()(self(X).squeeze(), y)
          loss.backward()
          opt.step()
          opt.zero_grad()
          losses.append(loss.item())
      
      return losses

Model results

Five layers

class five_layers(torch.nn.Module):
    def __init__(self, X, hidden_dim=100):
        super().__init__()
        self.n = X.shape[0]
        self.p = X.shape[1]
        self.model = torch.nn.Sequential(
          torch.nn.Linear(self.p, hidden_dim),
          torch.nn.ReLU(),
          torch.nn.Linear(hidden_dim, hidden_dim),
          torch.nn.ReLU(),
          torch.nn.Linear(hidden_dim, hidden_dim),
          torch.nn.ReLU(),
          torch.nn.Linear(hidden_dim, hidden_dim),
          torch.nn.ReLU(),
          torch.nn.Linear(hidden_dim, 1)
        )
    
    def forward(self, X):
        return self.model(X)
    
    def fit(self, X, y, n=1000):
      losses = []
      opt = torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
      for i in range(n):
          loss = torch.nn.MSELoss()(self(X).squeeze(), y)
          loss.backward()
          opt.step()
          opt.zero_grad()
          losses.append(loss.item())
      
      return losses

Model results