
Deep Learning and Scientific Computing with R torch

薛英杰 / 2023-09-04


PyTorch is a framework for deep learning and scientific computing that is widely used in both industry and academia; the R community has developed torch, which provides an R interface to this framework. We introduce torch in three parts.

Basic torch modules

1. Tensors

To use torch, we first need to understand tensors. A tensor here is not the tensor of mathematics or physics; in machine-learning frameworks, tensors are simply multi-dimensional arrays designed for fast computation. A torch tensor is an array of arbitrary dimensionality, built for fast, standardized mathematical operations, and you can move it to a GPU.

Technically, a tensor behaves much like an R6 object: you can access its fields with the $ operator, as follows:

library(torch)
library(torchvision)
library(luz)
t1 <- torch_tensor(1)
t1
## torch_tensor
##  1
## [ CPUFloatType{1} ]
t1$dtype
## torch_Float
t1$device
## torch_device(type='cpu')
t1$shape
## [1] 1

We can use a tensor's $to() method to change its properties, for example:

t2 <- t1$to(dtype = torch_int())
t2$dtype
## torch_Int
# only applicable if you have a GPU
t2 <- t1$to(device = "cuda")
t2$device
## torch_device(type='cuda', index=0)

A tensor's shape deserves a separate look. We can reshape a one-dimensional tensor into a two-dimensional one, for example:

t3 <- t1$view(c(1, 1))
t3$shape
## [1] 1 1

Conceptually, this is like the distinction in R between a one-element vector and a one-element matrix:

c(1)
## [1] 1
matrix(1)
##      [,1]
## [1,]    1
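
As a quick sketch, the same $view() call reshapes larger tensors too, for example a length-6 tensor into a 2 × 3 matrix:

t4 <- torch_tensor(1:6)$view(c(2, 3))
t4$shape
## [1] 2 3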

2. Creating tensors

So far we have called torch_tensor() with a single value. The same function creates higher-dimensional objects: whenever we have many values to pass, torch_tensor() still works.

For example, to get a one-dimensional tensor we can pass a longer vector to torch_tensor(), as follows:

torch_tensor(1:5)
## torch_tensor
##  1
##  2
##  3
##  4
##  5
## [ CPULongType{5} ]
torch_tensor(1:5, dtype = torch_float())
## torch_tensor
##  1
##  2
##  3
##  4
##  5
## [ CPUFloatType{5} ]
torch_tensor(1:5, device = "cuda")
## torch_tensor
##  1
##  2
##  3
##  4
##  5
## [ CUDALongType{5} ]

We can pass a matrix in the same way.

torch_tensor(matrix(1:9, ncol = 3))
## torch_tensor
##  1  4  7
##  2  5  8
##  3  6  9
## [ CPULongType{3,3} ]
torch_tensor(matrix(1:9, ncol = 3, byrow = TRUE))
## torch_tensor
##  1  2  3
##  4  5  6
##  7  8  9
## [ CPULongType{3,3} ]

For higher-dimensional data the same principle applies: just pass an array.

torch_tensor(array(1:24, dim = c(4, 3, 2)))
## torch_tensor
## (1,.,.) = 
##    1  13
##    5  17
##    9  21
## 
## (2,.,.) = 
##    2  14
##    6  18
##   10  22
## 
## (3,.,.) = 
##    3  15
##    7  19
##   11  23
## 
## (4,.,.) = 
##    4  16
##    8  20
##   12  24
## [ CPULongType{4,3,2} ]

When you care about the distribution of the values rather than the individual values themselves, you can use torch_randn(), which samples from a standard normal distribution (not, as one might guess, a uniform distribution on [0, 1]; for that, see torch_rand() below).

torch_randn(3, 3)
## torch_tensor
## -1.5090 -1.9471 -1.0123
##  0.4907 -0.6271  0.1965
##  1.2400  0.0293  0.5151
## [ CPUFloatType{3,3} ]
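
For uniform draws on [0, 1], torch_rand() is the counterpart; a quick sketch (output omitted, since the draws are random):

# uniform draws on [0, 1]
torch_rand(3, 3)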

If we want a tensor whose values are all zeros or all ones, we can use the following:

torch_zeros(2, 5)
## torch_tensor
##  0  0  0  0  0
##  0  0  0  0  0
## [ CPUFloatType{2,5} ]
torch_ones(2, 2)
## torch_tensor
##  1  1
##  1  1
## [ CPUFloatType{2,2} ]

Create an identity matrix, as in linear algebra:

torch_eye(n = 5)
## torch_tensor
##  1  0  0  0  0
##  0  1  0  0  0
##  0  0  1  0  0
##  0  0  0  1  0
##  0  0  0  0  1
## [ CPUFloatType{5,5} ]

Create a diagonal matrix:

torch_diag(c(1, 2, 3))
## torch_tensor
##  1  0  0
##  0  2  0
##  0  0  3
## [ CPUFloatType{3,3} ]

Create a tensor from an R dataset (here the built-in JohnsonJohnson time series):

torch_tensor(JohnsonJohnson)
## torch_tensor
##   0.7100
##   0.6300
##   0.8500
##   0.4400
##   0.6100
##   0.6900
##   0.9200
##   0.5500
##   0.7200
##   0.7700
##   0.9200
##   0.6000
##   0.8300
##   0.8000
##   1.0000
##   0.7700
##   0.9200
##   1.0000
##   1.2400
##   1.0000
##   1.1600
##   1.3000
##   1.4500
##   1.2500
##   1.2600
##   1.3800
##   1.8600
##   1.5600
##   1.5300
##   1.5900
## ... [the output was truncated (use n=-1 to disable)]
## [ CPUFloatType{84} ]
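
As the truncation notice says, the print method takes an n argument; n = -1 prints every value:

# print all 84 values instead of the truncated view
print(torch_tensor(JohnsonJohnson), n = -1)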

3. Operations on tensors

Tensors support the usual mathematical operations: addition, subtraction, multiplication, division, and so on. Each is available both as a function and as a method, for example:

Addition

t1 <- torch_tensor(c(1, 2))
t2 <- torch_tensor(c(3, 4))

torch_add(t1, t2)
## torch_tensor
##  4
##  6
## [ CPUFloatType{2} ]
# equivalently
t1$add(t2)
## torch_tensor
##  4
##  6
## [ CPUFloatType{2} ]
# add_() modifies t1 in place (note the trailing underscore)
t1$add_(t2)
## torch_tensor
##  4
##  6
## [ CPUFloatType{2} ]
t1
## torch_tensor
##  4
##  6
## [ CPUFloatType{2} ]
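
Subtraction, multiplication, and division follow the same pattern (torch_sub(), torch_mul(), torch_div(), each with method and in-place variants). torch also overloads the usual R operators; a minimal sketch:

t1 <- torch_tensor(c(1, 2))
t2 <- torch_tensor(c(3, 4))
t1 + t2  # same as torch_add(t1, t2)
t1 * t2  # elementwise product
t1 / t2  # elementwise division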

Vector dot product

t1 <- torch_tensor(1:3)
t2 <- torch_tensor(4:6)
t1$dot(t2)
## torch_tensor
## 32
## [ CPULongType{} ]
t1
## torch_tensor
##  1
##  2
##  3
## [ CPULongType{3} ]
t1 <- torch_tensor(1:3)
t2 <- torch_tensor(4:6)
t1$t()$dot(t2) # transposing a 1-d tensor is a no-op, so the result is the same
## torch_tensor
## 32
## [ CPULongType{} ]

Matrix-vector and elementwise multiplication

t1 <- torch_tensor(1:3)
t3 <- torch_tensor(matrix(1:12, ncol = 3, byrow = TRUE))
t3$matmul(t1)
## torch_tensor
##  14
##  32
##  50
##  68
## [ CPULongType{4} ]
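# torch_multiply(), by contrast, works elementwise (t2 is still 4:6 here)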
torch_multiply(t1, t2)
## torch_tensor
##   4
##  10
##  18
## [ CPULongType{3} ]

Summary operations

Sums

m <- outer(1:3, 1:6)

sum(m)
## [1] 126
apply(m, 1, sum)
## [1] 21 42 63
apply(m, 2, sum)
## [1]  6 12 18 24 30 36
t <- torch_outer(torch_tensor(1:3), torch_tensor(1:6))
t$sum()
## torch_tensor
## 126
## [ CPULongType{} ]
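# dim = 1 collapses the first (row) dimension: column sums, like apply(m, 2, sum)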
t$sum(dim = 1)
## torch_tensor
##   6
##  12
##  18
##  24
##  30
##  36
## [ CPULongType{6} ]
t$sum(dim = 2)
## torch_tensor
##  21
##  42
##  63
## [ CPULongType{3} ]

Means

t <- torch_randn(4, 3, 2)
t
## torch_tensor
## (1,.,.) = 
##   1.0909  0.7280
##  -0.9649  0.0656
##   0.6603 -0.1091
## 
## (2,.,.) = 
##   0.6917 -1.8887
##  -0.7837 -0.4657
##  -1.9810  0.8186
## 
## (3,.,.) = 
##  -0.3341  0.4803
##   1.4476 -0.4953
##   0.0839 -1.2790
## 
## (4,.,.) = 
##   0.5520  0.1349
##  -2.5736  0.5080
##   0.5140 -0.8253
## [ CPUFloatType{4,3,2} ]
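# averaging over dimensions 1 and 2 leaves one mean per position of the last dimension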
t$mean(dim = c(1, 2))
## torch_tensor
## -0.1331
## -0.1940
## [ CPUFloatType{2} ]
t$mean(dim = 2)
## torch_tensor
##  0.2621  0.2282
## -0.6910 -0.5119
##  0.3991 -0.4313
## -0.5025 -0.0608
## [ CPUFloatType{4,2} ]

Slicing

t <- torch_tensor(matrix(1:9, ncol = 3, byrow = TRUE))
t[1, ]
## torch_tensor
##  1
##  2
##  3
## [ CPULongType{3} ]
t[1, , drop = FALSE]
## torch_tensor
##  1  2  3
## [ CPULongType{1,3} ]
t <- torch_rand(3, 3, 3)
t[1:2, 2:3, c(1, 3)]
## torch_tensor
## (1,.,.) = 
##   0.0366  0.7073
##   0.3949  0.6707
## 
## (2,.,.) = 
##   0.8056  0.1895
##   0.0173  0.1867
## [ CPUFloatType{2,2,2} ]
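
Two indexing conveniences are worth noting (assuming a current torch release): negative indices count from the end, unlike base R, where they drop elements, and .. stands for all remaining dimensions. A sketch:

t <- torch_tensor(matrix(1:9, ncol = 3, byrow = TRUE))
t[-1, ]   # the last row, counted from the end
t[.., 1]  # index 1 on the last dimension: the first column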

Function optimization

Many machine learning models come down to optimization: finding the minimum (or maximum) of a function. As an example, we minimize the two-dimensional Rosenbrock function by plain gradient descent: on every iteration, x moves a step of size lr against the gradient, and the accumulated gradient is then zeroed.

a <- 1
b <- 5

rosenbrock <- function(x) {
  x1 <- x[1]
  x2 <- x[2]
  (a - x1)^2 + b * (x2 - x1^2)^2
}

library(torch)
num_iterations <- 1000

lr <- 0.01

x <- torch_tensor(c(-1, 1), requires_grad = TRUE)

for (i in 1:num_iterations) {
  if (i %% 100 == 0) cat("Iteration: ", i, "\n")

  value <- rosenbrock(x)
  if (i %% 100 == 0) {
    cat("Value is: ", as.numeric(value), "\n")
  }

  value$backward()
  if (i %% 100 == 0) {
    cat("Gradient is: ", as.matrix(x$grad), "\n")
  }

  with_no_grad({
    x$sub_(lr * x$grad)
    x$grad$zero_()
  })
}
## Iteration:  100 
## Value is:  0.3502924 
## Gradient is:  -0.667685 -0.5771312 
## Iteration:  200 
## Value is:  0.07398106 
## Gradient is:  -0.1603189 -0.2532476 
## Iteration:  300 
## Value is:  0.02483024 
## Gradient is:  -0.07679074 -0.1373911 
## Iteration:  400 
## Value is:  0.009619333 
## Gradient is:  -0.04347242 -0.08254051 
## Iteration:  500 
## Value is:  0.003990697 
## Gradient is:  -0.02652063 -0.05206227 
## Iteration:  600 
## Value is:  0.001719962 
## Gradient is:  -0.01683905 -0.03373682 
## Iteration:  700 
## Value is:  0.0007584976 
## Gradient is:  -0.01095017 -0.02221584 
## Iteration:  800 
## Value is:  0.0003393509 
## Gradient is:  -0.007221781 -0.01477957 
## Iteration:  900 
## Value is:  0.0001532408 
## Gradient is:  -0.004811743 -0.009894371 
## Iteration:  1000 
## Value is:  6.962555e-05 
## Gradient is:  -0.003222887 -0.006653666
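
After 1,000 iterations, x should sit near the global minimum of the Rosenbrock function at (a, a^2) = (1, 1); inspecting it is a quick sanity check:

# x should now be approximately (1, 1)
x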

Neural network model

We simulate a small regression dataset and fit a network with one hidden layer, writing the training loop by hand:

# input dimensionality (number of input features)
d_in <- 3
# number of observations in training set
n <- 100

x <- torch_randn(n, d_in)
coefs <- c(0.2, -1.3, -0.5)
y <- x$matmul(coefs)$unsqueeze(2) + torch_randn(n, 1)

# dimensionality of hidden layer
d_hidden <- 32
# output dimensionality (number of predicted features)
d_out <- 1

net <- nn_sequential(
  nn_linear(d_in, d_hidden),
  nn_relu(),
  nn_linear(d_hidden, d_out)
)

opt <- optim_adam(net$parameters)

### training loop --------------------------------------

for (t in 1:200) {
  
  ### -------- Forward pass --------
  y_pred <- net(x)
  
  ### -------- Compute loss -------- 
  loss <- nnf_mse_loss(y_pred, y)
  if (t %% 10 == 0)
    cat("Epoch: ", t, "   Loss: ", loss$item(), "\n")
  
  ### -------- Backpropagation --------
  opt$zero_grad()
  loss$backward()
  
  ### -------- Update weights -------- 
  opt$step()

}
## Epoch:  10    Loss:  2.553179 
## Epoch:  20    Loss:  2.40357 
## Epoch:  30    Loss:  2.258252 
## Epoch:  40    Loss:  2.116031 
## Epoch:  50    Loss:  1.97622 
## Epoch:  60    Loss:  1.840057 
## Epoch:  70    Loss:  1.708541 
## Epoch:  80    Loss:  1.582341 
## Epoch:  90    Loss:  1.461682 
## Epoch:  100    Loss:  1.347009 
## Epoch:  110    Loss:  1.242465 
## Epoch:  120    Loss:  1.150654 
## Epoch:  130    Loss:  1.073239 
## Epoch:  140    Loss:  1.010069 
## Epoch:  150    Loss:  0.9601057 
## Epoch:  160    Loss:  0.9214334 
## Epoch:  170    Loss:  0.8922321 
## Epoch:  180    Loss:  0.871074 
## Epoch:  190    Loss:  0.8558359 
## Epoch:  200    Loss:  0.8449585

Applying deep learning methods

Loading data

On a small dataset we can pass all observations to the model at once; when the data grow larger, the torch framework provides a way to feed data to the input layer in batches. The tools for this are dataset() and dataloader().

A dataset() is a torch object that knows how to do one thing: deliver a single item on request. The item is usually a list holding one input tensor and one target tensor.

The job of dataloader() is to load data for the model batch by batch, which keeps memory use bounded: many datasets are too large to pass to the model in one go. Batching also has a benefit of its own: because gradients are computed per batch, the process acquires a randomness that itself helps training.

Defining a dataset

library(torch)
library(palmerpenguins)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
penguins %>% glimpse()
## Rows: 344
## Columns: 8
## $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex               <fct> male, female, female, NA, female, male, female, male…
## $ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
penguins_dataset <- dataset(
  name = "penguins_dataset()",
  initialize = function(df) {
    df <- na.omit(df)
    self$x <- as.matrix(df[, 3:6]) %>% torch_tensor()
    self$y <- torch_tensor(
      as.numeric(df$species)
    )$to(torch_long())
  },
  .getitem = function(i) {
    list(x = self$x[i, ], y = self$y[i])
  },
  .length = function() {
    dim(self$x)[1]
  }
)

ds <- penguins_dataset(penguins)
length(ds)
## [1] 333
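
Indexing the dataset exercises .getitem(), and a dataloader wraps it into batches; a minimal sketch using dataloader_make_iter() and dataloader_next() to pull a single batch:

# a single item: a list with one input and one target tensor
ds[1]
# one batch of 32 (x, y) pairs
dl <- dataloader(ds, batch_size = 32, shuffle = TRUE)
batch <- dataloader_next(dataloader_make_iter(dl))
batch$x$shape  # 32 x 4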

The two most important steps in machine learning are data preparation and model specification. Here we prepare simulated data, split it into training, validation, and test sets, and define the model:

# input dimensionality (number of input features)
d_in <- 3
# number of observations in training set
n <- 1000

x <- torch_randn(n, d_in)
coefs <- c(0.2, -1.3, -0.5)
y <- x$matmul(coefs)$unsqueeze(2) + torch_randn(n, 1)

ds <- tensor_dataset(x, y)

dl <- dataloader(ds, batch_size = 100, shuffle = TRUE)

train_ids <- sample(1:length(ds), size = 0.6 * length(ds))
valid_ids <- sample(setdiff(
  1:length(ds),
  train_ids
), size = 0.2 * length(ds))
test_ids <- setdiff(1:length(ds), union(train_ids, valid_ids))

train_ds <- dataset_subset(ds, indices = train_ids)
valid_ds <- dataset_subset(ds, indices = valid_ids)
test_ds <- dataset_subset(ds, indices = test_ids)

train_dl <- dataloader(train_ds,
  batch_size = 100,
  shuffle = TRUE
)
valid_dl <- dataloader(valid_ds, batch_size = 100)
test_dl <- dataloader(test_ds, batch_size = 100)

# dimensionality of hidden layer
d_hidden <- 32
# output dimensionality (number of predicted features)
d_out <- 1

net <- nn_module(
  initialize = function(d_in, d_hidden, d_out) {
    self$net <- nn_sequential(
      nn_linear(d_in, d_hidden),
      nn_relu(),
      nn_linear(d_hidden, d_out)
    )
  },
  forward = function(x) {
    self$net(x)
  }
)

With luz, setup() and fit() take care of the training loop; you simply watch the losses on the training and validation sets:

fitted <- net %>%
  setup(
    loss = nn_mse_loss(),
    optimizer = optim_adam
  ) %>%
  set_hparams(
    d_in = d_in,
    d_hidden = d_hidden, d_out = d_out
  ) %>%
  fit(train_dl, epochs = 200, valid_data = valid_dl)

If you have CUDA installed, luz will move the network weights to the GPU automatically.
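
luz can also score the fitted model on held-out data; a sketch using luz's evaluate() and predict():

# aggregate loss on the test set
evaluate(fitted, test_dl)
# predictions for the test set
preds <- predict(fitted, test_dl)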

Image classification

Finally, a small convolutional network. Applying it to a random single-channel 64 × 64 image verifies that the shapes line up:

library(torch)

convnet <- nn_module(
  "convnet",
  
  initialize = function() {
    
    # nn_conv2d(in_channels, out_channels, kernel_size)
    self$conv1 <- nn_conv2d(1, 16, 3)
    self$conv2 <- nn_conv2d(16, 32, 3)
    self$conv3 <- nn_conv2d(32, 64, 3)
    
    self$output <- nn_linear(2304, 3)
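    # why 2304: each 3x3 conv trims 2 off height and width, each max-pool halves them,
    # so 64 -> 62 -> 31 -> 29 -> 14 -> 12 -> 6, and 64 channels * 6 * 6 = 2304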

  },
  
  forward = function(x) {
    
    x %>% 
      self$conv1() %>% 
      nnf_relu() %>%
      nnf_max_pool2d(2) %>%
      self$conv2() %>% 
      nnf_relu() %>%
      nnf_max_pool2d(2) %>%
      self$conv3() %>% 
      nnf_relu() %>%
      nnf_max_pool2d(2) %>%
      torch_flatten(start_dim = 2) %>%
      self$output()
      
  }
)

model <- convnet()
img <- torch_randn(1, 1, 64, 64)
model(img)
## torch_tensor
##  0.1491 -0.0458 -0.0643
## [ CPUFloatType{1,3} ][ grad_fn = <AddmmBackward0> ]

References

torch for R

Deep Learning and Scientific Computing with R torch

image process