Deep Learning and Scientific Computing with R torch
薛英杰 / 2023-09-04
PyTorch is a framework for deep learning and scientific computing that is widely used in both industry and academia. R users have developed torch, which provides an R interface to this framework. We introduce the use of torch in three parts.
Basic torch modules
1. Tensors
To use torch, we first need to understand tensors. These are not tensors in the mathematical or physical sense: in machine learning frameworks, a tensor is simply a multi-dimensional array used for fast computation. A torch tensor is, in effect, an array of arbitrary dimensionality, designed for fast, standardized mathematical operations, and one you can move to the GPU.
Technically, a tensor behaves much like an R6 object: you can access its fields with the $ operator. Concretely:
library(torch)
library(torchvision)
library(luz)
t1<-torch_tensor(1)
t1
## torch_tensor
## 1
## [ CPUFloatType{1} ]
t1$dtype
## torch_Float
t1$device
## torch_device(type='cpu')
t1$shape
## [1] 1
We can use a tensor's $to() method to change its properties, for example:
t2 <- t1$to(dtype = torch_int())
t2$dtype
## torch_Int
# only applicable if you have a GPU
t2 <- t1$to(device = "cuda")
t2$device
## torch_device(type='cuda', index=0)
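Conversion also works in the other direction: to bring tensor values back into R, we can use torch's as_array(); a quick sketch:
as_array(t1) # copy the tensor's contents into an R vector
## [1] 1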
Tensor shape deserves a separate look. We can reshape a one-dimensional tensor into a two-dimensional one, for example:
t3 <- t1$view(c(1, 1))
t3$shape
## [1] 1 1
Conceptually, this is just like how in R we can have a one-element vector and a one-element matrix:
c(1)
## [1] 1
matrix(1)
## [,1]
## [1,] 1
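Related to $view() are $unsqueeze() and $squeeze(), standard torch methods that add or drop a dimension of size 1; a brief sketch:
t4 <- torch_tensor(1:3) # shape {3}
t4$unsqueeze(1)$shape # add a leading size-1 dimension
## [1] 1 3
t4$unsqueeze(1)$squeeze()$shape # squeeze() drops size-1 dimensions again
## [1] 3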
2. Creating tensors
Earlier, we called torch_tensor() with a single value. The same approach extends to multi-dimensional objects: whenever we need to pass in many values, we still use torch_tensor(). To create a one-dimensional tensor, for example, we pass it a long vector:
torch_tensor(1:5)
## torch_tensor
## 1
## 2
## 3
## 4
## 5
## [ CPULongType{5} ]
torch_tensor(1:5, dtype = torch_float())
## torch_tensor
## 1
## 2
## 3
## 4
## 5
## [ CPUFloatType{5} ]
torch_tensor(1:5, device = "cuda")
## torch_tensor
## 1
## 2
## 3
## 4
## 5
## [ CUDALongType{5} ]
We can pass a matrix in the same way.
torch_tensor(matrix(1:9, ncol = 3))
## torch_tensor
## 1 4 7
## 2 5 8
## 3 6 9
## [ CPULongType{3,3} ]
torch_tensor(matrix(1:9, ncol = 3, byrow = TRUE))
## torch_tensor
## 1 2 3
## 4 5 6
## 7 8 9
## [ CPULongType{3,3} ]
Higher-dimensional data follows the same principle: simply pass an array.
torch_tensor(array(1:24, dim = c(4, 3, 2)))
## torch_tensor
## (1,.,.) =
## 1 13
## 5 17
## 9 21
##
## (2,.,.) =
## 2 14
## 6 18
## 10 22
##
## (3,.,.) =
## 3 15
## 7 19
## 11 23
##
## (4,.,.) =
## 4 16
## 8 20
## 12 24
## [ CPULongType{4,3,2} ]
When you care less about the exact values inside a tensor than about their distribution, you can use torch_randn(), which samples from a standard normal distribution. For example:
torch_randn(3, 3)
## torch_tensor
## -1.5090 -1.9471 -1.0123
## 0.4907 -0.6271 0.1965
## 1.2400 0.0293 0.5151
## [ CPUFloatType{3,3} ]
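If instead you want values uniformly distributed between 0 and 1, torch_rand() does that (we will meet it again below, in the section on slicing):
torch_rand(2, 2) # uniform on [0, 1); values vary from run to run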
To create a tensor of all zeros or all ones, we can use the following:
torch_zeros(2, 5)
## torch_tensor
## 0 0 0 0 0
## 0 0 0 0 0
## [ CPUFloatType{2,5} ]
torch_ones(2, 2)
## torch_tensor
## 1 1
## 1 1
## [ CPUFloatType{2,2} ]
Creating the identity matrix of linear algebra
torch_eye(n = 5)
## torch_tensor
## 1 0 0 0 0
## 0 1 0 0 0
## 0 0 1 0 0
## 0 0 0 1 0
## 0 0 0 0 1
## [ CPUFloatType{5,5} ]
Creating a diagonal matrix
torch_diag(c(1, 2, 3))
## torch_tensor
## 1 0 0
## 0 2 0
## 0 0 3
## [ CPUFloatType{3,3} ]
Creating a tensor from an existing R dataset (here, the built-in JohnsonJohnson time series)
torch_tensor(JohnsonJohnson)
## torch_tensor
## 0.7100
## 0.6300
## 0.8500
## 0.4400
## 0.6100
## 0.6900
## 0.9200
## 0.5500
## 0.7200
## 0.7700
## 0.9200
## 0.6000
## 0.8300
## 0.8000
## 1.0000
## 0.7700
## 0.9200
## 1.0000
## 1.2400
## 1.0000
## 1.1600
## 1.3000
## 1.4500
## 1.2500
## 1.2600
## 1.3800
## 1.8600
## 1.5600
## 1.5300
## 1.5900
## ... [the output was truncated (use n=-1 to disable)]
## [ CPUFloatType{84} ]
3. Operations on tensors
On tensors, we can perform all the usual mathematical operations: addition, subtraction, multiplication, division, and so on. These operations are available both as functions and as methods, for example:
Addition
t1 <- torch_tensor(c(1, 2))
t2 <- torch_tensor(c(3, 4))
torch_add(t1, t2)
## torch_tensor
## 4
## 6
## [ CPUFloatType{2} ]
# equivalently
t1$add(t2)
## torch_tensor
## 4
## 6
## [ CPUFloatType{2} ]
# this variant modifies t1 in place
t1$add_(t2)
## torch_tensor
## 4
## 6
## [ CPUFloatType{2} ]
t1
## torch_tensor
## 4
## 6
## [ CPUFloatType{2} ]
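add_() illustrates a general torch convention: methods whose names end in an underscore modify their tensor in place rather than returning a new one. A small sketch with two more in-place methods:
t1$mul_(2) # in-place multiply: t1 is now 8, 12
t1$zero_() # in-place: all elements of t1 become 0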
Vector dot product
t1 <- torch_tensor(1:3)
t2 <- torch_tensor(4:6)
t1$dot(t2)
## torch_tensor
## 32
## [ CPULongType{} ]
t1
## torch_tensor
## 1
## 2
## 3
## [ CPULongType{3} ]
t1 <- torch_tensor(1:3)
t2 <- torch_tensor(4:6)
t1$t()$dot(t2) # transposing a 1-d tensor is a no-op, so the result is the same
## torch_tensor
## 32
## [ CPULongType{} ]
Vector and matrix multiplication
t1 <- torch_tensor(1:3)
t3 <- torch_tensor(matrix(1:12, ncol = 3, byrow = TRUE))
t3$matmul(t1)
## torch_tensor
## 14
## 32
## 50
## 68
## [ CPULongType{4} ]
torch_multiply(t1, t2) # elementwise product of t1 = 1:3 and t2 = 4:6, not matrix multiplication
## torch_tensor
## 4
## 10
## 18
## [ CPULongType{3} ]
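Multiplying two matrices works the same way through $matmul() (or torch_matmul()); for instance, with the 4 x 3 tensor t3 from above:
t3$matmul(t3$t()) # (4, 3) times (3, 4) gives a 4 x 4 matrix
## torch_tensor
## 14 32 50 68
## 32 77 122 167
## 50 122 194 266
## 68 167 266 365
## [ CPULongType{4,4} ]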
Summary operations
Sums
m <- outer(1:3, 1:6)
sum(m)
## [1] 126
apply(m, 1, sum)
## [1] 21 42 63
apply(m, 2, sum)
## [1] 6 12 18 24 30 36
t <- torch_outer(torch_tensor(1:3), torch_tensor(1:6))
t$sum()
## torch_tensor
## 126
## [ CPULongType{} ]
t$sum(dim = 1)
## torch_tensor
## 6
## 12
## 18
## 24
## 30
## 36
## [ CPULongType{6} ]
t$sum(dim = 2)
## torch_tensor
## 21
## 42
## 63
## [ CPULongType{3} ]
Means
t <- torch_randn(4, 3, 2)
t
## torch_tensor
## (1,.,.) =
## 1.0909 0.7280
## -0.9649 0.0656
## 0.6603 -0.1091
##
## (2,.,.) =
## 0.6917 -1.8887
## -0.7837 -0.4657
## -1.9810 0.8186
##
## (3,.,.) =
## -0.3341 0.4803
## 1.4476 -0.4953
## 0.0839 -1.2790
##
## (4,.,.) =
## 0.5520 0.1349
## -2.5736 0.5080
## 0.5140 -0.8253
## [ CPUFloatType{4,3,2} ]
t$mean(dim = c(1, 2))
## torch_tensor
## -0.1331
## -0.1940
## [ CPUFloatType{2} ]
t$mean(dim = 2)
## torch_tensor
## 0.2621 0.2282
## -0.6910 -0.5119
## 0.3991 -0.4313
## -0.5025 -0.0608
## [ CPUFloatType{4,2} ]
Slicing
t <- torch_tensor(matrix(1:9, ncol = 3, byrow = TRUE))
t[1, ]
## torch_tensor
## 1
## 2
## 3
## [ CPULongType{3} ]
t[1, , drop = FALSE]
## torch_tensor
## 1 2 3
## [ CPULongType{1,3} ]
t <- torch_rand(3, 3, 3)
t[1:2, 2:3, c(1, 3)]
## torch_tensor
## (1,.,.) =
## 0.0366 0.7073
## 0.3949 0.6707
##
## (2,.,.) =
## 0.8056 0.1895
## 0.0173 0.1867
## [ CPUFloatType{2,2,2} ]
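R torch also supports two indexing conveniences borrowed from Python, documented in torch's indexing vignette: negative indices count from the end (unlike in base R), and .. stands for all remaining dimensions. A quick sketch:
m <- torch_tensor(matrix(1:6, ncol = 3, byrow = TRUE))
m[-1, ] # a negative index counts from the end: the last row
## torch_tensor
## 4
## 5
## 6
## [ CPULongType{3} ]
m[.., 1] # the first column, however many leading dimensions there are
## torch_tensor
## 1
## 4
## [ CPULongType{2} ]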
Function minimization
Many machine learning models come down to optimization: finding the minimum (or maximum) of a function. As an example, we minimize the Rosenbrock function by gradient descent, computing gradients with torch's automatic differentiation.
a <- 1
b <- 5
rosenbrock <- function(x) {
x1 <- x[1]
x2 <- x[2]
(a - x1)^2 + b * (x2 - x1^2)^2
}
library(torch)
num_iterations <- 1000
lr <- 0.01
x <- torch_tensor(c(-1, 1), requires_grad = TRUE)
for (i in 1:num_iterations) {
if (i %% 100 == 0) cat("Iteration: ", i, "\n")
value <- rosenbrock(x)
if (i %% 100 == 0) {
cat("Value is: ", as.numeric(value), "\n")
}
value$backward()
if (i %% 100 == 0) {
cat("Gradient is: ", as.matrix(x$grad), "\n")
}
with_no_grad({
x$sub_(lr * x$grad)
x$grad$zero_()
})
}
## Iteration: 100
## Value is: 0.3502924
## Gradient is: -0.667685 -0.5771312
## Iteration: 200
## Value is: 0.07398106
## Gradient is: -0.1603189 -0.2532476
## Iteration: 300
## Value is: 0.02483024
## Gradient is: -0.07679074 -0.1373911
## Iteration: 400
## Value is: 0.009619333
## Gradient is: -0.04347242 -0.08254051
## Iteration: 500
## Value is: 0.003990697
## Gradient is: -0.02652063 -0.05206227
## Iteration: 600
## Value is: 0.001719962
## Gradient is: -0.01683905 -0.03373682
## Iteration: 700
## Value is: 0.0007584976
## Gradient is: -0.01095017 -0.02221584
## Iteration: 800
## Value is: 0.0003393509
## Gradient is: -0.007221781 -0.01477957
## Iteration: 900
## Value is: 0.0001532408
## Gradient is: -0.004811743 -0.009894371
## Iteration: 1000
## Value is: 6.962555e-05
## Gradient is: -0.003222887 -0.006653666
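Rather than updating x by hand inside with_no_grad(), we could hand the parameter to one of torch's built-in optimizers. A minimal sketch of the same minimization using optim_adam() (the learning rate here is an illustrative choice):
x <- torch_tensor(c(-1, 1), requires_grad = TRUE)
opt <- optim_adam(list(x), lr = 0.1)
for (i in 1:500) {
  opt$zero_grad() # clear the gradients accumulated in the previous step
  value <- rosenbrock(x)
  value$backward() # compute fresh gradients
  opt$step() # let the optimizer update x
}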
A neural network model
# input dimensionality (number of input features)
d_in <- 3
# number of observations in training set
n <- 100
x <- torch_randn(n, d_in)
coefs <- c(0.2, -1.3, -0.5)
y <- x$matmul(coefs)$unsqueeze(2) + torch_randn(n, 1)
# dimensionality of hidden layer
d_hidden <- 32
# output dimensionality (number of predicted features)
d_out <- 1
net <- nn_sequential(
nn_linear(d_in, d_hidden),
nn_relu(),
nn_linear(d_hidden, d_out)
)
opt <- optim_adam(net$parameters)
### training loop --------------------------------------
for (t in 1:200) {
### -------- Forward pass --------
y_pred <- net(x)
### -------- Compute loss --------
loss <- nnf_mse_loss(y_pred, y)
if (t %% 10 == 0)
cat("Epoch: ", t, " Loss: ", loss$item(), "\n")
### -------- Backpropagation --------
opt$zero_grad()
loss$backward()
### -------- Update weights --------
opt$step()
}
## Epoch: 10 Loss: 2.553179
## Epoch: 20 Loss: 2.40357
## Epoch: 30 Loss: 2.258252
## Epoch: 40 Loss: 2.116031
## Epoch: 50 Loss: 1.97622
## Epoch: 60 Loss: 1.840057
## Epoch: 70 Loss: 1.708541
## Epoch: 80 Loss: 1.582341
## Epoch: 90 Loss: 1.461682
## Epoch: 100 Loss: 1.347009
## Epoch: 110 Loss: 1.242465
## Epoch: 120 Loss: 1.150654
## Epoch: 130 Loss: 1.073239
## Epoch: 140 Loss: 1.010069
## Epoch: 150 Loss: 0.9601057
## Epoch: 160 Loss: 0.9214334
## Epoch: 170 Loss: 0.8922321
## Epoch: 180 Loss: 0.871074
## Epoch: 190 Loss: 0.8558359
## Epoch: 200 Loss: 0.8449585
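Once trained, the network can be applied to new data simply by calling it; a quick sketch (the new observations here are simulated for illustration):
y_new <- net(torch_randn(5, d_in)) # predictions for five new observations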
Applying deep learning methods
Loading data
With a small dataset, we can pass all observations to the model at once; with larger data, though, the torch framework lets you feed the input layer batch by batch. For this you use dataset() and dataloader().
A dataset() is a torch object that knows how to do exactly one thing: deliver a single item to whoever asks for it. That item is usually a list containing one input tensor and one target tensor.
A dataloader()'s job is to load data for the model in batches. Many datasets are far too large to be passed to the model in one go, and batching has a benefit of its own: because gradients are computed per batch, the process is inherently stochastic, and this randomness helps model training.
Defining a dataset
library(torch)
library(palmerpenguins)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
penguins %>% glimpse()
## Rows: 344
## Columns: 8
## $ species <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex <fct> male, female, female, NA, female, male, female, male…
## $ year <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
penguins_dataset <- dataset(
name = "penguins_dataset()",
initialize = function(df) {
df <- na.omit(df)
self$x <- as.matrix(df[, 3:6]) %>% torch_tensor()
self$y <- torch_tensor(
as.numeric(df$species)
)$to(torch_long())
},
.getitem = function(i) {
list(x = self$x[i, ], y = self$y[i])
},
.length = function() {
dim(self$x)[1]
}
)
ds <- penguins_dataset(penguins)
length(ds)
## [1] 333
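An individual item can be retrieved by indexing the dataset, which calls .getitem() behind the scenes:
ds[1] # a list with the predictor tensor x and the class label y for the first penguin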
The two most important tasks in machine learning are data preparation and model specification. Concretely:
# input dimensionality (number of input features)
d_in <- 3
# number of observations in training set
n <- 1000
x <- torch_randn(n, d_in)
coefs <- c(0.2, -1.3, -0.5)
y <- x$matmul(coefs)$unsqueeze(2) + torch_randn(n, 1)
ds <- tensor_dataset(x, y)
dl <- dataloader(ds, batch_size = 100, shuffle = TRUE)
train_ids <- sample(1:length(ds), size = 0.6 * length(ds))
valid_ids <- sample(setdiff(
1:length(ds),
train_ids
), size = 0.2 * length(ds))
test_ids <- setdiff(1:length(ds), union(train_ids, valid_ids))
train_ds <- dataset_subset(ds, indices = train_ids)
valid_ds <- dataset_subset(ds, indices = valid_ids)
test_ds <- dataset_subset(ds, indices = test_ids)
train_dl <- dataloader(train_ds,
batch_size = 100,
shuffle = TRUE
)
valid_dl <- dataloader(valid_ds, batch_size = 100)
test_dl <- dataloader(test_ds, batch_size = 100)
# dimensionality of hidden layer
d_hidden <- 32
# output dimensionality (number of predicted features)
d_out <- 1
net <- nn_module(
initialize = function(d_in, d_hidden, d_out) {
self$net <- nn_sequential(
nn_linear(d_in, d_hidden),
nn_relu(),
nn_linear(d_hidden, d_out)
)
},
forward = function(x) {
self$net(x)
}
)
We then call luz to fit the model, keeping an eye on the loss for both the training and the validation set.
fitted <- net %>%
setup(
loss = nn_mse_loss(),
optimizer = optim_adam
) %>%
set_hparams(
d_in = d_in,
d_hidden = d_hidden, d_out = d_out
) %>%
fit(train_dl, epochs = 200, valid_data = valid_dl)
If you have CUDA installed, luz moves the network weights to the GPU for you automatically.
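The fitted object also works with luz's evaluate() and predict() generics; a quick sketch on the held-out test data:
evaluate(fitted, test_dl) # loss on the test set
preds <- predict(fitted, test_dl) # a tensor of predictions for the test set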
Image classification
library(torch)
convnet <- nn_module(
"convnet",
initialize = function() {
# nn_conv2d(in_channels, out_channels, kernel_size)
self$conv1 <- nn_conv2d(1, 16, 3)
self$conv2 <- nn_conv2d(16, 32, 3)
self$conv3 <- nn_conv2d(32, 64, 3)
self$output <- nn_linear(2304, 3) # for 64 x 64 input, three conv (kernel 3) + max-pool (2) stages leave 64 channels of 6 x 6, i.e. 64 * 6 * 6 = 2304 features
},
forward = function(x) {
x %>%
self$conv1() %>%
nnf_relu() %>%
nnf_max_pool2d(2) %>%
self$conv2() %>%
nnf_relu() %>%
nnf_max_pool2d(2) %>%
self$conv3() %>%
nnf_relu() %>%
nnf_max_pool2d(2) %>%
torch_flatten(start_dim = 2) %>%
self$output()
}
)
model <- convnet()
img <- torch_randn(1, 1, 64, 64)
model(img)
## torch_tensor
## 0.1491 -0.0458 -0.0643
## [ CPUFloatType{1,3} ][ grad_fn = <AddmmBackward0> ]