A roundup of some common learning-rate (lr) schedulers.

Environment: Python 3.9.13 + PyTorch 1.12.1

Adapted from another author's summary.

import torch
import torch.nn as nn
from torch.optim.lr_scheduler import LambdaLR, StepLR, MultiStepLR, ExponentialLR, CosineAnnealingLR, ReduceLROnPlateau, _LRScheduler
from torch.utils.tensorboard import SummaryWriter


initial_lr = 0.1
writer = SummaryWriter("logs/lr_scheduler")


# Minimal dummy model; it only exists so the optimizer has parameters to manage.
class model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3)

    def forward(self, x):
        pass

# ......

if __name__ == '__main__':
    lambdaLR_test()
    StepLR_test()
    MultiStepLR_test()
    ExponentialLR_test()
    CosineAnnealingLR_test()
    ReduceLROnPlateau_test()
    warmUpLR_test()

    writer.close()
  • lambdaLR

    """
    LambdaLR: (optimizer, lr_lambda, last_epoch=-1)
    last_epoch: 断点续训时,中断前的epoch。默认为-1,即从头开始训练

    new_lr = λ * initial_lr
    """
    def lambdaLR_test():
    net = model()
    optimizer = torch.optim.Adam(net.parameters(), lr=initial_lr)
    scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: 1/(epoch+1))

    print('初始化学习率: ', optimizer.defaults['lr'])

    for epoch in range(1, 11):
    # train...

    optimizer.zero_grad()
    optimizer.step()
    print('第%d个epoch的学习率: %f' % (epoch, optimizer.param_groups[0]['lr']))
    writer.add_scalar("LambdaLR", optimizer.param_groups[0]['lr'], epoch) # tensorboard面板,Smoothing调为0;且清除logs/LambdaLR下无用文件
    scheduler.step()

    [Figure: LambdaLR learning-rate curve in TensorBoard]
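    With the lambda above, the closed form is simply new_lr = initial_lr / (epoch + 1). A quick way to sanity-check a schedule without any training is to step a throwaway optimizer; a minimal sketch, reusing the `model` and `initial_lr` defined earlier:

    # Sketch: compare the closed form initial_lr / (epoch + 1) with what LambdaLR reports.
    net = model()
    optimizer = torch.optim.Adam(net.parameters(), lr=initial_lr)
    scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (epoch + 1))
    for epoch in range(5):
        optimizer.step()                      # step the optimizer before the scheduler
        expected = initial_lr / (epoch + 1)   # 0.1, 0.05, 0.0333..., 0.025, 0.02
        print(epoch, scheduler.get_last_lr()[0], expected)
        scheduler.step()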

  • StepLR

    """
    StepLR: (optimizer, step_size, gamma=0.1, last_epoch=-1)

    new_lr = initial_lr * γ^(epoch//step_size)
    """
    def StepLR_test():
    net = model()
    optimizer = torch.optim.Adam(net.parameters(), lr=initial_lr)
    scheduler = StepLR(optimizer, step_size=3, gamma=0.1) # 每3个epoch更新一次lr

    print('初始化学习率: ', optimizer.defaults['lr'])

    for epoch in range(1, 11):
    # train...

    optimizer.zero_grad()
    optimizer.step()
    print('第%d个epoch的学习率: %f' % (epoch, optimizer.param_groups[0]['lr']))
    writer.add_scalar("StepLR", optimizer.param_groups[0]['lr'], epoch)
    scheduler.step()

    [Figure: StepLR learning-rate curve in TensorBoard]
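    With step_size=3 and gamma=0.1 the schedule is piecewise constant: the lr stays fixed for 3 epochs, then drops by 10x. A small sketch checking the docstring formula (again reusing `model` and `initial_lr` from above):

    # Sketch: StepLR keeps the lr constant for step_size epochs, then multiplies by gamma:
    # 0.1, 0.1, 0.1, 0.01, 0.01, 0.01, 0.001, ...
    net = model()
    optimizer = torch.optim.Adam(net.parameters(), lr=initial_lr)
    scheduler = StepLR(optimizer, step_size=3, gamma=0.1)
    for epoch in range(9):
        optimizer.step()
        expected = initial_lr * 0.1 ** (epoch // 3)   # matches the docstring formula
        print(epoch, scheduler.get_last_lr()[0], expected)
        scheduler.step()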

  • MultiStepLR

    """
    MultiStepLR: (optimizer, milestones, gamma=0.1, last_epoch=-1)

    new_lr = initial_lr * γ^(bisect_right * (milestones.epoch))
    """
    def MultiStepLR_test():
    net = model()
    optimizer = torch.optim.Adam(net.parameters(), lr=initial_lr)
    scheduler = MultiStepLR(optimizer, milestones=[3, 7], gamma=0.1) # 在第3和第7个迭代时更新lr

    print('初始化学习率: ', optimizer.defaults['lr'])

    for epoch in range(1, 11):
    # train...

    optimizer.zero_grad()
    optimizer.step()
    print('第%d个epoch的学习率: %f' % (epoch, optimizer.param_groups[0]['lr']))
    writer.add_scalar("MultiStepLR", optimizer.param_groups[0]['lr'], epoch)
    scheduler.step()

    [Figure: MultiStepLR learning-rate curve in TensorBoard]
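    The exponent in the formula is just "how many milestones the current epoch has already reached", which is exactly what bisect.bisect_right computes. A small sketch of the closed form (no scheduler involved, values chosen to match the test above):

    from bisect import bisect_right

    # Sketch: with milestones=[3, 7] and gamma=0.1, the exponent counts milestones <= epoch.
    milestones, gamma = [3, 7], 0.1
    for epoch in range(10):
        expected = initial_lr * gamma ** bisect_right(milestones, epoch)
        print(epoch, expected)   # 0.1 for epochs 0-2, 0.01 for 3-6, 0.001 from epoch 7 on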

  • ExponentialLR

    """
    ExponentialLR: (optimizer, gamma, last_epoch=-1)

    new_lr = initial_lr * γ^(epoch)
    """
    def ExponentialLR_test():
    net = model()
    optimizer = torch.optim.Adam(net.parameters(), lr=initial_lr)
    scheduler = ExponentialLR(optimizer, gamma=0.1)

    print('初始化学习率: ', optimizer.defaults['lr'])

    for epoch in range(1, 11):
    # train...

    optimizer.zero_grad()
    optimizer.step()
    print('第%d个epoch的学习率: %f' % (epoch, optimizer.param_groups[0]['lr']))
    writer.add_scalar("ExponentialLR", optimizer.param_groups[0]['lr'], epoch)
    scheduler.step()

    [Figure: ExponentialLR learning-rate curve in TensorBoard]
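    Note that gamma=0.1 shrinks the lr tenfold every epoch, which is usually far too aggressive outside of a demo. One common way to pick gamma (my own rule of thumb, not part of the original post) is to solve for the value that reaches a target lr after a given number of epochs:

    # Sketch: gamma = (target_lr / initial_lr) ** (1 / n_epochs); target_lr and n_epochs are assumptions.
    target_lr, n_epochs = 1e-4, 100
    gamma = (target_lr / initial_lr) ** (1 / n_epochs)    # ≈ 0.933 to go from 0.1 to 1e-4 in 100 epochs
    print(gamma, initial_lr * gamma ** n_epochs)          # the second value is ≈ 1e-4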

  • CosineAnnealingLR

    """
    CosineAnnealingLR: (optimizer, T_max, eta_min=0, last_epoch=-1) 2*T_max个epoch之后重新设置lr.(T_max为周期)
    eta_min: lr最小值

    new_lr = eta_min + (initial_lr - eta_min) * (1 + cos((epoch/T_max)*π))
    """
    def CosineAnnealingLR_test():
    net = model()
    optimizer = torch.optim.Adam(net.parameters(), lr=initial_lr)
    scheduler = CosineAnnealingLR(optimizer, T_max=10)

    print('初始化学习率: ', optimizer.defaults['lr'])

    for epoch in range(1, 101):
    # train...

    optimizer.zero_grad()
    optimizer.step()
    print('第%d个epoch的学习率: %f' % (epoch, optimizer.param_groups[0]['lr']))
    writer.add_scalar("CosineAnnealingLR", optimizer.param_groups[0]['lr'], epoch)
    scheduler.step()

    [Figure: CosineAnnealingLR learning-rate curve in TensorBoard]
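    A quick way to see the 2*T_max periodicity is to evaluate the closed form at a few epochs; a sketch (T_max=10 and eta_min=0 as in the test above):

    import math

    # Sketch: epoch 0 -> 0.1 (cos(0)=1), epoch 5 -> 0.05 (cos(π/2)=0), epoch 10 -> 0 (cos(π)=-1),
    # epoch 20 -> back to 0.1, so the full period is 2*T_max epochs.
    T_max, eta_min = 10, 0.0
    for epoch in (0, 5, 10, 15, 20):
        lr = eta_min + (initial_lr - eta_min) * (1 + math.cos(epoch / T_max * math.pi)) / 2
        print(epoch, lr)   # 0.1, 0.05, 0.0, 0.05, 0.1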

  • ReduceLROnPlateau

    """
    ReduceLROnPlateau: (optimizer, mode='min', factor=0.1, patience=10, verbose=False, threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08)
    基于训练过程中的某些测量值对学习率进行动态下降

    new_lr = λ * old_lr
    """
    def ReduceLROnPlateau_test():
    net = model()
    optimizer = torch.optim.Adam(net.parameters(), lr=initial_lr)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)

    print('初始化学习率: ', optimizer.defaults['lr'])

    for epoch in range(1, 15):
    # train...

    train_loss = 2
    optimizer.zero_grad()
    optimizer.step()
    print('第%d个epoch的学习率: %f' % (epoch, optimizer.param_groups[0]['lr']))
    writer.add_scalar("ReduceLROnPlateau", optimizer.param_groups[0]['lr'], epoch)
    scheduler.step(train_loss)

    [Figure: ReduceLROnPlateau learning-rate curve in TensorBoard]
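    Because the dummy loss never improves, the lr is cut by `factor` once the metric has failed to improve for more than `patience` consecutive epochs. A minimal sketch of that timing (same constant metric as above):

    # Sketch: with a metric that never improves and patience=2, the lr drops every
    # patience+1 calls to scheduler.step(metric) after the first (improving) call.
    net = model()
    optimizer = torch.optim.Adam(net.parameters(), lr=initial_lr)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)
    for epoch in range(1, 15):
        scheduler.step(2.0)                            # constant "loss"
        print(epoch, optimizer.param_groups[0]['lr'])  # drops at epochs 4, 7, 10, 13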

  • WarmUp

    class WarmUpLR(_LRScheduler):
        def __init__(self, optimizer, total_iters, last_epoch=-1):
            """
            total_iters: number of leading epochs (or batches) over which the lr is warmed up,
            i.e. ramped from low to high. Whether the lr changes per epoch or per batch is up to you.
            """
            self.total_iters = total_iters
            super().__init__(optimizer, last_epoch)

        def get_lr(self):
            """
            self.base_lrs: the optimizer's initial lr(s), i.e. the maximum lr reached after warmup
            self.last_epoch: the current epoch (or batch) index; it is incremented by one on every warmup_scheduler.step() call.
            """
            return [base_lr * self.last_epoch / (self.total_iters + 1e-8) for base_lr in self.base_lrs]


    def warmUpLR_test():
        net = model()
        optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9, weight_decay=5e-4)

        # scheduler = CosineAnnealingLR(optimizer, 20)
        scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (epoch + 1))

        warmup_epoch = 10  # here we assume the lr changes per epoch and is warmed up over the first 10 epochs
        warmup_scheduler = WarmUpLR(optimizer, warmup_epoch)  # after this line, optimizer.param_groups[0]['lr'] goes from 0.001 to 0.0

        print('initial lr: ', optimizer.defaults['lr'])  # the value passed in, 0.001

        for epoch in range(1, 50):
            # train...

            optimizer.zero_grad()
            optimizer.step()

            print('lr at epoch %d: %f' % (epoch, optimizer.param_groups[0]['lr']))
            writer.add_scalar("warmUpLR", optimizer.param_groups[0]['lr'], epoch)

            if epoch <= warmup_epoch:
                warmup_scheduler.step()
            else:
                scheduler.step()

    [Figure: WarmUpLR learning-rate curve in TensorBoard]
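    Since PyTorch 1.10, a similar warmup-then-decay schedule can also be built from the built-in LinearLR and SequentialLR schedulers, without a custom class. A hedged sketch (the decay scheduler and the specific factors are my own choices, not from the original post):

    from torch.optim.lr_scheduler import LinearLR, SequentialLR

    # Sketch: linear warmup over the first 10 epochs, then step decay, using only built-in schedulers.
    net = model()
    optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9, weight_decay=5e-4)
    warmup = LinearLR(optimizer, start_factor=0.01, end_factor=1.0, total_iters=10)
    decay = StepLR(optimizer, step_size=10, gamma=0.5)
    scheduler = SequentialLR(optimizer, schedulers=[warmup, decay], milestones=[10])

    for epoch in range(50):
        optimizer.step()
        scheduler.step()   # SequentialLR switches from warmup to decay at epoch 10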