Square Attack:一种基于随机搜索和得分的黑盒攻击方法的原理以及代码实现

论文:Square Attack: a query-efficient black-box adversarial attack via random search

代码:https://github.com/max-andr/square-attack

1 基于分数的黑盒攻击

基于梯度的白盒攻击容易受到梯度混淆(gradient obfuscation)或梯度掩蔽(gradient masking)[1][2]的影响；黑盒攻击在形式上与PGD不太一样，但相比白盒攻击通常需要很多的查询次数，而且性能一般也差一些。基于分数(score-based)的黑盒攻击不访问梯度信息，而是访问分类模型在softmax之前最后一层输出的得分矩阵(logits)，对抗损失也是通过这个得分矩阵计算出来的，并以此衡量对抗效果。

1.2 基本思想

Square Attack是基于随机搜索(random search)的，其核心思想是：每次迭代随机采样一个噪声，加到当前对抗样本上，再送入对抗损失中看对抗效果是否提升；如果提升则保留这一次添加的噪声，如果没有提升则丢弃这一次添加的噪声。

3.h是挑选要添加噪声的窗口， 4.P根据这个窗口随机采样噪声 5.将噪声添加到对抗样本中形成xnew，6.把xnew放到对抗损失中，如果对抗损失值下降了(对抗效果变好)则保留本轮添加的噪声，如果对抗损失值没有下降则丢弃本次添加的噪声

1.3 正方形的随机噪声采样

基于笔记: Square Attack - 知乎 (zhihu.com)

1.4 L∞攻击

1.5 Margin Loss

无目标

$$L(f(x), y) = f_y(x) - \max_{k \neq y} f_k(x)$$

其中 $y$ 是正确标签，$k$ 是错误标签，$f_k(x)$ 是图像 $x$ 经过模型后得到的得分矩阵在第 $k$ 维的值，$f_y(x)$ 是其在第 $y$ 维的值。该损失衡量正确类别得分与最高的错误类别得分之间的差距：损失越小代表对抗效果越好；当损失小于等于0时，说明已有错误类别的得分不低于正确类别，攻击成功(代码中以 margin > 0 判断样本是否仍需继续攻击)。

有目标

2. pytorch代码实现

2.1 main函数

def load():
    """Load the clean sample image and return it as a one-image batch.

    Returns:
        tuple: ``(x, y)``. ``x`` is the image after ``Resize((224, 224))`` and
        ``ToTensor()``, with a leading batch axis added by ``unsqueeze(0)``
        (shape ``[1, 3, 224, 224]``). ``y`` is ``np.array([917])``, the
        hard-coded class label used for this sample.
    """
    image_path = "./干净样本.jpg"
    # Preprocessing: resize to 224x224, then convert to a PyTorch tensor.
    # ToTensor also rescales 8-bit pixel values from 0-255 to 0-1.
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])
    # Use a context manager so the file handle is closed deterministically,
    # and force three channels so a grayscale / RGBA / CMYK file still yields
    # the 3-channel tensor the rest of the pipeline expects.
    with Image.open(image_path) as image:
        x = preprocess(image.convert("RGB"))
    # Add a dimension at axis 0 so the tensor becomes a batch of one image.
    x = x.unsqueeze(0)
    # The label is kept as a NumPy array (not a torch tensor).
    y = np.array([917])
    return x, y
    # Body of the article's "main" snippet; the enclosing header is not part
    # of this excerpt. `y_target`, `n_cls`, `model`, `corr_classified`, `args`,
    # `metrics_path` and `square_attack` are not defined in the visible code
    # and are presumably set up earlier in the original script -- TODO confirm.
    x_test,y_test=load()
    # Convert the dense target label(s) to a one-hot encoding.
    y_target_onehot = utils.dense_to_onehot(y_target, n_cls=n_cls)
    # Run the attack; it returns the per-sample query counts and the
    # adversarial batch.
    n_queries, x_adv = square_attack(model, x_test, y_target_onehot, corr_classified, args.eps, args.n_iter,
                                     args.p, metrics_path, args.targeted, args.loss)
    # Assumes x_adv is a PyTorch tensor (it is used with .squeeze/.byte/.numpy below).
    # NOTE(review): the message says "class after attack" but the prediction is
    # made on x_test (the clean input), not on x_adv -- confirm which is intended.
    print("攻击后图像类别是:{}".format(model.predict(x_test).argmax(1)))

    # Drop the batch axis, rescale from 0-1 to 0-255, then cast to 8-bit
    # unsigned integers with byte().
    x_adv = (x_adv.squeeze(0) * 255).byte()
    # Convert the PyTorch tensor to a NumPy array.
    x_adv = x_adv.numpy()
    # CHW -> HWC, the layout expected by Image.fromarray.
    x_adv=np.transpose(x_adv, (1, 2, 0))
    # Save the adversarial image to disk.
    Image.fromarray(x_adv).save("restored_image.jpg")

    # Report the number of model queries used by the attack.
    print("查询次数为:{}".format(n_queries))

2.2 square_attack

def square_attack_linf(model, x, y, corr_classified, eps, n_iters, p_init, metrics_path, targeted, loss_type):
    """Run the Linf Square Attack (score-based random search over square patches).

    Each iteration re-samples the perturbation inside one random square window
    per image, queries the model once, and keeps the candidate only for the
    samples whose loss decreased.

    Args:
        model: Object exposing ``predict(x)`` (class scores) and
            ``loss(y, logits, targeted, loss_type=...)`` (per-sample loss).
        x: Batch of clean images indexed as ``[n, c, h, w]``.
        y: Labels aligned with ``x``; the caller passes a one-hot encoding.
        corr_classified: Index/mask selecting the samples to attack (the
            ones the model classifies correctly).
        eps: Linf radius; every perturbation entry is either ``-eps`` or ``+eps``.
        n_iters: Iteration budget; the loop runs at most ``n_iters - 1`` times.
        p_init: Initial fraction of pixels to change, passed to ``p_selection``.
        metrics_path: Not used by this function (metrics are collected in a
            local array but are not written to disk here).
        targeted: Forwarded to ``model.loss`` to select targeted/untargeted loss.
        loss_type: Loss used to accept or reject candidates
            (forwarded to ``model.loss``).

    Returns:
        tuple: ``(n_queries, x_best)`` -- the per-sample number of model
        queries and the best adversarial candidates found.
    """
    np.random.seed(0)  # important to leave it here as well
    # Valid pixel range: [0, 1] if the input is already normalised, else [0, 255].
    min_val, max_val = 0, 1 if x.max() <= 1 else 255
    c, h, w = x.shape[1:]
    n_features = c*h*w
    n_ex_total = x.shape[0]  # batch size before filtering, used for `acc`
    # Only attack the samples selected by corr_classified.
    x, y = x[corr_classified], y[corr_classified]

    # [c, 1, w], i.e. vertical stripes work best for untargeted attacks
    # Every entry of the initial perturbation is -eps or +eps.
    init_delta = np.random.choice([-eps, eps], size=[x.shape[0], c, 1, w])
    # x_best holds the current best adversarial candidates: the clean images
    # plus the stripe noise, clipped to the valid pixel range. It is also what
    # is returned at the end.
    x_best = np.clip(x + init_delta, min_val, max_val)

    logits = model.predict(x_best)  # class scores for the initial candidates
    # loss_min drives accept/reject; margin_min decides which samples still
    # need to be fooled (margin > 0 means not yet adversarial).
    loss_min = model.loss(y, logits, targeted, loss_type=loss_type)
    margin_min = model.loss(y, logits, targeted, loss_type='margin_loss')
    n_queries = np.ones(x.shape[0])  # ones because we have already used 1 query
    time_start = time.time()
    # One row of statistics per iteration.
    # NOTE: filled below but neither saved nor returned in this version.
    metrics = np.zeros([n_iters, 7])
    for i_iter in range(n_iters - 1):
        # Restrict this iteration to the samples that are not yet adversarial.
        idx_to_fool = margin_min > 0
        x_curr, x_best_curr, y_curr = x[idx_to_fool], x_best[idx_to_fool], y[idx_to_fool]
        loss_min_curr, margin_min_curr = loss_min[idx_to_fool], margin_min[idx_to_fool]
        # Perturbation accumulated so far for those samples.
        deltas = x_best_curr - x_curr

        # Fraction of pixels to change at this iteration (schedule defined by p_selection).
        p = p_selection(p_init, i_iter, n_iters)
        for i_img in range(x_best_curr.shape[0]):
            # Side length of the square window covering about a fraction p of the pixels.
            s = int(round(np.sqrt(p * n_features / c)))
            s = min(max(s, 1), h-1)  # at least c x 1 x 1 window is taken and at most c x h-1 x h-1
            # Top-left corner of the window (despite the "center_" names, these
            # are used as the slice start below).
            center_h = np.random.randint(0, h - s)
            center_w = np.random.randint(0, w - s)
            # Views of the clean and current-best images inside the window.
            x_curr_window = x_curr[i_img, :, center_h:center_h+s, center_w:center_w+s]
            x_best_curr_window = x_best_curr[i_img, :, center_h:center_h+s, center_w:center_w+s]
            # prevent trying out a delta if it doesn't change x_curr (e.g. an overlapping patch)
            while torch.sum(np.abs(np.clip(x_curr_window + deltas[i_img, :, center_h:center_h+s, center_w:center_w+s], min_val, max_val) - x_best_curr_window) < 10**-7) == c*s*s:
                # Re-sample the window: one value in {-eps, +eps} per channel,
                # broadcast over the whole square.
                deltas[i_img, :, center_h:center_h+s, center_w:center_w+s] = torch.from_numpy(np.random.choice([-eps, eps], size=[c, 1, 1]))
        # Candidate images with the freshly sampled window noise applied.
        x_new = np.clip(x_curr + deltas, min_val, max_val)

        # One model query per still-active sample.
        logits = model.predict(x_new)
        loss = model.loss(y_curr, logits, targeted, loss_type=loss_type)
        margin = model.loss(y_curr, logits, targeted, loss_type='margin_loss')

        # Keep the candidate only where the loss strictly decreased.
        idx_improved = loss < loss_min_curr
        loss_min[idx_to_fool] = idx_improved * loss + ~idx_improved * loss_min_curr
        margin_min[idx_to_fool] = idx_improved * margin + ~idx_improved * margin_min_curr
        # Reshape the per-sample flag so it broadcasts over the image axes.
        idx_improved = np.reshape(idx_improved, [-1, *[1]*len(x.shape[:-1])])
        idx_improved =torch.from_numpy(idx_improved)
        x_best[idx_to_fool] = idx_improved * x_new + ~idx_improved * x_best_curr
        n_queries[idx_to_fool] += 1

        # acc is relative to the full input batch; acc_corr is relative to the
        # attacked (correctly classified) samples.
        acc = (margin_min > 0.0).sum() / n_ex_total
        acc_corr = (margin_min > 0.0).mean()
        # The "_ae" statistics cover only the samples already turned adversarial;
        # the original annotations show nan while there are none.
        mean_nq, mean_nq_ae, median_nq_ae = np.mean(n_queries), np.mean(n_queries[margin_min <= 0]), np.median(n_queries[margin_min <= 0])
        avg_margin_min = np.mean(margin_min)
        time_total = time.time() - time_start
        print('{}: acc={:.2%} acc_corr={:.2%} avg#q_ae={:.2f} med#q={:.1f}, avg_margin={:.2f} (n_ex={}, eps={:.3f}, {:.2f}s)'.format(
            i_iter + 1, acc, acc_corr, mean_nq_ae, median_nq_ae, avg_margin_min, x.shape[0], eps, time_total
        ))

        metrics[i_iter] = [acc, acc_corr, mean_nq, mean_nq_ae, median_nq_ae, margin_min.mean(), time_total]
        # Stop early once every sample has been fooled.
        if acc == 0:
            break

    return n_queries, x_best

2.3 margin loss

    #使用margin_loss损失，logits是经过最后一层softmax之前的得分矩阵[1000],y是正确标签并已经进行了独热编码为[1000]
    #y = utils.dense_to_onehot(y, n_cls=n_cls) 是否是有目标攻击targeted
    def loss(self, y, logits, targeted=False, loss_type='margin_loss'):
        """Compute the per-sample attack loss from class scores.

        Args:
            y: One-hot labels with the same shape as ``logits``
                (``[n, n_classes]``). Boolean, integer, or float one-hot
                arrays are all accepted.
            logits: Class scores of shape ``[n, n_classes]``.
            targeted: If true, ``y`` is the target class and the sign of the
                loss is flipped so that lower is still better for the attacker.
            loss_type: ``'margin_loss'`` (score of class ``y`` minus the best
                other class) or ``'cross_entropy'``.

        Returns:
            1-D array with one loss value per sample.

        Raises:
            ValueError: If ``loss_type`` is not one of the two supported names.
        """
        # The indexing below needs a boolean mask. An int/float one-hot array
        # would be interpreted as row indices and select the wrong entries.
        y = np.asarray(y, dtype=bool)
        if loss_type == 'margin_loss':
            # Score of the labelled class, kept as a column for broadcasting.
            preds_correct_class = (logits * y).sum(1, keepdims=True)
            diff = preds_correct_class - logits  # difference between the correct class and all other classes
            diff[y] = np.inf  # to exclude zeros coming from f_correct - f_correct
            # Smallest gap to any other class, i.e. gap to the strongest competitor.
            margin = diff.min(1, keepdims=True)
            loss = margin * -1 if targeted else margin
        elif loss_type == 'cross_entropy':
            probs = utils.softmax(logits)
            loss = -np.log(probs[y])
            # Untargeted: minimise log p(y) to push the true class down.
            # Targeted: minimise -log p(target).
            loss = loss * -1 if not targeted else loss
        else:
            raise ValueError('Wrong loss.')
        return loss.flatten()

[1]Obfuscated gradients give a false sense of security: Circumventing defenses to adversarial examples

[2]Logit pairing methods can fool gradient-based attacks