LightGBM 多标签分类

2025年03月17日 | 阅读 9 分钟

多标签分类允许每个实例同时分配给多个类别，而不是仅仅一个。例如，在一款音乐推荐系统中，一首歌曲可能被归类为多个流派，如“摇滚”、“蓝调”和“爵士”。相比之下，在传统的分类任务中，一个实例通常被分配给一组互斥类别中的一个。

多标签分类的固有难点在于需要为每个实例同时预测多个标签。因此，需要能够表示各标签之间相关性和相互依赖性的模型。传统技术可能无法有效处理这种复杂性，尤其是在处理大数据集和高维特征空间时。

微软创建了 LightGBM，一个利用基于树的学习技术的梯度提升（gradient boosting）框架。它旨在实现可扩展性、高效性，并能够处理大量数据。LightGBM 通过多种改进实现了这一点，例如按叶子生长（leaf-wise）的策略、对大数据集的有效处理以及基于直方图的决策树学习。基于直方图的技术通过将连续特征值离散化为若干区间（bin）来加速训练过程。其他许多梯度提升实现采用按层（level-wise）的方式构建树，而 LightGBM 采用按叶子（leaf-wise）的方式构建树，每次分裂损失减少最大的叶子，从而提高了准确性。LightGBM 还针对性能进行了优化，使其适用于大型高维数据集。

我们可以利用 LightGBM 在同时处理多个二元分类问题方面的适应性和效率，从而将其修改用于多标签分类。二元相关 (BR)、分类器链 (CC)、标签幂集 (LP) 和集成方法是一些流行技术。最简单的方法是二元相关，其中使用二元分类器独立学习每个标签，该分类器可以有效地使用 LightGBM 进行训练。尽管此方法单独处理每个标签，但可能无法很好地捕捉标签关系。相反，分类器链以类似于链的方式训练二元分类器，使用每个分类器的预测作为后续分类器的额外特征。这使得模型能够识别标签之间的相关性和依赖性，从而提高预测准确性。标签幂集方法将所有可能的标签组合视为一个单独的类别，从而将多标签问题简化为单个多类别问题。因此，可以使用 LightGBM 训练多类别分类器。但是，当存在大量不同标签组合时，此方法可能无法实施。

代码

现在，为了更好地理解这个概念，我们将借助 LightGBM 对鸢尾花（Iris）数据集进行分类。

导入库

 
from pathlib import Path
from typing import List, Tuple, Optional

import numpy as np
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import StratifiedKFold

import lightgbm as lgb

读取数据集

 
# Dataset location: <parent of the current working directory>/input/iris
DATASET_NAME = "iris"
ROOT = Path.cwd().parent
INPUT_ROOT = ROOT.joinpath("input")
RAW_DATA = INPUT_ROOT.joinpath(DATASET_NAME)

# Column names: TARGET is the column that gets encoded and predicted below;
# ID is presumably the row-identifier column of the CSV (not used in this section).
ID, TARGET = "Id", "Species"

# Settings shared by the experiments below: random seed, LightGBM thread
# count and number of cross-validation folds.
RANDOM_SEED, NUM_THREADS, N_FOLD = 0, 4, 5

# Load the whole table and preview its first rows (notebook-style display).
train_all = pd.read_csv(RAW_DATA.joinpath("Iris.csv"))
train_all.head()

现在我们将创建一个自定义多类别对数损失函数和准确性指标，以用于 LightGBM，一个梯度增强框架。

class MultiLoglossForLGBM:
    """Self-made multi-class logloss for LightGBM.

    ``return_grad_and_hess`` and ``return_loss`` both take ``(preds, data)``,
    where ``preds`` is a flat array of ``n_class * n_example`` values laid out
    class by class. They reshape it to ``(n_example, n_class)`` before
    delegating to ``_calc_grad_and_hess`` / ``__call__``.
    """

    def __init__(self, n_class: int=3, use_softmax: bool=True, epsilon: float=1e-32) -> None:
        """Initialize.

        Args:
            n_class: number of classes.
            use_softmax: if True, predictions are treated as logits and turned
                into probabilities with softmax; if False they are used as-is.
            epsilon: softmax output is clipped to ``[epsilon, 1 - epsilon]``
                and the loss never takes the log of a value below ``epsilon``.
        """
        self.name = "my_mlnloss"
        self.n_class = n_class
        self.prob_func = self._get_prob_value if use_softmax else lambda x: x
        self.epsilon = epsilon

    def __call__(self, preds: np.ndarray, labels: np.ndarray, weight: Optional[np.ndarray]=None) -> float:
        """Calc loss.

        Args:
            preds: 2D array (n_example, n_class) of logits, or of probabilities
                when the instance was built with ``use_softmax=False``.
            labels: 1D array (n_example,) of class indices.
            weight: optional 1D array (n_example,) of sample weights.

        Returns:
            The (weighted) mean of the per-example negative log-likelihood.
        """
        # get prob value by softmax (or identity)
        prob = self.prob_func(preds)
        # The identity path is not clipped by `_get_prob_value`; bound it from
        # below here so an exact zero probability cannot produce log(0) -> NaN.
        prob = np.clip(prob, self.epsilon, None)
        # Convert labels (1D array of class indices) to 1-hot
        labels = self._get_1hot_label(labels)
        loss_by_sample = np.sum(- np.log(prob) * labels, axis=1)
        # `weights` must be passed by keyword: the second positional parameter
        # of np.average is `axis`.
        loss = np.average(loss_by_sample, weights=weight)

        return loss

    def _calc_grad_and_hess(
        self, preds: np.ndarray, labels: np.ndarray, weight: Optional[np.ndarray]=None
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Calc Grad and Hess.

        Takes ``preds`` as (n_example, n_class) and ``labels`` as (n_example,)
        class indices; returns two (n_example, n_class) arrays, each row scaled
        by the matching entry of ``weight`` when it is given.
        """
        # get prob value by softmax (or identity)
        prob = self.prob_func(preds)
        # Convert labels (1D array of class indices) to 1-hot
        labels = self._get_1hot_label(labels)

        grad = prob - labels
        hess = prob * (1 - prob)
        if weight is not None:
            grad = grad * weight[:, None]
            hess = hess * weight[:, None]
        return grad, hess

    def return_loss(self, preds: np.ndarray, data: "lgb.Dataset") -> Tuple[str, float, bool]:
        """Return Loss for lightgbm.

        Returns ``(name, loss, False)``; the trailing ``False`` marks the
        metric as lower-is-better.
        """
        labels = data.get_label()
        weight = data.get_weight()
        n_example = len(labels)

        # reshape preds: (n_class * n_example,) => (n_class, n_example) => (n_example, n_class)
        preds = preds.reshape(self.n_class, n_example).T
        # calc loss
        loss = self(preds, labels, weight)

        return self.name, loss, False

    def return_grad_and_hess(self, preds: np.ndarray, data: "lgb.Dataset") -> Tuple[np.ndarray, np.ndarray]:
        """Return Grad and Hess for lightgbm.

        Both returned arrays are flat, with the same class-by-class layout as
        the incoming ``preds``.
        """
        labels = data.get_label()
        weight = data.get_weight()
        n_example = len(labels)

        # reshape preds: (n_class * n_example,) => (n_class, n_example) => (n_example, n_class)
        preds = preds.reshape(self.n_class, n_example).T
        # calc grad and hess.
        grad, hess = self._calc_grad_and_hess(preds, labels, weight)

        # reshape grad, hess: (n_example, n_class) => (n_class, n_example) => (n_class * n_example,)
        grad = grad.T.reshape(n_example * self.n_class)
        hess = hess.T.reshape(n_example * self.n_class)
        return grad, hess

    def _get_prob_value(self, preds: np.ndarray) -> np.ndarray:
        """Convert Margin(Logit) to Prob by Softmax."""
        # Subtract the row-wise max before exponentiating to avoid overflow.
        prob = np.exp(preds - preds.max(axis=1)[:, None])
        prob = prob / prob.sum(axis=1)[:, None]
        prob = np.clip(prob, self.epsilon, 1 - self.epsilon)

        return prob

    def _get_1hot_label(self, labels: np.ndarray) -> np.ndarray:
        """Convert labels to 1hot array of shape (n_example, n_class)."""
        n_example = len(labels)
        ohot = np.zeros((n_example, self.n_class), dtype=int)
        # Labels may arrive as floats; cast them so they can be used as indices.
        ohot[np.arange(n_example), labels.astype(int)] = 1

        return ohot

def multi_class_accuracy_for_lgbm(
    preds: np.ndarray, data: lgb.Dataset, n_class: int=3,
):
    """Weighted multi-class accuracy in LightGBM's custom-metric form.

    ``preds`` is a flat array of ``n_class * n_example`` scores laid out class
    by class; ``data`` supplies the true class indices (``get_label``) and the
    optional sample weights (``get_weight``).

    Returns ``("my_macc", score, True)``; the trailing ``True`` marks the
    metric as higher-is-better.
    """
    y_true = data.get_label()           # (n_example,)
    sample_weight = data.get_weight()   # (n_example,)

    # (n_class * n_example,) => (n_class, n_example) => (n_example, n_class)
    score_matrix = preds.reshape(n_class, len(y_true)).T
    # An example counts as correct when its highest-scoring class is its label.
    is_correct = score_matrix.argmax(axis=1) == y_true

    return "my_macc", np.average(is_correct, weights=sample_weight), True

编码目标

我们将把目标值转换为序数值。

 
# Map every distinct species name to a 0-based integer, numbered in the order
# in which `unique()` returns them.
species_to_index = {
    species: index for index, species in enumerate(train_all.Species.unique())
}
ord_enc = ce.OrdinalEncoder(
    cols=[TARGET],
    mapping=[{"col": TARGET, "mapping": species_to_index}],
)
train_all_multi_class = ord_enc.fit_transform(train_all)
# Show the mapping that was applied (notebook-style display).
ord_enc.category_mapping

输出

拆分数据

 
# Build the feature matrix (columns 1-4) and the target vector (last column)
# from the ordinal-encoded frame.
X_0, y_0 = (
    train_all_multi_class.iloc[:, 1:5].values,
    train_all_multi_class.iloc[:, -1].values,
)

# Stratified, shuffled K-fold; every (train, valid) index pair is kept so that
# later sections can reuse the folds.
kf = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
train_val_splits = list(kf.split(X_0, y_0))

# Only fold 0 is used: one training part and one validation part.
train_index, valid_index = train_val_splits[0]
X_0_tr, X_0_val = X_0[train_index], X_0[valid_index]
y_0_tr, y_0_val = y_0[train_index], y_0[valid_index]

现在我们将使用两种方法来训练 LightGBM 模型以完成多类别分类任务：一种使用自定义多类别对数损失函数，另一种使用 LightGBM 提供的内置多类别目标函数。

 
# Booster parameters. "metric": "None" is set so that only the custom metrics
# passed through `feval` are reported.
MODEL_PARAMS_LGB = dict(
    num_class=3,
    metric="None",
    first_metric_only=True,
    eta=0.01,
    max_depth=-1,
    seed=RANDOM_SEED,
    num_threads=NUM_THREADS,
    verbose=-1,
)
# NOTE(review): `early_stopping_rounds`, `verbose_eval` and `fobj` are passed
# to `lgb.train` as keyword arguments -- confirm the installed LightGBM
# version still accepts them.
FIT_PARAMS_LGB = dict(
    num_boost_round=10000,
    early_stopping_rounds=100,
    verbose_eval=100,
)

# Custom objective/metric object; softmax is enabled, so predictions are
# treated as logits.
my_mlnloss = MultiLoglossForLGBM(n_class=3, use_softmax=True)

lgb_tr = lgb.Dataset(X_0_tr, y_0_tr)
lgb_val = lgb.Dataset(X_0_val, y_0_val)


def _eval_custom_mlnloss(preds, data):
    """Return the custom logloss followed by the multi-class accuracy."""
    return [
        my_mlnloss.return_loss(preds, data),
        multi_class_accuracy_for_lgbm(preds, data),
    ]


model_my_mlnloss = lgb.train(
    params=MODEL_PARAMS_LGB,
    train_set=lgb_tr,
    valid_sets=[lgb_tr, lgb_val],
    valid_names=['train', 'valid'],
    fobj=my_mlnloss.return_grad_and_hess,  # custom gradient / hessian
    feval=_eval_custom_mlnloss,            # custom metrics
    **FIT_PARAMS_LGB,
)

输出

 
# Train with LightGBM's built-in multiclass objective instead of the custom one.
MODEL_PARAMS_LGB = dict(
    objective="multiclass",  # built-in multi-class objective
    num_class=3,
    first_metric_only=True,
    eta=0.01,
    max_depth=-1,
    seed=RANDOM_SEED,
    num_threads=NUM_THREADS,
    verbose=-1,
)
# NOTE(review): `early_stopping_rounds` and `verbose_eval` are passed to
# `lgb.train` as keyword arguments -- confirm the installed LightGBM version
# still accepts them.
FIT_PARAMS_LGB = dict(
    num_boost_round=10000,
    early_stopping_rounds=100,
    verbose_eval=100,
)

# Softmax is switched off here: the metric object uses the predictions it
# receives as probabilities without transforming them.
my_mlnloss = MultiLoglossForLGBM(n_class=3, use_softmax=False)
lgb_tr = lgb.Dataset(X_0_tr, y_0_tr)
lgb_val = lgb.Dataset(X_0_val, y_0_val)


def _eval_builtin_multiclass(preds, data):
    """Return the custom logloss followed by the multi-class accuracy."""
    return [
        my_mlnloss.return_loss(preds, data),
        multi_class_accuracy_for_lgbm(preds, data),
    ]


model_mlnloss = lgb.train(
    params=MODEL_PARAMS_LGB,
    train_set=lgb_tr,
    valid_sets=[lgb_tr, lgb_val],
    valid_names=['train', 'valid'],
    feval=_eval_builtin_multiclass,
    **FIT_PARAMS_LGB,
)

输出

多任务

现在我们将介绍一个自定义数据集类 MultiLabelDatasetForLGBM，专门用于在 LightGBM 中处理多标签数据，以及一个自定义损失函数 MultiMSEForLGBM，用于多任务均方误差。

 
class MultiLabelDatasetForLGBM(lgb.Dataset):
    """
    Makeshift Class for storing multi label.

    The parent ``lgb.Dataset`` receives a dummy 1D label, while the real label
    is kept on the instance and exposed through ``get_multi_label``.

    label: numpy.ndarray (n_example, n_target)
    """

    def __init__(
        self, data, label=None, reference=None, weight=None, group=None, init_score=None, silent=False,
        feature_name='auto', categorical_feature='auto', params=None,  free_raw_data=True
    ):
        """Initialize.

        ``label`` is stored for ``get_multi_label``; every other argument is
        forwarded positionally to ``lgb.Dataset.__init__``.
        """
        # Stays None when no label is supplied, so the parent is built without
        # a label instead of failing on an unassigned local variable.
        dummy_label = None
        if label is not None:
            # make dummy 1D-array with one entry per row of `data`
            dummy_label = np.arange(len(data))

        super().__init__(
            data, dummy_label, reference, weight, group, init_score, silent,
            feature_name, categorical_feature, params, free_raw_data)

        self.mult_label = label

    def get_multi_label(self):
        """Get 2D-array label"""
        return self.mult_label

    def set_multi_label(self, multi_label: np.ndarray):
        """Set 2D-array label and return the dataset itself."""
        self.mult_label = multi_label
        return self

 
class MultiMSEForLGBM:
    """Self-made multi-target squared-error loss for LightGBM.

    ``return_grad_and_hess`` and ``return_loss`` both take ``(preds, data)``,
    where ``preds`` is a flat array of ``n_target * n_example`` values laid out
    target by target and ``data`` provides ``get_multi_label()`` and
    ``get_weight()``.
    """

    def __init__(self, n_target: int=3) -> None:
        """Store the number of targets and the metric name."""
        self.n_target = n_target
        self.name = "my_mmse"

    def __call__(self, preds: np.ndarray, labels: np.ndarray, weight: Optional[np.ndarray]=None) -> float:
        """Return the (weighted) mean over examples of the summed squared error.

        ``preds`` and ``labels`` are both (n_example, n_target).
        """
        residual = preds - labels
        per_example = np.sum(residual * residual, axis=1)
        return np.average(per_example, weights=weight)

    def _calc_grad_and_hess(
        self, preds: np.ndarray, labels: np.ndarray, weight: Optional[np.ndarray]=None
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Return gradient and hessian, each shaped like ``preds``.

        The values are half the derivatives of the squared error in
        ``__call__`` (residual and 1 instead of 2 * residual and 2).
        """
        residual = preds - labels
        curvature = np.ones_like(preds)
        if weight is None:
            return residual, curvature

        # Scale each example's row by its weight.
        per_row = weight[:, None]
        return residual * per_row, curvature * per_row

    def return_loss(self, preds: np.ndarray, data: lgb.Dataset) -> Tuple[str, float, bool]:
        """Return ``(name, loss, False)``; ``False`` marks lower-is-better."""
        targets = data.get_multi_label()   # 2D multi-label stored on the dataset
        sample_weight = data.get_weight()

        # (n_target * n_example,) => (n_target, n_example) => (n_example, n_target)
        pred_matrix = preds.reshape(self.n_target, len(targets)).T

        return self.name, self(pred_matrix, targets, sample_weight), False

    def return_grad_and_hess(self, preds: np.ndarray, data: lgb.Dataset) -> Tuple[np.ndarray, np.ndarray]:
        """Return flat gradient and hessian in the same layout as ``preds``."""
        targets = data.get_multi_label()   # 2D multi-label stored on the dataset
        sample_weight = data.get_weight()
        n_example = len(targets)

        # (n_target * n_example,) => (n_target, n_example) => (n_example, n_target)
        pred_matrix = preds.reshape(self.n_target, n_example).T
        grad, hess = self._calc_grad_and_hess(pred_matrix, targets, sample_weight)

        # (n_example, n_target) => (n_target, n_example) => (n_target * n_example,)
        flat_size = n_example * self.n_target
        return grad.T.reshape(flat_size), hess.T.reshape(flat_size)

 
def multi_class_accuracy_for_lgbm_altered(
    preds: np.ndarray, data: lgb.Dataset, n_class: int=3,
):
    """Weighted accuracy for datasets that carry a 2D multi-label.

    The true class of an example is the argmax of its row in
    ``data.get_multi_label()``; the predicted class is the argmax of its row
    in ``preds`` after reshaping the flat, class-by-class array to
    (n_example, n_class).

    Returns ``("my_macc", score, True)``; the trailing ``True`` marks the
    metric as higher-is-better.
    """
    label_matrix = data.get_multi_label()   # (n_example, n_class)
    sample_weight = data.get_weight()       # (n_example,)

    # (n_class * n_example,) => (n_class, n_example) => (n_example, n_class)
    score_matrix = preds.reshape(n_class, len(label_matrix)).T
    is_correct = score_matrix.argmax(axis=1) == label_matrix.argmax(axis=1)

    return "my_macc", np.average(is_correct, weights=sample_weight), True

编码目标

这里我们换一种方法：对目标列进行独热（one-hot）编码，使每个类别对应一个取值为 0 或 1 的目标列。

 
# One-hot encode the target column; `use_cat_names=True` asks the encoder to
# name the new columns after the category values.
ohot_enc = ce.OneHotEncoder(use_cat_names=True, cols=[TARGET])
train_all_multi_reg = ohot_enc.fit_transform(train_all)

# Preview the encoded frame (notebook-style display).
train_all_multi_reg.head()

输出

拆分数据

 
# Prepare data for the regression task: the feature columns (1-4) and the
# three one-hot encoded target columns (5-7).
X_1 = train_all_multi_reg.iloc[:, 1:5].values
y_1 = train_all_multi_reg.iloc[:, 5:8].values

# Use the same split as multi-class classification (fold 0), so that the
# validation predictions of both approaches cover the same rows when they are
# compared later.
train_index, valid_index = train_val_splits[0]
X_1_tr, y_1_tr = X_1[train_index], y_1[train_index]
X_1_val, y_1_val = X_1[valid_index], y_1[valid_index]

训练

现在我们将再次训练模型。

 
# Booster parameters. 'num_class' is set to the number of target columns and
# "metric": "None" so that only the custom metrics passed through `feval`
# are reported.
MODEL_PARAMS_LGB = {
    'num_class': 3,
    "eta": 0.01,
    "metric": "None",
    "first_metric_only": True,
    "max_depth": -1,
    "seed": RANDOM_SEED,
    "num_threads": NUM_THREADS,
    "verbose": -1
}
# NOTE(review): `early_stopping_rounds`, `verbose_eval` and `fobj` are passed
# to `lgb.train` as keyword arguments -- confirm the installed LightGBM
# version still accepts them.
FIT_PARAMS_LGB = {
    "num_boost_round": 10000,
    "early_stopping_rounds": 100,
    "verbose_eval": 50}

my_mmse = MultiMSEForLGBM(n_target=3)  # custom multi-target squared-error loss

# Datasets that carry the 2D one-hot target next to the features.
lgb_tr = MultiLabelDatasetForLGBM(X_1_tr, y_1_tr)
lgb_val = MultiLabelDatasetForLGBM(X_1_val, y_1_val)

model_my_mmse = lgb.train(
    MODEL_PARAMS_LGB, lgb_tr, **FIT_PARAMS_LGB,
    valid_names=['train', 'valid'], valid_sets=[lgb_tr, lgb_val],
    fobj=my_mmse.return_grad_and_hess,                       # <= custom gradient / hessian
    feval=lambda preds, data: [
        my_mmse.return_loss(preds, data),                    # <= custom loss
        multi_class_accuracy_for_lgbm_altered(preds, data),  # <= multi-class accuracy
    ]
)

输出

 
class MultiMSEAlphaForLGBM:
    """Multi-target squared-error loss with a sum-to-one penalty, for LightGBM.

    On top of the per-target squared error, each example is penalised by
    ``(1 - sum of its predictions) ** 2``, which is zero when the predictions
    of a row sum to 1.

    ``return_grad_and_hess`` and ``return_loss`` both take ``(preds, data)``,
    where ``preds`` is a flat array of ``n_target * n_example`` values laid out
    target by target and ``data`` provides ``get_multi_label()`` and
    ``get_weight()``.
    """

    def __init__(self, n_target: int=3) -> None:
        """Store the number of targets and the metric name."""
        self.n_target = n_target
        self.name = "my_mmse_2"

    def __call__(self, preds: np.ndarray, labels: np.ndarray, weight: Optional[np.ndarray]=None) -> float:
        """Return the (weighted) mean over examples of error plus penalty.

        ``preds`` and ``labels`` are both (n_example, n_target).
        """
        fit_term = np.sum((preds - labels) ** 2, axis=1)
        sum_penalty = (1 - np.sum(preds, axis=1)) ** 2
        return np.average(fit_term + sum_penalty, weights=weight)

    def _calc_grad_and_hess(
        self, preds: np.ndarray, labels: np.ndarray, weight: Optional[np.ndarray]=None
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Return gradient and hessian, each shaped like ``preds``.

        The values are half the derivatives of the loss in ``__call__`` with
        respect to each prediction.
        """
        # Sum of each row's predictions, kept as a column for broadcasting.
        row_total = np.sum(preds, axis=1)[:, None]
        grad = preds - labels - 1 + row_total
        hess = np.full_like(preds, 2)
        if weight is None:
            return grad, hess

        # Scale each example's row by its weight.
        per_row = weight[:, None]
        return grad * per_row, hess * per_row

    def return_loss(self, preds: np.ndarray, data: lgb.Dataset) -> Tuple[str, float, bool]:
        """Return ``(name, loss, False)``; ``False`` marks lower-is-better."""
        targets = data.get_multi_label()   # 2D multi-label stored on the dataset
        sample_weight = data.get_weight()

        # (n_target * n_example,) => (n_target, n_example) => (n_example, n_target)
        pred_matrix = preds.reshape(self.n_target, len(targets)).T

        return self.name, self(pred_matrix, targets, sample_weight), False

    def return_grad_and_hess(self, preds: np.ndarray, data: lgb.Dataset) -> Tuple[np.ndarray, np.ndarray]:
        """Return flat gradient and hessian in the same layout as ``preds``."""
        targets = data.get_multi_label()   # 2D multi-label stored on the dataset
        sample_weight = data.get_weight()
        n_example = len(targets)

        # (n_target * n_example,) => (n_target, n_example) => (n_example, n_target)
        pred_matrix = preds.reshape(self.n_target, n_example).T
        grad, hess = self._calc_grad_and_hess(pred_matrix, targets, sample_weight)

        # (n_example, n_target) => (n_target, n_example) => (n_target * n_example,)
        flat_size = n_example * self.n_target
        return grad.T.reshape(flat_size), hess.T.reshape(flat_size)

 
# Booster parameters. 'num_class' is set to the number of target columns and
# "metric": "None" so that only the custom metrics passed through `feval`
# are reported.
MODEL_PARAMS_LGB = {
    'num_class': 3,
    "eta": 0.01,
    "metric": "None",
    "first_metric_only": True,
    "max_depth": -1,
    "seed": RANDOM_SEED,
    "num_threads": NUM_THREADS,
    "verbose": -1
}
# NOTE(review): `early_stopping_rounds`, `verbose_eval` and `fobj` are passed
# to `lgb.train` as keyword arguments -- confirm the installed LightGBM
# version still accepts them.
FIT_PARAMS_LGB = {
    "num_boost_round": 10000,
    "early_stopping_rounds": 100,
    "verbose_eval": 50}


my_mmse_2 = MultiMSEAlphaForLGBM(n_target=3)  # objective: squared error + sum-to-one penalty
my_mmse = MultiMSEForLGBM(n_target=3)          # plain squared error, reported as a metric only

# Datasets that carry the 2D one-hot target next to the features.
lgb_tr = MultiLabelDatasetForLGBM(X_1_tr, y_1_tr)
lgb_val = MultiLabelDatasetForLGBM(X_1_val, y_1_val)

model_my_mmse_2 = lgb.train(
    MODEL_PARAMS_LGB, lgb_tr, **FIT_PARAMS_LGB,
    valid_names=['train', 'valid'], valid_sets=[lgb_tr, lgb_val],
    fobj=my_mmse_2.return_grad_and_hess,                     # <= custom gradient / hessian
    feval=lambda preds, data: [
        my_mmse_2.return_loss(preds, data),                  # <= loss being optimised
        my_mmse.return_loss(preds, data),                    # <= plain squared-error loss
        multi_class_accuracy_for_lgbm_altered(preds, data),  # <= multi-class accuracy
    ]
)

输出

比较预测

现在将 softmax 函数应用于预测的 logits 以获得类别概率，对于回归任务，预测直接从训练好的模型中获得，无需额外的转换。

 
# Classification model trained with the custom objective: its predictions are
# passed through the softmax helper to obtain class probabilities.
y_pred_val_mlnloss = my_mlnloss._get_prob_value(model_my_mlnloss.predict(X_0_val))

# Regression models: predictions are used as returned, without any transform.
y_pred_val_mmse, y_pred_val_mmse_2 = (
    model_my_mmse.predict(X_1_val),
    model_my_mmse_2.predict(X_1_val),
)

# Preview the first five rows of class probabilities (notebook-style display).
y_pred_val_mlnloss[:5]

输出

下一个主题蒙特卡洛方法

← 上一个下一个 →

Lightbm 多标签分类

导入库

读取数据集

编码目标

拆分数据

多任务

编码目标

拆分数据

训练

比较预测

联系信息

关注我们

教程

面试题

在线编译器

Python

Java

.Net Framework

AI, ML and Data Science

Cloud Technology

B.Tech and MCA

Web Technology

PHP

Software Testing

Technical Interview

Java Interview

Python

Web Interview

Database Interview

B.Tech / MCA

Important Interview

Software Testing Interview

Company Interviews

Online Compilers

Multiple Choice Questions

机器学习

监督式学习

分类

杂项

相关教程

面试题

Lightbm 多标签分类

导入库

读取数据集

编码目标

拆分数据

多任务

编码目标

拆分数据

训练

比较预测

相关帖子

什么是逆强化学习

一对一 (OvO) 多类分类器

递归特征消除

贝叶斯深度学习简介

蒙特卡洛方法

高斯泼溅概述

机器学习在教育领域的应用

使用深度学习结合马尔可夫模型预测用户需求

密度估计

机器学习中的森林覆盖类型预测

订阅 Tpoint Tech

联系信息

关注我们

教程

面试题

在线编译器