author = {Matthieu Geist},
title = {Soft-max boosting},
journal = {Machine Learning},
year = {2015},
volume = {100},
number = {2},
pages = {305--332},
note = {(I discovered after publication that a very similar approach was published some time ago; see ``An iterative method for multi-class cost-sensitive learning'' by Abe, Zadrozny and Langford, KDD'04)},
url = {http://www.metz.supelec.fr//metz/personnel/geist_mat/pdfs/ml_sm_boost_rev.pdf},
abstract = {The standard multi-class classification risk, based on the binary loss, is rarely minimized directly. This is due to (i) the lack of convexity and (ii) the lack of smoothness (and even continuity). The classic approach consists in minimizing a convex surrogate instead. In this paper, we propose to replace the usually considered deterministic decision rule by a stochastic one, which allows obtaining a smooth risk (generalizing the expected binary loss, and more generally the cost-sensitive loss). Practically, this (empirical) risk is minimized by performing a gradient descent in the function space linearly spanned by a base learner (a.k.a. boosting). We provide a convergence analysis of the resulting algorithm and evaluate it experimentally on a range of synthetic and real-world data sets (with noiseless and noisy domains, compared to convex and non-convex boosters).}