@inproceedings{Supelec862,
  author    = {Geist, Matthieu and Klein, Edouard and Piot, Bilal and Guermeur, Yann and Pietquin, Olivier},
  title     = {Around Inverse Reinforcement Learning and Score-based Classification},
  booktitle = {1st Multidisciplinary Conference on Reinforcement Learning and Decision Making ({RLDM} 2013)},
  year      = {2013},
  month     = oct,
  address   = {Princeton (USA)},
  url       = {http://www.metz.supelec.fr//metz/personnel/geist_mat/pdfs/supelec862.pdf},
  abstract  = {Inverse reinforcement learning (IRL) aims at estimating an
unknown reward function optimized by some expert agent from
interactions between this expert and the system to be
controlled. One of its major application fields is imitation
learning, where the goal is to imitate the expert, possibly in
situations not encountered before. A classic and simple way to
handle this problem is to see it as a classification problem,
mapping states to actions. The potential issue with this
approach is that classification does not take naturally into
account the temporal structure of sequential decision making.
Yet, many classification algorithms consist in learning a \textit
{score function}, mapping state-action couples to values, such
that the value of the action chosen by the expert is higher than
the others. The \textit{decision rule} of the classifier
maximizes the score over actions for a given state. This is
curiously reminiscent of the \textit{state-action value
function} in reinforcement learning, and of the associated
\textit{greedy policy}.
Based on this simple statement, we propose two IRL algorithms
that incorporate the structure of the sequential decision making
problem into some classifier in different ways. The first one,
SCIRL (Structured Classification for IRL), starts from the
observation that linearly parameterizing a reward function by
some features imposes a linear parametrization of the Q-function
by a so-called feature expectation. SCIRL simply uses (an
estimate of) the expert feature expectation as the basis
function of the score function. The second algorithm, CSI
(Cascaded Supervised IRL), applies a reversed Bellman equation
(expressing the reward as a function of the Q-function) to the
score function outputted by any score-based classifier, which
reduces to a simple (and generic) regression step. These two
algorithms come with theoretical guarantees and perform
competitively on toy problems.
},
}