author = {Olivier Pietquin and Helen Hastie},
title = {A survey on metrics for the evaluation of user simulations},
journal = {Knowledge Engineering Review},
year = {2013},
volume = {28},
number = {1},
pages = {59--73},
month = feb,
note = {first published as FirstView},
url = {http://www.metz.supelec.fr/metz/personnel/pietquin/pdf/KER_2013_OPHH.pdf},
doi = {10.1017/S0269888912000343},
abstract = {From the mid 90's user simulation has become an important trend of research in the field of spoken dialogue systems (SDS) (Eckert et al., 1997; Zuckerman and Albrecht, 2001; Georgila et al., 2005; Cuay{\'a}huitl et al., 2005; Pietquin, 2006; Schatzmann et al., 2007b; Janarthanam and Lemon, 2009b; Pietquin et al., 2009), because collecting and annotating real human-machine interactions is often expensive and time consuming. Yet, such data are generally required for designing, training and assessing dialogue systems (Levin et al., 2000; Scheffler and Young, 2001; L{\'o}pez-C{\'o}zar et al., 2003; Pietquin and Dutoit, 2006; Schatzmann et al., 2007a). Especially when using machine learning methods for optimising dialogue management strategies such as Reinforcement Learning (RL) (Sutton and Barto, 1998), the amount of data necessary for training is larger than existing corpora. Indeed, exploring the whole dialogue state space and strategy space requires a number of interactions that increases exponentially with the number of states while even simple dialogue systems have continuous state spaces (because of the inclusion of speech recognition and understanding confidence levels into the state description). User simulation is, therefore, necessary to expand data sets. The general goal of a user simulation is thus to produce as many as necessary natural, varied and consistent interactions from as few data as possible. The quality of the user simulation is, therefore, of crucial importance because it dramatically influences the results in terms of SDS performance analysis and learnt strategy (Schatzmann et al., 2005b). Assessment of the quality of simulated dialogues and user simulation methods is an open issue and, although assessment metrics are required, there is no commonly adopted metric (Schatzmann et al., 2005a; Georgila et al., 2006). In this paper, we will first define a list of desired features of a good user simulation metric. 
Secondly, state-of-the-art of metrics described in the literature are presented.}