author = {Lucie Daubigney and Matthieu Geist and Olivier Pietquin},
title = {Off-policy Learning in Large-scale {POMDP}-based Dialogue Systems},
year = {2012},
booktitle = {Proceedings of the 37th IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2012)},
publisher = {IEEE},
pages = {4989--4992},
address = {Kyoto (Japan)},
url = {http://www.metz.supelec.fr//metz/personnel/geist_mat/pdfs/Supelec763.pdf},
abstract = {Reinforcement learning (RL) is now part of the state of the art in the domain of spoken dialogue system (SDS) optimisation. The best-performing RL methods, such as those based on Gaussian processes, require testing small changes in the policy to assess them as improvements or degradations. This process is called on-policy learning. Nevertheless, it can result in system behaviours that are not acceptable to users. Learning algorithms should ideally infer an optimal strategy by observing interactions generated by a non-optimal but acceptable strategy, that is, learn off-policy. Such methods usually fail to scale up and are thus not suited to real-world systems. In this contribution, a sample-efficient, online and off-policy RL algorithm is proposed to learn an optimal policy. This algorithm is combined with a compact non-linear value function representation (namely a multilayer perceptron), enabling it to handle large-scale systems.}