@misc{deyoung2019eraser,
  author        = {DeYoung, Jay and Jain, Sarthak and Rajani, Nazneen Fatema and Lehman, Eric and Xiong, Caiming and Socher, Richard and Wallace, Byron C.},
  title         = {{ERASER}: A Benchmark to Evaluate Rationalized {NLP} Models},
  year          = {2019},
  eprint        = {1911.03429},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
}

@inproceedings{lehman2019inferring,
  author    = {Lehman, Eric and DeYoung, Jay and Barzilay, Regina and Wallace, Byron C.},
  title     = {Inferring Which Medical Treatments Work from Reports of Clinical Trials},
  booktitle = {Proceedings of the North American Chapter of the Association for Computational Linguistics (NAACL)},
  pages     = {3705--3717},
  year      = {2019},
}

@inproceedings{clark2019boolq,
  author    = {Clark, Christopher and Lee, Kenton and Chang, Ming-Wei and Kwiatkowski, Tom and Collins, Michael and Toutanova, Kristina},
  title     = {{BoolQ}: Exploring the Surprising Difficulty of Natural Yes/No Questions},
  booktitle = {Proceedings of the North American Chapter of the Association for Computational Linguistics (NAACL)},
  year      = {2019},
}

@inproceedings{zaidan2007using,
  author    = {Zaidan, Omar and Eisner, Jason and Piatko, Christine},
  title     = {Using ``Annotator Rationales'' to Improve Machine Learning for Text Categorization},
  booktitle = {Proceedings of the North American Chapter of the Association for Computational Linguistics (NAACL)},
  pages     = {260--267},
  year      = {2007},
}

@inproceedings{thorne2018fever,
  author    = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},
  title     = {{FEVER}: A Large-scale Dataset for Fact Extraction and {VERification}},
  booktitle = {Proceedings of the North American Chapter of the Association for Computational Linguistics (NAACL)},
  pages     = {809--819},
  year      = {2018},
}

@inproceedings{KCRUR18-multirc,
  author    = {Khashabi, Daniel and Chaturvedi, Snigdha and Roth, Michael and Upadhyay, Shyam and Roth, Dan},
  title     = {Looking Beyond the Surface: A Challenge Set for Reading Comprehension over Multiple Sentences},
  booktitle = {Proceedings of the North American Chapter of the Association for Computational Linguistics (NAACL)},
  year      = {2018},
  url       = {http://cogcomp.org/papers/2018-MultiRC-NAACL.pdf},
}

@inproceedings{rajani2019explain,
  author    = {Rajani, Nazneen Fatema and McCann, Bryan and Xiong, Caiming and Socher, Richard},
  title     = {Explain Yourself! {Leveraging} Language Models for Commonsense Reasoning},
  booktitle = {Proceedings of the Association for Computational Linguistics (ACL)},
  year      = {2019},
}

@inproceedings{talmor-etal-2019-commonsenseqa,
  author    = {Talmor, Alon and Herzig, Jonathan and Lourie, Nicholas and Berant, Jonathan},
  title     = {{CommonsenseQA}: A Question Answering Challenge Targeting Commonsense Knowledge},
  booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  month     = jun,
  year      = {2019},
  address   = {Minneapolis, Minnesota},
  publisher = {Association for Computational Linguistics},
  url       = {https://www.aclweb.org/anthology/N19-1421},
  doi       = {10.18653/v1/N19-1421},
  pages     = {4149--4158},
  abstract  = {When answering a question, people often draw upon their rich world knowledge in addition to the particular context. Recent work has focused primarily on answering questions given some relevant document or context, and required very little general background. To investigate question answering with prior knowledge, we present CommonsenseQA: a challenging new dataset for commonsense question answering. To capture common sense beyond associations, we extract from ConceptNet (Speer et al., 2017) multiple target concepts that have the same semantic relation to a single source concept. Crowd-workers are asked to author multiple-choice questions that mention the source concept and discriminate in turn between each of the target concepts. This encourages workers to create questions with complex semantics that often require prior knowledge. We create 12,247 questions through this procedure and demonstrate the difficulty of our task with a large number of strong baselines. Our best baseline is based on BERT-large (Devlin et al., 2018) and obtains 56{\%} accuracy, well below human performance, which is 89{\%}.},
}

@inproceedings{camburu2018snli,
  author    = {Camburu, Oana-Maria and Rockt{\"a}schel, Tim and Lukasiewicz, Thomas and Blunsom, Phil},
  title     = {{e-SNLI}: Natural Language Inference with Natural Language Explanations},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {9539--9549},
  year      = {2018},
}