Attacks Against XAI
Recent research has shown a close connection between explanations and adversarial examples.
It is thus not surprising that methods for explaining machine learning have been attacked successfully in a similar setting.
With such input-manipulation attacks, an adversary can effectively deceive explainable machine-learning methods:
an input sample is modified such that it yields a specific target explanation or an uninformative one.
These attacks are tailored towards individual input samples, limiting their reach.
If, however, it were possible to trigger an incorrect or uninformative explanation for any input, an adversary could disguise the reasons for a classifier’s decision and even point towards alternative facts as a red herring on a larger scale.
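To make the idea concrete, the following is a minimal sketch of such an input-manipulation attack against a gradient-based saliency explanation. It is a generic illustration only, not the method of any of the papers listed below: it assumes a differentiable PyTorch classifier, uses a plain input-gradient saliency map, and the helper names (saliency, manipulate_explanation) are hypothetical. The adversary optimizes a small perturbation so that the explanation of the perturbed input matches a chosen target map while the model's output stays close to the original prediction.

import torch
import torch.nn.functional as F

def saliency(model, x):
    # Gradient-based saliency: absolute gradient of the top class score
    # w.r.t. the input; create_graph=True keeps the map differentiable.
    score = model(x).max(dim=1).values.sum()
    grad, = torch.autograd.grad(score, x, create_graph=True)
    return grad.abs()

def manipulate_explanation(model, x, target_map, steps=200, lr=0.01, eps=0.03):
    # Search for a small perturbation delta such that the saliency map of
    # x + delta resembles target_map while the prediction barely changes.
    delta = torch.zeros_like(x, requires_grad=True)
    optimizer = torch.optim.Adam([delta], lr=lr)
    original_logits = model(x).detach()

    for _ in range(steps):
        x_adv = x + delta
        expl_loss = F.mse_loss(saliency(model, x_adv), target_map)
        pred_loss = F.mse_loss(model(x_adv), original_logits)
        loss = expl_loss + pred_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Keep the perturbation within a small L-infinity budget.
        with torch.no_grad():
            delta.clamp_(-eps, eps)

    return (x + delta).detach()

In this sketch, the prediction-consistency term is what turns the manipulation into a disguise rather than an ordinary adversarial example: the classifier's decision is preserved while only the explanation is steered.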
- "Model-Manipulation Attacks Against Black-Box Explanations," ACSAC 2024
(project page, paper, code)
@InProceedings{Hegde2024Model,
  author    = {Achyut Hegde and Maximilian Noppel and Christian Wressnegger},
  booktitle = {Proc. of the 40th Annual Computer Security Applications Conference ({ACSAC})},
  title     = {Model-Manipulation Attacks Against Black-Box Explanations},
  year      = {2024},
  month     = dec,
  day       = {9.-13.}
}
- "Disguising Attacks with Explanation-Aware Backdoors," IEEE S&P 2023
(project page, paper, video, code)
@InProceedings{Noppel2023Disguising,
  author    = {Maximilian Noppel and Lukas Peter and Christian Wressnegger},
  booktitle = {Proc. of the 44th {IEEE} Symposium on Security and Privacy ({S\&P})},
  title     = {Disguising Attacks with Explanation-Aware Backdoors},
  year      = {2023},
  month     = may,
  day       = {22.-25.}
}
- "Poster: Fooling XAI with Explanation-Aware Backdoors," CCS 2023
(poster)
@InProceedings{Noppel2023Poster,
  author    = {Maximilian Noppel and Christian Wressnegger},
  booktitle = {Proc. of the 30th {ACM} Conference on Computer and Communications Security ({CCS})},
  title     = {{Poster}: {F}ooling {XAI} with Explanation-Aware Backdoors},
  year      = {2023},
  month     = nov
}
- "Explanation-Aware Backdoors in a Nutshell," AI 2023
(paper)
@InProceedings{Noppel2023ExplanationAware,
  author    = {Maximilian Noppel and Christian Wressnegger},
  booktitle = {Proc. of the 46th German Conference on Artificial Intelligence},
  title     = {Explanation-Aware Backdoors in a Nutshell},
  year      = {2023},
  month     = sep
}