klinger.bib
@misc{greschner2025categoricalemotionsappraisals,
title = {Categorical Emotions or Appraisals - Which Emotion
Model Explains Argument Convincingness Better?},
author = {Lynn Greschner and Meike Bauer and Sabine Weber and
Roman Klinger},
year = {2025},
eprint = {2511.07162},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
url = {https://arxiv.org/abs/2511.07162},
internaltype = {preprint}
}
@inproceedings{menchaca-resendiz-etal-2025-supporting,
title = {Supporting Plain Language Summarization of
Psychological Meta-Analyses with Large Language
Models},
author = {Menchaca Resendiz, Yarik and Kerwer, Martin and
Chasiotis, Anita and Bodemer, Marlene and
Sassenberg, Kai and Klinger, Roman},
editor = {Liu, Xuebo and Purwarianti, Ayu},
booktitle = {Proceedings of The 14th International Joint
Conference on Natural Language Processing and The
4th Conference of the Asia-Pacific Chapter of the
Association for Computational Linguistics: System
Demonstrations},
month = dec,
year = {2025},
address = {Mumbai, India},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2025.ijcnlp-demo.4/},
pages = {25--35},
isbn = {979-8-89176-301-2},
abstract = {Communicating complex scientific findings to
non-experts remains a major challenge in fields like
psychology, where research is often presented in
highly technical language. One effective way to
improve accessibility for non-experts is through
plain language summaries, which summarize key
insights in simple and understandable
terms. However, the few institutions that produce
lay summaries typically rely on
psychology experts to create them manually {--} an
approach that ensures high quality but requires
significant expertise, time, and effort. In this
paper, we introduce the KLARpsy App, a system
designed to support psychology experts in creating
plain language summaries of psychological
meta-analyses using Large Language Models (LLMs). Our
system generates initial draft summaries based on a
37-criterion guideline developed to ensure clarity
for non-experts. All summaries produced through the
system are manually validated and edited by KLARpsy
authors to ensure factual correctness and
readability. We demonstrate how the system
integrates LLM-generated content into an
expert-in-the-loop workflow. The automatic
evaluation showed a mean semantic-similarity score
of 0.73 against expert-written summaries, and human
evaluation on a 5-point Likert scale averaged above
3 (higher is better), indicating that the generated
drafts are of high quality. The application and code
are open source.},
internaltype = {conferenceproc}
}
@misc{resendiz2025parlpromptbasedagentsreinforcement,
title = {PARL: Prompt-based Agents for Reinforcement
Learning},
author = {Yarik Menchaca Resendiz and Roman Klinger},
year = {2025},
eprint = {2510.21306},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
url = {https://arxiv.org/abs/2510.21306},
internaltype = {preprint}
}
@misc{greschner2025trustmeiconvince,
title = {Trust Me, I Can Convince You: The Contextualized Argument Appraisal Framework},
author = {Lynn Greschner and Sabine Weber and Roman Klinger},
year = {2025},
eprint = {2509.17844},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
url = {https://arxiv.org/abs/2509.17844},
internaltype = {preprint}
}
@inproceedings{li-etal-2025-humans,
title = {Are Humans as Brittle as Large Language Models?},
author = {Li, Jiahui and Papay, Sean and Klinger, Roman},
editor = {Inui, Kentaro and Sakti, Sakriani and Wang, Haofen
and Wong, Derek F. and Bhattacharyya, Pushpak and
Banerjee, Biplab and Ekbal, Asif and Chakraborty,
Tanmoy and Singh, Dhirendra Pratap},
booktitle = {Proceedings of the 14th International Joint
Conference on Natural Language Processing and the
4th Conference of the Asia-Pacific Chapter of the
Association for Computational Linguistics},
month = dec,
year = {2025},
address = {Mumbai, India},
publisher = {The Asian Federation of Natural Language Processing
and The Association for Computational Linguistics},
url = {https://aclanthology.org/2025.ijcnlp-long.116/},
pages = {2130--2155},
isbn = {979-8-89176-298-5},
abstract = {The output of large language models (LLMs) is
unstable, due both to the non-determinism of the
decoding process and to prompt
brittleness. While the intrinsic non-determinism of
LLM generation may mimic existing uncertainty in
human annotations through distributional shifts in
outputs, it is largely assumed, yet unexplored, that
the prompt brittleness effect is unique to
LLMs. This raises the question: do human annotators
show similar sensitivity to prompt changes? If so,
should prompt brittleness in LLMs be considered
problematic? One may alternatively hypothesize that
prompt brittleness correctly reflects human
annotation variances. To fill this research gap, we
systematically compare the effects of prompt
modifications on LLMs and identical instruction
modifications for human annotators, focusing on the
question of whether humans are similarly sensitive
to prompt perturbations. To study this, we prompt
both humans and LLMs for a set of text
classification tasks conditioned on prompt
variations. Our findings indicate that both humans
and LLMs exhibit increased brittleness in response
to specific types of prompt modifications,
particularly those involving the substitution of
alternative label sets or label formats. However,
the distribution of human judgments is less affected
by typographical errors and reversed label order
than that of LLMs.},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
internaltype = {conferenceproc},
eprint = {2509.07869}
}
@inproceedings{schafer-etal-2025-localization,
title = {Localization of {E}nglish Affective Narrative
Generation to {G}erman},
author = {Sch{\"a}fer, Johannes and Weber, Sabine and Klinger,
Roman},
editor = {Wartena, Christian and Heid, Ulrich},
booktitle = {Proceedings of the 21st Conference on Natural
Language Processing (KONVENS 2025): Long and Short
Papers},
month = sep,
year = {2025},
address = {Hannover, Germany},
publisher = {HsH Applied Academics},
url = {https://aclanthology.org/2025.konvens-1.21/},
pages = {241--256},
internaltype = {conferenceproc}
}
@proceedings{nlpsi2025,
title = {Proceedings of the First Workshop on
Integrating NLP and Psychology to Study Social Interactions},
editor = {Aswathy Velutharambath and Sofie Labat and Neele Falk and Flor Miriam Plaza-del-Arco and Roman Klinger and V\'eronique Hoste},
month = jun,
year = {2025},
address = {Copenhagen, Denmark},
publisher = {ICWSM},
internaltype = {edited}
}
@misc{Schaefer2025shaping,
title = {Shaping Event Backstories to Estimate Potential
Emotion Contexts},
author = {Johannes Sch\"afer and Roman Klinger},
year = {2025},
eprint = {2508.09954},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
url = {https://arxiv.org/abs/2508.09954},
internaltype = {preprint}
}
@misc{velutharambath2025deceptiondetectedcrosslinguisticstudy,
title = {What if Deception Cannot be Detected? A
Cross-Linguistic Study on the Limits of Deception
Detection from Text},
author = {Aswathy Velutharambath and Kai
Sassenberg and Roman Klinger},
year = {2025},
eprint = {2505.13147},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
url = {https://arxiv.org/abs/2505.13147},
internaltype = {preprint}
}
@inproceedings{bamnlp2025,
title = {Which Demographics do {LLM}s Default to During
Annotation?},
author = {Sch{\"a}fer, Johannes and Combs, Aidan and Bagdon,
Christopher and Li, Jiahui and Probol, Nadine and
Greschner, Lynn and Papay, Sean and Menchaca
Resendiz, Yarik and Velutharambath, Aswathy and
Wuehrl, Amelie and Weber, Sabine and Klinger, Roman},
editor = {Che, Wanxiang and Nabende, Joyce and Shutova,
Ekaterina and Pilehvar, Mohammad Taher},
booktitle = {Proceedings of the 63rd Annual Meeting of the
Association for Computational Linguistics (Volume 1:
Long Papers)},
month = jul,
year = {2025},
address = {Vienna, Austria},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2025.acl-long.848/},
pages = {17331--17348},
isbn = {979-8-89176-251-0},
abstract = {Demographics and cultural background of annotators
influence the labels they assign in text annotation
{--} for instance, an elderly woman might find it
offensive to read a message addressed to a ``bro'',
but a male teenager might find it appropriate. It is
therefore important to acknowledge label variations
to not under-represent members of a society. Two
research directions developed out of this
observation in the context of using large language
models (LLMs) for data annotation, namely (1)
studying biases and inherent knowledge of LLMs and
(2) injecting diversity in the output by
manipulating the prompt with demographic
information. We combine these two strands of
research and ask to which demographics an LLM
resorts when no demographics are given. To
answer this question, we evaluate which attributes
of human annotators LLMs inherently
mimic. Furthermore, we compare non-demographic
conditioned prompts and placebo-conditioned prompts
(e.g., ``you are an annotator who lives in house
number 5'') to demographics-conditioned prompts
({``}You are a 45 year old man and an expert on
politeness annotation. How do you rate this
instance?''). We study these questions for politeness
and offensiveness annotations on the POPQUORN data
set, a corpus created in a controlled manner to
investigate human label variations based on
demographics, and which has not been used for LLM-based
analyses so far. We observe notable influences
related to gender, race, and age in demographic
prompting, which contrasts with previous studies
that found no such effects.},
internaltype = {conferenceproc},
eprint = {2410.08820},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
}
@inproceedings{bagdon-etal-2025-donate,
title = {Donate or Create? Comparing Data Collection
Strategies for Emotion-labeled Multimodal Social
Media Posts},
author = {Bagdon, Christopher and Combs, Aidan and Silberer,
Carina and Klinger, Roman},
editor = {Che, Wanxiang and Nabende, Joyce and Shutova,
Ekaterina and Pilehvar, Mohammad Taher},
booktitle = {Proceedings of the 63rd Annual Meeting of the
Association for Computational Linguistics (Volume 1:
Long Papers)},
month = jul,
year = {2025},
address = {Vienna, Austria},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2025.acl-long.847/},
pages = {17307--17330},
isbn = {979-8-89176-251-0},
abstract = {Accurate modeling of subjective phenomena such as
emotion expression requires data annotated with
authors' intentions. Commonly such data is collected
by asking study participants to donate and label
genuine content produced in the real world, or
create content fitting particular labels during
the study. Asking participants to create content is
often simpler to implement and presents fewer risks
to participant privacy than data donation. However,
it is unclear if and how study-created content may
differ from genuine content, and how differences may
impact models. We collect study-created and genuine
multimodal social media posts labeled for emotion
and compare them on several dimensions, including
model performance. We find that compared to genuine
posts, study-created posts are longer, rely more on
their text and less on their images for emotion
expression, and focus more on emotion-prototypical
events. The samples of participants willing to
donate versus create posts are demographically
different. Study-created data is valuable to train
models that generalize well to genuine data, but
realistic effectiveness estimates require genuine
data.},
eprint = {2505.24427},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
internaltype = {conferenceproc}
}
@inproceedings{greschner-etal-2025-qolas,
title = {{Q}o{LAS}: A {R}eddit Corpus of Health-Related
Quality of Life Aspects of Mental Disorders},
author = {Greschner, Lynn and W{\"u}hrl, Amelie and Klinger,
Roman},
editor = {Demner-Fushman, Dina and Ananiadou, Sophia and Miwa,
Makoto and Tsujii, Junichi},
booktitle = {Proceedings of the 24th Workshop on Biomedical Language Processing},
month = aug,
year = {2025},
address = {Vienna, Austria},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2025.bionlp-1.18/},
pages = {201--216},
isbn = {979-8-89176-275-6},
abstract = {Quality of Life (QoL) refers to a person{'}s
subjective perception of various aspects of their
life. For medical practitioners, it is one of the
most important concepts for treatment
decisions. Therefore, it is essential to understand
in which aspects a medical condition affects a
patient{'}s subjective perception of their
life. With this paper, we focus on the
under-resourced domain of mental health-related QoL,
and contribute the first corpus to study and model
this concept: We (1) annotate 240 Reddit posts with
a set of 11 QoL aspects (such as `independence',
`mood', or `relationships') and their sentiment
polarity. Based on this novel corpus, we (2)
evaluate a pipeline to detect QoL mentions and
classify them into aspects using open-domain
aspect-based sentiment analysis. We find that users
frequently discuss health-related QoL in their
posts, focusing primarily on the aspects
`relationships' and `self-image'. Our method reliably
predicts such mentions and their sentiment; however,
detecting fine-grained individual aspects remains
challenging. An analysis of a large corpus of
automatically labeled data reveals that social media
content contains novel aspects pertinent to patients
that are not covered by existing QoL taxonomies.},
pdf = {https://www.romanklinger.de/publications/GreschnerKlinger_BioNLP2025.pdf},
internaltype = {workshop}
}
@inproceedings{papay-etal-2025-regular,
title = {Regular-pattern-sensitive {CRF}s for Distant Label
Interactions},
author = {Papay, Sean and Klinger, Roman and Pad{\'o},
Sebastian},
editor = {Fei, Hao and Tu, Kewei and Zhang, Yuhui and Hu,
Xiang and Han, Wenjuan and Jia, Zixia and Zheng,
Zilong and Cao, Yixin and Zhang, Meishan and Lu, Wei
and Siddharth, N. and {\O}vrelid, Lilja and Xue,
Nianwen and Zhang, Yue},
booktitle = {Proceedings of the 1st Joint Workshop on Large
Language Models and Structure Modeling (XLLM 2025)},
month = aug,
year = {2025},
address = {Vienna, Austria},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2025.xllm-1.4/},
pages = {26--35},
isbn = {979-8-89176-286-2},
abstract = {While LLMs have grown popular in sequence labeling,
linear-chain conditional random fields (CRFs) remain
a popular alternative with the ability to directly
model interactions between labels. However, the
Markov assumption limits them to interactions
between adjacent labels. Weighted finite-state
transducers (FSTs), in contrast, can model distant
label{--}label interactions, but exact label
inference is intractable in general. In this work,
we present regular-pattern-sensitive CRFs (RPCRFs),
a method of enriching standard linear-chain CRFs
with the ability to learn long-distance label
interactions through user-specified patterns. This
approach allows users to write regular-expression
label patterns concisely specifying which types of
interactions the model should take into account,
allowing the model to learn from data whether and in
which contexts these patterns occur. The result can
be interpreted alternatively as a CRF augmented with
additional, non-local potentials, or as a
finite-state transducer whose structure is defined
by a set of easily-interpretable patterns.
Critically, exact training and inference are
tractable for many pattern sets. We detail how an
RPCRF can be automatically constructed from a set of
user-specified patterns, and demonstrate the
model{'}s effectiveness on a sequence of three
synthetic sequence modeling datasets.},
eprint = {2411.12484},
archiveprefix = {arXiv},
primaryclass = {cs.LG},
internaltype = {workshop}
}
@misc{resendiz2025llmbasedaffectivetextgeneration,
title = {LLM-based Affective Text Generation Quality Based on
Different Quantization Values},
author = {Yarik Menchaca Resendiz and Roman Klinger},
year = {2025},
eprint = {2501.19317},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
url = {https://arxiv.org/abs/2501.19317},
internaltype = {preprint}
}
@inproceedings{greschner-klinger-2025-fearful,
title = {Fearful Falcons and Angry Llamas: Emotion Category
Annotations of Arguments by Humans and {LLM}s},
author = {Greschner, Lynn and Klinger, Roman},
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and {\"O}hman, Emily
and Bizzoni, Yuri and Miyagawa, So and Alnajjar,
Khalid},
booktitle = {Proceedings of the 5th International Conference on
Natural Language Processing for Digital Humanities},
month = may,
year = {2025},
address = {Albuquerque, USA},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2025.nlp4dh-1.52/},
pages = {628--646},
isbn = {979-8-89176-234-3},
abstract = {Arguments evoke emotions, influencing the effect of
the argument itself. Not only the emotional
intensity but also the category influences the
argument{'}s effects, for instance, the willingness to
adapt stances. While binary emotionality has been
studied in argumentative texts, there is no work on
discrete emotion categories (e.g.,
{\textquoteleft}anger{\textquoteright}) in such data. To fill this
gap, we crowdsource subjective annotations of
emotion categories in a German argument corpus and
evaluate automatic LLM-based labeling
methods. Specifically, we compare three prompting
strategies (zero-shot, one-shot, chain-of-thought)
on three large instruction-tuned language models
(Falcon-7b-instruct, Llama-3.1-8B-instruct,
GPT-4o-mini). We further vary the definition of the
output space to be binary (is there emotionality in
the argument?), closed-domain (which emotion from a
given label set is in the argument?), or open-domain
(which emotion is in the argument?). We find that
emotion categories enhance the prediction of
emotionality in arguments, emphasizing the need for
discrete emotion annotations in arguments. Across
all prompt settings and models, automatic
predictions show a high recall but low precision for
predicting anger and fear, indicating a strong bias
toward negative emotions.},
eprint = {2412.15993},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
internaltype = {conferenceproc}
}
@inproceedings{li-klinger-2025-iprop,
title = {i{P}r{O}p: Interactive Prompt Optimization for Large
Language Models with a Human in the Loop},
author = {Li, Jiahui and Klinger, Roman},
editor = {Zhao, Jin and Wang, Mingyang and Liu, Zhu},
booktitle = {Proceedings of the 63rd Annual Meeting of the
Association for Computational Linguistics (Volume 4:
Student Research Workshop)},
month = jul,
year = {2025},
address = {Vienna, Austria},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2025.acl-srw.18/},
pages = {276--285},
isbn = {979-8-89176-254-1},
abstract = {Prompt engineering has made significant
contributions to the era of large language models,
yet its effectiveness depends on the skills of a
prompt author. This paper introduces
\textit{iPrOp}, a novel interactive prompt
optimization approach, to bridge manual prompt
engineering and automatic prompt optimization while
offering users the flexibility to assess evolving
prompts. We aim to provide users with task-specific
guidance to enhance human engagement in the
optimization process, which is structured through
prompt variations, informative instances,
predictions generated by large language models along
with their corresponding explanations, and relevant
performance metrics. This approach empowers users to
choose and further refine the prompts based on their
individual preferences and needs. It can not only
assist non-technical domain experts in generating
optimal prompts tailored to their specific tasks or
domains, but also enable the study of the intrinsic
parameters that influence the performance of prompt
optimization. The evaluation shows that our approach
has the capability to generate improved prompts,
leading to enhanced task performance.},
eprint = {2412.12644},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
internaltype = {workshop}
}
@inproceedings{menchaca-resendiz-klinger-2025-mopo,
title = {{MOPO}: Multi-Objective Prompt Optimization for
Affective Text Generation},
author = {Menchaca Resendiz, Yarik and Klinger, Roman},
editor = {Rambow, Owen and Wanner, Leo and Apidianaki,
Marianna and Al-Khalifa, Hend and Eugenio, Barbara
Di and Schockaert, Steven},
booktitle = {Proceedings of the 31st International Conference on
Computational Linguistics},
month = jan,
year = {2025},
address = {Abu Dhabi, UAE},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2025.coling-main.375/},
pages = {5588--5606},
eprint = {2412.12948},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
internaltype = {conferenceproc},
abstract = {How emotions are expressed depends on the context
and domain. On X (formerly Twitter), for instance,
an author might simply use the hashtag {\#}anger,
while in a news headline, emotions are typically
written in a more polite, indirect manner. To enable
conditional text generation models to create
emotionally connotated texts that fit a domain,
users need to have access to a parameter that allows
them to choose the appropriate way to express an
emotion. To achieve this, we introduce MOPO, a
Multi-Objective Prompt Optimization
methodology. MOPO optimizes prompts according to
multiple objectives (which correspond here to the
output probabilities assigned by emotion classifiers
trained for different domains). In contrast to
single objective optimization, MOPO outputs a set of
prompts, each with a different weighting of the
multiple objectives. Users can then choose the most
appropriate prompt for their context. We evaluate
MOPO using three objectives, determined by various
domain-specific emotion classifiers. MOPO improves
performance by up to 15 pp across all objectives
with a minimal loss (1{--}2 pp) for any single
objective compared to single-objective
optimization. These minor performance losses are
offset by a broader generalization across multiple
objectives {--} which is not possible with
single-objective optimization. Additionally, MOPO
reduces computational requirements by simultaneously
optimizing for multiple objectives, eliminating
separate optimization procedures for each
objective.}
}
@inproceedings{hofmann-etal-2025-prompt,
title = {Prompt-based Personality Profiling: Reinforcement
Learning for Relevance Filtering},
author = {Hofmann, Jan and Sindermann, Cornelia and Klinger,
Roman},
editor = {Kamalloo, Ehsan and Gontier, Nicolas and Lu, Xing
Han and Dziri, Nouha and Murty, Shikhar and Lacoste,
Alexandre},
booktitle = {Proceedings of the 1st Workshop for Research on
Agent Language Models (REALM 2025)},
month = jul,
year = {2025},
address = {Vienna, Austria},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2025.realm-1.1/},
pages = {1--16},
isbn = {979-8-89176-264-0},
abstract = {Author profiling is the task of inferring
characteristics about individuals by analyzing
content they share. Supervised machine learning
still dominates automatic systems that perform this
task, despite the popularity of prompting large
language models to address natural language
understanding tasks. One reason is that the
classification instances consist of large amounts of
posts, potentially a whole user profile, which may
exceed the input length of Transformers. Even if a
model can use a large context window, the entirety
of posts makes the application of API-accessed black
box systems costly and slow, in addition to issues that
come with such ``needle-in-the-haystack'' tasks. To
mitigate this limitation, we propose a new method
for author profiling which aims at distinguishing
relevant from irrelevant content first, followed by
the actual user profiling only with relevant
data. To circumvent the need for relevance-annotated
data, we optimize this relevance filter via
reinforcement learning with a reward function that
utilizes the zero-shot capabilities of large
language models. We evaluate our method for Big Five
personality trait prediction on two Twitter
corpora. On publicly available real-world data with
a skewed label distribution, our method shows
similar efficacy to using all posts in a user
profile, but with a substantially shorter
context. An evaluation on a version of these data
balanced with artificial posts shows that the
filtering to relevant posts leads to a significantly
improved accuracy of the predictions.},
eprint = {2409.04122},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
internaltype = {workshop}
}