klinger.bib
@inproceedings{Schaefer2025,
author = {Johannes Sch\"afer and Sabine Weber and Roman Klinger},
title = {Localization of English Affective Narrative Generation to German},
booktitle = {Proceedings of the 21st Conference on Natural
Language Processing (KONVENS 2025)},
year = {2025},
note = {accepted},
internaltype = {conferenceproc}
}
@proceedings{nlpsi2025,
title = {Proceedings of the First Workshop on
Integrating NLP and Psychology to Study Social Interactions},
editor = {Aswathy Velutharambath and Sofie Labat and Neele Falk and Flor Miriam Plaza-del-Arco and Roman Klinger and V\'eronique Hoste},
month = jun,
year = {2025},
address = {Copenhagen, Denmark},
publisher = {ICWSM},
internaltype = {edited}
}
@misc{velutharambath2025deceptiondetectedcrosslinguisticstudy,
title = {What if Deception Cannot be Detected? A
Cross-Linguistic Study on the Limits of Deception
Detection from Text},
author = {Aswathy Velutharambath and Kai
Sassenberg and Roman Klinger},
year = {2025},
eprint = {2505.13147},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
url = {https://arxiv.org/abs/2505.13147},
internaltype = {preprint}
}
@inproceedings{bamnlp2025,
title = {Which Demographics do {LLM}s Default to During
Annotation?},
author = {Sch{\"a}fer, Johannes and Combs, Aidan and Bagdon,
Christopher and Li, Jiahui and Probol, Nadine and
Greschner, Lynn and Papay, Sean and Menchaca
Resendiz, Yarik and Velutharambath, Aswathy and
Wuehrl, Amelie and Weber, Sabine and Klinger, Roman},
editor = {Che, Wanxiang and Nabende, Joyce and Shutova,
Ekaterina and Pilehvar, Mohammad Taher},
booktitle = {Proceedings of the 63rd Annual Meeting of the
Association for Computational Linguistics (Volume 1:
Long Papers)},
month = jul,
year = {2025},
address = {Vienna, Austria},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2025.acl-long.848/},
pages = {17331--17348},
isbn = {979-8-89176-251-0},
abstract = {Demographics and cultural background of annotators
influence the labels they assign in text annotation
{--} for instance, an elderly woman might find it
offensive to read a message addressed to a ``bro'',
but a male teenager might find it appropriate. It is
therefore important to acknowledge label variations
to not under-represent members of a society. Two
research directions developed out of this
observation in the context of using large language
models (LLM) for data annotations, namely (1)
studying biases and inherent knowledge of LLMs and
(2) injecting diversity in the output by
manipulating the prompt with demographic
information. We combine these two strands of
research and ask which demographics an LLM resorts
to when no demographics are given. To
answer this question, we evaluate which attributes
of human annotators LLMs inherently
mimic. Furthermore, we compare non-demographic
conditioned prompts and placebo-conditioned prompts
(e.g., ``you are an annotator who lives in house
number 5'') to demographics-conditioned prompts
({``}You are a 45 year old man and an expert on
politeness annotation. How do you rate
instance''). We study these questions for politeness
and offensiveness annotations on the POPQUORN data
set, a corpus created in a controlled manner to
investigate human label variations based on
demographics which has not been used for LLM-based
analyses so far. We observe notable influences
related to gender, race, and age in demographic
prompting, which contrasts with previous studies
that found no such effects.},
internaltype = {conferenceproc},
eprint = {2410.08820},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
}
@inproceedings{bagdon-etal-2025-donate,
title = {Donate or Create? Comparing Data Collection
Strategies for Emotion-labeled Multimodal Social
Media Posts},
author = {Bagdon, Christopher and Combs, Aidan and Silberer,
Carina and Klinger, Roman},
editor = {Che, Wanxiang and Nabende, Joyce and Shutova,
Ekaterina and Pilehvar, Mohammad Taher},
booktitle = {Proceedings of the 63rd Annual Meeting of the
Association for Computational Linguistics (Volume 1:
Long Papers)},
month = jul,
year = {2025},
address = {Vienna, Austria},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2025.acl-long.847/},
pages = {17307--17330},
isbn = {979-8-89176-251-0},
abstract = {Accurate modeling of subjective phenomena such as
emotion expression requires data annotated with
authors' intentions. Commonly such data is collected
by asking study participants to donate and label
genuine content produced in the real world, or
create content fitting particular labels during
the study. Asking participants to create content is
often simpler to implement and presents fewer risks
to participant privacy than data donation. However,
it is unclear if and how study-created content may
differ from genuine content, and how differences may
impact models. We collect study-created and genuine
multimodal social media posts labeled for emotion
and compare them on several dimensions, including
model performance. We find that compared to genuine
posts, study-created posts are longer, rely more on
their text and less on their images for emotion
expression, and focus more on emotion-prototypical
events. The samples of participants willing to
donate versus create posts are demographically
different. Study-created data is valuable to train
models that generalize well to genuine data, but
realistic effectiveness estimates require genuine
data.},
eprint = {2505.24427},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
internaltype = {conferenceproc}
}
@inproceedings{greschner-etal-2025-qolas,
title = {{Q}o{LAS}: A {R}eddit Corpus of Health-Related
Quality of Life Aspects of Mental Disorders},
author = {Greschner, Lynn and W{\"u}hrl, Amelie and Klinger,
Roman},
editor = {Demner-Fushman, Dina and Ananiadou, Sophia and Miwa,
Makoto and Tsujii, Junichi},
booktitle = {Proceedings of the 24th Workshop on Biomedical
Natural Language Processing},
month = aug,
year = {2025},
address = {Vienna, Austria},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2025.bionlp-1.18/},
pages = {201--216},
isbn = {979-8-89176-275-6},
abstract = {Quality of Life (QoL) refers to a person{'}s
subjective perception of various aspects of their
life. For medical practitioners, it is one of the
most important concepts for treatment
decisions. Therefore, it is essential to understand
in which aspects a medical condition affects a
patient{'}s subjective perception of their
life. With this paper, we focus on the
under-resourced domain of mental health-related QoL,
and contribute the first corpus to study and model
this concept: We (1) annotate 240 Reddit posts with
a set of 11 QoL aspects (such as `independence',
`mood', or `relationships') and their sentiment
polarity. Based on this novel corpus, we (2)
evaluate a pipeline to detect QoL mentions and
classify them into aspects using open-domain
aspect-based sentiment analysis. We find that users
frequently discuss health-related QoL in their
posts, focusing primarily on the aspects
`relationships' and `self-image'. Our method reliably
predicts such mentions and their sentiment, however,
detecting fine-grained individual aspects remains
challenging. An analysis of a large corpus of
automatically labeled data reveals that social media
content contains novel aspects pertinent to patients
that are not covered by existing QoL taxonomies.},
pdf = {https://www.romanklinger.de/publications/GreschnerKlinger_BioNLP2025.pdf},
internaltype = {workshop}
}
@inproceedings{papay-etal-2025-regular,
title = {Regular-pattern-sensitive {CRF}s for Distant Label
Interactions},
author = {Papay, Sean and Klinger, Roman and Pad{\'o},
Sebastian},
editor = {Fei, Hao and Tu, Kewei and Zhang, Yuhui and Hu,
Xiang and Han, Wenjuan and Jia, Zixia and Zheng,
Zilong and Cao, Yixin and Zhang, Meishan and Lu, Wei
and Siddharth, N. and {\O}vrelid, Lilja and Xue,
Nianwen and Zhang, Yue},
booktitle = {Proceedings of the 1st Joint Workshop on Large
Language Models and Structure Modeling (XLLM 2025)},
month = aug,
year = {2025},
address = {Vienna, Austria},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2025.xllm-1.4/},
pages = {26--35},
isbn = {979-8-89176-286-2},
abstract = {While LLMs have grown popular in sequence labeling,
linear-chain conditional random fields (CRFs) remain
a popular alternative with the ability to directly
model interactions between labels. However, the
Markov assumption limits them to interactions
between adjacent labels. Weighted finite-state
transducers (FSTs), in contrast, can model distant
label{--}label interactions, but exact label
inference is intractable in general. In this work,
we present regular-pattern-sensitive CRFs (RPCRFs),
a method of enriching standard linear-chain CRFs
with the ability to learn long-distance label
interactions through user-specified patterns. This
approach allows users to write regular-expression
label patterns concisely specifying which types of
interactions the model should take into account,
allowing the model to learn from data whether and in
which contexts these patterns occur. The result can
be interpreted alternatively as a CRF augmented with
additional, non-local potentials, or as a
finite-state transducer whose structure is defined
by a set of easily-interpretable patterns.
Critically, exact training and inference are
tractable for many pattern sets. We detail how an
RPCRF can be automatically constructed from a set of
user-specified patterns, and demonstrate the
model{'}s effectiveness on a sequence of three
synthetic sequence modeling datasets.},
eprint = {2411.12484},
archiveprefix = {arXiv},
primaryclass = {cs.LG},
internaltype = {workshop}
}
@misc{resendiz2025llmbasedaffectivetextgeneration,
title = {LLM-based Affective Text Generation Quality Based on
Different Quantization Values},
author = {Yarik Menchaca Resendiz and Roman Klinger},
year = {2025},
eprint = {2501.19317},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
url = {https://arxiv.org/abs/2501.19317},
internaltype = {preprint}
}
@inproceedings{greschner-klinger-2025-fearful,
title = {Fearful Falcons and Angry Llamas: Emotion Category
Annotations of Arguments by Humans and {LLM}s},
author = {Greschner, Lynn and Klinger, Roman},
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and {\"O}hman, Emily
and Bizzoni, Yuri and Miyagawa, So and Alnajjar,
Khalid},
booktitle = {Proceedings of the 5th International Conference on
Natural Language Processing for Digital Humanities},
month = may,
year = {2025},
address = {Albuquerque, USA},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2025.nlp4dh-1.52/},
pages = {628--646},
isbn = {979-8-89176-234-3},
abstract = {Arguments evoke emotions, influencing the effect of
the argument itself. Not only the emotional
intensity but also the category influences the
argument's effects, for instance, the willingness to
adapt stances. While binary emotionality has been
studied in argumentative texts, there is no work on
discrete emotion categories (e.g.,
{\textquoteleft}anger') in such data. To fill this
gap, we crowdsource subjective annotations of
emotion categories in a German argument corpus and
evaluate automatic LLM-based labeling
methods. Specifically, we compare three prompting
strategies (zero-shot, one-shot, chain-of-thought)
on three large instruction-tuned language models
(Falcon-7b-instruct, Llama-3.1-8B-instruct,
GPT-4o-mini). We further vary the definition of the
output space to be binary (is there emotionality in
the argument?), closed-domain (which emotion from a
given label set is in the argument?), or open-domain
(which emotion is in the argument?). We find that
emotion categories enhance the prediction of
emotionality in arguments, emphasizing the need for
discrete emotion annotations in arguments. Across
all prompt settings and models, automatic
predictions show a high recall but low precision for
predicting anger and fear, indicating a strong bias
toward negative emotions.},
eprint = {2412.15993},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
internaltype = {conferenceproc}
}
@inproceedings{li-klinger-2025-iprop,
title = {i{P}r{O}p: Interactive Prompt Optimization for Large
Language Models with a Human in the Loop},
author = {Li, Jiahui and Klinger, Roman},
editor = {Zhao, Jin and Wang, Mingyang and Liu, Zhu},
booktitle = {Proceedings of the 63rd Annual Meeting of the
Association for Computational Linguistics (Volume 4:
Student Research Workshop)},
month = jul,
year = {2025},
address = {Vienna, Austria},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2025.acl-srw.18/},
pages = {276--285},
isbn = {979-8-89176-254-1},
abstract = {Prompt engineering has made significant
contributions to the era of large language models,
yet its effectiveness depends on the skills of a
prompt author. This paper introduces
$\textit{iPrOp}$, a novel interactive prompt
optimization approach, to bridge manual prompt
engineering and automatic prompt optimization while
offering users the flexibility to assess evolving
prompts. We aim to provide users with task-specific
guidance to enhance human engagement in the
optimization process, which is structured through
prompt variations, informative instances,
predictions generated by large language models along
with their corresponding explanations, and relevant
performance metrics. This approach empowers users to
choose and further refine the prompts based on their
individual preferences and needs. It can not only
assist non-technical domain experts in generating
optimal prompts tailored to their specific tasks or
domains, but also enable the study of the intrinsic
parameters that influence the performance of prompt
optimization. The evaluation shows that our approach
has the capability to generate improved prompts,
leading to enhanced task performance.},
eprint = {2412.12644},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
internaltype = {workshop}
}
@inproceedings{menchaca-resendiz-klinger-2025-mopo,
title = {{MOPO}: Multi-Objective Prompt Optimization for
Affective Text Generation},
author = {Menchaca Resendiz, Yarik and Klinger, Roman},
editor = {Rambow, Owen and Wanner, Leo and Apidianaki,
Marianna and Al-Khalifa, Hend and Eugenio, Barbara
Di and Schockaert, Steven},
booktitle = {Proceedings of the 31st International Conference on
Computational Linguistics},
month = jan,
year = {2025},
address = {Abu Dhabi, UAE},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2025.coling-main.375/},
pages = {5588--5606},
eprint = {2412.12948},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
internaltype = {conferenceproc},
abstract = {How emotions are expressed depends on the context
and domain. On X (formerly Twitter), for instance,
an author might simply use the hashtag {\#}anger,
while in a news headline, emotions are typically
written in a more polite, indirect manner. To enable
conditional text generation models to create
emotionally connotated texts that fit a domain,
users need to have access to a parameter that allows
them to choose the appropriate way to express an
emotion. To achieve this, we introduce MOPO, a
Multi-Objective Prompt Optimization
methodology. MOPO optimizes prompts according to
multiple objectives (which correspond here to the
output probabilities assigned by emotion classifiers
trained for different domains). In contrast to
single objective optimization, MOPO outputs a set of
prompts, each with a different weighting of the
multiple objectives. Users can then choose the most
appropriate prompt for their context. We evaluate
MOPO using three objectives, determined by various
domain-specific emotion classifiers. MOPO improves
performance by up to 15 pp across all objectives
with a minimal loss (1{--}2 pp) for any single
objective compared to single-objective
optimization. These minor performance losses are
offset by a broader generalization across multiple
objectives {--} which is not possible with
single-objective optimization. Additionally, MOPO
reduces computational requirements by simultaneously
optimizing for multiple objectives, eliminating
separate optimization procedures for each
objective.}
}
@inproceedings{hofmann-etal-2025-prompt,
title = {Prompt-based Personality Profiling: Reinforcement
Learning for Relevance Filtering},
author = {Hofmann, Jan and Sindermann, Cornelia and Klinger,
Roman},
editor = {Kamalloo, Ehsan and Gontier, Nicolas and Lu, Xing
Han and Dziri, Nouha and Murty, Shikhar and Lacoste,
Alexandre},
booktitle = {Proceedings of the 1st Workshop for Research on
Agent Language Models (REALM 2025)},
month = jul,
year = {2025},
address = {Vienna, Austria},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2025.realm-1.1/},
pages = {1--16},
isbn = {979-8-89176-264-0},
abstract = {Author profiling is the task of inferring
characteristics about individuals by analyzing
content they share. Supervised machine learning
still dominates automatic systems that perform this
task, despite the popularity of prompting large
language models to address natural language
understanding tasks. One reason is that the
classification instances consist of large amounts of
posts, potentially a whole user profile, which may
exceed the input length of Transformers. Even if a
model can use a large context window, the entirety
of posts makes the application of API-accessed black
box systems costly and slow, next to issues which
come with such ``needle-in-the-haystack'' tasks. To
mitigate this limitation, we propose a new method
for author profiling which aims at distinguishing
relevant from irrelevant content first, followed by
the actual user profiling only with relevant
data. To circumvent the need for relevance-annotated
data, we optimize this relevance filter via
reinforcement learning with a reward function that
utilizes the zero-shot capabilities of large
language models. We evaluate our method for Big Five
personality trait prediction on two Twitter
corpora. On publicly available real-world data with
a skewed label distribution, our method shows
similar efficacy to using all posts in a user
profile, but with a substantially shorter
context. An evaluation on a version of these data
balanced with artificial posts shows that the
filtering to relevant posts leads to a significantly
improved accuracy of the predictions.},
eprint = {2409.04122},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
internaltype = {workshop}
}