klinger.bib

@inproceedings{Schaefer2025,
  author = {Johannes Sch\"afer and Sabine Weber and Roman Klinger},
  title = {Localization of English Affective Narrative Generation to German},
  booktitle = {Proceedings of the 21st Conference on Natural
                  Language Processing (KONVENS 2025)},
  year = {2025},
  note = {accepted},
  internaltype = {conferenceproc}
}
@proceedings{nlpsi2025,
  title = {Proceedings of the First Workshop on Integrating
                   NLP and Psychology to Study Social Interactions},
  editor = {Aswathy Velutharambath and Sofie Labat and Neele Falk and Flor Miriam Plaza-del-Arco and Roman Klinger and V\'eronique Hoste},
  month = jun,
  year = {2025},
  address = {Copenhagen, Denmark},
  publisher = {ICWSM},
  internaltype = {edited}
}
@misc{velutharambath2025deceptiondetectedcrosslinguisticstudy,
  title = {What if Deception Cannot be Detected? A
                  Cross-Linguistic Study on the Limits of Deception
                  Detection from Text},
  author = {Aswathy Velutharambath and Kai
                  Sassenberg and Roman Klinger},
  year = {2025},
  eprint = {2505.13147},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2505.13147},
  internaltype = {preprint}
}
@inproceedings{bamnlp2025,
  title = {Which Demographics do {LLM}s Default to During
                  Annotation?},
  author = {Sch{\"a}fer, Johannes and Combs, Aidan and Bagdon,
                  Christopher and Li, Jiahui and Probol, Nadine and
                  Greschner, Lynn and Papay, Sean and Menchaca
                  Resendiz, Yarik and Velutharambath, Aswathy and
                  Wuehrl, Amelie and Weber, Sabine and Klinger, Roman},
  editor = {Che, Wanxiang and Nabende, Joyce and Shutova,
                  Ekaterina and Pilehvar, Mohammad Taher},
  booktitle = {Proceedings of the 63rd Annual Meeting of the
                  Association for Computational Linguistics (Volume 1:
                  Long Papers)},
  month = jul,
  year = {2025},
  address = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2025.acl-long.848/},
  pages = {17331--17348},
  isbn = {979-8-89176-251-0},
  abstract = {Demographics and cultural background of annotators
                  influence the labels they assign in text annotation
                  {--} for instance, an elderly woman might find it
                  offensive to read a message addressed to a ``bro'',
                  but a male teenager might find it appropriate. It is
                  therefore important to acknowledge label variations
                  to not under-represent members of a society. Two
                  research directions developed out of this
                  observation in the context of using large language
                  models (LLM) for data annotations, namely (1)
                  studying biases and inherent knowledge of LLMs and
                  (2) injecting diversity in the output by
                  manipulating the prompt with demographic
                  information. We combine these two strands of
                   research and ask to which demographics an LLM
                   resorts when no demographics are given. To
                  answer this question, we evaluate which attributes
                  of human annotators LLMs inherently
                  mimic. Furthermore, we compare non-demographic
                  conditioned prompts and placebo-conditioned prompts
                  (e.g., ``you are an annotator who lives in house
                  number 5'') to demographics-conditioned prompts
                  ({``}You are a 45 year old man and an expert on
                  politeness annotation. How do you rate
                  instance''). We study these questions for politeness
                  and offensiveness annotations on the POPQUORN data
                  set, a corpus created in a controlled manner to
                  investigate human label variations based on
                  demographics which has not been used for LLM-based
                  analyses so far. We observe notable influences
                  related to gender, race, and age in demographic
                  prompting, which contrasts with previous studies
                  that found no such effects.},
  internaltype = {conferenceproc},
  eprint = {2410.08820},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL}
}
@inproceedings{bagdon-etal-2025-donate,
  title = {Donate or Create? Comparing Data Collection
                  Strategies for Emotion-labeled Multimodal Social
                  Media Posts},
  author = {Bagdon, Christopher and Combs, Aidan and Silberer,
                  Carina and Klinger, Roman},
  editor = {Che, Wanxiang and Nabende, Joyce and Shutova,
                  Ekaterina and Pilehvar, Mohammad Taher},
  booktitle = {Proceedings of the 63rd Annual Meeting of the
                  Association for Computational Linguistics (Volume 1:
                  Long Papers)},
  month = jul,
  year = {2025},
  address = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2025.acl-long.847/},
  pages = {17307--17330},
  isbn = {979-8-89176-251-0},
  abstract = {Accurate modeling of subjective phenomena such as
                  emotion expression requires data annotated with
                  authors' intentions. Commonly such data is collected
                  by asking study participants to donate and label
                  genuine content produced in the real world, or
                   create content fitting particular labels during
                  the study. Asking participants to create content is
                  often simpler to implement and presents fewer risks
                  to participant privacy than data donation. However,
                  it is unclear if and how study-created content may
                  differ from genuine content, and how differences may
                  impact models. We collect study-created and genuine
                  multimodal social media posts labeled for emotion
                   and compare them on several dimensions, including
                  model performance. We find that compared to genuine
                  posts, study-created posts are longer, rely more on
                  their text and less on their images for emotion
                  expression, and focus more on emotion-prototypical
                  events. The samples of participants willing to
                  donate versus create posts are demographically
                  different. Study-created data is valuable to train
                  models that generalize well to genuine data, but
                  realistic effectiveness estimates require genuine
                  data.},
  eprint = {2505.24427},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  internaltype = {conferenceproc}
}
@inproceedings{greschner-etal-2025-qolas,
  title = {{Q}o{LAS}: A {R}eddit Corpus of Health-Related
                  Quality of Life Aspects of Mental Disorders},
  author = {Greschner, Lynn and W{\"u}hrl, Amelie and Klinger,
                  Roman},
  editor = {Demner-Fushman, Dina and Ananiadou, Sophia and Miwa,
                  Makoto and Tsujii, Junichi},
  booktitle = {Proceedings of the 24th Workshop on Biomedical
                   Natural Language Processing (BioNLP 2025)},
  month = aug,
  year = {2025},
  address = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2025.bionlp-1.18/},
  pages = {201--216},
  isbn = {979-8-89176-275-6},
  abstract = {Quality of Life (QoL) refers to a person{'}s
                  subjective perception of various aspects of their
                  life. For medical practitioners, it is one of the
                  most important concepts for treatment
                  decisions. Therefore, it is essential to understand
                  in which aspects a medical condition affects a
                  patient{'}s subjective perception of their
                  life. With this paper, we focus on the
                  under-resourced domain of mental health-related QoL,
                  and contribute the first corpus to study and model
                  this concept: We (1) annotate 240 Reddit posts with
                  a set of 11 QoL aspects (such as `independence',
                  `mood', or `relationships') and their sentiment
                  polarity. Based on this novel corpus, we (2)
                  evaluate a pipeline to detect QoL mentions and
                  classify them into aspects using open-domain
                  aspect-based sentiment analysis. We find that users
                  frequently discuss health-related QoL in their
                  posts, focusing primarily on the aspects
                  `relationships' and `selfimage'. Our method reliably
                   predicts such mentions and their sentiment;
                   however,
                  detecting fine-grained individual aspects remains
                  challenging. An analysis of a large corpus of
                  automatically labeled data reveals that social media
                  content contains novel aspects pertinent to patients
                  that are not covered by existing QoL taxonomies.},
  pdf = {https://www.romanklinger.de/publications/GreschnerKlinger_BioNLP2025.pdf},
  internaltype = {workshop}
}
@inproceedings{papay-etal-2025-regular,
  title = {Regular-pattern-sensitive {CRF}s for Distant Label
                  Interactions},
  author = {Papay, Sean and Klinger, Roman and Pad{\'o},
                  Sebastian},
  editor = {Fei, Hao and Tu, Kewei and Zhang, Yuhui and Hu,
                  Xiang and Han, Wenjuan and Jia, Zixia and Zheng,
                  Zilong and Cao, Yixin and Zhang, Meishan and Lu, Wei
                  and Siddharth, N.  and {\O}vrelid, Lilja and Xue,
                  Nianwen and Zhang, Yue},
  booktitle = {Proceedings of the 1st Joint Workshop on Large
                  Language Models and Structure Modeling (XLLM 2025)},
  month = aug,
  year = {2025},
  address = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2025.xllm-1.4/},
  pages = {26--35},
  isbn = {979-8-89176-286-2},
  abstract = {While LLMs have grown popular in sequence labeling,
                   linear-chain conditional random fields (CRFs)
                   remain a popular alternative with the ability to
                   directly model interactions between
                   labels. However, the Markov assumption limits them
                   to interactions between adjacent labels. Weighted
                   finite-state transducers (FSTs), in contrast, can
                   model distant label{--}label interactions, but
                   exact label inference is intractable in
                   general. In this work, we present
                   regular-pattern-sensitive CRFs (RPCRFs), a method
                   of enriching standard linear-chain CRFs with the
                   ability to learn long-distance label interactions
                   through user-specified patterns. This approach
                   allows users to write regular-expression label
                   patterns concisely specifying which types of
                   interactions the model should take into account,
                   allowing the model to learn from data whether and
                   in which contexts these patterns occur. The result
                   can be interpreted alternatively as a CRF
                   augmented with additional, non-local potentials,
                   or as a finite-state transducer whose structure is
                   defined by a set of easily-interpretable
                   patterns. Critically, exact training and inference
                   are tractable for many pattern sets. We detail how
                   an RPCRF can be automatically constructed from a
                   set of user-specified patterns, and demonstrate
                   the model{'}s effectiveness on a sequence of three
                   synthetic sequence modeling datasets.},
  eprint = {2411.12484},
  archiveprefix = {arXiv},
  primaryclass = {cs.LG},
  url = {https://arxiv.org/abs/2411.12484},
  internaltype = {workshop}
}
@misc{resendiz2025llmbasedaffectivetextgeneration,
  title = {LLM-based Affective Text Generation Quality Based on
                  Different Quantization Values},
  author = {Yarik Menchaca Resendiz and Roman Klinger},
  year = {2025},
  eprint = {2501.19317},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2501.19317},
  internaltype = {preprint}
}
@inproceedings{greschner-klinger-2025-fearful,
  title = {Fearful Falcons and Angry Llamas: Emotion Category
                  Annotations of Arguments by Humans and {LLM}s},
  author = {Greschner, Lynn and Klinger, Roman},
  editor = {H{\"a}m{\"a}l{\"a}inen, Mika and {\"O}hman, Emily
                  and Bizzoni, Yuri and Miyagawa, So and Alnajjar,
                  Khalid},
  booktitle = {Proceedings of the 5th International Conference on
                  Natural Language Processing for Digital Humanities},
  month = may,
  year = {2025},
  address = {Albuquerque, USA},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2025.nlp4dh-1.52/},
  pages = {628--646},
  isbn = {979-8-89176-234-3},
  abstract = {Arguments evoke emotions, influencing the effect of
                  the argument itself. Not only the emotional
                  intensity but also the category influences the
                   argument{'}s effects, for instance, the willingness to
                  adapt stances. While binary emotionality has been
                  studied in argumentative texts, there is no work on
                  discrete emotion categories (e.g.,
                  {\textquoteleft}anger') in such data. To fill this
                  gap, we crowdsource subjective annotations of
                  emotion categories in a German argument corpus and
                  evaluate automatic LLM-based labeling
                  methods. Specifically, we compare three prompting
                  strategies (zero-shot, one-shot, chain-of-thought)
                  on three large instruction-tuned language models
                  (Falcon-7b-instruct, Llama-3.1-8B-instruct,
                  GPT-4o-mini). We further vary the definition of the
                  output space to be binary (is there emotionality in
                  the argument?), closed-domain (which emotion from a
                  given label set is in the argument?), or open-domain
                  (which emotion is in the argument?). We find that
                  emotion categories enhance the prediction of
                  emotionality in arguments, emphasizing the need for
                  discrete emotion annotations in arguments. Across
                  all prompt settings and models, automatic
                  predictions show a high recall but low precision for
                  predicting anger and fear, indicating a strong bias
                  toward negative emotions.},
  eprint = {2412.15993},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  internaltype = {conferenceproc}
}
@inproceedings{li-klinger-2025-iprop,
  title = {i{P}r{O}p: Interactive Prompt Optimization for Large
                  Language Models with a Human in the Loop},
  author = {Li, Jiahui and Klinger, Roman},
  editor = {Zhao, Jin and Wang, Mingyang and Liu, Zhu},
  booktitle = {Proceedings of the 63rd Annual Meeting of the
                  Association for Computational Linguistics (Volume 4:
                  Student Research Workshop)},
  month = jul,
  year = {2025},
  address = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2025.acl-srw.18/},
  pages = {276--285},
  isbn = {979-8-89176-254-1},
  abstract = {Prompt engineering has made significant
                  contributions to the era of large language models,
                  yet its effectiveness depends on the skills of a
                  prompt author. This paper introduces
                  $\textit{iPrOp}$, a novel interactive prompt
                  optimization approach, to bridge manual prompt
                  engineering and automatic prompt optimization while
                  offering users the flexibility to assess evolving
                  prompts. We aim to provide users with task-specific
                  guidance to enhance human engagement in the
                  optimization process, which is structured through
                  prompt variations, informative instances,
                  predictions generated by large language models along
                  with their corresponding explanations, and relevant
                  performance metrics. This approach empowers users to
                  choose and further refine the prompts based on their
                  individual preferences and needs. It can not only
                  assist non-technical domain experts in generating
                  optimal prompts tailored to their specific tasks or
                   domains, but also enable the study of the intrinsic
                  parameters that influence the performance of prompt
                  optimization. The evaluation shows that our approach
                  has the capability to generate improved prompts,
                  leading to enhanced task performance.},
  eprint = {2412.12644},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2412.12644},
  internaltype = {workshop}
}
@inproceedings{menchaca-resendiz-klinger-2025-mopo,
  title = {{MOPO}: Multi-Objective Prompt Optimization for
                  Affective Text Generation},
  author = {Menchaca Resendiz, Yarik and Klinger, Roman},
  editor = {Rambow, Owen and Wanner, Leo and Apidianaki,
                  Marianna and Al-Khalifa, Hend and Eugenio, Barbara
                  Di and Schockaert, Steven},
  booktitle = {Proceedings of the 31st International Conference on
                  Computational Linguistics},
  month = jan,
  year = {2025},
  address = {Abu Dhabi, UAE},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2025.coling-main.375/},
  pages = {5588--5606},
  eprint = {2412.12948},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  internaltype = {conferenceproc},
  abstract = {How emotions are expressed depends on the context
                  and domain. On X (formerly Twitter), for instance,
                  an author might simply use the hashtag {\#}anger,
                  while in a news headline, emotions are typically
                  written in a more polite, indirect manner. To enable
                  conditional text generation models to create
                  emotionally connotated texts that fit a domain,
                  users need to have access to a parameter that allows
                  them to choose the appropriate way to express an
                  emotion. To achieve this, we introduce MOPO, a
                  Multi-Objective Prompt Optimization
                  methodology. MOPO optimizes prompts according to
                  multiple objectives (which correspond here to the
                  output probabilities assigned by emotion classifiers
                  trained for different domains). In contrast to
                  single objective optimization, MOPO outputs a set of
                  prompts, each with a different weighting of the
                  multiple objectives. Users can then choose the most
                  appropriate prompt for their context. We evaluate
                  MOPO using three objectives, determined by various
                  domain-specific emotion classifiers. MOPO improves
                  performance by up to 15 pp across all objectives
                  with a minimal loss (1{--}2 pp) for any single
                  objective compared to single-objective
                  optimization. These minor performance losses are
                  offset by a broader generalization across multiple
                  objectives {--} which is not possible with
                  single-objective optimization. Additionally, MOPO
                  reduces computational requirements by simultaneously
                  optimizing for multiple objectives, eliminating
                  separate optimization procedures for each
                  objective.}
}
@inproceedings{hofmann-etal-2025-prompt,
  title = {Prompt-based Personality Profiling: Reinforcement
                  Learning for Relevance Filtering},
  author = {Hofmann, Jan and Sindermann, Cornelia and Klinger,
                  Roman},
  editor = {Kamalloo, Ehsan and Gontier, Nicolas and Lu, Xing
                  Han and Dziri, Nouha and Murty, Shikhar and Lacoste,
                  Alexandre},
  booktitle = {Proceedings of the 1st Workshop for Research on
                  Agent Language Models (REALM 2025)},
  month = jul,
  year = {2025},
  address = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2025.realm-1.1/},
  pages = {1--16},
  isbn = {979-8-89176-264-0},
  abstract = {Author profiling is the task of inferring
                  characteristics about individuals by analyzing
                  content they share. Supervised machine learning
                  still dominates automatic systems that perform this
                  task, despite the popularity of prompting large
                  language models to address natural language
                  understanding tasks. One reason is that the
                  classification instances consist of large amounts of
                  posts, potentially a whole user profile, which may
                  exceed the input length of Transformers. Even if a
                  model can use a large context window, the entirety
                  of posts makes the application of API-accessed black
                   box systems costly and slow, in addition to issues
                   that come with such ``needle-in-the-haystack''
                   tasks. To
                  mitigate this limitation, we propose a new method
                  for author profiling which aims at distinguishing
                  relevant from irrelevant content first, followed by
                  the actual user profiling only with relevant
                  data. To circumvent the need for relevance-annotated
                  data, we optimize this relevance filter via
                  reinforcement learning with a reward function that
                  utilizes the zero-shot capabilities of large
                  language models. We evaluate our method for Big Five
                  personality trait prediction on two Twitter
                  corpora. On publicly available real-world data with
                  a skewed label distribution, our method shows
                  similar efficacy to using all posts in a user
                  profile, but with a substantially shorter
                  context. An evaluation on a version of these data
                  balanced with artificial posts shows that the
                  filtering to relevant posts leads to a significantly
                  improved accuracy of the predictions.},
  eprint = {2409.04122},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  internaltype = {workshop}
}