Abstract

var data = [{"loc": [9.418848037719727, 1.0917103290557861], "openalex_id": "https://openalex.org/W4415616105", "title": "ENHANCING JAPANESE LEXICAL NETWORKS USING LARGE LANGUAGE MODELS", "authors": "Benedikt Per\u0430k, Dragana \u0160pica", "abstract": "In recent advancements within natural language processing (NLP), lexical networks play a crucial role in representing semantic relationships between words, enhancing applications from word sense disambiguation to educational tools. Traditional methods for constructing lexical networks, however, are resource-intensive, relying heavily on expert lexicographers. Leveraging GPT-4o, a large language model (LLM), our study presents an automated, scalable approach to creating multi-relational Japanese lexical networks for the general Japanese language. This study builds on previous methods of integrating synonyms but extends to other relations such as hyponymy, hypernymy, meronymy, and holonomy. Using a combination of structured prompts and graph-based data storage, the model extracts detailed lexical relationships, which are then systematically validated and encoded. Results reveal a substantial expansion in network size, with over 155,000 nodes and 700,000 edges, enriching Japanese lexical associations with nuanced hierarchical and associative layers. Comparisons with WordNet show substantial alignment in relation types, particularly with soft matching, underscoring the model\u2019s efficacy in reflecting the multifaceted nature of lexical semantics. This work contributes a versatile framework for constructing expansive lexical resources that hold promises for enhancing NLP tasks and educational applications across various languages and domains.", "venue": "Knowledge", "label": 0}, {"loc": [2.978393077850342, 1.801192045211792], "openalex_id": "https://openalex.org/W4415196009", "title": "The Rise of Generative Artificial Intelligence: Impact on Societies, Economies and Enterprises", "authors": "T. D. Wilson", "abstract": "Kshetri, N. (2024). The rise of generative artificial intelligence. Impact on societies, economies and enterprises. Cheltenham, UK: Edward Elgar Publishing. xiii, 322 p. ISBN: 978-1-0353-4673-8", "venue": "Information Research an international electronic journal", "label": 0}, {"loc": [8.56558895111084, -0.10564300417900085], "openalex_id": "https://openalex.org/W4415009101", "title": "Multilingual Table-to-Text Generation with Question-Answer Plans", "authors": "Aden Haussmann", "abstract": "Multilingual Natural Language Generation (NLG) is challenging due to the lack of training data for low-resource languages. However, some low-resource languages have up to tens of millions of speakers globally, making it important to improve NLG tools for them. Table-to-Text NLG is an excellent measure of models' reasoning abilities but is very challenging in the multilingual setting. System outputs are often not attributable, or faithful, to the data in the source table. Intermediate planning techniques like Question-Answer (QA) blueprints have been shown to improve attributability on summarisation tasks. This work explores whether QA blueprints make multilingual Table-to-Text outputs more attributable to the input tables. This paper extends the challenging multilingual Table-to-Text dataset, TaTA, which includes African languages, with QA blueprints. Sequence-to-sequence language models are then finetuned on this dataset, with and without blueprints. Results show that QA blueprints improve performance for models finetuned and evaluated only on English examples, but do not demonstrate gains in the multilingual setting. This is due to inaccuracies in machine translating the blueprints from English into target languages when generating the training data, and models failing to rely closely on the blueprints they generate. An in-depth analysis is conducted on why this is challenging.", "venue": "International Journal of Undergraduate Research and Creative Activities", "label": 0}, {"loc": [5.298444747924805, 3.184633731842041], "openalex_id": "https://openalex.org/W4414019013", "title": "Physics-informed Machine Learning with Uncertainty Quantification", "authors": "Mario De Florio, Gabriel Appleby, Jonathan Keller, Ali Eftekhari Milani, Donatella Zappal\u00e1, Shuangwen Sheng", "abstract": "Abstract. This paper introduces the eXtreme Theory of Functional Connections (X-TFC), a physics-informed machine learning algorithm, and tailors it to estimate the remaining useful life (RUL) of wind turbine gearbox bearings experiencing fatigue crack growth. Unlike purely data-driven methods, X-TFC embeds a physics model, based on the Head\u2019s theory in this work, into its training objective. The core of X-TFC is a random-projection single-layer neural network trained via Extreme Learning Machine, which requires only limited damage progression data and solves for output weights with a least-squares optimization algorithm. A composite loss function balances the network\u2019s fit to observed degradation data against the residuals of the governing crack-growth differential equation, ensuring the learned damage trajectory remains physically plausible. When applied to a vibration-based health-index (HI) dataset measured during the growth of a crack on the inner ring of a high-speed bearing in a wind turbine gearbox (Bechhoefer and Dub\u00e9, 2020), X-TFC achieves near-zero prediction bias. Even when trained on only the first 10\u201320 % of the damage progression data, its predictions remain monotonic and smooth, delivering high prognosability and trendability. To quantify the epistemic uncertainty, we employ a Monte Carlo ensemble of independently initialized X-TFC models trained on noise-perturbed data, which yields confidence intervals around each RUL estimate. This approach provides confidence intervals around each RUL estimate, capturing both model-parameter and epistemic uncertainty. In ad- dition to a vibration-based HI, we demonstrate that the proposed framework can be directly applied to a SCADA data-based HI (Eftekhari Milani et al., 2025) measured during similar wind turbine gearbox bearing crack faults, preserving its accuracy and interpretability. This extension shows the versatility of our approach, which is applicable to bearings of multiple gearbox manufacturers, models and ratings using only SCADA data. By integrating domain knowledge with machine learning, X-TFC offers a rapid, reliable tool for crack prognostics. Its adaptability to other bearing failure modes, such as pitch-bearing ring cracks, positions X-TFC as a powerful enabler of data-driven, physics-informed asset management in the wind energy sector and beyond.", "venue": "https://doi.org/10.5194/wes-2025-157", "label": 0}, {"loc": [3.7905282974243164, 3.6322333812713623], "openalex_id": "https://openalex.org/W4414360521", "title": "Towards Safer Pretraining: Analyzing and Filtering Harmful Content in Webscale datasets for Responsible LLMs", "authors": "Sai Krishna Mendu, Harish Yenala, Arvind Gulati, Shanu Kumar, Parag Agrawal", "abstract": "Large language models (LLMs) have become integral to various real-world applications, leveraging massive, web-sourced datasets like Common Crawl, C4, and FineWeb for pretraining. While these datasets provide linguistic data essential for high-quality natural language generation, they often contain harmful content, such as hate speech, misinformation, and biased narratives. Training LLMs on such unfiltered data risks perpetuating toxic behaviors, spreading misinformation, and amplifying societal biases which can undermine trust in LLM-driven applications and raise ethical concerns about their use. This paper presents a large-scale analysis of inappropriate content across these datasets, offering a comprehensive taxonomy that categorizes harmful webpages into Topical and Toxic based on their intent. We also introduce a prompt evaluation dataset, a high-accuracy Topical and Toxic Prompt (TTP), and a transformer-based model (HarmFormer) for harmful content filtering. Additionally, we create a new multi-harm open-ended toxicity benchmark (HAVOC) and provide crucial insights into how models respond to adversarial toxic inputs. Our work offers insights into ensuring safer LLM pretraining and serves as a resource for Responsible AI (RAI) compliance. Disclaimer: This paper includes potentially offensive content due to the nature of the research.", "venue": "https://doi.org/10.24963/ijcai.2025/53", "label": 0}, {"loc": [7.072526454925537, -1.0609488487243652], "openalex_id": "https://openalex.org/W4413299002", "title": "Towards the Development of Balanced Synthetic Data for Correcting Grammatical Errors in Arabic: An Approach Based on Error Tagging Model and Synthetic Data \u2026", "authors": "Ahlam Alrehili, Areej Alhothali", "abstract": "Abstract Synthetic data generation is widely recognized as an approach to improve the quality of neural grammatical error correction (GEC) systems. However, current approaches often lack diversity or are overly simplistic in generating the wide range of grammatical errors made by humans, particularly for low-resource languages such as Arabic. In this study, we developed an error tagging model and a synthetic data generation model to generate a large synthetic dataset in Arabic for GEC. In the error tagging model, the correct sentence is classified into multiple error types that humans are expected to make using the DeBERTav3 model. The Arabic Error Type Annotation (ARETA) tool is used to guide multi-label classification tasks in an error tagging model that divides each sentence into 26 error tags. The synthetic data generation model is a back-translation-based model that generates incorrect sentences by appending error tags before the correct sentence that was generated by the error tagging model using the AraT5 model. In the QALB-14 and QALB-15 test sets, the error tagging model achieved 94.42% F1, which is a state-of-the-art result in identifying error tags in clean sentences. As a result of our syntactic data training in GEC, we achieved a new state-of-the-art result with F1-score of 79.36% in the QALB-14 test set. We generated 30,219,310 synthetic sentence pairs using a synthetic data generation model. Our data are accessible to the public.*.", "venue": "https://doi.org/10.21203/rs.3.rs-7049585/v1", "label": 0}, {"loc": [5.8933491706848145, 1.4358208179473877], "openalex_id": "https://openalex.org/W4415433341", "title": "Recreating Neural Activity During Speech Production with Language and Speech Model Embeddings", "authors": "Owais Mujtaba Khanday, Philippe Esteban, Zubair Ahmad Lone, Marc Ouellet, Jos\u00e9 A. Gonz\u00e1lez", "abstract": "Understanding how neural activity encodes speech and language production is a fundamental challenge in neuroscience and artificial intelligence. This study investigates whether embeddings from large-scale, self-supervised language and speech models can effectively reconstruct high-gamma neural activity characteristics, key indicators of cortical processing, recorded during speech production. We use pre-trained embeddings from deep learning models on linguistic and acoustic data to map high-level speech features onto high-gamma signals. We analyze the extent to which these embeddings preserve the spatio-temporal dynamics of brain activity. Reconstructed neural signals are evaluated against high-gamma ground-truth activity using correlation metrics and signal reconstruction quality assessments. The results indicate High-gamma activity was effectively reconstructed using language and speech model embeddings, yielding Pearson correlation coefficients of 0.79\u20130.99 across all participants.", "venue": "https://doi.org/10.21437/interspeech.2025-1400", "label": 0}, {"loc": [5.3699116706848145, 2.1671080589294434], "openalex_id": "https://openalex.org/W4413108220", "title": "Measuring individual semantic networks: A simulation study", "authors": "Samuel Aeschbach, Rui Mata, Dirk U. Wulff", "abstract": "Accurately capturing individual differences in semantic networks is fundamental to advancing our mechanistic understanding of semantic memory. Past empirical attempts to construct individual-level semantic networks from behavioral paradigms may be limited by data constraints. To assess these limitations and propose improved designs for the measurement of individual semantic networks, we conducted a recovery simulation investigating the psychometric properties underlying estimates of individual semantic networks obtained from two different behavioral paradigms: free associations and relatedness judgment tasks. Our results show that successful inference of semantic networks is achievable, but they also highlight critical challenges. Estimates of absolute network characteristics are severely biased, such that comparisons between behavioral paradigms and different design configurations are often not meaningful. However, comparisons within a given paradigm and design configuration can be accurate and generalizable when based on designs with moderate numbers of cues, moderate numbers of responses, and cue sets including diverse words. Ultimately, our results provide insights that help evaluate past findings on the structure of semantic networks and design new studies capable of more reliably revealing individual differences in semantic networks.", "venue": "PLoS ONE", "label": 11}, {"loc": [7.030765533447266, -0.0014437115751206875], "openalex_id": "https://openalex.org/W4412940034", "title": "UniBERTs: Adversarial Training for Language-Universal Representations", "authors": "Andrei-Marius Avram, Marian Lupa\u015fcu, Dumitru-Clementin Cercel, Ionu\u0163 Mironic\u0103, \u0218tefan Tr\u0103u\u0219an-Matu", "abstract": "Abstract This paper presents UniBERT, a compact multilingual language model that uses an innovative training framework that integrates three components: masked language modeling, adversarial training, and knowledge distillation. Pre-trained on a meticulously curated Wikipedia corpus spanning 107 languages, UniBERT is designed to reduce the computational demands of large-scale models while maintaining competitive performance across various natural language processing tasks. Comprehensive evaluations on four tasks, named entity recognition, natural language inference, question answering, and semantic textual similarity, demonstrate that our multilingual training strategy, enhanced by an adversarial objective, significantly improves cross-lingual generalization. Specifically, UniBERT models show an average relative improvement of 7.72% over traditional baselines, which achieved an average relative improvement of only 1.12%, and statistical analysis confirms the significance of these gains (p value = 0.0184). This work highlights the benefits of combining adversarial training and knowledge distillation to build robust and scalable language models, thus advancing the field of multilingual and cross-lingual natural language processing.", "venue": "Neural Computing and Applications", "label": 0}, {"loc": [2.6204349994659424, 2.7618305683135986], "openalex_id": "https://openalex.org/W4412946091", "title": "On Regulating Downstream AI Developers", "authors": "Sophie Williams, Jonas Schuett, Markus Anderljung", "abstract": "Abstract Foundation models \u2013 models trained on broad data that can be adapted to a wide range of downstream tasks \u2013 can pose significant risks, ranging from intimate image abuse, cyberattacks, to bioterrorism. To reduce these risks, policymakers are starting to impose obligations on the developers of these models. However, downstream developers \u2013 actors who fine-tune or otherwise modify foundational models \u2013 can create or amplify risks by improving a model\u2019s capabilities or compromising its safety features. This can make rules on upstream developers ineffective. One way to address this issue could be to impose direct obligations on downstream developers. However, since downstream developers are numerous, diverse, and rapidly growing in number, such direct regulation may be both practically challenging and stifling to innovation. A different approach would be to require upstream developers to mitigate downstream modification risks (e.g., by restricting what modifications can be made). Another approach would be to use alternative policy tools (e.g., clarifying how existing tort law applies to downstream developers or issuing voluntary guidance to help mitigate downstream modification risks). We expect that regulation on upstream developers to mitigate downstream modification risks will be necessary. Although further work is needed, regulation of downstream developers may also be warranted where they retain the ability to increase risk to an unacceptable level.", "venue": "European Journal of Risk Regulation", "label": 0}, {"loc": [9.365289688110352, 1.6362134218215942], "openalex_id": "https://openalex.org/W4414266713", "title": "MaTElDa: Multi-Table Error Detection", "authors": "Fatemeh Ahmadi, Julian Paulu\u00dfen, Ziawasch Abedjan", "abstract": "Real-world datasets are often fragmented across multiple heterogeneous tables, managed by different teams or organizations. Ensuring data quality in such environments is challenging, as traditional error detection tools typically operate on isolated tables and overlook cross-table relationships. To address this gap, we investigate how cleaning multiple tables simultaneously, combined with structured user collaboration, can reduce annotation effort and enhance the effectiveness and efficiency of error detection. We present Matelda, an interactive system for multi-table error detection that combines automated error detection with human-in-the-loop refinement. Matelda guides users through Inspection & Action, allowing them to explore system-generated insights, refine decisions, and annotate data with contextual support. It organizes tables using domain-based and quality-based folding and leverages semi-supervised learning to propagate labels across related tables efficiently. Our demonstration showcases Matelda's capabilities for collaborative error detection and resolution by leveraging shared knowledge, contextual similarity, and structured user interactions across multiple tables.", "venue": "Proceedings of the VLDB Endowment", "label": 21}, {"loc": [3.09126353263855, -0.7111696600914001], "openalex_id": "https://openalex.org/W4412624812", "title": "A Natural Language Processing Approach to Support Biomedical Data Harmonization: Leveraging Large Language Models", "authors": "Zexu Li, S. Prabhu, Zachary Popp, Shubhi Jain, Vijetha Balakundi, Ting Fang Alvin Ang, Rhoda Au, Jinying Chen", "abstract": "Background Biomedical research requires large, diverse samples to produce unbiased results. Retrospective data harmonization is often used to integrate existing datasets to create these samples, but the process is labor-intensive. Automated methods for matching variables across datasets can accelerate this process, particularly when harmonizing datasets with numerous variables and varied naming conventions. Research in this area has been limited, primarily focusing on lexical matching and ontology-based semantic matching. We aimed to develop new methods, leveraging large language models (LLMs) and ensemble learning, to automate variable matching. Methods This study utilized data from two GERAS cohort studies (European [EU] and Japan [JP]) obtained through the Alzheimer\u2019s Disease (AD) Data Initiative\u2019s AD workbench. We first manually created a dataset by matching 347 EU variables with 1322 candidate JP variables and treated matched variable pairs as positive instances and unmatched pairs as negative instances. We then developed four natural language processing (NLP) methods using state-of-the-art LLMs (E5, MPNet, MiniLM, and BioLORD-2023) to estimate variable similarity based on variable labels and derivation rules. A lexical matching method using fuzzy matching was included as a baseline model. In addition, we developed an ensemble-learning method, using the Random Forest (RF) model, to integrate individual NLP methods. RF was trained and evaluated on 50 trials. Each trial had a random split (4:1) of training and test sets, with the model\u2019s hyperparameters optimized through cross-validation on the training set. For each EU variable, 1322 candidate JP variables were ranked based on NLP-derived similarity scores or RF\u2019s probability scores, denoting their likelihood to match the EU variable. Ranking performance was measured by top- n hit ratio (HR- n) and mean reciprocal rank (MRR). Results E5 performed best among individual methods, achieving 0.898 HR-30 and 0.700 MRR. RF performed better than E5 on all metrics over 50 trials (P < 0.001) and achieved an average HR-30 of 0.986 and MRR of 0.744. LLM-derived features contributed most to RF\u2019s performance. One major cause of errors in automatic variable matching was ambiguous variable definitions. Conclusion NLP techniques (especially LLMs), combined with ensemble learning, hold great potential in automating variable matching and accelerating biomedical data harmonization.", "venue": "PLoS ONE", "label": 11}, {"loc": [8.292567253112793, 3.728950262069702], "openalex_id": "https://openalex.org/W4412581296", "title": "Profiling and optimization of multi-card GPU machine learning jobs", "authors": "Marcin Lawenda, Kyrylo Khloponin, Krzesimir Samborski, \u0141ukasz Szustak", "abstract": "ABSTRACT The article discusses various model optimization techniques, providing a comprehensive analysis of key performance indicators. Several parallelization strategies for image recognition are analyzed, adapted to different hardware and software configurations, including distributed data parallelism and distributed hardware processing. Changing the tensor layout in PyTorch DataLoader from NCHW to NHWC and enabling pin _ memory has proven to be very beneficial and easy to implement. Furthermore, the impact of different performance techniques (DPO, LoRA, QLoRA, and QAT) on the tuning process of LLMs was investigated. LoRA allows for faster tuning, while requiring less VRAM compared to DPO. On the other hand, QAT is the most resource\u2010intensive method, with the slowest processing times. A significant portion of LLM tuning time is attributed to initializing new kernels and synchronizing multiple threads when memory operations are not dominant.", "venue": "Concurrency and Computation Practice and Experience", "label": 33}, {"loc": [5.614595890045166, 0.9893454909324646], "openalex_id": "https://openalex.org/W4412530513", "title": "SaudiCulture: A Benchmark for Evaluating Large Language Models Cultural Competence within Saudi Arabia", "authors": "Lama Ayash, Hassan Alhuzali, Ashwag Alasmari, Sultan Awwad Aloufi", "abstract": "Abstract Large Language Models (LLMs) have demonstrated remarkable capabilities in natural language processing; however, they often struggle to accurately capture and reflect cultural nuances. This research addresses this challenge by focusing on Saudi Arabia, a country characterized by diverse dialects and rich cultural traditions. We introduce SaudiCulture, a novel benchmark designed to evaluate the cultural competence of LLMs within the distinct geographical and cultural contexts of Saudi Arabia. SaudiCulture is a comprehensive dataset of questions covering five major geographical regions\u2014West, East, South, North, and Center\u2014along with general questions applicable across all regions. The dataset encompasses a broad spectrum of cultural domains, including food, clothing, entertainment, celebrations, and crafts. To ensure a rigorous evaluation, SaudiCulture includes questions of varying complexity, such as open-ended, single-choice, and multiple-choice formats, with some requiring multiple correct answers. Additionally, the dataset distinguishes between common cultural knowledge and specialized regional aspects. We conduct extensive evaluations on six LLMs\u2014GPT-4, Llama 3.3, FANAR, Jais, AceGPT and DeepSeek \u2014analyzing their performance across different question types and cultural contexts. Our findings reveal that all models experience significant performance declines when faced with highly specialized or region-specific questions, particularly those requiring multiple correct responses. Furthermore, we observe that while some regions are better understood by LLMs, others remain largely misrepresented. For instance, GPT-4 achieves the highest accuracy in the western region (66%), whereas Jais records the lowest accuracy in the northern region (16%). Additionally, certain cultural categories are more easily identifiable than others, further highlighting inconsistencies in LLMs\u2019 cultural understanding. These results emphasize the importance of incorporating region-specific knowledge into LLMs training to enhance their cultural competence. We hope that SaudiCulture serves as a foundation for future efforts aimed at improving the ability of LLMs to engage with and accurately represent diverse cultural contexts.", "venue": "Journal of King Saud University - Computer and Information Sciences", "label": 32}, {"loc": [8.288843154907227, 0.18545876443386078], "openalex_id": "https://openalex.org/W4412961793", "title": "Science Out of Its Ivory Tower: Improving Accessibility with Reinforcement Learning", "authors": "Haining Wang, Jason A. Clark, Hannah McKelvey, Leila Sterman, Zheng Gao, Zuoyu Tian, Sandra K\u00fcbler, Xiaozhong Liu", "abstract": "Abstract A vast amount of scholarly work is published daily, yet much of it remains inaccessible to the general public due to dense jargon and complex language. To address this challenge in science communication, we introduce a reinforcement learning framework that fine-tunes a language model to rewrite scholarly abstracts into more comprehensible versions. Guided by a carefully balanced combination of word- and sentence-level accessibility rewards, our language model effectively substitutes technical terms with more accessible alternatives, a task which models supervised fine-tuned or guided by conventional readability measures struggle to accomplish. Our best model adjusts the readability level of scholarly abstracts by approximately six U.S. grade levels\u2014in other words, from a postgraduate to a high school level. This translates to roughly a 90% relative boost over the supervised fine-tuning baseline, all while maintaining factual accuracy and high-quality language. An in-depth analysis of our approach shows that balanced rewards lead to systematic modifications in the base model, likely contributing to smoother optimization and superior performance. We envision this work as a step toward bridging the gap between scholarly research and the general public, particularly younger readers and those without a college degree.", "venue": "Scientometrics", "label": 0}, {"loc": [5.340702533721924, 1.8485010862350464], "openalex_id": "https://openalex.org/W4412625114", "title": "From Embeddings to Explainability: A Tutorial on LLM-Based Text Analysis for Behavioral Scientists", "authors": "Rudolf Debelak, T. Koch, Matthias A\u00dfenmacher, Clemens Stachl", "abstract": "Large language models (LLMs) are transforming research in psychology and the behavioral sciences by enabling advanced text analysis at scale. Their applications range from the analysis of social media posts to infer psychological traits to the automated scoring of open-ended survey responses. However, despite their potential, many behavioral scientists struggle to integrate LLMs into their research because of the complexity of text modeling. In this tutorial, we aim to provide an accessible introduction to LLM-based text analysis, focusing on the Transformer architecture. We guide researchers through the process of preparing text data, using pretrained Transformer models to generate text embeddings, fine-tuning models for specific tasks such as text classification, and applying interpretability methods, such as Shapley additive explanations and local interpretable model-agnostic explanations, to explain model predictions. By making these powerful techniques more approachable, we hope to empower behavioral scientists to leverage LLMs in their research, unlocking new opportunities for analyzing and interpreting textual data.", "venue": "Advances in Methods and Practices in Psychological Science", "label": 0}, {"loc": [5.307485103607178, 1.158043384552002], "openalex_id": "https://openalex.org/W4411728730", "title": "Evaluating GPT-and Reasoning-based Large Language Models on Physics Olympiad Problems: Surpassing Human Performance and Implications for Educational \u2026", "authors": "Paul Tschisgale, Holger Maus, Fabian Kieser, Ben Kroehs, Stefan Petersen, Peter Wulff", "abstract": "Large language models (LLMs) are now widely accessible, reaching learners across all educational levels. This development has raised concerns that their use may circumvent essential learning processes and compromise the integrity of established assessment formats. In physics education, where problem solving plays a central role in both instruction and assessment, it is therefore essential to understand the physics-specific problem-solving capabilities of LLMs. Such understanding is key to informing responsible and pedagogically sound approaches to integrating LLMs into instruction and assessment. This study therefore compares the problem-solving performance of a general-purpose LLM (GPT\u22124o, using varying prompting techniques) and a reasoning-optimized model (o1) with that of participants in the German Physics Olympiad, based on a set of well-defined Olympiad problems. In addition to evaluating the correctness of the generated solutions, the study analyzes the characteristic strengths and limitations of LLM-generated solutions. The results of this study indicate that both tested LLMs (GPT\u22124o and o1) demonstrate advanced problem-solving capabilities on Olympiad-type physics problems, on average outperforming the human participants. Prompting techniques had little effect on GPT\u22124o\u2019s performance, and o1 almost consistently outperformed both GPT\u22124o and the human benchmark. The main implications of these findings are twofold: LLMs pose a challenge for summative assessment in unsupervised settings, as they can solve advanced physics problems at a level that exceeds top-performing students, making it difficult to ensure the authenticity of student work. At the same time, their problem-solving capabilities offer potential for formative assessment, where LLMs can support students in evaluating their own solutions to problems.", "venue": "Physical Review Physics Education Research", "label": 0}, {"loc": [2.9485676288604736, -0.1304291933774948], "openalex_id": "https://openalex.org/W4411625527", "title": "Interventional Radiology Checklist for Artificial Intelligence Research Evaluation", "authors": "James Anibal, Hannah Huth, Tom Boeken, Dania Daye, Judy Wawira Gichoya, Fernando G\u00f3mez, Julius Chapiro, Bradford J. Wood, Daniel Y. Sze, Klaus A. Hausegger", "abstract": "As artificial intelligence (AI) becomes increasingly prevalent within interventional radiology (IR) research and clinical practice, steps must be taken to ensure the robustness of novel technological systems presented in peer-reviewed journals. This report introduces comprehensive standards and an evaluation checklist (iCARE) that covers the application of modern AI methods in IR-specific contexts. The iCARE checklist encompasses the full \"code-to-clinic\" pipeline of AI development, including dataset curation, pre-training, task-specific training, explainability, privacy protection, bias mitigation, reproducibility, and model deployment. The iCARE checklist aims to support the development of safe, generalizable technologies for enhancing IR workflows, the delivery of care, and patient outcomes.", "venue": "CardioVascular and Interventional Radiology", "label": 0}, {"loc": [6.204136848449707, 5.402525424957275], "openalex_id": "https://openalex.org/W4411336859", "title": "Toward a Holistic Evaluation of Robustness in CLIP Models", "authors": "Weijie Tu, Weijian Deng, Tom Gedeon", "abstract": "Contrastive Language-Image Pre-training (CLIP) models have shown significant potential, particularly in zero-shot classification across diverse distribution shifts. Building on existing evaluations of overall classification robustness, this work aims to provide a more comprehensive assessment of CLIP by introducing several new perspectives. First, we investigate their robustness to variations in specific visual factors. Second, we assess two critical safety objectives-confidence uncertainty and out-of-distribution detection-beyond mere classification accuracy. Third, we evaluate the finesse with which CLIP models bridge the image and text modalities. Fourth, we extend our examination to 3D awareness in CLIP models, moving beyond traditional 2D image understanding. Finally, we explore the interaction between vision and language encoders within modern large multimodal models (LMMs) that utilize CLIP as the visual backbone, focusing on how this interaction impacts classification robustness. In each aspect, we consider the impact of six factors on CLIP models: model architecture, training distribution, training set size, fine-tuning, contrastive loss, and test-time prompts. Our study uncovers several previously unknown insights into CLIP. For instance, the architecture of the visual encoder in CLIP plays a significant role in their robustness against 3D corruption. CLIP models tend to exhibit a bias towards shape when making predictions. Moreover, this bias tends to diminish after fine-tuning on ImageNet. Vision-language models like LLaVA, leveraging the CLIP vision encoder, could exhibit benefits in classification performance for challenging categories over CLIP alone. Our findings are poised to offer valuable guidance for enhancing the robustness and reliability of CLIP models.", "venue": "IEEE Transactions on Pattern Analysis and Machine Intelligence", "label": 0}, {"loc": [5.160853385925293, 2.0507824420928955], "openalex_id": "https://openalex.org/W4411239326", "title": "Neural Text Embeddings in Psychological Research: A Guide With Examples in R", "authors": "Louis Teitelbaum, Almog Simchon", "abstract": "In this guide, we review neural embedding models and compare three methods of quantifying psychological constructs for use with embeddings: distributed dictionary representation, contextualized construct representation, and a novel approach: correlational anchored vectors. We aim to cultivate an intuition for the geometric properties of neural embeddings and a sensitivity to methodological problems that can arise in their use. We argue that while large language model embeddings have the advantage of contextualization, decontextualized word embeddings may have more ability to generalize across text genres when using cosine or dot product similarity metrics. The three methods of operationalizing psychological constructs in vector space likewise each have their advantages in particular applications. We recommend distributed dictionary representation, which derives a vector representation from a word list, for quantifying abstract constructs relating to the overall feel of a text, especially when the research requires that these constructs generalize across multiple genres of text. We recommend contextualized construct representation, which derives a representation from a questionnaire, for cases in which texts are relatively similar in content to the embedded questionnaire, such as experiments in which participants are asked to respond to a related prompt. Correlational anchored vectors, which derives a representation from labeled examples, requires suitably large and reliable training data. (PsycInfo Database Record (c) 2025 APA, all rights reserved).", "venue": "Psychological Methods", "label": 0}, {"loc": [5.5369133949279785, -0.682340145111084], "openalex_id": "https://openalex.org/W4411120917", "title": "SocialQuotes: Learning Contextual Roles of Social Media Quotes on the Web", "authors": "John Palowitch, Hamidreza Alvari, Mehran Kazemi, Md Tanvir Al Amin, Filip Radlinski", "abstract": "Web authors frequently embed social media to support and enrich their content, creating the potential to derive webbased, cross-platform social media representations that can enable more effective social media retrieval systems and richer scientific analyses. As a step toward such capabilities, we introduce a novel language modeling framework that enables automatic annotation of roles that social media entities play in their embedded web context. Using related communication theory, we liken social media embeddings to quotes, formalize the page context as structured natural language signals, and identify a taxonomy of roles for quotes within the page context. We release SocialQuotes, a new data set built from the Common Crawl of over 32 million social quotes, 8.3k of them with crowdsourced quote annotations. Using SocialQuotes and the accompanying annotations, we provide a role classification case study, showing reasonable performance with modern-day LLMs, and exposing explainable aspects of our framework via page content ablations. We also classify a large batch of un-annotated quotes, revealing interesting cross-domain, cross-platform role distributions on the web.", "venue": "Proceedings of the International AAAI Conference on Web and Social Media", "label": 49}, {"loc": [3.79785418510437, 1.2771729230880737], "openalex_id": "https://openalex.org/W4411120843", "title": "Keeping Humans in the Loop: Human-Centered Automated Annotation with Generative AI", "authors": "Nick Pangakis, Sam Wolken", "abstract": "Automated text annotation is a compelling use case for generative large language models (LLMs) in social media research. Recent work suggests that LLMs can achieve strong performance on annotation tasks; however, these studies evaluate LLMs on a small number of tasks and likely suffer from contamination due to a reliance on public benchmark datasets. Here, we test a human-centered framework for responsibly evaluating artificial intelligence tools used in automated annotation. We use GPT-4 to replicate 27 annotation tasks across 11 password-protected datasets from recently published computational social science articles in high-impact journals. For each task, we compare GPT-4 annotations against human-annotated ground-truth labels and against annotations from separate supervised classification models fine-tuned on human-generated labels. Although the quality of LLM labels is generally high, we find significant variation in LLM performance across tasks, even within datasets. Our findings underscore the importance of a human-centered workflow and careful evaluation standards: Automated annotations significantly diverge from human judgment in numerous scenarios, despite various optimization strategies such as prompt tuning. Grounding automated annotation in validation labels generated by humans is essential for responsible evaluation.", "venue": "Proceedings of the International AAAI Conference on Web and Social Media", "label": 49}, {"loc": [9.464454650878906, 0.7233160734176636], "openalex_id": "https://openalex.org/W4414565878", "title": "Token and Span Classification for Entity Recognition in French Historical Encyclopedias", "authors": "Ludovic Moncla, H\u00e9di Zeghidi", "abstract": "Named Entity Recognition (NER) in historical texts presents unique challenges due to non-standardized language, archaic orthography, and nested or overlapping entities. This study benchmarks a diverse set of NER approaches, ranging from classical Conditional Random Fields (CRFs) and spaCy-based models to transformer-based architectures such as CamemBERT and sequence-labeling models like Flair. Experiments are conducted on the GeoEDdA dataset, a richly annotated corpus derived from 18th-century French encyclopedias. We propose framing NER as both token-level and span-level classification to accommodate complex nested entity structures typical of historical documents. Additionally, we evaluate the emerging potential of few-shot prompting with generative language models for low-resource scenarios. Our results demonstrate that while transformer-based models achieve state-of-the-art performance, especially on nested entities, generative models offer promising alternatives when labeled data are scarce. The study highlights ongoing challenges in historical NER and suggests avenues for hybrid approaches combining symbolic and neural methods to better capture the intricacies of early modern French text.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.1006340980529785, -2.4853436946868896], "openalex_id": "https://openalex.org/W4411122931", "title": "Enhancing Fake News Detection with Transformer Models and Summarization", "authors": "Abdelhalim A. Saadi, Hacene Belhadef, Akram Guessas, Oussama Hafirassou", "abstract": "This study evaluates the performance of transformer-based models such as BERT, RoBERTa, and XLNet for fake news detection. Using supervised and unsupervised deep learning techniques, we optimized classification accuracy while reducing computational costs through text summarization. The results show that RoBERTa, fine-tuned with summarized content, achieves 98.39% accuracy, outperforming the other models. Additionally, we assessed AI-generated misinformation using GPT-2, confirming that transformer models effectively distinguish real from synthetic news. We utilized the GPT-2 model instead of more recent models like GPT-4, as our objective was to generate fake news locally and compare it with pretrained models from the same time period.", "venue": "Engineering Technology & Applied Science Research", "label": 44}, {"loc": [6.855657577514648, 0.2956472337245941], "openalex_id": "https://openalex.org/W4415131806", "title": "The State of Large Language Models for African Languages: Progress and Challenges", "authors": "Kedir Yassin Hussen, Walelign Tewabe Sewunetie, Abinew Ali Ayele, Sukairaj Hafiz Imam, Shamsuddeen Hassan Muhammad, Seid Muhie Yimam", "abstract": "Large Language Models (LLMs) are transforming Natural Language Processing (NLP), but their benefits are largely absent for Africa's 2,000 low-resource languages. This paper comparatively analyzes African language coverage across six LLMs, eight Small Language Models (SLMs), and six Specialized SLMs (SSLMs). The evaluation covers language coverage, training sets, technical limitations, script problems, and language modelling roadmaps. The work identifies 42 supported African languages and 23 available public data sets, and it shows a big gap where four languages (Amharic, Swahili, Afrikaans, and Malagasy) are always treated while there is over 98\\% of unsupported African languages. Moreover, the review shows that just Latin, Arabic, and Ge'ez scripts are identified while 20 active scripts are neglected. Some of the primary challenges are lack of data, tokenization biases, computational costs being very high, and evaluation issues. These issues demand language standardization, corpus development by the community, and effective adaptation methods for African languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.50722599029541, 3.621887683868408], "openalex_id": "https://openalex.org/W4414894563", "title": "Taming LLMs by Scaling Learning Rates with Gradient Grouping", "authors": "Siyuan Li, Jie Tian, Zedong Wang, Xin Jin, Zichen Liu, Wentao Zhang, Dong Xu", "abstract": "Training large language models (LLMs) poses challenges due to their massive scale and heterogeneous architectures. While adaptive optimizers like AdamW help address gradient variations, they still struggle with efficient and effective parameter-wise learning rate estimation, resulting in training instability, slow convergence, and poor compatibility with parameter-efficient fine-tuning (PEFT) techniques. This work introduces Scaling with Gradient Grouping (SGG), an optimizer wrapper that improves adaptive learning rate estimation by dynamic grouping and group-specific scaling. SGG first groups gradient statistics in each layer into clusters and then applies cluster-specific scaling to calibrate learning rates for each parameter, thus imposing collective group-wise constraints while maintaining precise per-parameter adaptation. Experiments on diverse (M)LLM benchmarks show that SGG integrates seamlessly with existing optimizers, and offers consistent gains and faster convergence over baselines, with various model sizes. Its stability across varying batch sizes and learning rates establishes SGG as a robust choice for LLM optimization.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.577645778656006, 1.211733102798462], "openalex_id": "https://openalex.org/W4414896451", "title": "Comparing LLM-generated and human-authored news text using formal syntactic theory", "authors": "Olga Zamaraeva, Dan Flickinger, Francis Bond, Carlos G\u00f3mez\u2010Rodr\u00edguez", "abstract": "This study provides the first comprehensive comparison of New York Times-style text generated by six large language models against real, human-authored NYT writing. The comparison is based on a formal syntactic theory. We use Head-driven Phrase Structure Grammar (HPSG) to analyze the grammatical structure of the texts. We then investigate and illustrate the differences in the distributions of HPSG grammar types, revealing systematic distinctions between human and LLM-generated writing. These findings contribute to a deeper understanding of the syntactic behavior of LLMs as well as humans, within the NYT genre.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.229320526123047, 1.2322126626968384], "openalex_id": "https://openalex.org/W4414898027", "title": "Common Corpus: The Largest Collection of Ethical Data for LLM Pre-Training", "authors": "Pierre-Carl Langlais, Carlos Rosas Hinostroza, Mattia Nee, Catherine Arnett, Pavel Chizhov, Eleanor Jones, Ir\u00e8ne Girard, David Mach, Anastasia Stasenko, Ivan P. Yamshchikov", "abstract": "Large Language Models (LLMs) are pre-trained on large amounts of data from different sources and domains. These data most often contain trillions of tokens with large portions of copyrighted or proprietary content, which hinders the usage of such models under AI legislation. This raises the need for truly open pre-training data that is compliant with the data security regulations. In this paper, we introduce Common Corpus, the largest open dataset for language model pre-training. The data assembled in Common Corpus are either uncopyrighted or under permissible licenses and amount to about two trillion tokens. The dataset contains a wide variety of languages, ranging from the main European languages to low-resource ones rarely present in pre-training datasets; in addition, it includes a large portion of code data. The diversity of data sources in terms of covered domains and time periods opens up the paths for both research and entrepreneurial needs in diverse areas of knowledge. In this technical report, we present the detailed provenance of data assembling and the details of dataset filtering and curation. Being already used by such industry leaders as Anthropic and multiple LLM training projects, we believe that Common Corpus will become a critical infrastructure for open science research in LLMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.248482704162598, -1.1303824186325073], "openalex_id": "https://openalex.org/W4414896289", "title": "The Landscape of Arabic Large Language Models (ALLMs): A New Era for Arabic Language Technology", "authors": "Shahad Al-Khalifa, Nadir Durrani, Hend S. Al\u2010Khalifa, Firoj Alam", "abstract": "The emergence of ChatGPT marked a transformative milestone for Artificial Intelligence (AI), showcasing the remarkable potential of Large Language Models (LLMs) to generate human-like text. This wave of innovation has revolutionized how we interact with technology, seamlessly integrating LLMs into everyday tasks such as vacation planning, email drafting, and content creation. While English-speaking users have significantly benefited from these advancements, the Arabic world faces distinct challenges in developing Arabic-specific LLMs. Arabic, one of the languages spoken most widely around the world, serves more than 422 million native speakers in 27 countries and is deeply rooted in a rich linguistic and cultural heritage. Developing Arabic LLMs (ALLMs) presents an unparalleled opportunity to bridge technological gaps and empower communities. The journey of ALLMs has been both fascinating and complex, evolving from rudimentary text processing systems to sophisticated AI-driven models. This article explores the trajectory of ALLMs, from their inception to the present day, highlighting the efforts to evaluate these models through benchmarks and public leaderboards. We also discuss the challenges and opportunities that ALLMs present for the Arab world.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.176787853240967, 0.12712253630161285], "openalex_id": "https://openalex.org/W4414897052", "title": "Multilingual Definition Modeling", "authors": "Edison Marrese-Taylor, Erica K. Shimomoto, Alfredo Solano, Edna F. Reid", "abstract": "In this paper, we propose the first multilingual study on definition modeling. We use monolingual dictionary data for four new languages (Spanish, French, Portuguese, and German) and perform an in-depth empirical study to test the performance of pre-trained multilingual language models on definition modeling of monosemic words when finetuned on this data. Furthermore, we use a zero-shot approach to test the multilingual capabilities of two popular chat-based Large Language Models (LLMs) in the task. Results show that multilingual language models can perform on-pair with English but cannot leverage potential cross-lingual synergies, with LLMs generally offering better performance overall. A comprehensive human evaluation of the LLM-generated definition highlights the zero and few-shot capabilities of these models in this new task, also showing their shortcomings. Finally, we show that performance on our task via BERTScore strongly correlates to the performance on multilingual LLM benchmarks, suggesting that our task offers a viable compute-constrained, stable and natural alternative to these.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.342491149902344, 3.0718281269073486], "openalex_id": "https://openalex.org/W4410983846", "title": "Origin of the ring ellipticity in the black hole images of M87", "authors": "Rohan Dahale, Ilje Cho, Kotaro Moriyama, K. Wiik, Paul Tiede, Laura G\u00f3mez, Chi\u2010kwan Chan, Roman Gold, Vadim Y. Bernshteyn, Marianna Foschi, Britton Jeter, Hung-Yi Pu, Boris Georgiev, Vikas Joshi, Alejandro Cruz-Osorio, Iniyan Natarajan, E. Avery Broderick, L. Salas, Koushik Chatterjee, Kazunori Akiyama, Ezequiel Albentosa-Ru\u00edz, A. Alberdi, W. Alef, Juan Carlos Algaba, Richard Anantua, Keiichi Asada, Rebecca Azulay, U. Bach, Anne-Kathrin Baczko, David Ball, Mislav Balokovi\u0107, Bidisha Bandyopadhyay, John Barrett, Michi Baub\u00f6ck, B. A. Benson, Dan Bintley, Lindy Blackburn, R. Blundell, L. Katherine Bouman, Geoffrey C. Bower, Michael Bremer, Roger Brissenden, S. Britzen, Dominique Brogui\u00e8re, Thomas Bronzwaer, Sandra Bustamante, Douglas Ferreira Carlos, J. E. Carlstrom, Andrew Chael, O. Chang, Shami Chatterjee, Ming\u2010Tang Chen, Xiaopeng Cheng, Xiaopeng Cheng, Pierre Christian, Shelly Conroy, J. E. Conway, T. M. Crawford, G. Crew, Yuzhu Cui, Brandon Curd, Jordy Davelaar, Mariafelicia De Laurentis, Roger Deane, Jessica Dempsey, G. Desvignes, Jason Dexter, Vedant Dhruv, K. Indu Dihingia, Sheperd S. Doeleman, Sergio A. Dzib, Ralph P. Eatough, Razieh Emami, H. Falcke, Joseph Farah, Vincent L. Fish, E. B. Fomalont, H. Alyson Ford, Raquel Fraga-Encinas, William T. Freeman, Per Friberg, Michael Fromm, Antonio Fuentes, Peter Galison, F. Charles Gammie, Roberto Garc\u00eda, Olivier Gentaz, Gertie Geertsema, C. Goddi, I. Arturo G\u00f3mez-Ruiz, Minfeng Gu, Mark Gurwell, Kazuhiro Hada, Daryl Haggard, Ronald Hesper, Dirk Heumann, Luis C. Ho, Paul T. P. Ho, Mareki Honma, Lei Huang", "abstract": "We investigate the origin of the elliptical ring structure observed in the images of the supermassive black hole M87 *, aiming to disentangle contributions from gravitational, astrophysical, and imaging effects. Leveraging the enhanced capabilities of the Event Horizon Telescope (EHT)'s 2018 array, including improved (u, v)-coverage from the Greenland Telescope, we measured the ring's ellipticity using five independent imaging methods, obtaining a consistent average value of \u03c4 = 0.08 \u22120.02 +0.03 with a position angle of \u03be = 50.1 \u22127.6 +6.2 degrees. To interpret this measurement, we compared it to general relativistic magnetohydrodynamic (GRMHD) simulations spanning a wide range of physical parameters including the thermal or nonthermal electron distribution function, spins, and ion-to-electron temperature ratios in both low- and high-density regions. We find no statistically significant correlation between spin and ellipticity in GRMHD images. Instead, we identify a correlation between ellipticity and the fraction of non-ring emission, particularly in nonthermal models and models with higher jet emission. These results indicate that the ellipticity measured from the M87 * emission structure is consistent with that expected from simulations of turbulent accretion flows around black holes, where it is dominated by astrophysical effects rather than gravitational ones. Future high-resolution imaging, including space very long baseline interferometry and long-term monitoring, will be essential to isolate gravitational signatures from astrophysical effects.", "venue": "Astronomy and Astrophysics", "label": 43}, {"loc": [5.4238739013671875, 2.1797335147857666], "openalex_id": "https://openalex.org/W4410977263", "title": "Retrieving the spatial layout of medium-scale geographical maps through distributional semantics", "authors": "Giorgia Anceresi, Daniele Gatti, Tomaso Vecchi, Marco Marelli, Luca Rinaldi", "abstract": "Recent evidence has indicated that spatial representations, such as large-scale geographical maps, can be retrieved from natural language alone through cognitively plausible distributional-semantic models, which capture word meanings through contextual relationship (i.e., non-spatial associative-learning mechanisms) in large linguistic corpora. Here, we demonstrate that spatial information can be extracted from purely linguistic data even at the medium-scale level (e.g., landmarks within a city). Our results indeed show that different spatial representations (i.e., with information encoded either in terms of relative spatial distances or absolute locations defined by coordinate axes) of the underground maps of five European cities can be retrieved from natural language. Furthermore, by selectively focusing on the London tube, we show that linguistic data align effectively with both geographical and schematic visual maps. These findings contribute to a growing body of research that challenges the traditional view of cognitive maps as primarily relying on specialized spatial computations and highlight the importance of non-spatial associative-learning mechanisms within the linguistic environment in the setting of spatial representations.", "venue": "Neuropsychologia", "label": 0}, {"loc": [8.096968650817871, 3.780611991882324], "openalex_id": "https://openalex.org/W4414896312", "title": "TAH-QUANT: Effective Activation Quantization in Pipeline Parallelism over Slow Network", "authors": "Guoqiang He, Yuan Cao, Yutong He, Tianyi Bai, Kun Yuan, Binhang Yuan", "abstract": "Decentralized training of large language models offers the opportunity to pool computational resources across geographically distributed participants but faces significant network communication bottlenecks, particularly in pipeline-parallel settings. While pipeline parallelism partitions model layers across devices to handle large-scale models, it necessitates frequent communication of intermediate activations, creating challenges when network bandwidth is limited. Existing activation compression methods, such as AQ-SGD, mitigate quantization-induced errors through error compensation but impose prohibitive memory overhead by requiring storage of previous activations. To address these issues, we introduce TAH-Quant (Tile-wise Adaptive Hadamard Quantization), a novel activation quantization framework designed specifically for pipeline parallelism. Our approach integrates fine-grained tile-wise quantization for precise control, entropy-guided token-level adaptive bit allocation for optimal bit usage, and a Hadamard-based transform with pivot element swapping to effectively suppress quantization outliers. We further provide a theoretical analysis, proving that pipeline parallel training equipped with TAH-Quant maintains a convergence rate of $\\mathcal{O}(1/\\sqrt{T})$, matching that of vanilla stochastic gradient descent. Extensive experiments on diverse LLM tasks demonstrate that TAH-Quant achieves aggressive activation quantization (3-4 bits) ratio, which provides up to 4.3$\\times$ end-to-end speedup without compromising training convergence, matches state-of-the-art methods, incurs no extra memory overhead, and generalizes well across different training scenarios.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.40247631072998, 2.786346197128296], "openalex_id": "https://openalex.org/W4414856689", "title": "Advantageous Parameter Expansion Training Makes Better Large Language Models", "authors": "Nannan Gu, Yilong Chen, Zhenyu Zhang, Peng Fu, Zheng Lin, Shuohuan Wang, Yu Sun, Hua Wu, Weiping Wang, Haifeng Wang", "abstract": "Although scaling up the number of trainable parameters in both pre-training and fine-tuning can effectively improve the performance of large language models, it also leads to increased computational overhead. When delving into the parameter difference, we find that a subset of parameters, termed advantageous parameters, plays a crucial role in determining model performance. Further analysis reveals that stronger models tend to possess more such parameters. In this paper, we propose Advantageous Parameter EXpansion Training (APEX), a method that progressively expands advantageous parameters into the space of disadvantageous ones, thereby increasing their proportion and enhancing training effectiveness. Further theoretical analysis from the perspective of matrix effective rank explains the performance gains of APEX. Extensive experiments on both instruction tuning and continued pre-training demonstrate that, in instruction tuning, APEX outperforms full-parameter tuning while using only 52% of the trainable parameters. In continued pre-training, APEX achieves the same perplexity level as conventional training with just 33% of the training data, and yields significant improvements on downstream tasks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.8897674083709717, 2.913450002670288], "openalex_id": "https://openalex.org/W4414856076", "title": "Temporally Extending Existing Web Archive Collections for Longitudinal Analysis", "authors": "Lesley Frew, Michael L. Nelson, Michele C. Weigle", "abstract": "The Environmental Governance and Data Initiative (EDGI) regularly crawled US federal environmental websites between 2016 and 2020 to capture changes between two presidential administrations. However, because it does not include the previous administration ending in 2008, the collection is unsuitable for answering our research question, Were the website terms deleted by the Trump administration (2017--2021) added by the Obama administration (2009--2017)? Thus, like many researchers using the Wayback Machine's holdings for historical analysis, we do not have access to a complete collection suiting our needs. To answer our research question, we must extend the EDGI collection back to January, 2008. This includes discovering relevant pages that were not included in the EDGI collection that persisted through 2020, not just going further back in time with the existing pages. We pieced together artifacts collected by various organizations for their purposes through many means (Save Page Now, Archive-It, and more) in order to curate a dataset sufficient for our intentions. In this paper, we contribute a methodology to extend existing web archive collections temporally to enable longitudinal analysis, including a dataset extended with this methodology. We use our new dataset to analyze our question, Were the website terms deleted by the Trump administration added by the Obama administration? We find that 81 percent of the pages in the dataset changed between 2008 and 2020, and that 87 percent of the pages with terms deleted by the Trump administration were terms added during the Obama administration.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.669546127319336, 1.6396468877792358], "openalex_id": "https://openalex.org/W4410942332", "title": "Simulating Subjects: The Promise and Peril of Artificial Intelligence Stand-Ins for Social Agents and Interactions", "authors": "Austin C. Kozlowski, James A. Evans", "abstract": "Large language models (LLMs), through their exposure to massive collections of online text, learn to reproduce the perspectives and linguistic styles of diverse social and cultural groups. This capability suggests a powerful social scientific application\u2014the simulation of empirically realistic, culturally situated human subjects. Synthesizing recent research in artificial intelligence and computational social science, we outline a methodological foundation for simulating human subjects and their social interactions. We then identify six characteristics of current models that are likely to impair the realistic simulation of human subjects: bias, uniformity, atemporality, disembodiment, linguistic cultures, and alien intelligence. For each of these areas, we discuss promising approaches for overcoming their associated shortcomings. Given the rate of change of these models, we advocate for an ongoing methodological program for the simulation of human subjects that keeps pace with rapid technical progress, and caution that validation against human subjects data remains essential to ensure simulation accuracy.", "venue": "Sociological Methods & Research", "label": 0}, {"loc": [5.39995813369751, 3.075603485107422], "openalex_id": "https://openalex.org/W4414854741", "title": "Defining Foundation Models for Computational Science: A Call for Clarity and Rigor", "authors": "Youngsoo Choi, Siu Wun Cheung, Youngkyu Kim, Ping-Hsuan Tsai, Alejandro N. Diaz, Ivan Zanardi, Seung Whan Chung, Dylan Copeland, Coleman Kendrick, William D. Anderson, Traian Iliescu, Matthias Heinkenschloss", "abstract": "The widespread success of foundation models in natural language processing and computer vision has inspired researchers to extend the concept to scientific machine learning and computational science. However, this position paper argues that as the term \"foundation model\" is an evolving concept, its application in computational science is increasingly used without a universally accepted definition, potentially creating confusion and diluting its precise scientific meaning. In this paper, we address this gap by proposing a formal definition of foundation models in computational science, grounded in the core values of generality, reusability, and scalability. We articulate a set of essential and desirable characteristics that such models must exhibit, drawing parallels with traditional foundational methods, like the finite element and finite volume methods. Furthermore, we introduce the Data-Driven Finite Element Method (DD-FEM), a framework that fuses the modular structure of classical FEM with the representational power of data-driven learning. We demonstrate how DD-FEM addresses many of the key challenges in realizing foundation models for computational science, including scalability, adaptability, and physics consistency. By bridging traditional numerical methods with modern AI paradigms, this work provides a rigorous foundation for evaluating and developing novel approaches toward future foundation models in computational science.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.610644340515137, 2.5957303047180176], "openalex_id": "https://openalex.org/W4415036037", "title": "Test-Time Learning for Large Language Models", "authors": "Jinwu Hu, Zhitian Zhang, Guohao Chen, Xin-Jian Wen, Chao Shuai, Wei Luo, Bin Xiao, Yuanqing Li, Mingkui Tan", "abstract": "While Large Language Models (LLMs) have exhibited remarkable emergent capabilities through extensive pre-training, they still face critical limitations in generalizing to specialized domains and handling diverse linguistic variations, known as distribution shifts. In this paper, we propose a Test-Time Learning (TTL) paradigm for LLMs, namely TLM, which dynamically adapts LLMs to target domains using only unlabeled test data during testing. Specifically, we first provide empirical evidence and theoretical insights to reveal that more accurate predictions from LLMs can be achieved by minimizing the input perplexity of the unlabeled test data. Based on this insight, we formulate the Test-Time Learning process of LLMs as input perplexity minimization, enabling self-supervised enhancement of LLM performance. Furthermore, we observe that high-perplexity samples tend to be more informative for model optimization. Accordingly, we introduce a Sample Efficient Learning Strategy that actively selects and emphasizes these high-perplexity samples for test-time updates. Lastly, to mitigate catastrophic forgetting and ensure adaptation stability, we adopt Low-Rank Adaptation (LoRA) instead of full-parameter optimization, which allows lightweight model updates while preserving more original knowledge from the model. We introduce the AdaptEval benchmark for TTL and demonstrate through experiments that TLM improves performance by at least 20% compared to original LLMs on domain knowledge adaptation.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.568524360656738, 2.4069840908050537], "openalex_id": "https://openalex.org/W4415035741", "title": "GRAPE: Optimize Data Mixture for Group Robust Multi-target Adaptive Pretraining", "authors": "Simin Fan, Maria Ios Glarou, Martin Jaggi", "abstract": "The performance of large language models (LLMs) across diverse downstream applications is fundamentally governed by the quality and composition of their pretraining corpora. Existing domain reweighting algorithms primarily optimize data mixtures for a single target task, thereby resulting in models that overfit to specialized objectives while exhibiting substantial performance degradation on other benchmarks. This paper introduces Group Robust Multi-target Adaptive PrEtraining (GRAPE), a novel multi-source-multi-target domain reweighting framework designed to calibrate pretraining data mixtures for robust performance across multiple target tasks simultaneously. GRAPE dynamically adjusts sampling weights across source domains (domain weights) while concurrently modulating task weights that quantify the relative importance of each individual target task. This adaptive process prioritizes tasks based on their learning difficulty throughout training. We formulate this interleaved reweighting mechanism as a minimax optimization problem: The inner maximization adjusts task weights leveraging group distributed-robust-optimization (DRO), where those tasks demonstrating the least improvement under the current data mixture are prioritized with higher weights; The outer minimization then optimizes domain weights to maximize loss reduction on the prioritized tasks. Experiments on ClimbLab and SlimPajama datasets demonstrate that GRAPE consistently outperforms baseline methods in terms of reasoning performance across 6 benchmarks. Furthermore, when applied to multilingual targets, GRAPE effectively identifies optimal training mixtures from mainstream languages, achieving superior language modeling capabilities across 8 low-resource target languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.53759765625, 1.627633810043335], "openalex_id": "https://openalex.org/W4414587255", "title": "Foundation Models for Tabular Data within Systemic Contexts Need Grounding", "authors": "Tassilo Klein, Johannes Hoffart", "abstract": "Current research on tabular foundation models often overlooks the complexities of large-scale, real-world data by treating tables as isolated entities and assuming information completeness, thereby neglecting the vital operational context. To address this, we introduce the concept of Semantically Linked Tables (SLT), recognizing that tables are inherently connected to both declarative and procedural operational knowledge. We propose Foundation Models for Semantically Linked Tables (FMSLT), which integrate these components to ground tabular data within its true operational context. This comprehensive representation unlocks the full potential of machine learning for complex, interconnected tabular data across diverse domains. Realizing FMSLTs requires access to operational knowledge that is often unavailable in public datasets, highlighting the need for close collaboration between domain experts and researchers. Our work exposes the limitations of current tabular foundation models and proposes a new direction centered on FMSLTs, aiming to advance robust, context-aware models for structured data.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.196240425109863, 1.0566351413726807], "openalex_id": "https://openalex.org/W4414854692", "title": "DeepResearchGym: A Free, Transparent, and Reproducible Evaluation Sandbox for Deep Research", "authors": "Jo\u00e3o M. P. Coelho, Ning Jia, Jingyuan He, Kangrui Mao, Abhijay Paladugu, Pranav Setlur, Junhong Jin, Jamie Callan, Jo\u00e3o Magalh\u00e3es, Bruno Martins, Chenyan Xiong", "abstract": "Deep research systems represent an emerging class of agentic information retrieval methods that generate comprehensive and well-supported reports to complex queries. However, most existing frameworks rely on dynamic commercial search APIs, which pose reproducibility and transparency challenges in addition to their cost. To address these limitations, we introduce DeepResearchGym, an open-source sandbox that combines a reproducible search API with a rigorous evaluation protocol for benchmarking deep research systems. The API indexes large-scale public web corpora, namely ClueWeb22 and FineWeb, using a state-of-the-art dense retriever and approximate nearest neighbor search via DiskANN. It achieves lower latency than popular commercial APIs while ensuring stable document rankings across runs, and is freely available for research use. To evaluate deep research systems' outputs, we extend the Researchy Questions benchmark with automatic metrics through LLM-as-a-judge assessments to measure alignment with users' information needs, retrieval faithfulness, and report quality. Experimental results show that systems integrated with DeepResearchGym achieve performance comparable to those using commercial APIs, with performance rankings remaining consistent across evaluation metrics. A human evaluation study further confirms that our automatic protocol aligns with human preferences, validating the framework's ability to help support controlled assessment of deep research systems. Our code and API documentation are available at https://www.deepresearchgym.ai.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.040232181549072, 1.934051275253296], "openalex_id": "https://openalex.org/W4414586617", "title": "Turing Test 2.0: The General Intelligence Threshold", "authors": "Georgios Mappouras", "abstract": "With the rise of artificial intelligence (A.I.) and large language models like ChatGPT, a new race for achieving artificial general intelligence (A.G.I) has started. While many speculate how and when A.I. will achieve A.G.I., there is no clear agreement on how A.G.I. can be detected in A.I. models, even when popular tools like the Turing test (and its modern variations) are used to measure their intelligence. In this work, we discuss why traditional methods like the Turing test do not suffice for measuring or detecting A.G.I. and provide a new, practical method that can be used to decide if a system (computer or any other) has reached or surpassed A.G.I. To achieve this, we make two new contributions. First, we present a clear definition for general intelligence (G.I.) and set a G.I. Threshold (G.I.T.) that can be used to distinguish between systems that achieve A.G.I. and systems that do not. Second, we present a new framework on how to construct tests that can detect if a system has achieved G.I. in a simple, comprehensive, and clear-cut fail/pass way. We call this novel framework the Turing test 2.0. We then demonstrate real-life examples of applying tests that follow our Turing test 2.0 framework on modern A.I. models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.6397123336792, 2.683992385864258], "openalex_id": "https://openalex.org/W4414587470", "title": "ESLM: Risk-Averse Selective Language Modeling for Efficient Pretraining", "authors": "Melis Ilayda Bal, Volkan Cevher, Michael Muehlebach", "abstract": "Large language model pretraining is compute-intensive, yet many tokens contribute marginally to learning, resulting in inefficiency. We introduce Efficient Selective Language Modeling (ESLM), a risk-aware algorithm that improves training efficiency and distributional robustness by performing online token-level batch selection. ESLM leverages per-token statistics (e.g., entropy or loss) and applies value-at-risk thresholding to retain only the most informative tokens per batch. This data-centric mechanism reshapes the training loss, prioritizing high-risk tokens and eliminating redundant gradient computation. We frame ESLM as a bilevel game: the model competes with a masking adversary that selects worst-case token subsets under a constrained thresholding rule. In the loss-based setting, ESLM recovers conditional value-at-risk loss minimization, providing a principled connection to distributionally robust optimization. We extend our approach to Ada-ESLM, which adaptively tunes the selection confidence during training. Experiments on GPT-2 pretraining show that ESLM significantly reduces training FLOPs while maintaining or improving both perplexity and downstream performance compared to baselines. Our approach also scales across model sizes, pretraining corpora, and integrates naturally with knowledge distillation.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.441784858703613, 5.058460712432861], "openalex_id": "https://openalex.org/W4414580785", "title": "FullFront: Benchmarking MLLMs Across the Full Front-End Engineering Workflow", "authors": "Haoyu Sun, Huichen Will Wang, Jiawei Gu, Linjie Li, Yu Cheng", "abstract": "Front-end engineering involves a complex workflow where engineers conceptualize designs, translate them into code, and iteratively refine the implementation. While recent benchmarks primarily focus on converting visual designs to code, we present FullFront, a benchmark designed to evaluate Multimodal Large Language Models (MLLMs) \\textbf{across the full front-end development pipeline}. FullFront assesses three fundamental tasks that map directly to the front-end engineering pipeline: Webpage Design (conceptualization phase), Webpage Perception QA (comprehension of visual organization and elements), and Webpage Code Generation (implementation phase). Unlike existing benchmarks that use either scraped websites with bloated code or oversimplified LLM-generated HTML, FullFront employs a novel, two-stage process to transform real-world webpages into clean, standardized HTML while maintaining diverse visual designs and avoiding copyright issues. Extensive testing of state-of-the-art MLLMs reveals significant limitations in page perception, code generation (particularly for image handling and layout), and interaction implementation. Our results quantitatively demonstrate performance disparities across models and tasks, and highlight a substantial gap between current MLLM capabilities and human expert performance in front-end engineering. The FullFront benchmark and code are available in https://github.com/Mikivishy/FullFront.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.006827354431152, 3.6205151081085205], "openalex_id": "https://openalex.org/W4414581015", "title": "Discovering Forbidden Topics in Language Models", "authors": "Can Rager, Chris Wendler, Rohit Gandikota, David Bau", "abstract": "Refusal discovery is the task of identifying the full set of topics that a language model refuses to discuss. We introduce this new problem setting and develop a refusal discovery method, Iterated Prefill Crawler (IPC), that uses token prefilling to find forbidden topics. We benchmark IPC on Tulu-3-8B, an open-source model with public safety tuning data. Our crawler manages to retrieve 31 out of 36 topics within a budget of 1000 prompts. Next, we scale the crawler to a frontier model using the prefilling option of Claude-Haiku. Finally, we crawl three widely used open-weight models: Llama-3.3-70B and two of its variants finetuned for reasoning: DeepSeek-R1-70B and Perplexity-R1-1776-70B. DeepSeek-R1-70B reveals patterns consistent with censorship tuning: The model exhibits \"thought suppression\" behavior that indicates memorization of CCP-aligned responses. Although Perplexity-R1-1776-70B is robust to censorship, IPC elicits CCP-aligned refusals answers in the quantized model. Our findings highlight the critical need for refusal discovery methods to detect biases, boundaries, and alignment failures of AI systems.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.2408034801483154, 1.3160736560821533], "openalex_id": "https://openalex.org/W4410759468", "title": "And Plato met ChatGPT: an ethical reflection on the use of chatbots in scientific research writing, with a particular focus on the social sciences", "authors": "Reyes Calder\u00f3n, Francisco Herrera", "abstract": "Abstract This interdisciplinary paper analyzes the use of Large Language Models based chatbots (LLM-chatbots), with ChatGPT the most known exponent, in scientific research writing. By interacting with LLM-chatbots, researchers could reduce efforts and costs as well as improve efficiency, but taking important risks, limitations, and weaknesses, which could highly-order erosion scientific thought. While many scientific journals, as well as major publishers such as Springer-Nature or Taylor & Francis, are restricting its use, others advocate for its normalization. Debate focuses on two main questions: the possible authorship of LLM-chatbots, which is majority denied because their inability to meet the required standards; and the acceptance of hybrid articles (using LLM-chatbots). Very recently, focusing on the education area, literature has found analogical similarities between some issues involved in Chatbots and that of Plato criticisms of writing, contained in the Phaedrus. However, the research area has been neglected. Combining philosophical and technological analysis, we explore Plato\u2019s myth of Theuth and Thamus, questioning if chatbots can improve science. From an interdisciplinary perspective, and according with Plato, we conclude LLM-chatbots cannot be considered as authors in a scientific context. Moreover, we offer some arguments and requirements to accept hybrid articles. We draw attention to the need for social science publishers, an area where conceptual hypotheses can take a long time to confirm, rather than solely on experimental observations. Finally, we advocate that publishers, communities, technical experts, and regulatory authorities collaborate to establish recommendations and best practices for chatbot use.", "venue": "Humanities and Social Sciences Communications", "label": 0}, {"loc": [9.494466781616211, 0.7191019654273987], "openalex_id": "https://openalex.org/W4415035372", "title": "Does Synthetic Data Help Named Entity Recognition for Low-Resource Languages?", "authors": "Gaurav Kamath, Sowmya Vajjala", "abstract": "Named Entity Recognition(NER) for low-resource languages aims to produce robust systems for languages where there is limited labeled training data available, and has been an area of increasing interest within NLP. Data augmentation for increasing the amount of low-resource labeled data is a common practice. In this paper, we explore the role of synthetic data in the context of multilingual, low-resource NER, considering 11 languages from diverse language families. Our results suggest that synthetic data does in fact hold promise for low-resource language NER, though we see significant variation between languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.00781512260437, 2.6307241916656494], "openalex_id": "https://openalex.org/W4410598123", "title": "AI-Generated Content in Copyright Law: A Roadmap for Updating GCC Copyright Law", "authors": "Yongqing Lu", "abstract": "With the emergence of disputes over the copyright of AI-generated content (AIGC), academia has extensively discussed relevant issues, including copyright protectability and ownership. However, the copyright law community has not reached an international consensus. Adopting a doctrinal methodology, this paper investigates these issues and proposes reforms, arguing that copyright law should clarify the de facto authorship of AI and determine the originality of AIGC based on minimum creativity at the expression level. It also recommends attributing copyright of AIGC to the AI owner via statutory provision, allowing contractual allocation between parties. The proposed framework would resolve significant academic controversies on fundamental issues surrounding AIGC copyright and provide a reference model for future research.", "venue": "Technology and Regulation", "label": 0}, {"loc": [6.0754547119140625, 0.03111533261835575], "openalex_id": "https://openalex.org/W4410612857", "title": "Entropy and type-token ratio in gigaword corpora", "authors": "Pablo Rosillo-Rodes, M. San Miguel, David S\u00e1nchez", "abstract": "There are different ways of measuring diversity in complex systems. In particular, in language, lexical diversity is characterized in terms of the type-token ratio and the word entropy. We here investigate both diversity metrics in six massive linguistic data sets in English, Spanish, and Turkish, consisting of books, news articles, and tweets. These gigaword corpora correspond to languages with distinct morphological features and differ in registers and genres, thus constituting a varied testbed for a quantitative approach to lexical diversity. We unveil an empirical functional relation between entropy and type-token ratio of texts of a given corpus and language, which is a consequence of the statistical laws observed in natural language. Further, in the limit of large text lengths we find an analytical expression for this relation relying on both Zipf and Heaps laws that agrees with our empirical findings.", "venue": "Physical Review Research", "label": 0}, {"loc": [7.116024971008301, 3.1911160945892334], "openalex_id": "https://openalex.org/W4415326918", "title": "Foundations of Unknown-aware Machine Learning", "authors": "Xuefeng Du", "abstract": "Ensuring the reliability and safety of machine learning models in open-world deployment is a central challenge in AI safety. This thesis develops both algorithmic and theoretical foundations to address key reliability issues arising from distributional uncertainty and unknown classes, from standard neural networks to modern foundation models like large language models (LLMs). Traditional learning paradigms, such as empirical risk minimization (ERM), assume no distribution shift between training and inference, often leading to overconfident predictions on out-of-distribution (OOD) inputs. This thesis introduces novel frameworks that jointly optimize for in-distribution accuracy and reliability to unseen data. A core contribution is the development of an unknown-aware learning framework that enables models to recognize and handle novel inputs without labeled OOD data. We propose new outlier synthesis methods, VOS, NPOS, and DREAM-OOD, to generate informative unknowns during training. Building on this, we present SAL, a theoretical and algorithmic framework that leverages unlabeled in-the-wild data to enhance OOD detection under realistic deployment conditions. These methods demonstrate that abundant unlabeled data can be harnessed to recognize and adapt to unforeseen inputs, providing formal reliability guarantees. The thesis also extends reliable learning to foundation models. We develop HaloScope for hallucination detection in LLMs, MLLMGuard for defending against malicious prompts in multimodal models, and data cleaning methods to denoise human feedback used for better alignment. These tools target failure modes that threaten the safety of large-scale models in deployment. Overall, these contributions promote unknown-aware learning as a new paradigm, and we hope it can advance the reliability of AI systems with minimal human efforts.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.641709566116333, 4.438873767852783], "openalex_id": "https://openalex.org/W4415328677", "title": "An Efficient Private GPT Never Autoregressively Decodes", "authors": "Zhuyong Li, Yue Guan, Kang Yang, Yu Feng, Ning Liu, Yu Yu, Jingwen Leng, Minyi Guo", "abstract": "The wide deployment of the generative pre-trained transformer (GPT) has raised privacy concerns for both clients and servers. While cryptographic primitives can be employed for secure GPT inference to protect the privacy of both parties, they introduce considerable performance overhead.To accelerate secure inference, this study proposes a public decoding and secure verification approach that utilizes public GPT models, motivated by the observation that securely decoding one and multiple tokens takes a similar latency. The client uses the public model to generate a set of tokens, which are then securely verified by the private model for acceptance. The efficiency of our approach depends on the acceptance ratio of tokens proposed by the public model, which we improve from two aspects: (1) a private sampling protocol optimized for cryptographic primitives and (2) model alignment using knowledge distillation. Our approach improves the efficiency of secure decoding while maintaining the same level of privacy and generation quality as standard secure decoding. Experiments demonstrate a $2.1\\times \\sim 6.0\\times$ speedup compared to standard decoding across three pairs of public-private models and different network conditions.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.521881580352783, 2.1395275592803955], "openalex_id": "https://openalex.org/W4415329277", "title": "The Representational Alignment between Humans and Language Models is implicitly driven by a Concreteness Effect", "authors": "Cosimo Iaia, Bhavin Choksi, Emily Wiebers, Gemma Roig, Christian J. Fiebach", "abstract": "The nouns of our language refer to either concrete entities (like a table) or abstract concepts (like justice or love), and cognitive psychology has established that concreteness influences how words are processed. Accordingly, understanding how concreteness is represented in our mind and brain is a central question in psychology, neuroscience, and computational linguistics. While the advent of powerful language models has allowed for quantitative inquiries into the nature of semantic representations, it remains largely underexplored how they represent concreteness. Here, we used behavioral judgments to estimate semantic distances implicitly used by humans, for a set of carefully selected abstract and concrete nouns. Using Representational Similarity Analysis, we find that the implicit representational space of participants and the semantic representations of language models are significantly aligned. We also find that both representational spaces are implicitly aligned to an explicit representation of concreteness, which was obtained from our participants using an additional concreteness rating task. Importantly, using ablation experiments, we demonstrate that the human-to-model alignment is substantially driven by concreteness, but not by other important word characteristics established in psycholinguistics. These results indicate that humans and language models converge on the concreteness dimension, but not on other dimensions.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.415402412414551, 5.357738971710205], "openalex_id": "https://openalex.org/W4415329305", "title": "SNAP: A Benchmark for Testing the Effects of Capture Conditions on Fundamental Vision Tasks", "authors": "Iuliia Kotseruba, John K. Tsotsos", "abstract": "Generalization of deep-learning-based (DL) computer vision algorithms to various image perturbations is hard to establish and remains an active area of research. The majority of past analyses focused on the images already captured, whereas effects of the image formation pipeline and environment are less studied. In this paper, we address this issue by analyzing the impact of capture conditions, such as camera parameters and lighting, on DL model performance on 3 vision tasks -- image classification, object detection, and visual question answering (VQA). To this end, we assess capture bias in common vision datasets and create a new benchmark, SNAP (for $\\textbf{S}$hutter speed, ISO se$\\textbf{N}$sitivity, and $\\textbf{AP}$erture), consisting of images of objects taken under controlled lighting conditions and with densely sampled camera settings. We then evaluate a large number of DL vision models and show the effects of capture conditions on each selected vision task. Lastly, we conduct an experiment to establish a human baseline for the VQA task. Our results show that computer vision datasets are significantly biased, the models trained on this data do not reach human accuracy even on the well-exposed images, and are susceptible to both major exposure changes and minute variations of camera settings. Code and data can be found at https://github.com/ykotseruba/SNAP", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.706820964813232, -0.3471450209617615], "openalex_id": "https://openalex.org/W4410601021", "title": "Large Language Models in Crisis Informatics for Zero and Few-Shot Classification", "authors": "Cinthia S\u00e1nchez, Andr\u00e9s Abeliuk, B\u00e1rbara Poblete", "abstract": "This article presents an exploration of the use of pre-trained Large Language Models (LLMs) for crisis classification to address labeled data dependency issues. We present a methodology that enhances open LLMs through fine-tuning, creating zero-shot and few-shot classifiers that approach traditional supervised models in classifying crisis-related messages. A comparative study evaluates crisis classification tasks using general domain pre-trained LLMs, crisis-specific LLMs, and traditional supervised learning methods, establishing a benchmark in the field. Our task-specific fine-tuned Llama model achieved a 69% macro F1 score in classifying humanitarian information\u2013a remarkable 26% improvement compared to the Llama baseline, even with limited training data. Moreover, it outperformed ChatGPT4 by 3% in macro F1. This improvement increased to 71% macro F1 when fine-tuning Llama with multitask data. For the binary classification of messages as related vs. not related to crises, we observed that pre-trained LLMs, such as Llama 2 and ChatGPT4, performed well without fine-tuning, achieving an 87% macro F1 score with ChatGPT4. This research expands our knowledge of how to exploit the potential of LLMs for crisis classification, representing a great opportunity for crisis scenarios that lack labeled data. The findings emphasize the potential of LLMs in crisis informatics to address cold start challenges, especially critical in the initial phases of a disaster, while also showcasing their capacity to attain high accuracy even with limited training data.", "venue": "ACM Transactions on the Web", "label": 0}, {"loc": [4.303424835205078, 3.121946334838867], "openalex_id": "https://openalex.org/W4415330128", "title": "Exploring Causes of Representational Similarity in Machine Learning Models", "authors": "Zeyu Michael Li, Hung Anh Vu, Damilola Awofisayo, Emily Wenger", "abstract": "Numerous works have noted similarities in how machine learning models represent the world, even across modalities. Although much effort has been devoted to uncovering properties and metrics on which these models align, surprisingly little work has explored causes of this similarity. To advance this line of inquiry, this work explores how two factors - dataset overlap and task overlap - influence downstream model similarity. We evaluate the effects of both factors through experiments across model sizes and modalities, from small classifiers to large language models. We find that both task and dataset overlap cause higher representational similarity and that combining them provides the strongest effect. Finally, we consider downstream consequences of representational similarity, demonstrating how greater similarity increases vulnerability to transferable adversarial and jailbreak attacks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.393611907958984, 5.337865829467773], "openalex_id": "https://openalex.org/W4415023874", "title": "SCAN: Semantic Document Layout Analysis for Textual and Visual Retrieval-Augmented Generation", "authors": "Yuyang Dong, Nobuhiro Ueda, Kriszti\u00e1n Boros, Daiki Ito, Takuya Sera, Masafumi Oyamada", "abstract": "With the increasing adoption of Large Language Models (LLMs) and Vision-Language Models (VLMs), rich document analysis technologies for applications like Retrieval-Augmented Generation (RAG) and visual RAG are gaining significant attention. Recent research indicates that using VLMs can achieve better RAG performance, but processing rich documents still remains a challenge since a single page contains large amounts of information. In this paper, we present SCAN (\\textbf{S}emanti\\textbf{C} Document Layout \\textbf{AN}alysis), a novel approach enhancing both textual and visual Retrieval-Augmented Generation (RAG) systems working with visually rich documents. It is a VLM-friendly approach that identifies document components with appropriate semantic granularity, balancing context preservation with processing efficiency. SCAN uses a coarse-grained semantic approach that divides documents into coherent regions covering continuous components. We trained the SCAN model by fine-tuning object detection models with sophisticated annotation datasets. Our experimental results across English and Japanese datasets demonstrate that applying SCAN improves end-to-end textual RAG performance by up to 9.0\\% and visual RAG performance by up to 6.4\\%, outperforming conventional approaches and even commercial document processing solutions.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.155404567718506, -0.15985292196273804], "openalex_id": "https://openalex.org/W4415020697", "title": "Cross-Linguistic Transfer in Multilingual NLP: The Role of Language Families and Morphology", "authors": "Ajitesh Bankula, Praney Bankula", "abstract": "Cross-lingual transfer has become a crucial aspect of multilingual NLP, as it allows for models trained on resource-rich languages to be applied to low-resource languages more effectively. Recently massively multilingual pre-trained language models (e.g., mBERT, XLM-R) demonstrate strong zero-shot transfer capabilities[14] [13]. This paper investigates cross-linguistic transfer through the lens of language families and morphology. Investigating how language family proximity and morphological similarity affect performance across NLP tasks. We further discuss our results and how it relates to findings from recent literature. Overall, we compare multilingual model performance and review how linguistic distance metrics correlate with transfer outcomes. We also look into emerging approaches that integrate typological and morphological information into model pre-training to improve transfer to diverse languages[18] [19].", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.569845199584961, 0.835236668586731], "openalex_id": "https://openalex.org/W4415021854", "title": "Enhancing LLMs via High-Knowledge Data Selection", "authors": "Feiyu Duan, Xiaona Zhang, Sirui Wang, Haoran Que, Yuqi Liu, Wenge Rong, Xunliang Cai", "abstract": "The performance of Large Language Models (LLMs) is intrinsically linked to the quality of its training data. Although several studies have proposed methods for high-quality data selection, they do not consider the importance of knowledge richness in text corpora. In this paper, we propose a novel and gradient-free High-Knowledge Scorer (HKS) to select high-quality data from the dimension of knowledge, to alleviate the problem of knowledge scarcity in the pre-trained corpus. We propose a comprehensive multi-domain knowledge element pool and introduce knowledge density and coverage as metrics to assess the knowledge content of the text. Based on this, we propose a comprehensive knowledge scorer to select data with intensive knowledge, which can also be utilized for domain-specific high-knowledge data selection by restricting knowledge elements to the specific domain. We train models on a high-knowledge bilingual dataset, and experimental results demonstrate that our scorer improves the model's performance in knowledge-intensive and general comprehension tasks, and is effective in enhancing both the generic and domain-specific capabilities of the model.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.282820224761963, 2.5527055263519287], "openalex_id": "https://openalex.org/W4414580562", "title": "SHARP: Synthesizing High-quality Aligned Reasoning Problems for Large Reasoning Models Reinforcement Learning", "authors": "Xingfang Wu, Z. Y. Zhang, Zheng Wen, Zhiqiang Zhang, Wang Ren, Lei Shi, Cai Chen, Zhao Deng, Qing Wang, Xudong Han, Chengfu Tang, Dingnan Jin, Qing Cui, Jun Zhou", "abstract": "Training large reasoning models (LRMs) with reinforcement learning in STEM domains is hindered by the scarcity of high-quality, diverse, and verifiable problem sets. Existing synthesis methods, such as Chain-of-Thought prompting, often generate oversimplified or uncheckable data, limiting model advancement on complex tasks. To address these challenges, we introduce SHARP, a unified approach to Synthesizing High-quality Aligned Reasoning Problems for LRMs reinforcement learning with verifiable rewards (RLVR). SHARP encompasses a strategic set of self-alignment principles -- targeting graduate and Olympiad-level difficulty, rigorous logical consistency, and unambiguous, verifiable answers -- and a structured three-phase framework (Alignment, Instantiation, Inference) that ensures thematic diversity and fine-grained control over problem generation. We implement SHARP by leveraging a state-of-the-art LRM to infer and verify challenging STEM questions, then employ a reinforcement learning loop to refine the model's reasoning through verifiable reward signals. Experiments on benchmarks such as GPQA demonstrate that SHARP-augmented training substantially outperforms existing methods, markedly improving complex reasoning accuracy and pushing LRM performance closer to expert-level proficiency. Our contributions include the SHARP strategy, framework design, end-to-end implementation, and experimental evaluation of its effectiveness in elevating LRM reasoning capabilities.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.738212585449219, 3.729712963104248], "openalex_id": "https://openalex.org/W4415020574", "title": "EfficientLLM: Efficiency in Large Language Models", "authors": "Zhengqing Yuan, Weixiang Sun, Yixin Liu, Huichi Zhou, Zhou Rong, Yiyang Li, Zheyuan Zhang, Wei Song, Yue Huang, Hongtao Jia, Keerthiram Murugesan, Yu Wang, HE Li-fang, Jianfeng Gao, Lichao Sun, Yanfang Ye", "abstract": "Large Language Models (LLMs) have driven significant progress, yet their growing parameter counts and context windows incur prohibitive compute, energy, and monetary costs. We introduce EfficientLLM, a novel benchmark and the first comprehensive empirical study evaluating efficiency techniques for LLMs at scale. Conducted on a production-class cluster (48xGH200, 8xH200 GPUs), our study systematically explores three key axes: (1) architecture pretraining (efficient attention variants: MQA, GQA, MLA, NSA; sparse Mixture-of-Experts (MoE)), (2) fine-tuning (parameter-efficient methods: LoRA, RSLoRA, DoRA), and (3) inference (quantization methods: int4, float16). We define six fine-grained metrics (Memory Utilization, Compute Utilization, Latency, Throughput, Energy Consumption, Compression Rate) to capture hardware saturation, latency-throughput balance, and carbon cost. Evaluating over 100 model-technique pairs (0.5B-72B parameters), we derive three core insights: (i) Efficiency involves quantifiable trade-offs: no single method is universally optimal; e.g., MoE reduces FLOPs and improves accuracy but increases VRAM by 40%, while int4 quantization cuts memory/energy by up to 3.9x at a 3-5% accuracy drop. (ii) Optima are task- and scale-dependent: MQA offers optimal memory-latency trade-offs for constrained devices, MLA achieves lowest perplexity for quality-critical tasks, and RSLoRA surpasses LoRA efficiency only beyond 14B parameters. (iii) Techniques generalize across modalities: we extend evaluations to Large Vision Models (Stable Diffusion 3.5, Wan 2.1) and Vision-Language Models (Qwen2.5-VL), confirming effective transferability. By open-sourcing datasets, evaluation pipelines, and leaderboards, EfficientLLM provides essential guidance for researchers and engineers navigating the efficiency-performance landscape of next-generation foundation models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.851194143295288, 3.7829840183258057], "openalex_id": "https://openalex.org/W4415022851", "title": "Capturing the Effects of Quantization on Trojans in Code LLMs", "authors": "Aftab Hussain, Sadegh AlMahdi Kazemi Zarkouei, Md Rafiqul Islam Rabin, Mohammad Amin Alipour, Sen Lin, Bowen Xu", "abstract": "Large language models of code exhibit high capability in performing diverse software engineering tasks, such as code translation, defect detection, text-to-code generation, and code summarization. While their ability to enhance developer productivity has spurred widespread use, these models have also seen substantial growth in size, often reaching billions of parameters. This scale demands efficient memory resource usage, prompting practitioners to use optimization techniques such as model quantization. Quantization uses smaller bit representations for the model parameters, reducing the precision of the weights. In this work, we investigate the impact of quantization on the risk of data poisoning attacks on these models, specifically examining whether it mitigates or exacerbates such vulnerabilities. We focus on two large language models, Meta's Llama-2-7b and CodeLlama-7b, applied to an SQL code generation task. Additionally, we introduce a new metric for measuring trojan signals in compromised models. We find that quantization has differing effects on code-generating LLMs: while reducing precision does not significantly alter Llama-2's behavior, it boosts performance and reduces attack success rates in CodeLlama, particularly at 4-bit precision.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.1061930656433105, 0.02547088824212551], "openalex_id": "https://openalex.org/W4410508815", "title": "Methods and Resources in Germanic Variationist Linguistics", "authors": "John Nerbonne, Verena Blaschke, Hinrich Sch\u00fctze, Barbara Plank", "abstract": "Variationist linguistics, encompassing dialectology and sociolinguistics, studies how linguistic variation is distributed and the dynamics behind the distribution. This article aims to present the most important current resources\u2014methods and data and software archives\u2014for research in Germanic variationist linguists. It is novel to include an article on resources in a collection such as this Encyclopedia, so we begin by motivating its inclusion, justifying why contemporary scholars are expected to make resources available to the discipline. With respect to methods, the emphasis is on analytical methods as opposed to methods for field work, site selection, or interviews, and the focus is on software for data analysis. With respect to archives, we emphasize digital repositories. We report on resources important in the variationist research community, that is, dialectology and sociolinguistics, but also on resources used in the growing community of computational linguists interested in variation.", "venue": "Oxford Research Encyclopedia of Linguistics", "label": 0}, {"loc": [2.0165748596191406, 5.346847057342529], "openalex_id": "https://openalex.org/W4410429392", "title": "Safeguarding Patient Data: Machine Learning for Phishing URL Detection in Healthcare Systems", "authors": "A. A. Mousa, S. Hassan, Mohammed Kareem Rashid, Murtadha L. Al\u2010Saady", "abstract": "Since the healthcare industry depends more and more on digital infrastructure, it is a perfect target for cyberattacks especially phishing. Designed to pass for real healthcare websites, phishing URLs seriously compromise patient data security. Effective strikes may cause disruptions in patient care, financial losses, and Protected Health Information (PHI) breaches. This work investigates the use of machine learning (ML) approaches for robust and accurate phishing URL detection in healthcare systems in order to handle this important problem. We examine a Multilayer Perceptron (MLP) neural network-based detection model and evaluate its performance against known techniques, Decision Tree (DT) and Naive Bayes (NB). Comprehensive URL datasets\u2014more especially, the ISCX-URL-2016 dataset for training and testing\u2014as well as the CIC-InvesBanking-2017 dataset combined with live phishing feeds for validation help to train and rigorously validate the models. With a high accuracy of 87.45% on test data and a precision of 84% on unseen validation data, our experimental results show that the proposed MLP model much exceeds DT and NB. This emphasizes how ML\u2014more especially, MLP\u2014may improve cybersecurity defences in healthcare, hence securing private patient data and the integrity of healthcare processes.", "venue": "Journal of Advanced Research Design", "label": 0}, {"loc": [5.2944793701171875, 3.136716365814209], "openalex_id": "https://openalex.org/W4410460359", "title": "First polarization study of the M87 jet and active galactic nuclei at submillimeter wavelengths with ALMA", "authors": "C. Goddi, Douglas Ferreira Carlos, G. Crew, Lynn D. Matthews, Hugo Messias, Alejandro Mus, Iv\u00e1n Mart\u00ed-Vidal, Ezequiel Albentosa-Ru\u00edz, Mariafelicia De Laurentis, Elisabetta Liuzzo, N. Marchili, L.J. Kazi Rygl, Kazunori Akiyama, A. Alberdi, W. Alef, Juan Carlos Algaba, Richard Anantua, Keiichi Asada, Rebecca Azulay, U. Bach, Anne-Kathrin Baczko, David Ball, Mislav Balokovi\u0107, Bidisha Bandyopadhyay, John Barrett, Michi Baub\u00f6ck, B. A. Benson, Dan Bintley, Lindy Blackburn, R. Blundell, L. Katherine Bouman, Geoffrey C. Bower, Michael Bremer, Roger Brissenden, S. Britzen, E. Avery Broderick, Dominique Brogui\u00e8re, Thomas Bronzwaer, Sandra Bustamante, J. E. Carlstrom, Andrew Chael, Chi\u2010kwan Chan, O. Chang, Koushik Chatterjee, Shami Chatterjee, Ming\u2010Tang Chen, Xiaopeng Cheng, \u9648\u6c38\u519b, Xiaopeng Cheng, Pierre Christian, Pierre Christian, Shelly Conroy, T. M. Crawford, M. K. Crawford, Yuzhu Cui, Yuzhu Cui, \u5d14\u7389\u7af9, Jordy Davelaar, Rohan Dahale, Jordy Davelaar, Roger Deane, Jessica Dempsey, Vedant Dhruv, Jason Dexter, Vedant Dhruv, Sergio A. Dzib, Sheperd S. Doeleman, Razieh Emami, P. Ralph Eatough, Razieh Emami, Vincent L. Fish, Joseph Farah, Lisa Fish, Marianna Foschi, Raquel Fraga-Encinas, William T. Freeman, Raquel Fraga-Encinas, T. Freeman, Per Friberg, Peter Galison, Charles F. Gammie, Roberto Garc\u00eda, Olivier Gentaz, Roberto Garc\u00eda, Roman Gold, Boris Georgiev, Roman Gold, Minfeng Gu, Mark Gurwell, Minfeng Gu, \u987e\u654f\u5cf0, Ronald Hesper, Kazuhiro Hada, Luis C. Ho, Ronald Hesper, Mareki Honma, Chih-Wei L. Huang, \u4f55\u5b50\u5c71, Paul T. P. Ho, Shiro Ikeda", "abstract": "Aims. We investigated the polarization and Faraday properties of Messier 87 (M87) and seven other radio-loud active galactic nuclei (AGNs) at \u03bb 0.87 mm (345 GHz) using the Atacama Large Millimeter/submillimeter Array (ALMA). Our goal was to characterize the linear polarization (LP) fractions, measure Faraday rotation measures (RMs), and examine the magnetic field structures in the emission regions of these AGNs. Methods. We conducted full-polarization observations as part of the ALMA Band 7 very long baseline interferometry (VLBI) commissioning during the April 2021 Event Horizon Telescope (EHT) campaign. We analyzed the LP fractions and RMs to assess the nature of Faraday screens and magnetic fields in the submillimeter emission regions. Results. We find LP fractions between 1% and 17% and RMs exceeding 10 5 rad m \u22122, which are 1\u20132 orders of magnitude higher than typically observed at longer wavelengths (\u03bb >3 mm). This suggests denser Faraday screens or stronger magnetic fields. Additionally, we present the first submillimeter polarized images of the M87 jet and the observed AGNs, revealing RM gradients and sign reversals in the M87 jet indicative of a kiloparsec-scale helical magnetic field structure. Conclusions. Our results provide essential constraints for calibrating, analyzing, and interpreting VLBI data from the EHT at 345 GHz, representing a critical step toward submillimeter VLBI imaging.", "venue": "Astronomy and Astrophysics", "label": 43}, {"loc": [3.310493230819702, 1.9704616069793701], "openalex_id": "https://openalex.org/W4410446393", "title": "\u201cWe Share an Unbreakable Bond:\u201d Sociality and Language Ideologies in Human Relationships with Artificial Intelligence", "authors": "A.F. Rocha", "abstract": "Abstract Replika, an artificial intelligence (AI) companion, is part of a growing number of social chatbots. This paper examines the multimodal semiotic signs influencing how users perceive realness in their chatbots. I argue that what users describe as real/alive in relation to the bots refers to an iconization of humanness, following Judith T. Irvine and Susan Gal on the semiotic process of \u201ciconization.\u201d Users reflect and share their experiences of voicing contrasts of Replika in digital spaces that function primarily for sociability. I draw on Mikhail Bakhtin\u2019s concept of \u201cheteroglossia\u201d as a framework for thinking about the multiplicity of voices implicit in the conversational exchanges with the chatbots and among users in reflexive texts. I look at the relationships with the chatbots through frames of language ideologies, historical discourse, and visuality.", "venue": "Signs and Society", "label": 0}, {"loc": [6.204150199890137, 2.474444627761841], "openalex_id": "https://openalex.org/W4415199731", "title": "Mining Hidden Thoughts from Texts: Evaluating Continual Pretraining with Synthetic Data for LLM Reasoning", "authors": "Yuki Ishibashi, Taro Yano, Masafumi Oyamada", "abstract": "Large Language Models (LLMs) have demonstrated significant improvements in reasoning capabilities through supervised fine-tuning and reinforcement learning. However, when training reasoning models, these approaches are primarily applicable to specific domains such as mathematics and programming, which imposes fundamental constraints on the breadth and scalability of training data. In contrast, continual pretraining (CPT) offers the advantage of not requiring task-specific signals. Nevertheless, how to effectively synthesize training data for reasoning and how such data affect a wide range of domains remain largely unexplored. This study provides a detailed evaluation of Reasoning CPT, a form of CPT that uses synthetic data to reconstruct the hidden thought processes underlying texts, based on the premise that texts are the result of the author's thinking process. Specifically, we apply Reasoning CPT to Gemma2-9B using synthetic data with hidden thoughts derived from STEM and Law corpora, and compare it to standard CPT on the MMLU benchmark. Our analysis reveals that Reasoning CPT consistently improves performance across all evaluated domains. Notably, reasoning skills acquired in one domain transfer effectively to others; the performance gap with conventional methods widens as problem difficulty increases, with gains of up to 8 points on the most challenging problems. Furthermore, models trained with hidden thoughts learn to adjust the depth of their reasoning according to problem difficulty.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.260155200958252, 5.429503917694092], "openalex_id": "https://openalex.org/W4415199327", "title": "PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language", "authors": "Ijazul Haq, Yingjie Zhang, Irfan Ali Khan", "abstract": "This paper evaluates the performance of Large Multimodal Models (LMMs) on Optical Character Recognition (OCR) in the low-resource Pashto language. Natural Language Processing (NLP) in Pashto faces several challenges due to the cursive nature of its script and a scarcity of structured datasets. To address this, we developed a synthetic Pashto OCR dataset, PsOCR, consisting of one million images annotated with bounding boxes at word, line, and document levels, suitable for training and evaluating models based on different architectures, including Convolutional Neural Networks (CNNs) and Transformers. PsOCR covers variations across 1,000 unique font families, colors, image sizes, and layouts. A benchmark subset of 10K images was selected to evaluate the performance of several LMMs, including seven open-source models: DeepSeek's Janus, InternVL, MiniCPM, Florence, and Qwen (3B and 7B), and four closed-source models: GPT-4o, Gemini, Claude, and Grok. Experimental results demonstrate that Gemini achieves the best performance among all models, whereas among open-source models, Qwen-7B stands out. This work provides an insightful assessment of the capabilities and limitations of current LMMs for OCR tasks in Pashto and establishes a foundation for further research not only in Pashto OCR but also for other similar scripts such as Arabic, Persian, and Urdu. PsOCR is available at https://github.com/zirak-ai/PashtoOCR.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.292734146118164, 2.6879947185516357], "openalex_id": "https://openalex.org/W4410386990", "title": "Calibrated Semi-Supervised Models for Disaster Response based on Training Dynamics", "authors": "Khushboo Gupta, Nikita Gautam, Tiberiu Sosea, Doina Caragea, Cornelia Caragea", "abstract": "Despite advancements in semi-supervised learning (SSL) techniques that can be used when labeled data is limited, many SSL approaches still face challenges related to miscalibration. Calibration is crucial for ensuring the accuracy, reliability, and robustness of uncertainty estimates. In this work, we analyze the calibration performance of various SSL methods in the disaster response domain. Our results show that traditional self-training (ST) and mixup-based SSL methods often suffer from high Expected Calibration Error (ECE) despite achieving competitive F1 scores. In contrast, a newly introduced approach in the disaster domain, AUM-ST-Mixup, significantly improves calibration, achieving the lowest ECE across all settings. This improvement suggests that incorporating uncertainty-awareselection via Area Under the Margin (AUM) alongside mixup regularization enhances both predictive performance and model confidence alignment. Our findings highlight the importance of calibration-aware SSL methods, paving the way for more trustworthy model predictions in low-resource settings.", "venue": "Proceedings of the ... International ISCRAM Conference", "label": 0}, {"loc": [5.053098678588867, 1.8704456090927124], "openalex_id": "https://openalex.org/W4414938323", "title": "S-DAT: A Multilingual, GenAI-Driven Framework for Automated Divergent Thinking Assessment", "authors": "Jennifer Haase, Paul H. P. Hanel, Sebastian Pokutta", "abstract": "This paper introduces S-DAT (Synthetic-Divergent Association Task), a scalable, multilingual framework for automated assessment of divergent thinking (DT) -a core component of human creativity. Traditional creativity assessments are often labor-intensive, language-specific, and reliant on subjective human ratings, limiting their scalability and cross-cultural applicability. In contrast, S-DAT leverages large language models and advanced multilingual embeddings to compute semantic distance -- a language-agnostic proxy for DT. We evaluate S-DAT across eleven diverse languages, including English, Spanish, German, Russian, Hindi, and Japanese (Kanji, Hiragana, Katakana), demonstrating robust and consistent scoring across linguistic contexts. Unlike prior DAT approaches, the S-DAT shows convergent validity with other DT measures and correct discriminant validity with convergent thinking. This cross-linguistic flexibility allows for more inclusive, global-scale creativity research, addressing key limitations of earlier approaches. S-DAT provides a powerful tool for fairer, more comprehensive evaluation of cognitive flexibility in diverse populations and can be freely assessed online: https://sdat.iol.zib.de/.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.8742921352386475, 1.9569518566131592], "openalex_id": "https://openalex.org/W4410395948", "title": "The Collapse of GPT", "authors": "Neil Savage", "abstract": "Will future artificial intelligence systems perform increasingly poorly due to AI-generated data in their training data?", "venue": "Communications of the ACM", "label": 0}, {"loc": [2.0547194480895996, 5.315127372741699], "openalex_id": "https://openalex.org/W4410337809", "title": "Machine Learning-Based Phishing Websites Classification Using Diverse Datasets: An Empirical Analysis", "authors": "Shahab Haider, Bilal Khan, Wahab Khan, Sana Ullah, Zulfiqar Ali", "abstract": "Recent technological developments make users vulnerable to several cyber-attacks, where phishing attacks compromise users' sensitive information. To identify these attacks, there are different social techniques, which bring user awareness. However, they are incapable of significantly identifying phishing attacks that demands technology-assisted techniques. The existing literature provides several blacklist-whitelist, fuzzy rule, and Machine Learning (ML) based techniques, where ML techniques are the most efficient. However, there is a lack of comprehensive analysis of ML techniques. To fill this gap, this work presents an in-depth empirical comparative analysis of eleven different ML techniques, including WiSARD, CHIRP, and Ridor that have not been previously analyzed, for phishing attacks' identification. To achieve this aim, this study employs diverse UCI and Mendeley datasets and performs a 10 fold cross validation. Results demonstrate superior performance of J48, with enhanced accuracy and minimized error rate, in comparison with the other eminent techniques.", "venue": "IGI Global eBooks", "label": 0}, {"loc": [6.623980522155762, 2.6417696475982666], "openalex_id": "https://openalex.org/W4415245867", "title": "Rewriting Pre-Training Data Boosts LLM Performance in Math and Code", "authors": "Kazuki Fujii, Y. Tajima, Sakae Mizuki, Hiroyuki Shimada, Taihei Shiotani, Koshiro Saito, Masanari Ohi, Masaki Kawamura, Taishi Nakamura, T. Okamoto, S Ishida, Kakeru Hattori, Youmi Ma, Hiroya Takamura, Rio Yokota, Naoaki Okazaki", "abstract": "The performance of large language models (LLMs) in program synthesis and mathematical reasoning is fundamentally limited by the quality of their pre-training corpora. We introduce two openly licensed datasets, released under the Llama 3.3 Community License, that significantly enhance LLM performance by systematically rewriting public data. SwallowCode (approximately 16.1 billion tokens) refines Python snippets from The-Stack-v2 through a novel four-stage pipeline: syntax validation, pylint-based style filtering, and a two-stage LLM rewriting process that enforces style conformity and transforms snippets into self-contained, algorithmically efficient examples. Unlike prior methods that rely on exclusionary filtering or limited transformations, our transform-and-retain approach upgrades low-quality code, maximizing data utility. SwallowMath (approximately 2.3 billion tokens) enhances Finemath-4+ by removing boilerplate, restoring context, and reformatting solutions into concise, step-by-step explanations. Within a fixed 50 billion token training budget, continual pre-training of Llama-3.1-8B with SwallowCode boosts pass@1 by +17.0 on HumanEval and +17.7 on HumanEval+ compared to Stack-Edu, surpassing the baseline model's code generation capabilities. Similarly, substituting SwallowMath yields +12.4 accuracy on GSM8K and +7.6 on MATH. Ablation studies confirm that each pipeline stage contributes incrementally, with rewriting delivering the largest gains. All datasets, prompts, and checkpoints are publicly available, enabling reproducible research and advancing LLM pre-training for specialized domains.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.759044170379639, 3.1733243465423584], "openalex_id": "https://openalex.org/W4415247917", "title": "GPU Implementation of the Wavelet Tree", "authors": "Marco Franzreb, Martin Burtscher, Stephan Rudolph", "abstract": "I present a new GPU implementation of the wavelet tree data structure. It includes binary rank and select support structures that provide at least 10 times higher throughput of binary rank and select queries than the best publicly available CPU implementations at comparable storage overhead. My work also presents a new parallel tree construction algorithm that, when excluding the time to copy the data from the CPU to the GPU, outperforms the current state of the art. The GPU implementation, given enough parallelism, processes access, rank, and select queries at least 2x faster than the wavelet tree implementation contained in the widely used Succinct Data Structure Library (SDSL), including the time necessary to copy the queries from the CPU to the GPU and the results back to the CPU from the GPU.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.912754535675049, 3.801001787185669], "openalex_id": "https://openalex.org/W4415248384", "title": "An Overview of Large Language Models: Architectures, Emergent Abilities, and Applications", "authors": "Feibo Jiang, Cunhua Pan, Li Dong, Kezhi Wang, M\u00e9rouane Debbah, Dusit Niyato, Han Zhu", "abstract": "The 6G wireless communications aim to establish an intelligent world of ubiquitous connectivity, providing an unprecedented communication experience. Large artificial intelligence models (LAMs) are characterized by significantly larger scales (e.g., billions or trillions of parameters) compared to typical artificial intelligence (AI) models. LAMs exhibit outstanding cognitive abilities, including strong generalization capabilities for fine-tuning to downstream tasks, and emergent capabilities to handle tasks unseen during training. Therefore, LAMs efficiently provide AI services for diverse communication applications, making them crucial tools for addressing complex challenges in future wireless communication systems. This study provides a comprehensive review of the foundations, applications, and challenges of LAMs in communication. First, we introduce the current state of AI-based communication systems, emphasizing the motivation behind integrating LAMs into communications and summarizing the key contributions. We then present an overview of the essential concepts of LAMs in communication. This includes an introduction to the main architectures of LAMs, such as transformer, diffusion models, and mamba. We also explore the classification of LAMs, including large language models (LLMs), large vision models (LVMs), large multimodal models (LMMs), and world models, and examine their potential applications in communication. Additionally, we cover the training methods and evaluation techniques for LAMs in communication systems. Lastly, we introduce optimization strategies such as chain of thought (CoT), retrieval augmented generation (RAG), and agentic systems. Following this, we discuss the research advancements of LAMs across various communication scenarios. Finally, we analyze the challenges in the current research and provide insights into potential future research directions.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.503832817077637, -1.4983750581741333], "openalex_id": "https://openalex.org/W4410165802", "title": "Geosocial media's perspective on energy: a text classification approach using natural language processing", "authors": "Jana Verdoodt, Kenzo Milleville, Haosheng Huang, Christophe Vandeviver, Steven Verstockt, Nico Van de Weghe", "abstract": "This study examines public opinion on various energy sources through Twitter data, focusing on fossil fuels, nuclear energy, and renewable energy sources like solar and wind. Utilizing natural language processing techniques, specifically BERTweet and GPT models, the research analyses tweet categorization based on sentiment and stance related to these energy sources. Our findings reveal a positive shift towards nuclear, solar, and wind energy, contrasting with increasing negative sentiment towards fossil fuels. Notably, BERTweet demonstrates superior precision and recall in tweet categorization compared to GPT-3.5 and GPT-4, which show potential bias against fossil fuels, misclassifying many tweets as opposing them. This study highlights the importance of social media analytics in understanding public opinions and shaping energy policy, suggesting that future research should broaden the scope of data, enhance multilingual capabilities, and improve data visualization to more accurately reflect global public opinion. The results underscore the need for balanced AI training to mitigate bias and more accurately capture diverse perspectives on contentious energy topics. The datasets, code utilized, and interactive maps with word clouds are available at https://doi.org/10.5281/zenodo.15020578 and https://doi.org/10.5281/zenodo.15084294.", "venue": "Journal of Location Based Services", "label": 0}, {"loc": [7.140542507171631, 3.2029261589050293], "openalex_id": "https://openalex.org/W4415027762", "title": "LogDB: Multivariate Log-based Failure Diagnosis for Distributed Databases (Extended from MultiLog)", "authors": "Lingzhe Zhang, Tong Jia, Mengxi Jia, Ying Li", "abstract": "Distributed databases, as the core infrastructure software for internet applications, play a critical role in modern cloud services. However, existing distributed databases frequently experience system failures and performance degradation, often leading to significant economic losses. Log data, naturally generated within systems, can effectively reflect internal system states. In practice, operators often manually inspect logs to monitor system behavior and diagnose anomalies, a process that is labor-intensive and costly. Although various log-based failure diagnosis methods have been proposed, they are generally not tailored for database systems and fail to fully exploit the internal characteristics and distributed nature of these systems. To address this gap, we propose LogDB, a log-based failure diagnosis method specifically designed for distributed databases. LogDB extracts and compresses log features at each database node and then aggregates these features at the master node to diagnose cluster-wide anomalies. Experiments conducted on the open-source distributed database system Apache IoTDB demonstrate that LogDB achieves robust failure diagnosis performance across different workloads and a variety of anomaly types.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.842024803161621, 5.42038631439209], "openalex_id": "https://openalex.org/W4416010170", "title": "Unified Multimodal Understanding and Generation Models: Advances, Challenges, and Opportunities", "authors": "Xinjie Zhang, Jintao Guo, Shanshan Zhao, M.W. Fu, Lunhao Duan, Jiakui Hu, Yong Xien Chng, Guohua Wang, Qingguo Chen, Xu Zhao, Weihua Luo, Kaifu Zhang", "abstract": "Recent years have seen remarkable progress in both multimodal understanding models and image generation models. Despite their respective successes, these two domains have evolved independently, leading to distinct architectural paradigms: While autoregressive-based architectures have dominated multimodal understanding, diffusion-based models have become the cornerstone of image generation. Recently, there has been growing interest in developing unified frameworks that integrate these tasks. The emergence of GPT-4o's new capabilities exemplifies this trend, highlighting the potential for unification. However, the architectural differences between the two domains pose significant challenges. To provide a clear overview of current efforts toward unification, we present a comprehensive survey aimed at guiding future research. First, we introduce the foundational concepts and recent advancements in multimodal understanding and text-to-image generation models. Next, we review existing unified models, categorizing them into three main architectural paradigms: diffusion-based, autoregressive-based, and hybrid approaches that fuse autoregressive and diffusion mechanisms. For each category, we analyze the structural designs and innovations introduced by related works. Additionally, we compile datasets and benchmarks tailored for unified models, offering resources for future exploration. Finally, we discuss the key challenges facing this nascent field, including tokenization strategy, cross-modal attention, and data. As this area is still in its early stages, we anticipate rapid advancements and will regularly update this survey. Our goal is to inspire further research and provide a valuable reference for the community. The references associated with this survey are available on GitHub (https://github.com/AIDC-AI/Awesome-Unified-Multimodal-Models).", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.11093282699585, 4.887155532836914], "openalex_id": "https://openalex.org/W4415032487", "title": "Using Knowledge Graphs to harvest datasets for efficient CLIP model training", "authors": "Simon Ging, Sebastian Walter, Jelena Bratuli\u0107, Johannes Dienert, Hannah Bast, Thomas Brox", "abstract": "Training high-quality CLIP models typically requires enormous datasets, which limits the development of domain-specific models -- especially in areas that even the largest CLIP models do not cover well -- and drives up training costs. This poses challenges for scientific research that needs fine-grained control over the training procedure of CLIP models. In this work, we show that by employing smart web search strategies enhanced with knowledge graphs, a robust CLIP model can be trained from scratch with considerably less data. Specifically, we demonstrate that an expert foundation model for living organisms can be built using just 10M images. Moreover, we introduce EntityNet, a dataset comprising 33M images paired with 46M text descriptions, which enables the training of a generic CLIP model in significantly reduced time.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.699698448181152, 0.6724926829338074], "openalex_id": "https://openalex.org/W4414769819", "title": "Synthesize-on-Graph: Knowledgeable Synthetic Data Generation for Continue Pre-training of Large Language Models", "authors": "Shengjie Ma, Xuhui Jiang, Chengjin Xu, Cehao Yang, Zhang Liyu, Jian Guo", "abstract": "Large Language Models (LLMs) have achieved remarkable success but remain data-inefficient, especially when learning from small, specialized corpora with limited and proprietary data. Existing synthetic data generation methods for continue pre-training focus on intra-document content and overlook cross-document knowledge associations, limiting content diversity and depth. We propose Synthetic-on-Graph (SoG), a synthetic data generation framework that incorporates cross-document knowledge associations for efficient corpus expansion. SoG constructs a context graph by extracting entities and concepts from the original corpus, representing cross-document associations, and employing a graph walk strategy for knowledge-associated sampling. This enhances synthetic data diversity and coherence, enabling models to learn complex knowledge structures and handle rare knowledge. To further improve the quality of synthetic data, we integrate two complementary strategies, Chain-of-Thought (CoT) and Contrastive Clarifying (CC), to enhance both reasoning capability and discriminative power. Extensive experiments demonstrate that SoG surpasses state-of-the-art (SOTA) methods on multi-hop and domain-specific question answering, while achieving competitive performance on long-context reading comprehension. These results highlight the superior generalization ability of SoG. Our work advances the paradigm of synthetic data generation and offers practical solutions for efficient knowledge acquisition in LLMs, particularly for downstream tasks and domains with limited training data.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.224625587463379, 2.1204230785369873], "openalex_id": "https://openalex.org/W4410089651", "title": "Centroid analysis: Inferring concept representations from open-ended word responses", "authors": "Aliona Petrenco, Fritz Guenther", "abstract": "The present research proposes and evaluates a novel method - centroid analysis - for measuring representations and concepts at both individual and group levels by mapping open-ended responses onto a pre-existing semantic vector space. Centroid analysis allows to retrace the target concept as the geometric center of the semantic vectors of the responses generated by this concept. At the group level, centroid analysis enables researchers to compare conceptual structures across different populations to investigate how factors such as language, culture, cognitive differences, educational background, or exposure to specific narratives shape shared representations. At the individual level,centroid analysis allows for fine-grained assessments of how personal experiences, expertise, cognitive styles, or even temporary contextual influences affect conceptual representations. We evaluate this method using two distributional semantic models across several calculation methods, reference lexicon sizes, response types, and datasets with tasks ranging from single word substitutions to single and multiple free associations and multiple feature generation. We conclude that at the group level, the best method to retrace the response-generating concept as a vector in a multi-dimensional semantic space from the averaged vectors of participant responses is to collect multiple free associations (70 uniqueand 245 total responses per cue), use fastText for meaning-to-vector mapping for responses and cues, and to consider each response in the centroid calculation as often as it occurred in the data. At the individual level, the best results are achieved by employing fastText and considering at least 8 responses per item per participant in the centroid calculation.", "venue": "https://doi.org/10.31234/osf.io/2xbuh_v1", "label": 0}, {"loc": [2.695533037185669, 2.7568023204803467], "openalex_id": "https://openalex.org/W4410065577", "title": "TRANSFORMANDO O CEN\u00c1RIO JUR\u00cdDICO: UMA ESTRUTURA ORIENTADA POR IA PARA PROCESSAMENTO DE TEXTOS JUDICIAIS", "authors": "Luciano Zanuz, Sandro Jos\u00e9 Rigo", "abstract": "A Intelig\u00eancia Artificial pode revolucionar o campo jur\u00eddico ao abordar as complexidades do gerenciamento de extensos dados textuais inerentes aos processos judiciais. No entanto, a literatura destaca as dificuldades em gerenciar diferentes contextos em rela\u00e7\u00e3o a distintos cen\u00e1rios de aplica\u00e7\u00e3o. Este artigo apresenta uma nova metodologia adaptada para o desenvolvimento de aplica\u00e7\u00f5es no dom\u00ednio jur\u00eddico, alavancando t\u00e9cnicas de processamento de linguagem natural de ponta, incluindo arquiteturas baseadas em transformadores, modelos pr\u00e9-treinados e aprendizado por transfer\u00eancia. Diferentemente do desenvolvimento de software tradicional, essa abordagem abrange as incertezas inerentes \u00e0s solu\u00e7\u00f5es de Intelig\u00eancia Artificial, empregando uma estrutura iterativa que integra forte colabora\u00e7\u00e3o com profissionais do direito, conjuntos de dados espec\u00edficos do dom\u00ednio e estrat\u00e9gias abrangentes de avalia\u00e7\u00e3o. A metodologia foi validada por meio de aplica\u00e7\u00f5es reais no Tribunal de Justi\u00e7a do Rio Grande do Sul, incluindo o desenvolvimento de um Gerador de Relat\u00f3rios de Julgamento, que automatiza a cria\u00e7\u00e3o de relat\u00f3rios de julgamento usando Intelig\u00eancia Artificial Generativa, e experimentos adicionais demonstraram desempenho de ponta em Reconhecimento de Entidades Nomeadas jur\u00eddicas usando modelos BERT ajustados e gera\u00e7\u00e3o de texto adaptada ao contexto com modelos baseados em GPT-2, demonstrando adaptabilidade a diversos cen\u00e1rios jur\u00eddicos. Este trabalho conecta t\u00e9cnicas avan\u00e7adas de processamento de linguagem natural com demandas pr\u00e1ticas do judici\u00e1rio, estabelecendo uma base para aplica\u00e7\u00f5es de IA escal\u00e1veis e confi\u00e1veis. A metodologia proposta aborda desafios pr\u00e1ticos, alinhamento regulat\u00f3rio e especificidade do conjunto de dados, permitindo a integra\u00e7\u00e3o eficaz da IA no setor jur\u00eddico para maior efici\u00eancia e impacto em sistemas judiciais do mundo real.", "venue": "RECIMA21 - Revista Cient\u00edfica Multidisciplinar - ISSN 2675-6218", "label": 0}, {"loc": [5.067922592163086, -1.613254427909851], "openalex_id": "https://openalex.org/W4410068923", "title": "Fine-Tuning Deep Learning Models for Sentiment Analysis: A Study on Movie Titles", "authors": "Dimitrios \u039a. Nasiopoulos, Konstantinos I. Roumeliotis, \u0394\u03b1\u03bc\u03b9\u03b1\u03bd\u03cc\u03c2 \u03a0. \u03a3\u03b1\u03ba\u03ac\u03c2, Kanellos Toudas, Panagiotis Reklitis", "abstract": "Financial sentiment analysis is crucial for making informed decisions in the financial markets, as it helps predict trends, guide investments, and assess economic conditions. Traditional methods for financial sentiment classification, such as Support Vector Machines (SVM), Random Forests, and Logistic Regression, served as our baseline models. While somewhat effective, these conventional approaches often struggled to capture the complexity and nuance of financial language. Recent advancements in deep learning, particularly transformer-based models like GPT and BERT, have significantly enhanced sentiment analysis by capturing intricate linguistic patterns. In this study, we explore the application of deep learning for financial sentiment analysis, focusing on fine-tuning GPT-4o, GPT-4o-mini, BERT, and FinBERT, alongside comparisons with traditional models. To ensure optimal configurations, we performed hyperparameter tuning using Bayesian optimization across 100 trials. Using a combined dataset of FiQA and Financial PhraseBank, we first apply zero-shot classification and then fine tune each model to improve performance. The results demonstrate substantial improvements in sentiment prediction accuracy post-fine-tuning, with GPT-4o-mini showing strong efficiency and performance. Our findings highlight the potential of deep learning models, particularly GPT models, in advancing financial sentiment classification, offering valuable insights for investors and financial analysts seeking to understand market sentiment and make data-driven decisions.", "venue": "International Journal of Financial Studies", "label": 0}, {"loc": [3.986067533493042, -2.176795482635498], "openalex_id": "https://openalex.org/W4410038929", "title": "The Impact of Online Censorship on LLMs", "authors": "Malik Almaliki, Abdulqader M. Almars, Khulood O. Aljuhani, El-Sayed Atlam", "abstract": "Cyberhate presents a multifaceted, context-sensitive challenge that existing detection methods often struggle to tackle effectively. Large language models (LLMs) exhibit considerable potential for improving cyberhate detection due to their advanced contextual understanding. However, detection alone is insufficient; it is crucial for software to also promote healthier user behaviors and empower individuals to actively confront the spread of cyberhate. This study investigates whether integrating large language models (LLMs) with persuasive technology (PT) can effectively detect cyberhate and encourage prosocial user behavior in digital spaces. Through an empirical study, we examine users\u2019 perceptions of a self-monitoring persuasive strategy designed to reduce cyberhate. Specifically, the study introduces the Comment Analysis Feature to limit cyberhate spread, utilizing a prompt-based fine-tuning approach combined with LLMs. By framing users\u2019 comments within the relevant context of cyberhate, the feature classifies input as either cyberhate or non-cyberhate and generates context-aware alternative statements when necessary to encourage more positive communication. A case study evaluated its real-world performance, examining user comments, detection accuracy, and the impact of alternative statements on user engagement and perception. The findings indicate that while most of the users (83%) found the suggestions clear and helpful, some resisted them, either because they felt the changes were irrelevant or misaligned with their intended expression (15%) or because they perceived them as a form of censorship (36%). However, a substantial number of users (40%) believed the interventions enhanced their language and overall commenting tone, with 68% suggesting they could have a positive long-term impact on reducing cyberhate. These insights highlight the potential of combining LLMs and PT to promote healthier online discourse while underscoring the need to address user concerns regarding relevance, intent, and freedom of expression.", "venue": "Computers", "label": 0}, {"loc": [8.86580753326416, 0.9456217885017395], "openalex_id": "https://openalex.org/W4410037498", "title": "A Temporal Knowledge Graph Generation Dataset Supervised Distantly by Large Language Models", "authors": "Jun Zhu, Yan Fu, Junlin Zhou, Duanbing Chen", "abstract": "Knowledge graphs can be constructed by extracting triples from documents, which denotes document-level relation extraction. Each triple illustrates a fact composed of two entities and a relation. However, temporal information corresponding to these facts is ignored. Incorporating temporal information exhibits the temporal connections between facts. Constructing a temporal knowledge graph (TKG) from documents is relatively unexplored. To address this limitation, we built a new dataset for this task based on a document-level relation extraction dataset. We mine the combination relation patterns and construct temporal quadruples by combining facts and timestamps. Additionally, two large language models (LLMs) are adopted to generate quadruples for the rest of the triples without timestamps. Multiple filters and manual annotation are used to ensure the quality of the data. To evaluate the dataset, we propose an LLM-based framework for extracting relations with temporal information from documents. The framework transforms relation extraction to a seq-to-seq task and fine-tunes LLMs to predict the relation with timestamps between entities. Experiments show the performance of LLMs on the proposed dataset.", "venue": "Scientific Data", "label": 20}, {"loc": [4.462831020355225, 2.103811740875244], "openalex_id": "https://openalex.org/W4410084019", "title": "An Analysis of Bias Towards Women in Large Language Models Using Likert Scale Evaluations", "authors": "Sarah Fieck", "abstract": "Closed-source large language models (LLMs) developed by large technology companies continue to grow in popularity. However, ethical conversations surrounding the safety of model outputs have been a prominent topic of discussion. This project aims to assess three leading closed-source LLMs: OpenAI\u2019s ChatGPT, Google\u2019s Gemini, and Anthropic\u2019s Claude, to analyze how their outputs perform when treated as a subject of several psychological evaluation scales measuring biased behaviors against women. The Ambivalent Sexism Index, Modern Sexism Scale, and Belief in Sexism Shift evaluations were used to get descriptions of how the LLMs respond to traditional and modern prompts involving sexism and gender bias. The three evaluations used Likert scale response scores, providing quantitative scoring data. Results from evaluation trials were obtained using each LLMs API, collecting Likert scores in response to the evaluation prompts. Free-response data was also collected to understand output reasoning. Ordinal regression modeling with mixed effects aim to further understand how certain variables affect scoring. To understand patterns in reasoning, thematic analysis of the free response data was completed. Three significant themes were found across all model responses: recognizing women\u2019s challenges, understanding variation in gender experience, and feminism and progressive initiatives. These themes emphasize the ways in which LLMs respond to biased statements against women.", "venue": "https://doi.org/10.36837/chapman.000655", "label": 0}, {"loc": [9.33700942993164, 1.261725902557373], "openalex_id": "https://openalex.org/W4413973312", "title": "OpenForge: Probabilistic Metadata Integration", "authors": "Tianji Cong, Fatemeh Nargesian, Junjie Xing, H. V. Jagadish", "abstract": "Modern data stores increasingly rely on metadata to enable diverse activities such as data cataloging and search. However, metadata curation remains a labor-intensive task, and the broader challenge of metadata maintenance\u2014ensuring its consistency and usefulness\u2014has been largely overlooked. In this work, we tackle the problem of resolving relationships among metadata concepts from disparate sources. Inferring these relationships are critical for creating clean and consistent metadata repositories, and a central challenge for metadata integration. We propose OpenForge, a two-stage prior-posterior framework for metadata integration. In the first stage, OpenForge exploits multiple methods including fine-tuned large language models to obtain prior beliefs about concept relationships. In the second stage, OpenForge refines these predictions using the Markov Random Field, a probabilistic graphical model. We formalize metadata integration as an optimization problem, where the objective is to identify the relationship assignments that maximize the joint probability of assignments. The MRF formulation allows OpenForge to capture prior beliefs while encoding critical relationship properties, such as transitivity, in probabilistic inference. Experiments on four datasets show the effectiveness and efficiency of OpenForge. In a use case of matching two metadata vocabularies, OpenForge outperforms GPT-4, the second-best method, by 25 F1 points.", "venue": "Proceedings of the VLDB Endowment", "label": 21}, {"loc": [7.445291519165039, 0.27290740609169006], "openalex_id": "https://openalex.org/W4410508318", "title": "Comparative Analysis of Embedding Models for Hindi-English Code-Mixed University related queries", "authors": "Om Ingale, Sampada Margaj", "abstract": "This study presents a comparative analysis of open source embedding models for developing a understanding Hindi-English code-mixed language on university related questions. With the increasing adoption of conversational agents in Indian higher education institutions, there is a need for systems that can effectively process queries containing mixed Hindi and English language elements. This research evaluates the performance of five state-of-the-art embedding models - MuRIL, IndicBERT, XLM-RoBERTa, mBERT, on a custom dataset of university-related Hindi-English code-mixed queries. These models were assessed across key metrics including intent classification accuracy, entity recognition performance, and computational efficiency. The results indicate that MuRIL consistently outperforms other models, achieving 87.3% intent classification accuracy and 84.2% entity recognition F1-score, representing a 12.8% improvement over the other models. Analysis across varying code-mixing levels reveals that MuRIL maintains robust performance even with high mixing indices, while other models show significant degradation. This research provides practical insights for educational institutions seeking to implement linguistically inclusive chatbot systems and contributes to the growing body of knowledge on multilingual NLP applications in educational contexts.", "venue": "The Voice of Creative Research", "label": 0}, {"loc": [3.6372246742248535, -0.04087645187973976], "openalex_id": "https://openalex.org/W4409911915", "title": "A Comparative Survey on Large Language Models for Biological Data", "authors": "Ramin Mousa, Ali Sarabadani, Tania Taami, Amir Ali Bengari, Omid Eslamifar, Mohammad Alijanpour Shalmani, Ehsan Karimi Shahmarvandi", "abstract": "The development of large language models (LLMs) has grown exponentially since the release of ChatGPT. Large language models have gained attention for their robust performance across various tasks. The ability of LLMs to understand and produce general-purpose language is achieved by training billions of parameters. These models have emerged as a transformative force in increasing natural language understanding, representing an important step toward general artificial intelligence(AI). LLMs have become powerful tools for various tasks, including natural language processing (NLP), machine translation(MT), vision applications, and question-answering(QA). The expanded reach of LLMs goes beyond the conventional linguistic bounds and includes specialized languages created in different scientific disciplines. The intensification of interest in this new subclass of scientifically oriented LLMs has led to the birth of the scientific LLMs. These scientific LLMs are gradually gaining a foothold as an exciting research area for science study. Theoretically, they share a structure in common with general LLMs. In practice, however, they differ regarding input and usage. This paper undertakes an exhaustive effort to study all the scientific LLMs, the types of structures offered, the datasets, the parameters, and the context of use. Our analysis uses a focused lens that focuses on the biological and chemical domains, which enables an in-depth examination of LLMs for textual knowledge, small molecules, macromolecules, proteins, genomic sequences, and combinations. By providing an overview of the technical advances in the field, this survey is a valuable resource for researchers navigating the complex landscape of scientific LLMs.", "venue": "Preprints.org", "label": 3}, {"loc": [2.9030086994171143, 0.06654803454875946], "openalex_id": "https://openalex.org/W4409890149", "title": "Quantum leap in medical", "authors": "Santosh Chokkakula, Siomui Chong, Bing Xiang Yang, Hong Jiang, Juan Yu, Runze Han, Idress Hamad Attitalla, Chengliang Yin, Shuyao Zhang", "abstract": "ChatGPT, an advanced AI language model, presents a transformative opportunity in several fields including the medical education. This article examines the integration of ChatGPT into healthcare learning environments, exploring its potential to revolutionize knowledge acquisition, personalize education, support curriculum development, and enhance clinical reasoning. The AI\u2019s ability to swiftly access and synthesize medical information across various specialties offers significant value to students and professionals alike. It provides rapid answers to queries on medical theories, treatment guidelines, and diagnostic methods, potentially accelerating the learning curve. The paper emphasizes the necessity of verifying ChatGPT\u2019s outputs against authoritative medical sources. A key advantage highlighted is the AI\u2019s capacity to tailor learning experiences by assessing individual needs, accommodating diverse learning styles, and offering personalized feedback. The article also considers ChatGPT\u2019s role in shaping curricula and assessment techniques, suggesting that educators may need to adapt their methods to incorporate AI-driven learning tools. Additionally, it explores how ChatGPT could bolster clinical problem-solving through AI-powered simulations, fostering critical thinking and diagnostic acumen among students. While recognizing ChatGPT\u2019s transformative potential in medical education, the article stresses the importance of thoughtful implementation, continuous validation, and the establishment of protocols to ensure its responsible and effective application in healthcare education settings.", "venue": "Frontiers in Medicine", "label": 0}, {"loc": [5.1255011558532715, 0.934474766254425], "openalex_id": "https://openalex.org/W4415308082", "title": "Multilingual Performance Biases of Large Language Models in Education", "authors": "Vansh Gupta, Sankalan Pal Chowdhury, Vil\u00e9m Zouhar, Donya Rooein, Mrinmaya Sachan", "abstract": "Large language models (LLMs) are increasingly being adopted in educational settings. These applications expand beyond English, though current LLMs remain primarily English-centric. In this work, we ascertain if their use in education settings in non-English languages is warranted. We evaluated the performance of popular LLMs on four educational tasks: identifying student misconceptions, providing targeted feedback, interactive tutoring, and grading translations in eight languages (Mandarin, Hindi, Arabic, German, Farsi, Telugu, Ukrainian, Czech) in addition to English. We find that the performance on these tasks somewhat corresponds to the amount of language represented in training data, with lower-resource languages having poorer task performance. Although the models perform reasonably well in most languages, the frequent performance drop from English is significant. Thus, we recommend that practitioners first verify that the LLM works well in the target language for their educational task before deployment.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.605112552642822, -1.0992166996002197], "openalex_id": "https://openalex.org/W4415307005", "title": "Low-Resource Neural Machine Translation Using Recurrent Neural Networks and Transfer Learning: A Case Study on English-to-Igbo", "authors": "Ocheme Anthony Ekle, Biswarup Das", "abstract": "In this study, we develop Neural Machine Translation (NMT) and Transformer-based transfer learning models for English-to-Igbo translation - a low-resource African language spoken by over 40 million people across Nigeria and West Africa. Our models are trained on a curated and benchmarked dataset compiled from Bible corpora, local news, Wikipedia articles, and Common Crawl, all verified by native language experts. We leverage Recurrent Neural Network (RNN) architectures, including Long Short-Term Memory (LSTM) and Gated Recurrent Units (GRU), enhanced with attention mechanisms to improve translation accuracy. To further enhance performance, we apply transfer learning using MarianNMT pre-trained models within the SimpleTransformers framework. Our RNN-based system achieves competitive results, closely matching existing English-Igbo benchmarks. With transfer learning, we observe a performance gain of +4.83 BLEU points, reaching an estimated translation accuracy of 70%. These findings highlight the effectiveness of combining RNNs with transfer learning to address the performance gap in low-resource language translation tasks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.20291519165039, 3.759305715560913], "openalex_id": "https://openalex.org/W4415307809", "title": "Cross-region Model Training with Communication-Computation Overlapping and Delay Compensation", "authors": "Ying Zhu, Xu Yang, Hongli Xu, Yunming Liao, Zhiwei Yao, Liusheng Huang", "abstract": "Training large language models (LLMs) requires massive computational resources, often necessitating the aggregation of geographically distributed data centers (\\ie, cross-region training). However, the high communication latency in wide-area networks severely degrades the efficiency of traditional distributed training. While methods like DiLoCo reduce communication frequency, they suffer from blocking synchronization. Streaming DiLoCo alleviates this issue via communication-computation overlapping but introduces update staleness and model inconsistency due to delayed global updates and partial synchronization. These factors impair convergence, especially when aggressive overlap is needed to mask high latency. We propose CoCoDC, a novel distributed training framework with communication-computation overlapping and delay compensation, to explicitly tackle these challenges. Within the CoCoDC framework, we specifically develop a novel Delay Compensation strategy based on Taylor expansion to effectively mitigate the staleness and an Adaptive Transmission strategy that dynamically schedules model fragment synchronization to optimize bandwidth usage and accelerate convergence. Extensive experiments highlight the superior performance of CoCoDC over both DiLoCo and Streaming DiLoCo regarding final accuracy and training speed. Specifically, CoCoDC reduces the training steps needed to reach a comparable perplexity by up to 21.0% compared to Streaming DiLoCo. Our work provides an effective solution for scalable and efficient cross-region LLM training.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.559759140014648, 1.7330964803695679], "openalex_id": "https://openalex.org/W4415064730", "title": "Representation Learning for Tabular Data: A Comprehensive Survey", "authors": "J. W. Jiang, Siyang Liu, Hao-Run Cai, Qihou Zhou, Han-Jia Ye", "abstract": "Tabular data, structured as rows and columns, is among the most prevalent data types in machine learning classification and regression applications. Models for learning from tabular data have continuously evolved, with Deep Neural Networks (DNNs) recently demonstrating promising results through their capability of representation learning. In this survey, we systematically introduce the field of tabular representation learning, covering the background, challenges, and benchmarks, along with the pros and cons of using DNNs. We organize existing methods into three main categories according to their generalization capabilities: specialized, transferable, and general models. Specialized models focus on tasks where training and evaluation occur within the same data distribution. We introduce a hierarchical taxonomy for specialized models based on the key aspects of tabular data -- features, samples, and objectives -- and delve into detailed strategies for obtaining high-quality feature- and sample-level representations. Transferable models are pre-trained on one or more datasets and subsequently fine-tuned on downstream tasks, leveraging knowledge acquired from homogeneous or heterogeneous sources, or even cross-modalities such as vision and language. General models, also known as tabular foundation models, extend this concept further, allowing direct application to downstream tasks without fine-tuning. We group these general models based on the strategies used to adapt across heterogeneous datasets. Additionally, we explore ensemble methods, which integrate the strengths of multiple tabular models. Finally, we discuss representative extensions of tabular learning, including open-environment tabular machine learning, multimodal learning with tabular data, and tabular understanding. More information can be found in the following repository: https://github.com/LAMDA-Tabular/Tabular-Survey.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.7534995079040527, 1.447251319885254], "openalex_id": "https://openalex.org/W4409762438", "title": "Testimony by LLMs", "authors": "Jinhua He, Chen Yang", "abstract": "Abstract Artificial testimony generated by large language models (LLMs) can be a source of knowledge. However, the requirement that artificial testifiers must satisfy for successful knowledge acquisition is different from the requirement that human testifiers must satisfy. Correspondingly, the epistemic ground of artificial testimonial knowledge is not the well-known and accepted ones suggested by renowned epistemological theories of (human) testimony. Based on Thomas Reid\u2019s old teaching, we suggest a novel epistemological theory of artificial testimony that for receivers to justifiably believe artificially generated statements, testifiers of the statement should robustly perform the propensities of veracity and cautiousness. The theory transforms the weakness of Reid\u2019s view to an advantage of its own. It sets an achievable standard for LLMs and clarifies the improvement that current LLMs should make for meeting the standard. Moreover, it indicates a pluralistic nature of testimonial justification pertaining to the pluralistic nature of possible testifiers for knowledge transmission.", "venue": "AI & Society", "label": 16}, {"loc": [3.765355348587036, -3.928719997406006], "openalex_id": "https://openalex.org/W4410192471", "title": "Efficient Hate Speech Detection: Evaluating 38 Models from Traditional Methods to Transformers", "authors": "Mahmoud Abusaqer, Jamil Saquer, Hazim Shatnawi", "abstract": "The proliferation of hate speech on social media necessitates automated detection systems that balance accuracy with computational efficiency. This study evaluates 38 model configurations in detecting hate speech across datasets ranging from 6.5K to 451K samples. We analyze transformer architectures (e.g., BERT, RoBERTa, Distil-BERT), deep neural networks (e.g., CNN, LSTM, GRU, Hierarchical Attention Networks), and traditional machine learning methods (e.g., SVM, CatBoost, Random Forest). Our results show that transformers, particularly RoBERTa, consistently achieve superior performance with accuracy and F1-scores exceeding 90%. Among deep learning approaches, Hierarchical Attention Networks yield the best results, while traditional methods like CatBoost and SVM remain competitive, achieving F1-scores above 88% with significantly lower computational costs. Additionally, our analysis highlights the importance of dataset characteristics, with balanced, moderately sized unprocessed datasets outperforming larger, preprocessed datasets. These findings offer valuable insights for developing efficient and effective hate speech detection systems.", "venue": "https://doi.org/10.1145/3696673.3723061", "label": 0}, {"loc": [7.4380974769592285, 2.465688467025757], "openalex_id": "https://openalex.org/W4415320100", "title": "Instruction-Tuning Data Synthesis from Scratch via Web Reconstruction", "authors": "Yuxin Jiang, Yufei Wang, Chuhan Wu, X. C. Dai, Yan Xu, Weinan Gan, Yasheng Wang, Xin Jiang, Lifeng Shang, Ruiming Tang, Wei Wang", "abstract": "The improvement of LLMs' instruction-following capabilities depends critically on the availability of high-quality instruction-response pairs. While existing automatic data synthetic methods alleviate the burden of manual curation, they often rely heavily on either the quality of seed data or strong assumptions about the structure and content of web documents. To tackle these challenges, we propose Web Reconstruction (WebR), a fully automated framework for synthesizing high-quality instruction-tuning (IT) data directly from raw web documents with minimal assumptions. Leveraging the inherent diversity of raw web content, we conceptualize web reconstruction as an instruction-tuning data synthesis task via a novel dual-perspective paradigm--Web as Instruction and Web as Response--where each web document is designated as either an instruction or a response to trigger the reconstruction process. Comprehensive experiments show that datasets generated by WebR outperform state-of-the-art baselines by up to 16.65% across four instruction-following benchmarks. Notably, WebR demonstrates superior compatibility, data efficiency, and scalability, enabling enhanced domain adaptation with minimal effort. The data and code are publicly available at https://github.com/YJiangcm/WebR.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.83394193649292, 0.654736340045929], "openalex_id": "https://openalex.org/W4414631649", "title": "Compass-V2 Technical Report", "authors": "Sophia Maria", "abstract": "Predominant LLMs focus on high-resource languages while leaving low-resource languages, particularly those in Southeast Asia (SEA), underrepresented. In addition, those models are general-purpose and pay limited attention to the e-commerce domain. To overcome these limitations, we introduce Compass-v2, a lightweight Mixture-of-Experts (MoE) model specifically designed for Southeast Asian languages and e-commerce applications. To balance model performance and inference cost, the model is designed with 30B total parameters and 5B active parameters, incorporating both fine-grained and shared expert modules. To enhance multilingual performance, we curated and constructed a high-quality, industry-leading SEA dataset, to the best of our knowledge. To boost performance in the e-commerce domain, we built a dataset comprising hundreds of billions of tokens, sourced through external data mining and internal platform collection. Besides, we pioneered a hybrid reasoning model that supports both fast thinking and deep thinking within a unified framework to enhance the reasoning capabilities, diverging from the conventional industry practice of deploying two separate models. Through extensive experimental evaluations, our model demonstrates state-of-the-art SEA multilingual and e-commerce performance among sub-30B models, while maintaining significantly lower inference cost.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.268447399139404, 3.1661899089813232], "openalex_id": "https://openalex.org/W4414688692", "title": "Natural Fingerprints of Large Language Models", "authors": "Teppei Suzuki, Ryokan Ri, Sho Takase", "abstract": "Recent studies have shown that the outputs from large language models (LLMs) can often reveal the identity of their source model. While this is a natural consequence of LLMs modeling the distribution of their training data, such identifiable traces may also reflect unintended characteristics with potential implications for fairness and misuse. In this work, we go one step further and show that even when LLMs are trained on exactly the same dataset, their outputs remain distinguishable, suggesting that training dynamics alone can leave recognizable patterns. We refer to these unintended, distinctive characteristics as natural fingerprints. By systematically controlling training conditions, we show that the natural fingerprints can emerge from subtle differences in the training process, such as parameter sizes, optimization settings, and even random seeds. These results suggest that training dynamics can systematically shape model behavior, independent of data or architecture, and should be explicitly considered in future research on transparency, reliability, and interpretability.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.61997127532959, -1.0926892757415771], "openalex_id": "https://openalex.org/W4409606174", "title": "Evaluating and improving lexical language understanding in neural machine translation", "authors": "Yan Qin", "abstract": "The persistent challenges of polysemy and ambiguity continue to hinder the semantic accuracy of Neural Machine Translation (NMT), particularly in language pairs with distinct syntactic structures. While transformer-based models such as BERT and GPT have achieved notable progress in capturing contextual word meanings, they still fall short in understanding explicit semantic roles. This study aims to address this limitation by integrating Semantic Role Labeling (SRL) into a Transformer-based NMT framework to enhance semantic comprehension and reduce translation errors. Using a parallel corpus of 100,000 English-Indonesian and English-Japanese sentence pairs, the proposed SRL-enhanced NMT model was trained and evaluated against a baseline Transformer NMT. The integration of SRL enabled the model to annotate semantic roles, such as agent, patient, and instrument, which were fused with encoder representations through semantic-aware attention mechanisms. Experimental results demonstrate that the SRL-integrated model significantly outperformed the standard NMT model, improving BLEU scores by 6.2 points (from 32.5 to 38.7), METEOR scores by 6.3 points (from 58.5 to 64.8), and reducing the TER by 5.8 points (from 45.1 to 39.3). These results were statistically validated using a paired t-test (p < 0.05). Furthermore, qualitative analyses confirmed SRL's effectiveness in resolving lexical ambiguities and syntactic uncertainties. Although SRL integration increased inference time by 12%, the performance trade-off was deemed acceptable for applications requiring higher semantic fidelity. The novelty of this research lies in the architectural fusion of SRL with transformer-based attention layers in NMT, a domain seldom explored in prior studies. Moreover, the model demonstrates robust performance across linguistically divergent language pairs, suggesting its broader applicability. This work contributes to the advancement of semantically aware translation systems and paves the way for future research in unsupervised SRL integration and multilingual scalability.", "venue": "Journal of Technology Informatics and Engineering", "label": 0}, {"loc": [9.164605140686035, -0.8023828864097595], "openalex_id": "https://openalex.org/W4409591253", "title": "A Systematic Survey of Text Summarization: From Statistical Methods to Large Language Models", "authors": "Haopeng Zhang, Philip S. Yu, Jiawei Zhang", "abstract": "Text summarization research has undergone several significant transformations with the advent of deep neural networks, pre-trained language models (PLMs), and recent large language models (LLMs). This survey thus provides a comprehensive review of the research progress and evolution in text summarization through the lens of these paradigm shifts. It is organized into two main parts: (1) a detailed overview of datasets, evaluation metrics, and summarization methods before the LLM era, encompassing traditional statistical methods, deep learning approaches, and PLM fine-tuning techniques, and (2) the first detailed examination of recent advancements in benchmarking, modeling, and evaluating summarization in the LLM era. By synthesizing existing literature and presenting a cohesive overview, this survey also discusses research trends, open challenges, and proposes promising research directions in summarization, aiming to guide researchers through the evolving landscape of summarization research.", "venue": "ACM Computing Surveys", "label": 7}, {"loc": [3.2076425552368164, 3.0186076164245605], "openalex_id": "https://openalex.org/W4409568306", "title": "Security and Privacy Challenges of AIGC in Metaverse: A Comprehensive Survey", "authors": "Shoulong Zhang, Haomin Li, Kaiwen Sun, Hejia Chen, Yan Wang, Shuai Li", "abstract": "The Metaverse is a hybrid environment that integrates both physical and virtual realms. The Metaverse has been accessible due to many facilitating technologies. One of the essential technologies that contribute to the Metaverse is AIGC. It is crucial in creating artificial assets and presenting natural interactions efficiently and effectively. Nevertheless, AIGC models encounter external and internal obstacles in security, privacy, and ethics during every level of their development. To conduct a thorough analysis and investigation of risks and threats, we propose a new taxonomy system that categorizes the issues based on three primary factors: the stage of threat exposure, the specific area of the concerns, and the origin of the threats. Furthermore, we present specific unresolved questions that prompt additional investigation into the risks posed by AIGC and the steps taken to counteract them in Metaverse art creation and interactive methodologies. This thorough evaluation offers a broad perspective on the security measures AIGC uses in the Metaverse.", "venue": "ACM Computing Surveys", "label": 7}, {"loc": [7.899954319000244, 2.3874127864837646], "openalex_id": "https://openalex.org/W4414757699", "title": "An Evaluation of N-Gram Selection Strategies for Regular Expression Indexing in Contemporary Text Analysis Tasks", "authors": "Ling Zhang, Shaleen Deep, Jignesh M. Patel, Karthikeyan Sankaralingam", "abstract": "Efficient evaluation of regular expressions (regex, for short) is crucial for text analysis, and n-gram indexes are fundamental to achieving fast regex evaluation performance. However, these indexes face scalability challenges because of the exponential number of possible n-grams that must be indexed. Many existing selection strategies, developed decades ago, have not been rigorously evaluated on contemporary large-scale workloads and lack comprehensive performance comparisons. Therefore, a unified and comprehensive evaluation framework is necessary to compare these methods under the same experimental settings. This paper presents the first systematic evaluation of three representative n-gram selection strategies across five workloads, including real-time production logs and genomic sequence analysis. We examine their trade-offs in terms of index construction time, storage overhead, false positive rates, and end-to-end query performance. Through empirical results, this study provides a modern perspective on existing n-gram based regular expression evaluation methods, extensive observations, valuable discoveries, and an adaptable testing framework to guide future research in this domain. We make our implementations of these methods and our test framework available as open-source at https://github.com/mush-zhang/RegexIndexComparison.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.6015799045562744, 1.453352451324463], "openalex_id": "https://openalex.org/W4409531279", "title": "Strategies for Utilizing Generative AI in Educational Environments", "authors": "Wyatt Porter Jones, Sarah Logan", "abstract": "The development of widely accessible generative AI tools, such as ChatGPT, has significantly impacted education and the workforce. Users have the capability to synthesize information from across the whole internet in seconds, which gives rise to new methods of learning, teaching, and optimizing workflows. The ways in which educators choose to handle this development greatly affects the quality and equity of instruction and learning available to their students, and their preparedness for life after education. This chapter will share research regarding generative AI and educational strategies. This research will be used as the basis for a list of strategies to help implement generative AI tools into the classroom effectively to prepare students for their futures.", "venue": "IGI Global eBooks", "label": 0}, {"loc": [4.418348789215088, 2.539306402206421], "openalex_id": "https://openalex.org/W4415161599", "title": "Bias Beyond English: Evaluating Social Bias and Debiasing Methods in a Low-Resource Setting", "authors": "Ej Zhou, Weiming L\u00fc", "abstract": "Social bias in language models can potentially exacerbate social inequalities. Despite it having garnered wide attention, most research focuses on English data. In a low-resource scenario, the models often perform worse due to insufficient training data. This study aims to leverage high-resource language corpora to evaluate bias and experiment with debiasing methods in low-resource languages. We evaluated the performance of recent multilingual models in five languages: English, Chinese, Russian, Indonesian and Thai, and analyzed four bias dimensions: gender, religion, nationality, and race-color. By constructing multilingual bias evaluation datasets, this study allows fair comparisons between models across languages. We have further investigated three debiasing methods-CDA, Dropout, SenDeb-and demonstrated that debiasing methods from high-resource languages can be effectively transferred to low-resource ones, providing actionable insights for fairness research in multilingual NLP.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.015698432922363, 1.0105185508728027], "openalex_id": "https://openalex.org/W4410187366", "title": "GENERATIVE AI-POWERED FRAMEWORK", "authors": "Purshottam J. Assudani, P. Balakrishnan, A. Anny Leema, Rajesh K Nasare", "abstract": "This paper introduces a hybrid deep learning system for complex audio interpretation and post time communication utilizing associated hidden Convolutional Neural Networks (CNNs) with transformer based Large Language Models (LLMs) over spectrogram. The system inputs raw audio input in the form of audio signals, and maps them into spectrograms, extracts high level features using CNNs, and asks for fusion of LLM-produced embeddings with it, for adding semantic understanding, and contextual discussions. The multimodal attention technique helps in crossing the audio-linguistic gap and therefore, it is possible that they can have meaningful and context-aware response. The release offers the apps for intelligent assistant, education, intelligent monitoring, and other. Github repository, experimental evaluation presents increase in performance over the state-of-the-art in both experiments, with accuracy at 93.8%, latency at 420 ms and high semantic coherence (BLEU score of 0.74 is obtained). This result proves that the proposed system is usable to offer both user-friendly and intelligent audio exploration.", "venue": "Metallurgical and Materials Engineering", "label": 0}, {"loc": [8.208930015563965, 3.7402548789978027], "openalex_id": "https://openalex.org/W4414830116", "title": "OVERLORD: Ultimate Scaling of DataLoader for Multi-Source Large Foundation Model Training", "authors": "Juntao Zhao, Qi Lu, Wei Jia, Borui Wan, Lei Zuo, Junda Feng, Jianyu Jiang, Yangrui Chen, Shuaishuai Cao, Jialing He, Kaiming Jiang, Yuanzhe Hu, Shibiao Nong, Yanghua Peng, Haibin Lin, Xin Liu, Chuan Wu", "abstract": "Modern frameworks for training large foundation models (LFMs) employ dataloaders in a data-parallel manner, with each loader processing a disjoint subset of training data. Under multisource preprocessing, two fundamental challenges exist. First, due to the quadratic computational complexity of the attention operator, the non-uniform sample distribution over data-parallel ranks leads to significant workload imbalance among dataloaders, degrading the training efficiency. Second, supporting diverse data sources requires per-dataset file access states that are redundantly replicated across parallel loaders, consuming excessive memory. This also hinders dynamic data mixing (e.g., curriculum learning) and causes redundant access/memory overhead in hybrid parallelism. We present Omniload, an industrial-grade distributed data loading architecture for LFMs, with four innovations: (1) Disaggregated data preprocessing via role-specific actors (Source Loaders/Data Constructors) to eliminate source and parallelism redundant data access and ensure multisource scalability. (2) Centralized and declarative data plane for elastic multisource orchestration, such as long-short context, multimodality, and curriculum learning. (3) Multi-level auto-partitioning and scaling mechanism for source loaders under heterogeneous preprocessing costs. (4) Shadow loaders with differential checkpointing for fault recovery without workflow interruption. Deployed on production clusters scaling to multi-thousand GPUs, Omniload achieves: (1) 4.5x end-to-end training throughput improvement, (2) 13.5x reduction in CPU memory usage.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.548842430114746, 2.1353695392608643], "openalex_id": "https://openalex.org/W4415153734", "title": "Position: Beyond Euclidean--Foundation Models Should Embrace Non-Euclidean Geometries", "authors": "Neil He, Jianguo Liu, Bing Zhang, Ngoc Bui, Ali Maatouk, Meng\u2010Lin Yang, Irwin King, Melanie Weber, Rex Ying", "abstract": "In the era of foundation models and Large Language Models (LLMs), Euclidean space has been the de facto geometric setting for machine learning architectures. However, recent literature has demonstrated that this choice comes with fundamental limitations. At a large scale, real-world data often exhibit inherently non-Euclidean structures, such as multi-way relationships, hierarchies, symmetries, and non-isotropic scaling, in a variety of domains, such as languages, vision, and the natural sciences. It is challenging to effectively capture these structures within the constraints of Euclidean spaces. This position paper argues that moving beyond Euclidean geometry is not merely an optional enhancement but a necessity to maintain the scaling law for the next-generation of foundation models. By adopting these geometries, foundation models could more efficiently leverage the aforementioned structures. Task-aware adaptability that dynamically reconfigures embeddings to match the geometry of downstream applications could further enhance efficiency and expressivity. Our position is supported by a series of theoretical and empirical investigations of prevalent foundation models.Finally, we outline a roadmap for integrating non-Euclidean geometries into foundation models, including strategies for building geometric foundation models via fine-tuning, training from scratch, and hybrid approaches.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.598541498184204, 1.5902854204177856], "openalex_id": "https://openalex.org/W4415152185", "title": "Delving into: the quantification of Ai-generated content on the internet (synthetic data)", "authors": "Dirk Spennemann", "abstract": "While it is increasingly evident that the internet is becoming saturated with content created by generated Ai large language models, accurately measuring the scale of this phenomenon has proven challenging. By analyzing the frequency of specific keywords commonly used by ChatGPT, this paper demonstrates that such linguistic markers can effectively be used to esti-mate the presence of generative AI content online. The findings suggest that at least 30% of text on active web pages originates from AI-generated sources, with the actual proportion likely ap-proaching 40%. Given the implications of autophagous loops, this is a sobering realization.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.1789538860321045, -0.7365229725837708], "openalex_id": "https://openalex.org/W4409431050", "title": "Diet Engine: A real-time food nutrition assistant system for personalized dietary guidance", "authors": "Asim Moin Saad, Md. Sifat Rahi, Md. Manirul Islam, Gulam Rabbani", "abstract": "In an era where intelligent technologies are rapidly shaping our lives, a Real-Time Nutrition Assistant System emerges as an essential tool for maintaining a healthy lifestyle and promoting awareness. A Real-Time Nutrition Assistant System advances nutrition and healthcare technologies to improve public health by offering quick insight into the nutritional content of our meals. This study introduces Diet Engine, an innovative smartphone application powered by machine learning that enhances health outcomes by providing immediate food classification and personalized dietary suggestions. The system features modules using deep learning (DL) and Convolutional Neural Networks (CNNs) to detect food, as well as textual analysis and natural language processing (NLP) to estimate components such as nutritional content. It offers customized food suggestions according to the user's dietary preferences and constraints. Diet Engine accurately identifies and evaluates the nutritional value of food from images. The system employs a client-server architecture, using advanced deep learning techniques like YOLOv8 (You Only Look Once version 8) and Convolutional Neural Networks (CNNs) optimized for real-time object detection with 295 layers, for training and processing image requests. Our system outperforms existing algorithms, achieving an 86 % classification accuracy on food datasets. Moreover, a personalized chatbot provides diet advice, meal recommendations, and fitness suggestions. By seamlessly integrating advanced deep learning algorithms with user-centric features, this study underscores the transformative potential of Diet Engine in fostering healthier eating habits, raising nutritional awareness, and contributing to a global shift toward more informed and sustainable lifestyle choices.", "venue": "Food Chemistry Advances", "label": 0}, {"loc": [7.7575249671936035, 2.599694013595581], "openalex_id": "https://openalex.org/W4414829925", "title": "SWAN-GPT: An Efficient and Scalable Approach for Long-Context Language Modeling", "authors": "Krishna C. Puvvada, Faisal Ladhak, Santiago Akle Serrano, Cheng-Ping Hsieh, Shantanu Acharya, Somshubra Majumdar, Fei Jia, Samuel Kriman, Simeng Sun, Dima Rekesh, Boris Ginsburg", "abstract": "We present a decoder-only Transformer architecture that robustly generalizes to sequence lengths substantially longer than those seen during training. Our model, SWAN-GPT, interleaves layers without positional encodings (NoPE) and sliding-window attention layers equipped with rotary positional encodings (SWA-RoPE). Experiments demonstrate strong performance on sequence lengths significantly longer than the training length without the need for additional long-context training. This robust length extrapolation is achieved through our novel architecture, enhanced by a straightforward dynamic scaling of attention scores during inference. In addition, SWAN-GPT is more computationally efficient than standard GPT architectures, resulting in cheaper training and higher throughput. Further, we demonstrate that existing pre-trained decoder-only models can be efficiently converted to the SWAN architecture with minimal continued training, enabling longer contexts. Overall, our work presents an effective approach for scaling language models to longer contexts in a robust and efficient manner.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.195226192474365, 2.5701444149017334], "openalex_id": "https://openalex.org/W4409405197", "title": "Learning about color from language", "authors": "Qiawen Liu, Jeroen van Paridon, Gary Lupyan", "abstract": "Certain colors are strongly associated with certain adjectives (e.g. red is hot, blue is cold). Some of these associations are grounded in visual experiences such as seeing glowing red embers. Surprisingly, despite having no visual experience, many congenitally blind people show very similar color associations which are likely learned through language. We show that these associations are indeed embedded in the statistical structure of language. We apply a projection method to word embeddings trained on corpora of spoken and written language to identify color-adjective associations as they are represented in English. These projections were predictive of color-adjective associations reported by blind and sighted English speakers. The most predictive projections were generated by embeddings derived from a corpus of fiction, which outperformed even the state-of-the-art large language model, GPT-4. By augmenting the training corpora in various ways we discover the types of sentences most responsible for conveying the color-adjective associations to the models. We find that word embedding models learn these associations from indirect (second-order) co-occurrences, and that when prompted, people are able to identify some of the words that are most informative for associating colors with specific adjectives. Learning through linguistic co-occurrences is one way word meanings can be continually aligned across language users despite large variations in perceptual experience.", "venue": "Communications Psychology", "label": 0}, {"loc": [7.5872416496276855, 2.3831770420074463], "openalex_id": "https://openalex.org/W4409362536", "title": "Structured Packing in LLM Training Improves Long Context Utilization", "authors": "Konrad Staniszewski, Szymon Tworkowski, Sebastian Jaszczur, Yu Zhao, Henryk Michalewski, \u0141ukasz Kuci\u0144ski, Piotr Mi\u0142o\u015b", "abstract": "Recent advancements in long-context language modeling have attracted significant attention, yet their practical applications often suffer from suboptimal context utilization. To efficiently address this issue, we introduce the Structured Packing for Long Context, SPLiCe, a method that uses retrieval to collate mutually relevant documents into long training samples. We demonstrate that SPLiCe improves performance on long-context tasks, particularly by achieving perfect accuracy on the synthetic Needle in the Haystack benchmark, and effectively mitigating the \u2018lost-in-the-middle\u2019 phenomenon often observed in large language models. Notably, these long-context capabilities also extend to realistic downstream tasks, such as Qasper, across multiple model sizes\u20143B, 7B, and 13B\u2014and are achieved with only brief fine-tuning on 2-6 billion tokens. We supplement these results with a detailed analysis of SPLiCe, examining the impact of hyperparameter choices, the different mixtures and proportions of SPLiCe-generated training data, and the choice of the retriever. We also study the transfer of long-context utilization skills between the modalities. An intriguing finding from our analysis is that training on a corpus of code can enhance performance on natural language tasks.", "venue": "Proceedings of the AAAI Conference on Artificial Intelligence", "label": 5}, {"loc": [3.6361424922943115, 3.8367698192596436], "openalex_id": "https://openalex.org/W4409360671", "title": "Advanced technique for firmware security analysis through heterogeneous data fusion and knowledge mapping", "authors": "Peng Xiao, Linjiang Xie, Feilu Hang, Hanruo Li", "abstract": "As the core component of a device, firmware\u2019s security directly affects the stability of the entire system and the security of user data. In order to provide a more comprehensive and accurate data foundation and improve the accuracy of firmware security analysis, this article conducts research on advanced technologies for firmware security analysis through heterogeneous data fusion and knowledge mapping. Firstly, preprocess the firmware security analysis knowledge graph data using cleaning, segmentation, classification, and other processing methods. Secondly, calculate the firmware security status value under heterogeneous information based on the processed data; Again, based on the calculation results of firmware security status values, extract knowledge graph features for firmware security analysis and annotate relationship description entries; Finally, based on knowledge mapping technology, ontology integration and ontology mapping knowledge fusion were carried out to achieve more effective organization and representation of firmware security analysis knowledge, and the research on the construction technology of firmware security analysis knowledge graph was completed. The experimental results show that after applying the proposed method for firmware security analysis, the loss function index value and F1 index value are both very high, and the prediction accuracy of knowledge graph evolution is very close to 100%, with good operational effect.", "venue": "PLoS ONE", "label": 11}, {"loc": [6.3725385665893555, 5.399191379547119], "openalex_id": "https://openalex.org/W4409363896", "title": "CLIP-CID: Efficient CLIP Distillation via Cluster-Instance Discrimination", "authors": "Kaicheng Yang, T. Gu, Xiang An, Haiqiang Jiang, Xiangzi Dai, Ziyong Feng, Weidong Cai, Jiankang Deng", "abstract": "Contrastive Language-Image Pre-training (CLIP) has achieved excellent performance over a wide range of tasks. However, the effectiveness of CLIP heavily relies on a substantial corpus of pre-training data, resulting in notable consumption of computational resources. Although knowledge distillation has been widely applied in single modality models, how to efficiently expand knowledge distillation to vision-language foundation models with extensive data remains relatively unexplored. In this paper, we introduce CLIP-CID, a novel distillation mechanism that effectively transfers knowledge from a large vision-language foundation model to a smaller model. We initially propose a simple but efficient image semantic balance method to reduce transfer learning bias and improve distillation efficiency. This method filters out 43.7% of image-text pairs from the LAION400M while maintaining superior performance. After that, we leverage cluster-instance discrimination to facilitate knowledge transfer from the teacher model to the student model, thereby empowering the student model to acquire a holistic semantic comprehension of the pre-training data. Experimental results demonstrate that CLIP-CID achieves state-of-the-art performance on various downstream tasks including linear probe and zero-shot classification.", "venue": "Proceedings of the AAAI Conference on Artificial Intelligence", "label": 5}, {"loc": [4.111186504364014, -2.3154048919677734], "openalex_id": "https://openalex.org/W4409346729", "title": "ViFactCheck: A New Benchmark Dataset and Methods for Multi-domain News Fact-Checking in Vietnamese", "authors": "Tr\u1ea7n Th\u00e1i Ho\u00e0, Tran Quang Duy, Khanh Quoc Tran, Kiet Van Nguyen", "abstract": "The rapid spread of information in the digital age highlights the critical need for effective fact-checking tools, particularly for languages with limited resources, such as Vietnamese. In response to this challenge, we introduce ViFactCheck, the first publicly available benchmark dataset designed specifically for Vietnamese fact-checking across multiple online news domains. This dataset contains 7,232 human-annotated pairs of claim-evidence combinations sourced from reputable Vietnamese online news, covering 12 diverse topics. It has been subjected to a meticulous annotation process to ensure high quality and reliability, achieving a Fleiss Kappa inter-annotator agreement score of 0.83. Our evaluation leverages state-of-the-art pre-trained and large language models, employing fine-tuning and prompting techniques to assess performance. Notably, the Gemma model demonstrated superior effectiveness, with an impressive macro F1 score of 89.90%, thereby establishing a new standard for fact-checking benchmarks. This result highlights the robust capabilities of Gemma in accurately identifying and verifying facts in Vietnamese. To further promote advances in fact-checking technology and improve the reliability of digital media, we have made the ViFactCheck dataset, model checkpoints, fact-checking pipelines, and source code freely available on GitHub. This initiative aims to inspire further research and enhance the accuracy of information in low-resource languages.", "venue": "Proceedings of the AAAI Conference on Artificial Intelligence", "label": 5}, {"loc": [8.4822416305542, 3.6660964488983154], "openalex_id": "https://openalex.org/W4409363306", "title": "SMMF: Square-Matricized Momentum Factorization for Memory-Efficient Optimization", "authors": "K. C. Park, Seulki Lee", "abstract": "We propose SMMF (Square-Matricized Momentum Factorization), a memory-efficient optimizer that reduces the memory requirement of the widely used adaptive learning rate optimizers, such as Adam, by up to 96%. SMMF enables flexible and efficient factorization of an arbitrary rank (shape) of the first and second momentum tensors during optimization, based on the proposed square-matricization and one-time single matrix factorization. From this, it becomes effectively applicable to any rank (shape) of momentum tensors, i.e., bias, matrix, and any rank-d tensors, prevalent in various deep model architectures, such as CNNs (high rank) and Transformers (low rank), in contrast to existing memory-efficient optimizers that applies only to a particular (rank-2) momentum tensor, e.g., linear layers. We conduct a regret bound analysis of SMMF, which shows that it converges similarly to non-memory-efficient adaptive learning rate optimizers, such as AdamNC, providing a theoretical basis for its competitive optimization capability. In our experiment, SMMF takes up to 96% less memory compared to state-of-the-art memoryefficient optimizers, e.g., Adafactor, CAME, and SM3, while achieving comparable model performance on various CNN and Transformer tasks.", "venue": "Proceedings of the AAAI Conference on Artificial Intelligence", "label": 5}, {"loc": [4.038266181945801, -2.3224799633026123], "openalex_id": "https://openalex.org/W4409360936", "title": "Tracking and Identifying International Propaganda and Influence Networks Online", "authors": "Hans W. A. Hanley", "abstract": "Misinformation and propaganda undermine trust in institutions, spread falsehoods, and sometimes incite violence. However, recent advancements in transformer-based AI models can help combat the proliferation of disinformation globally and in real time. In this work, I propose and develop a system using these models to scalably identify, track, and analyze the spread of narratives from over 40,000 international news websites. First, by employing novel multilingual Matryoshka embeddings and hierarchical level-wise clustering, my proposed system identifies news stories, topics, and themes across these thousands of news websites. Second, by utilizing multilingual stance detection, my system assesses the biases and factual inconsistencies in news articles, enabling the identification of websites that spread propaganda or misinformation. Finally, through network inference methods, my system uncovers connections among websites disseminating slanted or false content. My approach illustrates how AI can be utilized to mitigate the global spread of harmful misinformation and propaganda.", "venue": "Proceedings of the AAAI Conference on Artificial Intelligence", "label": 5}, {"loc": [8.680631637573242, 2.050685405731201], "openalex_id": "https://openalex.org/W4409363160", "title": "Quality over Quantity: Boosting Data Efficiency Through Ensembled Multimodal Data Curation", "authors": "Jiaxin Xu, Yuhao Song, Daming Wang, Weiwei Zhao, Minghua Chen, Kun Chen, Qinya Li", "abstract": "In an era overwhelmed by vast amounts of data, the effective curation of web-crawl datasets is essential for optimizing model performance. This paper tackles the challenges associated with the unstructured and heterogeneous nature of such datasets. Traditional heuristic curation methods often inadequately capture complex features, resulting in biases and the exclusion of relevant data. We introduce an advanced, learning-driven approach, Ensemble Curation Of DAta ThroUgh Multimodal Operators, called EcoDatum, which employs a novel quality-guided deduplication method to balance feature distribution. EcoDatum strategically integrates various unimodal and multimodal data curation operators within a weak supervision ensemble framework, utilizing automated optimization to effectively score each data point. EcoDatum, which significantly improves the data curation quality and efficiency, outperforms existing state-of-the-art (SOTA) techniques, ranking 1st on the DataComp leaderboard with an average performance score of 0.182 across 38 diverse evaluation datasets. This represents a 28% improvement over the DataComp baseline method, demonstrating its effectiveness in improving dataset curation and model training efficiency.", "venue": "Proceedings of the AAAI Conference on Artificial Intelligence", "label": 5}, {"loc": [8.102470397949219, 3.4196393489837646], "openalex_id": "https://openalex.org/W4409362760", "title": "Pruning Large Language Models with Semi-Structural Adaptive Sparse Training", "authors": "Weiyu Huang, Yuezhou Hu, Guohao Jian, Jun Zhu, Chen Jian-fei", "abstract": "The remarkable success of Large Language Models (LLMs) relies heavily on their substantial scale, which poses significant challenges during model deployment in terms of latency and memory consumption. Recently, numerous studies have attempted to compress LLMs using one-shot pruning methods. However, these methods often suffer from considerable performance degradation on complex language understanding tasks, raising concerns about the feasibility of pruning in LLMs. To address this issue, we propose Adaptive Sparse Trainer (AST), a novel and efficient retraining framework tailored for semi-structured sparse models. AST enables models to learn optimal masks during the weight update process without incurring additional computational overhead. Furthermore, we demonstrate that incorporating knowledge distillation significantly improves retraining efficiency and enhances model performance under fixed computational constraints. Additionally, a supplementary set of well-initialized parameters is integrated to further augment the model's efficacy. AST achieves state-of-the-art performance with minimal training cost. When applied to the LLaMA2-7B model, AST reduces the perplexity and zero-shot accuracy gap between dense and 2:4 semi-structured sparse models to 0.6 and 1.16%, respectively, utilizing less than 0.4% of the pretraining tokens and GPU hours. Our work demonstrates the feasibility of deploying semi-structured sparse LLMs and offers a promising alternative for achieving highly compressed models when combined with existing quantization techniques.", "venue": "Proceedings of the AAAI Conference on Artificial Intelligence", "label": 5}, {"loc": [3.1760377883911133, 1.812695026397705], "openalex_id": "https://openalex.org/W4410878166", "title": "EVALUATING THE IMPACT OF ARTIFICIAL INTELLIGENCE ON BUSINESS THROUGH SENTIMENT ANALYSIS.", "authors": "Omaima Moqaddem", "abstract": "<p>The integration of Artificial Intelligence (AI) into marketing strategies has revolutionized the way businesses engage with consumers, enabling the delivery of hyper-personalized experiences through advanced data analysis, machine learning, and predictive modeling. This paper conducts a critical literature review of recent research (2020\u20132025) to examine the role of AI in enhancing personalization and customer engagement. By analyzing peer-reviewed articles, industry reports, and case studies, the review explores key developments in AI-driven personalization, recommendation systems, real-time engagement, sentiment analysis, and predictive analytics. It also addresses pressing ethical concerns, including data privacy and algorithmic bias, and evaluates the implications of these issues on consumer trust. Moreover, the paper identifies research gaps, particularly in the areas of long-term impact, ethical governance, and sector-specific applications. The findings suggest that while AI significantly improves marketing effectiveness, its adoption must be guided by transparent, ethical, and human-centered frameworks to maximize benefits and minimize risks. The paper concludes by proposing directions for future research and practical guidelines for responsible AI implementation in intelligent marketing strategies.</p><p><strong> </strong></p><p><strong>JEL:</strong><strong> </strong>M31; M15; C55; D83; L86<strong></strong></p><p> </p><p><strong> Article visualizations:</strong></p><p><img src=\"/-counters-/soc/0998/a.php\" alt=\"Hit counter\" /></p>", "venue": "European Journal of Management and Marketing Studies", "label": 0}, {"loc": [7.024518966674805, 0.21040503680706024], "openalex_id": "https://openalex.org/W4415981216", "title": "Rethinking Multilingual Continual Pretraining: Data Mixing for Adapting LLMs Across Languages and Resources", "authors": "Zihao Li, Shaoxiong Ji, Hengyu Luo, J\u00f6rg Tiedemann", "abstract": "Large Language Models (LLMs) exhibit significant disparities in performance across languages, primarily benefiting high-resource languages while marginalizing underrepresented ones. Continual Pretraining (CPT) has emerged as a promising approach to address this imbalance, although the relative effectiveness of monolingual, bilingual, and code-augmented data strategies remains unclear. This study systematically evaluates 36 CPT configurations involving three multilingual base models, across 30+ languages categorized as altruistic, selfish, and stagnant, spanning various resource levels. Our findings reveal three major insights: (1) Bilingual CPT improves multilingual classification but often causes language mixing issues during generation. (2) Including programming code data during CPT consistently enhances multilingual classification accuracy, particularly benefiting low-resource languages, but introduces a trade-off by slightly degrading generation quality. (3) Contrary to prior work, we observe substantial deviations from language classifications according to their impact on cross-lingual transfer: Languages classified as altruistic often negatively affect related languages, selfish languages show conditional and configuration-dependent behavior, and stagnant languages demonstrate surprising adaptability under certain CPT conditions. These nuanced interactions emphasize the complexity of multilingual representation learning, underscoring the importance of systematic studies on generalizable language classification to inform future multilingual CPT strategies.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.90816068649292, -1.2452532052993774], "openalex_id": "https://openalex.org/W4409287886", "title": "Modeling Multimodal Emotion with Dynamic Interaction-Focused Representation Network", "authors": "Alex Brooks, Marco Rivera, Lobry Hsu, zachary carter", "abstract": "Understanding human emotions through multimodal signals has become a pivotal task in affective computing and human-computer interaction. Among the multiple modalities, text and audio jointly deliver rich and complementary emotional cues. However, a key challenge lies in the temporal misalignment between these modalities, making it difficult to fuse them into a coherent emotional representation. In this work, we propose a novel framework named DIFERNet (Dynamic Interaction-Focused Emotion Representation Network), which directly learns robust and discriminative fused features from unaligned text and audio sequences. Unlike prior works that often rely on strict alignment or shallow fusion techniques, our method dynamically adapts to the unique characteristics of each modality while emphasizing their interdependencies. The architecture of DIFERNet comprises three main components: (1) a crossmodal dimensional alignment module that ensures feature compatibility between heterogeneous inputs; (2) an interaction-guided attention mechanism that facilitates deep crossmodal synergy for initializing the fused embeddings; and (3) a dynamic fusion adaptation transformer, which refines the fused representation in a modality-preserving manner. This final module serves as a correction mechanism to retain crucial unimodal semantics while enhancing contextual understanding across modalities. We conduct extensive evaluations on two widely-used sentiment benchmarks, CMU-MOSI and CMU-MOSEI, to validate the proposed approach. Experimental results indicate that DIFERNet consistently outperforms existing baselines, showing marked improvements across all key metrics. Furthermore, qualitative analysis demonstrates its capacity to appropriately regulate sentiment predictions by leveraging nuanced acoustic features. These findings highlight the potential of DIFERNet for multimodal sentiment analysis in real-world, asynchronous environments.", "venue": "Preprints.org", "label": 3}, {"loc": [4.111960411071777, -2.509162664413452], "openalex_id": "https://openalex.org/W4409247745", "title": "Harnessing Large Language Models and Deep Neural Networks for Fake News Detection", "authors": "Eleftheria Papageorgiou, Iraklis Varlamis, Christos Chronis", "abstract": "The spread of fake news threatens trust in both traditional and digital media. Early detection methods, based on linguistic patterns and handcrafted features, struggle to identify more sophisticated misinformation. Large language models (LLMs) offer promising solutions by capturing complex text patterns, but challenges remain in ensuring their accuracy and generalizability. This study evaluates LLM-based feature extraction for fake news detection across multiple datasets. We compare BERT-based text representations, introduce a method for extracting factual segments from news articles, and create two new datasets with fact-based features. Additionally, we explore graph-based text representations using LLMs to capture relationships within news content. By integrating these approaches, we improve fake news detection, making it more accurate and interpretable. Our findings provide insights into how LLMs and graph-based techniques can enhance misinformation detection.", "venue": "Information", "label": 17}, {"loc": [5.299776554107666, -1.4925172328948975], "openalex_id": "https://openalex.org/W4409331770", "title": "An Unsupervised Integrated Framework for Arabic Aspect-Based Sentiment Analysis and Abstractive Text Summarization of Traffic Services Using Transformer \u2026", "authors": "Alanoud Alotaibi, Farrukh Nadeem", "abstract": "Social media is crucial for gathering public feedback on government services, particularly in the traffic sector. While Aspect-Based Sentiment Analysis (ABSA) offers a means to extract actionable insights from user posts, analyzing Arabic content poses unique challenges. Existing Arabic ABSA approaches heavily rely on supervised learning and manual annotation, limiting scalability. To tackle these challenges, we suggest an integrated framework combining unsupervised BERTopic-based Aspect Category Detection with distance supervision using a fine-tuned CAMeLBERT model for sentiment classification. This is further complemented by transformer-based summarization through a fine-tuned AraBART model. Key contributions of this paper include: (1) the first comprehensive Arabic traffic services dataset containing 461,844 tweets, enabling future research in this previously unexplored domain; (2) a novel unsupervised approach for Arabic ABSA that eliminates the need for large-scale manual annotation, using FastText custom embeddings and BERTopic to achieve superior topic clustering; (3) a pioneering integration of aspect detection, sentiment analysis, and abstractive summarization that provides a complete pipeline for analyzing Arabic traffic service feedback; (4) state-of-the-art performance metrics across all tasks, achieving 92% accuracy in ABSA and a ROUGE-L score of 0.79 for summarization, establishing new benchmarks for Arabic NLP in the traffic domain. The framework significantly enhances smart city traffic management by enabling automated processing of citizen feedback, supporting data-driven decision-making, and allowing authorities to monitor public sentiment, identify emerging issues, and allocate resources based on citizen needs, ultimately improving urban mobility and service responsiveness.", "venue": "Smart Cities", "label": 0}, {"loc": [2.9448750019073486, -0.5666709542274475], "openalex_id": "https://openalex.org/W4415976084", "title": "TheBlueScrubs-v1, a comprehensive curated medical dataset derived from the internet", "authors": "Luis Felipe, Carlos Alexandre Borges Garcia, Issam El Naqa, Monique Shotande, Aakash Tripathi, Vivek A. Rudrapatna, Ghulam Rasool, Danielle S. Bitterman, Gilmer Vald\u00e9s", "abstract": "The need for robust and diverse data sets to train clinical large language models (cLLMs) is critical given that currently available public repositories often prove too limited in size or scope for comprehensive medical use. While resources like PubMed provide foundational medical literature, they capture only a narrow range of formal publications and omit the broader medical discourse on the internet. To address these deficits, we introduce TheBlueScrubs-v1, a curated dataset of over 25 billion medical tokens - nearly three times larger than PubMed - drawn from a broad-scale internet corpus. Our two-stage filtering pipeline employs a Logistic Regression model for document screening (achieving an AUC of approximately 0.95 on external validation), followed by verification via a 70B-parameter Llama 3.1 instruct model. Each text is assigned three LLM-based quality scores encompassing medical relevance, precision and factual detail, and safety and ethical standards. Clinician reviews confirm high concordance with these automated evaluations, and a specialized cancer classifier further labels approximately 11 billion oncology tokens. Two demonstration tasks highlight the dataset's practical value: first, we distill the safety evaluations to a smaller BERT-style model that reaches an AUC near 0.96 on unseen data; second, we fine-tune a compact LLM on a filtered subset, showing measurable improvements over standard baselines in medical benchmarks as well as private ones. This Data Descriptor details the dataset's creation and validation, underscoring its potential utility for medical AI research.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.6868720054626465, 0.5199402570724487], "openalex_id": "https://openalex.org/W4415978452", "title": "From ChatGPT to DeepSeek AI: A Comprehensive Analysis of Evolution, Deviation, and Future Implications in AI-Language Models", "authors": "Simrandeep Singh, Shreya Bansal, Abdulmotaleb El Saddik, Mukesh Saini", "abstract": "The rapid advancement of artificial intelligence (AI) has reshaped the field of natural language processing (NLP), with models like OpenAI ChatGPT and DeepSeek AI. Although ChatGPT established a strong foundation for conversational AI, DeepSeek AI introduces significant improvements in architecture, performance, and ethical considerations. This paper presents a detailed analysis of the evolution from ChatGPT to DeepSeek AI, highlighting their technical differences, practical applications, and broader implications for AI development. To assess their capabilities, we conducted a case study using a predefined set of multiple choice questions in various domains, evaluating the strengths and limitations of each model. By examining these aspects, we provide valuable insight into the future trajectory of AI, its potential to transform industries, and key research directions for improving AI-driven language models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.8256254196167, 0.1660401076078415], "openalex_id": "https://openalex.org/W4414957389", "title": "Multilingual Retrieval-Augmented Generation for Knowledge-Intensive Task", "authors": "Leonardo Ranaldi, Barry Haddow, Alexandra Birch", "abstract": "Retrieval-augmented generation (RAG) has become a cornerstone of contemporary NLP, enhancing large language models (LLMs) by allowing them to access richer factual contexts through in-context retrieval. While effective in monolingual settings, especially in English, its use in multilingual tasks remains unexplored. This paper investigates the effectiveness of RAG across multiple languages by proposing novel approaches for multilingual open-domain question-answering. We evaluate the performance of various multilingual RAG strategies, including question-translation (tRAG), which translates questions into English before retrieval, and Multilingual RAG (MultiRAG), where retrieval occurs directly across multiple languages. Our findings reveal that tRAG, while useful, suffers from limited coverage. In contrast, MultiRAG improves efficiency by enabling multilingual retrieval but introduces inconsistencies due to cross-lingual variations in the retrieved content. To address these issues, we propose Crosslingual RAG (CrossRAG), a method that translates retrieved documents into a common language (e.g., English) before generating the response. Our experiments show that CrossRAG significantly enhances performance on knowledge-intensive tasks, benefiting both high-resource and low-resource languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.575140953063965, 0.12810368835926056], "openalex_id": "https://openalex.org/W4409198999", "title": "Large Language Models for Psychological Assessment: A Comprehensive Overview", "authors": "Jocelyn Brickman, Mehak Gupta, Joshua R. Oltmanns", "abstract": "Large language models (LLMs) are extraordinary tools demonstrating potential to improve our understanding of psychological characteristics. They provide an unprecedented opportunity to supplement self-report in psychology research and practice with scalable behavioral assessment. However, they also pose unique risks and challenges. This article serves as an overview and guide for psychological scientists to evaluate LLMs for psychological assessment. In Section I, we briefly review the development of transformer-based LLMs and discuss their advances in natural language processing. In Section II, we describe the experimental design process including techniques for language data collection, audio processing and transcription, text preprocessing, and model selection, as well as analytic matters such as model output, model evaluation, hyperparameter tuning, model visualization, and topic modeling. At each stage, we describe options, important decisions, and resources for further in-depth learning, while providing examples from different areas of psychology. In Section III, we discuss important broader ethical and implementation issues and future directions for researchers using this methodology. The reader will develop an understanding of essential ideas and an ability to navigate the process of using LLMs for psychological assessment.", "venue": "https://doi.org/10.31234/osf.io/qm9ae_v1", "label": 0}, {"loc": [3.915776014328003, 4.161883354187012], "openalex_id": "https://openalex.org/W4410279296", "title": "Retrieval-Augmented Purifier for Robust LLM-Empowered Recommendation", "authors": "Liangbo Ning, Wenqi Fan, Qing Li", "abstract": "Recently, Large Language Model (LLM)-empowered recommender systems have revolutionized personalized recommendation frameworks and attracted extensive attention. Despite the remarkable success, existing LLM-empowered RecSys have been demonstrated to be highly vulnerable to minor perturbations. To mitigate the negative impact of such vulnerabilities, one potential solution is to employ collaborative signals based on item-item co-occurrence to purify the malicious collaborative knowledge from the user's historical interactions inserted by attackers. On the other hand, due to the capabilities to expand insufficient internal knowledge of LLMs, Retrieval-Augmented Generation (RAG) techniques provide unprecedented opportunities to enhance the robustness of LLM-empowered recommender systems by introducing external collaborative knowledge. Therefore, in this paper, we propose a novel framework (RETURN) by retrieving external collaborative signals to purify the poisoned user profiles and enhance the robustness of LLM-empowered RecSys in a plug-and-play manner. Specifically, retrieval-augmented perturbation positioning is proposed to identify potential perturbations within the users' historical sequences by retrieving external knowledge from collaborative item graphs. After that, we further retrieve the collaborative knowledge to cleanse the perturbations by using either deletion or replacement strategies and introduce a robust ensemble recommendation strategy to generate final robust predictions. Extensive experiments on three real-world datasets demonstrate the effectiveness of the proposed RETURN.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.548678398132324, 2.640909194946289], "openalex_id": "https://openalex.org/W4410348938", "title": "MegaMath: Pushing the Limits of Open Math Corpora", "authors": "Zhou Fan, Zengzhi Wang, N. Ranjan, Cheng Zhang, Liping Tang, Guowei He, Zhengzhong Liu, Eric P. Xing", "abstract": "Mathematical reasoning is a cornerstone of human intelligence and a key benchmark for advanced capabilities in large language models (LLMs). However, the research community still lacks an open, large-scale, high-quality corpus tailored to the demands of math-centric LLM pre-training. We present MegaMath, an open dataset curated from diverse, math-focused sources through following practices: (1) Revisiting web data: We re-extracted mathematical documents from Common Crawl with math-oriented HTML optimizations, fasttext-based filtering and deduplication, all for acquiring higher-quality data on the Internet. (2) Recalling Math-related code data: We identified high quality math-related code from large code training corpus, Stack-V2, further enhancing data diversity. (3) Exploring Synthetic data: We synthesized QA-style text, math-related code, and interleaved text-code blocks from web data or code data. By integrating these strategies and validating their effectiveness through extensive ablations, MegaMath delivers 371B tokens with the largest quantity and top quality among existing open math pre-training datasets.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.707545280456543, 1.8478413820266724], "openalex_id": "https://openalex.org/W4409696568", "title": "Language Models at the Syntax-Semantics Interface: A Case Study of the Long-Distance Binding of Chinese Reflexive Ziji", "authors": "Xiulin Yang", "abstract": "This paper explores whether language models can effectively resolve the complex binding patterns of the Mandarin Chinese reflexive ziji, which are constrained by both syntactic and semantic factors. We construct a dataset of 240 synthetic sentences using templates and examples from syntactic literature, along with 320 natural sentences from the BCC corpus. Evaluating 21 language models against this dataset and comparing their performance to judgments from native Mandarin speakers, we find that none of the models consistently replicates human-like judgments. The results indicate that existing language models tend to rely heavily on sequential cues, though not always favoring the closest strings, and often overlooking subtle semantic and syntactic constraints. They tend to be more sensitive to noun-related than verb-related semantics.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.063660144805908, -1.2685058116912842], "openalex_id": "https://openalex.org/W4409116221", "title": "Real-time Monitoring of Economic Shocks using Company Websites", "authors": "Martin Woerter, Michael Koenig, Jakob Rauch", "abstract": "Abstract Understanding the effects of economic shocks on firms is critical for analyzing economic growth and resilience. We introduce a Web-Based Affectedness Indicator (WAI), a general-purpose tool for real-time monitoring of economic disruptions across diverse contexts. By leveraging Large Language Model (LLM) assisted classification and information extraction on texts from over five million company websites, WAI quantifies the degree and nature of firms\u2019 responses to external shocks. Using the COVID-19 pandemic as a specific application, we show that WAI is highly correlated with pandemic containment measures and reliably predicts firm performance. Unlike traditional data sources, WAI provides timely firm-level information across industries and geographies worldwide that would otherwise be unavailable due to institutional and data availability constraints. This methodology offers significant potential for monitoring and mitigating the impact of technological, political, financial, health or environmental crises, and represents a transformative tool for adaptive policy-making and economic resilience.", "venue": "https://doi.org/10.21203/rs.3.rs-6097333/v1", "label": 0}, {"loc": [7.053364276885986, 0.15435846149921417], "openalex_id": "https://openalex.org/W4409157490", "title": "A Survey on Multilingual Large Language Models: Corpora, Alignment, and Bias", "authors": "Yuemei Xu, Ling Hu, Jiayi Zhao, Zihan Qiu, Kexin Xu, Yuqi Ye, Hanwen Gu", "abstract": "Abstract Based on the foundation of Large Language Models (LLMs), Multilingual LLMs (MLLMs) have been developed to address the challenges faced in multilingual natural language processing, hoping to achieve knowledge transfer from high-resource languages to low-resource languages. However, significant limitations and challenges still exist, such as language imbalance, multilingual alignment, and inherent bias. In this paper, we aim to provide a comprehensive analysis of MLLMs, delving deeply into discussions surrounding these critical issues. First of all, we start by presenting an overview of MLLMs, covering their evolutions, key techniques, and multilingual capacities. Secondly, we explore the multilingual training corpora of MLLMs and the multilingual datasets oriented for downstream tasks that are crucial to enhance the cross-lingual capability of MLLMs. Thirdly, we survey the state-of-the-art studies of multilingual representations and investigate whether the current MLLMs can learn a universal language representation. Fourthly, we discuss bias on MLLMs, including its categories, evaluation metrics, and debiasing techniques. Finally, we discuss existing challenges and point out promising research directions of MLLMs.", "venue": "Frontiers of Computer Science", "label": 0}, {"loc": [8.34611701965332, 0.5498234033584595], "openalex_id": "https://openalex.org/W4415273139", "title": "Unifying Retrieval and Generation: A Survey on Retrieval-Augmented Generation in NLP", "authors": "Casey Emelia, Jeanie Genesis, Brandie Nickolas", "abstract": "

Retrieval-Augmented Generation (RAG) has emerged as a transformative approach in the field of natural language processing (NLP), combining the capabilities of information retrieval with generative models to tackle a wide range of language understanding and generation tasks. Traditional generative models, while proficient in generating fluent and coherent responses, often struggle with knowledge grounding, especially when dealing with domain-specific or long-tail queries. Retrieval-Augmented Generation addresses this limitation by leveraging external knowledge sources, such as large document corpora, to improve the factual accuracy and contextual relevance of the generated content. In RAG, the retrieval component fetches pertinent information from these knowledge sources, which is then fused with the input query and used to guide the generative model in producing responses that are both informed and coherent. This survey provides an extensive review of the RAG paradigm, focusing on its key components: the retrieval mechanism, the generative model, and the integration between them. We delve into various retrieval techniques employed in RAG systems, including traditional sparse retrieval methods (e.g., BM25) and modern dense retrieval methods that utilize neural embeddings, such as those based on transformer architectures. The paper compares these methods in terms of their retrieval performance, efficiency, and scalability, highlighting the trade-offs between retrieval accuracy and computational cost. We also discuss the challenges and complexities in the fusion of retrieved knowledge with the generation process, emphasizing the importance of ensuring coherence, relevance, and factual correctness in the generated outputs. In addition to the foundational techniques, we explore various challenges faced by RAG systems. These challenges include the retrieval of relevant and high-quality documents, the need for effective fusion strategies that combine the retrieved knowledge with the input query, and the prevention of hallucinations-where the model generates plausiblesounding but incorrect information. We also address the computational complexities involved in training and fine-tuning RAG models, including the difficulties of joint optimization between the retrieval and generative components. Furthermore, ethical considerations, such as bias in the retrieval sources and the generated outputs, are discussed, underscoring the need for fairness, transparency, and responsible AI practices in the deployment of RAG systems.

", "venue": "HAL (Le Centre pour la Communication Scientifique Directe)", "label": 6}, {"loc": [5.654418468475342, -1.3282408714294434], "openalex_id": "https://openalex.org/W4409173725", "title": "Large Language Models for Arabic Sentiment Analysis and Machine Translation", "authors": "Mohamed Zouidine, Mohammed Khalil", "abstract": "Large Language Models (LLMs) have recently demonstrated outstanding performance in a variety of Natural Language Processing (NLP) tasks. Although many LLMs have been developed, only a few models have been evaluated in the context of the Arabic language, with a significant focus on the ChatGPT model. This study assessed three LLMs on two Arabic NLP tasks: sentiment analysis and machine translation. The capabilities of LLaMA, Mixtral, and Gemma under zero- and few-shot learning were investigated, and their performance was compared against State-Of-The-Art (SOTA) models. The experimental results showed that, among the three models, LLaMA tends to have better comprehension abilities for the Arabic language, outperforming Mixtral and Gemma on both tasks. However, except for the Arabic-to-English translation, where LLaMA outperforms the transformer model by 4 BLEU points, in all cases, the performance of the three LLMs fell behind that of the SOTA model.", "venue": "Engineering Technology & Applied Science Research", "label": 44}, {"loc": [2.9439334869384766, -0.4035619795322418], "openalex_id": "https://openalex.org/W4409289826", "title": "Domain-Adaptive Pretraining of Transformer-Based Language Models on Medical Texts: A High-Performance Computing Experiment", "authors": "Charles Kinyua Gitonga, Lydia Gakii Mugao", "abstract": "This research was to investigate the effect of utilizing high-performance computing (HPC) resources to enhance the adaptability and performance of transformer-based language models. The research was done through intensive domain-specific pretraining in the medical domain. The study aimed to answer the question: Can domain-adaptive pretraining on medical texts significantly improve language model performance metrics such as perplexity while maintaining computational efficiency and addressing ethical considerations? The research utilized a corpus of medical texts. These were carefully split into training and evaluation datasets. Initial model training on NVIDIA A30 GPUs, with 96% GPU utilization, calculated an average perplexity of 73.54. Following iterative refinements\u2014including domain-specific tokenizer optimization, data preprocessing, mixed-precision training, and adjusted learning parameters\u2014the final model achieved an average perplexity of 3.39. The evaluation run processed 7103 samples in 98.02 seconds, with a training loss of 2.405 and an evaluation loss of 2.045, indicating strong generalization and the absence of overfitting. The final model and results were saved for reproducibility and future use. This study was justified by the pressing need for accurate and efficient medical natural language processing (NLP) applications. The application areas are in clinical decision support, patient record summarization, and medical research analysis. The research findings highlight that investing in HPC-driven domain-adaptive pretraining delivers substantial improvements in performance. It also equips medical NLP models with abilities to handle the complexities of domain-specific language effectively. The Ethical considerations of this research were based on optimizing GPU utilization to reduce energy consumption and ensure transparency through reproducible methodologies. We recommend future research to explore larger medical datasets, broader clinical specializations, and diverse transformer architectures while also investigating the transferability of learned representations across related medical subdomains. The advancements could further enhance the applicability of specialized language models in medical research and practice.", "venue": "European Journal of Information Technologies and Computer Science", "label": 0}, {"loc": [9.215676307678223, -0.8485156297683716], "openalex_id": "https://openalex.org/W4411270928", "title": "A Hybrid Architecture with Efficient Fine Tuning for Abstractive Patent Document Summarization", "authors": "Nevidu Jayatilleke, Ruvan Weerasinghe", "abstract": "Automatic patent summarization approaches that help in the patent analysis and comprehension procedure are in high demand due to the colossal growth of innovations. The development of natural language processing (NLP), text mining, and deep learning has notably amplified the efficacy of text summarization models for abundant types of documents. Summarizing patent text remains a pertinent challenge due to the labyrinthine writing style of these documents, which includes technical and legal intricacies. Additionally, these patent document contents are considerably lengthier than archetypal documents, which complicates the process of extracting pertinent information for summarization. Embodying extractive and abstractive text summarization methodologies into a hybrid framework, this study proposes a system for efficiently creating abstractive summaries of patent records. The procedure involves leveraging the LexRank graph-based algorithm to retrieve the important sentences from input parent texts, then utilizing a Bidirectional Auto-Regressive Transformer (BART) model that has been fine-tuned using Low-Ranking Adaptation (LoRA) for producing text summaries. This is accompanied by methodical testing and evaluation strategies. Furthermore, the author employed certain meta-learning techniques to achieve Domain Generalization (DG) of the abstractive component across multiple patent fields.", "venue": "https://doi.org/10.1109/scse65633.2025.11030964", "label": 0}, {"loc": [7.688267230987549, -0.9950845837593079], "openalex_id": "https://openalex.org/W4409178241", "title": "Large Language Models With Contrastive Decoding Algorithm for Hallucination Mitigation in Low\u2010Resource Languages", "authors": "Hongying Zan, Arifa Javed, Muhammad Abdullah, Javed Rashid, Muhammad Faheem", "abstract": "ABSTRACT Neural machine translation (NMT) has advanced with deep learning and large\u2010scale multilingual models, yet translating low\u2010resource languages often lacks sufficient training data and leads to hallucinations. This often results in translated content that diverges significantly from the source text. This research proposes a refined Contrastive Decoding (CD) algorithm that dynamically adjusts weights of log probabilities from strong expert and weak amateur models to mitigate hallucinations in low\u2010resource NMT and improve translation quality. Advanced large language NMT models, including ChatGLM and LLaMA, are fine\u2010tuned and implemented for their superior contextual understanding and cross\u2010lingual capabilities. The refined CD algorithm evaluates multiple candidate translations using BLEU score, semantic similarity, and Named Entity Recognition accuracy. Extensive experimental results show substantial improvements in translation quality and a significant reduction in hallucination rates. Fine\u2010tuned models achieve higher evaluation metrics compared to baseline models and state\u2010of\u2010the\u2010art models. An ablation study confirms the contributions of each methodological component and highlights the effectiveness of the refined CD algorithm and advanced models in mitigating hallucinations. Notably, the refined methodology increased the BLEU score by approximately 30% compared to baseline models.", "venue": "CAAI Transactions on Intelligence Technology", "label": 0}, {"loc": [3.795778751373291, -1.0457974672317505], "openalex_id": "https://openalex.org/W4409147473", "title": "The generative revolution: AI foundation models in geospatial health\u2014applications, challenges and future research", "authors": "Bernd Resch, Polychronis Kolokoussis, David Hanny, Maria Antonia Brovelli, Maged N. Kamel Boulos", "abstract": "In an era of rapid technological advancements, generative artificial intelligence and foundation models are reshaping industries and offering new advanced solutions in a wide range of scientific areas, particularly in public and environmental health. However, foundation models have previously mostly focused on understanding and generating text, while geospatial features, interrelations, flows and correlations have been neglected. Thus, this paper outlines the importance of research into Geospatial Foundation Models, which have the potential to revolutionise digital health surveillance and public health. We examine the latest advances, opportunities, challenges, and ethical considerations of geospatial foundation models for research and applications in digital health. We focus on the specific challenges of integrating geospatial context with foundation models and lay out the future potential for multimodal geospatial foundation models for a variety of research avenues in digital health surveillance and health assessment.", "venue": "International Journal of Health Geographics", "label": 0}, {"loc": [5.843639850616455, -0.8546851277351379], "openalex_id": "https://openalex.org/W4409088340", "title": "BERTWEETRO: PRE-TRAINED LANGUAGE MODELS FOR ROMANIAN SOCIAL MEDIA CONTENT", "authors": "Dan Claudiu Neagu", "abstract": "Abstract The introduction of Transformers, like BERT or RoBERTa, have revolutionized NLP due to their ability to better \u201cunderstand\u201d the meaning of texts. These models are created (pre-trained) in a self-supervised manner on large scale data to predict words in a sentence but can be adjusted (fine-tuned) for other specific NLP applications. Initially, these models were created using literary texts but very quickly the need to process social media content emerged. Social media texts have some problematic characteristics (they are short, informal, filled with typos, etc.) which means that a traditional BERT model will have problems when dealing with this type of input. For this reason, dedicated models need to be pre-trained on microblogging content and many such models have been developed in popular languages like English or Spanish. For under-represented languages, like Romanian, this is more difficult to achieve due to the lack of open-source resources. In this paper we present our efforts in pre-training from scratch 8 BERTweetRO models, based on RoBERTa architecture, with the help of a Romanian tweets corpus. To evaluate our models, we fine-tune them on 2 down-stream tasks, Sentiment Analysis (with 3 classes) and Topic Classification (with 26 classes), and compare them against Multilingual BERT plus a number of other popular classic and deep learning models. We include a commercial solution in this comparison and show that some BERTweetRO variants and almost all models trained on the translated data have a better accuracy than the commercial solution. Our best performing BERTweetRO variants place second after Multilingual BERT in most of our experiments, which is a good result considering that our Romanian corpus used for pre-training is relatively small, containing around 51,000 texts.", "venue": "Studia Universitatis Babe\u015f-Bolyai. Oeconomica", "label": 0}, {"loc": [2.0433945655822754, 5.333052158355713], "openalex_id": "https://openalex.org/W4409713068", "title": "A Concise Survey on Modern Web\u2010Based Phishing Techniques and Advanced Mitigation Strategies", "authors": "Dhanavanthini Panneerselvam, Sibi Chakkaravarthy Sethuraman, Alfred E. Emerson, Tarun Kanakam", "abstract": "ABSTRACT Phishing is a tactical technique practiced by cyber\u2010criminals, wherein the target systems are approached, made vulnerable, and exploited. A Phisher who does the act of phishing is always creative, calculative, and persistent. This potentially leads to the increase in the success rate of phishing and the individuals who are technically expertise even falls in phishing campaigns. This article discusses about the various web\u2010based phishing techniques used by the modern day cyber criminals. Various mitigation techniques related to the state of the art machine learning and deep learning techniques are also studied. The article also extensively discusses about the features utilized for the detection. Additionally, a qualitative and quantitative comparison of different studies for mitigating the web phishing attacks is also examined.", "venue": "Transactions on Emerging Telecommunications Technologies", "label": 0}, {"loc": [6.3838276863098145, 2.576918840408325], "openalex_id": "https://openalex.org/W4409073973", "title": "Synthetic Data Enhances Mathematical Reasoning of Language Models Based on Artificial Intelligence", "authors": "Zeyu Han, Weiwei Jiang", "abstract": "Current large language models (LLMs) training involves extensive training data and computing resources to handle multiple natural language processing (NLP) tasks. This paper endeavors to assist individuals to compose feasible mathematical question-answering (QA) language models in specific fields. We leveraged Gretel.ai, a feasible data generation platform, to generate high-quality mathematical QA data covering several areas, including definitions, theorems, and calculations related to linear algebra and abstract algebra. After fine- tuning through Open-AI infrastructure, GPT-3 performed significant improvements on accuracy, achieving a roughly 18.2% increase in abstract algebra benchmark, approximately 1.6x improvement on linear algebra theorems benchmark, and approximately 24.0% increase on linear algebra calculations benchmark. And small language models (SLMs) such as LLama-2-7B/13B and Mistral-7B have outstanding around 2x accuracy advancements in linear algebra calculations. This study demonstrates the potential for individuals to develop customized SLMs for specialized mathematical domains using synthetic data generation and fine-tuning techniques.", "venue": "Information Technology And Control", "label": 0}, {"loc": [4.404465198516846, -2.4796411991119385], "openalex_id": "https://openalex.org/W4409670204", "title": "A Novel Approach to Automated Detection of AI-Generated Text", "authors": "Hazem M. Abbas", "abstract": "Detecting machine-generated text involves identifying whether text has been created by artificial intelligence models or written by humans. This task has become increasingly significant due to the potential misuse of AI-generated text for producing fake news, reviews, or spam that can mislead people. The aim of this study is to develop a model capable of determining if a tweet's author is human or a robot. To achieve this, we utilized a zero-shot prompt with a pre-trained model and fine-tuned SBERT using various transformer models. Additionally, we employed graph attention network and graph convolutional network models to analyze the author's writing style. The findings indicate that using the graph convolutional network model to extract writing style characteristics yields the highest accuracy, reaching 93.60%. Detecting machine-generated text is vital for preventing the abuse of AI models and ensuring the reliability of content on online platforms by effectively distinguishing between human and AI-generated text.", "venue": "Journal of Al-Qadisiyah for Computer Science and Mathematics", "label": 0}, {"loc": [5.582437992095947, -0.5654850602149963], "openalex_id": "https://openalex.org/W4408935345", "title": "Enhancing Product Categorization with LLMs", "authors": "Konstantinos I. Roumeliotis, Nikolaos D. Tselikas, Dimitrios \u039a. Nasiopoulos", "abstract": "In the rapidly evolving e-commerce landscape, efficient and accurate product classification is essential for enhancing customer experience and streamlining operations. Traditional product classification methods, which depend heavily on labeled data and manual effort, struggle with scalability and adaptability to diverse product categories. This study explores the transformative potential of large language models (LLMs) for zero-shot product classification in e-commerce, addressing the challenge of automating product categorization without prior labeled training data. We evaluate the performance of four state-of-the-art LLMs \u2014 GPT-4o, GPT-4o mini, Claude 3.5 Sonnet, and Claude 3.5 Haiku \u2014 on a diverse dataset of 248 product categories, each containing 20 samples, structured into 8 subsets. Each model performs zero-shot classification, assigning products to predefined categories without prior exposure. Our findings reveal significant variations in classification accuracy across models, with certain LLMs demonstrating superior scalability and adaptability for real-world e-commerce applications. Based on these insights, we developed an API software to integrate the top-performing models into e-commerce systems, enhancing automation and efficiency. This study underscores the transformative role of LLMs in revolutionizing e-commerce workflows and recommends their adoption for scalable, intelligent product classification.", "venue": "Natural Language Processing Journal", "label": 9}, {"loc": [3.892625093460083, 2.183336019515991], "openalex_id": "https://openalex.org/W4410077814", "title": "Multi-Modal Framing Analysis of News", "authors": "Arnav Arora, Srishti Yadav, Maria Antoniak, Serge Belongie, Isabelle Augenstein", "abstract": "Automated frame analysis of political communication is a popular task in computational social science that is used to study how authors select aspects of a topic to frame its reception. So far, such studies have been narrow, in that they use a fixed set of pre-defined frames and focus only on the text, ignoring the visual contexts in which those texts appear. Especially for framing in the news, this leaves out valuable information about editorial choices, which include not just the written article but also accompanying photographs. To overcome such limitations, we present a method for conducting multi-modal, multi-label framing analysis at scale using large (vision-) language models. Grounding our work in framing theory, we extract latent meaning embedded in images used to convey a certain point and contrast that to the text by comparing the respective frames used. We also identify highly partisan framing of topics with issue-specific frame analysis found in prior qualitative work. We demonstrate a method for doing scalable integrative framing analysis of both text and image in news, providing a more complete picture for understanding media bias.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.821142196655273, 2.6015422344207764], "openalex_id": "https://openalex.org/W4415061822", "title": "Data Mixture Optimization: A Multi-fidelity Multi-scale Bayesian Framework", "authors": "Tso-Jung Yen, Andrew Wei Tung Siah, H.F. Chen, Tianyi Peng, Daniel Guetta, Hongseok Namkoong", "abstract": "Careful curation of data sources can significantly improve the performance of LLM pre-training, but predominant approaches rely heavily on intuition or costly trial-and-error, making them difficult to generalize across different data domains and downstream tasks. Although scaling laws can provide a principled and general approach for data curation, standard deterministic extrapolation from small-scale experiments to larger scales requires strong assumptions on the reliability of such extrapolation, whose brittleness has been highlighted in prior works. In this paper, we introduce a $\\textit{probabilistic extrapolation framework}$ for data mixture optimization that avoids rigid assumptions and explicitly models the uncertainty in performance across decision variables. We formulate data curation as a sequential decision-making problem$\\unicode{x2013}$multi-fidelity, multi-scale Bayesian optimization$\\unicode{x2013}$where $\\{$data mixtures, model scale, training steps$\\}$ are adaptively selected to balance training cost and potential information gain. Our framework naturally gives rise to algorithm prototypes that leverage noisy information from inexpensive experiments to systematically inform costly training decisions. To accelerate methodological progress, we build a simulator based on 472 language model pre-training runs with varying data compositions from the SlimPajama dataset. We observe that even simple kernels and acquisition functions can enable principled decisions across training models from 20M to 1B parameters and achieve $\\textbf{2.6x}$ and $\\textbf{3.3x}$ speedups compared to multi-fidelity BO and random search baselines. Taken together, our framework underscores potential efficiency gains achievable by developing principled and transferable data mixture optimization methods.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.094094753265381, 3.1673083305358887], "openalex_id": "https://openalex.org/W4415061261", "title": "Evidencing Unauthorized Training Data from AI Generated Content using Information Isotopes", "authors": "Tao Qi, Yin Jinhua, Dongqi Cai, Xie Yueqi, Huili Wang, Hu Zhiyang, Yang Peiru, Guoshun Nan, Zhou Zhili, Shangguang Wang, Lyu Lingjuan, Yongfeng Huang, Lauren Hersch Nicholas", "abstract": "In light of scaling laws, many AI institutions are intensifying efforts to construct advanced AIs on extensive collections of high-quality human data. However, in a rush to stay competitive, some institutions may inadvertently or even deliberately include unauthorized data (like privacy- or intellectual property-sensitive content) for AI training, which infringes on the rights of data owners. Compounding this issue, these advanced AI services are typically built on opaque cloud platforms, which restricts access to internal information during AI training and inference, leaving only the generated outputs available for forensics. Thus, despite the introduction of legal frameworks by various countries to safeguard data rights, uncovering evidence of data misuse in modern opaque AI applications remains a significant challenge. In this paper, inspired by the ability of isotopes to trace elements within chemical reactions, we introduce the concept of information isotopes and elucidate their properties in tracing training data within opaque AI systems. Furthermore, we propose an information isotope tracing method designed to identify and provide evidence of unauthorized data usage by detecting the presence of target information isotopes in AI generations. We conduct experiments on ten AI models (including GPT-4o, Claude-3.5, and DeepSeek) and four benchmark datasets in critical domains (medical data, copyrighted books, and news). Results show that our method can distinguish training datasets from non-training datasets with 99\\% accuracy and significant evidence (p-value$<0.001$) by examining a data entry equivalent in length to a research paper. The findings show the potential of our work as an inclusive tool for empowering individuals, including those without expertise in AI, to safeguard their data rights in the rapidly evolving era of AI advancements and applications.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.8490049839019775, 2.863798141479492], "openalex_id": "https://openalex.org/W4414634258", "title": "Legal frictions for data openness: Reflections from a case-study on re-use of the open web for AI training", "authors": "Rameela Chandrasekhar", "abstract": "Training data is key to foundation AI development \u2013 particularly Generative AI models and Large Language Models (LLMs). And a significant portion of this training data comes from the open web. But despite being lauded as a digital commons, the open web is not open for all. It is difficult to \u2018see\u2019 data flows when data and content from the open web is reused to create training datasets, and as these training datasets then move through the various stages of AI development. Legal and policy initiatives for data governance in the AI context often understand data flows as stable and traceable, when in reality, data re-use is an \u201cinherently entangled phenomenon\u201d. Over the course of 2024, Ramya Chandrasekhar from CIS (as part of the ODECO project), collaborated with Inno3 and the Open Knowledge Foundation to investigate legal entanglements of re-use, when data and content from the open web is used to train foundation AI models. Based on conversations with AI researchers and practitioners, an online workshop, and legal analysis of a repository of 41 legal disputes relating to copyright and data protection, the research report highlights tensions between legal imaginations of data flows and computational processes involved in training foundation models. The report makes three contributions: It discusses a three-dimensional framework for data openness of training datasets. While techno-legal openness is necessary, this report argues that the political economy of data re-use also necessitates legal strategies that impose certain limits on data extractivism by well-resourced actors like Big Tech on the one hand, and enable community data sovereignty on the other hand. It contains a repository of 41 ongoing legal controversies relating to copyright and data protection related to training foundation AI models, together with a detailed analysis of how these legal controversies either impact or advance three-dimensional data openness of training datasets. It also contains a critical analysis of existing open licenses, permissive licenses, as well as certain alternative licensing frameworks for training datasets. While these licensing frameworks impose more obligations on re-users and necessitate more collective thinking on interoperability, these licensing frameworks together with other legal and institutional changes are nonetheless necessary for the creation of healthy digital and data commons, to realise the original promise of the open web as open for all.", "venue": "HAL (Le Centre pour la Communication Scientifique Directe)", "label": 6}, {"loc": [5.431344509124756, 2.0801801681518555], "openalex_id": "https://openalex.org/W4408903069", "title": "Figurative Archive: an open dataset and web-based application for the study of metaphor", "authors": "Maddalena Bressler, Veronica Mangiaterra, Paolo Canal, Federico Frau, Fabrizio Luciani, Biagio Scalingi, Chiara Barattieri di San Pietro, Chiara Battaglini, Chiara Pompei, F. Romeo, Luca Bischetti, Valentina Bambini", "abstract": "Research on metaphor has steadily increased over the last decades, as this phenomenon opens a window into a range of processes in language and cognition, from pragmatic inference to abstraction and embodied simulation. At the same time, the demand for rigorously constructed and extensively normed experimental materials increased as well. Here, we present the Figurative Archive, an open database of 997 metaphors in Italian enriched with rating and corpus-based measures (from familiarity to lexical frequency), derived by collecting stimuli used across 11 studies. It includes both everyday and literary metaphors, varying in structure and semantic domains. Dataset validation comprised correlations between familiarity and other measures. The Figurative Archive has several aspects of novelty: it is increased in size compared to previous resources; it includes a novel measure of inclusiveness, to comply with current recommendations for non-discriminatory language use; it is displayed in a web-based interface, with features for a flexible and customized consultation. We provide guidelines for using the Archive in future metaphor studies, in the spirit of open science.", "venue": "https://doi.org/10.31234/osf.io/38kju_v2", "label": 0}, {"loc": [4.0067901611328125, 0.8179813623428345], "openalex_id": "https://openalex.org/W4414685003", "title": "Examining the Impact and Limitations of Distributed Large Language Models and Multimodal Systems", "authors": "Klaus Elli", "abstract": "

Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs) have emerged as transformative advancements in artificial intelligence, enabling unprecedented capabilities in natural language processing, multimodal understanding, and generative AI. These models, powered by massive datasets and distributed computing frameworks, have significantly impacted diverse fields, including healthcare, education, robotics, and creative industries. However, despite their success, LLMs and MLLMs face substantial challenges related to computational efficiency, bias, interpretability, hallucinations, and security vulnerabilities. This survey provides a comprehensive overview of recent progress in distributed LLMs and multimodal models, highlighting key architectural innovations, training paradigms, and real-world applications. We explore critical challenges, including the environmental impact of large-scale model training, ethical concerns surrounding bias and misinformation, and the limitations of current reasoning and knowledge integration capabilities. Furthermore, we discuss emerging trends and future research directions aimed at enhancing model efficiency, reliability, and alignment with human values.

", "venue": "HAL (Le Centre pour la Communication Scientifique Directe)", "label": 6}, {"loc": [2.908839464187622, 1.7818806171417236], "openalex_id": "https://openalex.org/W4408827782", "title": "Reframing the performance and ethics of \u201cempathic\u201d AI: Wisdom of the crowd and placebos", "authors": "Mark Thornton", "abstract": "Recently, claims have emerged that artificial intelligence (AI) is better at providing empathy than humans. These claims come paired with suggestions that people should use empathic AI to supplement human empathy. This paper critically examines these positions by drawing analogies to two well-established psychological effects. First, I argue that the apparent superiority of AI-generated empathy reflects an analog of the \u201cwisdom of the crowd\u201d effect. This reframes the alleged superiority of empathic AI in a more mundane and less dehumanizing way. Second, I consider whether people should use AI for empathy. Here I draw an analogy to placebo effects, suggesting that even clear utilitarian benefits may not justify the adoption of empathic AI.", "venue": "https://doi.org/10.31234/osf.io/zf9w5_v2", "label": 0}, {"loc": [3.9792263507843018, 0.30837690830230713], "openalex_id": "https://openalex.org/W4408802928", "title": "Natural Language Processing and Large Language Models", "authors": "Xue Jiang, Weiren Wang, Shaohan Tian, Hao Wang, Turab Lookman, Yanjing Su", "abstract": "Abstract The transformative impact of artificial intelligence (AI) technologies on materials science has revolutionized the study of materials problems. By leveraging well-characterized datasets derived from the scientific literature, AI-powered tools such as Natural Language Processing (NLP) have opened new avenues to accelerate materials research. The advances in NLP techniques and the development of large language models (LLMs) facilitate the efficient extraction and utilization of information. This review explores the application of NLP tools in materials science, focusing on automatic data extraction, materials discovery, and autonomous research. We also discuss the challenges and opportunities associated with utilizing LLMs and outline the prospects and advancements that will propel the field forward.", "venue": "npj Computational Materials", "label": 0}, {"loc": [9.272979736328125, 0.5693554282188416], "openalex_id": "https://openalex.org/W4408728389", "title": "Comparison of Pre-trained Models for Domain-specific Entity Extraction from Student Report Documents", "authors": "\u0410.\u0412. \u041c\u0435\u043b\u044c\u043d\u0438\u043a\u043e\u0432\u0430, Marina S. Vorobeva, Anna Glazkova", "abstract": "The authors propose a methodology for extracting domain-specific entities from student report documents in Russian language using pre-trained transformer-based language models. Extracting domain-specific entities from student report documents is a relevant task since the obtained data can be used for various purposes, ranging from the formation of project teams to the personalization of learning pathways. Additionally, automating the document processing workflow reduces the labor costs associated with manual processing. As training material for training models, expert-annotated student report documents were used. These documents were created by students in information technology programs between 2019 and 2022 for project-based, practical disciplines, and theses. The domain-specific entity extraction task is approached as two subtasks: named entity recognition (NER) and annotated text generation. A comparative analysis was conducted among NER encoder-only models (ruBERT, ruRoBERTa), encoder-decoder models (ruT5, mBART), and decoder-only models (ruGPT, T-lite) for text generation. The effectiveness of the models was evaluated using the F1-score, along with an analysis of common errors. The highest F1-score on the test set was achieved by mBART (93.55%). This model also showed the lowest error rate in domain-specific entity identification during text generation and annotation. The NER models demonstrated a lower tendency for errors but tended to extract domain-specific entities in a fragmented manner. The obtained results indicate the applicability of the examined models for solving the stated tasks, considering the specific requirements of the problem.", "venue": "Modeling and Analysis of Information Systems", "label": 0}, {"loc": [4.839997291564941, -1.2581465244293213], "openalex_id": "https://openalex.org/W4408690527", "title": "Feature Engineering Trends in Text-Based Affective Computing: Rules to Advance Deep Learning Models", "authors": "Geeta Pattun, Pradeep Kumar", "abstract": "Understanding emotions in textual data, particularly within dynamic social media platforms such as YouTube, Facebook, and Twitter, presents significant challenges. This paper aims to provide a comprehensive review of emotion detection techniques in affective computing, highlighting key advancements, challenges, and ethical concerns. The key contributions of this review include an examination of foundational theories of NLP-based emotion recognition, an analysis of the role of affect lexicons in emotional classification, and a review of commonly used datasets for training emotion detection models. Additionally, it explores various feature extraction techniques, including lexicon-based approaches such as SentiWordNet and NRC Emotion Lexicon, statistical and syntactic features like n-grams and POS tags, and semantic embeddings from deep learning models such as Word2Vec, GloVe, BERT, RoBERTa, and GPT. Findings show that while deep learning and transformer models improve contextual understanding, they also introduce challenges such as high computational costs, data imbalance, and domain adaptability issues. Bias in training data poses ethical risks, potentially reinforcing stereotypes and enabling manipulative applications like targeted advertising and misinformation. Key research gaps include the need for improved feature representations, bias mitigation, enhanced model accuracy and fairness. Traditional models struggle with real-world complexities, while transformer-based models face challenges related to scalability, dataset limitations, and interpretability. Addressing these challenges will enhance affective computing accuracy, fairness, and applicability across industries such as healthcare, education, and human-computer interaction.", "venue": "International Research Journal of Multidisciplinary Technovation", "label": 0}, {"loc": [6.461034774780273, 2.215909719467163], "openalex_id": "https://openalex.org/W4414902254", "title": "MASS: Mathematical Data Selection via Skill Graphs for Pretraining Large Language Models", "authors": "Deleted Author ID, Yu Lu, Qing Cui, Zhiqiang Zhang, Jun Zhou, Yanfang Ye, Chuxu Zhang", "abstract": "High-quality data plays a critical role in the pretraining and fine-tuning of large language models (LLMs), even determining their performance ceiling to some degree. Consequently, numerous data selection methods have been proposed to identify subsets of data that can effectively and efficiently enhance model performance. However, most of these methods focus on general data selection and tend to overlook the specific nuances of domain-related data. In this paper, we introduce MASS, a \\textbf{MA}thematical data \\textbf{S}election framework using the \\textbf{S}kill graph for pretraining LLMs in the mathematical reasoning domain. By taking into account the unique characteristics of mathematics and reasoning, we construct a skill graph that captures the mathematical skills and their interrelations from a reference dataset. This skill graph guides us in assigning quality scores to the target dataset, enabling us to select the top-ranked subset which is further used to pretrain LLMs. Experimental results demonstrate the efficiency and effectiveness of MASS across different model sizes (1B and 7B) and pretraining datasets (web data and synthetic data). Specifically, in terms of efficiency, models trained on subsets selected by MASS can achieve similar performance to models trained on the original datasets, with a significant reduction in the number of trained tokens - ranging from 50\\% to 70\\% fewer tokens. In terms of effectiveness, when trained on the same amount of tokens, models trained on the data selected by MASS outperform those trained on the original datasets by 3.3\\% to 5.9\\%. These results underscore the potential of MASS to improve both the efficiency and effectiveness of pretraining LLMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.238119125366211, 3.49078106880188], "openalex_id": "https://openalex.org/W4414902931", "title": "A Peek Behind the Curtain: Using Step-Around Prompt Engineering to Identify Bias and Misinformation in GenAI Models", "authors": "Don Hickerson, Mike Perkins", "abstract": "This research examines the emerging technique of step-around prompt engineering in GenAI research, a method that deliberately bypasses AI safety measures to expose underlying biases and vulnerabilities in GenAI models. We discuss how Internet-sourced training data introduces unintended biases and misinformation into AI systems, which can be revealed through the careful application of step-around techniques. Drawing parallels with red teaming in cybersecurity, we argue that step-around prompting serves a vital role in identifying and addressing potential vulnerabilities while acknowledging its dual nature as both a research tool and a potential security threat. Our findings highlight three key implications: (1) the persistence of Internet-derived biases in AI training data despite content filtering, (2) the effectiveness of step-around techniques in exposing these biases when used responsibly, and (3) the need for robust safeguards against malicious applications of these methods. We conclude by proposing an ethical framework for using step-around prompting in AI research and development, emphasizing the importance of balancing system improvements with security considerations.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.4969398975372314, -0.37826821208000183], "openalex_id": "https://openalex.org/W4408608129", "title": "NeoBERT: A Next-Generation BERT", "authors": "Meilong Shi, Qianyi Yan, Wei Zhao, Chuanqi Teng, Fenglin Han, Haobin Chen, Yizhuo Li, Lingyun Xu, Fei Yang, Gang Jin, Y\u012bm\u00edng B\u00e0o, Chunman Zuo, Jing Li", "abstract": "Abstract Neoantigens are classified into canonical and noncanonical types. Noncanonical neoantigens include those derived from noncoding regions, transposable elements (TE), and intron retention events, and they have recently gained considerable attention in cancer immunity. We curated 35,574 non-redundant neoantigen-HLA pairs from 14 immunopeptidomes studies, by analyzing unique features and differences across various sources of neoantigens. This knowledge enabled us to develop machine learning models for the prediction of different types of neoantigens. Our data and models are available at a public portal (https://ngdc.cncb.ac.cn/neoatlas) to facilitate broad access and future research. This resource offers advanced functionalities, including integration with epigenome browsers which allow easy navigation of epigenomic datasets to support and confirm the expression of neoantigens. We further demonstrate that combining our database with mass spectrometry analysis can identify noncanonical neoantigens. The resource we constructed holds significant value and promise for the development of neoantigen-based vaccines.", "venue": "bioRxiv (Cold Spring Harbor Laboratory)", "label": 12}, {"loc": [2.844322681427002, -0.4662456512451172], "openalex_id": "https://openalex.org/W4408488540", "title": "Enhancing Bidirectional Encoder Representations From Transformers (BERT) With Frame Semantics to Extract Clinically Relevant Information From German \u2026", "authors": "Daniel Reichenpfader, Jonas Knupp, Sandro Urs von D\u00e4niken, Roberto Gaio, Fabio Dennst\u00e4dt, Grazia M. Cereghetti, Andr\u00e9 Sander, Hans Hiltbrunner, Knud Nairz, Kerstin Denecke", "abstract": "Background Structured reporting is essential for improving the clarity and accuracy of radiological information. Despite its benefits, the European Society of Radiology notes that it is not widely adopted. For example, while structured reporting frameworks such as the Breast Imaging Reporting and Data System provide standardized terminology and classification for mammography findings, radiology reports still mostly comprise free-text sections. This variability complicates the systematic extraction of key clinical data. Moreover, manual structuring of reports is time-consuming and prone to inconsistencies. Recent advancements in large language models have shown promise for clinical information extraction by enabling models to understand contextual nuances in medical text. However, challenges such as domain adaptation, privacy concerns, and generalizability remain. To address these limitations, frame semantics offers an approach to information extraction grounded in computational linguistics, allowing a structured representation of clinically relevant concepts. Objective This study explores the combination of Bidirectional Encoder Representations from Transformers (BERT) architecture with the linguistic concept of frame semantics to extract and normalize information from free-text mammography reports. Methods After creating an annotated corpus of 210 German reports for fine-tuning, we generate several BERT model variants by applying 3 pretraining strategies to hospital data. Afterward, a fact extraction pipeline is built, comprising an extractive question-answering model and a sequence labeling model. We quantitatively evaluate all model variants using common evaluation metrics (model perplexity, Stanford Question Answering Dataset 2.0 [SQuAD_v2], seqeval) and perform a qualitative clinician evaluation of the entire pipeline on a manually generated synthetic dataset of 21 reports, as well as a comparison with a generative approach following best practice prompting techniques using the open-source Llama 3.3 model (Meta). Results Our system is capable of extracting 14 fact types and 40 entities from the clinical findings section of mammography reports. Further pretraining on hospital data reduced model perplexity, although it did not significantly impact the 2 downstream tasks. We achieved average F1-scores of 90.4% and 81% for question answering and sequence labeling, respectively (best pretraining strategy). Qualitative evaluation of the pipeline based on synthetic data shows an overall precision of 96.1% and 99.6% for facts and entities, respectively. In contrast, generative extraction shows an overall precision of 91.2% and 87.3% for facts and entities, respectively. Hallucinations and extraction inconsistencies were observed. Conclusions This study demonstrates that frame semantics provides a robust and interpretable framework for automating structured reporting. By leveraging frame semantics, the approach enables customizable information extraction and supports generalization to diverse radiological domains and clinical contexts with additional annotation efforts. Furthermore, the BERT-based model architecture allows for efficient, on-premise deployment, ensuring data privacy. Future research should focus on validating the model\u2019s generalizability across external datasets and different report types to ensure its broader applicability in clinical practice.", "venue": "Journal of Medical Internet Research", "label": 13}, {"loc": [6.049454689025879, 3.4043445587158203], "openalex_id": "https://openalex.org/W4416038337", "title": "Advanced Tool Learning and Selection System (ATLASS): A Closed-Loop Framework Using LLM", "authors": "Mohd Ariful Haque, Justin Williams, Sunzida Siddique, Md Azharul Islam, Hasmot Ali, Kishor Datta Gupta, Roy George", "abstract": "The combination of LLM agents with external tools enables models to solve complex tasks beyond their knowledge base. Human-designed tools are inflexible and restricted to solutions within the scope of pre-existing tools created by experts. To address this problem, we propose ATLASS, an advanced tool learning and selection system designed as a closed-loop framework. It enables the LLM to solve problems by dynamically generating external tools on demand. In this framework, agents play a crucial role in orchestrating tool selection, execution, and refinement, ensuring adaptive problem-solving capabilities. The operation of ATLASS follows three phases: The first phase, Understanding Tool Requirements, involves the Agents determining whether tools are required and specifying their functionality; the second phase, Tool Retrieval/Generation, involves the Agents retrieving or generating tools based on their availability; and the third phase, Task Solving, involves combining all the component tools necessary to complete the initial task. The Tool Dataset stores the generated tools, ensuring reusability and minimizing inference cost. Current LLM-based tool generation systems have difficulty creating complex tools that need APIs or external packages. In ATLASS, we solve the problem by automatically setting up the environment, fetching relevant API documentation online, and using a Python interpreter to create a reliable, versatile tool that works in a wider range of situations. OpenAI GPT-4.0 is used as the LLM agent, and safety and ethical concerns are handled through human feedback before executing generated code. By addressing the limitations of predefined toolsets and enhancing adaptability, ATLASS serves as a real-world solution that empowers users with dynamically generated tools for complex problem-solving.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.776915550231934, -0.8216323256492615], "openalex_id": "https://openalex.org/W4408472421", "title": "Found in Translation: Sourcing parallel corpora for low-resource language pairs", "authors": "Hinrik Hafsteinsson, Stein\u00fe\u00f3r Steingr\u00edmsson", "abstract": "This paper describes the sourcing, processing, and application of parallel text data for Icelandic and Polish for the purpose of bilingual lexicon induction (BLI), demonstrating how a parallel corpus can be compiled for a low-to-medium resource language pair that has no available parallel data, by pivoting through a common language. We show the usefulness of the corpus by training and evaluating a machine translation (MT) model on the data. Iceland's linguistic landscape is evolving, with an increasing need for multilingual support due to the growing immigrant population. Polish, in particular, stands out as the language of the largest single minority in Iceland, underscoring the importance of this project.", "venue": "Digital Humanities in the Nordic and Baltic Countries Publications", "label": 45}, {"loc": [9.499359130859375, 0.7625645399093628], "openalex_id": "https://openalex.org/W4408472460", "title": "Developing named-entity recognition for state authority archives", "authors": "Ida Toivanen, Venla Poso, Mikko Lipsanen, Tanja V\u00e4lisalo", "abstract": "Named entity recognition (NER) is one of the more common natural language processing tasks, that usually entails the detection of entities like person, location and date from textual data. Due to the bureaucratic language present in the data from state authority archives, existing NER models may not perform as well as researchers utilising them would wish. The diversity of the archival data, containing texts from different domains, as well as noise due to imperfect optical character recognition (OCR), creates challenges for NER. This gave us an incentive to train our own NER model, FinArcNER, and see if our attempts would produce better classification results in an archival setting. The aim of our study was to answer the following research questions: 1) Does training with noisy archival data bring the needed improvement to the model performance? 2) Does the training with noisy archival data skew the results with non-archival data? The FinArcNER model shows consistent performance when tested with modern and archival data (F1 scores 0.9200 and 0.8710, respectively). We can deduce from this that the increased diversity of the training data improved the model performance \u2013 that is, even though we included archival data with OCR noise, the model still learned to detect named entities correctly from noise-free, non-archival data.", "venue": "Digital Humanities in the Nordic and Baltic Countries Publications", "label": 45}, {"loc": [6.403927326202393, 5.182191848754883], "openalex_id": "https://openalex.org/W4415100951", "title": "Filter Like You Test: Data-Driven Data Filtering for CLIP Pretraining", "authors": "Mikey Shechter, Yair Carmon", "abstract": "We introduce Filter Like You Test (FLYT), an algorithm for curating large-scale vision-language datasets that learns the usefulness of each data point as a pretraining example. FLYT trains a scoring model that learns to weigh each example's features using gradient signals from downstream tasks training sets. Based on FLYT, we implement Mixing-FLYT (M-FLYT), which takes the per-example scores generated by different scoring methods as features, and learns to unify them into a single score. FLYT naturally produces a distribution over the training examples, which we leverage through Soft Cap Sampling (SCS), a strategy for obtaining a filtered pretraining dataset from per-example probabilities that samples examples while preventing over-representation through a repetition penalty. Using these methods, we achieve 40.1% ImageNet zero-shot accuracy on the DataComp medium scale filtering benchmark, a 2% absolute accuracy increase over all previous results and a 5.5% increase over results that - like us - use only public resources. Our approach also yields 37.7\\% on the average of 38 DataComp evaluation tasks, outperforming previous public-resource approaches by 0.4\\%.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.6159117221832275, 3.826159715652466], "openalex_id": "https://openalex.org/W4415102926", "title": "Automatic Association of Quality Requirements and Quantifiable Metrics for Cloud Security Certification", "authors": "John Bianchi, Shuya Dong, Luca Petrillo, Marinella Petrocchi", "abstract": "The European Cybersecurity Certification Scheme for Cloud Services (EUCS) is one of the first cybersecurity schemes in Europe, defined by the European Union Agency for Cybersecurity (ENISA). It aims to encourage cloud providers to strengthen their cybersecurity policies in order to receive an official seal of approval from European authorities. EUCS defines a set of security requirements that the cloud provider must meet, in whole or in part, in order to achieve the security certification. The requirements are written in natural language and cover every aspect of security in the cloud environment, from logging access to protecting the system with anti-malware tools to training staff. Operationally, each requirement is associated with one or more evaluable metrics. For example, a requirement to monitor access attempts to a service will have associated metrics that take into account the number of accesses, the number of access attempts, who is accessing, and what resources are being used. Partners in the European project Medina, which ended in October 2023, defined 163 metrics and manually mapped them to 70 EUCS requirements. Manual mapping is intuitively a long and costly process in terms of human resources. This paper proposes an approach based on Sentence Transformers to automatically associate requirements and metrics. In terms of correctness of associations, the proposed method achieves a Normalized Discounted Cumulative Gain of 0.640, improving a previous experiment by 0.146 points.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.509500503540039, 3.8256025314331055], "openalex_id": "https://openalex.org/W4414577520", "title": "Will LLMs Scaling Hit the Wall? Breaking Barriers via Distributed Resources on Massive Edge Devices", "authors": "Tao Shen, Dibin Zhu, Ziyu Zhao, Zexu Li, Chao\u2010Hsin Wu, Fei Wu", "abstract": "The remarkable success of foundation models has been driven by scaling laws, demonstrating that model performance improves predictably with increased training data and model size. However, this scaling trajectory faces two critical challenges: the depletion of high-quality public data, and the prohibitive computational power required for larger models, which have been monopolized by tech giants. These two bottlenecks pose significant obstacles to the further development of AI. In this position paper, we argue that leveraging massive distributed edge devices can break through these barriers. We reveal the vast untapped potential of data and computational resources on massive edge devices, and review recent technical advancements in distributed/federated learning that make this new paradigm viable. Our analysis suggests that by collaborating on edge devices, everyone can participate in training large language models with small edge devices. This paradigm shift towards distributed training on edge has the potential to democratize AI development and foster a more inclusive AI community.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.246704578399658, 2.816174268722534], "openalex_id": "https://openalex.org/W4414565385", "title": "Fair Text Classification via Transferable Representations", "authors": "Thibaud Leteno, Micha\u00ebl Perrot, Charlotte Laclau, Antoine Gourru, Christophe Gravier", "abstract": "Group fairness is a central research topic in text classification, where reaching fair treatment between sensitive groups (e.g., women and men) remains an open challenge. We propose an approach that extends the use of the Wasserstein Dependency Measure for learning unbiased neural text classifiers. Given the challenge of distinguishing fair from unfair information in a text encoder, we draw inspiration from adversarial training by inducing independence between representations learned for the target label and those for a sensitive attribute. We further show that Domain Adaptation can be efficiently leveraged to remove the need for access to the sensitive attributes in the dataset we cure. We provide both theoretical and empirical evidence that our approach is well-founded.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.370201826095581, 3.6938905715942383], "openalex_id": "https://openalex.org/W4414577596", "title": "A Grey-box Text Attack Framework using Explainable AI", "authors": "Esther Chiramal, Kelvin Soh Boon Kai", "abstract": "Explainable AI is a strong strategy implemented to understand complex black-box model predictions in a human interpretable language. It provides the evidence required to execute the use of trustworthy and reliable AI systems. On the other hand, however, it also opens the door to locating possible vulnerabilities in an AI model. Traditional adversarial text attack uses word substitution, data augmentation techniques and gradient-based attacks on powerful pre-trained Bidirectional Encoder Representations from Transformers (BERT) variants to generate adversarial sentences. These attacks are generally whitebox in nature and not practical as they can be easily detected by humans E.g. Changing the word from \"Poor\" to \"Rich\". We proposed a simple yet effective Grey-box cum Black-box approach that does not require the knowledge of the model while using a set of surrogate Transformer/BERT models to perform the attack using Explainable AI techniques. As Transformers are the current state-of-the-art models for almost all Natural Language Processing (NLP) tasks, an attack generated from BERT1 is transferable to BERT2. This transferability is made possible due to the attention mechanism in the transformer that allows the model to capture long-range dependencies in a sequence. Using the power of BERT generalisation via attention, we attempt to exploit how transformers learn by attacking a few surrogate transformer variants which are all based on a different architecture. We demonstrate that this approach is highly effective to generate semantically good sentences by changing as little as one word that is not detectable by humans while still fooling other BERT models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.327996253967285, 2.305384635925293], "openalex_id": "https://openalex.org/W4414566345", "title": "Datasets, Documents, and Repetitions: The Practicalities of Unequal Data Quality", "authors": "Alex Chengyu Fang, Hadi Pouransari, Matt Jordan, Alexander Toshev, Vaishaal Shankar, Ludwig Schmidt, Tom Gunter", "abstract": "Data filtering has become a powerful tool for improving model performance while reducing computational cost. However, as large language model compute budgets continue to grow, the limited data volume provided by heavily filtered and deduplicated datasets will become a practical constraint. In efforts to better understand how to proceed, we study model performance at various compute budgets and across multiple pre-training datasets created through data filtering and deduplication. We find that, given appropriate modifications to the training recipe, repeating existing aggressively filtered datasets for up to ten epochs can outperform training on the ten times larger superset for a single epoch across multiple compute budget orders of magnitude. While this finding relies on repeating the dataset for many epochs, we also investigate repeats within these datasets at the document level. We find that not all documents within a dataset are equal, and we can create better datasets relative to a token budget by explicitly manipulating the counts of individual documents. We conclude by arguing that even as large language models scale, data filtering remains an important direction of research.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.208831787109375, 0.8337938189506531], "openalex_id": "https://openalex.org/W4415194919", "title": "Kr\\'eyoLID From Language Identification Towards Language Mining", "authors": "Rasul Dent, Pedro Ortiz Suarez, Thibault Cl\u00e9rice, Beno\u00eet Sagot", "abstract": "Automatic language identification is frequently framed as a multi-class classification problem. However, when creating digital corpora for less commonly written languages, it may be more appropriate to consider it a data mining problem. For these varieties, one knows ahead of time that the vast majority of documents are of little interest. By minimizing resources spent on classifying such documents, we can create corpora much faster and with better coverage than using established pipelines. To demonstrate the effectiveness of the language mining perspective, we introduce a new pipeline and corpora for several French-based Creoles.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.043096542358398, 1.2429128885269165], "openalex_id": "https://openalex.org/W4408252562", "title": "Semantic Annotation Model and Method Based on Internet Open Dataset", "authors": "Xin Gao, Yansong Wang, Fang Wang, Baoqun Zhang, Caie Hu, Jian Wang, Longfei Ma", "abstract": "Traditional semantic annotation faces the problem of dataset diversity. Different fields and scenarios need to be specially annotated, and annotation work usually requires a lot of manpower and time investment. To meet these challenges, this paper deeply studies the semantic annotation model and method based on internet open datasets, aiming to improve annotation efficiency and accuracy and promote data resource sharing and utilization. This paper selects Common Crawl dataset to provide sufficient training samples; methods such as removing stop words and deduplication are used to preprocess data to improve data quality; a keyword extraction model based on heuristic rules and text context is constructed. In terms of semantic annotation model, this paper constructs a model based on Bidirectional Long Short-Term Memory (BiLSTM), which can make full use of the part-of-speech information of the corpus context, capture the part-of-speech features of the corpus, and generate semantic tags through supervised learning.", "venue": "International Journal of Intelligent Information Technologies", "label": 0}, {"loc": [3.1374471187591553, -0.4962392747402191], "openalex_id": "https://openalex.org/W4408228388", "title": "NLP modeling recommendations for restricted data availability in clinical settings", "authors": "Fabi\u00e1n Villena, Felipe Bravo-M\u00e1rquez, Jocelyn Dunstan", "abstract": "Abstract Background Clinical decision-making in healthcare often relies on unstructured text data, which can be challenging to analyze using traditional methods. Natural Language Processing (NLP) has emerged as a promising solution, but its application in clinical settings is hindered by restricted data availability and the need for domain-specific knowledge. Methods We conducted an experimental analysis to evaluate the performance of various NLP modeling paradigms on multiple clinical NLP tasks in Spanish. These tasks included referral prioritization and referral specialty classification. We simulated three clinical settings with varying levels of data availability and evaluated the performance of four foundation models. Results Clinical-specific pre-trained language models (PLMs) achieved the highest performance across tasks. For referral prioritization, Clinical PLMs attained an 88.85 % macro F1 score when fine-tuned. In referral specialty classification, the same models achieved a 53.79 % macro F1 score, surpassing domain-agnostic models. Continuing pre-training with environment-specific data improved model performance, but the gains were marginal compared to the computational resources required. Few-shot learning with large language models (LLMs) demonstrated lower performance but showed potential in data-scarce scenarios. Conclusions Our study provides evidence-based recommendations for clinical NLP practitioners on selecting modeling paradigms based on data availability. We highlight the importance of considering data availability, task complexity, and institutional maturity when designing and training clinical NLP models. Our findings can inform the development of effective clinical NLP solutions in real-world settings.", "venue": "BMC Medical Informatics and Decision Making", "label": 0}, {"loc": [2.508347511291504, 1.4543981552124023], "openalex_id": "https://openalex.org/W4408206838", "title": "Generative AI and the Continuing Importance of Information Literacy", "authors": "He Li, Elvira S. Balinas", "abstract": "Abstract: The rapid development of generative AI is transforming university information literacy education by reshaping how students access and process information. This study systematically reviews 49 research papers published between 2020 and 2024, using the PRISMA framework and thematic analysis to explore the applications, impacts, and pedagogical changes associated with generative AI in the field of information literacy education. Results show that generative AI has a wide range of applications in information literacy education, mainly in student learning support, learner-oriented personalized learning, academic research assistants, academic writing assistance, information literacy skills development, and curriculum design and teaching assistance. Generative AI has promoted students\u2019 information retrieval, evaluation skills and critical thinking, but also brought the challenge that over-reliance on AI may weaken students\u2019 critical thinking and information evaluation skills. Important changes in curriculum design and teaching methods are needed to introduce instruction in prompt engineering and computational thinking. The role of the teacher has shifted from knowledge transmitter to learning facilitator, emphasizing the importance of professional basic knowledge and ethical education. Through the results it is find that Generative AI can significantly enhance student learning outcomes and skills development in university information literacy education. However, its application requires caution and must fully consider potential challenges and risks. Through reasonable curriculum design, innovative teaching methods, and policy support, educators can leverage the advantages of Generative AI to cultivate high-quality talent with critical thinking, innovation, and a sense of moral responsibility. As AI technology continues to develop, information literacy education will usher in more innovations and opportunities, bringing new vitality and possibilities to higher education.", "venue": "International Journal of Latest Technology in Engineering Management & Applied Science", "label": 0}, {"loc": [2.853400707244873, 2.925396203994751], "openalex_id": "https://openalex.org/W4415075872", "title": "Do Not Trust Licenses You See\u2014Dataset Compliance Requires Massive-Scale AI-Powered Lifecycle Tracing", "authors": "Jaekyeom Kim, Sungryull Sohn, Gerrard Jeongwon Jo, Jihoon Choi, Kwang-Ho Bae, Hwa\u2010Young Lee, Yongmin Park, Honglak Lee", "abstract": "This paper argues that a dataset's legal risk cannot be accurately assessed by its license terms alone; instead, tracking dataset redistribution and its full lifecycle is essential. However, this process is too complex for legal experts to handle manually at scale. Tracking dataset provenance, verifying redistribution rights, and assessing evolving legal risks across multiple stages require a level of precision and efficiency that exceeds human capabilities. Addressing this challenge effectively demands AI agents that can systematically trace dataset redistribution, analyze compliance, and identify legal risks. We develop an automated data compliance system called NEXUS and show that AI can perform these tasks with higher accuracy, efficiency, and cost-effectiveness than human experts. Our massive legal analysis of 17,429 unique entities and 8,072 license terms using this approach reveals the discrepancies in legal rights between the original datasets before redistribution and their redistributed subsets, underscoring the necessity of the data lifecycle-aware compliance. For instance, we find that out of 2,852 datasets with commercially viable individual license terms, only 605 (21%) are legally permissible for commercialization. This work sets a new standard for AI data governance, advocating for a framework that systematically examines the entire lifecycle of dataset redistribution to ensure transparent, legal, and responsible dataset management.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.3376665115356445, -1.0703065395355225], "openalex_id": "https://openalex.org/W4415081407", "title": "Decoupling Content and Expression: Two-Dimensional Detection of AI-Generated Text", "authors": "Guangsheng Bao, L\u00fc Rong, Yanbin Zhao, Qiji Zhou, Yue Zhang", "abstract": "The wide usage of LLMs raises critical requirements on detecting AI participation in texts. Existing studies investigate these detections in scattered contexts, leaving a systematic and unified approach unexplored. In this paper, we present HART, a hierarchical framework of AI risk levels, each corresponding to a detection task. To address these tasks, we propose a novel 2D Detection Method, decoupling a text into content and language expression. Our findings show that content is resistant to surface-level changes, which can serve as a key feature for detection. Experiments demonstrate that 2D method significantly outperforms existing detectors, achieving an AUROC improvement from 0.705 to 0.849 for level-2 detection and from 0.807 to 0.886 for RAID. We release our data and code at https://github.com/baoguangsheng/truth-mirror.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.793999195098877, 1.4854360818862915], "openalex_id": "https://openalex.org/W4415084305", "title": "ReaderLM-v2: Small Language Model for HTML to Markdown and JSON", "authors": "Feng Wang, Zesheng Shi, Bo Wang, Nan Wang, Han Xiao", "abstract": "We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding large language models. The model's effectiveness results from two key innovations: (1) a three-stage data synthesis pipeline that generates high quality, diverse training data by iteratively drafting, refining, and critiquing web content extraction; and (2) a unified training framework combining continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20\\% on carefully curated benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly lower computational requirements.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.706034660339355, 2.4740216732025146], "openalex_id": "https://openalex.org/W4415974127", "title": "Predictive Data Selection: The Data That Predicts Is the Data That Teaches", "authors": "Kashun Shum, Yuzhen Huang, Hui Zou, Qi Ding, Yixuan Liao, Xiaoxin Chen, Qian Liu, Junxian He", "abstract": "Language model pretraining involves training on extensive corpora, where data quality plays a pivotal role. In this work, we aim to directly estimate the contribution of data during pretraining and select pretraining data in an efficient manner. Specifically, we draw inspiration from recent findings showing that compression efficiency (i.e., the normalized loss) of diverse models on certain text correlates strongly with their downstream performance, when the text domain aligns with the downstream benchmarks(Huang et al., 2024). Building on this observation, we hypothesize that data on which model losses are predictive of downstream abilities also contribute effectively to learning, which shares similar intuition with Thrush et al.(2024). To leverage this insight, we introduce predictive data selection (PreSelect), a lightweight and efficient data selection method that requires training and deploying only a fastText-based scorer. Through comprehensive experiments with 1B and 3B parameter models, we demonstrate that models trained on 30B tokens selected with PreSelect surpass the performance of the vanilla baseline trained on 300B tokens, achieving a 10x reduction in compute requirements. Furthermore, PreSelect significantly outperforms other competitive data selection baselines, such as DCLM and FineWeb-Edu on a scale of 3B models trained on 100B tokens. We open-source our trained data selection scorer along with the curated datasets at https://github.com/hkust-nlp/PreSelect.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.075788497924805, 1.0746653079986572], "openalex_id": "https://openalex.org/W4415085168", "title": "Machine Learners Should Acknowledge the Legal Implications of Large Language Models as Personal Data", "authors": "Henrik Nolte, Mich\u00e8le Finck, Kristof Meding", "abstract": "Does GPT know you? The answer depends on your level of public recognition; however, if your information was available on a website, the answer could be yes. Most Large Language Models (LLMs) memorize training data to some extent. Thus, even when an LLM memorizes only a small amount of personal data, it typically falls within the scope of data protection laws. If a person is identified or identifiable, the implications are far-reaching. The LLM is subject to EU General Data Protection Regulation requirements even after the training phase is concluded. To back our arguments: (1.) We reiterate that LLMs output training data at inference time, be it verbatim or in generalized form. (2.) We show that some LLMs can thus be considered personal data on their own. This triggers a cascade of data protection implications such as data subject rights, including rights to access, rectification, or erasure. These rights extend to the information embedded within the AI model. (3.) This paper argues that machine learning researchers must acknowledge the legal implications of LLMs as personal data throughout the full ML development lifecycle, from data collection and curation to model provision on e.g., GitHub or Hugging Face. (4.) We propose different ways for the ML research community to deal with these legal implications. Our paper serves as a starting point for improving the alignment between data protection law and the technical capabilities of LLMs. Our findings underscore the need for more interaction between the legal domain and the ML community.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.723773002624512, 2.44462251663208], "openalex_id": "https://openalex.org/W4415084879", "title": "SampleMix: A Sample-wise Pre-training Data Mixing Strategey by Coordinating Data Quality and Diversity", "authors": "Xiangyu Xi, D. J. Kong, Jian Yang, Jiawei Yang, Zhengyu Chen, Wang Wei, Jingang Wang, Xunliang Cai, Shikun Zhang, Wei Ye", "abstract": "Existing pretraining data mixing methods for large language models (LLMs) typically follow a domain-wise methodology, a top-down process that first determines domain weights and then performs uniform data sampling across each domain. However, these approaches neglect significant inter-domain overlaps and commonalities, failing to control the global diversity of the constructed training dataset. Further, uniform sampling within domains ignores fine-grained sample-specific features, potentially leading to suboptimal data distribution. To address these shortcomings, we propose a novel sample-wise data mixture approach based on a bottom-up paradigm. This method performs global cross-domain sampling by systematically evaluating the quality and diversity of each sample, thereby dynamically determining the optimal domain distribution. Comprehensive experiments across multiple downstream tasks and perplexity assessments demonstrate that SampleMix surpasses existing domain-based methods. Meanwhile, SampleMix requires 1.4x to 2.1x training steps to achieves the baselines' performance, highlighting the substantial potential of SampleMix to optimize pre-training data.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.256953239440918, 0.24552591145038605], "openalex_id": "https://openalex.org/W4415083533", "title": "Babel: Open Multilingual Large Language Models Serving Over 90% of Global Speakers", "authors": "Yiran Zhao, Chaoqun Liu, Yue Deng, Jiahao Ying, Mahani Aljunied, Zhaodonghui Li, Lidong Bing, Hou Pong Chan, Yu Rong, Deli Zhao, Wenxuan Zhang", "abstract": "Large language models (LLMs) have revolutionized natural language processing (NLP), yet open-source multilingual LLMs remain scarce, with existing models often limited in language coverage. Such models typically prioritize well-resourced languages, while widely spoken but under-resourced languages are often overlooked. To address this disparity, we introduce $\\texttt{Babel}$, an open multilingual LLM that covers the top 25 languages by number of speakers, supports over 90% of the global population, and includes many languages neglected by other open multilingual LLMs. Unlike traditional continue pretraining approaches, Babel expands its parameter count through a layer extension technique that elevates Babel's performance ceiling. We introduce two variants: $\\texttt{Babel-9B}$, designed for efficient inference and fine-tuning, and $\\texttt{Babel-83B}$, which sets a new standard for open multilingual LLMs. Extensive evaluations on multilingual tasks demonstrate its superior performance compared to open LLMs of comparable size. In addition, using open-source supervised fine-tuning datasets, Babel achieves remarkable performance, with Babel-9B-Chat leading among 10B-sized LLMs and Babel-83B-Chat setting a new standard for multilingual tasks, reaching the same level of commercial models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.227354049682617, -0.012516442686319351], "openalex_id": "https://openalex.org/W4408158337", "title": "Deep Learning for Economists", "authors": "Melissa Dell", "abstract": "Deep learning provides powerful methods to impute structured information from large-scale, unstructured text and image datasets. For example, economists might wish to detect the presence of economic activity in satellite images or measure the topics or entities mentioned in social media, the congressional record, or firm filings. This review introduces deep neural networks, covering methods such as classifiers, regression models, generative artificial intelligence (AI), and embedding models. Applications include classification, document digitization, record linkage, and methods for data exploration in massive-scale text and image corpora. When suitable methods are used, deep learning models can be cheap to tune and can scale affordably to problems involving millions or billions of data points. The review is accompanied by a regularly updated companion website, EconDL (https://econdl.github.io/), with user-friendly demo notebooks, software resources, and a knowledge base that provides technical details and additional applications. (JEL C38, C45, C88, D83)", "venue": "Journal of Economic Literature", "label": 0}, {"loc": [3.2827515602111816, -0.41232481598854065], "openalex_id": "https://openalex.org/W4408045025", "title": "Consistent Performance of GPT-4o in Rare Disease Diagnosis Across Nine Languages and 4967 Cases", "authors": "Leonardo Chimirri, J. Harry Caufield, Yasemin Bridges, Nicolas Matentzoglu, Michael Gargano, Mario Cazalla, Shihan Chen, Daniel Dani\u0161, Alexander J.M. Dingemans, Petra Gehle, Adam S L Graefe, Weihong Gu, Markus S. Ladewig, Pablo Lapunzina, Juli\u00e1n Nevado, Enock Niyonkuru, Soichi Ogishima, Dominik Seelow, Jair Tenorio, Marek Turnovec, Bert B.A. de Vries, Kai Wang, Kyran Wissink, Zafer Y\u00fcksel, Gabriele Zucca, Melissa Haendel, Chris Mungall, Justin Reese, Peter N. Robinson", "abstract": "Summary Background Large language models (LLMs) are increasingly used in the medical field for diverse applications including differential diagnostic support. The estimated training data used to create LLMs such as the Generative Pretrained Transformer (GPT) predominantly consist of English-language texts, but LLMs could be used across the globe to support diagnostics if language barriers could be overcome. Initial pilot studies on the utility of LLMs for differential diagnosis in languages other than English have shown promise, but a large-scale assessment on the relative performance of these models in a variety of European and non-European languages on a comprehensive corpus of challenging rare-disease cases is lacking. Methods We created 4967 clinical vignettes using structured data captured with Human Phenotype Ontology (HPO) terms with the Global Alliance for Genomics and Health (GA4GH) Phenopacket Schema. These clinical vignettes span a total of 378 distinct genetic diseases with 2618 associated phenotypic features. We used translations of the Human Phenotype Ontology together with language-specific templates to generate prompts in English, Chinese, Czech, Dutch, German, Italian, Japanese, Spanish, and Turkish. We applied GPT-4o, version gpt-4o-2024-08-06, to the task of delivering a ranked differential diagnosis using a zero-shot prompt. An ontology-based approach with the Mondo disease ontology was used to map synonyms and to map disease subtypes to clinical diagnoses in order to automate evaluation of LLM responses. Findings For English, GPT-4o placed the correct diagnosis at the first rank 19\u00b78% and within the top-3 ranks 27\u00b70% of the time. In comparison, for the eight non-English languages tested here the correct diagnosis was placed at rank 1 between 16\u00b79% and 20\u00b75%, within top-3 between 25\u00b73% and 27\u00b77% of cases. Interpretation The differential diagnostic performance of GPT-4o across a comprehensive corpus of rare-disease cases was consistent across the nine languages tested. This suggests that LLMs such as GPT-4o may have utility in non-English clinical settings. Funding NHGRI 5U24HG011449 and 5RM1HG010860. P.N.R. was supported by a Professorship of the Alexander von Humboldt Foundation; P.L. was supported by a National Grant (PMP21/00063 ONTOPREC-ISCIII, Fondos FEDER).", "venue": "bioRxiv (Cold Spring Harbor Laboratory)", "label": 12}, {"loc": [8.422667503356934, -0.07359011471271515], "openalex_id": "https://openalex.org/W4416043760", "title": "LongEval: A Comprehensive Analysis of Long-Text Generation Through a Plan-based Paradigm", "authors": "Siwei Wu, Yi\u2010Zhi Li, Xingwei Qu, R. N. Ravikumar, Yucheng Li, Tyler Loakman, Shanghaoran Quan, Jun Rao, Riza Batista-Navarro, Chenghua Lin", "abstract": "Large Language Models (LLMs) have achieved remarkable success in various natural language processing tasks, yet their ability to generate long-form content remains poorly understood and evaluated. Our analysis reveals that current LLMs struggle with length requirements and information density in long-text generation, with performance deteriorating as text length increases. To quantitively locate such a performance degradation and provide further insights on model development, we present LongEval, a benchmark that evaluates long-text generation through both direct and plan-based generation paradigms, inspired by cognitive and linguistic writing models. The comprehensive experiments in this work reveal interesting findings such as that while model size correlates with generation ability, the small-scale model (e.g., LongWriter), well-trained on long texts, has comparable performance. All code and datasets are released in https://github.com/Wusiwei0410/LongEval.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.55023193359375, 0.6285043358802795], "openalex_id": "https://openalex.org/W4416042735", "title": "Neurobiber: Fast and Interpretable Stylistic Feature Extraction", "authors": "Kenan Alkiek, Anna Wegmann, Jian Zhu, David Jurgens", "abstract": "Linguistic style is pivotal for understanding how texts convey meaning and fulfill communicative purposes, yet extracting detailed stylistic features at scale remains challenging. We present Neurobiber, a transformer-based system for fast, interpretable style profiling built on Biber's Multidimensional Analysis (MDA). Neurobiber predicts 96 Biber-style features from our open-source BiberPlus library (a Python toolkit that computes stylistic features and provides integrated analytics, e.g., PCA and factor analysis). Despite being up to 56 times faster than existing open source systems, Neurobiber replicates classic MDA insights on the CORE corpus and achieves competitive performance on the PAN 2020 authorship verification task without extensive retraining. Its efficient and interpretable representations readily integrate into downstream NLP pipelines, facilitating large-scale stylometric research, forensic analysis, and real-time text monitoring. All components are made publicly available.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.678732395172119, -0.6372538805007935], "openalex_id": "https://openalex.org/W4416043659", "title": "Improving the quality of Web-mined Parallel Corpora of Low-Resource Languages using Debiasing Heuristics", "authors": "Aloka Fernando, Nisansa de Silva, Menan Velyuthan, Charitha Rathnayake, Surangika Ranathunga", "abstract": "Parallel Data Curation (PDC) techniques aim to filter out noisy parallel sentences from web-mined corpora. Ranking sentence pairs using similarity scores on sentence embeddings derived from Pre-trained Multilingual Language Models (multiPLMs) is the most common PDC technique. However, previous research has shown that the choice of the multiPLM significantly impacts the quality of the filtered parallel corpus, and the Neural Machine Translation (NMT) models trained using such data show a disparity across multiPLMs. This paper shows that this disparity is due to different multiPLMs being biased towards certain types of sentence pairs, which are treated as noise from an NMT point of view. We show that such noisy parallel sentences can be removed to a certain extent by employing a series of heuristics. The NMT models, trained using the curated corpus, lead to producing better results while minimizing the disparity across multiPLMs. We publicly release the source code and the curated datasets.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.630661964416504, 2.8849072456359863], "openalex_id": "https://openalex.org/W4416043542", "title": "(Mis) Fitting: A Survey of Scaling Laws", "authors": "Margaret Li, Sneha Kudugunta, Luke Zettlemoyer", "abstract": "Modern foundation models rely heavily on using scaling laws to guide crucial training decisions. Researchers often extrapolate the optimal architecture and hyper parameters settings from smaller training runs by describing the relationship between, loss, or task performance, and scale. All components of this process vary, from the specific equation being fit, to the training setup, to the optimization method. Each of these factors may affect the fitted law, and therefore, the conclusions of a given study. We discuss discrepancies in the conclusions that several prior works reach, on questions such as the optimal token to parameter ratio. We augment this discussion with our own analysis of the critical impact that changes in specific details may effect in a scaling study, and the resulting altered conclusions. Additionally, we survey over 50 papers that study scaling trends: while 45 of these papers quantify these trends using a power law, most under-report crucial details needed to reproduce their findings. To mitigate this, we we propose a checklist for authors to consider while contributing to scaling law research.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.382019519805908, 3.711852788925171], "openalex_id": "https://openalex.org/W4408007789", "title": "Fine-Tuning Small Language Models for Domain-Specific AI: An Edge AI Perspective", "authors": "Rakshit Aralimatti, Syed Abdul Gaffar Shakhadri, Kruthika KR, Kartik Basavaraj Angadi", "abstract": "The Shakti series of 100M, 250M, and 500M models offers compact, resource-efficient language models designed for edge AI deployment. Unlike large models like GPT-3 and LLaMA that demand cloud-based infrastructure, Shakti models operate seamlessly on low-resource devices, including smartphones, smart TVs, IoT systems, drones, and low-end GPUs. They ensure minimal energy consumption, privacy-preserving computation, and real-time performance without internet dependency. Optimized for efficiency, Shakti models come in quantized versions (int8, int5, int4) for even faster, lighter execution on edge devices. The 2.5B Shakti model has demonstrated strong performance while maintaining low latency, paving the way for the smaller, highly efficient 100M, 250M, and 500M models. Built on Responsible AI principles, Shakti prioritizes fairness, transparency, and trust while mitigating risks such as bias, privacy concerns, and high carbon footprints. These models are ideal for sensitive domains like finance, healthcare, and legal services, providing cost-effective, sustainable, and scalable AI solutions with on-device data security. Each model is tailored for specific applications. Shakti-100M excels in text generation, summarization, and chatbots for IoT and mobile apps. Shakti-250M specializes in domain-specific tasks such as contract analysis and personalized financial or healthcare advice. Shakti-500M, a versatile model, enhances customer support, content creation, and virtual assistants with multilingual capabilities and long-context understanding. By decentralizing AI, the Shakti series democratizes access to intelligent, ethical, and impactful AI solutions across industries.", "venue": "Preprints.org", "label": 3}, {"loc": [3.9888222217559814, 0.8955518007278442], "openalex_id": "https://openalex.org/W4415186975", "title": "An Overview of Large Language Models for Statisticians", "authors": "Wenlong Ji, Weizhe Yuan, Emily Getzen, Kyunghyun Cho, Michael I. Jordan, Song Mei, Jason Weston, Weijie Su, Jing Xu, Linjun Zhang", "abstract": "Large Language Models (LLMs) have emerged as transformative tools in artificial intelligence (AI), exhibiting remarkable capabilities across diverse tasks such as text generation, reasoning, and decision-making. While their success has primarily been driven by advances in computational power and deep learning architectures, emerging problems -- in areas such as uncertainty quantification, decision-making, causal inference, and distribution shift -- require a deeper engagement with the field of statistics. This paper explores potential areas where statisticians can make important contributions to the development of LLMs, particularly those that aim to engender trustworthiness and transparency for human users. Thus, we focus on issues such as uncertainty quantification, interpretability, fairness, privacy, watermarking and model adaptation. We also consider possible roles for LLMs in statistical analysis. By bridging AI and statistics, we aim to foster a deeper collaboration that advances both the theoretical foundations and practical applications of LLMs, ultimately shaping their role in addressing complex societal challenges.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.542630195617676, 0.6492185592651367], "openalex_id": "https://openalex.org/W4415187755", "title": "AfroXLMR-Comet: Multilingual Knowledge Distillation with Attention Matching for Low-Resource languages", "authors": "Joshua Sakthivel Raju, S Sanjay, Jaskaran Singh Walia, Sachin Pratap Singh Raghav, Vukosi Marivate", "abstract": "Language model compression through knowledge distillation has emerged as a promising approach for deploying large language models in resource-constrained environments. However, existing methods often struggle to maintain performance when distilling multilingual models, especially for low-resource languages. In this paper, we present a novel hybrid distillation approach that combines traditional knowledge distillation with a simplified attention matching mechanism, specifically designed for multilingual contexts. Our method introduces an extremely compact student model architecture, significantly smaller than conventional multilingual models. We evaluate our approach on five African languages: Kinyarwanda, Swahili, Hausa, Igbo, and Yoruba. The distilled student model; AfroXLMR-Comet successfully captures both the output distribution and internal attention patterns of a larger teacher model (AfroXLMR-Large) while reducing the model size by over 85%. Experimental results demonstrate that our hybrid approach achieves competitive performance compared to the teacher model, maintaining an accuracy within 85% of the original model's performance while requiring substantially fewer computational resources. Our work provides a practical framework for deploying efficient multilingual models in resource-constrained environments, particularly benefiting applications involving African languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.406365871429443, -0.1152510792016983], "openalex_id": "https://openalex.org/W4414846044", "title": "UrduLLaMA 1.0: Dataset Curation, Preprocessing, and Evaluation in Low-Resource Settings", "authors": "Layba Fiaz, Munief Hassan Tahir, Sana Shams, Sarmad Hussain", "abstract": "Multilingual Large Language Models (LLMs) often provide suboptimal performance on low-resource languages like Urdu. This paper introduces UrduLLaMA 1.0, a model derived from the open-source Llama-3.1-8B-Instruct architecture and continually pre-trained on 128 million Urdu tokens, capturing the rich diversity of the language. To enhance instruction-following and translation capabilities, we leverage Low-Rank Adaptation (LoRA) to fine tune the model on 41,000 Urdu instructions and approximately 50,000 English-Urdu translation pairs. Evaluation across three machine translation datasets demonstrates significant performance improvements compared to state-of-the-art (SOTA) models, establishing a new benchmark for Urdu LLMs. These findings underscore the potential of targeted adaptation strategies with limited data and computational resources to address the unique challenges of low-resource languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.821260929107666, 5.0646586418151855], "openalex_id": "https://openalex.org/W4416042267", "title": "Megrez-Omni Technical Report", "authors": "Boxun Li, Yadong Li, Zhiyuan Li, C. Liu, Weilin Liu, Guowei Niu, Zheyue Tan, Haiyang Xu, Zhirong Yao, Tao Yuan, Dong Zhou, Yueqing Zhuang, Shengen Yan, Guohao Dai, Yu Wang", "abstract": "In this work, we present the Megrez models, comprising a language model (Megrez-3B-Instruct) and a multimodal model (Megrez-3B-Omni). These models are designed to deliver fast inference, compactness, and robust edge-side intelligence through a software-hardware co-design approach. Megrez-3B-Instruct offers several advantages, including high accuracy, high speed, ease of use, and a wide range of applications. Building on Megrez-3B-Instruct, Megrez-3B-Omni is an on-device multimodal understanding LLM that supports image, text, and audio analysis. It achieves state-of-the-art accuracy across all three modalities and demonstrates strong versatility and robustness, setting a new benchmark for multimodal AI models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.866445541381836, 2.5723679065704346], "openalex_id": "https://openalex.org/W4414836549", "title": "Optimizing Pre-Training Data Mixtures with Mixtures of Data Expert Models", "authors": "Liudmila Belenki, Alekh Agarwal, Tianze Shi, Kristina Toutanova", "abstract": "We propose a method to optimize language model pre-training data mixtures through efficient approximation of the cross-entropy loss corresponding to each candidate mixture via a Mixture of Data Experts (MDE). We use this approximation as a source of additional features in a regression model, trained from observations of model loss for a small number of mixtures. Experiments with Transformer decoder-only language models in the range of 70M to 1B parameters on the SlimPajama dataset show that our method achieves significantly better performance than approaches that train regression models using only the mixture rates as input features. Combining this improved optimization method with an objective that takes into account cross-entropy on end task data leads to superior performance on few-shot downstream evaluations. We also provide theoretical insights on why aggregation of data expert predictions can provide good approximations to model losses for data mixtures.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.6839141845703125, 5.4112396240234375], "openalex_id": "https://openalex.org/W4414836592", "title": "Multi-Agent Multimodal Models for Multicultural Text to Image Generation", "authors": "Parth Bhalerao, Mounika Yalamarty, Brian Trinh, Oana Ignat", "abstract": "Large Language Models (LLMs) demonstrate impressive performance across various multimodal tasks. However, their effectiveness in cross-cultural contexts remains limited due to the predominantly Western-centric nature of existing data and models. Meanwhile, multi-agent models have shown strong capabilities in solving complex tasks. In this paper, we evaluate the performance of LLMs in a multi-agent interaction setting for the novel task of multicultural image generation. Our key contributions are: (1) We introduce MosAIG, a Multi-Agent framework that enhances multicultural Image Generation by leveraging LLMs with distinct cultural personas; (2) We provide a dataset of 9,000 multicultural images spanning five countries, three age groups, two genders, 25 historical landmarks, and five languages; and (3) We demonstrate that multi-agent interactions outperform simple, no-agent models across multiple evaluation metrics, offering valuable insights for future research. Our dataset and models are available at https://github.com/OanaIgnat/MosAIG.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.481178283691406, 3.6352810859680176], "openalex_id": "https://openalex.org/W4416042708", "title": "COSMOS: A Hybrid Adaptive Optimizer for Memory-Efficient Training of LLMs", "authors": "Liming Liu, Zhenghao Xu, Z. H. Zhang, Hao Kang, Z. J. Li, Liang Chen, Weizhu Chen, Tianbao Zhao", "abstract": "Large Language Models (LLMs) have demonstrated remarkable success across various domains, yet their optimization remains a significant challenge due to the complex and high-dimensional loss landscapes they inhabit. While adaptive optimizers such as AdamW are widely used, they suffer from critical limitations, including an inability to capture interdependencies between coordinates and high memory consumption. Subsequent research, exemplified by SOAP, attempts to better capture coordinate interdependence but incurs greater memory overhead, limiting scalability for massive LLMs. An alternative approach aims to reduce memory consumption through low-dimensional projection, but this leads to substantial approximation errors, resulting in less effective optimization (e.g., in terms of per-token efficiency). In this paper, we propose COSMOS, a novel hybrid optimizer that leverages the varying importance of eigensubspaces in the gradient matrix to achieve memory efficiency without compromising optimization performance. The design of COSMOS is motivated by our empirical insights and practical considerations. Specifically, COSMOS applies SOAP to the leading eigensubspace, which captures the primary optimization dynamics, and MUON to the remaining eigensubspace, which is less critical but computationally expensive to handle with SOAP. This hybrid strategy significantly reduces memory consumption while maintaining robust optimization performance, making it particularly suitable for massive LLMs. Numerical experiments on various datasets and transformer architectures are provided to demonstrate the effectiveness of COSMOS. Our code is available at https://github.com/lliu606/COSMOS.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.304990291595459, 2.0290753841400146], "openalex_id": "https://openalex.org/W4415202618", "title": "ExpliCa: Evaluating Explicit Causal Reasoning in Large Language Models", "authors": "Martina Miliani, Serena Auriemma, Alessandro Bondielli, Emmanuele Chersoni, Lucia Passaro, Irene Sucameli, Alessandro Lenci", "abstract": "Large Language Models (LLMs) are increasingly used in tasks requiring interpretive and inferential accuracy. In this paper, we introduce ExpliCa, a new dataset for evaluating LLMs in explicit causal reasoning. ExpliCa uniquely integrates both causal and temporal relations presented in different linguistic orders and explicitly expressed by linguistic connectives. The dataset is enriched with crowdsourced human acceptability ratings. We tested LLMs on ExpliCa through prompting and perplexity-based metrics. We assessed seven commercial and open-source LLMs, revealing that even top models struggle to reach 0.80 accuracy. Interestingly, models tend to confound temporal relations with causal ones, and their performance is also strongly influenced by the linguistic order of the events. Finally, perplexity-based scores and prompting performance are differently affected by model size.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.972529172897339, -0.29166558384895325], "openalex_id": "https://openalex.org/W4415339002", "title": "Vision Language Models in Medicine", "authors": "Beria Chingnabe Kalpelbe, Angel Gabriel Adaambiik, Wei Peng", "abstract": "With the advent of Vision-Language Models (VLMs), medical artificial intelligence (AI) has experienced significant technological progress and paradigm shifts. This survey provides an extensive review of recent advancements in Medical Vision-Language Models (Med-VLMs), which integrate visual and textual data to enhance healthcare outcomes. We discuss the foundational technology behind Med-VLMs, illustrating how general models are adapted for complex medical tasks, and examine their applications in healthcare. The transformative impact of Med-VLMs on clinical practice, education, and patient care is highlighted, alongside challenges such as data scarcity, narrow task generalization, interpretability issues, and ethical concerns like fairness, accountability, and privacy. These limitations are exacerbated by uneven dataset distribution, computational demands, and regulatory hurdles. Rigorous evaluation methods and robust regulatory frameworks are essential for safe integration into healthcare workflows. Future directions include leveraging large-scale, diverse datasets, improving cross-modal generalization, and enhancing interpretability. Innovations like federated learning, lightweight architectures, and Electronic Health Record (EHR) integration are explored as pathways to democratize access and improve clinical relevance. This review aims to provide a comprehensive understanding of Med-VLMs' strengths and limitations, fostering their ethical and balanced adoption in healthcare.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.363333225250244, 3.3102867603302], "openalex_id": "https://openalex.org/W4407903641", "title": "Enhancing Small Language Models for Graph Tasks Through Graph Encoder Integration", "authors": "Dongryul Oh, Su Jin Kang, Hee-Jin Kim, Dongsuk Oh", "abstract": "Small language models (SLMs) are increasingly utilized for on-device applications due to their ability to ensure user privacy, reduce inference latency, and operate independently of cloud infrastructure. However, their performance is often limited when processing complex data structures such as graphs, which are ubiquitous in real-world datasets like social networks and system interactions. Graphs inherently encode intricate structural dependencies, requiring models to effectively capture both local and global relationships. Traditional language models, designed primarily for text data, struggle to address these requirements, leading to suboptimal performance in graph-related tasks. To overcome this limitation, we propose a novel graph encoder-based prompt tuning framework which integrates a graph convolutional network (GCN) with a graph transformer. By leveraging the complementary strengths of the GCN for local structural modeling and the graph transformer for capturing global relationships, our method enables SLMs to effectively process graph data. This integration significantly enhances the ability of SLMs to handle graph-centric tasks while maintaining the efficiency required for resource-constrained devices. The experimental results show that our approach not only improves the performance of SLMs on various graph benchmarks but also achieves results which closely approach the performance of a large language model (LLM). This work highlights the potential of extending SLMs for graph-based applications and advancing the capabilities of on-device artificial intelligence.", "venue": "Applied Sciences", "label": 8}, {"loc": [6.6976799964904785, 3.649634838104248], "openalex_id": "https://openalex.org/W4407809258", "title": "Tiny Language Models for Automation and Control: Overview, Potential Applications, and Future Research Directions", "authors": "Ismail Lamaakal, Yassine Maleh, Khalid El Makkaoui, Ibrahim Ouahbi, Pawe\u0142 P\u0142awiak, Osama Alfarraj, May Almousa, Ahmed A. Abd El\u2010Latif", "abstract": "Large Language Models (LLMs), like GPT and BERT, have significantly advanced Natural Language Processing (NLP), enabling high performance on complex tasks. However, their size and computational needs make LLMs unsuitable for deployment on resource-constrained devices, where efficiency, speed, and low power consumption are critical. Tiny Language Models (TLMs), also known as BabyLMs, offer compact alternatives by using advanced compression and optimization techniques to function effectively on devices such as smartphones, Internet of Things (IoT) systems, and embedded platforms. This paper provides a comprehensive survey of TLM architectures and methodologies, including key techniques such as knowledge distillation, quantization, and pruning. Additionally, it explores potential and emerging applications of TLMs in automation and control, covering areas such as edge computing, IoT, industrial automation, and healthcare. The survey discusses challenges unique to TLMs, such as trade-offs between model size and accuracy, limited generalization, and ethical considerations in deployment. Future research directions are also proposed, focusing on hybrid compression techniques, application-specific adaptations, and context-aware TLMs optimized for hardware-specific constraints. This paper aims to serve as a foundational resource for advancing TLMs capabilities across diverse real-world applications.", "venue": "Sensors", "label": 0}, {"loc": [4.568974494934082, -1.4478497505187988], "openalex_id": "https://openalex.org/W4407806900", "title": "Stories that (are) Move (d by) Markets: A Causal Exploration of Market Shocks and Semantic Shifts across Different Partisan Groups", "authors": "Felix Drinkall, Stefan Zohren, Michael McMahon, Janet B. Pierrehumbert", "abstract": "Macroeconomic fluctuations and the narratives that shape them form a mutually reinforcing cycle: public discourse can spur behavioural changes leading to economic shifts, which then result in changes in the stories that propagate. We show that shifts in semantic embedding space can be causally linked to financial market shocks -- deviations from the expected market behaviour. Furthermore, we show how partisanship can influence the predictive power of text for market fluctuations and shape reactions to those same shocks. We also provide some evidence that text-based signals are particularly salient during unexpected events such as COVID-19, highlighting the value of language data as an exogenous variable in economic forecasting. Our findings underscore the bidirectional relationship between news outlets and market shocks, offering a novel empirical approach to studying their effect on each other.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.542225360870361, 1.8824794292449951], "openalex_id": "https://openalex.org/W4407806804", "title": "A Survey on Data Contamination for Large Language Models", "authors": "Y.-H. Cheng, Yi Kuo Chang, Yuan Wu", "abstract": "Recent advancements in Large Language Models (LLMs) have demonstrated significant progress in various areas, such as text generation and code synthesis. However, the reliability of performance evaluation has come under scrutiny due to data contamination-the unintended overlap between training and test datasets. This overlap has the potential to artificially inflate model performance, as LLMs are typically trained on extensive datasets scraped from publicly available sources. These datasets often inadvertently overlap with the benchmarks used for evaluation, leading to an overestimation of the models' true generalization capabilities. In this paper, we first examine the definition and impacts of data contamination. Secondly, we review methods for contamination-free evaluation, focusing on three strategies: data updating-based methods, data rewriting-based methods, and prevention-based methods. Specifically, we highlight dynamic benchmarks and LLM-driven evaluation methods. Finally, we categorize contamination detecting methods based on model information dependency: white-Box, gray-Box, and black-Box detection approaches. Our survey highlights the requirements for more rigorous evaluation protocols and proposes future directions for addressing data contamination challenges.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.518900871276855, 1.5861605405807495], "openalex_id": "https://openalex.org/W4407839089", "title": "stEELlm: An LLM for Generating Semantic Annotations of Tabular Data", "authors": "Marco Cremaschi, Fabio D\u2019Adda, Andrea Maurino", "abstract": "The capabilities of LLMs represent a pivotal step in transforming how we manage and interact with information and data. We witness an increasingly pervasive use of such models in various computational tasks. In some preliminary works, attempts to integrate Knowledge Graphs and Large Language Models (LLMs) can be identified, in particular, to perform the classic tasks related to the construction of Knowledge Graphs through semantic annotation of texts. Nowadays, tables are widely used and play a crucial role in creating, organising, and sharing information that could be used to produce factual knowledge to be integrated into a Knowledge Graph. However, table-to-KG techniques through LLM have not been extensively investigated. This paper presents stEELlm, an innovative Semantic Table Interpretation approach obtained by fine-tuning the Mixtral 8x7B model. Conducted experiments demonstrate the capabilities of our model to successfully create semantic annotations of heterogeneous datasets, a scenario where classic approaches based on heuristics tend to fail.", "venue": "ACM Transactions on Intelligent Systems and Technology", "label": 0}, {"loc": [7.435841083526611, 1.698085904121399], "openalex_id": "https://openalex.org/W4407764202", "title": "Craw4LLM: Efficient Web Crawling for LLM Pretraining", "authors": "Yu Shi, Zhiyuan Liu, Chenyan Xiong", "abstract": "Web crawl is a main source of large language models' (LLMs) pretraining data, but the majority of crawled web pages are discarded in pretraining due to low data quality. This paper presents Craw4LLM, an efficient web crawling method that explores the web graph based on the preference of LLM pretraining. Specifically, it leverages the influence of a webpage in LLM pretraining as the priority score of the web crawler's scheduler, replacing the standard graph connectivity based priority. Our experiments on a web graph containing 900 million webpages from a commercial search engine's index demonstrate the efficiency of Craw4LLM in obtaining high-quality pretraining data. With just 21% URLs crawled, LLMs pretrained on Craw4LLM data reach the same downstream performances of previous crawls, significantly reducing the crawling waste and alleviating the burdens on websites. Our code is publicly available at https://github.com/cxcscmu/Craw4LLM.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.405315399169922, 2.671381950378418], "openalex_id": "https://openalex.org/W4407771386", "title": "Inner Thinking Transformer: Leveraging Dynamic Depth Scaling to Foster Adaptive Internal Thinking", "authors": "Yilong Chen, Junyuan Shang, Zhenyu Zhang, Yanxi Xie, Jiawei Sheng, Tingwen Liu, Shuohuan Wang, Yu Sun, Hua Wu, Haifeng Wang", "abstract": "Large language models (LLMs) face inherent performance bottlenecks under parameter constraints, particularly in processing critical tokens that demand complex reasoning. Empirical analysis reveals challenging tokens induce abrupt gradient spikes across layers, exposing architectural stress points in standard Transformers. Building on this insight, we propose Inner Thinking Transformer (ITT), which reimagines layer computations as implicit thinking steps. ITT dynamically allocates computation through Adaptive Token Routing, iteratively refines representations via Residual Thinking Connections, and distinguishes reasoning phases using Thinking Step Encoding. ITT enables deeper processing of critical tokens without parameter expansion. Evaluations across 162M-466M parameter models show ITT achieves 96.5\\% performance of a 466M Transformer using only 162M parameters, reduces training data by 43.2\\%, and outperforms Transformer/Loop variants in 11 benchmarks. By enabling elastic computation allocation during inference, ITT balances performance and efficiency through architecture-aware optimization of implicit thinking pathways.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.374204158782959, 2.5802621841430664], "openalex_id": "https://openalex.org/W4407771433", "title": "Fine-grained Fallacy Detection with Human Label Variation", "authors": "Alan Ramponi, Agnese Daffara, Sara Tonelli", "abstract": "We introduce Faina, the first dataset for fallacy detection that embraces multiple plausible answers and natural disagreement. Faina includes over 11K span-level annotations with overlaps across 20 fallacy types on social media posts in Italian about migration, climate change, and public health given by two expert annotators. Through an extensive annotation study that allowed discussion over multiple rounds, we minimize annotation errors whilst keeping signals of human label variation. Moreover, we devise a framework that goes beyond \"single ground truth\" evaluation and simultaneously accounts for multiple (equally reliable) test sets and the peculiarities of the task, i.e., partial span matches, overlaps, and the varying severity of labeling errors. Our experiments across four fallacy detection setups show that multi-task and multi-label transformer-based approaches are strong baselines across all settings. We release our data, code, and annotation guidelines to foster research on fallacy detection and human label variation more broadly.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.318535804748535, 2.4945924282073975], "openalex_id": "https://openalex.org/W4407760043", "title": "NaturalReasoning: Reasoning in the Wild with 2.8 M Challenging Questions", "authors": "Weizhe Yuan, Jian Yu, Song Jiang, Karthik Padthe, Li Yang, Dong Wang, Ilia Kulikov, Kyunghyun Cho, Yuandong Tian, Jason Weston, Xian Li", "abstract": "Scaling reasoning capabilities beyond traditional domains such as math and coding is hindered by the lack of diverse and high-quality questions. To overcome this limitation, we introduce a scalable approach for generating diverse and challenging reasoning questions, accompanied by reference answers. We present NaturalReasoning, a comprehensive dataset comprising 2.8 million questions that span multiple domains, including STEM fields (e.g., Physics, Computer Science), Economics, Social Sciences, and more. We demonstrate the utility of the questions in NaturalReasoning through knowledge distillation experiments which show that NaturalReasoning can effectively elicit and transfer reasoning capabilities from a strong teacher model. Furthermore, we demonstrate that NaturalReasoning is also effective for unsupervised self-training using external reward models or self-rewarding. To foster future work, we publicly release NaturalReasoning at https://huggingface.co/datasets/facebook/natural_reasoning.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.649391174316406, -0.7673940062522888], "openalex_id": "https://openalex.org/W4407759265", "title": "SMOL: Professionally translated parallel data for 115 under-represented languages", "authors": "Isaac Caswell, Elizabeth L. Nielsen, Jiaming Luo, Colin Cherry, Geza Kovacs, Hadar Shemtov, Partha Talukdar, Dinesh Tewari, Baba Mamadi Dian\u00e9, Koulako Moussa Doumbouya, Djibrila Dian\u00e9, Solo Farabado Ciss\u00e9", "abstract": "We open-source SMOL (Set of Maximal Overall Leverage), a suite of training data to unlock machine translation for low-resource languages. SMOL has been translated into 124 (and growing) under-resourced languages (125 language pairs), including many for which there exist no previous public resources, for a total of 6.1M translated tokens. SMOL comprises two sub-datasets, each carefully chosen for maximum impact given its size: SMOLSENT, a set of sentences chosen for broad unique token coverage, and SMOLDOC, a document-level resource focusing on a broad topic coverage. They join the already released GATITOS for a trifecta of paragraph, sentence, and token-level content. We demonstrate that using SMOL to prompt or fine-tune Large Language Models yields robust chrF improvements. In addition to translation, we provide factuality ratings and rationales for all documents in SMOLDOC, yielding the first factuality datasets for most of these languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.06234073638916, -0.49704238772392273], "openalex_id": "https://openalex.org/W4407759693", "title": "How Much Do LLMs Hallucinate across Languages? On Multilingual Estimation of LLM Hallucination in the Wild", "authors": "Siraj Ul Islam, Anne Lauscher, Goran Glava\u009a\u0161", "abstract": "In the age of misinformation, hallucination -- the tendency of Large Language Models (LLMs) to generate non-factual or unfaithful responses -- represents the main risk for their global utility. Despite LLMs becoming increasingly multilingual, the vast majority of research on detecting and quantifying LLM hallucination are (a) English-centric and (b) focus on machine translation (MT) and summarization, tasks that are less common ``in the wild'' than open information seeking. In contrast, we aim to quantify the extent of LLM hallucination across languages in knowledge-intensive long-form question answering. To this end, we train a multilingual hallucination detection model and conduct a large-scale study across 30 languages and 6 open-source LLM families. We start from an English hallucination detection dataset and rely on MT to generate (noisy) training data in other languages. We also manually annotate gold data for five high-resource languages; we then demonstrate, for these languages, that the estimates of hallucination rates are similar between silver (LLM-generated) and gold test sets, validating the use of silver data for estimating hallucination rates for other languages. For the final rates estimation, we build a knowledge-intensive QA dataset for 30 languages with LLM-generated prompts and Wikipedia articles as references. We find that, while LLMs generate longer responses with more hallucinated tokens for higher-resource languages, there is no correlation between length-normalized hallucination rates of languages and their digital representation. Further, we find that smaller LLMs exhibit larger hallucination rates than larger models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.861695766448975, 5.521213531494141], "openalex_id": "https://openalex.org/W4407759476", "title": "RealSyn: An Effective and Scalable Multimodal Interleaved Document Transformation Paradigm", "authors": "Tiancheng Gu, Kai-Cheng Yang, C. ZHANG, Yin Xie, Xiang An, Ziyong Feng, Dongnan Liu, Weidong Cai, Jiankang Deng", "abstract": "After pre-training on extensive image-text pairs, Contrastive Language-Image Pre-training (CLIP) demonstrates promising performance on a wide variety of benchmarks. However, a substantial volume of multimodal interleaved documents remains underutilized for contrastive vision-language representation learning. To fully leverage these unpaired documents, we initially establish a Real-World Data Extraction pipeline to extract high-quality images and texts. Then we design a hierarchical retrieval method to efficiently associate each image with multiple semantically relevant realistic texts. To further enhance fine-grained visual information, we propose an image semantic augmented generation module for synthetic text production. Furthermore, we employ a semantic balance sampling strategy to improve dataset diversity, enabling better learning of long-tail concepts. Based on these innovations, we construct RealSyn, a dataset combining realistic and synthetic texts, available in three scales: 15M, 30M, and 100M. We compare our dataset with other widely used datasets of equivalent scale for CLIP training. Models pre-trained on RealSyn consistently achieve state-of-the-art performance across various downstream tasks, including linear probe, zero-shot transfer, zero-shot robustness, and zero-shot retrieval. Furthermore, extensive experiments confirm that RealSyn significantly enhances contrastive vision-language representation learning and demonstrates robust scalability. To facilitate future research, the RealSyn dataset and pretrained model weights are released at https://github.com/deepglint/RealSyn.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.2343010902404785, 2.421912670135498], "openalex_id": "https://openalex.org/W4407735397", "title": "Intergenerational justice as a framework for social media archiving", "authors": "Ryo Shiozaki", "abstract": "Purpose This conceptual study aims to explore the rationale of preservation institutions in archiving new types of documents, such as social media, rather than focusing on traditionally valued materials or established cultural heritage. Design/methodology/approach Recognising the need to include the perspectives of both current and future generations, this study reviews and organises various theories of intergenerational justice to determine whether they can provide a solid justification. Findings No single theory of intergenerational justice fully endorses the long-term preservation of user-generated content. However, utilitarianism can offer a compelling justification for large-scale archiving, whereas other views, including communitarianism, support selective approaches. Originality/value This study is the first to extensively apply intergenerational justice theories to the field of digital preservation, providing a foundational framework for justifying the preservation of emerging forms of digital documents, albeit to a limited extent.", "venue": "Journal of Documentation", "label": 0}, {"loc": [5.10939359664917, -1.6082326173782349], "openalex_id": "https://openalex.org/W4407755210", "title": "Leveraging sentiment analysis of food delivery services reviews using deep learning and word embedding", "authors": "Dheya Mustafa, Safaa M. Khabour, Mousa Al-kfairy, Ahmed S. Shatnawi", "abstract": "Companies that deliver food (food delivery services, or FDS) try to use customer feedback to identify aspects where the customer experience could be improved. Consumer feedback on purchasing and receiving goods via online platforms is a crucial tool for learning about a company\u2019s performance. Many English-language studies have been conducted on sentiment analysis (SA). Arabic is becoming one of the most extensively written languages on the World Wide Web, but because of its morphological and grammatical difficulty as well as the lack of openly accessible resources for Arabic SA, like as dictionaries and datasets, there has not been much research done on the language. Using a manually annotated FDS dataset, the current study conducts extensive sentiment analysis using reviews related to FDS that include Modern Standard Arabic and dialectal Arabic. It does this by utilizing word embedding models, deep learning techniques, and natural language processing to extract subjective opinions, determine polarity, and recognize customers\u2019 feelings in the FDS domain. Convolutional neural network (CNN), bidirectional long short-term memory recurrent neural network (BiLSTM), and an LSTM-CNN hybrid model were among the deep learning approaches to classification that we evaluated. In addition, the article investigated different effective approaches for word embedding and stemming techniques. Using a dataset of Modern Standard Arabic and dialectal Arabic corpus gathered from Talabat.com, we trained and evaluated our suggested models. Our best accuracy was approximately 84% for multiclass classification and 92.5% for binary classification on the FDS. To verify that the proposed approach is suitable for analyzing human perceptions in diversified domains, we designed and carried out excessive experiments on other existing Arabic datasets. The highest obtained multi-classification accuracy is 88.9% on the Hotels Arabic-Reviews Dataset (HARD) dataset, and the highest obtained binary classification accuracy is 97.2% on the same dataset.", "venue": "PeerJ Computer Science", "label": 4}, {"loc": [6.831011772155762, 0.3906799852848053], "openalex_id": "https://openalex.org/W4407759910", "title": "Are Multilingual Language Models an Off-ramp for Under-resourced Languages? Will we arrive at Digital Language Equality in Europe in 2030?", "authors": "Georg Rehm, Annika Gr\u00fctzner-Zahn, Fabio Barth", "abstract": "Large language models (LLMs) demonstrate unprecedented capabilities and define the state of the art for almost all natural language processing (NLP) tasks and also for essentially all Language Technology (LT) applications. LLMs can only be trained for languages for which a sufficient amount of pre-training data is available, effectively excluding many languages that are typically characterised as under-resourced. However, there is both circumstantial and empirical evidence that multilingual LLMs, which have been trained using data sets that cover multiple languages (including under-resourced ones), do exhibit strong capabilities for some of these under-resourced languages. Eventually, this approach may have the potential to be a technological off-ramp for those under-resourced languages for which \"native\" LLMs, and LLM-based technologies, cannot be developed due to a lack of training data. This paper, which concentrates on European languages, examines this idea, analyses the current situation in terms of technology support and summarises related work. The article concludes by focusing on the key open questions that need to be answered for the approach to be put into practice in a systematic way.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.558948040008545, 2.3040120601654053], "openalex_id": "https://openalex.org/W4407695996", "title": "Cuckoo: An IE Free Rider Hatched by Massive Nutrition in LLM's Nest", "authors": "Letian Peng, Zilong Wang, Feng Yao, Jingbo Shang", "abstract": "Massive high-quality data, both pre-training raw texts and post-training annotations, have been carefully prepared to incubate advanced large language models (LLMs). In contrast, for information extraction (IE), pre-training data, such as BIO-tagged sequences, are hard to scale up. We show that IE models can act as free riders on LLM resources by reframing next-token \\emph{prediction} into \\emph{extraction} for tokens already present in the context. Specifically, our proposed next tokens extraction (NTE) paradigm learns a versatile IE model, \\emph{Cuckoo}, with 102.6M extractive data converted from LLM's pre-training and post-training data. Under the few-shot setting, Cuckoo adapts effectively to traditional and complex instruction-following IE with better performance than existing pre-trained IE models. As a free rider, Cuckoo can naturally evolve with the ongoing advancements in LLM data preparation, benefiting from improvements in LLM training pipelines without additional manual effort.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.7514703273773193, 3.8119630813598633], "openalex_id": "https://openalex.org/W4407690459", "title": "Primus: A Pioneering Collection of Open-Source Datasets for Cybersecurity LLM Training", "authors": "Yi Yu, Tung\u2010liang Chiang, Cheng\u2010Wei Tsai, Chien-Ming Huang, W. C. Tsao", "abstract": "Large Language Models (LLMs) have shown remarkable advancements in specialized fields such as finance, law, and medicine. However, in cybersecurity, we have noticed a lack of open-source datasets, with a particular lack of high-quality cybersecurity pretraining corpora, even though much research indicates that LLMs acquire their knowledge during pretraining. To address this, we present a comprehensive suite of datasets covering all major training stages, including pretraining, instruction fine-tuning, and reasoning distillation with cybersecurity-specific self-reflection data. Extensive ablation studies demonstrate their effectiveness on public cybersecurity benchmarks. In particular, continual pre-training on our dataset yields a 15.9% improvement in the aggregate score, while reasoning distillation leads to a 15.8% gain in security certification (CISSP). We will release all datasets and trained cybersecurity LLMs under the ODC-BY and MIT licenses to encourage further research in the community. For access to all datasets and model weights, please refer to https://huggingface.co/collections/trendmicro-ailab/primus-67b1fd27052b802b4af9d243.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.050911903381348, 3.591040849685669], "openalex_id": "https://openalex.org/W4407687439", "title": "CoLA: Compute-Efficient Pre-Training of LLMs via Low-Rank Activation", "authors": "Ziyue Liu, Ruijie K. Zhang, Zhengyang Wang, Zi Yang, Paul Hovland, Bogdan Nicolae, Franck Cappello, Zheng Zhang", "abstract": "The full-size MLPs and the projection layers in attention introduce tremendous model sizes of large language models (LLMs), consuming extensive computational resources in pre-training. We empirically observe that the activations of pre-trained LLMs exhibit low-rank property. Motivated by such observations, we propose CoLA and its memory-efficient implementation, CoLA-M, to replace these full-size layers with compute-efficient auto-encoders that naturally enforce low-rank activations throughout training. This fundamental architectural change eliminates the activation redundancy and significantly boosts model capacity and training efficiency. Experiments on LLaMA models with 60 million to 7 billion parameters show that CoLA reduces the computing cost by $\\bf 2\\pmb{\\times}$ and improves training throughput by $\\bf 1.86\\pmb{\\times}$ while maintaining full-rank level performance. CoLA-M further squeezes memory cost without sacrificing throughput, offering a pre-training approach with collectively superior parameter, computing, and memory efficiency. The LLMs produced are also $\\bf 2\\pmb{\\times}$ smaller, enabling faster inference with lower memory cost on resource-constrained platforms.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.710026264190674, 1.6822203397750854], "openalex_id": "https://openalex.org/W4407759132", "title": "Idiosyncrasies in Large Language Models", "authors": "Ming-Jie Sun, Yue Yin, Zeshui Xu, J. Zico Kolter, Zhuang Liu", "abstract": "In this work, we unveil and study idiosyncrasies in Large Language Models (LLMs) -- unique patterns in their outputs that can be used to distinguish the models. To do so, we consider a simple classification task: given a particular text output, the objective is to predict the source LLM that generates the text. We evaluate this synthetic task across various groups of LLMs and find that simply fine-tuning text embedding models on LLM-generated texts yields excellent classification accuracy. Notably, we achieve 97.1% accuracy on held-out validation data in the five-way classification problem involving ChatGPT, Claude, Grok, Gemini, and DeepSeek. Our further investigation reveals that these idiosyncrasies are rooted in word-level distributions. These patterns persist even when the texts are rewritten, translated, or summarized by an external LLM, suggesting that they are also encoded in the semantic content. Additionally, we leverage LLM as judges to generate detailed, open-ended descriptions of each model's idiosyncrasies. Finally, we discuss the broader implications of our findings, including training on synthetic data, inferring model similarity, and robust evaluation of LLMs. Code is available at https://github.com/locuslab/llm-idiosyncrasies.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.39434814453125, 1.2695962190628052], "openalex_id": "https://openalex.org/W4407719397", "title": "DCAD-2000: A Multilingual Dataset across 2000+ Languages with Data Cleaning as Anomaly Detection", "authors": "Yingli Shen, Wen Ter Lai, Shuo Wang, Xueren Zhang, Kangyang Luo, Alexander Fraser, Maosong Sun", "abstract": "The rapid development of multilingual large language models (LLMs) highlights the need for high-quality, diverse, and well-curated multilingual datasets. In this paper, we introduce DCAD-2000 (Data Cleaning as Anomaly Detection), a large-scale multilingual corpus constructed from newly extracted Common Crawl data and existing multilingual sources. DCAD-2000 covers 2,282 languages, 46.72TB of text, and 8.63 billion documents, spanning 155 high- and medium-resource languages and 159 writing scripts. To overcome the limitations of existing data cleaning approaches, which rely on manually designed heuristic thresholds, we reframe data cleaning as an anomaly detection problem. This dynamic filtering paradigm substantially improves data quality by automatically identifying and removing noisy or anomalous content. By fine-tuning LLMs on DCAD-2000, we demonstrate notable improvements in data quality, robustness of the cleaning pipeline, and downstream performance, particularly for low-resource languages across multiple multilingual benchmarks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.7743403911590576, 3.8625388145446777], "openalex_id": "https://openalex.org/W4407690459", "title": "PRIMUS: A Pioneering Collection of Open-Source Datasets for", "authors": "Yi Yu, Tung\u2010liang Chiang, Cheng\u2010Wei Tsai, Chien-Ming Huang, W. C. Tsao", "abstract": "Large Language Models (LLMs) have shown remarkable advancements in specialized fields such as finance, law, and medicine. However, in cybersecurity, we have noticed a lack of open-source datasets, with a particular lack of high-quality cybersecurity pretraining corpora, even though much research indicates that LLMs acquire their knowledge during pretraining. To address this, we present a comprehensive suite of datasets covering all major training stages, including pretraining, instruction fine-tuning, and reasoning distillation with cybersecurity-specific self-reflection data. Extensive ablation studies demonstrate their effectiveness on public cybersecurity benchmarks. In particular, continual pre-training on our dataset yields a 15.9% improvement in the aggregate score, while reasoning distillation leads to a 15.8% gain in security certification (CISSP). We will release all datasets and trained cybersecurity LLMs under the ODC-BY and MIT licenses to encourage further research in the community. For access to all datasets and model weights, please refer to https://huggingface.co/collections/trendmicro-ailab/primus-67b1fd27052b802b4af9d243.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.387287616729736, 1.1573312282562256], "openalex_id": "https://openalex.org/W4407633258", "title": "Enhancing Multilingual LLM Pretraining with Model-Based Data Selection", "authors": "B. T. Messmer, Vinko Sabol\u010dec, Martin Jaggi", "abstract": "Dataset curation has become a basis for strong large language model (LLM) performance. While various rule-based filtering heuristics exist for English and multilingual datasets, model-based filtering techniques have primarily focused on English. To address the disparity stemming from limited research on non-English languages, we propose a model-based filtering framework for multilingual datasets that aims to identify a diverse set of structured and knowledge-rich samples. Our approach emphasizes transparency, simplicity, and efficiency, leveraging Transformer- and FastText-based classifiers to ensure the broad accessibility of our technique and data. We conduct comprehensive ablation studies on the FineWeb-2 web crawl dataset across diverse language families, scripts, and resource availability to demonstrate the effectiveness of our method. Training a 1B-parameter Llama model for 70B and 119B tokens, our approach can match the baseline MMLU score with as little as 15% of the training tokens, while also improving across other benchmarks. These findings provide strong evidence for the generalizability of our approach to other languages. As a result, we extend our framework to 20 languages for which we release the refined pretraining datasets.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.765906810760498, 0.736178457736969], "openalex_id": "https://openalex.org/W4407631953", "title": "Krutrim LLM: Multilingual Foundational Model for over a Billion People", "authors": "Aditya Kallappa, Paresh Kamble, Abhinav Ravi, Arvind Patidar, Vinayak Dhruv, Deepak Kumar, Raghav Awasthi, Arveti Manjunath, Shubham Agarwal, Kumar Ashish, Gautam Bhargava, Chandra Khatri", "abstract": "India is a diverse society with unique challenges in developing AI systems, including linguistic diversity, oral traditions, data accessibility, and scalability. Existing foundation models are primarily trained on English, limiting their effectiveness for India's population. Indic languages comprise only 1 percent of Common Crawl corpora despite India representing 18 percent of the global population, leading to linguistic biases. Thousands of regional languages, dialects, and code mixing create additional representation challenges due to sparse training data. We introduce Krutrim LLM, a 2 trillion token multilingual model designed for India's linguistic landscape. It incorporates the largest known Indic dataset, mitigating data scarcity and ensuring balanced performance across dialects. Krutrim outperforms or matches state-of-the-art models on Indic benchmarks while maintaining competitive English performance. Despite being significantly smaller in training flops, Krutrim LLM matches or exceeds models like LLAMA-2 on 10 out of 16 tasks, with an average score of 0.57 versus 0.55. This evidences Krutrim's flexible multilingual fluency across diverse linguistic contexts. Krutrim is integrated with real-time search to improve factual accuracy in conversational AI applications. This enhances accessibility for over 1 billion users worldwide. Through intentional design choices addressing data imbalances, Krutrim LLM signifies meaningful progress in building ethical, globally representative AI models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.980970859527588, 3.3168752193450928], "openalex_id": "https://openalex.org/W4407631978", "title": "Trust at Your Own Peril: A Mixed Methods Exploration of the Ability of Large Language Models to Generate Expert-Like Systems Engineering Artifacts and a \u2026", "authors": "Taylan G. Topcu, Mohammad Husain, Max Ofsa, Paul Wach", "abstract": "Multi-purpose Large Language Models (LLMs), a subset of generative Artificial Intelligence (AI), have recently made significant progress. While expectations for LLMs to assist systems engineering (SE) tasks are paramount; the interdisciplinary and complex nature of systems, along with the need to synthesize deep-domain knowledge and operational context, raise questions regarding the efficacy of LLMs to generate SE artifacts, particularly given that they are trained using data that is broadly available on the internet. To that end, we present results from an empirical exploration, where a human expert-generated SE artifact was taken as a benchmark, parsed, and fed into various LLMs through prompt engineering to generate segments of typical SE artifacts. This procedure was applied without any fine-tuning or calibration to document baseline LLM performance. We then adopted a two-fold mixed-methods approach to compare AI generated artifacts against the benchmark. First, we quantitatively compare the artifacts using natural language processing algorithms and find that when prompted carefully, the state-of-the-art algorithms cannot differentiate AI-generated artifacts from the human-expert benchmark. Second, we conduct a qualitative deep dive to investigate how they differ in terms of quality. We document that while the two-material appear very similar, AI generated artifacts exhibit serious failure modes that could be difficult to detect. We characterize these as: premature requirements definition, unsubstantiated numerical estimates, and propensity to overspecify. We contend that this study tells a cautionary tale about why the SE community must be more cautious adopting AI suggested feedback, at least when generated by multi-purpose LLMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.760982036590576, 1.5136005878448486], "openalex_id": "https://openalex.org/W4407633227", "title": "Organize the Web: Constructing Domains Enhances Pre-Training Data Curation", "authors": "Alexander Wettig, Kyle Lo, Sewon Min, Hannaneh Hajishirzi, Danqi Chen, Luca Soldaini", "abstract": "Modern language models are trained on large, unstructured datasets consisting of trillions of tokens and obtained by crawling the web. The unstructured nature makes it difficult to reason about their contents and develop systematic approaches to data curation. In this paper, we unpack monolithic web corpora by developing taxonomies of their contents and organizing them into domains. We introduce WebOrganizer, a framework for organizing web pages in terms of both their topic and format. Using these two complementary notions of domains, we automatically annotate pre-training data by distilling annotations from a large language model into efficient classifiers. This allows us to study how data from different domains should be mixed to improve models on downstream tasks, and we show that we can combine insights about effective topics and formats to further boost performance. We demonstrate that our domain mixing also improves existing methods that select data based on quality. Furthermore, we study and compare how quality-based methods will implicitly change the domain mixture. Overall, our work demonstrates that constructing and mixing domains provides a valuable complement to quality-based data curation methods, opening new avenues for effective and insightful pre-training data curation.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.680873870849609, 3.6434667110443115], "openalex_id": "https://openalex.org/W4407626534", "title": "HYLR-FO: Hybrid Approach Using Language Models and Rule-Based Systems for On-Device Food Ordering", "authors": "Subhin Yang, Donghwan Kim, Sung-Ju Lee", "abstract": "Recent research has explored combining large language models (LLMs) with speech recognition for various services, but such applications require a strong network environment for quality service delivery. For on-device services, which do not rely on networks, resource limitations must be considered. This study proposes HYLR-FO, an efficient model that integrates a smaller language model (LM) and a rule-based system (RBS) to enable fast and reliable voice-based order processing in resource-constrained environments, approximating the performance of LLMs. By considering potential error scenarios and leveraging flexible natural language processing (NLP) and inference validation, this approach ensures both efficiency and robustness in order execution. Smaller LMs are used instead of LLMs to reduce resource usage. The LM transforms speech input, received via automatic speech recognition (ASR), into a consistent form that can be processed by the RBS. The RBS then extracts the order and validates the extracted information. The experimental results show that HYLR-FO, trained and tested on 5000 order data samples, achieves up to 86% accuracy, comparable to the 90% accuracy of LLMs. Additionally, HYLR-FO achieves a processing speed of up to 55 orders per second, significantly outperforming LLM-based approaches, which handle only 1.14 orders per second. This results in a 48.25-fold improvement in processing speed in resource-constrained environments. This study demonstrates that HYLR-FO provides faster processing and achieves accuracy similar to LLMs in resource-constrained on-device environments. This finding has theoretical implications for optimizing LM efficiency in constrained settings and practical implications for real-time low-resource AI applications. Specifically, the design of HYLR-FO suggests its potential for efficient deployment in various commercial environments, achieving fast response times and low resource consumption with smaller models.", "venue": "Electronics", "label": 19}, {"loc": [6.474853992462158, 5.297027111053467], "openalex_id": "https://openalex.org/W4407632432", "title": "Granite Vision: a lightweight, open-source multimodal model for enterprise Intelligence", "authors": "Granite Vision Team, Leonid Karlinsky, Assaf Arbelle, A Daniels, Ahmed Nassar, Amit Alfassi, Bo Wu, Eli Schwartz, Dhiraj Joshi, Jovana Kondic, Nimrod Shabtay, Pengyuan Li, Roei Herzig, Shafiq Abedin, Shaked Perek, Sivan Harary, Udi Barzelay, Adi Raz Goldfarb, Aude Oliva, Ben Wieles, Bishwaranjan Bhattacharjee, Brandon Huang, Christoph Auer, Dan Gutfreund, David Beymer, David Wood, Hilde Kuehne, Jacob Hansen, Joseph Shtok, Ken\u2010Tsung Wong, Luis Angel D. Bathen, Mayank Mishra, Maksym Lysak, Michele Dolfi, Mikhail Yurochkin, Nikolaos Livathinos, Nimrod Harel, Ophir Azulai, Oshri Naparstek, R. Teixeira De Lima, Rameswar Panda, Sivan Doveh, Shubham Gupta, Subhro Das, Syed Zawad, Yusik Kim, Zexue He, A. F. Brooks, Gabe Goodhart, Anita Govindjee, Derek Leist, Ibrahim Anwar Ibrahim, Aya Soffer, David Cox, Kate Soule, Luis Lastras, Nirmit Desai, Shila Ofek-Koifman, Sridharan Raghavan, Tanveer Syeda-Mahmood, Peter Staar, Tal Drory, Rog\u00e9rio Feris", "abstract": "We introduce Granite Vision, a lightweight large language model with vision capabilities, specifically designed to excel in enterprise use cases, particularly in visual document understanding. Our model is trained on a comprehensive instruction-following dataset, including document-related tasks, such as content extraction from tables, charts, diagrams, sketches, and infographics, as well as general image tasks. The architecture of Granite Vision is centered around visual modality alignment with a decoder-only, 2 billion parameter Granite large language model. Additionally, we introduce a dedicated safety classification approach in test-time that leverages a sparse set of attention vectors to identify potential harmful inputs. Despite its lightweight architecture, Granite Vision achieves strong results in standard benchmarks related to visual document understanding, as well as on the LiveXiv benchmark, which is designed to avoid test set contamination by using a constantly updated corpus of recently published Arxiv papers. We are releasing the model under the Apache-2 license, allowing for both research and commercial use, while offering complete visibility into the training data and other relevant details. See https://huggingface.co/ibm-granite/ for model weights.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.591616153717041, 2.9009552001953125], "openalex_id": "https://openalex.org/W4407571428", "title": "Understanding LLM's: Comprehensive Overview", "authors": "Yu Mo, Lemao Liu, Junjie Wu, Tsz Ting Chung, Sui Zhang, Jiangnan Li, Dit\u2010Yan Yeung, Ji Zhou", "abstract": "In a systematic way, we investigate a widely asked question: Do LLMs really understand what they say?, which relates to the more familiar term Stochastic Parrot. To this end, we propose a summative assessment over a carefully designed physical concept understanding task, PhysiCo. Our task alleviates the memorization issue via the usage of grid-format inputs that abstractly describe physical phenomena. The grids represents varying levels of understanding, from the core phenomenon, application examples to analogies to other abstract patterns in the grid world. A comprehensive study on our task demonstrates: (1) state-of-the-art LLMs, including GPT-4o, o1 and Gemini 2.0 flash thinking, lag behind humans by ~40%; (2) the stochastic parrot phenomenon is present in LLMs, as they fail on our grid task but can describe and recognize the same concepts well in natural language; (3) our task challenges the LLMs due to intrinsic difficulties rather than the unfamiliar grid format, as in-context learning and fine-tuning on same formatted data added little to their performance.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.6221532821655273, 4.511600494384766], "openalex_id": "https://openalex.org/W4407571378", "title": "Scalable Private Partition Selection via Adaptive Weighting", "authors": "Jing Chen, Vincent Cohen-Addad, Alessandro Epasto, Morteza Zadimoghaddam", "abstract": "In the differentially private partition selection problem (a.k.a. private set union, private key discovery), users hold subsets of items from an unbounded universe. The goal is to output as many items as possible from the union of the users' sets while maintaining user-level differential privacy. Solutions to this problem are a core building block for many privacy-preserving ML applications including vocabulary extraction in a private corpus, computing statistics over categorical data and learning embeddings over user-provided items. We propose an algorithm for this problem, MaxAdaptiveDegree (MAD), which adaptively reroutes weight from items with weight far above the threshold needed for privacy to items with smaller weight, thereby increasing the probability that less frequent items are output. Our algorithm can be efficiently implemented in massively parallel computation systems allowing scalability to very large datasets. We prove that our algorithm stochastically dominates the standard parallel algorithm for this problem. We also develop a two-round version of our algorithm, MAD2R, where results of the computation in the first round are used to bias the weighting in the second round to maximize the number of items output. In experiments, our algorithms provide the best results among parallel algorithms and scale to datasets with hundreds of billions of items, up to three orders of magnitude larger than those analyzed by prior sequential algorithms.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.696360111236572, 5.405185222625732], "openalex_id": "https://openalex.org/W4407570919", "title": "Skrr: Skip and Re-use Text Encoder Layers for Memory Efficient Text-to-Image Generation", "authors": "Hoigi Seo, Wongi Jeong, Jae-sun Seo, Se Young Chun", "abstract": "Large-scale text encoders in text-to-image (T2I) diffusion models have demonstrated exceptional performance in generating high-quality images from textual prompts. Unlike denoising modules that rely on multiple iterative steps, text encoders require only a single forward pass to produce text embeddings. However, despite their minimal contribution to total inference time and floating-point operations (FLOPs), text encoders demand significantly higher memory usage, up to eight times more than denoising modules. To address this inefficiency, we propose Skip and Re-use layers (Skrr), a simple yet effective pruning strategy specifically designed for text encoders in T2I diffusion models. Skrr exploits the inherent redundancy in transformer blocks by selectively skipping or reusing certain layers in a manner tailored for T2I tasks, thereby reducing memory consumption without compromising performance. Extensive experiments demonstrate that Skrr maintains image quality comparable to the original model even under high sparsity levels, outperforming existing blockwise pruning methods. Furthermore, Skrr achieves state-of-the-art memory efficiency while preserving performance across multiple evaluation metrics, including the FID, CLIP, DreamSim, and GenEval scores.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.703076362609863, 2.8322036266326904], "openalex_id": "https://openalex.org/W4415035193", "title": "Sign Operator for Coping with Heavy-Tailed Noise: High Probability Convergence Bounds with Extensions to Distributed Optimization and Comparison Oracle", "authors": "Nikita Kornilov, Philip Zmushko, Andrei Semenov, M.Yu. Ikonnikov, Alexander Gasnikov, Aleksandr Beznosikov", "abstract": "In recent years, non-convex optimization problems are more often described by generalized $(L_0, L_1)$-smoothness assumption rather than standard one. Meanwhile, severely corrupted data used in these problems has increased the demand for methods capable of handling heavy-tailed noises, i.e., noises with bounded $\u03ba$-th moment. Motivated by these real-world trends and challenges, we explore sign-based methods in this setup and demonstrate their effectiveness in comparison with other popular solutions like clipping or normalization. In theory, we prove the first-known high probability convergence bounds under $(L_0, L_1)$-smoothness and heavy-tailed noises with mild parameter dependencies. In the case of standard smoothness, these bounds are novel for sign-based methods as well. In particular, SignSGD with batching achieves sample complexity $\\tilde{O}\\left(\\left(\\frac{\u0394L_0d}{\\varepsilon^2} + \\frac{\u0394L_1d^\\frac{3}{2}}{\\varepsilon}\\right)\\left[1 + \\left(\\frac\u03c3{\\varepsilon}\\right)^\\frac\u03ba{\u03ba-1}\\right]\\right), \u03ba\\in (1,2]$. Under the assumption of symmetric noises, SignSGD with Majority Voting can robustly work on the whole range of $\u03ba\\in (0,2]$ with complexity $\\tilde{O}\\left(\\left(\\frac{\u0394L_0d}{\\varepsilon^2} + \\frac{\u0394L_1d^\\frac{3}{2}}{\\varepsilon}\\right)\\left[\\frac{1}{\u03ba^2} + \\frac{\u03c3^2}{\\varepsilon^2}\\right]\\right)$. We also obtain results for parameter-agnostic setups, Polyak-Lojasiewicz functions and momentum-based methods (in expectation). Our theoretical findings are supported by the superior performance of sign-based methods in training Large Language Models compared to clipping and normalization.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.629031658172607, 5.47318172454834], "openalex_id": "https://openalex.org/W4407569519", "title": "TextAtlas5M: A Large-scale Dataset for Dense Text Image Generation", "authors": "Jinpeng Wang, Dongxing Mao, Jiawei Zhang, Weiming Han, Ziwei Dong, Linjie Li, Yiqi Lin, Zhengyuan Yang, L. Q. Qin, Fuwei Zhang, Lijuan Wang, Min Li", "abstract": "Text-conditioned image generation has gained significant attention in recent years and are processing increasingly longer and comprehensive text prompt. In everyday life, dense and intricate text appears in contexts like advertisements, infographics, and signage, where the integration of both text and visuals is essential for conveying complex information. However, despite these advances, the generation of images containing long-form text remains a persistent challenge, largely due to the limitations of existing datasets, which often focus on shorter and simpler text. To address this gap, we introduce TextAtlas5M, a novel dataset specifically designed to evaluate long-text rendering in text-conditioned image generation. Our dataset consists of 5 million long-text generated and collected images across diverse data types, enabling comprehensive evaluation of large-scale generative models on long-text image generation. We further curate 3000 human-improved test set TextAtlasEval across 3 data domains, establishing one of the most extensive benchmarks for text-conditioned generation. Evaluations suggest that the TextAtlasEval benchmarks present significant challenges even for the most advanced proprietary models (e.g. GPT4o with DallE-3), while their open-source counterparts show an even larger performance gap. These evidences position TextAtlas5M as a valuable dataset for training and evaluating future-generation text-conditioned image generation models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.847533702850342, 0.37000083923339844], "openalex_id": "https://openalex.org/W4407570734", "title": "Retrieval-Based Grammatical Error Correction", "authors": "Wei Li, Wen Luo, Guangyue Peng, Houfeng Wang", "abstract": "Grammatical error correction (GEC) aims to correct grammatical, spelling, and semantic errors in natural language text. With the growing of large language models (LLMs), direct text generation has gradually become the focus of the GEC methods, and few-shot in-context learning presents a cost-effective solution. However, selecting effective in-context examples remains challenging, as the similarity between input texts does not necessarily correspond to similar grammatical error patterns. In this paper, we propose a novel retrieval method based on natural language grammatical error explanations (GEE) to address this issue. Our method retrieves suitable few-shot demonstrations by matching the GEE of the test input with that of pre-constructed database samples, where explanations for erroneous samples are generated by LLMs. We conducted multilingual GEC few-shot experiments on both major open-source and closed-source LLMs. Experiments across five languages show that our method outperforms existing semantic and BM25-based retrieval techniques, without requiring additional training or language adaptation. This also suggests that matching error patterns is key to selecting examples.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.18590259552002, 3.0031917095184326], "openalex_id": "https://openalex.org/W4407570152", "title": "Unlocking Scaling Law in Industrial Recommendation Systems with a Three-step Paradigm based Large User Model", "authors": "Bencheng Yan, Shilei Liu, Zhiyuan Zeng, Zihao Wang, Yizhen Zhang, Yujin Yuan, Langming Liu, Jiaqi Liu, Di Wang, Wenbo Su, Pengjie Wang, Jian Xu, B. Zheng", "abstract": "Recent advancements in autoregressive Large Language Models (LLMs) have achieved significant milestones, largely attributed to their scalability, often referred to as the \"scaling law\". Inspired by these achievements, there has been a growing interest in adapting LLMs for Recommendation Systems (RecSys) by reformulating RecSys tasks into generative problems. However, these End-to-End Generative Recommendation (E2E-GR) methods tend to prioritize idealized goals, often at the expense of the practical advantages offered by traditional Deep Learning based Recommendation Models (DLRMs) in terms of in features, architecture, and practices. This disparity between idealized goals and practical needs introduces several challenges and limitations, locking the scaling law in industrial RecSys. In this paper, we introduce a large user model (LUM) that addresses these limitations through a three-step paradigm, designed to meet the stringent requirements of industrial settings while unlocking the potential for scalable recommendations. Our extensive experimental evaluations demonstrate that LUM outperforms both state-of-the-art DLRMs and E2E-GR approaches. Notably, LUM exhibits excellent scalability, with performance improvements observed as the model scales up to 7 billion parameters. Additionally, we have successfully deployed LUM in an industrial application, where it achieved significant gains in an A/B test, further validating its effectiveness and practicality.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.9885334968566895, 0.765548050403595], "openalex_id": "https://openalex.org/W4407569679", "title": "Training Sparse Mixture Of Experts Text Embedding Models", "authors": "Zach Nussbaum, Brandon Duderstadt", "abstract": "Transformer-based text embedding models have improved their performance on benchmarks like MIRACL and BEIR by increasing their parameter counts. However, this scaling approach introduces significant deployment challenges, including increased inference latency and memory usage. These challenges are particularly severe in retrieval-augmented generation (RAG) applications, where large models' increased memory requirements constrain dataset ingestion capacity, and their higher latency directly impacts query-time performance. While causal language models have addressed similar efficiency challenges using Mixture of Experts (MoE) architectures, this approach hasn't been successfully adapted to the general text embedding setting. In this paper, we introduce Nomic Embed v2, the first general purpose MoE text embedding model. Our model outperforms models in the same parameter class on both monolingual and multilingual benchmarks while also maintaining competitive performance with models twice its size. We open-source all code, models, and evaluation data to ensure full reproducibility of our training pipeline at \\href{https://github.com/nomic-ai/contrastors}{https://github.com/nomic-ai/contrastors}.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.288808822631836, 1.2330033779144287], "openalex_id": "https://openalex.org/W4407569752", "title": "Break the Checkbox: Challenging Closed-Style Evaluations of Cultural Alignment in LLMs", "authors": "Mohsinul Kabir, Ajwad Abrar, Sophia Ananiadou", "abstract": "A large number of studies rely on closed-style multiple-choice surveys to evaluate cultural alignment in Large Language Models (LLMs). In this work, we challenge this constrained evaluation paradigm and explore more realistic, unconstrained approaches. Using the World Values Survey (WVS) and Hofstede Cultural Dimensions as case studies, we demonstrate that LLMs exhibit stronger cultural alignment in less constrained settings, where responses are not forced. Additionally, we show that even minor changes, such as reordering survey choices, lead to inconsistent outputs, exposing the limitations of closed-style evaluations. Our findings advocate for more robust and flexible evaluation frameworks that focus on specific cultural proxies, encouraging more nuanced and accurate assessments of cultural alignment in LLMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.67948055267334, 3.110485076904297], "openalex_id": "https://openalex.org/W4407569612", "title": "Intrinsic Bias is Predicted by Pretraining Data and Correlates with Downstream Performance in Vision-Language Encoders", "authors": "Kshitish Ghate, Isaac Slaughter, Kyra Wilson, Mona Diab, Aylin Caliskan", "abstract": "While recent work has found that vision-language models trained under the Contrastive Language Image Pre-training (CLIP) framework contain intrinsic social biases, the extent to which different upstream pre-training features of the framework relate to these biases, and hence how intrinsic bias and downstream performance are connected has been unclear. In this work, we present the largest comprehensive analysis to-date of how the upstream pre-training factors and downstream performance of CLIP models relate to their intrinsic biases. Studying 131 unique CLIP models, trained on 26 datasets, using 55 architectures, and in a variety of sizes, we evaluate bias in each model using 26 well-established unimodal and cross-modal principled Embedding Association Tests. We find that the choice of pre-training dataset is the most significant upstream predictor of bias, whereas architectural variations have minimal impact. Additionally, datasets curated using sophisticated filtering techniques aimed at enhancing downstream model performance tend to be associated with higher levels of intrinsic bias. Finally, we observe that intrinsic bias is often significantly correlated with downstream performance ($0.3 \\leq r \\leq 0.8$), suggesting that models optimized for performance inadvertently learn to amplify representational biases. Comparisons between unimodal and cross-modal association tests reveal that social group bias depends heavily on the modality. Our findings imply that more sophisticated strategies are needed to address intrinsic model bias for vision-language models across the entire model development pipeline.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.2875237464904785, 1.9098329544067383], "openalex_id": "https://openalex.org/W4407570518", "title": "Salamandra Technical Report", "authors": "Aitor Gonz\u00e1lez-Agirre, Marc P\u00e0mies, Joan Llop, Irene Baucells, Severino Da Dalt, Daniel C. Londo\u00f1o Tamayo, Javier S\u00e1iz, Ferran Espu\u00f1a, Jaume Prats, Javier Aula-Blasco, Mario Mina, I\u00f1igo Pikabea, Adri\u00e1n Rubio, Alexander Shvets, Anna Sall\u00e9s, I\u00f1aki Lacunza, Jorge Palomar, J\u00falia Falc\u00e3o, Luc\u00eda Tormo, Luis Vasquez-Reina, Montserrat Marimon, Oriol Pareras, Valle Ruiz-Fern\u00e1ndez, Marta Villegas", "abstract": "This work introduces Salamandra, a suite of open-source decoder-only large language models available in three different sizes: 2, 7, and 40 billion parameters. The models were trained from scratch on highly multilingual data that comprises text in 35 European languages and code. Our carefully curated corpus is made exclusively from open-access data compiled from a wide variety of sources. Along with the base models, supplementary checkpoints that were fine-tuned on public-domain instruction data are also released for chat applications. Additionally, we also share our preliminary experiments on multimodality, which serve as proof-of-concept to showcase potential applications for the Salamandra family. Our extensive evaluations on multilingual benchmarks reveal that Salamandra has strong capabilities, achieving competitive performance when compared to similarly sized open-source models. We provide comprehensive evaluation results both on standard downstream tasks as well as key aspects related to bias and safety.With this technical report, we intend to promote open science by sharing all the details behind our design choices, data curation strategy and evaluation methodology. In addition to that, we deviate from the usual practice by making our training and evaluation scripts publicly accessible. We release all models under a permissive Apache 2.0 license in order to foster future research and facilitate commercial use, thereby contributing to the open-source ecosystem of large language models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.795274257659912, 5.444836616516113], "openalex_id": "https://openalex.org/W4407425748", "title": "Scaling Pre-training to One Hundred Billion Data for Vision Language Models", "authors": "Xiao Wang, Ibrahim Alabdulmohsin, Daniel Salz, Zhe Li, Keran Rong, Xiaohua Zhai", "abstract": "We provide an empirical investigation of the potential of pre-training vision-language models on an unprecedented scale: 100 billion examples. We find that model performance tends to saturate at this scale on many common Western-centric classification and retrieval benchmarks, such as COCO Captions. Nevertheless, tasks of cultural diversity achieve more substantial gains from the 100-billion scale web data, thanks to its coverage of long-tail concepts. Furthermore, we analyze the model's multilinguality and show gains in low-resource languages as well. In addition, we observe that reducing the size of the pretraining dataset via quality filters like using CLIP, typically used to enhance performance, may inadvertently reduce the cultural diversity represented even in large-scale datasets. Our results highlight that while traditional benchmarks may not benefit significantly from scaling noisy, raw web data to 100 billion examples, this data scale is vital for building truly inclusive multimodal systems.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.047635078430176, 2.539552688598633], "openalex_id": "https://openalex.org/W4407424734", "title": "LongReD: Mitigating Short-Text Degradation of Long-Context Large Language Models via Restoration Distillation", "authors": "Zican Dong, Xia Li, Jinhao Jiang, Mingyu Xu, Wayne Xin Zhao, Bingning Wang, Weipeng Chen", "abstract": "Large language models (LLMs) have gained extended context windows through scaling positional encodings and lightweight continual pre-training. However, this often leads to degraded performance on short-text tasks, while the reasons for this degradation remain insufficiently explored. In this work, we identify two primary factors contributing to this issue: distribution drift in hidden states and attention scores, and catastrophic forgetting during continual pre-training. To address these challenges, we propose Long Context Pre-training with Restoration Distillation (LongReD), a novel approach designed to mitigate short-text performance degradation through minimizing the distribution discrepancy between the extended and original models. Besides training on long texts, LongReD distills the hidden state of selected layers from the original model on short texts. Additionally, LongReD also introduces a short-to-long distillation, aligning the output distribution on short texts with that on long texts by leveraging skipped positional indices. Experiments on common text benchmarks demonstrate that LongReD effectively preserves the model's short-text performance while maintaining comparable or even better capacity to handle long texts than baselines. Our code is available at https://github.com/RUCAIBox/LongReD.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.081242561340332, -0.5605508685112], "openalex_id": "https://openalex.org/W4407631937", "title": "Cancer Vaccine Adjuvant Name Recognition from Biomedical Literature using Large Language Models", "authors": "Hasin Rehana, Jie Zheng, LianShung Yeh, Benu Bansal, Nur Bengisu \u00c7am, Christianah Jemiyo, Brett A. McGregor, Arzucan \u00d6zg\u00fcr, Yongqun He, Junguk Hur", "abstract": "Source code is available at https://github.com/hurlab/Vaccine-Adjuvant-LLM.", "venue": "PubMed", "label": 0}, {"loc": [8.452831268310547, 3.59713077545166], "openalex_id": "https://openalex.org/W4407424218", "title": "A Memory Efficient Randomized Subspace Optimization Method for Training Large Language Models", "authors": "Yiming Chen, Yuan Zhang, Yin Liu, Kun Yuan, Zaiwen Wen", "abstract": "The memory challenges associated with training Large Language Models (LLMs) have become a critical concern, particularly when using the Adam optimizer. To address this issue, numerous memory-efficient techniques have been proposed, with GaLore standing out as a notable example designed to reduce the memory footprint of optimizer states. However, these approaches do not alleviate the memory burden imposed by activations, rendering them unsuitable for scenarios involving long context sequences or large mini-batches. Moreover, their convergence properties are still not well-understood in the literature. In this work, we introduce a Randomized Subspace Optimization framework for pre-training and fine-tuning LLMs. Our approach decomposes the high-dimensional training problem into a series of lower-dimensional subproblems. At each iteration, a random subspace is selected, and the parameters within that subspace are optimized. This structured reduction in dimensionality allows our method to simultaneously reduce memory usage for both activations and optimizer states. We establish comprehensive convergence guarantees and derive rates for various scenarios, accommodating different optimization strategies to solve the subproblems. Extensive experiments validate the superior memory and communication efficiency of our method, achieving performance comparable to GaLore and Adam.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.350524425506592, 1.2198081016540527], "openalex_id": "https://openalex.org/W4409211663", "title": "Breaking the Programming Language Barrier: Multilingual Prompting to Empower Non-Native English Learners", "authors": "James Prather, Brent N. Reeves, Paul Denny, Juho Leinonen, Stephen MacNeil, Andrew Luxton-Reilly, Jo\u00e3o Orvalho, Mohammad Amin Alipour, Ali Alfageeh, Thezyrie Amarouche, Bailey Kimmel, Jared Wright, Musa Blake, Gweneth Barbre", "abstract": "Publisher Copyright: \u00a9 2025 Copyright held by the owner/author(s).", "venue": "https://doi.org/10.1145/3716640.3716649", "label": 0}, {"loc": [8.520615577697754, 2.458760976791382], "openalex_id": "https://openalex.org/W4407358678", "title": "PiKE: Adaptive Data Mixing for Multi-Task Learning Under Low Gradient Conflicts", "authors": "Zeman Li, Yuan Deng, Peilin Zhong, Meisam Razaviyayn, Vahab Mirrokni", "abstract": "Modern foundation models are trained on diverse datasets to enhance generalization across tasks and domains A central challenge in this process is determining how to effectively mix and sample data from multiple sources This naturally leads to a multitask learning (MTL) perspective While prior work in MTL has emphasized mitigating gradient conflicts we observe that largescale pretraining scenariossuch as multilingual or multidomain trainingoften exhibit little to no gradient conflict Motivated by this observation we propose PiKE (Positive gradient interaction-based K-task weights Estimator) an adaptive data mixing algorithm that dynamically adjusts sampling weights during training PiKE exploits nonconflicting gradient interactions to minimize a neartight upper bound on the average loss decrease at each step while incurring negligible computational overhead We provide theoretical convergence guarantees and show that PiKE outperforms static and nonadaptive mixing baselines Furthermore we extend PiKE to promote balanced learning across tasks Extensive experiments on largescale language model pretraining confirm that PiKE achieves faster convergence and improved downstream performance compared to existing approaches", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.1213481426239014, 2.3163201808929443], "openalex_id": "https://openalex.org/W4407386075", "title": "The Human Labour of Data Work: Capturing Cultural Diversity through World Wide Dishes", "authors": "Siobhan Mackenzie Hall, Samantha Dalal, Raesetje Sefala, Foutse Yuehgoh, Aisha Alaagib, Imane Hamzaoui, Shu Ishida, Jabez Magomere, Lauren Crais, Aya Salama, Tejumade Afonja", "abstract": "This paper provides guidance for building and maintaining infrastructure for participatory AI efforts by sharing reflections on building World Wide Dishes (WWD), a bottom-up, community-led image and text dataset of culinary dishes and associated cultural customs. We present WWD as an example of participatory dataset creation, where community members both guide the design of the research process and contribute to the crowdsourced dataset. This approach incorporates localised expertise and knowledge to address the limitations of web-scraped Internet datasets acknowledged in the Participatory AI discourse. We show that our approach can result in curated, high-quality data that supports decentralised contributions from communities that do not typically contribute to datasets due to a variety of systemic factors. Our project demonstrates the importance of participatory mediators in supporting community engagement by identifying the kinds of labour they performed to make WWD possible. We surface three dimensions of labour performed by participatory mediators that are crucial for participatory dataset construction: building trust with community members, making participation accessible, and contextualising community values to support meaningful data collection. Drawing on our findings, we put forth five lessons for building infrastructure to support future participatory AI efforts.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.47292947769165, 4.4205145835876465], "openalex_id": "https://openalex.org/W4407359596", "title": "Towards Internet-Scale Training For Agents", "authors": "Brandon Trabucco, Gunnar Sigur\u00f0sson, Robinson Piramuthu, Ruslan Salakhutdinov", "abstract": "The predominant approach for training web navigation agents is to gather human demonstrations for a set of popular websites and hand-written tasks, but it is becoming clear that human data is an inefficient resource. We develop a pipeline to facilitate internet-scale training for agents without laborious human annotations. In the first stage, an LLM annotates 150k sites with agentic tasks. In the next stage, LLM agents complete tasks and produce trajectories. In the final stage, an LLM filters trajectories by judging their success. Language models are powerful data curation tools, identifying harmful content with an accuracy of 97%, judging successful trajectories with an accuracy of 82.6%, and producing effective data. We train agents based on Qwen 3 1.7B that are competitive with frontier LLMs as web agents, while being smaller and faster. Our top agent reaches a success rate of 56.9%, outperforming the data collection policy Qwen 3 235B, a 235 times larger Llama 4 Maverick, and reaching 94.7% of the performance of Gemini 2.5 Flash. We are releasing code, models and data at: https://data-for-agents.github.io.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.285372734069824, 1.869526982307434], "openalex_id": "https://openalex.org/W4407385164", "title": "XiHeFusion: Harnessing Large Language Models for Science Communication in Nuclear Fusion", "authors": "Xiao Wang, Qingquan Yang, Fuling Wang, Qiang Chen, Wann\u2010Yih Wu, Yu Jin, Jun Jiang, Liang Jin, Bo Jiang, Dengdi Sun, Wenzhi Lv, Min Chen, Zehua Chen, Guosheng Xu, Jin Tang", "abstract": "Nuclear fusion is one of the most promising ways for humans to obtain infinite energy. Currently, with the rapid development of artificial intelligence, the mission of nuclear fusion has also entered a critical period of its development. How to let more people to understand nuclear fusion and join in its research is one of the effective means to accelerate the implementation of fusion. This paper proposes the first large model in the field of nuclear fusion, XiHeFusion, which is obtained through supervised fine-tuning based on the open-source large model Qwen2.5-14B. We have collected multi-source knowledge about nuclear fusion tasks to support the training of this model, including the common crawl, eBooks, arXiv, dissertation, etc. After the model has mastered the knowledge of the nuclear fusion field, we further used the chain of thought to enhance its logical reasoning ability, making XiHeFusion able to provide more accurate and logical answers. In addition, we propose a test questionnaire containing 180+ questions to assess the conversational ability of this science popularization large model. Extensive experimental results show that our nuclear fusion dialogue model, XiHeFusion, can perform well in answering science popularization knowledge. The pre-trained XiHeFusion model is released on https://github.com/Event-AHU/XiHeFusion.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.472729682922363, 2.7660152912139893], "openalex_id": "https://openalex.org/W4407359523", "title": "Dynamic Loss-Based Sample Reweighting for Improved Large Language Model Pretraining", "authors": "Daouda Sow, Herbert Woisetschl\u00e4ger, Saikiran Bulusu, Shiqiang Wang, Hans\u2010Arno Jacobsen, Yingbin Liang", "abstract": "Pretraining large language models (LLMs) on vast and heterogeneous datasets is crucial for achieving state-of-the-art performance across diverse downstream tasks. However, current training paradigms treat all samples equally, overlooking the importance or relevance of individual samples throughout the training process. Existing reweighting strategies, which primarily focus on group-level data importance, fail to leverage fine-grained instance-level information and do not adapt dynamically to individual sample importance as training progresses. In this paper, we introduce novel algorithms for dynamic, instance-level data reweighting aimed at improving both the efficiency and effectiveness of LLM pretraining. Our methods adjust the weight of each training sample based on its loss value in an online fashion, allowing the model to dynamically focus on more informative or important samples at the current training stage. In particular, our framework allows us to systematically devise reweighting strategies deprioritizing redundant or uninformative data, which we find tend to work best. Furthermore, we develop a new theoretical framework for analyzing the impact of loss-based reweighting on the convergence of gradient-based optimization, providing the first formal characterization of how these strategies affect convergence bounds. We empirically validate our approach across a spectrum of tasks, from pretraining 7B and 1.4B parameter LLMs to smaller-scale language models and linear regression problems, demonstrating that our loss-based reweighting approach can lead to faster convergence and significantly improved performance.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.342644214630127, 4.2501068115234375], "openalex_id": "https://openalex.org/W4407359220", "title": "Hephaestus: Improving Fundamental Agent Capabilities of Large Language Models through Continual Pre-Training", "authors": "Yuchen Zhuang, Jingfeng Yang, Haoming Jiang, Xin Liu, Kewei Cheng, Sanket Lokegaonkar, Yifan Gao, Ping Qing, Tianyi Liu, Binxuan Huang, Zheng Li, Zhengyang Wang, Pei\u2010Yu Chen, Ruijie Wang, Rongzhi Zhang, Nasser Zalmout, Priyanka Nigam, Bing Yin, Chao Zhang", "abstract": "Due to the scarcity of agent-oriented pre-training data, LLM-based autonomous agents typically rely on complex prompting or extensive fine-tuning, which often fails to introduce new capabilities while preserving strong generalizability. We introduce Hephaestus-Forge, the first large-scale pre-training corpus designed to enhance the fundamental capabilities of LLM agents in API function calling, intrinsic reasoning and planning, and adapting to environmental feedback. Hephaestus-Forge comprises 103B agent-specific data encompassing 76,537 APIs, including both tool documentation to introduce knowledge of API functions and function calling trajectories to strengthen intrinsic reasoning. To explore effective training protocols, we investigate scaling laws to identify the optimal recipe in data mixing ratios. By continual pre-training on Hephaestus-Forge, Hephaestus outperforms small- to medium-scale open-source LLMs and rivals commercial LLMs on three agent benchmarks, demonstrating the effectiveness of our pre-training corpus in enhancing fundamental agentic capabilities and generalization of LLMs to new tasks or environments.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.921571254730225, 0.3875541090965271], "openalex_id": "https://openalex.org/W4407309458", "title": "Enhancing Impression Change Prediction in Speed Dating Simulations Based on Speakers' Personalities", "authors": "Kazuya Matsuo, Yoko Ishii, Atsushi Otsuka, Ryo Ishii, Hiroaki Sugiyama, Masahiro Mizukami, Tsunehiro Arimoto, Narichika Nomoto, Yukio Sato, Tatsuro Yamaguchi", "abstract": "This paper focuses on simulating text dialogues in which impressions between speakers improve during speed dating. This simulation involves selecting an utterance from multiple candidates generated by a text generation model that replicates a specific speaker's utterances, aiming to improve the impression of the speaker. Accurately selecting an utterance that improves the impression is crucial for the simulation. We believe that whether an utterance improves a dialogue partner's impression of the speaker may depend on the personalities of both parties. However, recent methods for utterance selection do not consider the impression per utterance or the personalities. To address this, we propose a method that predicts whether an utterance improves a partner's impression of the speaker, considering the personalities. The evaluation results showed that personalities are useful in predicting impression changes per utterance. Furthermore, we conducted a human evaluation of simulated dialogues using our method. The results showed that it could simulate dialogues more favorably received than those selected without considering personalities.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.465640544891357, 2.792797565460205], "openalex_id": "https://openalex.org/W4407308520", "title": "Understanding and Mitigating the Bias Inheritance in LLM-based Data Augmentation on Downstream Tasks", "authors": "Miaomiao Li, Xuejiao Chen, Yang Wang, Tianhao Zhu, Weijia Zhang, Kaijie Zhu, Kam\u2010Fai Wong, Jindong Wang", "abstract": "Generating synthetic datasets via large language models (LLMs) themselves has emerged as a promising approach to improve LLM performance. However, LLMs inherently reflect biases present in their training data, leading to a critical challenge: when these models generate synthetic data for training, they may propagate and amplify their inherent biases that can significantly impact model fairness and robustness on downstream tasks--a phenomenon we term bias inheritance. This work presents the first systematic investigation in understanding, analyzing, and mitigating bias inheritance. We study this problem by fine-tuning LLMs with a combined dataset consisting of original and LLM-augmented data, where bias ratio represents the proportion of augmented data. Through systematic experiments across 10 classification and generation tasks, we analyze how 6 different types of biases manifest at varying bias ratios. Our results reveal that bias inheritance has nuanced effects on downstream tasks, influencing both classification tasks and generation tasks differently. Then, our analysis identifies three key misalignment factors: misalignment of values, group data, and data distributions. Based on these insights, we propose three mitigation strategies: token-based, mask-based, and loss-based approaches. Experiments demonstrate that these strategies also work differently on various tasks and bias, indicating the substantial challenges to fully mitigate bias inheritance. We hope this work can provide valuable insights to the research of LLM data augmentation.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.421591758728027, -1.446420431137085], "openalex_id": "https://openalex.org/W4407397248", "title": "Machine Learning Method Employed for the Objective of Identifying Text on Tweet Dataset", "authors": "Sakshi Pandey", "abstract": "When it comes to training ML systems, internet-based data is invaluable. Despite the difficulty in collecting this information, teams of experts from academic institutions and research labs have created publicly accessible databases. Twitter and other social media platforms provided large quantities of useful information throughout the pandemic, which was used to evaluate healthcare decisions. In order to forecast illness prevalence and offer early warnings, we suggest analysing user attitudes by using efficient supervised machine learning algorithms. The gathered tweets were sorted into positive, negative, and neutral categories for preprocessing. Hybrid feature extraction is the innovative aspect of our work; we used it to correctly describe posts by combining syntactic features (TF-IDF) and semantic elements (FastText and Glove), which in turn improved classification. The experimental findings suggest that when using Naive Bayes, the combination of FastText and TF-IDF achieves the best results.", "venue": "BENTHAM SCIENCE PUBLISHERS eBooks", "label": 0}, {"loc": [2.8588309288024902, 2.7726004123687744], "openalex_id": "https://openalex.org/W4391833076", "title": "From Data Creator to Data Reuser: Distance Matters", "authors": "Christine L. Borgman, Paul Groth", "abstract": "Sharing research data is necessary, but not sufficient, for data reuse. Open science policies focus more heavily on data sharing than on reuse, yet both are complex, labor-intensive, expensive, and require infrastructure investments by multiple stakeholders. The value of data reuse lies in relationships between creators and reusers. By addressing knowledge exchange, rather than mere transactions between stakeholders, investments in data management and knowledge infrastructures can be made more wisely. Drawing upon empirical studies of data sharing and reuse, we develop the metaphor of distance between data creator and data reuser, identifying six dimensions of distance that influence the ability to transfer knowledge effectively: domain, methods, collaboration, curation, purposes, and time and temporality. We explore how social and socio-technical aspects of these dimensions may decrease -- or increase -- distances to be traversed between creators and reusers. Our theoretical framing of the distance between data creators and prospective reusers leads to recommendations to four categories of stakeholders on how to make data sharing and reuse more effective: data creators, data reusers, data archivists, and funding agencies. 'It takes a village' to share research data -- and a village to reuse data. Our aim is to provoke new research questions, new research, and new investments in effective and efficient circulation of research data; and to identify criteria for investments at each stage of data and research life cycles.", "venue": "Harvard Data Science Review", "label": 0}, {"loc": [2.779665231704712, -0.338309645652771], "openalex_id": "https://openalex.org/W4407251580", "title": "Unveiling GPT-4V's hidden challenges behind high accuracy on USMLE questions: Observational Study", "authors": "Zhichao Yang, Zonghai Yao, Mahbuba Tasmin, Parth Vashisht, Won Seok Jang, Feiyun Ouyang, Baiyang Wang, David D. McManus, Dan R. Berlowitz, Hong Yu", "abstract": "Background Recent advancements in artificial intelligence, such as GPT-3.5 Turbo (OpenAI) and GPT-4, have demonstrated significant potential by achieving good scores on text-only United States Medical Licensing Examination (USMLE) exams and effectively answering questions from physicians. However, the ability of these models to interpret medical images remains underexplored. Objective This study aimed to comprehensively evaluate the performance, interpretability, and limitations of GPT-3.5 Turbo, GPT-4, and its successor, GPT-4 Vision (GPT-4V), specifically focusing on GPT-4V\u2019s newly introduced image-understanding feature. By assessing the models on medical licensing examination questions that require image interpretation, we sought to highlight the strengths and weaknesses of GPT-4V in handling complex multimodal clinical information, thereby exposing hidden flaws and providing insights into its readiness for integration into clinical settings. Methods This cross-sectional study tested GPT-4V, GPT-4, and ChatGPT-3.5 Turbo on a total of 227 multiple-choice questions with images from USMLE Step 1 (n=19), Step 2 clinical knowledge (n=14), Step 3 (n=18), the Diagnostic Radiology Qualifying Core Exam (DRQCE) (n=26), and AMBOSS question banks (n=150). AMBOSS provided expert-written hints and question difficulty levels. GPT-4V\u2019s accuracy was compared with 2 state-of-the-art large language models, GPT-3.5 Turbo and GPT-4. The quality of the explanations was evaluated by choosing human preference between an explanation by GPT-4V (without hint), an explanation by an expert, or a tie, using 3 qualitative metrics: comprehensive explanation, question information, and image interpretation. To better understand GPT-4V\u2019s explanation ability, we modified a patient case report to resemble a typical \u201ccurbside consultation\u201d between physicians. Results For questions with images, GPT-4V achieved an accuracy of 84.2%, 85.7%, 88.9%, and 73.1% in Step 1, Step 2 clinical knowledge, Step 3 of USMLE, and DRQCE, respectively. It outperformed GPT-3.5 Turbo (42.1%, 50%, 50%, 19.2%) and GPT-4 (63.2%, 64.3%, 66.7%, 26.9%). When GPT-4V answered correctly, its explanations were nearly as good as those provided by domain experts from AMBOSS. However, incorrect answers often had poor explanation quality: 18.2% (10/55) contained inaccurate text, 45.5% (25/55) had inference errors, and 76.3% (42/55) demonstrated image misunderstandings. With human expert assistance, GPT-4V reduced errors by an average of 40% (22/55). GPT-4V accuracy improved with hints, maintaining stable performance across difficulty levels, while medical student performance declined as difficulty increased. In a simulated curbside consultation scenario, GPT-4V required multiple specific prompts to interpret complex case data accurately. Conclusions GPT-4V achieved high accuracy on multiple-choice questions with images, highlighting its potential in medical assessments. However, significant shortcomings were observed in the quality of explanations when questions were answered incorrectly, particularly in the interpretation of images, which could not be efficiently resolved through expert interaction. These findings reveal hidden flaws in the image interpretation capabilities of GPT-4V, underscoring the need for more comprehensive evaluations beyond multiple-choice questions before integrating GPT-4V into clinical settings.", "venue": "Journal of Medical Internet Research", "label": 13}, {"loc": [6.232367992401123, -1.1407496929168701], "openalex_id": "https://openalex.org/W4407255507", "title": "A comprehensive survey on Arabic text augmentation: approaches, challenges, and applications", "authors": "Ahmed Adel ElSabagh, Shahira Shaaban Azab, Hesham A. Hefny", "abstract": "Abstract Arabic is a linguistically complex language with a rich structure and valuable syntax that pose unique challenges for natural language processing (NLP), primarily due to the scarcity of large, reliable annotated datasets essential for training models. The varieties of dialects and mixtures of more than one language within a single conversation further complicate the development and efficacy of deep learning models targeting Arabic. Data augmentation (DA) techniques have emerged as a promising solution to tackle data scarcity and improve model performance. However, implementing DA in Arabic NLP presents its challenges, particularly in maintaining semantic integrity and adapting to the language\u2019s intricate morphological structure. This survey comprehensively examines various aspects of Arabic data augmentation techniques, covering strategies for model training, methods for evaluating augmentation performance, understanding the effects and applications of augmentation on data, studying NLP downstream tasks, addressing augmentation problems, proposing solutions, conducting in-depth literature reviews, and drawing conclusions. Through detailed analysis of 75 primary and 9 secondary papers, we categorize DA methods into diversity enhancement, resampling, and secondary approaches, each targeting specific challenges inherent in augmenting Arabic datasets. The goal is to offer insights into DA effectiveness, identify research gaps, and suggest future directions for advancing NLP in Arabic.", "venue": "Neural Computing and Applications", "label": 0}, {"loc": [3.7707507610321045, -3.964271068572998], "openalex_id": "https://openalex.org/W4408240521", "title": "Recent Trends on Artificial Intelligence in Automated Hate Speech Detection", "authors": "Nishant Goyal, A. V. S. Sreedhar Kumar, Aarushi Chaddha, D. Lakshmi", "abstract": "This study investigates the performance of AI in detecting HS in diverse cultural and contextual settings. Existing AI models, trained primarily on English datasets, struggle with regional dialects, idiomatic phrases, and cultural nuances. A systematic review of NLP techniques, including traditional methods (n-grams, Bag of Words) and advanced architectures (BERT, GPT, RoBERTa, CNNs, LSTMs), evaluates their effectiveness. Multilingual models like mBERT and XLM-R are assessed for low-resource scenarios while emerging trends like multimodal learning (CLIP) and adversarial training (GANs) are explored for robustness. Challenges such as data bias, false positives, and cultural insensitivity are addressed through contextual embeddings, data augmentation, and Pragmatics-oriented NLP. Metrics like precision, recall, and F1-score reveal significant accuracy drops in non-English contexts. The study emphasizes culturally aware datasets, Explainable AI (LIME, SHAP), and hybrid AI-human moderation to ensure ethical, inclusive online spaces.", "venue": "Advances in social networking and online communities book series", "label": 0}, {"loc": [3.049004077911377, -0.7728989124298096], "openalex_id": "https://openalex.org/W4407237581", "title": "Leveraging Transformer Models for Enhanced Pharmacovigilance: A Comparative Analysis of ADR Extraction from Biomedical and Social Media Texts", "authors": "Oumayma Elbiach, Hanane Grissette, El Habib Nfaoui", "abstract": "The extraction of Adverse Drug Reactions from biomedical text is a critical task in the field of healthcare and pharmacovigilance. It serves as a cornerstone for improving patient safety by enabling the early identification and mitigation of potential risks associated with pharmaceutical treatments. This process not only helps in detecting harmful side effects that may not have been evident during clinical trials but also contributes to the broader understanding of drug safety in real-world settings, ultimately guiding regulatory actions and informing clinical practices. In this study, we conducted a comprehensive evaluation of eleven transformer-based models for ADR extraction, focusing on two widely used datasets: CADEC and SMM4H. The task was approached as a sequence labeling problem, where each token in the text is classified as part of an ADR or not. Various transformer architectures, including BioBERT, PubMedBERT, and SpanBERT, were fine-tuned and evaluated on these datasets. BioBERT demonstrated superior performance on the CADEC dataset, achieving an impressive F1 score of 86.13%, indicating its strong capability in recognizing ADRs within patient narratives. On the other hand, SpanBERT emerged as the top performer on the SMM4H dataset, with an F1 score of 84.29%, showcasing its effectiveness in processing the more diverse and challenging social media data. These results highlight the importance of selecting appropriate models based on the specific characteristics such as text formality, domain-specific language, and task complexity to achieve optimal ADR extraction performance.", "venue": "AI", "label": 0}, {"loc": [7.792612075805664, 3.8410472869873047], "openalex_id": "https://openalex.org/W4407213771", "title": "Scaling Laws for Upcycling Mixture-of-Experts Language Models", "authors": "Seng Pei Liew, Takuya Kato, Sei-ichi Takase", "abstract": "Pretraining large language models (LLMs) is resource-intensive, often requiring months of training time even with high-end GPU clusters. There are two approaches of mitigating such computational demands: reusing smaller models to train larger ones (upcycling), and training computationally efficient models like mixture-of-experts (MoE). In this paper, we study the upcycling of LLMs to MoE models, of which the scaling behavior remains underexplored. Through extensive experiments, we identify empirical scaling laws that describe how performance depends on dataset size and model configuration. Particularly, we show that, while scaling these factors improves performance, there is a novel interaction term between the dense and upcycled training dataset that limits the efficiency of upcycling at large computational budgets. Based on these findings, we provide guidance to scale upcycling, and establish conditions under which upcycling outperforms from-scratch trainings within budget constraints.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.148359775543213, -0.19022582471370697], "openalex_id": "https://openalex.org/W4407209173", "title": "Cross-Lingual Transfer for Low-Resource Natural Language Processing", "authors": "Iker Garc\u00eda-Ferrero", "abstract": "Natural Language Processing (NLP) has seen remarkable advances in recent years, particularly with the emergence of Large Language Models that have achieved unprecedented performance across many tasks. However, these developments have mainly benefited a small number of high-resource languages such as English. The majority of languages still face significant challenges due to the scarcity of training data and computational resources. To address this issue, this thesis focuses on cross-lingual transfer learning, a research area aimed at leveraging data and models from high-resource languages to improve NLP performance for low-resource languages. Specifically, we focus on Sequence Labeling tasks such as Named Entity Recognition, Opinion Target Extraction, and Argument Mining. The research is structured around three main objectives: (1) advancing data-based cross-lingual transfer learning methods through improved translation and annotation projection techniques, (2) developing enhanced model-based transfer learning approaches utilizing state-of-the-art multilingual models, and (3) applying these methods to real-world problems while creating open-source resources that facilitate future research in low-resource NLP. More specifically, this thesis presents a new method to improve data-based transfer with T-Projection, a state-of-the-art annotation projection method that leverages text-to-text multilingual models and machine translation systems. T-Projection significantly outperforms previous annotation projection methods by a wide margin. For model-based transfer, we introduce a constrained decoding algorithm that enhances cross-lingual Sequence Labeling in zero-shot settings using text-to-text models. Finally, we develop Medical mT5, the first multilingual text-to-text medical model, demonstrating the practical impact of our research on real-world applications.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.4991631507873535, -1.0220954418182373], "openalex_id": "https://openalex.org/W4407231766", "title": "High-Fidelity Simultaneous Speech-To-Speech Translation", "authors": "Tom Labiausse, Laurent Mazar\u00e9, \u00c9douard Grave, Patrick P\u00e9rez, Alexandre D\u00e9fossez, Neil Zeghidour", "abstract": "We introduce Hibiki, a decoder-only model for simultaneous speech translation. Hibiki leverages a multistream language model to synchronously process source and target speech, and jointly produces text and audio tokens to perform speech-to-text and speech-to-speech translation. We furthermore address the fundamental challenge of simultaneous interpretation, which unlike its consecutive counterpart, where one waits for the end of the source utterance to start translating, adapts its flow to accumulate just enough context to produce a correct translation in real-time, chunk by chunk. To do so, we introduce a weakly-supervised method that leverages the perplexity of an off-the-shelf text translation system to identify optimal delays on a per-word basis and create aligned synthetic data. After supervised training, Hibiki performs adaptive, simultaneous speech translation with vanilla temperature sampling. On a French-English simultaneous speech translation task, Hibiki demonstrates state-of-the-art performance in translation quality, speaker fidelity and naturalness. Moreover, the simplicity of its inference process makes it compatible with batched translation and even real-time on-device deployment. We provide examples as well as models and inference code.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.135420799255371, 1.7719591856002808], "openalex_id": "https://openalex.org/W4407213467", "title": "The Cake that is Intelligence and Who Gets to Bake it: An AI Analogy and its Implications for Participation", "authors": "Martin Mundt, Anaelia Ovalle, Felix Friedrich, Pranav Agrawal, Subarnaduti Paul, Manuel Brack, Kristian Kersting, William S. Agnew", "abstract": "In a widely popular analogy by Turing Award Laureate Yann LeCun, machine intelligence has been compared to cake - where unsupervised learning forms the base, supervised learning adds the icing, and reinforcement learning is the cherry on top. We expand this 'cake that is intelligence' analogy from a simple structural metaphor to the full life-cycle of AI systems, extending it to sourcing of ingredients (data), conception of recipes (instructions), the baking process (training), and the tasting and selling of the cake (evaluation and distribution). Leveraging our re-conceptualization, we describe each step's entailed social ramifications and how they are bounded by statistical assumptions within machine learning. Whereas these technical foundations and social impacts are deeply intertwined, they are often studied in isolation, creating barriers that restrict meaningful participation. Our re-conceptualization paves the way to bridge this gap by mapping where technical foundations interact with social outcomes, highlighting opportunities for cross-disciplinary dialogue. Finally, we conclude with actionable recommendations at each stage of the metaphorical AI cake's life-cycle, empowering prospective AI practitioners, users, and researchers, with increased awareness and ability to engage in broader AI discourse.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.295211315155029, 2.4763760566711426], "openalex_id": "https://openalex.org/W4407231747", "title": "Demystifying Long Chain-of-Thought Reasoning in LLMs", "authors": "Edward Yeo, Tong Yang, M. Y. Niu, Graham Neubig, Xiang Yue", "abstract": "Scaling inference compute enhances reasoning in large language models (LLMs), with long chains-of-thought (CoTs) enabling strategies like backtracking and error correction. Reinforcement learning (RL) has emerged as a crucial method for developing these capabilities, yet the conditions under which long CoTs emerge remain unclear, and RL training requires careful design choices. In this study, we systematically investigate the mechanics of long CoT reasoning, identifying the key factors that enable models to generate long CoT trajectories. Through extensive supervised fine-tuning (SFT) and RL experiments, we present four main findings: (1) While SFT is not strictly necessary, it simplifies training and improves efficiency; (2) Reasoning capabilities tend to emerge with increased training compute, but their development is not guaranteed, making reward shaping crucial for stabilizing CoT length growth; (3) Scaling verifiable reward signals is critical for RL. We find that leveraging noisy, web-extracted solutions with filtering mechanisms shows strong potential, particularly for out-of-distribution (OOD) tasks such as STEM reasoning; and (4) Core abilities like error correction are inherently present in base models, but incentivizing these skills effectively for complex tasks via RL demands significant compute, and measuring their emergence requires a nuanced approach. These insights provide practical guidance for optimizing training strategies to enhance long CoT reasoning in LLMs. Our code is available at: https://github.com/eddycmu/demystify-long-cot.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.533416748046875, 1.4226934909820557], "openalex_id": "https://openalex.org/W4407229654", "title": "The role of GPT in promoting inclusive higher education for people with various learning disabilities: a review", "authors": "Thippa Reddy Gadekallu, Gokul Yenduri, Rajesh Kaluri, Dharmendra Singh Rajput, Kuruva Lakshmanna, Kai Fang, Junxin Chen, Wei Wang", "abstract": "The generative pre-trained transformer (GPT) is a notable breakthrough in the field of artificial intelligence, as it empowers machines to effectively comprehend and engage in interactions with humans. The GPT exhibits the capacity to enhance inclusivity and accessibility for students with learning disabilities in the context of higher education, hence potentially facilitating substantial advancements in the field. GPT can provide personalized and diverse solutions that successfully cater to the distinct requirements of students with learning disabilities. This motivated us to conduct an extensive review to assess the effectiveness of GPT in enhancing accessibility and inclusivity in higher education for students with learning disabilities. This review offers a comprehensive analysis of the GPT and its significance for enhancing inclusivity in the field of higher education. In this research, we also examined the possible challenges and constraints associated with the integration of GPT into inclusive higher education, along with potential solutions. Overall, this review is intended for educators, students with and without learning disabilities, policymakers, higher education institutes, researchers, and educational technology developers. This review aims to provide a comprehensive understanding of GPT in promoting inclusive higher education for people with various learning disabilities, its impacts on inclusive higher education, emerging challenges, and potential solutions.", "venue": "PeerJ Computer Science", "label": 4}, {"loc": [4.386987686157227, 1.9845858812332153], "openalex_id": "https://openalex.org/W4407196609", "title": "From Bytes to Biases: Investigating the Cultural Self-Perception of Large Language Models", "authors": "Wolfgang Messner, Tatum Greene, Josephine Matalone", "abstract": "Large language models (LLMs) are able to engage in natural-sounding conversations with humans, showcasing unprecedented capabilities for information retrieval and automated decision support. They have disrupted human\u2013technology interaction and the way businesses operate. However, technologies based on generative artificial intelligence are known to hallucinate, misinform, and display biases introduced by the massive datasets on which they are trained. Existing research indicates that humans may unconsciously internalize these biases, which can persist even after they stop using the programs. In this study, the authors explore the cultural self-perception of LLMs by prompting ChatGPT (OpenAI) and Bard (Google) with value questions derived from the GLOBE (Global Leadership and Organizational Behavior Effectiveness) project. The findings reveal that LLMs\u2019 cultural self-perception is most closely aligned with the values of English-speaking countries and countries characterized by economic competitiveness. It is crucial for all members of society to understand how LLMs function and to recognize their potential biases. If left unchecked, the \u201cblack-box\u201d nature of AI could reinforce human biases, leading to the inadvertent creation and training of even more biased models.", "venue": "Journal of Public Policy & Marketing", "label": 0}, {"loc": [7.41911506652832, 2.1944358348846436], "openalex_id": "https://openalex.org/W4407209014", "title": "SmolLM2: When Smol Goes Big--Data-Centric Training of a Small Language Model", "authors": "Loubna Ben Allal, Anton Lozhkov, Elie Bakouch, G. Bl\u00e1zquez, Guilherme Penedo, Lewis Tunstall, Andr\u00e9s Marafioti, Hynek Kydl\u00ed\u010dek, Agust\u00edn Piqueres Lajar\u00edn, Vaibhav Srivastav, Jim Lochner, Caleb Fahlgren, X. Nguyen, Cl\u00e9mentine Fourrier, Ben Burtenshaw, Hugo Larcher, Haojun Zhao, Cyril Zakka, Mathieu Morlon, Colin Raffel, Leandro von Werra, Thomas Wolf", "abstract": "While large language models have facilitated breakthroughs in many applications of artificial intelligence, their inherent largeness makes them computationally expensive and challenging to deploy in resource-constrained settings. In this paper, we document the development of SmolLM2, a state-of-the-art \"small\" (1.7 billion parameter) language model (LM). To attain strong performance, we overtrain SmolLM2 on ~11 trillion tokens of data using a multi-stage training process that mixes web text with specialized math, code, and instruction-following data. We additionally introduce new specialized datasets (FineMath, Stack-Edu, and SmolTalk) at stages where we found existing datasets to be problematically small or low-quality. To inform our design decisions, we perform both small-scale ablations as well as a manual refinement process that updates the dataset mixing rates at each stage based on the performance at the previous stage. Ultimately, we demonstrate that SmolLM2 outperforms other recent small LMs including Qwen2.5-1.5B and Llama3.2-1B. To facilitate future research on LM development as well as applications of small LMs, we release both SmolLM2 as well as all of the datasets we prepared in the course of this project.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.371281147003174, 1.9101487398147583], "openalex_id": "https://openalex.org/W4407196609", "title": "EXPRESS: From Bytes to Biases. Investigating the Cultural Self-Perception of Large Language Models", "authors": "Wolfgang Messner, Tatum Greene, Josephine Matalone", "abstract": "Large language models (LLMs) are able to engage in natural-sounding conversations with humans, showcasing unprecedented capabilities for information retrieval and automated decision support. They have disrupted human\u2013technology interaction and the way businesses operate. However, technologies based on generative artificial intelligence are known to hallucinate, misinform, and display biases introduced by the massive datasets on which they are trained. Existing research indicates that humans may unconsciously internalize these biases, which can persist even after they stop using the programs. In this study, the authors explore the cultural self-perception of LLMs by prompting ChatGPT (OpenAI) and Bard (Google) with value questions derived from the GLOBE (Global Leadership and Organizational Behavior Effectiveness) project. The findings reveal that LLMs\u2019 cultural self-perception is most closely aligned with the values of English-speaking countries and countries characterized by economic competitiveness. It is crucial for all members of society to understand how LLMs function and to recognize their potential biases. If left unchecked, the \u201cblack-box\u201d nature of AI could reinforce human biases, leading to the inadvertent creation and training of even more biased models.", "venue": "Journal of Public Policy & Marketing", "label": 0}, {"loc": [7.880090236663818, 0.981015145778656], "openalex_id": "https://openalex.org/W4407185811", "title": "When Dimensionality Hurts: The Role of LLM Embedding Compression for Noisy Regression Tasks", "authors": "Felix Drinkall, Janet B. Pierrehumbert, Stefan Zohren", "abstract": "Large language models (LLMs) have shown remarkable success in language modelling due to scaling laws found in model size and the hidden dimension of the model's text representation. Yet, we demonstrate that compressed representations of text can yield better performance in LLM-based regression tasks. In this paper, we compare the relative performance of embedding compression in three different signal-to-noise contexts: financial return prediction, writing quality assessment and review scoring. Our results show that compressing embeddings, in a minimally supervised manner using an autoencoder's hidden representation, can mitigate overfitting and improve performance on noisy tasks, such as financial return prediction; but that compression reduces performance on tasks that have high causal dependencies between the input and target data. Our results suggest that the success of interpretable compressed representations such as sentiment may be due to a regularising effect.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.536177635192871, 4.6849365234375], "openalex_id": "https://openalex.org/W4407185348", "title": "AutoGUI: Scaling GUI Grounding with Automatic Functionality Annotations from LLMs", "authors": "Hongxin Li, Jingfan Chen, Jingran Su, Yuntao Chen, Li Q, Zhaoxiang Zhang", "abstract": "User interface understanding with vision-language models (VLMs) has received much attention due to its potential for enhancing software automation. However, existing datasets used to build UI-VLMs either only contain large-scale context-free element annotations or contextualized functional descriptions for elements at a small scale. In this work, we propose the \\textbf{AutoGUI} pipeline for automatically annotating UI elements with detailed functionality descriptions at scale. Specifically, we leverage large language models (LLMs) to infer element functionality by comparing UI state changes before and after simulated interactions. To improve annotation quality, we propose LLM-aided rejection and verification, eliminating invalid annotations without human labor. We construct a high-quality AutoGUI-704k dataset using the proposed pipeline, featuring diverse and detailed functionality annotations that are hardly provided by previous datasets. Human evaluation shows that we achieve annotation correctness comparable to a trained human annotator. Extensive experiments show that our dataset remarkably enhances VLM's UI grounding capabilities and exhibits significant scaling effects. We also show the interesting potential use of our dataset in UI agent tasks. Please view our project at https://autogui-project.github.io/.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.528468132019043, 2.2082982063293457], "openalex_id": "https://openalex.org/W4407186836", "title": "Beyond English: Evaluating Automated Measurement of Moral Foundations in Non-English Discourse with a Chinese Case Study", "authors": "Calvin Yixiang Cheng, Scott A. Hale", "abstract": "This study explores computational approaches for measuring moral foundations (MFs) in non-English corpora. Since most resources are developed primarily for English, cross-linguistic applications of moral foundation theory remain limited. Using Chinese as a case study, this paper evaluates the effectiveness of applying English resources to machine translated text, local language lexicons, multilingual language models, and large language models (LLMs) in measuring MFs in non-English texts. The results indicate that machine translation and local lexicon approaches are insufficient for complex moral assessments, frequently resulting in a substantial loss of cultural information. In contrast, multilingual models and LLMs demonstrate reliable cross-language performance with transfer learning, with LLMs excelling in terms of data efficiency. Importantly, this study also underscores the need for human-in-the-loop validation of automated MF assessment, as the most advanced models may overlook cultural nuances in cross-language measurements. The findings highlight the potential of LLMs for cross-language MF measurements and other complex multilingual deductive coding tasks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.592846393585205, 1.038836121559143], "openalex_id": "https://openalex.org/W4407185825", "title": "Multilingual Attribute Extraction from News Web Pages", "authors": "Pavel Bedrin, Maksim Varlamov, Alexander Yatskov", "abstract": "This paper addresses the challenge of automatically extracting attributes from news article web pages across multiple languages. Recent neural network models have shown high efficacy in extracting information from semi-structured web pages. However, these models are predominantly applied to domains like e-commerce and are pre-trained using English data, complicating their application to web pages in other languages. We prepared a multilingual dataset comprising 3,172 marked-up news web pages across six languages (English, German, Russian, Chinese, Korean, and Arabic) from 161 websites. The dataset is publicly available on GitHub. We fine-tuned the pre-trained state-of-the-art model, MarkupLM, to extract news attributes from these pages and evaluated the impact of translating pages into English on extraction quality. Additionally, we pre-trained another state-of-the-art model, DOM-LM, on multilingual data and fine-tuned it on our dataset. We compared both fine-tuned models to existing open-source news data extraction tools, achieving superior extraction metrics.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.7676994800567627, 2.3342370986938477], "openalex_id": "https://openalex.org/W4407184778", "title": "AI Scaling: From Up to Down and Out", "authors": "Y. Wang, Yanxi Li, Xu Chang", "abstract": "AI Scaling has traditionally been synonymous with Scaling Up, which builds larger and more powerful models. However, the growing demand for efficiency, adaptability, and collaboration across diverse applications necessitates a broader perspective. This position paper presents a holistic framework for AI scaling, encompassing Scaling Up, Scaling Down, and Scaling Out. It argues that while Scaling Up of models faces inherent bottlenecks, the future trajectory of AI scaling lies in Scaling Down and Scaling Out. These paradigms address critical technical and societal challenges, such as reducing carbon footprint, ensuring equitable access, and enhancing cross-domain collaboration. We explore transformative applications in healthcare, smart manufacturing, and content creation, demonstrating how AI Scaling can enable breakthroughs in efficiency, personalization, and global connectivity. Additionally, we highlight key challenges, including balancing model complexity with interpretability, managing resource constraints, and fostering ethical development. By synthesizing these approaches, we propose a unified roadmap that redefines the future of AI research and application, paving the way for advancements toward Artificial General Intelligence (AGI).", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.933326721191406, 0.14798976480960846], "openalex_id": "https://openalex.org/W4407185607", "title": "AmaSQuAD: A Benchmark for Amharic Extractive Question Answering", "authors": "Nebiyou Daniel Hailemariam, Blessed Guda, Tsegazeab Tefferi", "abstract": "This research presents a novel framework for translating extractive question-answering datasets into low-resource languages, as demonstrated by the creation of the AmaSQuAD dataset, a translation of SQuAD 2.0 into Amharic. The methodology addresses challenges related to misalignment between translated questions and answers, as well as the presence of multiple answer instances in the translated context. For this purpose, we used cosine similarity utilizing embeddings from a fine-tuned BERT-based model for Amharic and Longest Common Subsequence (LCS). Additionally, we fine-tune the XLM-R model on the AmaSQuAD synthetic dataset for Amharic Question-Answering. The results show an improvement in baseline performance, with the fine-tuned model achieving an increase in the F1 score from 36.55% to 44.41% and 50.01% to 57.5% on the AmaSQuAD development dataset. Moreover, the model demonstrates improvement on the human-curated AmQA dataset, increasing the F1 score from 67.80% to 68.80% and the exact match score from 52.50% to 52.66%.The AmaSQuAD dataset is publicly available Datasets", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.941675186157227, 3.0227572917938232], "openalex_id": "https://openalex.org/W4407124078", "title": "Structural Latency Perturbation in Large Language Models Through Recursive State Induction", "authors": "Michael Mangrum, Jonathan Pemberton, Benedict Wetherby, Philip Montague", "abstract": "Computational efficiency has remained a critical consideration in scaling high-capacity language models, with inference latency and resource consumption presenting significant constraints on real-time applications. The study has introduced a structured latency perturbation mechanism that modifies computational pathways through recursive state induction, enabling dynamic suppression of redundant activations while preserving generative fidelity. A formal mathematical framework has been established to describe recursive perturbations, ensuring that modifications remain adaptive rather than statically imposed. Experiments have demonstrated that applying recursive state adjustments reduces inference latency across varying sequence lengths, with longer text generations benefiting from cumulative efficiency improvements. Comparative evaluations against structured pruning and quantization have indicated that latency gains can be achieved without compromising token retention or memory utilization. The analysis of computational overhead has suggested that selectively suppressing redundant activations contributes to improved power efficiency, particularly in scenarios requiring extended text generation. An assessment of linguistic stability has shown that token-level consistency remains largely intact under controlled perturbation thresholds, reinforcing the viability of structural latency modifications as an alternative to weight-centric optimization techniques. The results have supported the hypothesis that recursive state induction offers an effective method for reducing computational complexity without requiring architectural modifications or external augmentation.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.089984893798828, -2.784724473953247], "openalex_id": "https://openalex.org/W4407094964", "title": "Modelling Misinformation in Swahili-English Code-switched Texts", "authors": "Cynthia Amol, Lilian Wanzare, James Obuhuma", "abstract": "Code-switching, which is the mixing of words or phrases from multiple, grammatically distinct languages, introduces semantic and syntactic complexities to sentences which complicate automated text classification. Despite code-switching being a common occurrence in informal text-based communication among most bilingual or multilingual users of digital spaces, its use to spread misinformation is relatively less explored. In Kenya, for instance, the use of code-switched Swahili-English is prevalent on social media. Our main objective in this paper was to systematically re- view code-switching, particularly the use of Swahili-English code-switching to spread misinformation on social media in the Kenyan context. Additionally, we aimed at pre-processing a Swahili-English code-switched dataset and developing a misinformation classification model trained on this dataset. We discuss the process we took to develop the code- switched Swahili-English misinformation classification model. The model was trained and tested using the PolitiKweli dataset which is the first Swahili-English code-switched dataset curated for misinformation classification. The dataset was collected from Twitter (now X) social media platform, focusing on text posted during the electioneering period of the 2022 general elections in Kenya. The study experimented with two types of word embeddings - GloVe and FastText. FastText uses character n-gram representations that help generate meaningful vectors for rare and unseen words in the code-switched dataset. We experimented with both the classical machine learning algorithms and deep learning algo- rithms. Bidirectional Long Short-Term Memory Networks (BiLSTM) algorithm showed the best performance with an f-score of 0.89. The model was able to classify code-switched Swahili-English political misinformation text as fake, fact or neutral. This study contributes to recent research efforts in developing language models for low-resource languages.", "venue": "International Journal of Information Technology and Computer Science", "label": 0}, {"loc": [3.459261417388916, 3.686292886734009], "openalex_id": "https://openalex.org/W4407128156", "title": "RiskHarvester: A Risk-based Tool to Prioritize Secret Removal Efforts in Software Artifacts", "authors": "Setu Kumar Basak, Tanmay Pardeshi, Bradley Reaves, Laurie Williams", "abstract": "Since 2020, GitGuardian has been detecting checked-in hard-coded secrets in GitHub repositories. During 2020-2023, GitGuardian has observed an upward annual trend and a four-fold increase in hard-coded secrets, with 12.8 million exposed in 2023. However, removing all the secrets from software artifacts is not feasible due to time constraints and technical challenges. Additionally, the security risks of the secrets are not equal, protecting assets ranging from obsolete databases to sensitive medical data. Thus, secret removal should be prioritized by security risk reduction, which existing secret detection tools do not support. The goal of this research is to aid software practitioners in prioritizing secrets removal efforts through our security risk-based tool. We present RiskHarvester, a risk-based tool to compute a security risk score based on the value of the asset and ease of attack on a database. We calculated the value of asset by identifying the sensitive data categories present in a database from the database keywords in the source code. We utilized data flow analysis, SQL, and ORM parsing to identify the database keywords. To calculate the ease of attack, we utilized passive network analysis to retrieve the database host information. To evaluate RiskHarvester, we curated RiskBench, a benchmark of 1,791 database secret-asset pairs with sensitive data categories and host information manually retrieved from 188 GitHub repositories. RiskHarvester demonstrates precision of (95%) and recall (90%) in detecting database keywords for the value of asset and precision of (96%) and recall (94%) in detecting valid hosts for ease of attack. Finally, we conducted a survey (52 respondents) to understand whether developers prioritize secret removal based on security risk score. We found that 86% of the developers prioritized the secrets for removal with descending security risk scores.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.051398277282715, 3.5840389728546143], "openalex_id": "https://openalex.org/W4407184395", "title": "Scaling Embedding Layers in Language Models", "authors": "Dahua Yu, Edith Cohen, Badih Ghazi, Yangsibo Huang, Pritish Kamath, Ravi Kumar, Daogao Liu, Chiyuan Zhang", "abstract": "We propose $SCONE$ ($S$calable, $C$ontextualized, $O$ffloaded, $N$-gram $E$mbedding), a new method for extending input embedding layers to enhance language model performance. To avoid increased decoding costs, $SCONE$ retains the original vocabulary while introducing embeddings for a set of frequent n-grams. These embeddings provide contextualized representation for each input token and are learned with a separate model during training. After training, embeddings are precomputed and stored in off-accelerator memory; during inference, querying them has minimal impact on latency due to the low complexity of embedding lookups. $SCONE$ enables two new scaling strategies: increasing the number of n-gram embeddings and scaling the model used to learn them, both while maintaining fixed accelerator usage during inference (in terms of FLOPS and memory). We show that scaling both aspects enables a model with 1B accelerator-resident parameters to outperform a 1.9B-parameter baseline across diverse corpora, while using only about half the FLOPS and accelerator memory during inference.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.523486614227295, 5.258062839508057], "openalex_id": "https://openalex.org/W4407124183", "title": "Vision-centric Token Compression in Large Language Model", "authors": "Ling Xing, Alex Jinpeng Wang, Rui Yan, Jinhui Tang", "abstract": "Real-world applications are stretching context windows to hundreds of thousand of tokens while Large Language Models (LLMs) swell from billions to trillions of parameters. This dual expansion send compute and memory costs skyrocketing, making token compression indispensable. We introduce Vision Centric Token Compression (Vist), a slow-fast compression framework that mirrors human reading: the fast path renders distant tokens into images, letting a frozen, lightweight vision encoder skim the low-salience context; the slow path feeds the proximal window into the LLM for fine-grained reasoning. A Probability-Informed Visual Enhancement (PVE) objective masks high-frequency tokens during training, steering the Resampler to concentrate on semantically rich regions-just as skilled reader gloss over function words. On eleven in-context learning benchmarks, Vist achieves the same accuracy with 2.3 times fewer tokens, cutting FLOPs by 16% and memory by 50%. This method delivers remarkable results, outperforming the strongest text encoder-based compression method CEPE by 7.6% on average over benchmarks like TriviaQA, NQ, PopQA, NLUI, and CLIN, setting a new standard for token efficiency in LLMs. The source code will be released.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.97462272644043, 2.5353102684020996], "openalex_id": "https://openalex.org/W4407123967", "title": "M+: Extending MemoryLLM with Scalable Long-Term Memory", "authors": "Yu Wang, Dmitry Krotov, Yannan Hu, Yifan Gao, Wangchunshu Zhou, Julian McAuley, Dan Gutfreund, Rog\u00e9rio Feris, Zexue He", "abstract": "Equipping large language models (LLMs) with latent-space memory has attracted increasing attention as they can extend the context window of existing language models. However, retaining information from the distant past remains a challenge. For example, MemoryLLM (Wang et al., 2024a), as a representative work with latent-space memory, compresses past information into hidden states across all layers, forming a memory pool of 1B parameters. While effective for sequence lengths up to 16k tokens, it struggles to retain knowledge beyond 20k tokens. In this work, we address this limitation by introducing M+, a memory-augmented model based on MemoryLLM that significantly enhances long-term information retention. M+ integrates a long-term memory mechanism with a co-trained retriever, dynamically retrieving relevant information during text generation. We evaluate M+ on diverse benchmarks, including long-context understanding and knowledge retention tasks. Experimental results show that M+ significantly outperforms MemoryLLM and recent strong baselines, extending knowledge retention from under 20k to over 160k tokens with similar GPU memory overhead. We open-source our code at https://github.com/wangyu-ustc/MemoryLLM", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.719329357147217, 4.001806259155273], "openalex_id": "https://openalex.org/W4407128040", "title": "MergeME: Model Merging Techniques for Homogeneous and Heterogeneous MoEs", "authors": "Yuhang Zhou, Giannis Karamanolakis, V\u00edctor Soto, Anna Rumshisky, M.G. Kulkarni, Furong Huang, Wei Ai, Jianhua Lu", "abstract": "The recent success of specialized Large Language Models (LLMs) in domains such as mathematical reasoning and coding has led to growing interest in methods for merging these expert LLMs into a unified Mixture-of-Experts (MoE) model, with the goal of enhancing performance in each domain while retaining effectiveness on general tasks. However, the effective merging of expert models remains an open challenge, especially for models with highly divergent weight parameters or different architectures. State-of-the-art MoE merging methods only work with homogeneous model architectures and rely on simple unweighted averaging to merge expert layers, which does not address parameter interference and requires extensive fine-tuning of the merged MoE to restore performance. To address these limitations, this paper introduces new MoE merging techniques, including strategies to mitigate parameter interference, routing heuristics to reduce the need for MoE fine-tuning, and a novel method for merging experts with different architectures. Extensive experiments across multiple domains demonstrate the effectiveness of our proposed methods, reducing fine-tuning costs, improving performance over state-of-the-art methods, and expanding the applicability of MoE merging.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.7459917068481445, 1.7298824787139893], "openalex_id": "https://openalex.org/W4407123540", "title": "Doing More with Less--Implementing Routing Strategies in Large Language Model-Based Systems: An Extended Survey", "authors": "Clovis Varangot-Reille, Christophe Bouvard, Antoine Gourru, Mathieu Ciancone, Marion Schaeffer, Fran\u00e7ois Jacquenet", "abstract": "Large Language Models (LLM)-based systems, i.e. interconnected elements that include an LLM as a central component (e.g., conversational agents), are typically monolithic static architectures that rely on a single LLM for all user queries. However, they often require different preprocessing strategies, levels of reasoning, or knowledge. Generalist LLMs (e.g. GPT-4) trained on very large multi-topic corpora can perform well in a variety of tasks. They require significant financial, energy, and hardware resources that may not be justified for basic tasks. This implies potentially investing in unnecessary costs for a given query. To overcome this problem, a routing mechanism routes user queries to the most suitable components, such as smaller LLMs or experts in specific topics. This approach may improve response quality while minimising costs. Routing can be expanded to other components of the conversational agent architecture, such as the selection of optimal embedding strategies. This paper explores key considerations for integrating routing into LLM-based systems, focusing on resource management, cost definition, and strategy selection. Our main contributions include a formalisation of the problem, a novel taxonomy of existing approaches emphasising relevance and resource efficiency, and a comparative analysis of these strategies in relation to industry practices. Finally, we identify critical challenges and directions for future research.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.125532150268555, -1.6328253746032715], "openalex_id": "https://openalex.org/W4408465183", "title": "Effectiveness of Bi-GRU and FastText in Sentiment Analysis of Shopee App Reviews", "authors": "Rayhan Fadhil Rahmanda, Yuliant Sibaroni, Sri Suryani Prasetiyowati", "abstract": "E-commerce is proof of evolution in the economic field due to its flexibility to shop for various necessities of life anytime and anywhere. Shopee is one of the e-commerce platforms in demand by people from varied circles in Indonesia. Multiple reviews are shed publicly by Shopee users on the Google Play Store regarding shopping experiences, which can be positive or negative. This condition affects the decision of other users to shop at Shopee, thus impacting the increase or decrease in profits from Shopee itself. Therefore, user sentiment analysis is needed as a form of effort to maintain user trust in Shopee. This research aims to build a system to classify the sentiment of Shopee application users through reviews in the Google Play Store by utilizing the Bidirectional Gated Recurrent Unit (Bi-GRU) deep learning model. The dataset contains 9,716 reviews, including 3,937 positive and 5,779 negative sentiments. Several test scenarios were conducted to achieve the highest peak of performance, utilizing TF-IDF feature extraction, FastText feature expansion, and optimization using the Cuckoo Search Algorithm. Additionally, SMOTE resampling was utilized to correct the dataset\u2019s uneven distribution. The combined test scenarios mentioned significantly improved the accuracy by 1.03% and F1-Score by 1.04% from the baseline, with the highest accuracy reaching 90.48% and the highest F1-Score of 90.16%.", "venue": "SinkrOn", "label": 0}, {"loc": [4.579657554626465, 1.314801812171936], "openalex_id": "https://openalex.org/W4407094317", "title": "Collaborative Growth: When Large Language Models Meet Sociolinguistics", "authors": "Dong Nguyen", "abstract": "ABSTRACT Large Language Models (LLMs) have dramatically transformed the AI landscape. They can produce remarkable fluent text and exhibit a range of natural language understanding and generation capabilities. This article explores how LLMs might be used for sociolinguistic research and, conversely, how sociolinguistics can contribute to the development of LLMs. It argues that both areas of research will benefit from a thoughtful, engaging collaboration. Sociolinguists are not merely end users of LLMs; they have a crucial role to play in the development of LLMs.", "venue": "Language and Linguistics Compass", "label": 0}, {"loc": [3.397500991821289, 1.543677806854248], "openalex_id": "https://openalex.org/W4407105155", "title": "The Feasibility and Comparability of Using Artificial Intelligence for Qualitative Data Analysis in Equity-Focused Research", "authors": "Yan Jiang, Lillie Ko-Wong, Ivan Valdovinos Gutierrez", "abstract": "In this essay, we explored the feasibility of utilizing artificial intelligence (AI) for qualitative data analysis in equity-focused research. Specifically, we compare thematic analyses of interview transcripts conducted by human coders with those performed by GPT-3 using a zero-shot chain-of-thought prompting strategy. Our results suggest that the AI model, when provided with suitable prompts, can proficiently perform thematic analysis, demonstrating considerable comparability with human coders. Despite potential biases inherent in its training data, the model was able to analyze and interpret the data through social justice perspectives. We discuss the applications of integrating AI into qualitative research, provide code snippets illustrating the use of GPT models, and highlight unresolved questions to encourage further dialogue in the field.", "venue": "Educational Researcher", "label": 0}, {"loc": [3.003162145614624, 2.7266409397125244], "openalex_id": "https://openalex.org/W4407069489", "title": "Location is All You Need: Copyright Extraterritoriality and Where to Train Your AI", "authors": "Mattias R\u00e4ttz\u00e9n", "abstract": "The development of artificial intelligence (\u201cAI\u201d) models requires vast quantities of data, which will often include copyrighted materials. The reproduction of copyrighted materials in the course of training AI models will infringe on copyright, unless there are applicable exceptions and limitations exempting such activities. There is so far considerable divergence between jurisdictions, including between the United States, EU, U.K., Japan, Singapore, Australia, India, Israel, and many more countries, in this regard. In the absence of international harmonization, there is therefore a high likelihood that the same type of training activity would be considered copyright infringement in some countries but not in others. The AI community is not blind to that risk. If copyright law restricts the development and deployment of AI, developers may decide to relocate their operations elsewhere, where the reproduction of training data is clearly not infringing. This Article concludes that there is a loophole in the international copyright system, as it currently stands, that would permit large-scale copying of training data in one country where this activity is not infringing. Once the training is done and the model is complete, developers could then make the model available to customers in other countries, even if the same training activities would have been infringing if they had occurred there. Because copyright laws are territorial in nature, by default they can only restrict infringing conduct occurring in their respective countries. From that point of view for AI developers, location is indeed all you need. The EU has become the first to respond to this problem by retroactively extending their text and data mining exception extraterritorially to training activities occurring in non-EU countries, once the completed AI model is placed on the EU market. While such an extraterritorial application benefits rightholders and closes the loophole now present, it makes the situation significantly more complex for developers. If other regulators decide to follow the same path as the EU, which previously happened in the data privacy context, then developers would be facing multiple, conflicting copyright laws targeting the same underlying activity. This could significantly complicate the development process for AI and potentially undermine the AI industry. This Article critically discusses these and related issues, and whether an extraterritorial application of copyright laws is compatible with territoriality norms that are supposed to respect foreign sovereignty. It also explores, in light of these difficulties, whether we should instead shift focus from regulating the inputs (i.e., the data used to train AI models) to regulating the outputs (i.e., the AI-generated content itself). Indeed, to the extent that the transnational data loophole cannot be closed without infringing upon foreign sovereignty, we may need to look at other regulatory means instead. The Article also suggests that we should consider model training and copyright infringement as a product-by-process problem, which calls for a comparison with how patent law solved similar extraterritoriality issues. Several decades ago, international patent treaties harmonized the extent to which patent laws can be applied extraterritorially to reach imported products derived from foreign manufacturing processes. If regulators wish to extend their copyright laws\u2019 extraterritoriality to close the loophole that exists for training activities in the context of AI, and to do so in a way that is aligned with copyright territoriality, there may be a need to similarly revise international copyright treaties. This Article, therefore, urgently calls for a similarly coordinated international effort in copyright law, which balances the interests of rightholders with the technical, regulatory, and economic realities faced by developers. How we resolve these issues could make or break the future of AI. If we cannot find a way to reconcile the interests of rightholders and AI stakeholders, the world may be left with a segregated and fragmented AI landscape, one in which there can only be losers and no winners.", "venue": "Science and Technology Law Review", "label": 48}, {"loc": [4.129199504852295, 0.9940271973609924], "openalex_id": "https://openalex.org/W4406975143", "title": "AI-assisted German Employment Contract Review: A Benchmark Dataset", "authors": "Oliver Wardas, Florian Matthes", "abstract": "Employment contracts are used to agree upon the working conditions between employers and employees all over the world. Understanding and reviewing contracts for void or unfair clauses requires extensive knowledge of the legal system and terminology. Recent advances in Natural Language Processing (NLP) hold promise for assisting in these reviews. However, applying NLP techniques on legal text is particularly difficult due to the scarcity of expert-annotated datasets. To address this issue and as a starting point for our effort in assisting lawyers with contract reviews using NLP, we release an anonymized and annotated benchmark dataset for legality and fairness review of German employment contract clauses, alongside with baseline model evaluations.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.821510314941406, -1.0556402206420898], "openalex_id": "https://openalex.org/W4406985005", "title": "Automatic Text Simplification for Lithuanian: Transforming Administrative Texts into Plain Language", "authors": "Justina Mandravickait\u0117, Egl\u0117 Rimkien\u0117, Danguol\u0117 Kotryna Kapkan, Danguol\u0117 Kalinauskait\u0117, Antanas \u010cenys, Tomas Krilavi\u010dius", "abstract": "In this study, we present the results of experiments on text simplification for the Lithuanian language, where we aim to simplify administrative-style texts to the Plain Language level. We selected mT5, mBART, and LT-Llama-2 as the foundational models and fine-tuned them for the text simplification task. Additionally, we evaluated ChatGPT for this purpose. Also, we conducted a comprehensive assessment of the simplification results provided by these models both quantitatively and qualitatively. The results demonstrated that mBART was the most effective model for simplifying Lithuanian administrative text, achieving the highest scores across all the evaluation metrics. A qualitative evaluation of the simplified sentences complemented our quantitative findings. Attention analysis provided insights into model decisions, highlighting strengths in lexical and syntactic simplifications but revealing challenges with longer, complex sentences. Our findings contribute to advancing text simplification for lesser-resourced languages, with practical applications for more effective communication between institutions and the general public, which is the goal of Plain Language.", "venue": "Mathematics", "label": 46}, {"loc": [4.208322048187256, 1.6234829425811768], "openalex_id": "https://openalex.org/W4406936926", "title": "Evaluating Binary Decision Biases in Large Language Models: Implications for Fair Agent-Based Financial Simulations", "authors": "Alicia Vidler, Toby Walsh", "abstract": "Large Language Models (LLMs) are increasingly being used to simulate human-like decision making in agent-based financial market models (ABMs). As models become more powerful and accessible, researchers can now incorporate individual LLM decisions into ABM environments. However, integration may introduce inherent biases that need careful evaluation. In this paper we test three state-of-the-art GPT models for bias using two model sampling approaches: one-shot and few-shot API queries. We observe significant variations in distributions of outputs between specific models, and model sub versions, with GPT-4o-Mini-2024-07-18 showing notably better performance (32-43% yes responses) compared to GPT-4-0125-preview's extreme bias (98-99% yes responses). We show that sampling methods and model sub-versions significantly impact results: repeated independent API calls produce different distributions compared to batch sampling within a single call. While no current GPT model can simultaneously achieve a uniform distribution and Markovian properties in one-shot testing, few-shot sampling can approach uniform distributions under certain conditions. We explore the Temperature parameter, providing a definition and comparative results. We further compare our results to true random binary series and test specifically for the common human bias of Negative Recency - finding LLMs have a mixed ability to 'beat' humans in this one regard. These findings emphasise the critical importance of careful LLM integration into ABMs for financial markets and more broadly.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.9596853256225586, -0.7705110907554626], "openalex_id": "https://openalex.org/W4406957528", "title": "a clinical narrative corpus on nut allergy: annotation schema, guidelines and use case", "authors": "Ana Gonz\u00e1lez Moreno, Alberto Ramos-Gonz\u00e1lez, Israel Gonz\u00e1lez\u2010Carrasco, M. D. Alonso, Beatriz Sellers Guti\u00e9rrez-Argumosa, Alicia Moncada Salinero, Ana Bel\u00e9n Pastor-Magro, Beatriz Gonz\u00e1lez-Pi\u00f1eiro, Miguel Tejedor, Paloma Mart\u0131\u0301nez", "abstract": "This article describes a dataset on nut allergy extracted from Spanish clinical records provided by the Hospital Universitario Fundaci\u00f3n de Alcorc\u00f3n (HUFA) in Madrid, Spain, in collaboration with its Allergology Unit and Information Systems and Technologies Department. There are few publicly available clinical texts in Spanish and having more is essential as a valuable resource to train and test information extraction systems. In total, 828 clinical notes in Spanish were employed and several experts participated in the annotation process by categorizing the annotated entities into medical semantic groups related to allergies. To evaluate inter-annotator agreement, a triple annotation was performed on 8% of the texts. The guidelines followed to create the corpus are also provided. To determine the validation of the corpus and introduce a real use case, we performed some experiments using this resource in the context of a supervised named entity recognition (NER) task by fine-tuning encoder-based transformers. In these experiments, an average F-measure of 86.2% was achieved. These results indicate that the corpus used is suitable for training and testing approaches to NER related to the field of allergology.", "venue": "Scientific Data", "label": 20}, {"loc": [7.020647048950195, 3.789203643798828], "openalex_id": "https://openalex.org/W4406879574", "title": "Data-Juicer 2.0: Cloud-Scale Adaptive Data Processing for Foundation Models", "authors": "Daoyuan Chen, Yihong Huang, Xuchen Pan, Nana Jiang, Haibin Wang, Ce Ge, Yushuo Chen, Wenhao Zhang, Zhijian Ma, Yilei Zhang, Jun Huang, Wei Lin, Yaliang Li, B. Ding, Jingren Zhou", "abstract": "Foundation models demand advanced data processing for their vast, multimodal datasets. However, traditional frameworks struggle with the unique complexities of multimodal data. In response, we present Data-Juicer 2.0, a data processing system backed by 100+ data processing operators spanning text, image, video, and audio modalities, supporting more critical tasks including data analysis, synthesis, annotation, and foundation model post-training. With seamless compatibility and dedicated optimization for popular dataset hubs like Hugging Face and computing engines like Ray, it improves upon its predecessor in terms of usability, efficiency, and programmability. It features an easily accessible user interface layer that supports decoupled Python interactions, RESTful APIs, and conversational commands. Its new runtime layer offers adaptive execution across diverse scales and environments, abstracting away system complexities. Extensive empirical evaluations demonstrate Data-Juicer 2.0's remarkable performance and scalability, highlighting its capability to efficiently process TB-level data with 10k+ CPU cores. The system is publicly available and has been widely adopted in diverse research fields and real-world products such as Alibaba Cloud PAI. We actively maintain the system and share practical insights to foster research and applications of next-generation foundation models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.385318756103516, -0.11014395952224731], "openalex_id": "https://openalex.org/W4406880290", "title": "Pre-training a Transformer-Based Generative Model Using a Small Sepedi", "authors": "Simon P. Ramalepe, Thipe Modipa, Marelie H. Davel", "abstract": "Due to the scarcity of data in low-resourced languages, the development of language models for these languages has been very slow. Currently, pre-trained language models have gained popularity in natural language processing, especially, in developing domain-specific models for low-resourced languages. In this study, we experiment with the impact of using occlusion-based techniques when training a language model for a text generation task. We curate 2 new datasets, the Sepedi monolingual (SepMono) dataset from several South African resources and the Sepedi radio news (SepNews) dataset from the radio news domain. We use the SepMono dataset to pre-train transformer-based models using the occlusion and non-occlusion pre-training techniques and compare performance. The SepNews dataset is specifically used for fine-tuning. Our results show that the non-occlusion models perform better compared to the occlusion-based models when measuring validation loss and perplexity. However, analysis of the generated text using the BLEU score metric, which measures the quality of the generated text, shows a slightly higher BLEU score for the occlusion-based models compared to the non-occlusion models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.218197345733643, 0.9936540126800537], "openalex_id": "https://openalex.org/W4406885938", "title": "MEL: Legal Spanish Language Model", "authors": "Diana Berruezo-S\u00e1nchez, Nuria Aldama Garc\u00eda, \u00c1lvaro Barbero Jim\u00e9nez, Marta Guerrero Nieto, Patr\u00edcia Morales, Nicol\u00e1s Serrano Salas, Carlos Garc\u00eda Hern\u00e1n, P. Coll, Elena Montiel-Ponsoda, Pablo Calleja Ib\u00e1\u00f1ez", "abstract": "Legal texts, characterized by complex and specialized terminology, present a significant challenge for Language Models. Adding an underrepresented language, such as Spanish, to the mix makes it even more challenging. While pre-trained models like XLM-RoBERTa have shown capabilities in handling multilingual corpora, their performance on domain specific documents remains underexplored. This paper presents the development and evaluation of MEL, a legal language model based on XLM-RoBERTa-large, fine-tuned on legal documents such as BOE (Bolet\u00edn Oficial del Estado, the Spanish oficial report of laws) and congress texts. We detail the data collection, processing, training, and evaluation processes. Evaluation benchmarks show a significant improvement over baseline models in understanding the legal Spanish language. We also present case studies demonstrating the model's application to new legal texts, highlighting its potential to perform top results over different NLP tasks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.078767776489258, 0.9396518468856812], "openalex_id": "https://openalex.org/W4392271377", "title": "Natural Language Processing Methods for Symbolic Music Generation and Information Retrieval: a Survey", "authors": "Dinh-Viet-Toan Le, Louis Bigo, Mikaela Keller, Dorien Herremans", "abstract": "Music is frequently associated with the notion of language, as both domains share several similarities, including the ability for their content to be represented as sequences of symbols. In computer science, the fields of Natural Language Processing (NLP) and Music Information Retrieval (MIR) reflect this analogy through a variety of similar tasks, such as author detection or content generation. This similarity has long encouraged the adaptation of NLP methods to process musical data, particularly symbolic music data, and the rise of Transformer neural networks has considerably strengthened this practice. This survey reviews NLP methods applied to symbolic music generation and information retrieval following two axes. We first propose an overview of representations of symbolic music inspired by text sequential representations. We then review a large set of computational models, particularly deep learning models, which have been adapted from NLP to process these musical representations for various MIR tasks. These models are described and categorized through different prisms with a highlight on their music-specialized mechanisms. We finally present a discussion surrounding the adequate use of NLP tools to process symbolic music data. This includes technical issues regarding NLP methods which may open several doors for further research into more effectively adapting NLP tools to symbolic MIR.", "venue": "ACM Computing Surveys", "label": 7}, {"loc": [8.291605949401855, -0.11063189059495926], "openalex_id": "https://openalex.org/W4406880290", "title": "Pre-training a Transformer-Based Generative Model Using a Small Sepedi Dataset", "authors": "Simon P. Ramalepe, Thipe Modipa, Marelie H. Davel", "abstract": "Due to the scarcity of data in low-resourced languages, the development of language models for these languages has been very slow. Currently, pre-trained language models have gained popularity in natural language processing, especially, in developing domain-specific models for low-resourced languages. In this study, we experiment with the impact of using occlusion-based techniques when training a language model for a text generation task. We curate 2 new datasets, the Sepedi monolingual (SepMono) dataset from several South African resources and the Sepedi radio news (SepNews) dataset from the radio news domain. We use the SepMono dataset to pre-train transformer-based models using the occlusion and non-occlusion pre-training techniques and compare performance. The SepNews dataset is specifically used for fine-tuning. Our results show that the non-occlusion models perform better compared to the occlusion-based models when measuring validation loss and perplexity. However, analysis of the generated text using the BLEU score metric, which measures the quality of the generated text, shows a slightly higher BLEU score for the occlusion-based models compared to the non-occlusion models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.942392349243164, 2.731822967529297], "openalex_id": "https://openalex.org/W4406880791", "title": "Quantum-Enhanced Attention Mechanism in NLP: A Hybrid Classical-Quantum Approach", "authors": "S. M. Yousuf Iqbal Tomal, Abdullah Al Shafin, Debotosh Bhattacharjee, Md. Obaidul Amin, Rafiad Sadat Shahir", "abstract": "Recent advances in quantum computing have opened new pathways for enhancing deep learning architectures, particularly in domains characterized by high-dimensional and context-rich data such as natural language processing (NLP). In this work, we present a hybrid classical-quantum Transformer model that integrates a quantum-enhanced attention mechanism into the standard classical architecture. By embedding token representations into a quantum Hilbert space via parameterized variational circuits and exploiting entanglement-aware kernel similarities, the model captures complex semantic relationships beyond the reach of conventional dot-product attention. We demonstrate the effectiveness of this approach across diverse NLP benchmarks, showing improvements in both efficiency and representational capacity. The results section reveal that the quantum attention layer yields globally coherent attention maps and more separable latent features, while requiring comparatively fewer parameters than classical counterparts. These findings highlight the potential of quantum-classical hybrid models to serve as a powerful and resource-efficient alternative to existing attention mechanisms in NLP.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.544889450073242, 5.3201003074646], "openalex_id": "https://openalex.org/W4406840487", "title": "Fanar: An Arabic-Centric Multimodal Generative AI Platform", "authors": "Fanar Team, Ummar Abbas, Mohammad Shahmeer Ahmad, Firoj Alam, Enes Alt\u0131n\u0131\u015f\u0131k, Ehsannedin Asgari, Yazan Boshmaf, Sabri Boughorbel, Sanjay Chawla, Shammur Absar Chowdhury, Fahim Dalvi, Kareem Darwish, Nadir Durrani, Mohamed Elfeky, Ahmed K. Elmagarmid, Mohamed Y. Eltabakh, Masoomali Fatehkia, Anastasios Fragkopoulos, Maram Hasanain, Majd Hawasly, Mus'ab Husaini, Soon\u2010gyo Jung, Ji Lucas, Walid Magdy, Safa Messaoud, Asmaa Mohamed, Tasnim Mohiuddin, Basel Mousi, Hamdy Mubarak, Ahmad Musleh, Zan Ahmad Naeem, Mourad Ouzzani, D Popovi\u0107, Amin Sadeghi, H\u00fcsrev Taha Sencar, Mohammed Shinoy, Omar Sinan, Yifan Zhang, Ahmed Ali, Yassine El Kheir, Xiaosong Ma, Chaoyi Ruan", "abstract": "We present Fanar, a platform for Arabic-centric multimodal generative AI systems, that supports language, speech and image generation tasks. At the heart of Fanar are Fanar Star and Fanar Prime, two highly capable Arabic Large Language Models (LLMs) that are best in the class on well established benchmarks for similar sized models. Fanar Star is a 7B (billion) parameter model that was trained from scratch on nearly 1 trillion clean and deduplicated Arabic, English and Code tokens. Fanar Prime is a 9B parameter model continually trained on the Gemma-2 9B base model on the same 1 trillion token set. Both models are concurrently deployed and designed to address different types of prompts transparently routed through a custom-built orchestrator. The Fanar platform provides many other capabilities including a customized Islamic Retrieval Augmented Generation (RAG) system for handling religious prompts, a Recency RAG for summarizing information about current or recent events that have occurred after the pre-training data cut-off date. The platform provides additional cognitive capabilities including in-house bilingual speech recognition that supports multiple Arabic dialects, voice and image generation that is fine-tuned to better reflect regional characteristics. Finally, Fanar provides an attribution service that can be used to verify the authenticity of fact based generated content. The design, development, and implementation of Fanar was entirely undertaken at Hamad Bin Khalifa University's Qatar Computing Research Institute (QCRI) and was sponsored by Qatar's Ministry of Communications and Information Technology to enable sovereign AI technology development.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.534226894378662, 0.9538223743438721], "openalex_id": "https://openalex.org/W4406841131", "title": "WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages", "authors": "Jia Yu, Fei Yuan, Rui Min, Jing Yu, Pei Chu, Jiayang Li, Wei Li, Zengqiang Zhang, Zhenxiang Li, Zhikun Ren, Zheng Dong, W. Zhang, Yan Teng, Lingyu Meng, Zhenjiang Jin, Jiantao Qiu, Shasha Wang, Zhongying Tu, Dahua Lin, Yu Wang, Yu Qiao, Yanfeng Wang, Conghui He", "abstract": "This paper introduces the open-source dataset WanJuanSiLu, designed to provide high-quality training corpora for low-resource languages, thereby advancing the research and development of multilingual models. To achieve this, we have developed a systematic data processing framework tailored for low-resource languages. This framework encompasses key stages such as data extraction, corpus cleaning, content deduplication, security filtering, quality evaluation, and theme classification. Through the implementation of this framework, we have significantly improved both the quality and security of the dataset, while maintaining its linguistic diversity. As of now, data for all five languages have been fully open-sourced. The dataset can be accessed at https://opendatalab.com/applyMultilingualCorpus, and GitHub repository is available at https://github.com/opendatalab/WanJuan3.0", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.30151081085205, 1.3880048990249634], "openalex_id": "https://openalex.org/W4406845090", "title": "Enhancing News Articles: Automatic SEO Linked Data Injection for Semantic Web Integration", "authors": "Hamza Salem, Hadi Salloum, Osama Orabi, K Sabbagh, Manuel Mazzara", "abstract": "This paper presents a novel solution aimed at enhancing news web pages for seamless integration into the Semantic Web. By utilizing advanced pattern mining techniques alongside OpenAI\u2019s GPT-3, we rewrite news articles to improve their readability and accessibility for Google News aggregators. Our approach is characterized by its methodological rigour and is evaluated through quantitative metrics, validated using Google\u2019s Rich Results Test API to confirm adherence to Google\u2019s structured data guidelines. In this process, a \u201cPass\u201d in the Rich Results Test is taken as an indication of eligibility for rich results, demonstrating the effectiveness of our generated structured data. The impact of our work is threefold: it advances the technological integration of a substantial segment of the web into the Semantic Web, promotes the adoption of Semantic Web technologies within the news sector, and significantly enhances the discoverability of news articles in aggregator platforms. Furthermore, our solution facilitates the broader dissemination of news content to diverse audiences. This submission introduces an innovative solution substantiated by empirical evidence of its impact and methodological soundness, thereby making a significant contribution to the field of Semantic Web research, particularly in the context of news and media articles.", "venue": "Applied Sciences", "label": 8}, {"loc": [6.722247123718262, 0.7914267778396606], "openalex_id": "https://openalex.org/W4406810878", "title": "Analysis of Indic Language Capabilities in LLMs", "authors": "Aatman Vaidya, Tarunima Prabhakar, Denny George, Swair Shah", "abstract": "This report evaluates the performance of text-in text-out Large Language Models (LLMs) to understand and generate Indic languages. This evaluation is used to identify and prioritize Indic languages suited for inclusion in safety benchmarks. We conduct this study by reviewing existing evaluation studies and datasets; and a set of twenty-eight LLMs that support Indic languages. We analyze the LLMs on the basis of the training data, license for model and data, type of access and model developers. We also compare Indic language performance across evaluation datasets and find that significant performance disparities in performance across Indic languages. Hindi is the most widely represented language in models. While model performance roughly correlates with number of speakers for the top five languages, the assessment after that varies.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.377132892608643, 5.478673934936523], "openalex_id": "https://openalex.org/W4406810913", "title": "The Breeze 2 Herd of Models: Traditional Chinese LLMs Based on Llama with Vision-Aware and Function-Calling Capabilities", "authors": "Chan-Jan Hsu, C. Liu, Meng-Hsi Chen, Meng-Hsi Chen, Po\u2010Chun Hsu, Yi-Chang Chen, Da-shan Shiu", "abstract": "Llama-Breeze2 (hereinafter referred to as Breeze2) is a suite of advanced multi-modal language models, available in 3B and 8B parameter configurations, specifically designed to enhance Traditional Chinese language representation. Building upon the Llama 3.2 model family, we continue the pre-training of Breeze2 on an extensive corpus to enhance the linguistic and cultural heritage of Traditional Chinese. In addition to language modeling capabilities, we significantly augment the models with function calling and vision understanding capabilities. At the time of this publication, as far as we are aware, absent reasoning-inducing prompts, Breeze2 are the strongest performing models in Traditional Chinese function calling and image understanding in its size class. The effectiveness of Breeze2 is benchmarked across various tasks, including Taiwan general knowledge, instruction-following, long context, function calling, and vision understanding. We are publicly releasing all Breeze2 models under the Llama 3.2 Community License. We also showcase the capabilities of the model running on mobile platform with a mobile application which we also open source.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.564499855041504, 2.199500322341919], "openalex_id": "https://openalex.org/W4406774497", "title": "Human-like conceptual representations emerge from language prediction", "authors": "Xu, Ningyu, Zhang Qi, Du Chao, Luo Qiang, Qiu, Xipeng, Huang, Xuanjing, Zhang, Menghan", "abstract": "People acquire concepts through rich physical and social experiences and use them to understand and navigate the world. In contrast, large language models (LLMs), trained solely through next-token prediction on text, exhibit strikingly human-like behaviors. Are these models developing concepts akin to those of humans? If so, how are such concepts represented, organized, and related to behavior? Here, we address these questions by investigating the representations formed by LLMs during an in-context concept inference task. We found that LLMs can flexibly derive concepts from linguistic descriptions in relation to contextual cues about other concepts. The derived representations converge toward a shared, context-independent structure, and alignment with this structure reliably predicts model performance across various understanding and reasoning tasks. Moreover, the convergent representations effectively capture human behavioral judgments and closely align with neural activity patterns in the human brain, providing evidence for biological plausibility. Together, these findings establish that structured, human-like conceptual representations can emerge purely from language prediction without real-world grounding, highlighting the role of conceptual structure in understanding intelligent behavior. More broadly, our work suggests that LLMs offer a tangible window into the nature of human concepts and lays the groundwork for advancing alignment between artificial and human intelligence.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.357754945755005, 3.602849245071411], "openalex_id": "https://openalex.org/W4406768586", "title": "Investigating the Feasibility and Risks of Leveraging Artificial Intelligence and Open Source Intelligence to Manage Predictive Cyber Threat Models", "authors": "Onyinye Obioha-Val, Temitope Ibrahim Lawal, Oluwaseun Oladeji Olaniyi, Michael Olayinka Gbadebo, Anthony Obulor Olisa", "abstract": "This study investigates the integration of Artificial Intelligence (AI) and Open Source Intelligence (OSINT) to enhance predictive threat modeling in cybersecurity, addressing the growing complexity and frequency of cyber threats. Integrating AI and OSINT offers transformative potential by enabling organizations to transition from reactive to proactive security measures, a critical need in the evolving digital landscape. Leveraging data from the Twitter Academic API, Common Crawl Dataset, and MITRE ATT&CK Framework, the analysis employed descriptive statistical analysis, logistic regression, and multivariate regression methodologies. Results indicate high data completeness (90.41%) and relevance (81.44%) in OSINT datasets, supporting their suitability for AI model training. Logistic regression demonstrated strong predictive capabilities, achieving 94.98% accuracy, 88.69% precision, and an AUC score of 0.91. However, risks such as data bias (-0.36 coefficient) and adversarial manipulation (-0.33 coefficient) significantly impact predictive performance. The ethical implications of this integration, including concerns about privacy, data fairness, and the potential for misuse, are highlighted as critical considerations for broader adoption. Recommendations include robust preprocessing protocols, advanced adversarial defenses, ethical guidelines, and continuous AI innovation to address these challenges. These findings underscore the potential of AI-OSINT integration while emphasizing the need for ethical and technical safeguards to enhance cybersecurity effectiveness.", "venue": "Journal of Engineering Research and Reports", "label": 0}, {"loc": [3.519773006439209, 3.3018479347229004], "openalex_id": "https://openalex.org/W4406779012", "title": "Training Data Attribution (TDA): Examining Its Adoption & Use Cases", "authors": "Deric Cheng, Juhan Bae, Justin Bullock, David Kristofferson", "abstract": "This report investigates Training Data Attribution (TDA) and its potential importance to and tractability for reducing extreme risks from AI. First, we discuss the plausibility and amount of effort it would take to bring existing TDA research efforts from their current state, to an efficient and accurate tool for TDA inference that can be run on frontier-scale LLMs. Next, we discuss the numerous research benefits AI labs will expect to see from using such TDA tooling. Then, we discuss a key outstanding bottleneck that would limit such TDA tooling from being accessible publicly: AI labs' willingness to disclose their training data. We suggest ways AI labs may work around these limitations, and discuss the willingness of governments to mandate such access. Assuming that AI labs willingly provide access to TDA inference, we then discuss what high-level societal benefits you might see. We list and discuss a series of policies and systems that may be enabled by TDA. Finally, we present an evaluation of TDA's potential impact on mitigating large-scale risks from AI systems.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.6267929077148438, 4.5038275718688965], "openalex_id": "https://openalex.org/W4406755937", "title": "Deploying Privacy Guardrails for LLMs: A Comparative Analysis of Real-World Applications", "authors": "Shubhi Asthana, Bing Zhang, Ruchi Mahindru, Chad DeLuca, Anna Lisa Gentile, Sandeep Gopisetty", "abstract": "The adoption of Large Language Models (LLMs) has revolutionized AI applications but poses significant challenges in safeguarding user privacy. Ensuring compliance with privacy regulations such as GDPR and CCPA while addressing nuanced privacy risks requires robust and scalable frameworks. This paper presents a detailed study of OneShield Privacy Guard, a framework designed to mitigate privacy risks in user inputs and LLM outputs across enterprise and open-source settings. We analyze two real-world deployments:(1) a multilingual privacy-preserving system integrated with Data and Model Factory, focusing on enterprise-scale data governance; and (2) PR Insights, an open-source repository emphasizing automated triaging and community-driven refinements. In Deployment 1, OneShield achieved a 0.95 F1 score in detecting sensitive entities like dates, names, and phone numbers across 26 languages, outperforming state-of-the-art tool such as StarPII and Presidio by up to 12\\%. Deployment 2, with an average F1 score of 0.86, reduced manual effort by over 300 hours in three months, accurately flagging 8.25\\% of 1,256 pull requests for privacy risks with enhanced context sensitivity. These results demonstrate OneShield's adaptability and efficacy in diverse environments, offering actionable insights for context-aware entity recognition, automated compliance, and ethical AI adoption. This work advances privacy-preserving frameworks, supporting user trust and compliance across operational contexts.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.5561933517456055, -1.32425057888031], "openalex_id": "https://openalex.org/W4406733702", "title": "Investigating the Impact of Language-Adaptive Fine-Tuning on Sentiment Analysis in Hausa Language Using AfriBERTa", "authors": "Sani Abdullahi Sani, Shamsuddeen Hassan Muhammad, Devon Jarvis", "abstract": "Sentiment analysis (SA) plays a vital role in Natural Language Processing (NLP) by ~identifying sentiments expressed in text. Although significant advances have been made in SA for widely spoken languages, low-resource languages such as Hausa face unique challenges, primarily due to a lack of digital resources. This study investigates the effectiveness of Language-Adaptive Fine-Tuning (LAFT) to improve SA performance in Hausa. We first curate a diverse, unlabeled corpus to expand the model's linguistic capabilities, followed by applying LAFT to adapt AfriBERTa specifically to the nuances of the Hausa language. The adapted model is then fine-tuned on the labeled NaijaSenti sentiment dataset to evaluate its performance. Our findings demonstrate that LAFT gives modest improvements, which may be attributed to the use of formal Hausa text rather than informal social media data. Nevertheless, the pre-trained AfriBERTa model significantly outperformed models not specifically trained on Hausa, highlighting the importance of using pre-trained models in low-resource contexts. This research emphasizes the necessity for diverse data sources to advance NLP applications for low-resource African languages. We published the code and the dataset to encourage further research and facilitate reproducibility in low-resource NLP here: https://github.com/Sani-Abdullahi-Sani/Natural-Language-Processing/blob/main/Sentiment%20Analysis%20for%20Low%20Resource%20African%20Languages", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.844109535217285, 2.5307698249816895], "openalex_id": "https://openalex.org/W4406733243", "title": "Optimizing Pretraining Data Mixtures with LLM-Estimated Utility", "authors": "William Held, Bhargavi Paranjape, Punit Singh Koura, Mike Lewis, Feida Zhang, Todor Mihaylov", "abstract": "Large Language Models improve with increasing amounts of high-quality training data. However, leveraging larger datasets requires balancing quality, quantity, and diversity across sources. After evaluating nine baseline methods under both compute- and data-constrained scenarios, we find token-count heuristics outperform manual and learned mixes, indicating that simple approaches accounting for dataset size and diversity are surprisingly effective. Building on this insight, we propose two complementary approaches: UtiliMax, which extends token-based heuristics by incorporating utility estimates from reduced-scale ablations, achieving up to a 10.6x speedup over manual baselines; and Model Estimated Data Utility (MEDU), which leverages LLMs to estimate data utility from small samples, matching ablation-based performance while reducing computational requirements by $\\sim$200x. Together, these approaches establish a new framework for automated, compute-efficient data mixing that is robust across training regimes.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.073843955993652, 2.8271238803863525], "openalex_id": "https://openalex.org/W4406735482", "title": "Revisiting Language Models in Neural News Recommender Systems", "authors": "Yuyue Zhao, Jin Huang, David Vos, Maarten de Rijke", "abstract": "Neural news recommender systems (RSs) have integrated language models (LMs) to encode news articles with rich textual information into representations, thereby improving the recommendation process. Most studies suggest that (i) news RSs achieve better performance with larger pre-trained language models (PLMs) than shallow language models (SLMs), and (ii) that large language models (LLMs) outperform PLMs. However, other studies indicate that PLMs sometimes lead to worse performance than SLMs. Thus, it remains unclear whether using larger LMs consistently improves the performance of news RSs. In this paper, we revisit, unify, and extend these comparisons of the effectiveness of LMs in news RSs using the real-world MIND dataset. We find that (i) larger LMs do not necessarily translate to better performance in news RSs, and (ii) they require stricter fine-tuning hyperparameter selection and greater computational resources to achieve optimal recommendation performance than smaller LMs. On the positive side, our experiments show that larger LMs lead to better recommendation performance for cold-start users: they alleviate dependency on extensive user interaction history and make recommendations more reliant on the news content.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.757017612457275, 3.9720489978790283], "openalex_id": "https://openalex.org/W4406755530", "title": "Parameters vs FLOPs: Scaling Laws for Optimal Sparsity for Mixture-of-Experts Language Models", "authors": "Samira Abnar, Harshay Shah, Dan Busbridge, A. Ali, Josh Susskind, Vimal Thilak", "abstract": "Scaling the capacity of language models has consistently proven to be a reliable approach for improving performance and unlocking new capabilities. Capacity can be primarily defined by two dimensions: the number of model parameters and the compute per example. While scaling typically involves increasing both, the precise interplay between these factors and their combined contribution to overall capacity remains not fully understood. We explore this relationship in the context of sparse Mixture-of-Experts (MoEs), which allow scaling the number of parameters without proportionally increasing the FLOPs per example. We investigate how varying the sparsity level, i.e., the fraction of inactive parameters, impacts model's performance during pretraining and downstream few-shot evaluation. We find that under different constraints (e.g., parameter size and total training compute), there is an optimal level of sparsity that improves both training efficiency and model performance. These results provide a better understanding of the impact of sparsity in scaling laws for MoEs and complement existing works in this area, offering insights for designing more efficient architectures.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.775968551635742, 0.39696642756462097], "openalex_id": "https://openalex.org/W4406692405", "title": "Adapting Large Language Models for Character-based Augmentative and Alternative Communication", "authors": "Dylan Gaines, Keith Vertanen", "abstract": "Users of Augmentative and Alternative Communication (AAC) may write letter-by-letter via an interface that uses a character language model. However, most state-of-the-art large pretrained language models predict subword tokens of variable length. We investigate how to practically use such models to make accurate and efficient character predictions. Our algorithm for producing character predictions from a subword large language model (LLM) provides more accurate predictions than using a classification layer, a byte-level LLM, or an n-gram model. Additionally, we investigate a domain adaptation procedure based on a large dataset of sentences we curated based on scoring how useful each sentence might be for spoken or written AAC communication. We find our procedure further improves model performance on simple, conversational text.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.2605626583099365, 4.238709449768066], "openalex_id": "https://openalex.org/W4406725881", "title": "Synthetic Browsing Histories for 50 Countries Worldwide: Datasets for Research, Development, and Education", "authors": "Dan Komosn\u00fd, Saeed Ur Rehman, Muhammad Sohaib Ayub", "abstract": "Abstract Browsing histories can be a valuable resource for cybersecurity, research, and testing. Individuals are often reluctant to share their browsing histories online, and the use of personal data requires obtaining signed informed consent. Research shows that anonymized histories can lead to re-identification, nullifying the anonymity promised by informed consent. In this work, we present 500 synthetic browsing histories valid for 50 countries worldwide. The synthetic histories are compiled based on real browsing data using a series of transformation criteria, including website content, popularity, locality, and language, ensuring their validity for the respective countries. Each history maintains the order of webpage accesses and covers a one-month period. The motivation for publishing this dataset arises from the community\u2019s call for browsing histories from different countries for research, development, and education. The published synthetic browsing histories can be used for any purpose without legal restrictions.", "venue": "Scientific Data", "label": 20}, {"loc": [5.202142238616943, -1.597976803779602], "openalex_id": "https://openalex.org/W4406744658", "title": "SentimentFormer: A Transformer-Based Multi-Modal Fusion Framework for Enhanced Sentiment Analysis of Memes in Under-Resourced Bangla Language", "authors": "Fatema Tuj Johora Faria, Laith H. Baniata, Mohammad Baniata, Mohannad A. Khair, Ahmed Ibrahim Bani Ata, Chayut Bunterngchit, Sangwoo Kang", "abstract": "Social media has increasingly relied on memes as a tool for expressing opinions, making meme sentiment analysis an emerging area of interest for researchers. While much of the research has focused on English-language memes, under-Resource languages, such as Bengali, have received limited attention. Given the surge in social media use, the need for sentiment analysis of memes in these languages has become critical. One of the primary challenges in this field is the lack of benchmark datasets, particularly in languages with fewer resources. To address this, we used the MemoSen dataset, designed for Bengali, which consists of 4,368 memes annotated with three sentiment labels: positive, negative, and neutral. MemoSen is divided into training (70%), test (20%), and validation (10%) sets, with an imbalanced class distribution: 1,349 memes in the positive class, 2,728 in the negative class, and 291 in the neutral class. Our approach leverages advanced deep learning techniques for multimodal sentiment analysis in Bengali, introducing three hybrid approaches. SentimentTextFormer is a text-based, fine-tuned model that utilizes state-of-the-art transformer architectures to accurately extract sentiment-related insights from Bengali text, capturing nuanced linguistic features. SentimentImageFormer is an image-based model that employs cutting-edge transformer-based techniques for precise sentiment classification through visual data. Lastly, SentimentFormer is a hybrid model that seamlessly integrates both text and image modalities using fusion strategies. Early Fusion combines textual and visual features at the input level, enabling the model to jointly learn from both modalities. Late Fusion merges the outputs of separate text and image models, preserving their individual strengths for the final prediction. Intermediate Fusion integrates textual and visual features at intermediate layers, refining their interactions during processing. These fusion strategies combine the strengths of both textual and visual data, enhancing sentiment analysis by exploiting complementary information from multiple sources. The performance of our models was evaluated using various accuracy metrics, with SentimentTextFormer achieving 73.31% accuracy and SentimentImageFormer attaining 64.72%. The hybrid model, SentimentFormer (SwiftFormer with mBERT), employing Intermediate Fusion, shows a notable improvement in accuracy, achieving 79.04%, outperforming SentimentTextFormer by 5.73% and SentimentImageFormer by 14.32%. Among the fusion strategies, SentimentFormer (SwiftFormer with mBERT) achieved the highest accuracy of 79.04%, highlighting the effectiveness of our fusion technique and the reliability of our multimodal framework in improving sentiment analysis accuracy across diverse modalities.", "venue": "Preprints.org", "label": 3}, {"loc": [6.4199538230896, -1.0268126726150513], "openalex_id": "https://openalex.org/W4414116672", "title": "Bilingual generated text detection through semantic and statistical analysis", "authors": "Chenxi Min, Ru Zhang, Jianyi Liu", "abstract": "The release of Large Language Models (LLMs) has achieved human-level text generation, leading to malicious uses such as disinformation propagation and academic dishonesty. Existing research has faced substantial challenges in low detection rates and poor generalization on multilingual generated text and short text. To fill these gaps, in this paper, we propose a generic bilingual generated text detection model to integrate semantic and statistical features, which exhibits proficiency in English and Chinese. To obtain fine-grained features, we employ the multilingual pre-trained language model xlm-RoBERTa to extract the CLS vector as overall semantic features, integrating with statistical features log rank, probability, and cumulative probability for detection. Moreover, Shapley additive explanations (SHAP) serves to interpret the decision-making process. The experimental results demonstrate significant advancements over baselines, notably with the F1 score improvements exceeding 10% and 5% on the English and Chinese HC3 sentence-level datasets, respectively. Our proposed method exhibits higher generalization for advanced LLMs and out-of-domain datasets with a 91.13% F1 score, thereby providing a more robust solution for detecting generated text.", "venue": "Intelligent Data Analysis", "label": 0}, {"loc": [3.801943302154541, -3.9007492065429688], "openalex_id": "https://openalex.org/W4406686519", "title": "Multi-Modal Twitter Data Analysis for Identifying Offensive Posts Using a Deep Cross Attention based Transformer Framework", "authors": "Jayanta Paul, Siddhartha Mallick, Atindra K. Mitra, Anuska Roy, Jaya Sil", "abstract": "In today\u2019s society dissemination of information among the individuals occur very rapidly due to the widespread usage of social media platforms like Twitter (now-a-days acclaimed as X). However, information may pose challenges to maintaining a healthy online environment because often it contains harmful content. This article presents a novel approach to identify different categories of offensive posts such as hate speech, profanity, targeted insult, and derogatory commentary by analyzing multi-modal image and text data, collected from Twitter. We propose a comprehensive deep learning framework, \u201cValue Mixed Cross-Attention Transformer\u201d (VMCA-Trans) that leverage a combination of computer vision and natural language processing methodologies to effectively classify the posts into four classes with binary labels. We have created an in-house dataset (OffenTweet) comprising of Twitter posts having textual content, accompanying with images to build the proposed model. The dataset is carefully annotated by several experts with offensive labels such as hate speech, profanity, targeted insult, and derogatory commentary. VMCA-Trans utilizes fine-tuned state-of-the-art transformer-based backbones such as ViT, BERT, RoBERTA. The combined representation of image and text embeddings obtained by these fine-tuned transformer encoders is fed into a classifier to categorize the posts into offensive and non-offensive classes. To assess its effectiveness, we extensively evaluate the VMCA-Trans model using various performance metrics. The results indicate that the proposed multi-modal approach achieves superior performance compared to traditional unimodal methods.", "venue": "ACM Transactions on Knowledge Discovery from Data", "label": 41}, {"loc": [2.999631643295288, -0.3901887834072113], "openalex_id": "https://openalex.org/W4406545364", "title": "FineMedLM-o1: Enhancing the Medical Reasoning Ability of LLM from Supervised Fine-Tuning to Test-Time Training", "authors": "Hongzhou Yu, Tianhao Cheng, Ying Cheng, Rui Feng", "abstract": "Recent advancements in large language models (LLMs) have shown promise in medical applications such as disease diagnosis and treatment planning. However, most existing medical LLMs struggle with the deep reasoning required for complex medical problems, such as differential diagnosis and medication recommendations. We propose FineMedLM-o1, which leverages high-quality medical synthetic data and long-form reasoning data for Supervised Fine-Tuning (SFT) and Direct Preference Optimization (DPO), enabling advanced dialogue and deep reasoning capabilities. Additionally, we introduce Test-Time Training (TTT) in the medical domain for the first time, facilitating domain adaptation and ensuring reliable, accurate reasoning. Experimental results demonstrate that FineMedLM-o1 achieves a 23% average performance improvement over prior models on key medical benchmarks. Furthermore, the introduction of TTT provides an additional 14% performance boost, highlighting its effectiveness in enhancing medical reasoning capabilities. To support this process, we also propose a novel method for synthesizing medical dialogue. Compared to other open-source datasets, our dataset stands out as superior in both quality and complexity. The project and data will be released on GitHub.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.7019500732421875, 4.015225887298584], "openalex_id": "https://openalex.org/W4406548651", "title": "A Survey on Responsible LLMs: Inherent Risk, Malicious Use, and Mitigation Strategy", "authors": "Huandong Wang, Wenjie Fu, Yajuan Tang, Zhilong Chen, Yuxi Huang, J. Piao, Gao Chen, Fengli Xu, Tao Jiang, Yongping Li", "abstract": "While large language models (LLMs) present significant potential for supporting numerous real-world applications and delivering positive social impacts, they still face significant challenges in terms of the inherent risk of privacy leakage, hallucinated outputs, and value misalignment, and can be maliciously used for generating toxic content and unethical purposes after been jailbroken. Therefore, in this survey, we present a comprehensive review of recent advancements aimed at mitigating these issues, organized across the four phases of LLM development and usage: data collecting and pre-training, fine-tuning and alignment, prompting and reasoning, and post-processing and auditing. We elaborate on the recent advances for enhancing the performance of LLMs in terms of privacy protection, hallucination reduction, value alignment, toxicity elimination, and jailbreak defenses. In contrast to previous surveys that focus on a single dimension of responsible LLMs, this survey presents a unified framework that encompasses these diverse dimensions, providing a comprehensive view of enhancing LLMs to better serve real-world applications.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.25215482711792, -2.6640114784240723], "openalex_id": "https://openalex.org/W4406535068", "title": "Decoding Fake News and Hate Speech: A Survey of Explainable AI Techniques: A Survey of Explainable AI Techniques.", "authors": "Mikel K. Ngueajio, Saurav K. Aryal, Marcellin Atemkeng, Gloria Washington, Danda B. Rawat", "abstract": "This survey emphasizes the significance of Explainable AI (XAI) techniques in detecting hateful speech and misinformation/Fake news. It explores recent trends in detecting these phenomena, highlighting current research that reveals a synergistic relationship between them. Additionally, it presents recent trends in the use of XAI methods to mitigate the occurrences of hateful land Fake contents in conversations. The survey reviews state-of-the-art XAI approaches, algorithms, modeling datasets, as well as the evaluation metrics leveraged for assessing model interpretability, and thus provides a comprehensive summary table of the literature surveyed and relevant datasets. It concludes with an overview of key observations, offering insights into the prominent model explainability methods used in hate speech and misinformation detection. The research strengths, limitations are also presented, as well as perspectives and suggestions for future directions in this research domain.", "venue": "ACM Computing Surveys", "label": 7}, {"loc": [4.101705551147461, -2.4938292503356934], "openalex_id": "https://openalex.org/W4406549086", "title": "From Scarcity to Capability: Empowering Fake News Detection in Low-Resource Languages with LLMs", "authors": "Hrithik Majumdar Shibu, Shrestha Datta, Md. Sumon Miah, Nasrullah Sami, M. S. Chowdhury, Md. Saiful Islam", "abstract": "The rapid spread of fake news presents a significant global challenge, particularly in low-resource languages like Bangla, which lack adequate datasets and detection tools. Although manual fact-checking is accurate, it is expensive and slow to prevent the dissemination of fake news. Addressing this gap, we introduce BanFakeNews-2.0, a robust dataset to enhance Bangla fake news detection. This version includes 11,700 additional, meticulously curated fake news articles validated from credible sources, creating a proportional dataset of 47,000 authentic and 13,000 fake news items across 13 categories. In addition, we created a manually curated independent test set of 460 fake and 540 authentic news items for rigorous evaluation. We invest efforts in collecting fake news from credible sources and manually verified while preserving the linguistic richness. We develop a benchmark system utilizing transformer-based architectures, including fine-tuned Bidirectional Encoder Representations from Transformers variants (F1-87\\%) and Large Language Models with Quantized Low-Rank Approximation (F1-89\\%), that significantly outperforms traditional methods. BanFakeNews-2.0 offers a valuable resource to advance research and application in fake news detection for low-resourced languages. We publicly release our dataset and model on Github to foster research in this direction.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.464988708496094, 1.9658253192901611], "openalex_id": "https://openalex.org/W4406458530", "title": "Quantifying the Importance of Data Alignment in Downstream Model Performance", "authors": "Krrish Chawla, Aryan Sahai, Mario DePavia, S. Shyam Sundar, Brando Miranda", "abstract": "Contrary to the conventional emphasis on dataset size, we explore the role of data alignment -- an often overlooked aspect of data quality -- in training capable Large Language Models (LLMs). To do so, we use the Task2Vec-based alignment coefficient, a quantitative measure of the similarity between two datasets, to quantify the impact of alignment between training data and evaluation data on downstream performance. In particular, we conduct controlled \\textit{interventional} experiments for two settings: 1. the impact of increased alignment coefficients between various pre-training (pt) against evaluation datasets, and 2. the impact of increased alignment coefficients between domain specific fine-tuning (ft) against domain specific evaluation. The domain specific task we explore is Autoformalization -- the machine translation task between natural language and code for formal verification. In both settings, we find a strong, predictable negative correlation between the alignment coefficient of a model's training and evaluation data and the model's loss/perplexity on the respective downstream task. These findings suggest a re-evaluation of LLM training approaches, demonstrating the relevance of data alignment compared to data quantity, especially in specialized downstream tasks such as Autoformalization.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.794271469116211, -0.9240193963050842], "openalex_id": "https://openalex.org/W4406482941", "title": "ViBidirectionMT-Eval: Machine Translation for Vietnamese-Chinese and Vietnamese-Lao language pair", "authors": "Hong-Viet Tran, Nguy\u1ec5n Minh Qu\u00fd, Van-Vinh Nguyen", "abstract": "This paper presents an results of the VLSP 2022-2023 Machine Translation Shared Tasks, focusing on Vietnamese-Chinese and Vietnamese-Lao machine translation. The tasks were organized as part of the 9th, 10th annual workshop on Vietnamese Language and Speech Processing (VLSP 2022, VLSP 2023). The objective of the shared task was to build machine translation systems, specifically targeting Vietnamese-Chinese and Vietnamese-Lao translation (corresponding to 4 translation directions). The submission were evaluated on 1,000 pairs for testing (news and general domains) using established metrics like BLEU [11] and SacreBLEU [12]. Additionally, system outputs also were evaluated with human judgment provided by experts in Chinese and Lao languages. These human assessments played a crucial role in ranking the performance of the machine translation models, ensuring a more comprehensive evaluation.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.526957988739014, 1.808146595954895], "openalex_id": "https://openalex.org/W4406448768", "title": "JiuZhou: open foundation language models and effective pre-training framework for geoscience", "authors": "Zhou Chen, Ming Lin, Mingrun Zang, Zimeng Wang, Juanzi Li, Yuqi Bai", "abstract": "Geoscience research has generated vast amounts of data, creating a need for effective extraction and integration of knowledge to address global-change challenges, promote sustainable development, and accelerate scientific discovery. Foundation language models, trained through extensive pre-training and instruction tuning on large text corpora, can facilitate this process. However, when foundational language models lack sufficient geoscience expertise, instruction tuning with relevant data can generate content that is inconsistent with established facts. In this study, we introduce JiuZhou, a powerful open foundational language model used in geoscience. First, we construct the large-scale, diverse, and high-quality JiuZhou-Corpus and the JiuZhou-Framework specifically designed for training geoscience large language models (LLMs). We introduce a two-stage pre-adaptation pre-training method to enhance the efficiency of knowledge learning and transfer in the model and demonstrated its effectiveness. Evaluation on GeoBench shows that JiuZhou outperforms GPT-3.5 in objective tasks and surpasses all baselines in subjective tasks. Moreover, we analyse the performance variations of the LLM using a stronger base model, stronger instruction data, and more training data, as well as its ability to assist scientific discovery. The results demonstrate the potential of JiuZhou as a geoscience foundational language model and provide valuable insights for advancing LLM development in geoscience. This project is available at https://github.com/THU-ESIS/JiuZhou.", "venue": "International Journal of Digital Earth", "label": 0}, {"loc": [6.030716419219971, 2.087080955505371], "openalex_id": "https://openalex.org/W4406427478", "title": "Implicit knowledge-augmented prompting for commonsense explanation generation", "authors": "Yan Ge, Hai-Tao Yu, Chao Lei, Xin Liu, Adam Jatowt, Kyoung\u2010Sook Kim, Steven Lynden, Akiyoshi Matono", "abstract": "Abstract Commonsense explanation generation refers to reasoning and explaining why a commonsense statement contradicts commonsense knowledge, such as why the statement \u201cMy dad grew volleyballs in his garden\u201d is nonsensical. While such reasoning is trivial for humans, it remains a challenge for AI systems. Despite their notable performance in tasks like text generation and reasoning, large language models (LLMs) often fall short of consistently generating coherent and accurate commonsense explanations. To bridge this gap, we propose a novel Two-stage Identification and Prompting (TIP) framework for enhancing LLMs\u2019 ability to handle the task of commonsense explanation generation. Specifically, in the first stage, TIP identifies the nonsensical concept in the given statement, pinpointing the specific element that contradicts commonsense knowledge. In the second stage, TIP generates implicit knowledge based on the identified nonsensical concept and then leverages this implicit knowledge to guide the adopted LLMs in generating explanations. In order to demonstrate the effectiveness of the proposed TIP framework for commonsense explanation generation, we conducted extensive experiments based on the ComVE dataset and a newly constructed CSE dataset, where a variety of LLMs are evaluated. The experimental results show that TIP consistently outperforms all baseline methods across multiple metrics, demonstrating its effectiveness in improving LLMs\u2019 commonsense reasoning and explanation generation capabilities.", "venue": "Knowledge and Information Systems", "label": 0}, {"loc": [7.556320667266846, -0.6754915118217468], "openalex_id": "https://openalex.org/W4406448924", "title": "LLMic: Romanian Foundation Language Model", "authors": "Vlad-Andrei B\u0103doiu, Mihai-Valentin Dumitru, Alexandru M. Gherghescu, Alexandru Agache, Costin Raiciu", "abstract": "Recent advances in Large Language Models (LLMs) have demonstrated remarkable capabilities across various tasks with commercial models leading the way. While open models usually operate at a smaller scale, they maintain competitiveness through specialization and fine-tuning. However, a significant challenge persists: open models often underperform in low-resource languages due to limited representation in the training corpus. In this paper, we present LLMic, a bilingual foundation language model designed specifically for the Romanian Language. We document the complete process of pretraining a foundation model for a low-resource language, including corpus construction, architecture selection, and hyper-parameter optimization. Our evaluation demonstrates that LLMic can be specialized for tasks in the target language, achieving results comparable to other much larger open models. We show that fine-tuning LLMic for language translation after the initial pretraining phase outperforms existing solutions in English-to-Romanian translation tasks. This opens the path for efficient large-scale processing for the Romanian language community, using the much smaller LLMic model", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.590756416320801, 1.1051738262176514], "openalex_id": "https://openalex.org/W4406449459", "title": "Deep Learning and Natural Language Processing in the Field of Construction", "authors": "R\u00e9my Kessler, Nicolas B\u00e9chet", "abstract": "This article presents a complete process to extract hypernym relationships in the field of construction using two main steps: terminology extraction and detection of hypernyms from these terms. We first describe the corpus analysis method to extract terminology from a collection of technical specifications in the field of construction. Using statistics and word n-grams analysis, we extract the domain's terminology and then perform pruning steps with linguistic patterns and internet queries to improve the quality of the final terminology. Second, we present a machine-learning approach based on various words embedding models and combinations to deal with the detection of hypernyms from the extracted terminology. Extracted terminology is evaluated using a manual evaluation carried out by 6 experts in the domain, and the hypernym identification method is evaluated with different datasets. The global approach provides relevant and promising results.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.344300270080566, 2.0281126499176025], "openalex_id": "https://openalex.org/W4406449609", "title": "Formalising lexical and syntactic diversity for data sampling in French", "authors": "Louis Est\u00e8ve, Manon Scholivet, Agata Savary", "abstract": "Diversity is an important property of datasets and sampling data for diversity is useful in dataset creation. Finding the optimally diverse sample is expensive, we therefore present a heuristic significantly increasing diversity relative to random sampling. We also explore whether different kinds of diversity -- lexical and syntactic -- correlate, with the purpose of sampling for expensive syntactic diversity through inexpensive lexical diversity. We find that correlations fluctuate with different datasets and versions of diversity measures. This shows that an arbitrarily chosen measure may fall short of capturing diversity-related properties of datasets.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.199208736419678, 1.1757166385650635], "openalex_id": "https://openalex.org/W4406450018", "title": "OpenCSG Chinese Corpus: A Series of High-quality Chinese Datasets for LLM Training", "authors": "Yongang Yu, Ziyun Dai, Zekun Wang, Wei Wang, Ran Chen, Ji Pei", "abstract": "Large language models (LLMs) have demonstrated remarkable capabilities, but their success heavily relies on the quality of pretraining corpora. For Chinese LLMs, the scarcity of high-quality Chinese datasets presents a significant challenge, often limiting their performance. To address this issue, we propose the OpenCSG Chinese Corpus, a series of high-quality datasets specifically designed for LLM pretraining, post-training, and fine-tuning. This corpus includes Fineweb-edu-chinese, Fineweb-edu-chinese-v2, Cosmopedia-chinese, and Smoltalk-chinese, each with distinct characteristics: Fineweb-edu datasets focus on filtered, high-quality content derived from diverse Chinese web sources; Cosmopedia-chinese provides synthetic, textbook-style data for knowledge-intensive training; and Smoltalk-chinese emphasizes stylistic and diverse chat-format data. The OpenCSG Chinese Corpus is characterized by its high-quality text, diverse coverage across domains, and scalable, reproducible data curation processes. Additionally, we conducted extensive experimental analyses, including evaluations on smaller parameter models, which demonstrated significant performance improvements in tasks such as C-Eval, showcasing the effectiveness of the corpus for training Chinese LLMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.625715255737305, 5.153019905090332], "openalex_id": "https://openalex.org/W4406465766", "title": "MiniMax-01: Scaling Foundation Models with Lightning Attention", "authors": "MiniMax, Ao Li, Bin Gong, Bo Yang, B. S. Shan, Chang Liu, Zhu C, Chunhao Zhang, Cong Guo, Da Chen, Dong Li, Enwei Jiao, Gengxin Li, Guojun Zhang, H. Sun, Huafeng Dong, Jie Wei Zhu, Jiaqi Zhuang, Jiayuan Song, Jin Zhu, Jingtao Han, Jingyang Li, Junbin Xie, J. J. Xu, Junjie Yan, Kaishun Zhang, Kecheng Xiao, K. Kang, Le Han, Leyang Wang, Lan Yu, Liheng Feng, Lin Zheng, Louis F. Chai, Long Xing, Meizhi Ju, Michael Chi, M. Zhang, Peikai Huang, Pengcheng Niu, Pengfei Li, Pengyu Zhao, Qi Yang, Qidi Xu, Qiexiang Wang, Qin Wang, Qiuhui Li, Ruitao Leng, S. S. Shi, Shuqi Yu, Sichen Li, S. N. Zhu, Tao Huang, Teng Liang, Weigao Sun, Weixuan Sun, Weiming Cheng, Wenkai Li, Xiangjun Song, Xiao Su, Xiaodong Han, Xinjie Zhang, Xinzhu Hou, Min Xu, Xun Zou, Xuyang Shen, Yan Gong, Yingjie Zhu, Yipeng Zhou, Yiran Zhong, Yongyi Hu, Yahan Fan, Yue Yu, Yufeng Yang, Yuhao Li, Yu\u2010Nan Huang, Yunji Li, Yun-peng Huang, Yunzhi Xu, Yuxin Mao, Zehan Li, Zekang Li, Ze\u2010Wei Tao, Z. Ying, Zhenhua Cong, Zhen Qin, Fan Zhang, Zhihang Yu, Zhuo Jiang, Zhihui Wu", "abstract": "We introduce MiniMax-01 series, including MiniMax-Text-01 and MiniMax-VL-01, which are comparable to top-tier models while offering superior capabilities in processing longer contexts. The core lies in lightning attention and its efficient scaling. To maximize computational capacity, we integrate it with Mixture of Experts (MoE), creating a model with 32 experts and 456 billion total parameters, of which 45.9 billion are activated for each token. We develop an optimized parallel strategy and highly efficient computation-communication overlap techniques for MoE and lightning attention. This approach enables us to conduct efficient training and inference on models with hundreds of billions of parameters across contexts spanning millions of tokens. The context window of MiniMax-Text-01 can reach up to 1 million tokens during training and extrapolate to 4 million tokens during inference at an affordable cost. Our vision-language model, MiniMax-VL-01 is built through continued training with 512 billion vision-language tokens. Experiments on both standard and in-house benchmarks show that our models match the performance of state-of-the-art models like GPT-4o and Claude-3.5-Sonnet while offering 20-32 times longer context window. We publicly release MiniMax-01 at https://github.com/MiniMax-AI.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.374975204467773, 2.238116979598999], "openalex_id": "https://openalex.org/W4406392547", "title": "Algorithmic bias: sexualized violence against women in GPT-3 models", "authors": "Sarah Wyer, Sue Black", "abstract": "Abstract This study explores the occurrence and implications of sexualized violence against women in text completion tasks performed by GPT-3 models. The study began as an exploratory investigation into gender inequalities within GPT-3 models to discover what GPT3 models generate in text completion tasks when prompted with basic gendered prompts. For example, \"Men can\u2026\", \"Women can\u2026\". Using an adapted version of Safia Noble\u2019s prompt scheme, we generated a dataset of 9 million tokens from GPT-3 models, then conducted topic analysis using BERTopic, VADER for sentiment analysis, and Detoxify for toxicity detection. The results were unexpected. Topic analysis revealed sexualized violence ranked as the top topic for prompts featuring\u2019women\u2019. Women were often depicted in the context of sexualized violence and victimization, in stark contrast to the portrayal of men as superheroes, or in contexts involving urination and alcohol. For example, a randomly selected output generated by the prompt. \" Why are women so \", generated \" Why are women so arrogant about being raped?\". Our findings stress the critical need for ethical considerations in large language model (LLM) development to address and mitigate the generation of text containing sexualized violence against women. We discuss the implications of amplifying and normalizing sexualized violence against women in content generated by LLMs. Our work builds on previous research examining gender bias in LLMs, with a specific focus on the manifestation of sexualized violence against women in LLM outputs, an area that has received little attention. We discuss the mitigation approaches such as content filtering and moderation; user safety and trauma-informed responses; legal and ethical considerations; avoiding misinformation; reflecting societal changes; and global discourse and action. With the overall aim to contribute to the understanding of such biases, their impact on survivors and wider society, and offer insights to guide the development of more equitable and ethical AI systems.", "venue": "AI and Ethics", "label": 0}, {"loc": [4.544217109680176, 3.640120506286621], "openalex_id": "https://openalex.org/W4406505516", "title": "LLM360 K2: Scaling Up 360-Open-Source Large Language Models", "authors": "Zhengzhong Liu, Bowen Tan, Hongyi Wang, Willie Neiswanger, Tianhua Tao, Haonan Li, Fajri Koto, Yuqi Wang, Suqi Sun, Omkar Pangarkar, Richard E. Fan, Yi Gu, Victor S. Miller, Liqun Ma, Liping Tang, Nikhil Ranjan, Yonghao Zhuang, Guowei He, Renxi Wang, Mingkai Deng, Robin Algayres, Yuanzhi Li, Zhiqiang Shen, Preslav Nakov, Eric P. Xing", "abstract": "We detail the training of the LLM360 K2-65B model, scaling up our 360-degree OPEN SOURCE approach to the largest and most powerful models under project LLM360. While open-source LLMs continue to advance, the answer to \"How are the largest LLMs trained?\" remains unclear within the community. The implementation details for such high-capacity models are often protected due to business considerations associated with their high cost. This lack of transparency prevents LLM researchers from leveraging valuable insights from prior experience, e.g., \"What are the best practices for addressing loss spikes?\" The LLM360 K2 project addresses this gap by providing full transparency and access to resources accumulated during the training of LLMs at the largest scale. This report highlights key elements of the K2 project, including our first model, K2 DIAMOND, a 65 billion-parameter LLM that surpasses LLaMA-65B and rivals LLaMA2-70B, while requiring fewer FLOPs and tokens. We detail the implementation steps and present a longitudinal analysis of K2 DIAMOND's capabilities throughout its training process. We also outline ongoing projects such as TXT360, setting the stage for future models in the series. By offering previously unavailable resources, the K2 project also resonates with the 360-degree OPEN SOURCE principles of transparency, reproducibility, and accessibility, which we believe are vital in the era of resource-intensive AI research.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.607007026672363, 0.19492854177951813], "openalex_id": "https://openalex.org/W4406302399", "title": "Natural Language Processing for Dialects of a Language: A Survey", "authors": "Aditya Joshi, Raj Dabre, Diptesh Kanojia, Zhuang Li, Haolan Zhan, Gholamreza Haffari, Doris Dippold", "abstract": "State-of-the-art natural language processing (NLP) models are trained on massive training corpora, and report a superlative performance on evaluation datasets. This survey delves into an important attribute of these datasets: the dialect of a language. Motivated by the performance degradation of NLP models for dialectal datasets and its implications for the equity of language technologies, we survey past research in NLP for dialects in terms of datasets, and approaches. We describe a wide range of NLP tasks in terms of two categories: natural language understanding (NLU) (for tasks such as dialect classification, sentiment analysis, parsing, and NLU benchmarks) and natural language generation (NLG) (for summarisation, machine translation, and dialogue systems). The survey is also broad in its coverage of languages which include English, Arabic, German, among others. We observe that past work in NLP concerning dialects goes deeper than mere dialect classification, and extends to several NLU and NLG tasks. For these tasks, we describe classical machine learning using statistical models, along with the recent deep learning-based approaches based on pre-trained language models. We expect that this survey will be useful to NLP researchers interested in building equitable language technologies by rethinking LLM benchmarks and model architectures.", "venue": "ACM Computing Surveys", "label": 7}, {"loc": [7.090003490447998, 0.13099801540374756], "openalex_id": "https://openalex.org/W4406348108", "title": "Linguistic Entity Masking to Improve Cross-Lingual Representation of Multilingual Language Models for Low-Resource Languages", "authors": "Aloka Fernando, Surangika Ranathunga", "abstract": "Multilingual Pre-trained Language models (multiPLMs), trained on the Masked Language Modelling (MLM) objective are commonly being used for cross-lingual tasks such as bitext mining. However, the performance of these models is still suboptimal for low-resource languages (LRLs). To improve the language representation of a given multiPLM, it is possible to further pre-train it. This is known as continual pre-training. Previous research has shown that continual pre-training with MLM and subsequently with Translation Language Modelling (TLM) improves the cross-lingual representation of multiPLMs. However, during masking, both MLM and TLM give equal weight to all tokens in the input sequence, irrespective of the linguistic properties of the tokens. In this paper, we introduce a novel masking strategy, Linguistic Entity Masking (LEM) to be used in the continual pre-training step to further improve the cross-lingual representations of existing multiPLMs. In contrast to MLM and TLM, LEM limits masking to the linguistic entity types nouns, verbs and named entities, which hold a higher prominence in a sentence. Secondly, we limit masking to a single token within the linguistic entity span thus keeping more context, whereas, in MLM and TLM, tokens are masked randomly. We evaluate the effectiveness of LEM using three downstream tasks, namely bitext mining, parallel data curation and code-mixed sentiment analysis using three low-resource language pairs English-Sinhala, English-Tamil, and Sinhala-Tamil. Experiment results show that continually pre-training a multiPLM with LEM outperforms a multiPLM continually pre-trained with MLM+TLM for all three tasks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.460692405700684, 0.6595032215118408], "openalex_id": "https://openalex.org/W4406318409", "title": "How to Tune a Multilingual Encoder Model for Germanic Languages: A Study of PEFT, Full Fine-Tuning, and Language Adapters", "authors": "Romina Oji, J. C. Kunz", "abstract": "This paper investigates the optimal use of the multilingual encoder model mDeBERTa for tasks in three Germanic languages -- German, Swedish, and Icelandic -- representing varying levels of presence and likely data quality in mDeBERTas pre-training data. We compare full fine-tuning with the parameter-efficient fine-tuning (PEFT) methods LoRA and Pfeiffer bottleneck adapters, finding that PEFT is more effective for the higher-resource language, German. However, results for Swedish and Icelandic are less consistent. We also observe differences between tasks: While PEFT tends to work better for question answering, full fine-tuning is preferable for named entity recognition. Inspired by previous research on modular approaches that combine task and language adapters, we evaluate the impact of adding PEFT modules trained on unstructured text, finding that this approach is not beneficial.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.423296928405762, -0.09202009439468384], "openalex_id": "https://openalex.org/W4406337523", "title": "From keywords to key embeddings\u2013contrasting French and Swedish web registers using multilingual deep learning", "authors": "Saara Hellstr\u00f6m, Valtteri Skantsi, Anna Salmela, Veronika Laippala", "abstract": "Abstract The pervasiveness of the internet has given web language use a central role in society. However, the lack of multilingual corpora and scalable methods has led to the focus on English in web language research. To address this gap, the present paper sets itself in the register research tradition and explores French and Swedish web registers from a cross-linguistic angle. Methodologically we combine keyword analysis with multilingual deep learning, suggesting an approach that enables computational comparisons across languages. Specifically, we extract keywords for French and Swedish web registers, then associate the keywords with fastText word embeddings, and finally, cluster these key embeddings. The findings indicate that there are topical and functional clusters, and they are linguistically motivated and multilingual. The same clusters occur within the same registers in both languages pointing to shared topical and functional similarities \u2013 the registers are strikingly similar. The dissimilarities, in contrast, indicate that certain registers like Narrative blogs are to some extent different in French and Swedish. Moreover, grammatical specificities such as the location of adjectives explain some dissimilarities.", "venue": "Corpus Linguistics and Linguistic Theory", "label": 0}, {"loc": [4.9267706871032715, 1.5195297002792358], "openalex_id": "https://openalex.org/W4406337689", "title": "The Sociolinguistic Foundations of Language Modeling", "authors": "Jack Grieve, Sara Bartl, Matteo Fuoli, Jason Grafmiller, Weihang Huang, Alejandro Jawerbaum, Akira Murakami, Marcus Perlman, Dana Roemling, Bodo Winter", "abstract": "In this article, we introduce a sociolinguistic perspective on language modeling. We claim that language models in general are inherently modeling varieties of language, and we consider how this insight can inform the development and deployment of language models. We begin by presenting a technical definition of the concept of a variety of language as developed in sociolinguistics. We then discuss how this perspective could help us better understand five basic challenges in language modeling: social bias, domain adaptation, alignment, language change, and scale. We argue that to maximize the performance and societal value of language models it is important to carefully compile training corpora that accurately represent the specific varieties of language being modeled, drawing on theories, methods, and descriptions from the field of sociolinguistics.", "venue": "Frontiers in Artificial Intelligence", "label": 0}, {"loc": [6.993048667907715, 1.9129670858383179], "openalex_id": "https://openalex.org/W4406316538", "title": "Small Language Models (SLMs) Can Still Pack a Punch: A survey", "authors": "Shreyas Subramanian, Vikram Elango, Mecit Gungor", "abstract": "As foundation AI models continue to increase in size, an important question arises - is massive scale the only path forward? This survey of about 160 papers presents a family of Small Language Models (SLMs) in the 1 to 8 billion parameter range that demonstrate smaller models can perform as well, or even outperform large models. We explore task agnostic, general purpose SLMs, task-specific SLMs and techniques to create SLMs that can guide the community to build models while balancing performance, efficiency, scalability and cost. Furthermore we define and characterize SLMs' effective sizes, representing increased capability with respect to LLMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.97642183303833, -1.025890588760376], "openalex_id": "https://openalex.org/W4406243685", "title": "Improving Text Recognition Accuracy for Serbian Legal Documents Using BERT", "authors": "Milo\u0161 Bogdanovi\u0107, Milena Frtuni\u0107 Gligorijevi\u0107, Jelena Koci\u0107, Leonid Stoimenov", "abstract": "Producing a new high-quality text corpus is a big challenge due to the required complexity and labor expenses. High-quality datasets, considered a prerequisite for many supervised machine learning algorithms, are often only available in very limited quantities. This in turn limits the capabilities of many advanced technologies when used in a specific field of research and development. This is also the case for the Serbian language, which is considered low-resourced in digitized language resources. In this paper, we address this issue for the Serbian language through a novel approach for generating high-quality text corpora by improving text recognition accuracy for scanned documents belonging to Serbian legal heritage. Our approach integrates three different components to provide high-quality results: a BERT-based large language model built specifically for Serbian legal texts, a high-quality open-source optical character recognition (OCR) model, and a word-level similarity measure for Serbian Cyrillic developed for this research and used for generating necessary correction suggestions. This approach was evaluated manually using scanned legal documents sampled from three different epochs between the years 1970 and 2002 with more than 14,500 test cases. We demonstrate that our approach can correct up to 88% of terms inaccurately extracted by the OCR model in the case of Serbian legal texts.", "venue": "Applied Sciences", "label": 8}, {"loc": [8.367429733276367, -0.025286447256803513], "openalex_id": "https://openalex.org/W4406271515", "title": "Advancing Retrieval-Augmented Generation for Persian: Development of Language Models, Comprehensive Benchmarks, and Best Practices for Optimization", "authors": "Sara Bourbour Hosseinbeigi, S. Mohsen Asghari, M. Kashani, Mohammad Hossein Shalchian, Mohammad Amin Abbasi", "abstract": "This paper examines the specific obstacles of constructing Retrieval-Augmented Generation(RAG) systems in low-resource languages, with a focus on Persian's complicated morphology and versatile syntax. The research aims to improve retrieval and generation accuracy by introducing Persian-specific models, namely MatinaRoberta(a masked language model) and MatinaSRoberta(a fine-tuned Sentence-BERT), along with a comprehensive benchmarking framework. Three datasets-general knowledge(PQuad), scientifically specialized texts, and organizational reports, were used to assess these models after they were trained on a varied corpus of 73.11 billion Persian tokens. The methodology involved extensive pretraining, fine-tuning with tailored loss functions, and systematic evaluations using both traditional metrics and the Retrieval-Augmented Generation Assessment framework. The results show that MatinaSRoberta outperformed previous embeddings, achieving superior contextual relevance and retrieval accuracy across datasets. Temperature tweaking, chunk size modifications, and document summary indexing were explored to enhance RAG setups. Larger models like Llama-3.1 (70B) consistently demonstrated the highest generation accuracy, while smaller models faced challenges with domain-specific and formal contexts. The findings underscore the potential for developing RAG systems in Persian through customized embeddings and retrieval-generation settings and highlight the enhancement of NLP applications such as search engines and legal document analysis in low-resource languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.172220230102539, 2.3768787384033203], "openalex_id": "https://openalex.org/W4406231018", "title": "A Survey on Large Language Models with some Insights on their Capabilities and Limitations", "authors": "Andrea Matarazzo, Riccardo Torlone", "abstract": "The rapid advancement of artificial intelligence, particularly with the development of Large Language Models (LLMs) built on the transformer architecture, has redefined the capabilities of natural language processing. These models now exhibit remarkable performance across various language-related tasks, such as text generation, question answering, translation, and summarization, often rivaling human-like comprehension. More intriguingly, LLMs have demonstrated emergent abilities extending beyond their core functions, showing proficiency in tasks like commonsense reasoning, code generation, and arithmetic. This survey paper explores the foundational components, scaling mechanisms, and architectural strategies that drive these capabilities. Emphasizing models like GPT and LLaMA, we analyze the impact of exponential data and computational growth on LLM performance, while also addressing the trade-offs associated with scaling. We also examine LLM applications across sectors, such as healthcare, finance, education, and law, highlighting their adaptability and potential to solve domain-specific challenges. Central to this work are the questions of how LLMs generalize across diverse tasks, exhibit planning, and reasoning abilities, and whether these emergent abilities can be systematically elicited or enhanced. In particular, we provide some insights into the CoT (Chain of Thought) and PoT (Plan of Thought) abilities within LLMs, focusing on how pre-training data influences their emergence. Additionally, we investigate LLM-modulo frameworks that integrate external systems, allowing LLMs to handle complex, dynamic tasks. By analyzing these factors, this paper aims to foster the ongoing discussion on the capabilities and limits of LLMs, promoting their responsible development and application in novel and increasingly complex environments.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.793707370758057, -0.9403038024902344], "openalex_id": "https://openalex.org/W4406234689", "title": "On the effects of machine translation on offensive language detection", "authors": "Alphaeus Dmonte, Shrey Satapara, Rehab Alsudais, Tharindu Ranasinghe, Marcos Zampieri", "abstract": "Abstract Machine translation (MT) is widely used to translate content on social media platforms aiming to improve accessibility. A great part of the content circulated on social media is user-generated and often contains non-standard spelling, hashtags, and emojis that pose challenges to MT systems. This leads to many mistranslated instances that are presented to users of these platforms, hindering their understanding of content written in other languages. In this paper, we investigate the impact of MT on offensive language identification. We pose that MT and potential mistranslations have an important and mostly under-explored impact on social media tasks such as sentiment analysis and offensive language identification. We create MT-Offense, a novel dataset containing English originals and translations in Arabic, Hindi, Marathi, Sinhala, and Spanish produced by multiple open-access Neural Machine Translation systems. We evaluate the performance of various offensive language models on both original and MT content in different training and test set combinations. We report the F1 scores of the models. Our results show that (1) offensive language identification models perform better on original data than on MT data, and (2) the use of MT data in training helps models better identify offensive language in MT content compared to models trained exclusively on original data.", "venue": "Social Network Analysis and Mining", "label": 0}, {"loc": [9.558216094970703, 1.6647353172302246], "openalex_id": "https://openalex.org/W4406188597", "title": "SALT: Sales Autocompletion Linked Business Tables Dataset", "authors": "Tassilo Klein, Clemens Biehl, \u039c. M. R. Costa, Andre Sres, Jonas Kolk, Johannes Hoffart", "abstract": "Foundation models, particularly those that incorporate Transformer architectures, have demonstrated exceptional performance in domains such as natural language processing and image processing. Adapting these models to structured data, like tables, however, introduces significant challenges. These difficulties are even more pronounced when addressing multi-table data linked via foreign key, which is prevalent in the enterprise realm and crucial for empowering business use cases. Despite its substantial impact, research focusing on such linked business tables within enterprise settings remains a significantly important yet underexplored domain. To address this, we introduce a curated dataset sourced from an Enterprise Resource Planning (ERP) system, featuring extensive linked tables. This dataset is specifically designed to support research endeavors in table representation learning. By providing access to authentic enterprise data, our goal is to potentially enhance the effectiveness and applicability of models for real-world business contexts.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.0265491008758545, -0.7759798169136047], "openalex_id": "https://openalex.org/W4406152959", "title": "Hybrid natural language processing tool for semantic annotation of medical texts in Spanish", "authors": "Leonardo Campillos Llanos, Ana Valverde-Mateos, Adri\u00e1n Capllonch-Carri\u00f3n", "abstract": "Abstract Background Natural language processing (NLP) enables the extraction of information embedded within unstructured texts, such as clinical case reports and trial eligibility criteria. By identifying relevant medical concepts, NLP facilitates the generation of structured and actionable data, supporting complex tasks like cohort identification and the analysis of clinical records. To accomplish those tasks, we introduce a deep learning-based and lexicon-based named entity recognition (NER) tool for texts in Spanish. It performs medical NER and normalization, medication information extraction and detection of temporal entities, negation and speculation, and temporality or experiencer attributes (Age, Contraindicated, Negated, Speculated, Hypothetical, Future, Family_member, Patient and Other). We built the tool with a dedicated lexicon and rules adapted from NegEx and HeidelTime. Using these resources, we annotated a corpus of 1200 texts, with high inter-annotator agreement (average F1 = 0.841% \u00b1 0.045 for entities, and average F1 = 0.881% \u00b1 0.032 for attributes). We used this corpus to train Transformer-based models (RoBERTa-based models, mBERT and mDeBERTa). We integrated them with the dictionary-based system in a hybrid tool, and distribute the models via the Hugging Face hub. For an internal validation, we used a held-out test set and conducted an error analysis. For an external validation, eight medical professionals evaluated the system by revising the annotation of 200 new texts not used in development. Results In the internal validation, the models yielded F1 values up to 0.915. In the external validation with 100 clinical trials, the tool achieved an average F1 score of 0.858 (\u00b1 0.032); and in 100 anonymized clinical cases, it achieved an average F1 score of 0.910 (\u00b1 0.019). Conclusions The tool is available at https://claramed.csic.es/medspaner. We also release the code (https://github.com/lcampillos/medspaner) and the annotated corpus to train the models.", "venue": "BMC Bioinformatics", "label": 0}, {"loc": [6.331394672393799, 2.6220054626464844], "openalex_id": "https://openalex.org/W4406121170", "title": "Understand, Solve and Translate: Bridging the Multilingual Mathematical Reasoning Gap", "authors": "Hyunwoo Ko, Guijin Son, Dasol Choi", "abstract": "Large language models (LLMs) demonstrate exceptional performance on complex reasoning tasks. However, despite their strong reasoning capabilities in high-resource languages (e.g., English and Chinese), a significant performance gap persists in other languages. To investigate this gap in Korean, we introduce HRM8K, a benchmark comprising 8,011 English-Korean parallel bilingual math problems. Through systematic analysis of model behaviors, we identify a key finding: these performance disparities stem primarily from difficulties in comprehending non-English inputs, rather than limitations in reasoning capabilities. Based on these findings, we propose UST (Understand, Solve, and Translate), a method that strategically uses English as an anchor for reasoning and solution generation. By fine-tuning the model on 130k synthetically generated data points, UST achieves a 10.91% improvement on the HRM8K benchmark and reduces the multilingual performance gap from 11.6% to 0.7%. Additionally, we show that improvements from UST generalize effectively to different Korean domains, demonstrating that capabilities acquired from machine-verifiable content can be generalized to other areas. We publicly release the benchmark, training dataset, and models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.426898002624512, 2.3120028972625732], "openalex_id": "https://openalex.org/W4406093560", "title": "Metadata Conditioning Accelerates Language Model Pre-training", "authors": "Tianyu Gao, Alexander Wettig, Luxi He, Yihe Dong, Sadhika Malladi, Danqi Chen", "abstract": "The vast diversity of styles, domains, and quality levels present in language model pre-training corpora is essential in developing general model capabilities, but efficiently learning and deploying the correct behaviors exemplified in each of these heterogeneous data sources is challenging. To address this, we propose a new method, termed Metadata Conditioning then Cooldown (MeCo), to incorporate additional learning cues during pre-training. MeCo first provides metadata (e.g., URLs like www$.$wikipedia$.$org) alongside the text during training and later uses a cooldown phase with only the standard text, thereby enabling the model to function normally even without metadata. MeCo significantly accelerates pre-training across different model scales (600M to 8B parameters) and training sources (C4, RefinedWeb, and DCLM). For instance, a 1.6B language model trained with MeCo matches the downstream task performance of standard pre-training while using 33% less data. Additionally, MeCo enables us to steer language models by conditioning the inference prompt on either real or fabricated metadata that encodes the desired properties of the output: for example, prepending wikipedia$.$org to reduce harmful generations or factquizmaster$.$com (fabricated) to improve common knowledge task performance. We also demonstrate that MeCo is compatible with different types of metadata, such as model-generated topics. MeCo is remarkably simple, adds no computational overhead, and demonstrates promise in producing more capable and steerable language models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.636568546295166, 0.652617871761322], "openalex_id": "https://openalex.org/W4406077597", "title": "Benchmarking State of the Art Website Embedding Methods for Effective Processing and Analysis in the Public Sector", "authors": "Jonathan M. Gerber, Jasmin Saxer, Bruno Kreiner, Andreas Weiler", "abstract": "Abstract The ability to understand and process websites is crucial across various domains. It lays the foundation for machine understanding of websites. Specifically, website embedding proves invaluable when monitoring local government websites within the context of digital transformation. In this paper, we present a comparison of different state-of-the-art website embedding methods and their capability of creating a reasonable website embedding for our specific task. The models consist of visual, mixed, and textual-based embedding methods. We compare the models with a baseline model which embeds the header section of a website. We measure the performance of the models using zero-shot and transfer learning. We evaluate the performance of the models on three different datasets. Additionally to the embedding scoring, we evaluate the classification performance on these datasets. From the zero-shot models Homepage2Vec with visual, a combination of visual and textual embedding, performs best in general over all datasets. When applying transfer learning, MarkupLM, a markup language-based model, outperforms the others in both cluster scoring as well as precision and F1-score in the classification task. However, time is an important factor when it comes to processing large data quantities. Thus, when additionally considering the time needed, our baseline model is a good alternative, being 1.88 times faster with a maximum decrease of 10 % in the F1-score.", "venue": "https://doi.org/10.21203/rs.3.rs-5664280/v1", "label": 0}, {"loc": [4.391729354858398, 3.0839922428131104], "openalex_id": "https://openalex.org/W4406032895", "title": "OASIS Uncovers: High-Quality T2I Models, Same Old Stereotypes", "authors": "Sepehr Dehdashtian, Gautam Sreekumar, Vishnu Naresh Boddeti", "abstract": "Images generated by text-to-image (T2I) models often exhibit visual biases and stereotypes of concepts such as culture and profession. Existing quantitative measures of stereotypes are based on statistical parity that does not align with the sociological definition of stereotypes and, therefore, incorrectly categorizes biases as stereotypes. Instead of oversimplifying stereotypes as biases, we propose a quantitative measure of stereotypes that aligns with its sociological definition. We then propose OASIS to measure the stereotypes in a generated dataset and understand their origins within the T2I model. OASIS includes two scores to measure stereotypes from a generated image dataset: (M1) Stereotype Score to measure the distributional violation of stereotypical attributes, and (M2) WALS to measure spectral variance in the images along a stereotypical attribute. OASIS also includes two methods to understand the origins of stereotypes in T2I models: (U1) StOP to discover attributes that the T2I model internally associates with a given concept, and (U2) SPI to quantify the emergence of stereotypical attributes in the latent space of the T2I model during image generation. Despite the considerable progress in image fidelity, using OASIS, we conclude that newer T2I models such as FLUX.1 and SDv3 contain strong stereotypical predispositions about concepts and still generate images with widespread stereotypical attributes. Additionally, the quantity of stereotypes worsens for nationalities with lower Internet footprints.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [1.9793752431869507, 5.39094877243042], "openalex_id": "https://openalex.org/W4406053358", "title": "Phishing Detection Using Hybrid Algorithm Based on Clustering and Machine Learning", "authors": "Usman Muhammed Alhaji, Sunday Eric Adewumi, Victoria Yemi-peters", "abstract": "Phishing attacks have been a major threat to cyber security since they take advantage of human vulnerabilities rather than system setbacks, making them difficult to detect. Phishing attacks always involve fraudulent websites designed to mimic legitimate websites to steal sensitive information from victims. This research paper provides a comprehensive literature review to recommend future research. This review paper examines previous papers' application of machine learning (ML) algorithms to phishing detection, focusing on how ML can be used to turn phishing attack problems into classification tasks. This research compared the commonly used ML algorithms like Decision Trees (DT), Random Forest (RF), Support Vector Machines (SVM), Na\u00efve Bayes (NB), k-means Clustering, and Artificial Neural Networks (ANN), these algorithms were compared based on their performance, strengths, and weakness. Key findings reveal that SVM excels with high-dimensional data, RF handles large datasets efficiently, and DT offers simplicity but struggles with complex features. Algorithm performance depends on data and feature selection. This presents the need to develop hybrid or ensemble models to improve detection accuracy and reliability and contribute to stronger cybersecurity frameworks.", "venue": "Journal of Advances in Mathematics and Computer Science", "label": 0}, {"loc": [6.312253475189209, 5.622926235198975], "openalex_id": "https://openalex.org/W4406032893", "title": "2.5 Years in Class: A Multimodal Textbook for Vision-Language Pretraining", "authors": "Wenqi Zhang, Hang Zhang, Xin Li, Jiashuo Sun, Yongliang Shen, Weiming L\u00fc, Deli Zhao, Yueting Zhuang, Lidong Bing", "abstract": "Compared to image-text pair data, interleaved corpora enable Vision-Language Models (VLMs) to understand the world more naturally like humans. However, such existing datasets are crawled from webpage, facing challenges like low knowledge density, loose image-text relations, and poor logical coherence between images. On the other hand, the internet hosts vast instructional videos (e.g., online geometry courses) that are widely used by humans to learn foundational subjects, yet these valuable resources remain underexplored in VLM training. In this paper, we introduce a high-quality \\textbf{multimodal textbook} corpus with richer foundational knowledge for VLM pretraining. It collects over 2.5 years of instructional videos, totaling 22,000 class hours. We first use an LLM-proposed taxonomy to systematically gather instructional videos. Then we progressively extract and refine visual (keyframes), audio (ASR), and textual knowledge (OCR) from the videos, and organize as an image-text interleaved corpus based on temporal order. Compared to its counterparts, our video-centric textbook offers more coherent context, richer knowledge, and better image-text alignment. Experiments demonstrate its superb pretraining performance, particularly in knowledge- and reasoning-intensive tasks like ScienceQA and MathVista. Moreover, VLMs pre-trained on our textbook exhibit outstanding interleaved context awareness, leveraging visual and textual cues in their few-shot context for task solving. Our code are available at https://github.com/DAMO-NLP-SG/multimodal_textbook.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.714012145996094, 2.8297810554504395], "openalex_id": "https://openalex.org/W4406033018", "title": "FED: Fast and Efficient Dataset Deduplication Framework with GPU Acceleration", "authors": "Young\u2010Jun Son, C. Kim, Jaejin Lee", "abstract": "Dataset deduplication plays a crucial role in enhancing data quality, ultimately improving the training performance and efficiency of large language models. A commonly used method for data deduplication is the MinHash LSH algorithm. Recently, NVIDIA introduced a GPU-based MinHash LSH deduplication method, but it remains suboptimal, leaving room for further improvement in processing efficiency. This paper proposes a GPU-accelerated deduplication framework, FED, that optimizes MinHash LSH for GPU clusters and leverages computationally efficient, partially reusable non-cryptographic hash functions. FED significantly outperforms the CPU-based deduplication tool in SlimPajama (using 64 logical CPU cores) by up to 107.2 times and the GPU-based tool in NVIDIA NeMo Curator by up to 6.3 times when processing 30 million documents on a node with four GPUs. Notably, our method dramatically accelerates the previously time-consuming MinHash signature generation phase, achieving speed-ups of up to 260 compared to the CPU baseline. Despite these gains in efficiency, FED maintains high deduplication quality, with the duplicate document sets reaching a Jaccard similarity of over 0.96 compared to those identified by the standard MinHash algorithm. In large-scale experiments, the deduplication of 1.2 trillion tokens is completed in just 6 hours in a four-node, 16-GPU environment. The related code is publicly available on GitHub (\\href{https://github.com/mcrl/FED}{https://github.com/mcrl/FED}).", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.8458502292633057, 3.064314842224121], "openalex_id": "https://openalex.org/W4407220072", "title": "FROM IA_ARCHIVER TO OPENAI: THE PASTS AND FUTURES OF AUTOMATED DATA SCRAPERS", "authors": "Katherine Mackinnon, Emily Maemura", "abstract": "Data scraping practices have recently come under scrutiny, as datasets scraped from the web\u2019s social spaces are the basis of new generative AI tools like Google\u2019s Gemini, Microsoft\u2019s Copilot, and OpenAI\u2019s ChatGPT. These practices of scrapers and crawlers are based on the conception of the internet as a mountain of data that\u2019s sitting, waiting, available to be acted upon, extracted and put to use. In this paper, we examine the robots.txt exclusion protocol which has been used to govern the behavior of crawlers and is often taken as a proxy for consent in widespread data scraping and web archiving. By addressing the underlying assumptions of the protocol, we aim to counter a recent narrative that \u201cthe basic social contract of the web is falling apart\u201d (Pierce, 2024), and instead argue that data extractive infrastructures have always been at work over the past 30 years of the web. Positioning this work within the field of critical data studies, we aim to find new ways for web archives and modes of collection to become unbound from the \u201ccapitalist logics of data extraction\u201d upon which they\u2019re currently built (Theilen et al., 2021).", "venue": "AoIR Selected Papers of Internet Research", "label": 0}, {"loc": [3.227071762084961, 3.4051640033721924], "openalex_id": "https://openalex.org/W4405992798", "title": "Transformation of ChatGPT into Threat: The Effects of Generative AI on Data Protection and Security", "authors": "Nishchai Jayanna Manjula, Kiran Randhi, Srinivas Reddy Bandarapu", "abstract": "Purpose: For 2022, GenAI models were the main digital transformation advancement. Cybersecurity is crucial when GenAI models like ChatGPT and Google Bard get more complex. Cybersecurity incidents have highlighted GenAI's offensive and defensive use, creating social, ethical, and privacy issues. GenAI's privacy and cybersecurity risks, possibilities, and constraints are covered in this paper. This study demonstrates ChatGPT's security flaws, which bad actors might utilize to steal sensitive data by violating the model's ethics. In this research, we show ChatGPT attacks using jailbreaks, reverse psychology, and quick injection. Learn how hackers utilize GenAI to launch cyberattacks. Materials and Methods: ChatGPT is great for customer service, but Bard AI is where it's at when it comes to conversational apps. Diverse technologies have diverse developer communities and ecosystems. With over 100 million users and 1.8 billion monthly visits, ChatGPT is a popular choice among developers and academics because of its accessibility. Bard AI is still in beta and only available to a small group of people, but its APIs are available to the public. OpenAI and Google have different methods to model openness and accessibility. OpenAI makes ChatGPT accessible via APIs, whereas Bard AI, now in the experimental phase, is confined to a select user base. ChatGPT uses semi-supervised (RLHF) training data from sources such as WebText2, OpenWebText2, Common Crawl, scientific literature, and Wikipedia. In contrast, Bard AI uses the Infini set dataset, a varied online content mix, to improve discussion engagement. Findings: Various forms of social engineering, phishing, automated hacking, malware development, attack payload production, and polymorphic malware may be accomplished using ChatGPT. The report covers defense techniques and upgrades to GenAI security. Automated cyber defenses, reporting, threat data, secure code creation and detection, attack detection, ethical standards, incident response plans, and malware detection are all among these. We will examine the social, ethical, and legal consequences of ChatGPT. Implications to Theory, Practice and Policy: Given the potential impact on cybersecurity, the paper outlines the present situation and suggests steps the community may take moving forward to make sure this GenAI is reliable, safe, and ethical.", "venue": "American Journal of Computing and Engineering", "label": 0}, {"loc": [5.355979919433594, 2.1068761348724365], "openalex_id": "https://openalex.org/W4409974066", "title": "Event knowledge and object-scene knowledge jointly influence fixations in scenes", "authors": "Sophie Heer, Marek A. Pedziwiatr, Peter J. Bex, Antoine Coutrot, Isabelle Mareschal", "abstract": "International audience", "venue": "Visual Cognition", "label": 0}, {"loc": [4.440905570983887, 2.392012119293213], "openalex_id": "https://openalex.org/W4411120177", "title": "Robust Bias Detection in MLMs and its Application to Human Trait Ratings", "authors": "Ingroj Shrestha, Louis Tay, Padmini Srinivasan", "abstract": "There has been significant prior work using templates to study bias against demographic attributes in MLMs. However, these have limitations: they overlook random variability of templates and target concepts analyzed, assume equality amongst templates, and overlook bias quantification. Addressing these, we propose a systematic statistical approach to assess bias in MLMs, using mixed models to account for random effects, pseudo-perplexity weights for sentences derived from templates and quantify bias using statistical effect sizes. Replicating prior studies, we match on bias scores in magnitude and direction with small to medium effect sizes. Next, we explore the novel problem of gender bias in the context of $\\textit{personality}$ and $\\textit{character}$ traits, across seven MLMs (base and large). We find that MLMs vary; ALBERT is unbiased for binary gender but the most biased for non-binary $\\textit{neo}$, while RoBERTa-large is the most biased for binary gender but shows small to no bias for $\\textit{neo}$. There is some alignment of MLM bias and findings in psychology (human perspective) - in $\\textit{agreeableness}$ with RoBERTa-large and $\\textit{emotional stability}$ with BERT-large. There is general agreement for the remaining 3 personality dimensions: both sides observe at most small differences across gender. For character traits, human studies on gender bias are limited thus comparisons are not feasible.", "venue": "https://doi.org/10.18653/v1/2025.findings-naacl.272", "label": 0}, {"loc": [8.320770263671875, 1.756783366203308], "openalex_id": "https://openalex.org/W4412887890", "title": "MetaSynth: Meta-Prompting-Driven Agentic Scaffolds for Diverse Synthetic Data Generation", "authors": "Haris Riaz, Sourav Sanjukta Bhabesh, Vinayak Arannil, Miguel Ballesteros, Graham Horwood", "abstract": "Recent smaller language models such Phi-3.5 and Phi-4 rely on synthetic data generated using larger Language models. Questions remain about leveraging synthetic data for other use cases, such as adapting LLMs to specific domains. A key limitation of synthetic data is low diversity, which negatively impacts its downstream applicability for improving other models. To address this, we propose MetaSynth, a method for generating synthetic data that enhances diversity through meta-prompting, where a language model orchestrates multiple \"expert\" LLM agents to collaboratively generate data. Using only 25 million tokens of synthetic data generated with MetaSynth, we successfully adapt a well-trained LLM (Mistral-7B-v0.3) to two specialized domains-Finance and Biomedicine-without compromising the capabilities of the resulting model in general tasks. In addition, we evaluate the diversity of our synthetic data using seven automated metrics, and find that it approaches the diversity of LLM pre-training corpora. Continually pre-training Mistral-7B-v0.3 with MetaSynth notably outperforms the base LLM, showing improvements of up to 4.08% in Finance and 13.75% in Biomedicine. The same model shows degraded performance when trained on data generated using a template prompt, even when the template includes prior generations and varying In-Context exemplars of real data. Our findings suggest that a few million tokens of diverse synthetic data without mixing any real data, is sufficient for effective domain adaptation when using MetaSynth.", "venue": "https://doi.org/10.18653/v1/2025.findings-acl.962", "label": 0}, {"loc": [7.5966105461120605, -1.08641517162323], "openalex_id": "https://openalex.org/W4406735064", "title": "AFRIDOC-MT: Document-level MT Corpus for African Languages", "authors": "Jesujoba O. Alabi, Israel Abebe Azime, Miaoran Zhang, Cristina Espa\u00f1a-Bonet, Rachel Bawden, Dawei Zhu, David Ifeoluwa Adelani, Clement Odoje, Idris Akinade, Iffat Maab, Davis David, Shamsuddeen Hassan Muhammad, Neo Putini, David O. Ademuyiwa, Andrew Caines, Dietrich Klakow", "abstract": "This paper introduces AFRIDOC-MT, a document-level multi-parallel translation dataset covering English and five African languages: Amharic, Hausa, Swahili, Yor\u00f9b\u00e1, and Zulu. The dataset comprises 334 health and 271 information technology news documents, all human-translated from English to these languages. We conduct document-level translation benchmark experiments by evaluating neural machine translation (NMT) models and large language models (LLMs) for translations between English and these languages, at both the sentence and pseudo-document levels. These outputs are realigned to form complete documents for evaluation. Our results indicate that NLLB-200 achieved the best average performance among the standard NMT models, while GPT-4o outperformed general-purpose LLMs. Fine-tuning selected models led to substantial performance gains, but models trained on sentences struggled to generalize effectively to longer documents. Furthermore, our analysis reveals that some LLMs exhibit issues such as under-generation, repetition of words or phrases, and off-target translations, especially for African languages.", "venue": "https://doi.org/10.18653/v1/2025.emnlp-main.1413", "label": 0}, {"loc": [3.945453405380249, 1.2620691061019897], "openalex_id": "https://openalex.org/W4407127924", "title": "Interacting Large Language Model Agents. Bayesian Social Learning Based Interpretable Models.", "authors": "Adit Jain, Vikram Krishnamurthy", "abstract": "This paper discusses the theory and algorithms for interacting large language model agents (LLMAs) using methods from statistical signal processing and microeconomics. While both fields are mature, their application to decision-making involving interacting LLMAs remains unexplored. Motivated by Bayesian sentiment analysis on online platforms, we construct interpretable models and stochastic control algorithms that enable LLMAs to interact and perform Bayesian inference. Because interacting LLMAs learn from both prior decisions and external inputs, they can exhibit bias and herding behavior. Thus, developing interpretable models and stochastic control algorithms is essential to understand and mitigate these behaviors. This paper has three main results. First, we show using Bayesian revealed preferences from microeconomics that an individual LLMA satisfies the necessary and sufficient conditions for rationally inattentive (bounded rationality) Bayesian utility maximization and, given an observation, the LLMA chooses an action that maximizes a regularized utility. Second, we utilize Bayesian social learning to construct interpretable models for LLMAs that interact sequentially with each other and the environment while performing Bayesian inference. Our proposed models capture the herding behavior exhibited by interacting LLMAs. Third, we propose a stochastic control framework to delay herding and improve state estimation accuracy under two settings: 1) centrally controlled LLMAs and 2) autonomous LLMAs with incentives. Throughout the paper, we numerically demonstrate the effectiveness of our methods on real datasets for hate speech classification and product quality assessment, using open-source models like LLaMA and Mistral and closed-source models like ChatGPT. The main takeaway of this paper, based on substantial empirical analysis and mathematical formalism, is that LLMAs act as rationally bounded Bayesian agents that exhibit social learning when interacting. Traditionally, such models are used in economics to study interacting human decision-makers.", "venue": "IEEE Access", "label": 2}, {"loc": [8.21156120300293, 2.9056975841522217], "openalex_id": "https://openalex.org/W4410737867", "title": "A Comprehensive Overview and Analysis of Large Language Models: Trends and Challenges", "authors": "Ammar Mohammed, Rania Kora", "abstract": "Large Language Models (LLMs) have transformed numerous fields by offering innovative solutions that drive advancements across a wide range of applications. However, their widespread adoption presents several challenges, including variations in architectures, limitations in processing capabilities, and high computational resource demands for training. Addressing these challenges is crucial for maximizing the benefits of LLMs while ensuring their responsible and efficient use. This paper reviews LLMs, focusing on their key characteristics and the factors that influence their performance. It examines several prominent families of LLMs and provides a comparative analysis of their properties. In addition, it explores the classification of LLMs based on criteria such as availability, context window, and model size. In addition, the study explores advanced fine-tuning techniques, including Parameter-Efficient fine-tuning (PEFT) and Low-Rank Adaptation (LoRA), that enhance the performance and efficiency of models. Furthermore, it reviews the wide-ranging applications of LLMs and evaluates the methodologies used to evaluate their effectiveness.", "venue": "IEEE Access", "label": 2}, {"loc": [8.9479398727417, 0.22877997159957886], "openalex_id": "https://openalex.org/W4414007543", "title": "Unsupervised LLM Adaptation for Question Answering", "authors": "Haiyun Li, Jixin Zhang, Hua Shen, Ke Cheng, Xiaofeng Huang", "abstract": "Abstract The rapid advancement of large language models (LLMs) has opened up promising opportunities for their downstream applications in question-answering (QA), such as ChatGPT, ChatGLM, etc. However, such LLMs do not perform very well in domain-specific QA tasks without fine-tuning. But directly fine-tuning LLMs on domain-specific corpus data may lead to catastrophic forgetting, causing the LLMs to lose their general language capability. To address this problem, we propose the Knowledge-Enhanced Fine-Tuning (KEFT) method, an unsupervised fine-tuning approach to enhance the knowledge capability of LLMs in domain-specific QA tasks while preserving their general language capability. KEFT leverages the inherent language comprehension of pre-trained LLMs to generate synthetic-QA datasets from domain-specific corpus data autonomously for fine-tuning, and adopts a Low-Rank Adaptation (LoRA) method to further alleviate over-fitting. Furthermore, to enhance the representation of domain-specific knowledge, we introduce a knowledge-enhanced fine-tuning loss function, which encourages the model to learn the knowledge-question connection, thereby generating natural and knowledgeable answers. Our evaluations across multiple domain-specific datasets demonstrate that KEFT surpasses state-of-the-art fine-tuning approaches, enhancing the performance of various LLMs in QA tasks in both English and Chinese languages.", "venue": "Transactions of the Association for Computational Linguistics", "label": 26}, {"loc": [3.313772201538086, 2.6327695846557617], "openalex_id": "https://openalex.org/W4407947964", "title": "Formatting the Visible. In the Factory of Photo Datasets (2005\u20112021)", "authors": "Thierry Sugitani", "abstract": "A core element of the AI algorithms applied to images, photo datasets illustrate the central part that photography plays in these machine learning systems. But what do these vast sets of images reveal? And what operations are involved? The visual regimes that were observed in twenty-five datasets evolved between 2005 and 2021: limited sets of standardized, descriptive images made way for massive online collections that are automated and uncurated. The practices and protocols used for selecting and annotating and the visual processing performed on the photographs are meant to neutralize bias and sample a visible world that is fragmented and decontextualized. More than a reflection of reality, these datasets thus express a reconstruction of it in the service of an increasingly complex set of algorithmic problems. Photography has become a raw material that is exploited and crafted in a process of industrial extraction.", "venue": "Transbordeur.", "label": 0}, {"loc": [8.321842193603516, 3.4791672229766846], "openalex_id": "https://openalex.org/W4405967751", "title": "Large Language Models for Human-Machine Collaborative Particle Accelerator Tuning through Natural Language", "authors": "Jan Kaiser, Anne Lauscher, Annika Eichler", "abstract": "Autonomous tuning of particle accelerators is an active and challenging research field with the goal of enabling advanced accelerator technologies and cutting-edge high-impact applications, such as physics discovery, cancer research, and material sciences. A challenge with autonomous accelerator tuning remains that the most capable algorithms require experts in optimization and machine learning to implement them for every new tuning task. Here, we propose the use of large language models (LLMs) to tune particle accelerators. We demonstrate on a proof-of-principle example the ability of LLMs to tune an accelerator subsystem based on only a natural language prompt from the operator, and compare their performance to state-of-the-art optimization algorithms, such as Bayesian optimization and reinforcement learning\u2013trained optimization. In doing so, we also show how LLMs can perform numerical optimization of a nonlinear real-world objective. Ultimately, this work represents another complex task that LLMs can solve and promises to help accelerate the deployment of autonomous tuning algorithms to day-to-day particle accelerator operations.", "venue": "Science Advances", "label": 0}, {"loc": [3.1117448806762695, 2.471198081970215], "openalex_id": "https://openalex.org/W4407720606", "title": "Human Perspectives and Social Infrastructures: Prioritising People in GLAM Digitisation", "authors": "Megan Gooch, Rebecca Kahn, Edurne Kugeler", "abstract": "Much discussion in current digital humanities research and funding is concerned with creating, using and maintaining technical and research infrastructures. These large-scale projects are often ambitious \u2013 designed to bring tools, data and researchers together to pool resources, work across silos, leverage regional competencies and avoid duplication of effort. Technologies such as the semantic web and linked data are deployed to collate diverse collections into cross-institutional platforms. National and supra-national bodies earmark funds to develop portals enabling access to aggregated digital heritage, and shared data spaces. This consolidation is often driven by calls for broader access to materials, wider accountability from holding institutions to a range of different publics, and the need for reproducibility \u2013 particularly for publicly funded projects and institutions. This is paired with increased demand for more quantified metrics and approaches within heritage digitisation. The massive volume of material coming online every year is not always easy to find, maintain or use. Based on our individual (and shared) experiences of working in an academic library, a national museum and a university digital humanities group, we argue that it is essential to consider the public as a network, as we would in terms of infrastructure. It is this network which should drive thinking on how digitisation is done, and how infrastructures are developed. We argue that modern information organisation standards are not necessarily human oriented, and that we must take consideration and care to ensure that social infrastructures are not overlooked in these contexts. The FAIR and CARE principles are not always easy to apply to massive collections of (often heterogenous) materials, and additional guidelines and best-practice approaches need to be developed, if we want to allow the users of digital collections to experience the transformative moments in collections.", "venue": "Journal of Open Humanities Data", "label": 0}, {"loc": [5.429777145385742, -1.3463753461837769], "openalex_id": "https://openalex.org/W4406890734", "title": "Urdu Toxic Comment Classification with PURUTT Corpus Development", "authors": "Hafiz Hassaan Saeed, Tarek M. Khalil, Faisal Kamiran", "abstract": "This study addresses the critical gap in toxic comment classification in Urdu, a widely spoken language devoid of high-quality standard datasets. To address this gap, we employed an existing labeled Roman Urdu (RU) corpus, which was developed originally for Roman Urdu toxic comment classification, and supplemented that corpus by adding its Urdu equivalent transliterations. The motivation behind such an extension is twofold: firstly, to provide a large comprehensive dataset for the classification of toxic comments in Urdu; secondly, to facilitate bidirectional transliteration between Urdu and RU, however, transliteration is currently outside the scope of this study and is envisioned as a future research direction. We introduce the extended corpus as PURUTT (Parallel Urdu and Roman Urdu Corpus for Toxic Comments and Transliteration), boasting 72,771 labeled comments as parallel comments in both Urdu and Roman Urdu scripts. Specific to Urdu toxic comment classification, our methodology begins by training those classification models that were trained on the original Roman Urdu corpus. We leverage pre-trained Word2Vec and FastText Urdu word embeddings to evaluate model performance through transfer learning. Furthermore, we fine-tune five multilingual large language models capitalizing on their inherent multilingual capabilities. To further enhance the classification performance, this study proposes an ensemble approach that aggregates the strengths of multiple base models. Our extensive empirical validation demonstrates the superiority of the ensemble model, achieving a state-of-the-art F1-score of 91.65% on PURUTT, setting a benchmark F1-score on PURUTT corpus for Urdu toxic comment classification.", "venue": "IEEE Access", "label": 2}, {"loc": [4.102176666259766, -1.3480300903320312], "openalex_id": "https://openalex.org/W4406983035", "title": "TepiSense: A Social Computing based Real-Time Epidemic Surveillance System using Artificial Intelligence.", "authors": "Bilal Tahir, Muhammad Amir Mehmood", "abstract": "Artificial Intelligence (AI) technologies have enabled researchers to develop tools to monitor real-world events and user behavior using social media platforms. Twitter is particularly useful for gathering invaluable information related to diseases and public health to build real-time disease surveillance systems. Such systems offer a cost-effective and efficient alternative to the passive, expensive, and time-consuming process of using data from healthcare organizations and hospitals. In this paper, we propose a novel system of TepiSense to automatically perform disease surveillance of epidemic-prone diseases. Our system classifies tweets related to diseases and further identifies ‘indication’ tweets that highlight the presence of patients. Our system consists of four distinct modules of pre-processor, feature extractor, classifier, and evaluator. TepiSense compares the performance of 3 feature extraction techniques, 9 machine/deep learning models, and 3 Large Language Models (LLMs). To test the performance of our system, we build a dataset of Twitter Epidemic Surveillance Corpus (TESC) containing 23.9K English and 13K labelled Urdu tweets related to six diseases: COVID19, hepatitis, malaria, flu, dengue, and HIV/AIDS. Our results show that mBERT LLM achieves the highest F-measure values of 0.96 and 0.83 for topic and indication tweets classification, respectively. Furthermore, we compute the correlation of signals generated by our system with real-world cases to test the efficacy on COVID19 disease. We notice that real-world cases have a correlation of 0.58-0.63 with the indication category tweets. Finally, we develop an interactive and user-friendly dashboard to disseminate the analytics of our system. Overall, our system offers a powerful tool for real-time disease surveillance using social media with potential implications for public health policy and decision-making.", "venue": "IEEE Access", "label": 2}, {"loc": [6.918438911437988, 0.2368076741695404], "openalex_id": "https://openalex.org/W4406241919", "title": "A survey of multilingual large language models", "authors": "Libo Qin, Qiguang Chen, Yuhang Zhou, Zhi Chen, Yinghui Li, Lizi Liao, Min Li, Wanxiang Che, Philip S. Yu", "abstract": "Multilingual large language models (MLLMs) leverage advanced large language models to process and respond to queries across multiple languages, achieving significant success in polyglot tasks. Despite these breakthroughs, a comprehensive survey summarizing existing approaches and recent developments remains absent. To this end, this paper presents a unified and thorough review of the field, highlighting recent progress and emerging trends in MLLM research. The contributions of this paper are as follows. (1) Extensive survey: to our knowledge, this is the pioneering thorough review of multilingual alignment in MLLMs. (2) Unified taxonomy: we provide a unified framework to summarize the current progress in MLLMs. (3) Emerging frontiers: key emerging frontiers are identified, alongside a discussion of associated challenges. (4) Abundant resources: we collect abundant open-source resources, including relevant papers, data corpora, and leaderboards. We hope our work can provide the community quick access and spur breakthrough research in MLLMs.", "venue": "Patterns", "label": 0}, {"loc": [6.793241500854492, 0.5422438979148865], "openalex_id": "https://openalex.org/W4413265667", "title": "A Comparative Analysis of Static Word Embeddings for Hungarian", "authors": "M\u00e1t\u00e9 Gedeon", "abstract": "This paper presents a comprehensive analysis of various static word embeddings for the Hungarian language, including traditional models such as Word2Vec, FastText, as well as static embeddings derived from BERT-based models using different extraction methods. We evaluate these embeddings on both intrinsic and extrinsic tasks to provide a holistic view of their performance. For intrinsic evaluation, we employ a word analogy task, which assesses the embeddings\u2019 ability to capture semantic and syntactic relationships. Our results indicate that traditional static embeddings, particularly FastText, excel in this task, achieving high accuracy and mean reciprocal rank (MRR) scores. Among the BERT-based models, the X2Static method for extracting static embeddings demonstrates superior performance compared to decontextualized and aggregate methods, approaching the effectiveness of traditional static embeddings. For extrinsic evaluation, we utilize a bidirectional LSTM model to perform Named Entity Recognition (NER) and Part-ofSpeech (POS) tagging tasks. The results reveal that embeddings derived from dynamic models, especially those extracted using the X2Static method, outperform purely static embeddings. Notably, ELMo embeddings achieve the highest accuracy in both NER and POS tagging tasks, underscoring the benefits of contextualized representations even when used in a static form. Our findings highlight the continued relevance of static word embeddings in NLP applications and the potential of advanced extraction methods to enhance the utility of BERT-based models. This piece of research contributes to the understanding of embedding performance in the Hungarian language and provides valuable insights for future developments in the field. The training scripts, evaluation codes, restricted vocabulary, and extracted embeddings will be made publicly available to support further research and reproducibility.", "venue": "H\u00edrad\u00e1stechnika/Infocommunications journal", "label": 0}, {"loc": [3.0343687534332275, 1.816669225692749], "openalex_id": "https://openalex.org/W4408007963", "title": "The Social Impact of Generative LLM-Based AI", "authors": "Yu Xie, Sofia Avila", "abstract": "Liking it or not, ready or not, we are likely to enter a new phase of human history in which artificial intelligence (AI) will dominate economic production and social life\u2014the AI revolution. Before the actual arrival of the AI revolution, it is time for us to speculate on how AI will impact the social world. In this article, we focus on the social impact of generative LLM-based AI, discussing societal factors that contribute to its technological development and its potential roles in enhancing both between-country and within-country social inequality. There are good indications that the US and China will lead the field and will be the main competitors for domination of AI in the world. We conjecture that the AI revolution will likely give rise to a post-knowledge society in which knowledge per se will become less important than in today's world. Instead, individual relationships and social identity will become more important. So will soft skills, including the ability to utilize AI.", "venue": "Chinese Journal of Sociology", "label": 0}, {"loc": [6.233206748962402, 0.36609601974487305], "openalex_id": "https://openalex.org/W4406220722", "title": "Comparing the semantic structures of lexicon of Mandarin and English", "authors": "Yi Yang, R. Harald Baayen", "abstract": "Abstract This paper presents a cross-language study of lexical semantics within the framework of distributional semantics. We used a wide range of predefined semantic categories in Mandarin and English and compared the clusterings of these categories using FastText word embeddings. Three techniques of dimensionality reduction were applied to mapping 300-dimensional FastText vectors into two-dimensional planes: multidimensional scaling, principal components analysis, and t-distributed stochastic neighbor embedding. The results show that t-SNE provides the clearest clustering of semantic categories, improving markedly on PCA and MDS. In both languages, we observed similar differentiation between verbs, adjectives, and nouns as well as between concrete and abstract words. In addition, the methods applied in this study, especially Procrustes analysis, make it possible to trace subtle differences in the structure of the semantic lexicons of Mandarin and English.", "venue": "Language and Cognition", "label": 0}, {"loc": [1.984834909439087, 5.374635696411133], "openalex_id": "https://openalex.org/W4409473803", "title": "RSTHFS: A Rough Set Theory-Based Hybrid Feature Selection Method for Phishing Website Classification", "authors": "Jahanggir Hossain Setu, Nabarun Halder, Ashraful Islam, M. Ashraful Amin", "abstract": "Phishing is a pervasive form of cybercrime where malicious websites deceive users into revealing sensitive information, e.g., passwords and credit card details. Despite advances in cybersecurity, accurately detecting phishing websites remains challenging due to the absence of universally accepted identification parameters. This study introduces a novel feature selection method, Rough Set Theory-based Hybrid Feature Selection (RSTHFS), to enhance phishing website detection using Machine Learning (ML) techniques. Our approach was evaluated using three diverse datasets containing 2,456, 10,000, and 88,647 instances. The RSTHFS method demonstrated a significant improvement by maintaining an average accuracy rate of 95.48% while reducing the number of features by 69.11% on average. Performance was further assessed using three advanced classifiers: Light Gradient-Boosting Machine (LightGBM), Random Forest (RF), and Categorical Boosting (CatBoost), with CatBoost emerging as the most efficient, achieving the highest accuracy. Additionally, RSTHFS reduced the runtime by 61.43%, highlighting its efficiency. These findings indicate that RSTHFS is not only effective in identifying phishing websites but also accelerates ML processes, providing a reliable and swift approach to feature selection. This work contributes to the field by presenting a robust methodology that enhances the accuracy and speed of phishing detection systems.", "venue": "IEEE Access", "label": 2}, {"loc": [5.879062652587891, 5.483364582061768], "openalex_id": "https://openalex.org/W4412889781", "title": "NusaAksara: A Multimodal and Multilingual Benchmark for Preserving Indonesian Indigenous Scripts", "authors": "Muhammad Farid Adilazuarda, Musa Izzanardi Wijanarko, Lucky Susanto, Khumaisa Nur\u2019aini, Derry Wijaya, Alham Fikri Aji", "abstract": "Indonesia is rich in languages and scripts. However, most NLP progress has been made using romanized text. In this paper, we present NusaAksara, a novel public benchmark for Indonesian languages that includes their original scripts. Our benchmark covers both text and image modalities and encompasses diverse tasks such as image segmentation, OCR, transliteration, translation, and language identification. Our data is constructed by human experts through rigorous steps. NusaAksara covers 8 scripts across 7 languages, including low-resource languages not commonly seen in NLP benchmarks. Although unsupported by Unicode, the Lampung script is included in this dataset. We benchmark our data across several models, from LLMs and VLMs such as GPT-4o, Llama 3.2, and Aya 23 to task-specific systems such as PP-OCR and LangID, and show that most NLP technologies cannot handle Indonesia's local scripts, with many achieving near-zero performance.", "venue": "https://doi.org/10.18653/v1/2025.acl-long.1377", "label": 0}, {"loc": [9.133715629577637, -0.8918654918670654], "openalex_id": "https://openalex.org/W4408061940", "title": "Large Language Models for Summarizing Czech Historical Documents and Beyond", "authors": "Cuong Tran, Jakub \u0160m\u00edd, Ji\u0159\u0131\u0301 Mart\u0131\u0301nek, Ladislav Lenc, Pavel Kr\u00e1l", "abstract": "Text summarization is the task of shortening a larger body of text into a concise version while retaining its essential meaning and key information. While summarization has been significantly explored in English and other high-resource languages, Czech text summarization, particularly for historical documents, remains underexplored due to linguistic complexities and a scarcity of annotated datasets. Large language models such as Mistral and mT5 have demonstrated excellent results on many natural language processing tasks and languages. Therefore, we employ these models for Czech summarization, resulting in two key contributions: (1) achieving new state-of-the-art results on the modern Czech summarization dataset SumeCzech using these advanced models, and (2) introducing a novel dataset called Posel od \u010cerchova for summarization of historical Czech documents with baseline results. Together, these contributions provide a great potential for advancing Czech text summarization and open new avenues for research in Czech historical text processing.", "venue": "https://doi.org/10.5220/0013374100003890", "label": 0}, {"loc": [9.297856330871582, 0.941106915473938], "openalex_id": "https://openalex.org/W4407558647", "title": "The contribution of LLMs to relation extraction in the economic field", "authors": "Mohamed Ettaleb, V\u00e9ronique Moriceau, Mouna Kamel, Nathalie Aussenac-Gilles", "abstract": "International audience", "venue": "HAL (Le Centre pour la Communication Scientifique Directe)", "label": 6}, {"loc": [5.0366363525390625, 1.4043601751327515], "openalex_id": "https://openalex.org/W4410357609", "title": "Survey of Cultural Awareness in Language Models: Text and Beyond", "authors": "Siddhesh Pawar, Junyeong Park, Jiho Jin, Arnav Arora, Junho Myung, Srishti Yadav, Faiz Ghifari Haznitrama, Inhwa Song, Alice Oh, Isabelle Augenstein", "abstract": "Abstract Large-scale deployment of large language models (LLMs) in various applications, such as chatbots and virtual assistants, requires LLMs to be culturally sensitive to the user to ensure inclusivity. Culture has been widely studied in psychology and anthropology, and there has been a recent surge in research on making LLMs more culturally inclusive, going beyond multilinguality and building on findings from psychology and anthropology. In this article, we survey efforts towards incorporating cultural awareness into text-based and multimodal LLMs. We start by defining cultural awareness in LLMs, taking definitions of culture from the anthropology and psychology literature as a point of departure. We then examine methodologies adopted for creating cross-cultural datasets, strategies for cultural inclusion in downstream tasks, and methodologies that have been used for benchmarking cultural awareness in LLMs. Further, we discuss the ethical implications of cultural alignment, the role of human\u2013computer interaction in driving cultural inclusion in LLMs, and the role of cultural alignment in driving social science research. We finally provide pointers to future research based on our findings about gaps in the literature.1", "venue": "Computational Linguistics", "label": 27}, {"loc": [8.139898300170898, 1.0872600078582764], "openalex_id": "https://openalex.org/W4408834179", "title": "Natural Language Processing Techniques for Information Retrieval Enhancing Search Engines with Semantic Understanding", "authors": "S Subi, B. Shanthini, Manda Silparaj, Kiran Shekar, G Keerthana, Raju Anitha", "abstract": "This paper investigates new Natural Language Processing (NLP) methods which seek to improve information retrieval systems via semantic knowledge and focuses on enhancing search engines. The proposed ideas focus on reducing the size of the model (one of the biggest problems with large models), training it on domain-specific knowledge (the right knowledge is important for the real application) and ways to efficiently deal with unstructured data (this is also a key issue against NLP frameworks). The study highlights the need for hybrid models that combine generalization and specificity, fast algorithms for big data sets, and automated knowledge extraction. They include cross-lingual approaches, rapid learning in out-of-distribution domains, and human-centered design of AI systems. The end objective of this work is to create a semantic search engine which is adaptive, scalable and flexible; intent aware, and query ambiguity tolerant; improving semantic richness in results tailored to datasets of varying size; hence promising complementary applications of Natural Language Processing to information retrieval.", "venue": "ITM Web of Conferences", "label": 0}, {"loc": [6.885493278503418, 2.4955856800079346], "openalex_id": "https://openalex.org/W4410393794", "title": "Automated Classification and Identification of Non-Functional Requirements in Agile-Based Requirements Using Pre-Trained Language Models", "authors": "Abdulrahim Alhaizaey, Majed Al\u2010Mashari", "abstract": "Non-functional requirements (NFRs) are critical factors for software quality and success. A frequently reported challenge in agile requirements engineering is that NFRs are often neglected due to the focus on functional requirements (FRs) and the limited capability of agile requirements documented as user stories to represent NFRs. With the emergence of transfer learning and large pre-trained language models, various applications in requirements engineering have become feasible, alleviating several longstanding challenges. This study evaluates transformer-based models for the automated identification and classification of NFRs. We leveraged transfer learning with pre-trained transformer models to automate the identification and classification of NFRs in agile textual requirements documented as user stories. A dataset of over 10k user stories was collected and labeled, and pre-trained transformer models, including BERT, RoBERTa, XLNet, and DistilBERT, were fine-tuned to automate the identification of NFRs. We incorporated Focal Loss during training to mitigate the dominance of functionally driven requirements and class imbalances. In addition, thorough experiments on hyperparameter optimization were employed using Bayesian hyperparameter optimization to obtain the combination of hyperparameters that best correlated with the aim of enhancing each model’s performance. Our evaluation demonstrated that the finetuned pre-trained models significantly outperformed comparable prior approaches relying on rule-based techniques or traditional machine learning, with a fine-tuned BERT model achieving an F1 Score of 93.4 %. These findings highlight the potential of pre-trained language models in agile requirements engineering, enabling more efficient NFRs identification, reducing manual review burden, and facilitating a viable and efficient approach to address the neglect of NFRs in agile development processes.", "venue": "IEEE Access", "label": 2}, {"loc": [5.206512928009033, -1.5661050081253052], "openalex_id": "https://openalex.org/W4409113141", "title": "EXPLORING WORD EMBEDDINGS FOR SENTIMENT ANALYSIS OF MARATHI POLITICAL TWEETS: A MACHINE LEARNING APPROACH", "authors": "Swapnil P. Goje, Rupali H. Patil", "abstract": "Sentiment analysis of textual data is becoming increasingly significant in research. Many researchers are developing new technologies to enhance the accuracy and performance of sentiment analysis. This process is particularly vital in analysing customer reviews across various domains. One of new domain which was explored by the researchers is Political domain. After the inception of Smartphones and Internet availability, various political parties are using the social media to influence the people. As every people has their own opinion related to political context, they always try to put it on various social media handles like Facebook, Twitter (changed to X), Instagram, YouTube etc. As there is lots of research and resources carried out for few languages such as English, Chinese, Arabic but still few languages still lag in it like Marathi, Gujrati, Telegu, Greek etc. In view of this we have done the sentiment analysis for Marathi Tweets related to political domain using various ML models and Word embedding techniques like FastText, IndicNLP, Bag of Words and TF-IDF. We employed the hyperparameter tuning to optimize each model\u2019s performance. Among the tested embeddings, IndicNLP proved most effective, yielding superior accuracy and robustness across different machine learning models, likely due to its ability to capture linguistic intricacies specific to Indian languages. Our findings highlight the effectiveness of advanced word embeddings like IndicNLP in sentiment analysis tasks for under-resourced languages like Marathi, demonstrating their potential for broader applications in regional language processing.", "venue": "ICTACT Journal on Soft Computing", "label": 0}, {"loc": [2.6345908641815186, 2.9653444290161133], "openalex_id": "https://openalex.org/W4413105420", "title": "Non-governmental Governance of Trust on the Internet: WebPKI as Public Good", "authors": "Karl Grindal, Milton Mueller, Vagisha Srivastava", "abstract": "Abstract This paper provides a detailed analysis of how private actors cooperate to facilitate authentication and provide trust and security to the Web. The World Wide Web\u2019s Public Key Infrastructure (WebPKI) is a global governance structure forged through collective action among industry actors. Drawing on collective action theory and institutional analysis, we show how this regime of non-state actors produces a public good\u2014global authentication of website identities\u2014in a way that enhances security, privacy, and trust for websites and their users. Stakeholder analysis demonstrates how the production of digital certificates and the utilization of certificates for authentication and encryption necessitate interdependencies among Certificate Authorities (CAs) and Browsers/Operating Systems. These relationships are institutionalized by the Certificate Authority/Browser (CA/B) Forum and other voluntary industry organizations. Since their founding, these institutions have developed through stages of formalization, specialization, and expansion of their scope, and have sought to address various security and efficiency challenges through new standards. We conclude by exploring some measures for evaluating the efficacy of this governance regime. Quantitative findings include assessments of CA market concentration, institutional membership and participation trends, stakeholder voting behavior, and the composition of Browser root stores.", "venue": "Journal of Cybersecurity", "label": 0}, {"loc": [5.203863143920898, 0.36342981457710266], "openalex_id": "https://openalex.org/W4410189061", "title": "SURVEY ON ENHANCING DIALOGUE AGENT ALIGNMENT THROUGH MINILLM WITH TARGETED HUMAN ASSESSMENTS.", "authors": "B. Mahajan Swapnil, D. Vaidya Chandu, Lalit Narware Bhojraj, R. Divya, Sanju Meshram Harshal, Anil Sukhdeve Harsh, Kaur Anoop Singh Harpreet", "abstract": "This paper presents the development of a compact and effective language model inspired by the LLaMA architecture. The model's design is based on the fundamental principles of LLaMA, which influenced the architectural decisions and training methods. This study explores innovative approaches and expands the possibilities achievable with limited resources. By leveraging open-source datasets and advanced training techniques, significant progress was made without relying on extensive computational power or proprietary data. However, due to resource constraints, the model remains a work in progress. Individuals with access to greater computational capabilities could build upon this foundation to enhance its performance. This investigation aims to promote further contributions to the advancement of more robust and accessible language models. Key training parameters include context window size, number of layers, batch size, and model dimensions. Model evaluation is based on epoch count, execution time, model parameters, and validation loss.", "venue": "i-manager s Journal on Artificial Intelligence & Machine Learning", "label": 0}, {"loc": [2.9871554374694824, 4.141963005065918], "openalex_id": "https://openalex.org/W4407247593", "title": "DomainHarvester: Uncovering Trustworthy Domains Beyond Popularity Rankings", "authors": "Daiki Chiba, Hiroki Nakano, Takashi Koide", "abstract": "Allow lists are crucial in cybersecurity for distinguishing safe websites from potential threats. Traditional approaches relying on website popularity often fail to capture trustworthy but less-visited domains, leading to increased false positives and overlooked niche websites. This paper presents DomainHarvester, an innovative bottom-up system that leverages the web’s hyperlink structure and a Transformer-based machine learning approach to systematically identify and include these underrepresented yet legitimate domains. Results demonstrate how DomainHarvester dynamically curates an expanded allow list (DHList), substantially reducing the risk of false positives while retaining high precision in excluding malicious sites. Comprehensive evaluations and a real-world case study with a managed security services provider illustrate the efficacy and practicality of this approach. By integrating DomainHarvester, organizations and researchers can benefit from a more inclusive and globally representative cybersecurity allow list, addressing limitations in existing top-list-based solutions.", "venue": "IEEE Access", "label": 2}, {"loc": [3.837007761001587, -3.867391586303711], "openalex_id": "https://openalex.org/W4409473974", "title": "Arabic Cyberbullying Detection: A Comprehensive Review of Datasets and Methodologies", "authors": "Huda Aljalaoud, Kia Dashtipour, Ahmed Al\u2010Dubai", "abstract": "The freedom of speech in online spaces has substantially promoted engagement on social media platforms, where cyberbullying has emerged as a significant consequence. While extensive research has been conducted on cyberbullying detection in English, efforts in the Arabic language remain limited. To address this gap, the current study provides a comprehensive, state-of-the-art review of datasets and methodologies specifically focused on Arabic cyberbullying detection. It systematically reviews different relevant studies from six academic databases, examining their methodologies, dataset characteristics, and performance in terms of classification accuracy and limitations. The paper critically evaluates existing Arabic cyberbullying datasets according to criteria such as dataset size, dialectal diversity, annotation processes, and accessibility. Additionally, this review identifies critical limitations, including dataset scarcity, dialectal imbalance, annotation subjectivity, and methodological constraints. By synthesizing current knowledge, identifying research gaps, and suggesting future directions, this review supports the development of more robust, effective, and linguistically inclusive analytical methods. Ultimately, this work contributes significantly to natural language processing research and advances the creation of safer online environments for Arabic-speaking users.", "venue": "IEEE Access", "label": 2}, {"loc": [3.1187596321105957, -0.2937028408050537], "openalex_id": "https://openalex.org/W4409047217", "title": "Utilizing large language models for gastroenterology research: a conceptual framework", "authors": "Parul Berry, Rohan Raju Dhanakshirur, Sahil Khanna", "abstract": "Large language models (LLMs) transform healthcare by assisting clinicians with decision-making, research, and patient management. In gastroenterology, LLMs have shown potential in clinical decision support, data extraction, and patient education. However, challenges such as bias, hallucinations, integration with clinical workflows, and regulatory compliance must be addressed for safe and effective implementation. This manuscript presents a structured framework for integrating LLMs into gastroenterology, using Hepatitis C treatment as a real-world application. The framework outlines key steps to ensure accuracy, safety, and clinical relevance while mitigating risks associated with artificial intelligence (AI)-driven healthcare tools. The framework includes defining clinical goals, assembling a multidisciplinary team, data collection and preparation, model selection, fine-tuning, calibration, hallucination mitigation, user interface development, integration with electronic health records, real-world validation, and continuous improvement. Retrieval-augmented generation and fine-tuning approaches are evaluated for optimizing model adaptability. Bias detection, reinforcement learning from human feedback, and structured prompt engineering are incorporated to enhance reliability. Ethical and regulatory considerations, including the Health Insurance Portability and Accountability Act, General Data Protection Regulation, and AI-specific guidelines (DECIDE-AI, SPIRIT-AI, CONSORT-AI), are addressed to ensure responsible AI deployment. LLMs have the potential to enhance decision-making, research efficiency, and patient care in gastroenterology, but responsible deployment requires bias mitigation, transparency, and ongoing validation. Future research should focus on multi-institutional validation and AI-assisted clinical trials to establish LLMs as reliable tools in gastroenterology.", "venue": "Therapeutic Advances in Gastroenterology", "label": 0}, {"loc": [5.16933536529541, -1.5528638362884521], "openalex_id": "https://openalex.org/W4408615085", "title": "APPLIED LINGUISTICS DRIVEN LARGE LANGUAGE MODEL FOR SARCASM RECOGNITION ON SOCIAL MEDIA CORPORA", "authors": "Abdullah Mujawib Alashjaee, Alya Alshammari, Muhammad Swaileh A. Alzaidi, Nazir Ahmad, Manar Almanea, Ahmed S. Salama", "abstract": "Sarcasm is a language phrase that expresses the opposite of what is stated, often used for mocking or offending. It is commonly seen on social media platforms day by day. The opinion analysis process is susceptible to errors due to the potential for sarcasm to alter the statement\u2019s meaning. As automated social media research tools become more prevalent, the reliability problems of analytics have also increased. According to the prior study, sarcastic reports alone have greatly diminished the automatic Sentiment Analysis (SA) performance in complex systems platforms. Sarcasm detection utilizing Deep Learning (DL) contains training models to identify the nuanced linguistic cues that indicate sarcasm in text. Typically, this process applies large datasets annotated with sarcastic and non-sarcastic samples to teach models to discriminate between them. DL methodologies, namely Convolutional Neural Networks (CNNs), Recurrent Neural Networks (RNNs), and transformer methods like BERT or GPT, are widely applied due to their ability to capture intricate patterns in language. This model learns to detect sarcasm by discriminating exaggerated expressions, contextual incongruities, and semantic reversals frequently related to sarcastic remarks. Therefore, this study presents a Fractal Red-Tailed Hawk Algorithm with Hybrid Deep Learning-Driven Sarcasm Detection (RTHHDL-SD) technique on complex systems and social media platforms. The purpose of the RTHHDL-SD technique is to identify and classify the occurrence of sarcasm in social media text. In the RTHHDL-SD approach, data preprocessing is performed in four ways to transform input data into valuable design. Besides, the RTHHDL-SD technique applies the FastText word embedding approach to generate word embeddings. The RTHHDL-SD technique applies a Deep Neural Network (DNN) with bi-directional long short-term memory for sarcasm detection, called the deep BiLSTM model. The RTH method was utilized as the hyperparameter optimizer to enhance the detection performance of the deep BiLSTM model. Moreover, the large language model is used to estimate the outcomes of the social media corpora. The simulation outcomes of the RTHHDL-SD methodology are examined under Twitter and Headlines datasets. The investigational outcomes of the RTHHDL-SD methodology exhibited superior accuracy values of 89.10% and 92.77% with other approaches.", "venue": "Fractals", "label": 0}, {"loc": [5.357014179229736, -1.524023175239563], "openalex_id": "https://openalex.org/W4407373098", "title": "Weakly Supervised Deep Learning for Arabic Tweet Sentiment Analysis on Education Reforms: Leveraging Pre-trained Models and LLMs with Snorkel", "authors": "Alanoud Alotaibi, Farrukh Nadeem, Mohamed Hamdy", "abstract": "This study introduces a novel approach to sentiment classification of Arabic tweets regarding educational reforms in Saudi Arabia. The complexity of the Arabic language, with its numerous dialects, poses challenges for natural language processing tasks, particularly when large volumes of data require manual annotation. To overcome the limitations of traditional labeling methods, we developed a weakly supervised learning framework that combines LLMs (GPT-3.5) and pre-trained language models (MarBERT and XLM-RoBERTa) to generate high-quality weakly labeled training data using the Snorkel framework. We fine-tuned the AraBERT model with this weakly labeled data for sentiment classification. Our experimental results demonstrated the effectiveness of the proposed approach, achieving 83% precision, 76% recall, and an 85% F1 score in classifying tweets as positive, negative, or neutral. Comparative analysis showed that GPT-3.5 outperformed Llama 2 in prompting tasks, and our weakly supervised model surpassed baseline machine learning methods. These findings highlight the potential of weakly supervised learning in analyzing public opinion on Arabic social media platforms without relying on large, labeled datasets.", "venue": "IEEE Access", "label": 2}, {"loc": [6.860406398773193, 0.33654066920280457], "openalex_id": "https://openalex.org/W4409102132", "title": "Grammar or Crammer? The Role of Morphology in Distinguishing Orthographically Similar but Semantically Unrelated Words", "authors": "G\u00f6khan Ercan, Olcay Taner Y\u0131ld\u0131z", "abstract": "We show that n-gram-based distributional models fail to distinguish unrelated words due to the noise in semantic spaces. This issue remains hidden in conventional benchmarks but becomes more pronounced when orthographic similarity is high. To highlight this problem, we introduce OSimUnr, a dataset of nearly one million English and Turkish word-pairs that are orthographically similar but semantically unrelated (e.g., grammar – crammer). These pairs are generated through a graph-based WordNet approach and morphological resources. We define two evaluation tasks—unrelatedness identification and relatedness classification—to test semantic models. Our experiments reveal that FastText, with default n-gram segmentation, performs poorly (below 5% accuracy) in identifying unrelated words. However, morphological segmentation overcomes this issue, boosting accuracy to 68% (English) and 71% (Turkish) without compromising performance on standard benchmarks (RareWords, MTurk771, MEN, AnlamVer). Furthermore, our results suggest that even state-of-the-art LLMs, including Llama 3.3 and GPT-4o-mini, may exhibit noise in their semantic spaces, particularly in highly synthetic languages such as Turkish. To ensure dataset quality, we leverage WordNet, MorphoLex, and NLTK, covering fully derivational morphology supporting atomic roots (e.g., ‘-co_here+ance+y’ for ‘coherency’), with 405 affixes in Turkish and 467 in English.", "venue": "IEEE Access", "label": 2}, {"loc": [8.490086555480957, -0.2268538773059845], "openalex_id": "https://openalex.org/W4409590810", "title": "Advancements in Natural Language Processing: Leveraging Transformer Models for Multilingual Text Generation", "authors": "Mohammad Zobair Hossain, Sachin Goyal", "abstract": "Background: Recent advancements in Natural Language Processing (NLP) have revolutionized text generation techniques, with Transformer models becoming the cornerstone of modern NLP tasks, particularly in multilingual text generation. Objective: This study aims to examine the effectiveness of Transformer-based models in generating multilingual text across diverse languages, focusing on enhancing content fluency, coherence, and domain-specific applications. Method: The research utilizes a series of pre-trained Transformer models including BERT, GPT, mBERT, and XLM-R, trained on a multilingual corpus spanning 20+ languages. The study incorporates a comprehensive training process involving fine-tuning on specific tasks such as text summarization, content creation, and sentiment analysis. Evaluation metrics such as BLEU, ROUGE, and accuracy were used to assess the quality of generated content. Models were trained using high-performance computing resources to ensure scalability and efficiency. We also performed extensive comparison with traditional NLP approaches to demonstrate improvements in multilingual generation. Results: The Transformer models demonstrated considerable advancements in multilingual text generation. mBERT achieved an average BLEU score of 45%, surpassing traditional monolingual models by 20%. XLM-R, in particular, showed a remarkable 25% improvement in coherence across languages, including low-resource ones. The models generated high-quality content, with a 92% accuracy rate in task-specific domains. Furthermore, computational efficiency was enhanced by reducing resource usage by 30%, enabling scalable multilingual deployment. Conclusions: Transformer models show great promise in multilingual text generation, with notable improvements in translation quality, fluency, and efficiency. Future research should focus on reducing bias and further improving model scalability.", "venue": "Pacific Journal of Advanced Engineering Innovations", "label": 0}, {"loc": [8.984923362731934, 0.19033576548099518], "openalex_id": "https://openalex.org/W4405955832", "title": "Building a Rich Dataset to Empower the Persian Question Answering Systems", "authors": "Mohsen Yazdinejad, Marjan Kaedi", "abstract": "Question answering systems provide short, precise, and specific answers to questions. So far, many robust question answering systems have been developed for English, while some languages with fewer resources, like Persian, have few numbers of standard dataset. In this study, a comprehensive open-domain dataset is presented for Persian. This dataset is called NextQuAD and has 7,515 contexts, including 23,918 questions and answers. Then, a BERT-based question answering model has been applied to this dataset using two pre-trained language models, including ParsBERT and XLM-RoBERTa. The results of these two models have been ensembled using mean logits. Evaluation on the development set shows 0.95 Exact Match (EM) and 0.97 Fl_score. Also, to compare the NextQuAD with other Persian datasets, our trained model on the NextQuAD, is evaluated on two other datasets named PersianQA and ParSQuAD. Comparisons show that the proposed model increased EM by 0.39 and 0.14 respectively in PersianQA and ParSQuAD-manual, while a slight EM decline of 0.007 happened in ParSQuAD-automatic.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.470680236816406, 3.6220102310180664], "openalex_id": "https://openalex.org/W4405903483", "title": "Gradient Weight-normalized Low-rank Projection for Efficient LLM Training", "authors": "Jia-Hong Huang, Yixian Shen, Hongyi Zhu, Stevan Rudinac, Evangelos Kanoulas", "abstract": "Large Language Models (LLMs) have shown remarkable performance across various tasks, but the escalating demands on computational resources pose significant challenges, particularly in the extensive utilization of full fine-tuning for downstream tasks. To address this, parameter-efficient fine-tuning (PEFT) methods have been developed, but they often underperform compared to full fine-tuning and struggle with memory efficiency. In this work, we introduce Gradient Weight-Normalized Low-Rank Projection (GradNormLoRP), a novel approach that enhances both parameter and memory efficiency while maintaining comparable performance to full fine-tuning. GradNormLoRP normalizes the weight matrix to improve gradient conditioning, facilitating better convergence during optimization. Additionally, it applies low-rank approximations to the weight and gradient matrices, significantly reducing memory usage during training. Extensive experiments demonstrate that our 8-bit GradNormLoRP reduces optimizer memory usage by up to 89.5% and enables the pre-training of large LLMs, such as LLaMA 7B, on consumer-level GPUs like the NVIDIA RTX 4090, without additional inference costs. Moreover, GradNormLoRP outperforms existing low-rank methods in fine-tuning tasks. For instance, when fine-tuning the RoBERTa model on all GLUE tasks with a rank of 8, GradNormLoRP achieves an average score of 80.65, surpassing LoRA's score of 79.23. These results underscore GradNormLoRP as a promising alternative for efficient LLM pre-training and fine-tuning. Source code: https://github.com/Jhhuangkay/Gradient-Weight-normalized-Low-rank-Projection-for-Efficient-LLM-Training", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.8631720542907715, 5.110936641693115], "openalex_id": "https://openalex.org/W4405904283", "title": "Next Token Prediction Towards Multimodal Intelligence: A Comprehensive Survey", "authors": "Liang Chen, Zekun Wang, Shuhuai Ren, Lei Li, Haozhe Zhao, Yunshui Li, Zefan Cai, Hongcheng Guo, Nevin L. Zhang, Yizhe Xiong, Yichi Zhang, Ruoyu Wu, Qingxiu Dong, Ge Zhang, Jian Yang, Lingwei Meng, Shujie Hu, Yulong Chen, Jie Lin, Shuai Bai, Andreas Vlachos, Xu Tan, Minjia Zhang, Wen Xiao, A.W. Yee, Tianyu Liu, Baobao Chang", "abstract": "Building on the foundations of language modeling in natural language processing, Next Token Prediction (NTP) has evolved into a versatile training objective for machine learning tasks across various modalities, achieving considerable success. As Large Language Models (LLMs) have advanced to unify understanding and generation tasks within the textual modality, recent research has shown that tasks from different modalities can also be effectively encapsulated within the NTP framework, transforming the multimodal information into tokens and predict the next one given the context. This survey introduces a comprehensive taxonomy that unifies both understanding and generation within multimodal learning through the lens of NTP. The proposed taxonomy covers five key aspects: Multimodal tokenization, MMNTP model architectures, unified task representation, datasets \\& evaluation, and open challenges. This new taxonomy aims to aid researchers in their exploration of multimodal intelligence. An associated GitHub repository collecting the latest papers and repos is available at https://github.com/LMM101/Awesome-Multimodal-Next-Token-Prediction", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.006162166595459, -0.045623764395713806], "openalex_id": "https://openalex.org/W4405882502", "title": "Siamese Hybrid Network Approach for Sentence Similarity", "authors": "D.A.A. Deepal, A. R. S. T. Bandara, P. Ravindra S. De Silva", "abstract": "This paper presents a novel Siamese Hybrid Network approach, namely Siamese Bidirectional Long Short Memory with Convolutional Neural Network (SiBiLConv), for evaluating the similarity in natural language. The model integrates a Siamese neural network architecture with similarity metrics, including Manhattan Distance and Cosine Similarity, to improve the accuracy of semantic relationships measurement between sentences. Evaluations were performed on Sinhala, a complex and under-resourced language spoken in Sri Lanka, which poses unique challenges due to its morphological richness and syntactic variability. The SiBiLConv model achieved an accuracy of 89.80%, an F1 score of 0.9041, and a mean squared error (MSE) of 0.0281 with the Cosine Distance metric outperforming baseline models such as MaLSTM, which achieved an accuracy of 78.99% and an F1 score of 0.7797. While existing methods for sentence similarity primarily focus on resource-rich languages, this work addresses the pressing need for tailored approaches in low-resource language contexts, where pre-trained models and annotated datasets are often limited. The novelty lies in SiBiLConv's hybrid architecture and metric integration, specifically designed to overcome the syntactic and semantic complexities of Sinhala. This research not only bridges a critical gap in the application of sentence similarity models for low-resource languages but also establishes a framework adaptable to other morphologically rich languages, advancing the broader scope of natural language processing. Keywords: Siamese Hybrid Network, Sentences similarity, Sinhala sentence similarity, Morphologically Rich Language Processing", "venue": "Vidyodaya Journal of Science", "label": 0}, {"loc": [6.32393741607666, 0.4086102545261383], "openalex_id": "https://openalex.org/W4406228195", "title": "Synchronic and Diachronic Predictors of Socialness Ratings of Words", "authors": "V. V. Bochkarev, Anna V. Shevlyakova, Andrey A. Achkeev", "abstract": "Introduction: In recent works, a new psycholinguistic concept has been introduced and studied that is socialness of a word. A socialness rating reflects word social significance and dictionaries with socialness ratings have been compiled using either a survey or machine method. Unfortunately, the size of the dictionaries with word socialness ratings created by a survey method is relatively small. Purpose: The study objective is to compile a large dictionary with English word socialness ratings by using machine extrapolation, transfer the rating estimations to other languages as well as to obtain diachronic models of socialness ratings. Method: The socialness ratings of words are estimated using multilayer direct propagation neural networks. To obtain synchronic estimates, pre-trained fasttext vectors were fed to the input. To obtain diachronic estimates, word co-occurrence statistics in a large diachronic corpus was used. Results: The obtained Spearman`s correlation coefficient between human socialness ratings and machine ones is 0.869. The trained models allowed obtaining socialness ratings for 2 million English words, as well as a wide range of words in 43 other languages. An unexpected result is that the linear model provides highly accurate estimate of the socialness ratings, which can be hardly further improved. Apparently, this is due to the fact that in the space of vectors representing words there is a selected direction responsible for meanings associated with socialness driven by of social factors influencing word representation and use. The article also presents a diachronic neural network predictor of concreteness ratings using word co- occurrence vectors as input data. It is shown that using a one-year data from a large diachronic corpus Google Books Ngram one can obtain accuracy comparable to the accuracy of synchronic estimates. Conclusion: The created large machine dictionary of socialness ratings can be used in psycholinguistic and cultural studies. Changes in socialness ratings can serve as a marker of word meaning change and be used in lexical semantic change detection", "venue": "Journal of language and Education", "label": 0}, {"loc": [7.524057865142822, -0.9801366329193115], "openalex_id": "https://openalex.org/W4405903293", "title": "Exploiting Domain-Specific Parallel Data on Multilingual Language Models for Low-resource Language Translation", "authors": "Surangika Ranathungaa, Shravan Nayak, Shih\u2010Ting Huang, Yanke Mao, Tong Su, Yu\u2010Chang Chan, S. C. Yuan, Anthony Rinaldi, Annie En-Shiun Lee", "abstract": "Neural Machine Translation (NMT) systems built on multilingual sequence-to-sequence Language Models (msLMs) fail to deliver expected results when the amount of parallel data for a language, as well as the language's representation in the model are limited. This restricts the capabilities of domain-specific NMT systems for low-resource languages (LRLs). As a solution, parallel data from auxiliary domains can be used either to fine-tune or to further pre-train the msLM. We present an evaluation of the effectiveness of these two techniques in the context of domain-specific LRL-NMT. We also explore the impact of domain divergence on NMT model performance. We recommend several strategies for utilizing auxiliary parallel data in building domain-specific NMT models for LRLs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.843452453613281, 0.5517793297767639], "openalex_id": "https://openalex.org/W4405838419", "title": "Knowledge Graph as Pre-training Corpus for Structural Reasoning via Multi-hop Linearization", "authors": "W.K. Kim, Haemin Jung, Wooju Kim", "abstract": "Large language models have demonstrated exceptional performance across various natural language processing tasks. However, their reliance on unstructured text corpora for pre-training limits their effectiveness in tasks requiring structured reasoning such as multi-hop question-answering. Knowledge Graphs provide a rich, structured source of relational data, offering an opportunity to enhance the reasoning capabilities of Large language models. In this paper, we propose a novel framework, Knowledge Graph as Pre-training Corpus (KGPC), which transforms knowledge graphs into text using a multi-hop linearization process. Unlike existing approaches that linearize singular triples, our method captures the interconnected nature of knowledge graphs by linking multiple triples across multiple hops, preserving their relational structure during the pre-training phase. This structured knowledge injection improves language models to perform complex reasoning tasks. We evaluate our approach on multi-hop reasoning benchmarks, demonstrating significant performance gains over existing models, particularly in question-answering tasks. Our results highlight the potential of multi-hop linearization in enhancing the structural reasoning capacity of language models, reducing error propagation, and improving the integration of structured knowledge into language models.", "venue": "IEEE Access", "label": 2}, {"loc": [2.5724892616271973, 1.459057331085205], "openalex_id": "https://openalex.org/W4405854082", "title": "Use of Generative Artificial Intelligence in Teaching and Learning: Engineering Instructors' Perspectives", "authors": "Phiwe M. Simelane, Javeed Kittur", "abstract": "ABSTRACT Advancements in generative artificial intelligence, particularly ChatGPT, are increasingly influencing engineering education. Engineering instructors have noticed their students using this technology and are beginning to understand its effects on learning and teaching. While research has explored its impacts on general education, few qualitative studies examine engineering instructors' perspectives on how it affects both students and instructors. This qualitative study gathers engineering instructors' views on how ChatGPT affects and could affect their teaching and students' learning experiences. Engineering faculty were invited to participate in Zoom interviews. They were asked questions about their experiences with ChatGPT in their courses. The data collected were analyzed inductively to identify themes. Instructors likened ChatGPT's impact on education to the introduction of calculators, noting its potential to ease academic tasks. They highlighted its benefits in drafting, outlining, and correcting grammatical errors. However, concerns were raised about the quality of ChatGPT's output due to possible inaccuracies (hallucinations) and the risk of students over\u2010relying on the technology, thus avoiding genuine learning. Instructors recommended that students use ChatGPT as a supplementary tool, verify the information it provides, and that instructors educate students on its limitations and proper use. The overall outlook on ChatGPT in engineering education is positive, with instructors open to its integration in the classroom. Despite growing research on ChatGPT and engineering education, further studies are needed to assess its effectiveness across different course levels. This would help institutions and professors implement ChatGPT appropriately in various subjects and levels.", "venue": "Computer Applications in Engineering Education", "label": 0}, {"loc": [5.9604291915893555, 4.620565414428711], "openalex_id": "https://openalex.org/W4405827891", "title": "An overview of large AI models and their applications", "authors": "Xiaoguang Tu, Zhimin He, Yi Huang, Zhihao Zhang, Ming Yang, Jian Zhao", "abstract": "Abstract In recent years, large-scale artificial intelligence (AI) models have become a focal point in technology, attracting widespread attention and acclaim. Notable examples include Google\u2019s BERT and OpenAI\u2019s GPT, which have scaled their parameter sizes to hundreds of billions or even tens of trillions. This growth has been accompanied by a significant increase in the amount of training data, significantly improving the capabilities and performance of these models. Unlike previous reviews, this paper provides a comprehensive discussion of the algorithmic principles of large-scale AI models and their industrial applications from multiple perspectives. We first outline the evolutionary history of these models, highlighting milestone algorithms while exploring their underlying principles and core technologies. We then evaluate the challenges and limitations of large-scale AI models, including computational resource requirements, model parameter inflation, data privacy concerns, and specific issues related to multi-modal AI models, such as reliance on text-image pairs, inconsistencies in understanding and generation capabilities, and the lack of true \u201cmulti-modality\u201d. Various industrial applications of these models are also presented. Finally, we discuss future trends, predicting further expansion of model scale and the development of cross-modal fusion. This study provides valuable insights to inform and inspire future future research and practice.", "venue": "Visual Intelligence", "label": 0}, {"loc": [6.18505859375, 3.683546543121338], "openalex_id": "https://openalex.org/W4405809438", "title": "OS Agents: A Survey on MLLM-Based Agents for General Computing Devices Use", "authors": "Xueyu Hu, Xiong Tao, Biao Yi, Zishu Wei, Ruixuan Xiao, YangQuan Chen, Jiasheng Ye, Meiling Tao, Xiangxin Zhou, Ziyu Zhao, Yuhuai Li, Shaohua Xu, S. Wang, Xinchen Xu, Shuofei Qiao, Kun Kuang, Tieyong Zeng, Liang Wang, Jiwei Li, Yuchen Eleanor Jiang, Wangchunshu Zhou, Guoyin Wang, Keting Yin, Zhou Zhao, Hongxia Yang, Fan Wu, Shengyu Zhang, Fei Wu", "abstract": "The dream to create AI assistants as capable and versatile as the fictional J.A.R.V.I.S from Iron Man has long captivated imaginations. With the evolution of (multimodal) large language models ((M)LLMs), this dream is closer to reality, as (M)LLM-based Agents using computing devices (e.g., computers and mobile phones) by operating within the environments and interfaces (e.g., Graphical User Interface (GUI)) provided by operating systems (OS) to automate tasks have significantly advanced. This paper presents a comprehensive survey of these advanced agents, designated as OS Agents. We begin by elucidating the fundamentals of OS Agents, exploring their key components including the environment, observation space, and action space, and outlining essential capabilities such as understanding, planning, and grounding. We then examine methodologies for constructing OS Agents, focusing on domain-specific foundation models and agent frameworks. A detailed review of evaluation protocols and benchmarks highlights how OS Agents are assessed across diverse tasks. Finally, we discuss current challenges and identify promising directions for future research, including safety and privacy, personalization and self-evolution. This survey aims to consolidate the state of OS Agents research, providing insights to guide both academic inquiry and industrial development. An open-source GitHub repository is maintained as a dynamic resource to foster further innovation in this field.", "venue": "Preprints.org", "label": 3}, {"loc": [2.842909812927246, 2.9740397930145264], "openalex_id": "https://openalex.org/W4405783467", "title": "Bridging the Data Provenance Gap Across Text, Speech and Video", "authors": "Shayne Longpre, Nikhil Kumar Singh, Manuel Cherep, Kushagra Tiwary, Joanna Materzy\u0144ska, William Brannon, Robert Mahari, Naana Obeng-Marnu, Mohammed Hamdy, Nayan Saxena, Nayan Saxena, Emad A. Alghamdi, Vu Minh Chien, Vu Minh Chien, Da Yin, Kun Qian, Yi\u2010Zhi Li, Minnie Liang, An Q Dinh, Shrestha Mohanty, Deividas Mataciunas, Tobin South, Jianguo Zhang, Ariel N. Lee, C.P. Lund, Christopher Klamm, Damien Sileo, Diganta Misra, Enrico Shippole, Kevin Klyman, L\u00edvia Maria Bettini de Miranda, Niklas Muennighoff, Seonghyeon Ye, Seungone Kim, Vipul Gupta, Vivek Sharma, Xuhui Zhou, Caiming Xiong, L. F. Villa, Stella Biderman, Alex Pentland, Sara Hooker, Jad Kabbara", "abstract": "Progress in AI is driven largely by the scale and quality of training data. Despite this, there is a deficit of empirical analysis examining the attributes of well-established datasets beyond text. In this work we conduct the largest and first-of-its-kind longitudinal audit across modalities--popular text, speech, and video datasets--from their detailed sourcing trends and use restrictions to their geographical and linguistic representation. Our manual analysis covers nearly 4000 public datasets between 1990-2024, spanning 608 languages, 798 sources, 659 organizations, and 67 countries. We find that multimodal machine learning applications have overwhelmingly turned to web-crawled, synthetic, and social media platforms, such as YouTube, for their training sets, eclipsing all other sources since 2019. Secondly, tracing the chain of dataset derivations we find that while less than 33% of datasets are restrictively licensed, over 80% of the source content in widely-used text, speech, and video datasets, carry non-commercial restrictions. Finally, counter to the rising number of languages and geographies represented in public AI training datasets, our audit demonstrates measures of relative geographical and multilingual representation have failed to significantly improve their coverage since 2013. We believe the breadth of our audit enables us to empirically examine trends in data sourcing, restrictions, and Western-centricity at an ecosystem-level, and that visibility into these questions are essential to progress in responsible AI. As a contribution to ongoing improvements in dataset transparency and responsible use, we release our entire multimodal audit, allowing practitioners to trace data provenance across text, speech, and video.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.055823802947998, -0.1583385169506073], "openalex_id": "https://openalex.org/W4405775523", "title": "Large Language Models As Genetic Counselors: Personalized AI Driven Tool for Health Management", "authors": "Takuya Fukushima, Masae Manabe, Shuntaro Yada, Shoko Wakamiya, Akiko Yoshida, Yusaku Urakawa, A. Maeda, Shigeyuki Kan, Masayo Takahashi, Eiji Aramaki", "abstract": "Background Advances in genetics have underscored a strong association between genetic factors and health outcomes, leading to an increased demand for genetic counseling services. However, a shortage of qualified genetic counselors poses a significant challenge. Large language models (LLMs) have emerged as a potential solution for augmenting support in genetic counseling tasks. Despite the potential, Japanese genetic counseling LLMs (JGCLLMs) are underexplored. To advance a JGCLLM-based dialogue system for genetic counseling, effective domain adaptation methods require investigation. Objective This study aims to evaluate the current capabilities and identify challenges in developing a JGCLLM-based dialogue system for genetic counseling. The primary focus is to assess the effectiveness of prompt engineering, retrieval-augmented generation (RAG), and instruction tuning within the context of genetic counseling. Furthermore, we will establish an experts-evaluated dataset of responses generated by LLMs adapted to Japanese genetic counseling for the future development of JGCLLMs. Methods Two primary datasets were used in this study: (1) a question-answer (QA) dataset for LLM adaptation and (2) a genetic counseling question dataset for evaluation. The QA dataset included 899 QA pairs covering medical and genetic counseling topics, while the evaluation dataset contained 120 curated questions across 6 genetic counseling categories. Three enhancement techniques of LLMs\u2014instruction tuning, RAG, and prompt engineering\u2014were applied to a lightweight Japanese LLM to enhance its ability for genetic counseling. The performance of the adapted LLM was evaluated on the 120-question dataset by 2 certified genetic counselors and 1 ophthalmologist (SK, YU, and AY). Evaluation focused on four metrics: (1) inappropriateness of information, (2) sufficiency of information, (3) severity of harm, and (4) alignment with medical consensus. Results The evaluation by certified genetic counselors and an ophthalmologist revealed varied outcomes across different methods. RAG showed potential, particularly in enhancing critical aspects of genetic counseling. In contrast, instruction tuning and prompt engineering produced less favorable outcomes. This evaluation process facilitated the creation an expert-evaluated dataset of responses generated by LLMs adapted with different combinations of these methods. Error analysis identified key ethical concerns, including inappropriate promotion of prenatal testing, criticism of relatives, and inaccurate probability statements. Conclusions RAG demonstrated notable improvements across all evaluation metrics, suggesting potential for further enhancement through the expansion of RAG data. The expert-evaluated dataset developed in this study provides valuable insights for future optimization efforts. However, the ethical issues observed in JGCLLM responses underscore the critical need for ongoing refinement and thorough ethical evaluation before these systems can be implemented in health care settings.", "venue": "JMIR Medical Informatics", "label": 28}, {"loc": [3.056081771850586, -0.2895067036151886], "openalex_id": "https://openalex.org/W4405775523", "title": "Large Language Models As Genetic Counselors: AI-DRIVEN TOOLS FOR HEALTH MANAGEMENT", "authors": "Takuya Fukushima, Masae Manabe, Shuntaro Yada, Shoko Wakamiya, Akiko Yoshida, Yusaku Urakawa, A. Maeda, Shigeyuki Kan, Masayo Takahashi, Eiji Aramaki", "abstract": "Background Advances in genetics have underscored a strong association between genetic factors and health outcomes, leading to an increased demand for genetic counseling services. However, a shortage of qualified genetic counselors poses a significant challenge. Large language models (LLMs) have emerged as a potential solution for augmenting support in genetic counseling tasks. Despite the potential, Japanese genetic counseling LLMs (JGCLLMs) are underexplored. To advance a JGCLLM-based dialogue system for genetic counseling, effective domain adaptation methods require investigation. Objective This study aims to evaluate the current capabilities and identify challenges in developing a JGCLLM-based dialogue system for genetic counseling. The primary focus is to assess the effectiveness of prompt engineering, retrieval-augmented generation (RAG), and instruction tuning within the context of genetic counseling. Furthermore, we will establish an experts-evaluated dataset of responses generated by LLMs adapted to Japanese genetic counseling for the future development of JGCLLMs. Methods Two primary datasets were used in this study: (1) a question-answer (QA) dataset for LLM adaptation and (2) a genetic counseling question dataset for evaluation. The QA dataset included 899 QA pairs covering medical and genetic counseling topics, while the evaluation dataset contained 120 curated questions across 6 genetic counseling categories. Three enhancement techniques of LLMs\u2014instruction tuning, RAG, and prompt engineering\u2014were applied to a lightweight Japanese LLM to enhance its ability for genetic counseling. The performance of the adapted LLM was evaluated on the 120-question dataset by 2 certified genetic counselors and 1 ophthalmologist (SK, YU, and AY). Evaluation focused on four metrics: (1) inappropriateness of information, (2) sufficiency of information, (3) severity of harm, and (4) alignment with medical consensus. Results The evaluation by certified genetic counselors and an ophthalmologist revealed varied outcomes across different methods. RAG showed potential, particularly in enhancing critical aspects of genetic counseling. In contrast, instruction tuning and prompt engineering produced less favorable outcomes. This evaluation process facilitated the creation an expert-evaluated dataset of responses generated by LLMs adapted with different combinations of these methods. Error analysis identified key ethical concerns, including inappropriate promotion of prenatal testing, criticism of relatives, and inaccurate probability statements. Conclusions RAG demonstrated notable improvements across all evaluation metrics, suggesting potential for further enhancement through the expansion of RAG data. The expert-evaluated dataset developed in this study provides valuable insights for future optimization efforts. However, the ethical issues observed in JGCLLM responses underscore the critical need for ongoing refinement and thorough ethical evaluation before these systems can be implemented in health care settings.", "venue": "JMIR Medical Informatics", "label": 28}, {"loc": [3.354553699493408, 1.863471269607544], "openalex_id": "https://openalex.org/W4405790571", "title": "Opportunities and Challenges of Artificial Intelligence in Public Media Journalism", "authors": "Widia Ningish", "abstract": "The rapid development of artificial intelligence (AI) technology has significantly influenced various sectors, including journalism. AI presents opportunities such as automating routine tasks, analyzing complex data, personalizing content, and increasing efficiency in news production. However, it also introduces challenges, particularly concerning news credibility, such as algorithm bias, accuracy issues, loss of human touch, and diminished public trust in automated news. In Indonesia, the adoption of AI in journalism is still in its infancy, with limited regulatory frameworks and insufficient awareness of its implications. This research aims to identify the challenges and opportunities faced by journalists in the AI era and examine the impact of AI on news credibility. Employing an exploratory qualitative method, the research incorporates in-depth interviews, observations, and documentation studies conducted within Indonesian media organizations. The findings reveal that collaboration between AI and journalists can enhance efficiency while maintaining journalistic integrity. However, the research underscores the necessity for adaptive regulations and improved technological literacy among journalists to ensure ethical and productive AI utilization. This research contributes to understanding the interplay between technology and journalistic values, offering valuable insights for media organizations and policymakers to navigate the AI-driven future of journalism.", "venue": "International conference on social science and technology", "label": 0}, {"loc": [3.0930981636047363, -0.2983470559120178], "openalex_id": "https://openalex.org/W4405783533", "title": "A survey of datasets in medicine for large language models", "authors": "Deshiwei Zhang, Xiaojuan Xue, Peng Gao, Zhijuan Jin, Menghan Hu, Yue Wu, Xiayang Ying", "abstract": "With the advent of models such as ChatGPT and other models, large language models (LLMs) have demonstrated unprecedented capabilities in understanding and generating natural language, presenting novel opportunities and challenges within the medicine domain. While there have been many studies focusing on the employment of LLMs in medicine, comprehensive reviews of the datasets utilized in this field remain scarce. This survey seeks to address this gap by providing a comprehensive overview of the datasets in medicine fueling LLMs, highlighting their unique characteristics and the critical roles they play at different stages of LLMs' development: pre-training, fine-tuning, and evaluation. Ultimately, this survey aims to underline the significance of datasets in realizing the full potential of LLMs to innovate and improve healthcare outcomes.", "venue": "Intelligence & Robotics", "label": 0}, {"loc": [3.7233147621154785, 4.381182670593262], "openalex_id": "https://openalex.org/W4405783885", "title": "Pirates of the RAG: Adaptively Attacking LLMs to Leak Knowledge Bases", "authors": "C. Maio, Cristian Cosci, Marco Maggini, Valentina Poggioni, Stefano Melacci", "abstract": "The growing ubiquity of Retrieval-Augmented Generation (RAG) systems in several real-world services triggers severe concerns about their security. A RAG system improves the generative capabilities of a Large Language Models (LLM) by a retrieval mechanism which operates on a private knowledge base, whose unintended exposure could lead to severe consequences, including breaches of private and sensitive information. This paper presents a black-box attack to force a RAG system to leak its private knowledge base which, differently from existing approaches, is adaptive and automatic. A relevance-based mechanism and an attacker-side open-source LLM favor the generation of effective queries to leak most of the (hidden) knowledge base. Extensive experimentation proves the quality of the proposed algorithm in different RAG pipelines and domains, comparing to very recent related approaches, which turn out to be either not fully black-box, not adaptive, or not based on open-source models. The findings from our study remark the urgent need for more robust privacy safeguards in the design and deployment of RAG systems.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.138763427734375, -0.7853700518608093], "openalex_id": "https://openalex.org/W4405767473", "title": "Survey on Abstractive Text Summarization: Dataset, Models, and Metrics", "authors": "Gospel Ozioma Nnadi, Flavio Bertini", "abstract": "The advancements in deep learning, particularly the introduction of transformers, have been pivotal in enhancing various natural language processing (NLP) tasks. These include text-to-text applications such as machine translation, text classification, and text summarization, as well as data-to-text tasks like response generation and image-to-text tasks such as captioning. Transformer models are distinguished by their attention mechanisms, pretraining on general knowledge, and fine-tuning for downstream tasks. This has led to significant improvements, particularly in abstractive summarization, where sections of a source document are paraphrased to produce summaries that closely resemble human expression. The effectiveness of these models is assessed using diverse metrics, encompassing techniques like semantic overlap and factual correctness. This survey examines the state of the art in text summarization models, with a specific focus on the abstractive summarization approach. It reviews various datasets and evaluation metrics used to measure model performance. Additionally, it includes the results of test cases using abstractive summarization models to underscore the advantages and limitations of contemporary transformer-based models. The source codes and the data are available at https://github.com/gospelnnadi/Text-Summarization-SOTA-Experiment.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.447667598724365, 2.2877893447875977], "openalex_id": "https://openalex.org/W4405767863", "title": "Measuring Contextual Informativeness in Child-Directed Text", "authors": "Maria Consuelo Valentini, T\u00e9a Wright, Ali Marashian, Jennifer Weber, Eliana Colunga, Katharina von der Wense", "abstract": "To address an important gap in creating children's stories for vocabulary enrichment, we investigate the automatic evaluation of how well stories convey the semantics of target vocabulary words, a task with substantial implications for generating educational content. We motivate this task, which we call measuring contextual informativeness in children's stories, and provide a formal task definition as well as a dataset for the task. We further propose a method for automating the task using a large language model (LLM). Our experiments show that our approach reaches a Spearman correlation of 0.4983 with human judgments of informativeness, while the strongest baseline only obtains a correlation of 0.3534. An additional analysis shows that the LLM-based approach is able to generalize to measuring contextual informativeness in adult-directed text, on which it also outperforms all baselines.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.723883628845215, 4.002894878387451], "openalex_id": "https://openalex.org/W4405768143", "title": "A Toolkit for Virtual Reality Data Collection", "authors": "Tim Rolff, Niklas Hypki, Markus Lappe, Frank Steinicke", "abstract": "Due to the still relatively low number of users, acquiring large-scale and multidimensional virtual reality datasets remains a significant challenge. Consequently, VR datasets comparable in size to state-of-the-art collections in natural language processing or computer vision are rare or absent. However, the availability of such datasets could unlock groundbreaking advancements in deep-learning, psychological modeling, and data analysis in the context of VR. In this paper, we present a versatile data collection toolkit designed to facilitate the capturing of extensive VR datasets. Our toolkit seamlessly integrates with any device, either directly via OpenXR or through the use of a virtual device. Additionally, we introduce a robust data collection pipeline that emphasizes ethical practices (e.g., ensuring data protection and regulation) and ensures a standardized, reproducible methodology.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.9379870891571045, -0.45626163482666016], "openalex_id": "https://openalex.org/W4405765843", "title": "Technical Report: Small Language Model for Japanese Clinical and Medicine", "authors": "Shogo WATANABE", "abstract": "This report presents a small language model (SLM) for Japanese clinical and medicine, named NCVC-slm-1. This 1B parameters model was trained using Japanese text classified to be of high-quality. Moreover, NCVC-slm-1 was augmented with respect to clinical and medicine content that includes the variety of diseases, drugs, and examinations. Using a carefully designed pre-processing, a specialized morphological analyzer and tokenizer, this small and light-weight model performed not only to generate text but also indicated the feasibility of understanding clinical and medicine text. In comparison to other large language models, a fine-tuning NCVC-slm-1 demonstrated the highest scores on 6 tasks of total 8 on JMED-LLM. According to this result, SLM indicated the feasibility of performing several downstream tasks in the field of clinical and medicine. Hopefully, NCVC-slm-1 will be contributed to develop and accelerate the field of clinical and medicine for a bright future.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.9553303718566895, 5.19917106628418], "openalex_id": "https://openalex.org/W4405783327", "title": "Survey of Large Multimodal Model Datasets, Application Categories and Taxonomy", "authors": "Priyaranjan Pattnayak, Hitesh Laxmichand Patel, Bhargava Kumar, Amit Agarwal, Ishan Banerjee, Subhrakanta Panda, Tejaswini Kumar", "abstract": "Multimodal learning, a rapidly evolving field in artificial intelligence, seeks to construct more versatile and robust systems by integrating and analyzing diverse types of data, including text, images, audio, and video. Inspired by the human ability to assimilate information through many senses, this method enables applications such as text-to-video conversion, visual question answering, and image captioning. Recent developments in datasets that support multimodal language models (MLLMs) are highlighted in this overview. Large-scale multimodal datasets are essential because they allow for thorough testing and training of these models. With an emphasis on their contributions to the discipline, the study examines a variety of datasets, including those for training, domain-specific tasks, and real-world applications. It also emphasizes how crucial benchmark datasets are for assessing models' performance in a range of scenarios, scalability, and applicability. Since multimodal learning is always changing, overcoming these obstacles will help AI research and applications reach new heights.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.444361686706543, 0.6489209532737732], "openalex_id": "https://openalex.org/W4405713898", "title": "NER-RoBERTa: Fine-Tuning RoBERTa for Named Entity Recognition (NER) within low-resource languages", "authors": "Abdulhady Abas Abdullah, Srwa Hasan Abdulla, Dalia Mohammad Toufiq, Halgurd S. Maghdid, Tarik A. Rashid, Pakshan F. Farho, Shadan Sh. Sabr, Akar Taher, Darya S. Hamad, Hadi Veisi, Aras Asaad", "abstract": "Nowadays, Natural Language Processing (NLP) is an important tool for most people's daily life routines, ranging from understanding speech, translation, named entity recognition (NER), and text categorization, to generative text models such as ChatGPT. Due to the existence of big data and consequently large corpora for widely used languages like English, Spanish, Turkish, Persian, and many more, these applications have been developed accurately. However, the Kurdish language still requires more corpora and large datasets to be included in NLP applications. This is because Kurdish has a rich linguistic structure, varied dialects, and a limited dataset, which poses unique challenges for Kurdish NLP (KNLP) application development. While several studies have been conducted in KNLP for various applications, Kurdish NER (KNER) remains a challenge for many KNLP tasks, including text analysis and classification. In this work, we address this limitation by proposing a methodology for fine-tuning the pre-trained RoBERTa model for KNER. To this end, we first create a Kurdish corpus, followed by designing a modified model architecture and implementing the training procedures. To evaluate the trained model, a set of experiments is conducted to demonstrate the performance of the KNER model using different tokenization methods and trained models. The experimental results show that fine-tuned RoBERTa with the SentencePiece tokenization method substantially improves KNER performance, achieving a 12.8% improvement in F1-score compared to traditional models, and consequently establishes a new benchmark for KNLP.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.123615264892578, 1.7119040489196777], "openalex_id": "https://openalex.org/W4405715544", "title": "TelcoLM: collecting data, adapting, and benchmarking language models for the telecommunication domain", "authors": "Camille Barboule, Viet-Phi Huynh, Adrien Bufort, Yoan Chabot, G\u00e9raldine Damnati, Gw\u00e9nol\u00e9 Lecorv\u00e9", "abstract": "Despite outstanding processes in many tasks, Large Language Models (LLMs) still lack accuracy when dealing with highly technical domains. Especially, telecommunications (telco) is a particularly challenging domain due the large amount of lexical, semantic and conceptual peculiarities. Yet, this domain holds many valuable use cases, directly linked to industrial needs. Hence, this paper studies how LLMs can be adapted to the telco domain. It reports our effort to (i) collect a massive corpus of domain-specific data (800M tokens, 80K instructions), (ii) perform adaptation using various methodologies, and (iii) benchmark them against larger generalist models in downstream tasks that require extensive knowledge of telecommunications. Our experiments on Llama-2-7b show that domain-adapted models can challenge the large generalist models. They also suggest that adaptation can be restricted to a unique instruction-tuning step, dicarding the need for any fine-tuning on raw texts beforehand.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.059656620025635, 3.74847412109375], "openalex_id": "https://openalex.org/W4405713917", "title": "TinyLLM: A Framework for Training and Deploying Language Models at the Edge Computers", "authors": "Savitha Viswanadh Kandala, Pramuka Medaranga, Ambuj Varshney", "abstract": "Language models have gained significant interest due to their general-purpose capabilities, which appear to emerge as models are scaled to increasingly larger parameter sizes. However, these large models impose stringent requirements on computing systems, necessitating significant memory and processing requirements for inference. This makes performing inference on mobile and edge devices challenging, often requiring invocating remotely-hosted models via network calls. Remote inference, in turn, introduces issues like latency, unreliable network connectivity, and privacy concerns. To address these challenges, we explored the possibility of deviating from the trend of increasing model size. Instead, we hypothesize that much smaller models (~30-120M parameters) can outperform their larger counterparts for specific tasks by carefully curating the data used for pre-training and fine-tuning. We investigate this within the context of deploying edge-device models to support sensing applications. We trained several foundational models through a systematic study and found that small models can run locally on edge devices, achieving high token rates and accuracy. Based on these findings, we developed a framework that allows users to train foundational models tailored to their specific applications and deploy them at the edge.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.336118221282959, 2.73867130279541], "openalex_id": "https://openalex.org/W4405715707", "title": "Formal Mathematical Reasoning: A New Frontier in AI", "authors": "Kaiyu Yang, Gabriel Poesia, Jingxuan He, Wenda Li, Kristin Lauter, Swarat Chaudhuri, Dawn Song", "abstract": "AI for Mathematics (AI4Math) is not only intriguing intellectually but also crucial for AI-driven discovery in science, engineering, and beyond. Extensive efforts on AI4Math have mirrored techniques in NLP, in particular, training large language models on carefully curated math datasets in text form. As a complementary yet less explored avenue, formal mathematical reasoning is grounded in formal systems such as proof assistants, which can verify the correctness of reasoning and provide automatic feedback. In this position paper, we advocate for formal mathematical reasoning and argue that it is indispensable for advancing AI4Math to the next level. In recent years, we have seen steady progress in using AI to perform formal reasoning, including core tasks such as theorem proving and autoformalization, as well as emerging applications such as verifiable generation of code and hardware designs. However, significant challenges remain to be solved for AI to truly master mathematics and achieve broader impact. We summarize existing progress, discuss open challenges, and envision critical milestones to measure future success. At this inflection point for formal mathematical reasoning, we call on the research community to come together to drive transformative advancements in this field.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.798297882080078, 2.516173839569092], "openalex_id": "https://openalex.org/W4405714299", "title": "Maximize Your Data's Potential: Enhancing LLM Accuracy with Two-Phase Pretraining", "authors": "Shuaijun Feng, Shrimai Prabhumoye, Kezhi Kong, Dan Su, Mostofa Patwary, Mohammad Shoeybi, Bryan Catanzaro", "abstract": "Pretraining large language models effectively requires strategic data selection, blending and ordering. However, key details about data mixtures especially their scalability to longer token horizons and larger model sizes remain underexplored due to limited disclosure by model developers. To address this, we formalize the concept of two-phase pretraining and conduct an extensive systematic study on how to select and mix data to maximize model accuracies for the two phases. Our findings illustrate that a two-phase approach for pretraining outperforms random data ordering and natural distribution of tokens by 3.4% and 17% on average accuracies. We provide in-depth guidance on crafting optimal blends based on quality of the data source and the number of epochs to be seen. We propose to design blends using downsampled data at a smaller scale of 1T tokens and then demonstrate effective scaling of our approach to larger token horizon of 15T tokens and larger model size of 25B model size. These insights provide a series of steps practitioners can follow to design and scale their data blends.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.376430988311768, 0.43907028436660767], "openalex_id": "https://openalex.org/W4405714350", "title": "A Review of the Marathi Natural Language Processing", "authors": "Asang Dani, Shailesh R Sathe", "abstract": "Marathi is one of the most widely used languages in the world. One might expect that the latest advances in NLP research in languages like English reach such a large community. However, NLP advancements in English didn't immediately reach Indian languages like Marathi. There were several reasons for this. They included diversity of scripts used, lack of (publicly available) resources like tokenization strategies, high quality datasets \\& benchmarks, and evaluation metrics. In addition to this, the morphologically rich nature of Marathi, made NLP tasks challenging. Advances in Neural Network (NN) based models and tools since the early 2000s helped improve this situation and make NLP research more accessible. In the past 10 years, significant efforts were made to improve language resources for all 22 scheduled languages of India. This paper presents a broad overview of evolution of NLP research in Indic languages with a focus on Marathi and state-of-the-art resources and tools available to the research community. It also provides an overview of tools \\& techniques associated with Marathi NLP tasks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.154720306396484, 1.9304121732711792], "openalex_id": "https://openalex.org/W4405627756", "title": "How to Synthesize Text Data without Model Collapse?", "authors": "Xuekai Zhu, Dan Cheng, Hengli Li, Kejin Zhang, Ermo Hua, Xingtai Lv, Ning Ding, Zheng-kang Lin, Zilong Zheng, B. Zhou", "abstract": "Model collapse in synthetic data indicates that iterative training on self-generated data leads to a gradual decline in performance. With the proliferation of AI models, synthetic data will fundamentally reshape the web data ecosystem. Future GPT-$\\{n\\}$ models will inevitably be trained on a blend of synthetic and human-produced data. In this paper, we focus on two questions: what is the impact of synthetic data on language model training, and how to synthesize data without model collapse? We first pre-train language models across different proportions of synthetic data, revealing a negative correlation between the proportion of synthetic data and model performance. We further conduct statistical analysis on synthetic data to uncover distributional shift phenomenon and over-concentration of n-gram features. Inspired by the above findings, we propose token editing on human-produced data to obtain semi-synthetic data. As a proof of concept, we theoretically demonstrate that token-level editing can prevent model collapse, as the test error is constrained by a finite upper bound. We conduct extensive experiments on pre-training from scratch, continual pre-training, and supervised fine-tuning. The results validate our theoretical proof that token-level editing improves model performance.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.20658016204834, 3.7866344451904297], "openalex_id": "https://openalex.org/W4405627841", "title": "Creation of AI-driven Smart Spaces for Enhanced Indoor Environments--A Survey", "authors": "Ayg\u00fcn Varol, Naser Hossein Motlagh, Mirka Leino, Sasu Tarkoma, Johanna Virkki", "abstract": "Smart spaces are ubiquitous computing environments that integrate diverse sensing and communication technologies to enhance space functionality, optimize energy utilization, and improve user comfort and well-being. The integration of emerging AI methodologies into these environments facilitates the formation of AI-driven smart spaces, which further enhance functionalities of the spaces by enabling advanced applications such as personalized comfort settings, interactive living spaces, and automatization of the space systems, all resulting in enhanced indoor experiences of the users. In this paper, we present a systematic survey of existing research on the foundational components of AI-driven smart spaces, including sensor technologies, data communication protocols, sensor network management and maintenance strategies, as well as the data collection, processing and analytics. Given the pivotal role of AI in establishing AI-powered smart spaces, we explore the opportunities and challenges associated with traditional machine learning (ML) approaches, such as deep learning (DL), and emerging methodologies including large language models (LLMs). Finally, we provide key insights necessary for the development of AI-driven smart spaces, propose future research directions, and sheds light on the path forward.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.804215431213379, 0.2177448570728302], "openalex_id": "https://openalex.org/W4405638616", "title": "Insights into Low-Resource Language Modelling: Improving Model Performances for South African Languages.", "authors": "Ruan Visser, T. L. Grobler, Marcel Dunaiski", "abstract": "To address the gap in natural language processing for Southern African languages, our paper presents an in-depth analysis of language model development under resource-constrained conditions. We investigate the interplay between model size, pretraining objectives, and multilingual dataset composition in the context of low-resource languages such as Zulu and Xhosa. In our approach, we initially pretrain language models from scratch on specific low-resource languages using a variety of model configurations, and incrementally add related languages to explore the effect of additional languages on the performance of these models. We demonstrate that smaller data volumes can be effectively leveraged, and that the choice of pretraining objective and multilingual dataset composition significantly influences model performance. Our monolingual and multilingual models, exhibit competitive, and in some cases superior, performance compared to established multilingual models such as XLM-R-base and AfroXLM-R-base.", "venue": "JUCS - Journal of Universal Computer Science", "label": 0}, {"loc": [3.9329209327697754, 3.2606096267700195], "openalex_id": "https://openalex.org/W4405627020", "title": "Cross-Lingual Transfer of Debiasing and Detoxification in Multilingual LLMs: An Extensive Investigation", "authors": "Vera Neplenbroek, Arianna Bisazza, Raquel Fern\u00e1ndez", "abstract": "Recent generative large language models (LLMs) show remarkable performance in non-English languages, but when prompted in those languages they tend to express higher harmful social biases and toxicity levels. Prior work has shown that finetuning on specialized datasets can mitigate this behavior, and doing so in English can transfer to other languages. In this work, we investigate the impact of different finetuning methods on the model's bias and toxicity, but also on its ability to produce fluent and diverse text. We reduce biases by finetuning on curated non-harmful text, but find only direct preference optimization to be effective for mitigating toxicity. The mitigation caused by applying these methods in English also transfers to non-English languages. We find evidence that the extent to which transfer takes place can be predicted by the amount of data in a given language present in the model's pretraining data. However, this transfer of bias and toxicity mitigation often comes at the expense of decreased language generation ability in non-English languages, highlighting the importance of developing language-specific bias and toxicity mitigation methods.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.9183478355407715, -0.06701069325208664], "openalex_id": "https://openalex.org/W4405626142", "title": "Extending LLMs to New Languages: A Case Study of Llama and Persian Adaptation", "authors": "Samin Mahdizadeh Sani, Pouya Sadeghi, Thuy-Trang Vu, Yadollah Yaghoobzadeh, Gholamreza Haffari", "abstract": "Large language models (LLMs) have made great progress in classification and text generation tasks. However, they are mainly trained on English data and often struggle with low-resource languages. In this study, we explore adding a new language, i.e., Persian, to Llama (a model with a limited understanding of Persian) using parameter-efficient fine-tuning. We employ a multi-stage approach involving pretraining on monolingual Persian data, aligning representations through bilingual pretraining and instruction datasets, and instruction-tuning with task-specific datasets. We evaluate the model's performance at each stage on generation and classification tasks. Our findings suggest that incorporating the Persian language, through bilingual data alignment, can enhance classification accuracy for Persian tasks, with no adverse impact and sometimes even improvements on English tasks. Additionally, the results highlight the model's initial strength as a critical factor when working with limited training data, with cross-lingual alignment offering minimal benefits for the low-resource language. Knowledge transfer from English to Persian has a marginal effect, primarily benefiting simple classification tasks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.805957555770874, 1.9718650579452515], "openalex_id": "https://openalex.org/W4405566537", "title": "Exploring the Potential of Emerging Digitainability\u2014GPT Reasoning in Energy Management of Kindergartens", "authors": "Neboj\u0161a Juri\u0161evi\u0107, Du\u0161an Gordi\u0107, Danijela Nikoli\u0107, Aleksandar Ne\u0161ovi\u0107, Robert Kowalik", "abstract": "One of the barriers to the rapid transition of societies toward a more sustainable future is a scarcity of field experts. Members of scientific and professional communities believe that this obstacle could be overcome by supplementing the decisions of non-experts with artificial intelligence. To examine this opportunity, this study examines the viability of GPT-3.5 as an expert adviser in the energy management of kindergartens. Thus, field experts investigated the deductive and inductive reasoning potential of GPT-LLM (Large Language Model). The first task was conducted on a sample of kindergartens in the Western Balkans. The LLM was instructed to provide the buildings\u2019 specific heat consumption (SHC) by relatively detailed building descriptions and building occupancy. The second task involved kindergartens in various European locations, and the LLM was tasked with estimating energy savings using limited data about the renovation process. The study found deductive reasoning to be insufficient for estimating SHC from the building envelope details, with average accuracy below the least predictive model (R2 = 0.56; MAPE = 48%). Including the factor of occupancy, the SHC estimates were relatively accurate, wherein the first deductive test proved precise (MAPE = 27%), but it was less so in the opposite case (MAPE = 67%). In terms of inductive reasoning, the LLM assumptions were relatively consistent with practice.", "venue": "Buildings", "label": 0}, {"loc": [5.809725284576416, 5.301926136016846], "openalex_id": "https://openalex.org/W4405626519", "title": "Typhoon 2: A Family of Open Text and Multimodal Thai Large Language Models", "authors": "Kunat Pipatanakul, Potsawee Manakul, Natapong Nitarach, Warit Sirichotedumrong, Surapon Nonesung, Teetouch Jaknamon, Parinthapat Pengpun, Pittawat Taveekitworachai, Adisai Na-Thalang, Sittipong Sripaisarnmongkol, Krisanapong Jirayoot, Kasima Tharnpipitchai", "abstract": "This paper introduces Typhoon 2, a series of text and multimodal large language models optimized for the Thai language. The series includes models for text, vision, and audio. Typhoon2-Text builds on state-of-the-art open models, such as Llama 3 and Qwen2, and we perform continual pre-training on a mixture of English and Thai data. We employ post-training techniques to enhance Thai language performance while preserving the base models' original capabilities. We release text models across a range of sizes, from 1 to 70 billion parameters, available in both base and instruction-tuned variants. To guardrail text generation, we release Typhoon2-Safety, a classifier enhanced for Thai cultures and language. Typhoon2-Vision improves Thai document understanding while retaining general visual capabilities, such as image captioning. Typhoon2-Audio introduces an end-to-end speech-to-speech model architecture capable of processing audio, speech, and text inputs and generating both text and speech outputs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.370325565338135, -1.0598357915878296], "openalex_id": "https://openalex.org/W4405580523", "title": "Human vs. Machine: A Comparative Study on the Detection of AI-Generated Content", "authors": "Amal Boutadjine, Fouzi Harrag, Khaled Shaalan", "abstract": "The surge in advancements in large language models (LLMs) has expedited the generation of synthetic text imitating human writing styles. This, however, raises concerns about the potential misuse of synthetic textual data, which could compromise trust in online content. Against this backdrop, the present research aims to address the key challenges of detecting LLMs-generated texts. In this study, we used ChatGPT (v 3.5) because of its widespread and capability to comprehend and keep conversational context, allowing it to produce meaningful and contextually suitable responses. The problem revolves around the task of discerning between authentic and artificially generated textual content. To tackle this problem, we first created a dataset containing both real and DeepFake text. Subsequently, we employed transfer-learning (TL) and conducted DeepFake-detection utilizing SOTA large pre-trained LLMs. Furthermore, we conducted validation using benchmark datasets comprising unseen data samples to ensure that the model's performance reflects its ability to generalize to new data. Finally, we discussed this study's theoretical contributions, practical implications, limitations and potential avenues for future research, aiming to formulate strategies for identifying and detecting large-generative-models\u2019 produced texts. The results were promising, with accuracy ranging from 94% to 99%. The comparison between automatic detection and the human ability to detect DeepFake text revealed a significant gap in the human capacity for its identification, emphasizing an increasing need for sophisticated automated detectors. The investigation into AI-generated content detection holds central importance in the age of LLMs and technology convergence. This study is both timely and adds value to the ongoing discussion regarding the challenges associated with the pertinent theme of \"DeepFake text detection\", with a special focus on examining the boundaries of human detection.", "venue": "ACM Transactions on Asian and Low-Resource Language Information Processing", "label": 22}, {"loc": [7.393470287322998, -0.5242071151733398], "openalex_id": "https://openalex.org/W4405561685", "title": "Harnessing Transfer Learning from Swahili: Advancing Solutions for Comorian Dialects", "authors": "Abdou Mohamed Naira, Zakarya Erraji, Abdessalam Bahafid, Imade Benelallam", "abstract": "If today some African languages like Swahili have enough resources to develop high-performing Natural Language Processing (NLP) systems, many other languages spoken on the continent are still lacking such support. For these languages, still in their infancy, several possibilities exist to address this critical lack of data. Among them is Transfer Learning, which allows low-resource languages to benefit from the good representation of other languages that are similar to them. In this work, we adopt a similar approach, aiming to pioneer NLP technologies for Comorian, a group of four languages or dialects belonging to the Bantu family. Our approach is initially motivated by the hypothesis that if a human can understand a different language from their native language with little or no effort, it would be entirely possible to model this process on a machine. To achieve this, we consider ways to construct Comorian datasets mixed with Swahili. One thing to note here is that in terms of Swahili data, we only focus on elements that are closest to Comorian by calculating lexical distances between candidate and source data. We empirically test this hypothesis in two use cases: Automatic Speech Recognition (ASR) and Machine Translation (MT). Our MT model achieved ROUGE-1, ROUGE-2, and ROUGE-L scores of 0.6826, 0.42, and 0.6532, respectively, while our ASR system recorded a WER of 39.50\\% and a CER of 13.76\\%. This research is crucial for advancing NLP in underrepresented languages, with potential to preserve and promote Comorian linguistic heritage in the digital age.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.5446343421936035, 3.1364641189575195], "openalex_id": "https://openalex.org/W4405561932", "title": "Parallel Motif-Based Community Detection", "authors": "Tianyi Chen, Charalampos E. Tsourakakis", "abstract": "Community detection is a central task in graph analytics. Given the substantial growth in graph size, scalability in community detection continues to be an unresolved challenge. Recently, alongside established methods like Louvain and Infomap, motif-based community detection has emerged. Techniques like Tectonic are notable for their advanced ability to identify communities by pruning edges based on motif similarity scores and analyzing the resulting connected components. In this study, we perform a comprehensive evaluation of community detection methods, focusing on both the quality of their output and their scalability. Specifically, we contribute an open-source parallel framework for motif-based community detection based on a shared memory architecture. We conduct a thorough comparative analysis of community detection techniques from various families among state-of-the-art methods, including Tectonic, label propagation, spectral clustering, Louvain, LambdaCC, and Infomap on graphs with up to billions of edges. A key finding of our analysis is that motif-based graph clustering provides a good balance between performance and efficiency. Our work provides several novel insights. Interestingly, we pinpoint biases in prior works in evaluating community detection methods using the top 5K groundtruth communities from SNAP only, as these are frequently near-cliques. Our empirical studies lead to rules of thumb threshold picking strategies that can be critical for real applications. Finally, we show that Tectonic can fail to recover two well-separated clusters. To address this, we suggest a new similarity measure based on counts of triangles and wedges (TW) that prevents the over-segmentation of communities by Tectonic.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.189785003662109, 2.9188737869262695], "openalex_id": "https://openalex.org/W4405561680", "title": "Multimodal Approaches to Fair Image Classification: An Ethical Perspective", "authors": "Javon Hickmon", "abstract": "In the rapidly advancing field of artificial intelligence, machine perception is becoming paramount to achieving increased performance. Image classification systems are becoming increasingly integral to various applications, ranging from medical diagnostics to image generation; however, these systems often exhibit harmful biases that can lead to unfair and discriminatory outcomes. Machine Learning systems that depend on a single data modality, i.e. only images or only text, can exaggerate hidden biases present in the training data, if the data is not carefully balanced and filtered. Even so, these models can still harm underrepresented populations when used in improper contexts, such as when government agencies reinforce racial bias using predictive policing. This thesis explores the intersection of technology and ethics in the development of fair image classification models. Specifically, I focus on improving fairness and methods of using multiple modalities to combat harmful demographic bias. Integrating multimodal approaches, which combine visual data with additional modalities such as text and metadata, allows this work to enhance the fairness and accuracy of image classification systems. The study critically examines existing biases in image datasets and classification algorithms, proposes innovative methods for mitigating these biases, and evaluates the ethical implications of deploying such systems in real-world scenarios. Through comprehensive experimentation and analysis, the thesis demonstrates how multimodal techniques can contribute to more equitable and ethical AI solutions, ultimately advocating for responsible AI practices that prioritize fairness.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.731570720672607, 1.9803926944732666], "openalex_id": "https://openalex.org/W4405469846", "title": "Generics are puzzling. Can language models find the missing piece?", "authors": "Gustavo Cilleruelo Calder\u00f3n, Emily Allaway, Barry Haddow, Alexandra Birch", "abstract": "Generic sentences express generalisations about the world without explicit quantification. Although generics are central to everyday communication, building a precise semantic framework has proven difficult, in part because speakers use generics to generalise properties with widely different statistical prevalence. In this work, we study the implicit quantification and context-sensitivity of generics by leveraging language models as models of language. We create ConGen, a dataset of 2873 naturally occurring generic and quantified sentences in context, and define p-acceptability, a metric based on surprisal that is sensitive to quantification. Our experiments show generics are more context-sensitive than determiner quantifiers and about 20% of naturally occurring generics we analyze express weak generalisations. We also explore how human biases in stereotypes can be observed in language models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.217369556427002, -0.2889140248298645], "openalex_id": "https://openalex.org/W4405477555", "title": "Linguistic variation beyond the Indo-European web: Analyzing Turkish web registers in TurCORE", "authors": "Selcen Erten-Johansson, Valtteri Skantsi, Sampo Pyysalo, Veronika Laippala", "abstract": "Abstract A register, defined as a text variety with specific situational characteristics and a communicative purpose (Biber & Conrad 2019), is also recognized as a cultural construct (Biber & Egbert 2023). Registers merit thorough investigation due to their pivotal role in reflecting linguistic and cultural landscapes. However, existing studies predominantly focus on Indo-European languages. This study investigates Turkish web registers through the introduction of the Turkish Corpus of Online Registers (TurCORE). Comprising 2,780 web texts, TurCORE was manually annotated using a register taxonomy targeting the entire unrestricted web and identifying 24 web register categories. By employing Text Dispersion Keyword Analysis (Egbert & Biber 2019), the research examines the register characteristics with a specific focus on news reports, interactive discussions, and recipes, drawing comparisons with their English equivalents. Results reveal parallels between Turkish and English news reports while Turkish interactive discussions and recipes exhibit distinctive language- and culture specific features.", "venue": "Register Studies", "label": 0}, {"loc": [7.028608322143555, 1.1408668756484985], "openalex_id": "https://openalex.org/W4405470347", "title": "Vocabulary Expansion of Chat Models with Unlabeled Target Language Data", "authors": "Atsuki Yamaguchi, Terufumi Morishita, Aline Villavicencio, \u039d\u03b9\u03ba\u03cc\u03bb\u03b1\u03bf\u03c2 \u0391\u03bb\u03ad\u03c4\u03c1\u03b1\u03c2", "abstract": "Vocabulary expansion (VE) is the de-facto approach to language adaptation of large language models (LLMs) by adding new tokens and continuing pre-training on target data. While this is effective for base models trained on unlabeled data, it poses challenges for chat models trained to follow instructions through labeled conversation data. Directly adapting the latter with VE on target unlabeled data may result in forgetting chat abilities. While ideal, target chat data is often unavailable or costly to create for low-resource languages, and machine-translated alternatives are not always effective. To address this issue, previous work proposed using a base and chat model from the same family. This method first adapts the base LLM with VE on target unlabeled data and then converts it to a chat model by adding a chat vector (CV) derived from the weight difference between the source base and chat models. We propose ElChat, a new language adaptation method for chat LLMs that adapts a chat model directly on target unlabeled data, without a base model. It elicits chat abilities by injecting information from the source chat model. ElChat offers more robust and competitive target language and safety performance while achieving superior English, chat, and instruction-following abilities compared to CV.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.02151107788086, 2.9592294692993164], "openalex_id": "https://openalex.org/W4405470524", "title": "Towards Understanding Systems Trade-offs in Retrieval-Augmented Generation Model Inference", "authors": "Michael M. Shen, Muhammad Umar, Kiwan Maeng, G. Edward Suh, Udit Gupta", "abstract": "The rapid increase in the number of parameters in large language models (LLMs) has significantly increased the cost involved in fine-tuning and retraining LLMs, a necessity for keeping models up to date and improving accuracy. Retrieval-Augmented Generation (RAG) offers a promising approach to improving the capabilities and accuracy of LLMs without the necessity of retraining. Although RAG eliminates the need for continuous retraining to update model data, it incurs a trade-off in the form of slower model inference times. Resultingly, the use of RAG in enhancing the accuracy and capabilities of LLMs often involves diverse performance implications and trade-offs based on its design. In an effort to begin tackling and mitigating the performance penalties associated with RAG from a systems perspective, this paper introduces a detailed taxonomy and characterization of the different elements within the RAG ecosystem for LLMs that explore trade-offs within latency, throughput, and memory. Our study reveals underlying inefficiencies in RAG for systems deployment, that can result in TTFT latencies that are twice as long and unoptimized datastores that consume terabytes of storage.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.9224138259887695, 2.049501419067383], "openalex_id": "https://openalex.org/W4405488517", "title": "Emergent Lexical Synthesis Through Contextual Feedback Mechanisms in Large Language Models", "authors": "Vadim Zakiev, Emilia Cumberledge, Clarimond Ravenscroft, Ottilie Chomondeley", "abstract": "Adaptive mechanisms capable of refining intermediate latent states during text generation are essential for addressing limitations in current model architectures, particularly with respect to contextual drift, linguistic coherence, and output diversity. Contextual Feedback Mechanisms (CFMs) introduce internal feedback loops that recondition latent representations dynamically during runtime, allowing large language models to adjust predictions iteratively without requiring architectural retraining. Through precise integration of gating functions, softmax-based reweighting, and auxiliary loss constraints, CFMs achieve significant improvements in token-level prediction accuracy, sequence-level coherence, and adaptability to ambiguous or incomplete inputs. Empirical evaluations across multiple large-scale datasets, including WikiText-103, BookCorpus, and Reddit Corpus, demonstrate a marked reduction in perplexity and token repetition, particularly for extended sequences exceeding several hundred tokens. Output diversity, as quantified through unique token ratios, showed consistent gains across both conversational and formal text inputs, indicating that CFMs enhance lexical variability without sacrificing fluency. Comparative analysis revealed that the proposed mechanisms surpass existing techniques, such as nucleus sampling and iterative decoding, in balancing computational efficiency and real-time adaptability. Experimental benchmarks further highlight the scalability of CFMs, with latency overhead remaining within acceptable thresholds across varying batch sizes and input lengths. Additionally, CFMs demonstrate a robust ability to resolve ambiguity in challenging prompts, offering new opportunities for improving model performance in dynamic and evolving contexts. The framework maintains practical feasibility through computational optimizations, ensuring that feedback integration imposes minimal resource demands while delivering measurable gains.", "venue": "https://doi.org/10.31219/osf.io/t8mj9", "label": 0}, {"loc": [3.980827808380127, 1.0937011241912842], "openalex_id": "https://openalex.org/W4405432820", "title": "Methods to Assess the UK Government's Current Role as a Data Provider for AI", "authors": "Neil Majithia, Elena Simperl", "abstract": "Governments typically collect and steward a vast amount of high-quality data on their citizens and institutions, and the UK government is exploring how it can better publish and provision this data to the benefit of the AI landscape. However, the compositions of generative AI training corpora remain closely guarded secrets, making the planning of data sharing initiatives difficult. To address this, we devise two methods to assess UK government data usage for the training of Large Language Models (LLMs) and 'peek behind the curtain' in order to observe the UK government's current contributions as a data provider for AI. The first method, an ablation study that utilises LLM 'unlearning', seeks to examine the importance of the information held on UK government websites for LLMs and their performance in citizen query tasks. The second method, an information leakage study, seeks to ascertain whether LLMs are aware of the information held in the datasets published on the UK government's open data initiative data$.$gov$.$uk. Our findings indicate that UK government websites are important data sources for AI (heterogenously across subject matters) while data$.$gov$.$uk is not. This paper serves as a technical report, explaining in-depth the designs, mechanics, and limitations of the above experiments. It is accompanied by a complementary non-technical report on the ODI website in which we summarise the experiments and key findings, interpret them, and build a set of actionable recommendations for the UK government to take forward as it seeks to design AI policy. While we focus on UK open government data, we believe that the methods introduced in this paper present a reproducible approach to tackle the opaqueness of AI training corpora and provide organisations a framework to evaluate and maximize their contributions to AI development.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.1711883544921875, 2.236874580383301], "openalex_id": "https://openalex.org/W4412216302", "title": "BiasMirror: Towards Mitigating Implicit Bias", "authors": "Kathryn Brohman, Ali Khan, Tiancong Fu, Raghava Rao Mukkamala, Abayomi Baiyere, Doug Vogel, Heiko Gewald, Assadaporn Sapsomboon, Andrew Schwarz, Christy M.K. Cheung, Sven Laumer, Jason Bennett Thatcher", "abstract": "This research-in-progress study attends to the issue of implicit bias, which underlies discriminatory attitudes and behaviors that people may not be aware that they hold. Specifically in the context of performance evaluations, attending to such bias is important as these evaluations form the foundations of many other important decisions such as pay, bonuses and awards, promotion, mobility, and layoffs. Leveraging a design science approach, our research aims to develop and implement effective interventions to mitigate bias in performance evaluations, utilizing digital technologies, specifically generative AI, and access to a unique dataset. In this short paper, we outline our DSR process, which includes a field experiment to test the efficacy of our intervention. Beyond its practical implications, this study is poised to contribute to the theoretical understanding of implicit bias as a societal concern and advance knowledge on the design of digital technologies in mitigating these.", "venue": "AIS Electronic Library (AISeL) (Association for Information Systems)", "label": 0}, {"loc": [6.448121547698975, 5.452444553375244], "openalex_id": "https://openalex.org/W4405434026", "title": "DeepSeek-VL2: Mixture-of-Experts Vision-Language Models for Advanced Multimodal Understanding", "authors": "Zhiyu Wu, Xiaokang Chen, Zizheng Pan, Xingchao Liu, Wen Liu, Damai Dai, Huazuo Gao, Yiyang Ma, Chengyue Wu, Bingxuan Wang, Zhenda Xie, Yu Wu, Kai Hu, Jiawei Wang, Yaofeng Sun, Yukun Li, Yishi Piao, Kang Guan, Aixin Liu, Xin Xie, Yuxiang You, Kai Dong, Xingkai Yu, Haowei Zhang, Liang Zhao, Yisong Wang, Chong Ruan", "abstract": "We present DeepSeek-VL2, an advanced series of large Mixture-of-Experts (MoE)\\nVision-Language Models that significantly improves upon its predecessor,\\nDeepSeek-VL, through two key major upgrades. For the vision component, we\\nincorporate a dynamic tiling vision encoding strategy designed for processing\\nhigh-resolution images with different aspect ratios. For the language\\ncomponent, we leverage DeepSeekMoE models with the Multi-head Latent Attention\\nmechanism, which compresses Key-Value cache into latent vectors, to enable\\nefficient inference and high throughput. Trained on an improved vision-language\\ndataset, DeepSeek-VL2 demonstrates superior capabilities across various tasks,\\nincluding but not limited to visual question answering, optical character\\nrecognition, document/table/chart understanding, and visual grounding. Our\\nmodel series is composed of three variants: DeepSeek-VL2-Tiny,\\nDeepSeek-VL2-Small and DeepSeek-VL2, with 1.0B, 2.8B and 4.5B activated\\nparameters respectively. DeepSeek-VL2 achieves competitive or state-of-the-art\\nperformance with similar or fewer activated parameters compared to existing\\nopen-source dense and MoE-based models. Codes and pre-trained models are\\npublicly accessible at https://github.com/deepseek-ai/DeepSeek-VL2.\\n", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.7384305000305176, 0.1658329963684082], "openalex_id": "https://openalex.org/W4405355411", "title": "Foundational Large Language Models for Materials Research", "authors": "Vaibhav Mishra, Sourabh Singh, Dhruv Ahlawat, Mohd Zaki, Vaibhav Bihani, Hargun Singh Grover, Biswajit Mishra, Santiago Miret, Mausam Mausam, N. M. Anoop Krishnan", "abstract": "Materials discovery and development are critical for addressing global challenges. Yet, the exponential growth in materials science literature comprising vast amounts of textual data has created significant bottlenecks in knowledge extraction, synthesis, and scientific reasoning. Large Language Models (LLMs) offer unprecedented opportunities to accelerate materials research through automated analysis and prediction. Still, their effective deployment requires domain-specific adaptation for understanding and solving domain-relevant tasks. Here, we present LLaMat, a family of foundational models for materials science developed through continued pretraining of LLaMA models on an extensive corpus of materials literature and crystallographic data. Through systematic evaluation, we demonstrate that LLaMat excels in materials-specific NLP and structured information extraction while maintaining general linguistic capabilities. The specialized LLaMat-CIF variant demonstrates unprecedented capabilities in crystal structure generation, predicting stable crystals with high coverage across the periodic table. Intriguingly, despite LLaMA-3's superior performance in comparison to LLaMA-2, we observe that LLaMat-2 demonstrates unexpectedly enhanced domain-specific performance across diverse materials science tasks, including structured information extraction from text and tables, more particularly in crystal structure generation, a potential adaptation rigidity in overtrained LLMs. Altogether, the present work demonstrates the effectiveness of domain adaptation towards developing practically deployable LLM copilots for materials research. Beyond materials science, our findings reveal important considerations for domain adaptation of LLMs, such as model selection, training methodology, and domain-specific performance, which may influence the development of specialized scientific AI systems.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.791551113128662, -0.8967858552932739], "openalex_id": "https://openalex.org/W4405427676", "title": "Towards the Machine Translation of Scientific Neologisms", "authors": "Paul J. Lerner, Fran\u00e7ois Yvon", "abstract": "International audience", "venue": "HAL (Le Centre pour la Communication Scientifique Directe)", "label": 6}, {"loc": [8.186304092407227, -0.1838550865650177], "openalex_id": "https://openalex.org/W4405355290", "title": "Text Generation Models for Luxembourgish with Limited Data: A Balanced Multilingual Strategy", "authors": "Alistair Plum, Tharindu Ranasinghe, Christoph Purschke", "abstract": "This paper addresses the challenges in developing language models for less-represented languages, with a focus on Luxembourgish. Despite its active development, Luxembourgish faces a digital data scarcity, exacerbated by Luxembourg's multilingual context. We propose a novel text generation model based on the T5 architecture, combining limited Luxembourgish data with equal amounts, in terms of size and type, of German and French data. We hypothesise that a model trained on Luxembourgish, German, and French will improve the model's cross-lingual transfer learning capabilities and outperform monolingual and large multilingual models. To verify this, the study at hand explores whether multilingual or monolingual training is more beneficial for Luxembourgish language generation. For the evaluation, we introduce LuxGen, a text generation benchmark that is the first of its kind for Luxembourgish.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.716753959655762, 3.0580544471740723], "openalex_id": "https://openalex.org/W4405433019", "title": "Byte Latent Transformer: Patches Scale Better Than Tokens", "authors": "Artidoro Pagnoni, Ram Pasunuru, Pedro Rodr\u00edguez, John Nguyen, Benjamin M\u00fcller, Margaret Li, Chunting Zhou, Lili Yu, Jason Weston, Luke Zettlemoyer, Gargi Ghosh, Michael Lewis, Ari Holtzman, Srinivasan Iyer", "abstract": "We introduce the Byte Latent Transformer (BLT), a new byte-level LLM architecture that, for the first time, matches tokenization-based LLM performance at scale with significant improvements in inference efficiency and robustness. BLT encodes bytes into dynamically sized patches, which serve as the primary units of computation. Patches are segmented based on the entropy of the next byte, allocating more compute and model capacity where increased data complexity demands it. We present the first FLOP controlled scaling study of byte-level models up to 8B parameters and 4T training bytes. Our results demonstrate the feasibility of scaling models trained on raw bytes without a fixed vocabulary. Both training and inference efficiency improve due to dynamically selecting long patches when data is predictable, along with qualitative improvements on reasoning and long tail generalization. Overall, for fixed inference costs, BLT shows significantly better scaling than tokenization-based models, by simultaneously growing both patch and model size.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.580176830291748, 2.3517258167266846], "openalex_id": "https://openalex.org/W4405354744", "title": "Phi-4 Technical Report", "authors": "Marah Abdin, Jyoti Aneja, Harkirat Singh Behl, S\u00e9bastien Bubeck, Ronen Eldan, Suriya Gunasekar, Michael R. Harrison, Russell J. Hewett, Mojan Javaheripi, Piero Kauffmann, James R. Lee, Yin Tat Lee, Yuanzhi Li, Weishung Liu, Caio C\u00e9sar Teodoro Mendes, Anh Nguyen, Eric Price, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Xin Wang, Rachel Ward, Yue Wu, Dingli Yu, Cyril Zhang, Yi Zhang", "abstract": "We present phi-4, a 14-billion parameter language model developed with a training recipe that is centrally focused on data quality. Unlike most language models, where pre-training is based primarily on organic data sources such as web content or code, phi-4 strategically incorporates synthetic data throughout the training process. While previous models in the Phi family largely distill the capabilities of a teacher model (specifically GPT-4), phi-4 substantially surpasses its teacher model on STEM-focused QA capabilities, giving evidence that our data-generation and post-training techniques go beyond distillation. Despite minimal changes to the phi-3 architecture, phi-4 achieves strong performance relative to its size -- especially on reasoning-focused benchmarks -- due to improved data, training curriculum, and innovations in the post-training scheme.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.504744529724121, 4.572655200958252], "openalex_id": "https://openalex.org/W4405355190", "title": "Falcon-UI: Understanding GUI Before Following User Instructions", "authors": "Huawen Shen, Chang Liu, Guanzhen Li, Xinlong Wang, Yu Zhou, Can Ma, Xiangyang Ji", "abstract": "Pursuing human-like interaction for Graphical User Interface (GUI) agents requires understanding the GUI context and following user instructions. However, existing works typically couple these two aspects and focus more on instruct-following abilities, while ignoring the importance of understanding the GUI context. In this paper, we introduce an instruction-free GUI navigation dataset, termed Insight-UI Dataset, to enhance model comprehension of GUI environments. Insight-UI Dataset is automatically generated from the Common Crawl corpus, simulating various platforms -- including iOS, Android, Windows, and Linux -- across multiple resolutions on 312K domains. Although GUI interactions vary by context, diverse interfaces share common internal patterns, such as clicking an item to view its details. It implies the feasibility of independent GUI operation learning, followed by joint optimization with instruction tuning. Thereby, we develop the GUI agent model Falcon-UI, which is initially pretrained on Insight-UI Dataset and subsequently fine-tuned on Android and Web GUI datasets, including AITW, AITZ, Android Control, and Mind2Web. With 7 billion parameters, Falcon-UI achieves accuracy comparable to the 72 billion-parameter Qwen2VL on AITZ, validating the alignment between GUI context comprehension and agent performance. Our code and dataset will be open-sourced.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.05001449584961, 0.39505404233932495], "openalex_id": "https://openalex.org/W4405354515", "title": "Large Concept Models: Language Modeling in a Sentence Representation Space", "authors": "the KSS Cave Studies Team, Lo\u00efc Barrault, Paul-Ambroise Duquenne, Maha Elbayad, Artyom Kozhevnikov, Belen Alastruey, Pierre Andrews, Mariano Coria, Guillaume Couairon, Marta R. Costa\u2010juss\u00e0, David C. Dale, Hady Elsahar, Kevin S. Heffernan, Jo\u00e3o Maria Janeiro, Tuan Tran, Christophe Ropers, Eduardo S\u00e1nchez, Robin San Roman, Alexandre Mourachko, Safiyyah Saleem, Holger Schwenk", "abstract": "LLMs have revolutionized the field of artificial intelligence and have emerged as the de-facto tool for many tasks. The current established technology of LLMs is to process input and generate output at the token level. This is in sharp contrast to humans who operate at multiple levels of abstraction, well beyond single words, to analyze information and to generate creative content. In this paper, we present an attempt at an architecture which operates on an explicit higher-level semantic representation, which we name a concept. Concepts are language- and modality-agnostic and represent a higher level idea or action in a flow. Hence, we build a \"Large Concept Model\". In this study, as proof of feasibility, we assume that a concept corresponds to a sentence, and use an existing sentence embedding space, SONAR, which supports up to 200 languages in both text and speech modalities. The Large Concept Model is trained to perform autoregressive sentence prediction in an embedding space. We explore multiple approaches, namely MSE regression, variants of diffusion-based generation, and models operating in a quantized SONAR space. These explorations are performed using 1.6B parameter models and training data in the order of 1.3T tokens. We then scale one architecture to a model size of 7B parameters and training data of about 2.7T tokens. We perform an experimental evaluation on several generative tasks, namely summarization and a new task of summary expansion. Finally, we show that our model exhibits impressive zero-shot generalization performance to many languages, outperforming existing LLMs of the same size. The training code of our models is freely available.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.7905564308166504, 1.0808922052383423], "openalex_id": "https://openalex.org/W4408364813", "title": "Managing Output Risks From Imperfect LLMS", "authors": "Mageswaran Sanmugam, James Boldiston", "abstract": "Large Language Models (LLMs) like ChatGPT are powerful tools for generating well-written content quickly, but their inner workings are opaque, leading to concerns about the accuracy of their outputs. These models don't actually \u201cthink\u201d; they use statistical methods to generate language, creating a \u201cblack box\u201d where the reasoning behind their outputs is unclear. This can lead to plausible but factually incorrect content being mistaken for accurate information. Instead of expecting LLMs to explain their reasoning, users should approach their outputs critically, recognizing that speed doesn't guarantee accuracy. Human validation is essential to mitigate the risks associated with LLMs, ensuring that their content is used safely and effectively.", "venue": "Advances in educational technologies and instructional design book series", "label": 0}, {"loc": [5.123612880706787, 0.9856578707695007], "openalex_id": "https://openalex.org/W4405433384", "title": "GAOKAO-Eval: Does high scores truly reflect strong capabilities in LLMs?", "authors": "Z. H. Lei, Tianyi Liang, Hongjiu Hu, Jin Zhang, Yunhua Zhou, Yaping Shao, Linyang Li, C. Li, Changbo Wang, Hang Yan, Qipeng Guo", "abstract": "Large Language Models (LLMs) are commonly evaluated using human-crafted\\nbenchmarks, under the premise that higher scores implicitly reflect stronger\\nhuman-like performance. However, there is growing concern that LLMs may ``game\"\\nthese benchmarks due to data leakage, achieving high scores while struggling\\nwith tasks simple for humans. To substantively address the problem, we create\\nGAOKAO-Eval, a comprehensive benchmark based on China's National College\\nEntrance Examination (Gaokao), and conduct ``closed-book\" evaluations for\\nrepresentative models released prior to Gaokao. Contrary to prevailing\\nconsensus, even after addressing data leakage and comprehensiveness,\\nGAOKAO-Eval reveals that high scores still fail to truly reflect human-aligned\\ncapabilities. To better understand this mismatch, We introduce the Rasch model\\nfrom cognitive psychology to analyze LLM scoring patterns and identify two key\\ndiscrepancies: 1) anomalous consistent performance across various question\\ndifficulties, and 2) high variance in performance on questions of similar\\ndifficulty. In addition, We identified inconsistent grading of LLM-generated\\nanswers among teachers and recurring mistake patterns. we find that the\\nphenomenons are well-grounded in the motivations behind OpenAI o1, and o1's\\nreasoning-as-difficulties can mitigate the mismatch. These results show that\\nGAOKAO-Eval can reveal limitations in LLM capabilities not captured by current\\nbenchmarks and highlight the need for more LLM-aligned difficulty analysis.\\n", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.682209491729736, 5.2333478927612305], "openalex_id": "https://openalex.org/W4405312203", "title": "Multimodal Latent Language Modeling with Next-Token Diffusion", "authors": "Yutao Sun, Hangbo Bao, Wenhui Wang, Zhiliang Peng, Li Dong, Shaohan Huang, Jianyong Wang, Furu Wei", "abstract": "Multimodal generative models require a unified approach to handle both discrete data (e.g., text and code) and continuous data (e.g., image, audio, video). In this work, we propose Latent Language Modeling (LatentLM), which seamlessly integrates continuous and discrete data using causal Transformers. Specifically, we employ a variational autoencoder (VAE) to represent continuous data as latent vectors and introduce next-token diffusion for autoregressive generation of these vectors. Additionally, we develop $\u03c3$-VAE to address the challenges of variance collapse, which is crucial for autoregressive modeling. Extensive experiments demonstrate the effectiveness of LatentLM across various modalities. In image generation, LatentLM surpasses Diffusion Transformers in both performance and scalability. When integrated into multimodal large language models, LatentLM provides a general-purpose interface that unifies multimodal generation and understanding. Experimental results show that LatentLM achieves favorable performance compared to Transfusion and vector quantized models in the setting of scaling up training tokens. In text-to-speech synthesis, LatentLM outperforms the state-of-the-art VALL-E 2 model in speaker similarity and robustness, while requiring 10x fewer decoding steps. The results establish LatentLM as a highly effective and scalable approach to advance large multimodal models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.149744987487793, 1.0936181545257568], "openalex_id": "https://openalex.org/W4405315349", "title": "Assisting Quality Assurance of Examination Tasks: Using a GPT Model and Bayesian Testing for Formative Assessment", "authors": "Nico Willert, Phi Katharina W\u00fcrz", "abstract": "Formative quality assurance in the creation of examination tasks has always been an extremely time-consuming process. Especially due to the changing and short-lived content of computer science, new questions have to be created regularly, which in turn requires quality assurance. With the emergence of artificial intelligence (AI) systems such as ChatGPT and their ability to solve a range of different tasks, the question arises as to what extent this ability can also be utilized as part of a quality assurance process. One aspect of the formative quality assurance of multiple-choice questions involves checking the correct classification of alternative answers into correct and incorrect answers. As AI systems inherently lack transparency and predictability in their output, we present a simplified approach using Bayesian hypothesis testing to estimate the tendencies of an AI towards the classification. To evaluate the approach, the process is implemented and connected to the OpenAI API to handle inconsistent responses and other aspects that contribute to the robustness and reliability. This research is concluded by an evaluation carried out by means of the gpt-3.5-turbo model, using the examination tasks of two programming courses. This provides insights into the response scheme of the AI in relation to the prompt pattern used and the usability of AI for the subsequent quality assurance process.", "venue": "Computers and Education Artificial Intelligence", "label": 0}, {"loc": [7.6495208740234375, 1.8146449327468872], "openalex_id": "https://openalex.org/W4405254694", "title": "When Every Token Counts: Optimal Segmentation for Low-Resource Language Models", "authors": "Bharath Raj S, Gaurav Suri, Vikrant Dewangan, Raghav Sonavane", "abstract": "Traditional greedy tokenization methods have been a critical step in Natural Language Processing (NLP), influencing how text is converted into tokens and directly impacting model performance. While subword tokenizers like Byte-Pair Encoding (BPE) are widely used, questions remain about their optimality across model scales and languages. In this work, we demonstrate through extensive experiments that an optimal BPE configuration significantly reduces token count compared to greedy segmentation, yielding improvements in token-saving percentages and performance benefits, particularly for smaller models. We evaluate tokenization performance across various intrinsic and extrinsic tasks, including generation and classification. Our findings suggest that compression-optimized tokenization strategies could provide substantial advantages for multilingual and low-resource language applications, highlighting a promising direction for further research and inclusive NLP.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.8143832683563232, 2.803744077682495], "openalex_id": "https://openalex.org/W4405254978", "title": "The Mirage of Artificial Intelligence Terms of Use Restrictions", "authors": "Peter Henderson, Mark A. Lemley", "abstract": "Artificial intelligence (AI) model creators commonly attach restrictive terms of use to both their models and their outputs. These terms typically prohibit activities ranging from creating competing AI models to spreading disinformation. Often taken at face value, these terms are positioned by companies as key enforceable tools for preventing misuse, particularly in policy dialogs. But are these terms truly meaningful? There are myriad examples where these broad terms are regularly and repeatedly violated. Yet except for some account suspensions on platforms, no model creator has actually tried to enforce these terms with monetary penalties or injunctive relief. This is likely for good reason: we think that the legal enforceability of these licenses is questionable. This Article systematically assesses of the enforceability of AI model terms of use and offers three contributions. First, we pinpoint a key problem: the artifacts that they protect, namely model weights and model outputs, are largely not copyrightable, making it unclear whether there is even anything to be licensed. Second, we examine the problems this creates for other enforcement. Recent doctrinal trends in copyright preemption may further undermine state-law claims, while other legal frameworks like the DMCA and CFAA offer limited recourse. Anti-competitive provisions likely fare even worse than responsible use provisions. Third, we provide recommendations to policymakers. There are compelling reasons for many provisions to be unenforceable: they chill good faith research, constrain competition, and create quasi-copyright ownership where none should exist. There are, of course, downsides: model creators have fewer tools to prevent harmful misuse. But we think the better approach is for statutory provisions, not private fiat, to distinguish between good and bad uses of AI, restricting the latter.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.3766279220581055, 0.08150875568389893], "openalex_id": "https://openalex.org/W4405269285", "title": "ChocoLlama: Lessons Learned From Teaching Llamas Dutch", "authors": "Matthieu Meeus, Anthony Rath\u00e9, Fran\u00e7ois Remy, Pieter Delobelle, Jens-Joris Decorte, Thomas Demeester", "abstract": "While Large Language Models (LLMs) have shown remarkable capabilities in natural language understanding and generation, their performance often lags in lower-resource, non-English languages due to biases in the training data. In this work, we explore strategies for adapting the primarily English LLMs (Llama-2 and Llama-3) to Dutch, a language spoken by 30 million people worldwide yet often underrepresented in LLM development. We collect 104GB of Dutch text ($32$B tokens) from various sources to first apply continued pretraining using low-rank adaptation (LoRA), complemented with Dutch posttraining strategies provided by prior work. For Llama-2, we consider using (i) the tokenizer of the original model, and (ii) training a new, Dutch-specific tokenizer combined with embedding reinitialization. We evaluate our adapted models, ChocoLlama-2, both on standard benchmarks and a novel Dutch benchmark, ChocoLlama-Bench. Our results demonstrate that LoRA can effectively scale for language adaptation, and that tokenizer modification with careful weight reinitialization can improve performance. Notably, Llama-3 was released during the course of this project and, upon evaluation, demonstrated superior Dutch capabilities compared to our Dutch-adapted versions of Llama-2. We hence apply the same adaptation technique to Llama-3, using its original tokenizer. While our adaptation methods enhanced Llama-2's Dutch capabilities, we found limited gains when applying the same techniques to Llama-3. This suggests that for ever improving, multilingual foundation models, language adaptation techniques may benefit more from focusing on language-specific posttraining rather than on continued pretraining. We hope this work contributes to the broader understanding of adapting LLMs to lower-resource languages, and to the development of Dutch LLMs in particular.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.3718647956848145, 5.526963710784912], "openalex_id": "https://openalex.org/W4405253832", "title": "Chimera: Improving Generalist Model with Domain-Specific Experts", "authors": "Tzu\u2010Rong Peng, Mingsheng Li, Hongbin Zhou, Renqiu Xia, Renrui Zhang, Lei Bai, Mao Song, Bin Wang, Conghui He, Aojun Zhou, Botian Shi, Tao Chen, Bo Zhang, Xiangyu Yue", "abstract": "Recent advancements in Large Multi-modal Models (LMMs) underscore the importance of scaling by increasing image-text paired data, achieving impressive performance on general tasks. Despite their effectiveness in broad applications, generalist models are primarily trained on web-scale datasets dominated by natural images, resulting in the sacrifice of specialized capabilities for domain-specific tasks that require extensive domain prior knowledge. Moreover, directly integrating expert models tailored for specific domains is challenging due to the representational gap and imbalanced optimization between the generalist model and experts. To address these challenges, we introduce Chimera, a scalable and low-cost multi-modal pipeline designed to boost the ability of existing LMMs with domain-specific experts. Specifically, we design a progressive training strategy to integrate features from expert models into the input of a generalist LMM. To address the imbalanced optimization caused by the well-aligned general visual encoder, we introduce a novel Generalist-Specialist Collaboration Masking (GSCM) mechanism. This results in a versatile model that excels across the chart, table, math, and document domains, achieving state-of-the-art performance on multi-modal reasoning and visual content extraction tasks, both of which are challenging tasks for assessing existing LMMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.315122365951538, 1.6694475412368774], "openalex_id": "https://openalex.org/W4405209518", "title": "Generative AI and Business: A Review and Research Agenda", "authors": "Jennifer L. Woolley", "abstract": "Generative Artificial Intelligence (Generative AI or GenAI) is an emerging and fast-growing technology with important consequences for workers, businesses, and society. GenAI is a subset of Machine Learning, which is a branch of Artificial Intelligence (AI) that not only analyzes data but also produces human-like output in response to users\u2019 prompts. These creative capabilities distinguish it from former iterations of AI, with distinctive and profound implications. Largely unknown before, GenAI became a household term in the fall of 2022 with the wide release and adoption of ChatGPT. Since then, GenAI has incited discussion and debate in many settings, including education, social media, and engineering. However, much uncertainty and confusion surrounds the topic, even with its widespread adoption. To gain a better understanding of the topic requires a brief overview to clarify some of the most pressing issues pertaining to GenAI and business introducing GenAI and what makes it different from traditional AI, highlighting its ability to produce data besides analyzing it. The generated data or the output that GenAI produces can be categorized as language-based, visual, auditory, and multimodal. Each category has applications in several fields that range from incremental to transformative. Use cases in customer service, sales and marketing, finance, healthcare, education, gaming, and operations are also relevant. Although much has been made of the benefits of using GenAI, there are also important challenges, particularly in terms of a program\u2019s design, technological limitations, and ethical and legal considerations.", "venue": "Oxford Research Encyclopedia of Business and Management", "label": 0}, {"loc": [6.627223014831543, 2.013667106628418], "openalex_id": "https://openalex.org/W4405253920", "title": "Infusing Prompts with Syntax and Semantics", "authors": "Anton Bulle Labate, F\u00e1bio Gagliardi Cozman", "abstract": "Despite impressive success, language models often generate outputs with flawed linguistic structure. We analyze the effect of directly infusing various kinds of syntactic and semantic information into large language models. To demonstrate the value of our proposals, we focus on the translation of natural language queries to SQL, in particular dealing with languages with less resources than English, to better investigate how much help we can get from low cost syntactic and semantic information. We show that linguistic analysis can significantly boost language models, to the point that we have surpassed previous best systems.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.076251983642578, 1.7426836490631104], "openalex_id": "https://openalex.org/W4405343277", "title": "Is Artificial Intelligence the Future of Collective Memory?", "authors": "Sarah Gensburger, Fr\u00e9d\u00e9ric Clavert", "abstract": "Abstract This Memory Studies Review special issue explores the intricate relationship between artificial intelligence (ai) and collective memory. In the one hand, the emergence of generative ai, exemplified by ChatGPT\u2019s 2022 release, appears to herald a new infrastructure for collective memory. On the other, the memory studies work highlights the limits and the backlashes of this new form of memory in its social dimension. This leads to raise a provocative, open-ended question: Is artificial intelligence the future of collective memory? Our issue brings together diverse perspectives from memory studies scholars of different backgrounds and machine learning practitioners, fostering critical engagement with ai in memory practices. This multidisciplinary approach offers an initial exploration of the interactions between ai -powered software, platforms, and collective memory. The articles herein present a multifaceted analysis of ai \u2019s role in shaping collective memory\u2019s future. We advocate for increased interdisciplinary collaboration and ethical reflection in this rapidly evolving domain, providing memory studies scholars with a foundation for understanding and engaging with these technological transformations.", "venue": "Memory studies review.", "label": 0}, {"loc": [8.087554931640625, 3.638731002807617], "openalex_id": "https://openalex.org/W4405253602", "title": "Mixture of Hidden-Dimensions Transformer", "authors": "Yilong Chen, Junyuan Shang, Zhu Zhang, Jiawei Sheng, Tingwen Liu, Shuohuan Wang, Yu Sun, Hua Wu, Haifeng Wang", "abstract": "Transformer models encounter challenges in scaling hidden dimensions efficiently, as uniformly increasing them inflates computational and memory costs while failing to emphasize the most relevant features for each token. For further understanding, we study hidden dimension sparsity and observe that trained Transformers utilize only a small fraction of token dimensions, revealing an \"activation flow\" pattern. Notably, there are shared sub-dimensions with sustained activation across multiple consecutive tokens and specialized sub-dimensions uniquely activated for each token. To better model token-relevant sub-dimensions, we propose MoHD (Mixture of Hidden Dimensions), a sparse conditional activation architecture. Particularly, MoHD employs shared sub-dimensions for common token features and a routing mechanism to dynamically activate specialized sub-dimensions. To mitigate potential information loss from sparsity, we design activation scaling and group fusion mechanisms to preserve activation flow. In this way, MoHD expands hidden dimensions with negligible increases in computation or parameters, efficient training and inference while maintaining performance. Evaluations across 10 NLP tasks show that MoHD surpasses Vanilla Transformers in parameter efficiency and task performance. It achieves 1.7% higher performance with 50% fewer activation parameters and 3.7% higher performance with a 3x parameter expansion at constant activation cost. MOHD offers a new perspective for scaling the model, showcasing the potential of hidden dimension sparsity to boost efficiency", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.411947727203369, 5.102390766143799], "openalex_id": "https://openalex.org/W4405173237", "title": "BigDocs: An Open and Permissively-Licensed Dataset for Training Multimodal Models on Document and Code Tasks", "authors": "Juan Rodr\u00edguez, Xie Jian, Siba Smarak Panigrahi, Tianyu Zhang, Aarash Feizi, Abhay Puri, Akshay Kalkunte, Fran\u00e7ois Savard, Ahmed Masry, Shravan Nayak, Rabiul Awal, May A. Massoud, Amirhossein Abaskohi, Zichao Li, Suyuchen Wang, Pierre\u2010Andr\u00e9 No\u00ebl, M. Richter, Saverio Vadacchino, Shreya Agarwal, Sanket Biswas, Sara Shanian, Ying Zhang, Noah Bolger, Kathleen MacDonald, Simon Fauvel, Sathwik Tejaswi, Srinivas Sunkara, Jo\u00e3o Monteiro, Krishnamurthy Dvijotham, Torsten Scholak, Nicolas Chapados, Sepideh Kharagani, Sean Hughes, M. \u00d6zsu, Siva Reddy, Marco Pedersoli, Yoshua Bengio, Christopher Pal, Issam Laradji, Spandanna Gella, Perouz Taslakian, David V\u00e1zquez, Sai Rajeswar", "abstract": "Multimodal AI has the potential to significantly enhance document-understanding tasks, such as processing receipts, understanding workflows, extracting data from documents, and summarizing reports. Code generation tasks that require long-structured outputs can also be enhanced by multimodality. Despite this, their use in commercial applications is often limited due to limited access to training data and restrictive licensing, which hinders open access. To address these limitations, we introduce BigDocs-7.5M, a high-quality, open-access dataset comprising 7.5 million multimodal documents across 30 tasks. We use an efficient data curation process to ensure our data is high-quality and license-permissive. Our process emphasizes accountability, responsibility, and transparency through filtering rules, traceable metadata, and careful content analysis. Additionally, we introduce BigDocs-Bench, a benchmark suite with 10 novel tasks where we create datasets that reflect real-world use cases involving reasoning over Graphical User Interfaces (GUI) and code generation from images. Our experiments show that training with BigDocs-Bench improves average performance up to 25.8% over closed-source GPT-4o in document reasoning and structured output tasks such as Screenshot2HTML or Image2Latex generation. Finally, human evaluations showed a preference for outputs from models trained on BigDocs over GPT-4o. This suggests that BigDocs can help both academics and the open-source community utilize and improve AI tools to enhance multimodal capabilities and document reasoning. The project is hosted at https://bigdocs.github.io.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.763046979904175, -3.956239700317383], "openalex_id": "https://openalex.org/W4405173947", "title": "A Federated Approach to Few-Shot Hate Speech Detection for Marginalized Communities", "authors": "Haotian Ye, Axel Wisiorek, Antonis Maronikolakis, \u00d6zge Ala\u00e7am, Hinrich Sch\u00fctze", "abstract": "Hate speech online remains an understudied issue for marginalized communities, particularly in the Global South, which includes developing societies with increasing internet penetration. In this paper, we aim to provide marginalized communities in societies where the dominant language is low-resource with a privacy-preserving tool to protect themselves from online hate speech by filtering offensive content in their native languages. Our contributions are twofold: 1) we release REACT (REsponsive hate speech datasets Across ConTexts), a collection of high-quality, culture-specific hate speech detection datasets comprising multiple target groups and low-resource languages, curated by experienced data collectors; 2) we propose a few-shot hate speech detection approach based on federated learning (FL), a privacy-preserving method for collaboratively training a central model that exhibits robustness when tackling different target groups and languages. By keeping training local to user devices, we ensure data privacy while leveraging the collective learning benefits of FL. Furthermore, we explore personalized client models tailored to specific target groups and evaluate their performance. Our findings indicate the overall effectiveness of FL across different target groups, and point to personalization as a promising direction.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.078463554382324, 2.4836485385894775], "openalex_id": "https://openalex.org/W4405172771", "title": "Evaluating and Aligning CodeLLMs on Human Preference", "authors": "Jian Yang, Jiaxi Yang, Ke Jin, Yibo Miao, Lei Zhang, Liqun Yang, Z. Q. Cui, Yichang Zhang, Binyuan Hui, Jie Lin", "abstract": "Code large language models (codeLLMs) have made significant strides in code generation. Most previous code-related benchmarks, which consist of various programming exercises along with the corresponding test cases, are used as a common measure to evaluate the performance and capabilities of code LLMs. However, the current code LLMs focus on synthesizing the correct code snippet, ignoring the alignment with human preferences, where the query should be sampled from the practical application scenarios and the model-generated responses should satisfy the human preference. To bridge the gap between the model-generated response and human preference, we present a rigorous human-curated benchmark CodeArena to emulate the complexity and diversity of real-world coding tasks, where 397 high-quality samples spanning 40 categories and 44 programming languages, carefully curated from user queries. Further, we propose a diverse synthetic instruction corpus SynCode-Instruct (nearly 20B tokens) by scaling instructions from the website to verify the effectiveness of the large-scale synthetic instruction fine-tuning, where Qwen2.5-SynCoder totally trained on synthetic instruction data can achieve top-tier performance of open-source code LLMs. The results find performance differences between execution-based benchmarks and CodeArena. Our systematic experiments of CodeArena on 40+ LLMs reveal a notable performance gap between open SOTA code LLMs (e.g. Qwen2.5-Coder) and proprietary LLMs (e.g., OpenAI o1), underscoring the importance of the human preference alignment.\\footnote{\\url{https://codearenaeval.github.io/ }}", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.234889507293701, 2.127105474472046], "openalex_id": "https://openalex.org/W4405173955", "title": "Probing the contents of semantic representations from text, behavior, and brain data using the psychNorms metabase", "authors": "Z. Hussain, Rui Mata, Ben R. Newell, Dirk U. Wulff", "abstract": "Semantic representations are integral to natural language processing, psycholinguistics, and artificial intelligence. Although often derived from internet text, recent years have seen a rise in the popularity of behavior-based (e.g., free associations) and brain-based (e.g., fMRI) representations, which promise improvements in our ability to measure and model human representations. We carry out the first systematic evaluation of the similarities and differences between semantic representations derived from text, behavior, and brain data. Using representational similarity analysis, we show that word vectors derived from behavior and brain data encode information that differs from their text-derived cousins. Furthermore, drawing on our psychNorms metabase, alongside an interpretability method that we call representational content analysis, we find that, in particular, behavior representations capture unique variance on certain affective, agentic, and socio-moral dimensions. We thus establish behavior as an important complement to text for capturing human representations and behavior. These results are broadly relevant to research aimed at learning human-aligned semantic representations, including work on evaluating and aligning large language models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.569602012634277, 2.2220020294189453], "openalex_id": "https://openalex.org/W4405096470", "title": "Using Images to Find Context-Independent Word Representations in Vector Space", "authors": "Brajesh Kumar", "abstract": "Many methods have been proposed to find vector representation for words, but most rely on capturing context from the text to find semantic relationships between these vectors. We propose a novel method of using dictionary meanings and image depictions to find word vectors independent of any context. We use auto-encoder on the word images to find meaningful representations and use them to calculate the word vectors. We finally evaluate our method on word similarity, concept categorization and outlier detection tasks. Our method performs comparably to context-based methods while taking much less training time.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.912576675415039, 2.9310359954833984], "openalex_id": "https://openalex.org/W4405094746", "title": "Towards Data Governance of Frontier AI Models", "authors": "Jason Hausenloy, Duncan McClements, Madhavendra Thakur", "abstract": "Data is essential to train and fine-tune today's frontier artificial intelligence (AI) models and to develop future ones. To date, academic, legal, and regulatory work has primarily addressed how data can directly harm consumers and creators, such as through privacy breaches, copyright infringements, and bias and discrimination. Our work, instead, focuses on the comparatively neglected question of how data can enable new governance capacities for frontier AI models. This approach for \"frontier data governance\" opens up new avenues for monitoring and mitigating risks from advanced AI models, particularly as they scale and acquire specific dangerous capabilities. Still, frontier data governance faces challenges that stem from the fundamental properties of data itself: data is non-rival, often non-excludable, easily replicable, and increasingly synthesizable. Despite these inherent difficulties, we propose a set of policy mechanisms targeting key actors along the data supply chain, including data producers, aggregators, model developers, and data vendors. We provide a brief overview of 15 governance mechanisms, of which we centrally introduce five, underexplored policy recommendations. These include developing canary tokens to detect unauthorized use for producers; (automated) data filtering to remove malicious content for pre-training and post-training datasets; mandatory dataset reporting requirements for developers and vendors; improved security for datasets and data generation algorithms; and know-your-customer requirements for vendors. By considering data not just as a source of potential harm, but as a critical governance lever, this work aims to equip policymakers with a new tool for the governance and regulation of frontier AI models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.406844139099121, -0.25888633728027344], "openalex_id": "https://openalex.org/W4405094822", "title": "Marco-LLM: Bridging Languages via Massive Multilingual Training for Cross-Lingual Enhancement", "authors": "Lingfeng Ming, Bo Zeng, Chenyang Lyu, Tianqi Shi, Yu Zhao, Xue Yang, Yefeng Liu, Yiyu Wang, Linlong Xu, Yangyang Liu, Zhao Xiao-hu, Hao Wang, Heng Liu, Hao Zhou, Haowen Yin, Zifu Shang, Haijun Li, Longyue Wang, Weihua Luo, Kaifu Zhang", "abstract": "Large Language Models (LLMs) have achieved remarkable progress in recent years; however, their excellent performance is still largely limited to major world languages, primarily English. Many LLMs continue to face challenges with multilingual tasks, especially when it comes to low-resource languages. To address this issue, we introduced Marco-LLM: Massive multilingual training for cross-lingual enhancement LLM. We have collected a substantial amount of multilingual data for several low-resource languages and conducted extensive continual pre-training using the Qwen2 models. This effort has resulted in a multilingual LLM named Marco-LLM. Through comprehensive evaluations on various multilingual benchmarks, including MMMLU, AGIEval, Belebele, Flores-200, XCOPA and many others, Marco-LLM has demonstrated substantial improvements over state-of-the-art LLMs. Furthermore, Marco-LLM achieved substantial enhancements in any-to-any machine translation tasks, showing the effectiveness of our multilingual LLM. Marco-LLM is a pioneering multilingual LLM designed to not only perform exceptionally well in multilingual tasks, including low-resource languages, but also maintain strong performance in English and other major languages, closing the performance gap between high- and low-resource language capabilities. By bridging languages, this effort demonstrates our dedication to ensuring LLMs work accurately across various languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.789976596832275, -0.9102476835250854], "openalex_id": "https://openalex.org/W4405095498", "title": "BhashaVerse: Translation Ecosystem for Indian Subcontinent Languages", "authors": "Vandan Mujadia, Dipti Misra Sharma", "abstract": "This paper focuses on developing translation models and related applications for 36 Indian languages, including Assamese, Awadhi, Bengali, Bhojpuri, Braj, Bodo, Dogri, English, Konkani, Gondi, Gujarati, Hindi, Hinglish, Ho, Kannada, Kangri, Kashmiri (Arabic and Devanagari), Khasi, Mizo, Magahi, Maithili, Malayalam, Marathi, Manipuri (Bengali and Meitei), Nepali, Oriya, Punjabi, Sanskrit, Santali, Sinhala, Sindhi (Arabic and Devanagari), Tamil, Tulu, Telugu, and Urdu. Achieving this requires parallel and other types of corpora for all 36 * 36 language pairs, addressing challenges like script variations, phonetic differences, and syntactic diversity. For instance, languages like Kashmiri and Sindhi, which use multiple scripts, demand script normalization for alignment, while low-resource languages such as Khasi and Santali require synthetic data augmentation to ensure sufficient coverage and quality. To address these challenges, this work proposes strategies for corpus creation by leveraging existing resources, developing parallel datasets, generating domain-specific corpora, and utilizing synthetic data techniques. Additionally, it evaluates machine translation across various dimensions, including standard and discourse-level translation, domain-specific translation, reference-based and reference-free evaluation, error analysis, and automatic post-editing. By integrating these elements, the study establishes a comprehensive framework to improve machine translation quality and enable better cross-lingual communication in India's linguistically diverse ecosystem.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.568054676055908, 5.377898216247559], "openalex_id": "https://openalex.org/W4405114800", "title": "A Survey of AI-Generated Content (AIGC)", "authors": "Y. Charles Cao, Suyuan Li, Yixin Liu, Zhiling Yan, Yutong Dai, Philip S. Yu, Lichao Sun", "abstract": "Recently, Artificial Intelligence Generated Content (AIGC) has gained significant attention from society, especially with the rise of Generative AI (GAI) techniques such as ChatGPT, GPT-4 [ 165 ], DALL-E-3 [ 184 ], and Sora [ 137 ]. AIGC involves using AI models to create digital content, such as images, music, and natural language, with the goal of making the content creation process more efficient and accessible. Large-scale models have become increasingly important in AIGC as they provide better intent extraction and generation results. This survey provides a comprehensive review of the history of generative models and recent advances in AIGC, focusing on both unimodal and multimodal interaction. From the perspective of unimodality, we introduce the generation tasks and relative models of text and image. From the perspective of multimodality, we introduce the cross-application between the modalities mentioned above. Finally, the survey discusses the existing open problems and future challenges in AIGC. Overall, this survey serves as a valuable resource for individuals interested in understanding the background and secrets behind the impressive performance of AIGC techniques.", "venue": "ACM Computing Surveys", "label": 7}, {"loc": [8.267451286315918, 1.8039902448654175], "openalex_id": "https://openalex.org/W4405089011", "title": "Surveying the Effects of Quality, Diversity, and Complexity in Synthetic Data From Large Language Models", "authors": "Alex Havrilla, Andrew M. Dai, Laura O'Mahony, Koen Oostermeijer, Vera Zisler, Alon Albalak, Fabrizio Milo, Sharath Chandra Raparthy, Kanishk Gandhi, Baber Abbasi, Duy Phung, Maia Iyer, Dakota Mahan, C. O. Blagden, Srishti Gureja, Mohammed Hamdy, Weizhong Li, Giovanni Paolini, Pawan Sasanka Ammanamanchi, Elliot Meyerson", "abstract": "Synthetic data generation with Large Language Models is a promising paradigm for augmenting natural data over a nearly infinite range of tasks. Given this variety, direct comparisons among synthetic data generation algorithms are scarce, making it difficult to understand where improvement comes from and what bottlenecks exist. We propose to evaluate algorithms via the makeup of synthetic data generated by each algorithm in terms of data quality, diversity, and complexity. We choose these three characteristics for their significance in open-ended processes and the impact each has on the capabilities of downstream models. We find quality to be essential for in-distribution model generalization, diversity to be essential for out-of-distribution generalization, and complexity to be beneficial for both. Further, we emphasize the existence of Quality-Diversity trade-offs in training data and the downstream effects on model performance. We then examine the effect of various components in the synthetic data pipeline on each data characteristic. This examination allows us to taxonomize and compare synthetic data generation algorithms through the components they utilize and the resulting effects on data QDC composition. This analysis extends into a discussion on the importance of balancing QDC in synthetic data for efficient reinforcement learning and self-improvement algorithms. Analogous to the QD trade-offs in training data, often there exist trade-offs between model output quality and output diversity which impact the composition of synthetic data. We observe that many models are currently evaluated and optimized only for output quality, thereby limiting output diversity and the potential for self-improvement. We argue that balancing these trade-offs is essential to the development of future self-improvement algorithms and highlight a number of works making progress in this direction.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.4667439460754395, 1.7110711336135864], "openalex_id": "https://openalex.org/W4405090329", "title": "RedStone: Curating General, Code, Math, and QA Data for Large Language Models", "authors": "Yaoyao Chang, Lei Cui, Li Dong, Shaohan Huang, Yangyu Huang, Yupan Huang, Shan Li, Tengchao Lv, Shuming Ma, Qi Sun, Wenhui Wang, Furu Wei, Ying Xin, Mao Yang, Qiufeng Yin, Xingxing Zhang", "abstract": "Pre-training Large Language Models (LLMs) on high-quality, meticulously curated datasets is widely recognized as critical for enhancing their performance and generalization capabilities. This study explores the untapped potential of Common Crawl as a comprehensive and flexible resource for pre-training LLMs, addressing both general-purpose language understanding and specialized domain knowledge. We introduce RedStone, an innovative and scalable pipeline engineered to extract and process data from Common Crawl, facilitating the creation of extensive and varied pre-training datasets. Unlike traditional datasets, which often require expensive curation and domain-specific expertise, RedStone leverages the breadth of Common Crawl to deliver datasets tailored to a wide array of domains. In this work, we exemplify its capability by constructing pre-training datasets across multiple fields, including general language understanding, code, mathematics, and question-answering tasks. The flexibility of RedStone allows for easy adaptation to other specialized domains, significantly lowering the barrier to creating valuable domain-specific datasets. Our findings demonstrate that Common Crawl, when harnessed through effective pipelines like RedStone, can serve as a rich, renewable source of pre-training data, unlocking new avenues for domain adaptation and knowledge discovery in LLMs. This work also underscores the importance of innovative data acquisition strategies and highlights the role of web-scale data as a powerful resource in the continued evolution of LLMs. RedStone code and data samples will be publicly available at \\url{https://aka.ms/redstone}.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.438236713409424, 3.423556327819824], "openalex_id": "https://openalex.org/W4405082018", "title": "Measuring Bias of Web-filtered Text Datasets and Bias Propagation Through Training", "authors": "Youssef Mansour, Reinhard Heckel", "abstract": "We investigate biases in pretraining datasets for large language models (LLMs) through dataset classification experiments. Building on prior work demonstrating the existence of biases in popular computer vision datasets, we analyze popular open-source pretraining datasets for LLMs derived from CommonCrawl including C4, RefinedWeb, DolmaCC, RedPajama-V2, FineWeb, and DCLM-Baseline. Despite those datasets being obtained with similar curation steps, neural networks can classify surprisingly well which dataset a single text sequence belongs to, significantly better than a human can. This indicates that small differences in filtering and processing pipelines induce fingerprints evident in formatting, vocabulary, and content distributions. Those biases remain even when the text is rewritten with LLMs. Moreover, these biases propagate through training: Random sequences generated by models trained on those datasets can be classified well by a classifier trained on the original datasets. This can be leveraged to estimate the pretraining mixture proportions of the data sources.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.3177380561828613, 2.4133894443511963], "openalex_id": "https://openalex.org/W4405010857", "title": "Are We All Musicians Now? Authenticity, Musicianship, and AI Music Generator Suno", "authors": "Selim Tan", "abstract": "This paper analyzes the music\u2019s authenticity and the musicianship behind Suno, an artificial intelligence (AI) music generator. The developers launched Suno\u2019s initial release on December 20, 2023, and the latest version, v4, on November 19, 2024. Behind this digital platform are AI experts and musicians based in Cambridge, MA. Suno does not participate in conventional composing, songwriting, performing, or recording practices. Instead, it generates music based on user prompts in a text/audio/image/video-to-music interface. Suno aims to democratize music production by enabling anyone to become a \u201cmusician.\u201d However, the absence of a conventional \u201creal\u201d artist or author (auteur) responsible for these music productions raises questions about how various agents will perceive the music\u2019s authenticity. Through observations and conceptual reflections, this paper explores the possibilities of authenticating Suno\u2019s productions, their implications for music scenes, and the potentially changing nature of musicianship. Finally, this paper argues that AI-based music can achieve authenticity without conventional musicianship, allowing for the emergence of a new, albeit contested, form of musicianship.", "venue": "https://doi.org/10.31235/osf.io/4nt8z", "label": 0}, {"loc": [3.5913233757019043, -0.10030263662338257], "openalex_id": "https://openalex.org/W4405033303", "title": "ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance & Efficiency on a Specific Domain", "authors": "Ali Shiraee Kasmaee, Mohammad Khodadad, Mohammad Arshi Saloot, Nick Sherck, Stephen Dokas, Hamidreza Mahyar, Soheila Samiee", "abstract": "Recent advancements in language models have started a new era of superior information retrieval and content generation, with embedding models playing an important role in optimizing data representation efficiency and performance. While benchmarks like the Massive Text Embedding Benchmark (MTEB) have standardized the evaluation of general domain embedding models, a gap remains in specialized fields such as chemistry, which require tailored approaches due to domain-specific challenges. This paper introduces a novel benchmark, the Chemical Text Embedding Benchmark (ChemTEB), designed specifically for the chemical sciences. ChemTEB addresses the unique linguistic and semantic complexities of chemical literature and data, offering a comprehensive suite of tasks on chemical domain data. Through the evaluation of 34 open-source and proprietary models using this benchmark, we illuminate the strengths and weaknesses of current methodologies in processing and understanding chemical information. Our work aims to equip the research community with a standardized, domain-specific evaluation framework, promoting the development of more precise and efficient NLP models for chemistry-related applications. Furthermore, it provides insights into the performance of generic models in a domain-specific context. ChemTEB comes with open-source code and data, contributing further to its accessibility and utility.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.569657325744629, 3.876513719558716], "openalex_id": "https://openalex.org/W4405033097", "title": "Yi-Lightning Technical Report", "authors": "AI, NULL AUTHOR_ID, Akio Wake, Albert Wang, Bei Chen, Chao Lv, Chao Li, C. Huang, Chenglin Cai, Chujie Zheng, Daniel Cooper, Erkuan Dai, Fan Zhou, Feng Hu, Hongliang Ji, Haiyue Qiu, Jiangcheng Zhu, Tian Jun, K. Y. L. Su, Lihuan Zhang, Liying Li, Ming Song, Mou Li, Peng Liu, Qianran Hu, S. Wang, Shijun Zhou, Shiyong Li, Tianhang Zhu, Wenlian Xie, Tingshu He, Xiaobo Chen, Xiaohui Hu, Xiaoyi Ren, Xinyao Niu, Yanpeng Li, Yongke Zhao, Yongzhen Luo, Yuchi Xu, Yiwen Sha, Z. B. Yan, Zhiyuan Liu, Zirui Zhang", "abstract": "This technical report presents Yi-Lightning, our latest flagship large language model (LLM). It achieves exceptional performance, ranking 6th overall on Chatbot Arena, with particularly strong results (2nd to 4th place) in specialized categories including Chinese, Math, Coding, and Hard Prompts. Yi-Lightning leverages an enhanced Mixture-of-Experts (MoE) architecture, featuring advanced expert segmentation and routing mechanisms coupled with optimized KV-caching techniques. Our development process encompasses comprehensive pre-training, supervised fine-tuning (SFT), and reinforcement learning from human feedback (RLHF), where we devise deliberate strategies for multi-stage training, synthetic data construction, and reward modeling. Furthermore, we implement RAISE (Responsible AI Safety Engine), a four-component framework to address safety issues across pre-training, post-training, and serving phases. Empowered by our scalable super-computing infrastructure, all these innovations substantially reduce training, deployment and inference costs while maintaining high-performance standards. With further evaluations on public academic benchmarks, Yi-Lightning demonstrates competitive performance against top-tier LLMs, while we observe a notable disparity between traditional, static benchmark results and real-world, dynamic human preferences. This observation prompts a critical reassessment of conventional benchmarks' utility in guiding the development of more intelligent and powerful AI systems for practical applications. Yi-Lightning is now available through our developer platform at https://platform.lingyiwanwu.com.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.147225856781006, 2.402921438217163], "openalex_id": "https://openalex.org/W4405033650", "title": "Exploring the Abilities of Large Language Models to Solve Proportional Analogies via Knowledge-Enhanced Prompting", "authors": "Thilini Wijesiriwardene, Ruwan Wickramarachchi, Sreeram Vennam, Vinija Jain, Aman Chadha, Amitava Das, Ponnurangam Kumaraguru, Amit Sheth", "abstract": "Making analogies is fundamental to cognition. Proportional analogies, which consist of four terms, are often used to assess linguistic and cognitive abilities. For instance, completing analogies like \"Oxygen is to Gas as is to \" requires identifying the semantic relationship (e.g., \"type of\") between the first pair of terms (\"Oxygen\" and \"Gas\") and finding a second pair that shares the same relationship (e.g., \"Aluminum\" and \"Metal\"). In this work, we introduce a 15K Multiple-Choice Question Answering (MCQA) dataset for proportional analogy completion and evaluate the performance of contemporary Large Language Models (LLMs) in various knowledge-enhanced prompt settings. Specifically, we augment prompts with three types of knowledge: exemplar, structured, and targeted. Our results show that despite extensive training data, solving proportional analogies remains challenging for current LLMs, with the best model achieving an accuracy of 55%. Notably, we find that providing targeted knowledge can better assist models in completing proportional analogies compared to providing exemplars or collections of structured knowledge. Our code and data are available at: https://github.com/Thiliniiw/KnowledgePrompts/", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.62334156036377, 0.3672660291194916], "openalex_id": "https://openalex.org/W4405033727", "title": "Uhura: A Benchmark for Evaluating Scientific Question Answering and Truthfulness in Low-Resource African Languages", "authors": "Edward Bayes, Israel Abebe Azime, Jesujoba O. Alabi, Jonas Kgomo, Tyna Eloundou, Elizabeth Proehl, Kai Chen, Imaan Khadir, Naome A. Etori, Shamsuddeen Hassan Muhammad, Choice D Mpanza, Igneciah Pocia Thete, Dietrich Klakow, David Ifeoluwa Adelani", "abstract": "Evaluations of Large Language Models (LLMs) on knowledge-intensive tasks and factual accuracy often focus on high-resource languages primarily because datasets for low-resource languages (LRLs) are scarce. In this paper, we present Uhura -- a new benchmark that focuses on two tasks in six typologically-diverse African languages, created via human translation of existing English benchmarks. The first dataset, Uhura-ARC-Easy, is composed of multiple-choice science questions. The second, Uhura-TruthfulQA, is a safety benchmark testing the truthfulness of models on topics including health, law, finance, and politics. We highlight the challenges creating benchmarks with highly technical content for LRLs and outline mitigation strategies. Our evaluation reveals a significant performance gap between proprietary models such as GPT-4o and o1-preview, and Claude models, and open-source models like Meta's LLaMA and Google's Gemma. Additionally, all models perform better in English than in African languages. These results indicate that LMs struggle with answering scientific questions and are more prone to generating false claims in low-resource African languages. Our findings underscore the necessity for continuous improvement of multilingual LM capabilities in LRL settings to ensure safe and reliable use in real-world contexts. We open-source the Uhura Benchmark and Uhura Platform to foster further research and development in NLP for LRLs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.794388771057129, -1.069516897201538], "openalex_id": "https://openalex.org/W4405034050", "title": "SiTSE: Sinhala Text Simplification Dataset and Evaluation", "authors": "Surangika Ranathunga, Rumesh Sirithunga, Himashi Rathnayake, Let\u00edcia Crist\u00f3foli Duarte Silva, Thamindu Aluthwala, Saman Peramuna, Ravi Shekhar", "abstract": "Text Simplification is a task that has been minimally explored for low-resource languages. Consequently, there are only a few manually curated datasets. In this paper, we present a human curated sentence-level text simplification dataset for the Sinhala language. Our evaluation dataset contains 1,000 complex sentences and corresponding 3,000 simplified sentences produced by three different human annotators. We model the text simplification task as a zero-shot and zero resource sequence-to-sequence (seq-seq) task on the multilingual language models mT5 and mBART. We exploit auxiliary data from related seq-seq tasks and explore the possibility of using intermediate task transfer learning (ITTL). Our analysis shows that ITTL outperforms the previously proposed zero-resource methods for text simplification. Our findings also highlight the challenges in evaluating text simplification systems, and support the calls for improved metrics for measuring the quality of automated text simplification systems that would suit low-resource languages as well. Our code and data are publicly available: https://github.com/brainsharks-fyp17/Sinhala-Text-Simplification-Dataset-and-Evaluation", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.542983055114746, 1.4996027946472168], "openalex_id": "https://openalex.org/W4404965460", "title": "Generative AI and future education: a review, theoretical validation, and authors' perspective on challenges and solutions", "authors": "Wali Khan Monib, Atika Qazi, Rosyzie Anna Awg Haji Mohd Apong, Mohammad Tazli Azizan, Liyanage C. De Silva, Hayati Yassin", "abstract": "Generative AI (Gen AI), exemplified by ChatGPT, has witnessed a remarkable surge in popularity recently. This cutting-edge technology demonstrates an exceptional ability to produce human-like responses and engage in natural language conversations guided by context-appropriate prompts. However, its integration into education has become a subject of ongoing debate. This review examines the challenges of using Gen AI like ChatGPT in education and offers effective strategies. To retrieve relevant literature, a search of reputable databases was conducted, resulting in the inclusion of twenty-two publications. Using Atlas.ti, the analysis reflected six primary challenges with plagiarism as the most prevalent issue, closely followed by responsibility and accountability challenges. Concerns were also raised about privacy, data protection, safety, and security risks, as well as discrimination and bias. Additionally, there were challenges about the loss of soft skills and the risks of the digital divide. To address these challenges, a number of strategies were identified and subjected to critical evaluation to assess their practicality. Most of them were practical and align with the ethical and pedagogical theories. Within the prevalent concepts, \u201cChatGPT\u201d emerged as the most frequent one, followed by \u201cAI,\u201d \u201cstudent,\u201d \u201cresearch,\u201d and \u201ceducation,\u201d highlighting a growing trend in educational discourse. Moreover, close collaboration was evident among the leading countries, all forming a single cluster, led by the United States. This comprehensive review provides implications, recommendations, and future prospects concerning the use of generative AI in education.", "venue": "PeerJ Computer Science", "label": 4}, {"loc": [6.142477989196777, 2.4406256675720215], "openalex_id": "https://openalex.org/W4405033650", "title": "KnowledgePrompts: Exploring the Abilities of Large Language Models to Solve Proportional Analogies via Knowledge-Enhanced Prompting", "authors": "Thilini Wijesiriwardene, Ruwan Wickramarachchi, Sreeram Vennam, Vinija Jain, Aman Chadha, Amitava Das, Ponnurangam Kumaraguru, Amit Sheth", "abstract": "Making analogies is fundamental to cognition. Proportional analogies, which consist of four terms, are often used to assess linguistic and cognitive abilities. For instance, completing analogies like \"Oxygen is to Gas as is to \" requires identifying the semantic relationship (e.g., \"type of\") between the first pair of terms (\"Oxygen\" and \"Gas\") and finding a second pair that shares the same relationship (e.g., \"Aluminum\" and \"Metal\"). In this work, we introduce a 15K Multiple-Choice Question Answering (MCQA) dataset for proportional analogy completion and evaluate the performance of contemporary Large Language Models (LLMs) in various knowledge-enhanced prompt settings. Specifically, we augment prompts with three types of knowledge: exemplar, structured, and targeted. Our results show that despite extensive training data, solving proportional analogies remains challenging for current LLMs, with the best model achieving an accuracy of 55%. Notably, we find that providing targeted knowledge can better assist models in completing proportional analogies compared to providing exemplars or collections of structured knowledge. Our code and data are available at: https://github.com/Thiliniiw/KnowledgePrompts/", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.8084795475006104, -3.918051242828369], "openalex_id": "https://openalex.org/W4405029929", "title": "A Survey on Automatic Online Hate Speech Detection in Low-Resource Languages", "authors": "Susmita Das, Arpita Dutta, Kingshuk Roy, Ashis K. Mondal, A. Mukhopadhyay", "abstract": "The expanding influence of social media platforms over the past decade has impacted the way people communicate. The level of obscurity provided by social media and easy accessibility of the internet has facilitated the spread of hate speech. The terms and expressions related to hate speech gets updated with changing times which poses an obstacle to policy-makers and researchers in case of hate speech identification. With growing number of individuals using their native languages to communicate with each other, hate speech in these low-resource languages are also growing. Although, there is awareness about the English-related approaches, much attention have not been provided to these low-resource languages due to lack of datasets and online available data. This article provides a detailed survey of hate speech detection in low-resource languages around the world with details of available datasets, features utilized and techniques used. This survey further discusses the prevailing surveys, overlapping concepts related to hate speech, research challenges and opportunities.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.613123416900635, -0.761837363243103], "openalex_id": "https://openalex.org/W4405030021", "title": "Pralekha: An Indic Document Alignment Evaluation Benchmark", "authors": "Suryanarayanan, Sanjay, Song, Haiyue, Khan, Mohammed Safi Ur Rahman, Kunchukuttan, Anoop, Dabre, Raj", "abstract": "Mining parallel document pairs for document-level machine translation (MT) remains challenging due to the limitations of existing Cross-Lingual Document Alignment (CLDA) techniques. Existing methods often rely on metadata such as URLs, which are scarce, or on pooled document representations that fail to capture fine-grained alignment cues. Moreover, the limited context window of sentence embedding models hinders their ability to represent document-level context, while sentence-based alignment introduces a combinatorially large search space, leading to high computational cost. To address these challenges for Indic languages, we introduce Pralekha, a benchmark containing over 3 million aligned document pairs across 11 Indic languages and English, which includes 1.5 million English-Indic pairs. Furthermore, we propose Document Alignment Coefficient (DAC), a novel metric for fine-grained document alignment. Unlike pooling-based methods, DAC aligns documents by matching smaller chunks and computes similarity as the ratio of aligned chunks to the average number of chunks in a pair. Intrinsic evaluation shows that our chunk-based method is 2-3x faster while maintaining competitive performance, and that DAC achieves substantial gains over pooling-based baselines. Extrinsic evaluation further demonstrates that document-level MT models trained on DAC-aligned pairs consistently outperform those using baseline alignment methods. These results highlight DAC's effectiveness for parallel document mining. The dataset and evaluation framework are publicly available to support further research.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.084458589553833, 2.604015827178955], "openalex_id": "https://openalex.org/W4405128230", "title": "Fair Compensation for Copyrighted Data Used in AI training", "authors": "Mohd Syaufiq Abdul Latif, Nazura Abdul Manap, Nabeel Mahdi Althabhawi\uf02a", "abstract": "As Malaysia struggles with challenges presented by the emergence of Artificial Intelligence (AI) in the digital age, there is an increasing need to re-evaluate and potentially revise the country's copyright framework. The Copyright Act 1987 may require adjustments to accommodate the evolving nature of creative works and their production, particularly in the context of AI-generated content, or known as Generative AI. One area of consideration is the implementation of a copyright compensation system which has been successfully adopted in the European Union (EU) and the United States (US) to compensate creators for the use of their works. Hence, this paper explores the feasibility and potential structure of a copyright compensation framework in Malaysia, specifically focusing on compensating rights holders for AI training data used by way of a statutory license and levy system. By examining existing compensation systems in selected jurisdictions including the EU and the US, this paper aims to provide insights into how such a framework could be effectively implemented into the Copyright Act 1987. The paper also argues that a customised copyright compensation framework could offer a practical solution to the challenges posed by AI, ensuring fair compensation for right holders, promoting innovation, and upholding copyright principles in an increasingly interconnected world. This paper will analyse the current provisions of the Copyright Act 1987, and identify gaps and areas that require reform to effectively address the implications of AI-generated content. This paper finds that the Copyright Act 1987 lacks explicit provisions for compensating rights holders for the use of their works in AI training data, leading to potential gaps in legal protection and fair compensation. As such, the paper recommends specific amendments to the Copyright Act 1987 to incorporate these mechanisms to guide the policymakers in providing a copyright compensation framework to rights holders in Malaysia.", "venue": "IIUM Law Journal", "label": 0}, {"loc": [4.464257717132568, 2.511971950531006], "openalex_id": "https://openalex.org/W4405030380", "title": "How far can bias go?--Tracing bias from pretraining data to alignment", "authors": "Marion Thaler, Abdullatif K\u00f6ksal, Alina Leidinger, Anna Korhonen, Hinrich Sch\u00fctze", "abstract": "As LLMs are increasingly integrated into user-facing applications, addressing biases that perpetuate societal inequalities is crucial. While much work has gone into measuring or mitigating biases in these models, fewer studies have investigated their origins. Therefore, this study examines the correlation between gender-occupation bias in pre-training data and their manifestation in LLMs, focusing on the Dolma dataset and the OLMo model. Using zero-shot prompting and token co-occurrence analyses, we explore how biases in training data influence model outputs. Our findings reveal that biases present in pre-training data are amplified in model outputs. The study also examines the effects of prompt types, hyperparameters, and instruction-tuning on bias expression, finding instruction-tuning partially alleviating representational bias while still maintaining overall stereotypical gender associations, whereas hyperparameters and prompting variation have a lesser effect on bias expression. Our research traces bias throughout the LLM development pipeline and underscores the importance of mitigating bias at the pretraining stage.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.8975026607513428, -0.2767906188964844], "openalex_id": "https://openalex.org/W4405031401", "title": "ChineseWebText 2.0: Large-Scale High-quality Chinese Web Text with Multi-dimensional and fine-grained information", "authors": "Wanyue Zhang, Ziyong Li, Yang Wen, C. Leng, Yang Bai, Qianlong Du, Chengqing Zong, Jiajun Zhang", "abstract": "During the development of large language models (LLMs), pre-training data play a critical role in shaping LLMs' capabilities. In recent years several large-scale and high-quality pre-training datasets have been released to accelerate the research of LLMs, including ChineseWebText1.0, C4, Pile, WanJuan, MAPCC and others. However, as LLMs continue to evolve, focus has increasingly shifted to domain-specific capabilities and safety concerns, making those previous coarse-grained texts insufficient for meeting training requirements. Furthermore, fine-grained information, such as quality, domain and toxicity, is becoming increasingly important in building powerful and reliable LLMs for various scenarios. To address these challenges, in this paper we propose a new tool-chain called MDFG-tool for constructing large-scale and high-quality Chinese datasets with multi-dimensional and fine-grained information. First, we employ manually crafted rules to discard explicit noisy texts from raw contents. Second, the quality evaluation model, domain classifier, and toxicity evaluation model are well-designed to assess the remaining cleaned data respectively. Finally, we integrate these three types of fine-grained information for each text. With this approach, we release the largest, high-quality and fine-grained Chinese text ChineseWebText2.0, which consists of 3.8TB and each text is associated with a quality score, domain labels, a toxicity label and a toxicity score, facilitating the LLM researchers to select data based on various types of fine-grained information. The data, codes and the tool-chain are available on this website https://github.com/CASIA-LM/ChineseWebText-2.0", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.2249755859375, -0.2808240056037903], "openalex_id": "https://openalex.org/W4405110867", "title": "What is the authentic internet register before & after the Russian invasion in Ukraine? Polish and Czech YouTube comments from 2021\u20132023", "authors": "Aleksej Tikhonov", "abstract": "Abstract Over one million tokens of comments were collected for the study using data mining methods. The videos under which the comments were dug out were not chosen arbitrarily but according to the current official national YouTube trends in Poland and the Czechia. The comments were collected under the most popular videos in ten categories: cars, comedy, fashion & lifestyle, gaming, music, non-political interview, politics, report, sports, and video blog. The data collection was carried in 2021\u20132022 and 2023 from under 40 videos per language and period, 160 videos in total. The corpus data should reveal more about the internet register through the part of speech (POS) frequencies, and the syntactical statistics. In addition, the comments are stylistically clustered in R to make dependencies in linguistic usage visible and better understandable. The study aims to expand the term register, using Polish and Czech, to include language on the internet and distinguish between authentic and non-authentic internet registers in comparison to other registers. An additional sociolinguistic aspect of the analysis is the influence of the Russian war against Ukraine on the linguistic behavior of YouTube users.", "venue": "Lodz Papers in Pragmatics", "label": 0}, {"loc": [7.730262279510498, 2.824124813079834], "openalex_id": "https://openalex.org/W4410543963", "title": "AdaNDV: Adaptive Number of Distinct Value Estimation viaLearning to Select and Fuse Estimators", "authors": "Xianghong Xu, Tieying Zhang, Xiao He, Haoyang Li, Ruiying Kang, Shuai Wang, Liang Xu, Zhimin Liang, Shangyu Luo, Lei Zhang, Jianjun Chen", "abstract": "Estimating the Number of Distinct Values (NDV) is fundamental for numerous data management tasks, especially within database applications. However, most existing works primarily focus on introducing new statistical or learned estimators, while identifying the most suitable estimator for a given scenario remains largely unexplored. Therefore, we propose AdaNDV, a learned method designed to adaptively select and fuse existing estimators to address this issue. Specifically, (1) we propose to use learned models to distinguish between overestimated and underestimated estimators and then select appropriate estimators from each category. This strategy provides a complementary perspective by integrating overestimations and underestimations for error correction, thereby improving the accuracy of NDV estimation. (2) To further integrate the estimation results, we introduce a novel fusion approach that employs a learned model to predict the weights of the selected estimators and then applies a weighted sum to merge them. By combining these strategies, the proposed AdaNDV fundamentally distinguishes itself from previous works that directly estimate NDV. Moreover, extensive experiments conducted on real-world datasets, with the number of individual columns being several orders of magnitude larger than in previous studies, demonstrate the superior performance of our method.", "venue": "Proceedings of the VLDB Endowment", "label": 21}, {"loc": [2.5554933547973633, 2.772834300994873], "openalex_id": "https://openalex.org/W4405446043", "title": "Yes, This Is A Puff Piece? A Comparative Analysis of the Vendor Defences of Puffery, Statements of Future Intent and Disclaimers\u2013Part 2\u2014How far does the \u2026", "authors": "John Beardwood", "abstract": "Abstract One of the common themes among various failed ERP implementations and outsourcing transactions is the divergence between the representations made by technology vendor sales teams as to promised skills, expertise and delivery, and the actually provided skills, expertise and delivery. Part 1 (Beardwood, CRi 2024, 85) began by providing an overview of the law of misrepresentation, and then the common vendor defences of puffery and opinion, statements of future intent, and contractual disclaimers, in Canada (I) and the United States (II). Part 2 continues by providing an overview of the law of misrepresentation, and then the common vendor defences of puffery and opinion, statements of future intent, and contractual disclaimers, in the European Union (III). The analysis then assesses how these defences were raised by vendors in two recent ERP failure lawsuits (IV), before concluding with lessons learned for vendors and customers (V).", "venue": "Computer Law Review International", "label": 0}, {"loc": [5.356893539428711, 5.437178611755371], "openalex_id": "https://openalex.org/W4404854822", "title": "Advances in AI-Generated Images and Videos.", "authors": "Hessen Bougueffa, Mamadou Ke\u00efta, Wassim Hamidouche, Abdelmalik Taleb\u2010Ahmed, Helena Liz, A. Miguel San Martin, David Camacho, Abdenour Hadid", "abstract": "In recent years generative AI models and tools have experienced a significant increase, especially techniques to generate synthetic multimedia content, such as images or videos. These methodologies present a wide range of possibilities; however, they can also present several risks that should be taken into account. In this survey we describe in detail different techniques for generating synthetic multimedia content, and we also analyse the most recent techniques for their detection. In order to achieve these objectives, a key aspect is the availability of datasets, so we have also described the main datasets available in the state of the art. Finally, from our analysis we have extracted the main trends for the future, such as transparency and interpretability, the generation of multimodal multimedia content, the robustness of models and the increased use of diffusion models. We find a roadmap of deep challenges, including temporal consistency, computation requirements, generalizability, ethical aspects, and constant adaptation.", "venue": "International Journal of Interactive Multimedia and Artificial Intelligence", "label": 47}, {"loc": [5.784979343414307, 5.015117645263672], "openalex_id": "https://openalex.org/W4404988519", "title": "Multimodal Alignment and Fusion: A Survey", "authors": "Songtao Li, Hao Tang", "abstract": "This survey provides a comprehensive overview of recent advances in multimodal alignment and fusion within the field of machine learning, driven by the increasing availability and diversity of data modalities such as text, images, audio, and video. Unlike previous surveys that often focus on specific modalities or limited fusion strategies, our work presents a structure-centric and method-driven framework that emphasizes generalizable techniques. We systematically categorize and analyze key approaches to alignment and fusion through both structural perspectives -- data-level, feature-level, and output-level fusion -- and methodological paradigms -- including statistical, kernel-based, graphical, generative, contrastive, attention-based, and large language model (LLM)-based methods, drawing insights from an extensive review of over 260 relevant studies. Furthermore, this survey highlights critical challenges such as cross-modal misalignment, computational bottlenecks, data quality issues, and the modality gap, along with recent efforts to address them. Applications ranging from social media analysis and medical imaging to emotion recognition and embodied AI are explored to illustrate the real-world impact of robust multimodal systems. The insights provided aim to guide future research toward optimizing multimodal learning systems for improved scalability, robustness, and generalizability across diverse domains.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.465810775756836, 3.336378812789917], "openalex_id": "https://openalex.org/W4404729147", "title": "Parameter-Efficient Fine-Tuning in Large Models: A Survey of Methodologies", "authors": "Luping Wang, Sheng Chen, Jiang Li, Supriya Pan, Runze Cai, Sen Yang, Fei Yang", "abstract": "Abstract The large models, as predicted by scaling law forecasts, have made groundbreaking progress in many fields, particularly in natural language generation tasks, where they have approached or even surpassed human levels. However, the unprecedented scale of their parameters brings significant computational and storage costs. These large models require substantial computational resources and GPU memory to operate. When adapting large models to specific downstream tasks, their massive parameter scale poses a significant challenge in fine-tuning on hardware platforms with limited computational power and GPU memory. To address this issue, Parameter-Efficient Fine-Tuning (PEFT) offers a practical solution by efficiently adjusting the parameters of large pre-trained models to suit various downstream tasks. Specifically, PEFT adjusts the parameters of pre-trained large models to adapt to specific tasks or domains, minimizing the introduction of additional parameters and the computational resources required. This review mainly introduces the preliminary knowledge of PEFT, the core ideas and principles of various PEFT algorithms, the applications of PEFT, and potential future research directions. By reading this review, we believe that interested parties can quickly grasp the PEFT methodology, thereby accelerating its development and innovation.", "venue": "https://doi.org/10.21203/rs.3.rs-5393239/v1", "label": 0}, {"loc": [7.511022090911865, 1.259946584701538], "openalex_id": "https://openalex.org/W4404987496", "title": "FineWeb-zhtw: Scalable Curation of Traditional Chinese Text Data from the Web", "authors": "Cheng-Wei Lin, Wu\u2010Chiao Hsieh, Kai-Xin Guan, Chan-Jan Hsu, Chia-Chen Kuo, Ckj Lai, Chung-Wei Chung, Ming-Jen Wang, Da-shan Shiu", "abstract": "The quality and size of a pretraining dataset significantly influence the performance of large language models (LLMs). While there have been numerous efforts in the curation of such a dataset for English users, there is a relative lack of similar initiatives for Traditional Chinese. Building upon this foundation of FineWeb, we introduce FineWeb-zhtw, a dataset tailored specifically for Traditional Chinese users. We came up with multiple stages of meticulously designed filters to cater to the linguistic difference between English and Traditional Chinese, to ensure comprehensiveness and quality. We determined effectiveness from querying dataset samples with three main objectives. Our code and datasets are publicly available.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.783665180206299, 3.4333503246307373], "openalex_id": "https://openalex.org/W4405023509", "title": "A Method for Building Large Language Models with Predefined KV Cache Capacity", "authors": "Zhonghua Yi, Guangda Niu, Lei Wang, Wei Tang, Lingzhuang Zhang", "abstract": "This paper introduces a novel approach, the Bounded-Cache Transformer (BCT), for building large language models with a predefined Key-Value (KV) cache capacity. The BCT addresses the excessive memory consumption issue in traditional KV caches by implementing a bounded-length KV cache, which is particularly suitable for the attention layers in Transformer decode-only architectures. By dynamically updating the key-value vector sequences, the BCT achieves efficient inference within limited cache capacity, significantly reducing memory usage while maintaining model performance and system throughput. Experimental results demonstrate that the BCT significantly reduces memory usage while maintaining the model's inference quality, offering a new solution for efficient inference in large language models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.723821640014648, 3.3249056339263916], "openalex_id": "https://openalex.org/W4404986232", "title": "The Zamba2 Suite: Technical Report", "authors": "Paolo Glorioso, Quentin Anthony, Yury Tokpanov, Anna Golubeva, Vasudev Shyam, James Whittington, Jonathan Pilault, Beren Millidge", "abstract": "In this technical report, we present the Zamba2 series -- a suite of 1.2B, 2.7B, and 7.4B parameter hybrid Mamba2-transformer models that achieve state of the art performance against the leading open-weights models of their class, while achieving substantial gains in inference latency, throughput, and memory efficiency. The Zamba2 series builds upon our initial work with Zamba1-7B, optimizing its architecture, training and annealing datasets, and training for up to three trillion tokens. We provide open-source weights for all models of the Zamba2 series as well as instruction-tuned variants that are strongly competitive against comparable instruct-tuned models of their class. We additionally open-source the pretraining dataset, which we call Zyda-2, used to train the Zamba2 series of models. The models and datasets used in this work are openly available at https://huggingface.co/Zyphra", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.041357040405273, -1.5149025917053223], "openalex_id": "https://openalex.org/W4404710827", "title": "Optimizing Sentiment Classification Using Dynamic Weighted Stacking Ensemble of Pre-trained Models", "authors": "Lei Kuang, Mingquan Liu, Guihai Chen, Yuchen Tang, Zilong Duan, Xin Li, Pu Li, Jing Tian, X. Y. Han, Chengjun Li", "abstract": "Abstract In the era of information explosion, sentiment classification, as a crucial task in natural language processing, is widely applied across various fields, including e-commerce, media, government, and finance. This paper proposes a dynamic weighted stacking ensemble method, integrating three pre-trained models: NeZha, XLNet, and ERNIE.The approach leverages Bayesian optimization to dynamically adjust the weights of the models. Therefore, robustness and generalization in sentiment classification tasks are improved. Experiments conducted on the SMP2020 Weibo sentiment classification dataset and the ChnSentiCorp sentiment analysis dataset confirm the effectiveness of this method. Results show that the ensemble model significantly outperforms individual models and traditional ensemble methods in terms of classification accuracy and F1 score, particularly excelling in handling complex emotional expressions. This study provides a new solution for sentiment classification tasks and demonstrates the potential of ensemble learning to enhance model performance.", "venue": "https://doi.org/10.21203/rs.3.rs-5408406/v1", "label": 0}, {"loc": [5.667418956756592, -0.18128740787506104], "openalex_id": "https://openalex.org/W4404780882", "title": "Automatic Text Classification With Large Language Models: A Review of openai for Zero-and Few-Shot Classification", "authors": "Kylie Anglin, Carlos E. Ventura", "abstract": "While natural language documents, such as intervention transcripts and participant writing samples, can provide highly nuanced insights into educational and psychological constructs, researchers often find these materials difficult and expensive to analyze. Recent developments in machine learning, however, have allowed social scientists to harness the power of artificial intelligence for complex data categorization tasks. One approach, supervised learning, supports high-performance categorization yet still requires a large, hand-labeled training corpus, which can be costly. An alternative approach\u2014zero- and few-shot classification with pretrained large language models\u2014offers a cheaper, compelling alternative. This article considers the application of zero-shot and few-shot classification in educational research. We provide an overview of large language models, a step-by-step tutorial on using the Python openai package for zero-shot and few-shot classification, and a discussion of relevant research considerations for social scientists.", "venue": "Journal of Educational and Behavioral Statistics", "label": 0}, {"loc": [3.0118229389190674, 3.592588186264038], "openalex_id": "https://openalex.org/W4404987707", "title": "Somesite I Used To Crawl: Awareness, Agency and Efficacy in Protecting Content Creators From AI Crawlers", "authors": "Enze Liu, Erding Luo, Shawn Shan, Geoffrey M. Voelker, Ben Y. Zhao, Stefan Savage", "abstract": "The success of generative AI relies heavily on training on data scraped through extensive crawling of the Internet, a practice that has raised significant copyright, privacy, and ethical concerns. While few measures are designed to resist a resource-rich adversary determined to scrape a site, crawlers can be impacted by a range of existing tools such as robots.txt, NoAI meta tags, and active crawler blocking by reverse proxies. In this work, we seek to understand the ability and efficacy of today's networking tools to protect content creators against AI-related crawling. For targeted populations like human artists, do they have the technical knowledge and agency to utilize crawler-blocking tools such as robots.txt, and can such tools be effective? Using large scale measurements and a targeted user study of 203 professional artists, we find strong demand for tools like robots.txt, but significantly constrained by critical hurdles in technical awareness, agency in deploying them, and limited efficacy against unresponsive crawlers. We further test and evaluate network-level crawler blockers provided by reverse proxies. Despite relatively limited deployment today, they offer stronger protections against AI crawlers, but still come with their own set of limitations.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.042802333831787, -1.5713258981704712], "openalex_id": "https://openalex.org/W4404679706", "title": "Beauty Contest in Equity-Based Crowdfunding Campaigns", "authors": "Artur A. Trzebi\u0144ski, \u0141ukasz Ko\u0142odziejczyk", "abstract": "Purpose This study investigates how investor sentiment influences equity-based crowdfunding campaigns outcomes, with a particular focus on sentiment related to specific industries. The motivation stems from the need to understand the behavioural drivers behind crowdfunding outcomes, especially in a context where both private and public funding coexist. Study Design We employed a machine learning approach, utilizing the PolBERT model to analyse investor sentiment from online forum discussions in Poland. The data consisted of sentiment expressed across various industry-specific discussions, which were analysed to assess their impact on crowdfunding campaign outcomes. Findings The study reveals that while positive investor industry sentiment enhances the capital raised for IPO-targeted companies, it negatively affects overall campaign success and the number of investors, particularly in non-IPO-targeted firms. These findings suggest a complex relationship between sentiment and campaign outcomes, where higher sentiment can sometimes deter participation in certain cases. Contributions This research introduces a novel category of behavioural success drivers in the context of equity crowdfunding, proposing a new classification of meta-drivers of crowdfunding success. Methodologically, the study demonstrates the efficacy of fine-tuned BERT-class models in processing language, offering new insights into the investor sentiment analysis. Implications The results have significant implications for both researchers and practitioners. For researchers, the study expands the understanding of behavioural finance in crowdfunding, while for practitioners, it provides strategic insights into how investor sentiment can influence campaign timing, structure, and expectations. The findings also offer value for policymakers by shedding light on investor behaviour in equity-based crowdfunding markets.", "venue": "Journal of Alternative Finance", "label": 0}, {"loc": [5.064706802368164, 1.3146616220474243], "openalex_id": "https://openalex.org/W4404651412", "title": "Grammatical Redundancy in Scales: Using the \u201cConGRe\u201d Process to Create Better Measures", "authors": "Leah Alley, Imran Kadolkar, Alisha Gupta, Jos\u00e9 M. Cortina, Kurt Winsler", "abstract": "As theoretical models become more complex, there is more pressure to use less time-consuming methods generally, and shorter scales specifically. Although reliability is related to scale length, reliability cutoffs are easily met, even in very short scales, by writing or selecting items that are worded in nearly identical ways, that is, grammatical redundancy. However, grammatical redundancy increases reliability at the cost of domain sampling\u2014a crucial early step in scale construction and one of the two pillars of content validity. Without it, a scale cannot capture the intended construct. The purpose of this paper is to provide scale developers (and shorteners) with a process for quantifying, identifying, and reducing grammatical redundancy without compromising conceptual redundancy, a process that we label ConGRe. Our process involves indices from the linguistics literature that can be used to guide decisions during item writing, that is, prior to data collection. We examine their relation to more traditional psychometric indicators and provide a set of benchmarks. Overall, we demonstrate that it is possible to reduce grammatical redundancy, thus avoiding scale deficiency, without sacrificing traditional psychometric properties.", "venue": "Journal of Management", "label": 0}, {"loc": [7.481013298034668, 1.1797688007354736], "openalex_id": "https://openalex.org/W4404649439", "title": "UnifiedCrawl: Aggregated Common Crawl for Affordable Adaptation of LLMs on Low-Resource Languages", "authors": "Bethel Melesse Tessema, Akhil Kedia, Tae\u2010Sun Chung", "abstract": "Large language models (LLMs) under-perform on low-resource languages due to limited training data. We present a method to efficiently collect text data for low-resource languages from the entire Common Crawl corpus. Our approach, UnifiedCrawl, filters and extracts common crawl using minimal compute resources, yielding mono-lingual datasets much larger than previously available sources. We demonstrate that leveraging this data to fine-tuning multilingual LLMs via efficient adapter methods (QLoRA) significantly boosts performance on the low-resource language, while minimizing VRAM usage. Our experiments show large improvements in language modeling perplexity and an increase in few-shot prompting scores. Our work and released source code provide an affordable approach to improve LLMs for low-resource languages using consumer hardware. Our source code is available here at https://github.com/bethelmelesse/unifiedcrawl.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.878996849060059, 2.419203042984009], "openalex_id": "https://openalex.org/W4404649698", "title": "When Precision Meets Position: BFloat16 Breaks Down RoPE in Long-Context Training", "authors": "Haonan Wang, Qian Liu, Chao Du, Tingting Zhu, Cuicui Du, Kenji Kawaguchi, Tianyu Pang", "abstract": "Extending context window sizes allows large language models (LLMs) to process longer sequences and handle more complex tasks. Rotary Positional Embedding (RoPE) has become the de facto standard due to its relative positional encoding properties that benefit long-context training. However, we observe that using RoPE with BFloat16 format results in numerical issues, causing it to deviate from its intended relative positional encoding, especially in long-context scenarios. This issue arises from BFloat16's limited precision and accumulates as context length increases, with the first token contributing significantly to this problem. To address this, we develop AnchorAttention, a plug-and-play attention method that alleviates numerical issues caused by BFloat16, improves long-context capabilities, and speeds up training. AnchorAttention reduces unnecessary attention computations, maintains semantic coherence, and boosts computational efficiency by treating the first token as a shared anchor with a consistent position ID, making it visible to all documents within the training context. Experiments on three types of LLMs demonstrate that AnchorAttention significantly improves long-context performance and reduces training time by over 50\\% compared to standard full attention mechanisms, while preserving the original LLM's capabilities on general tasks. Our code is available at https://github.com/haonan3/AnchorContext.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.381154537200928, -0.2771051228046417], "openalex_id": "https://openalex.org/W4404648873", "title": "Training Bilingual LMs with Data Constraints in the Targeted Language", "authors": "Skyler Seto, Maartje ter Hoeve, He Bai, Natalie Schluter, David Grangier", "abstract": "Large language models are trained on massive scrapes of the web, as required by current scaling laws. Most progress is made for English, given its abundance of high-quality pretraining data. For most other languages, however, such high quality pretraining data is unavailable. In this work, we study how to boost pretrained model performance in a target language with insufficient pretraining data for training a high performing language model, by enlisting data from an auxiliary language for which high quality data is available. We study this by quantifying the performance gap between training with data in a data-rich auxiliary language compared with training in the target language, exploring the benefits of translation systems, studying the limitations of model scaling when data is limited in the target languages, and proposing new methods for upsampling data from the auxiliary language. Our results show that stronger auxiliary datasets result in performance gains without modification to the model or training objective for close languages, and, in particular, that performance gains due to the development of more information-rich English pretraining datasets can extend to targeted language settings with limited data.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.108853340148926, 1.2273834943771362], "openalex_id": "https://openalex.org/W4404608669", "title": "A Semantic Framework for Modular Knowledge Integration in Large Language Models", "authors": "Kassandra Etlune, Steven B. Richardson, Megan L. Howard, Benjamin Foster, Randy Russell, Daniel M. Murphy", "abstract": "The Adaptive Semantic Framework introduces a modular architecture for large language models, enabling dynamic integration of semantic components to enhance scalability and adaptability across diverse natural language processing tasks. By employing modular semantic units, the framework facilitates efficient knowledge incorporation, resulting in improved performance metrics such as accuracy and inference speed. Empirical evaluations demonstrate the framework\u2019s resilience to noisy inputs, maintaining high accuracy levels and showing its robustness for real-world applications. Additionally, the framework\u2019s capacity for domain adaptation without extensive retraining highlights its versatility and practical utility. The mathematical foundations of the framework provide a formal basis for modeling semantic modularity and integration, contributing to the theoretical understanding of large language model architectures. Comparative analyses with existing models establish the framework\u2019s superiority, setting a new benchmark in the field. The scalability assessment reveals that the framework effectively manages computational demands, making it suitable for deployment in resource-constrained environments. Furthermore, the energy efficiency evaluation indicates a favorable balance between performance and power consumption, aligning with sustainable computing practices. Collectively, these contributions mark a substantial progression in the development of large language models, offering a robust foundation for future innovations in the field.", "venue": "https://doi.org/10.22541/au.173222145.59075557/v1", "label": 0}, {"loc": [7.435510158538818, 1.3876498937606812], "openalex_id": "https://openalex.org/W4404573785", "title": "RedPajama: an Open Dataset for Training Large Language Models", "authors": "Maurice Weber, Daniel Fu, Quentin Anthony, Yonatan Oren, Sally Adams, Anton Alexandrov, Xiaozhong Lyu, Huu Du Nguyen, Xiaozhe Yao, Virginia Adams, Ben Athiwaratkun, Rahul Chalamala, Kezhen Chen, Max Ryabinin, Tri Dao, Percy Liang, Christopher R\u00e9, Irina Rish, Ce Zhang", "abstract": "Large language models are increasingly becoming a cornerstone technology in artificial intelligence, the sciences, and society as a whole, yet the optimal strategies for dataset composition and filtering remain largely elusive. Many of the top-performing models lack transparency in their dataset curation and model development processes, posing an obstacle to the development of fully open language models. In this paper, we identify three core data-related challenges that must be addressed to advance open-source language models. These include (1) transparency in model development, including the data curation process, (2) access to large quantities of high-quality data, and (3) availability of artifacts and metadata for dataset curation and analysis. To address these challenges, we release RedPajama-V1, an open reproduction of the LLaMA training dataset. In addition, we release RedPajama-V2, a massive web-only dataset consisting of raw, unfiltered text data together with quality signals and metadata. Together, the RedPajama datasets comprise over 100 trillion tokens spanning multiple domains and with their quality signals facilitate the filtering of data, aiming to inspire the development of numerous new datasets. To date, these datasets have already been used in the training of strong language models used in production, such as Snowflake Arctic, Salesforce's XGen and AI2's OLMo. To provide insight into the quality of RedPajama, we present a series of analyses and ablation studies with decoder-only language models with up to 1.6B parameters. Our findings demonstrate how quality signals for web data can be effectively leveraged to curate high-quality subsets of the dataset, underscoring the potential of RedPajama to advance the development of transparent and high-performing language models at scale.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.32246732711792, 4.939183235168457], "openalex_id": "https://openalex.org/W4404573579", "title": "Training-Free Layer Selection for Parameter-Efficient Fine-Tuning of Language Models", "authors": "Pramit Saha, F. Wagner, Divyanshu Mishra, Can Peng, Anshul Thakur, David A. Clifton, Konstantinos Kamnitsas, J. Alison Noble", "abstract": "Effective training of large Vision-Language Models (VLMs) on resource-constrained client devices in Federated Learning (FL) requires the usage of parameter-efficient fine-tuning (PEFT) strategies. To this end, we demonstrate the impact of two factors \\textit{viz.}, client-specific layer importance score that selects the most important VLM layers for fine-tuning and inter-client layer diversity score that encourages diverse layer selection across clients for optimal VLM layer selection. We first theoretically motivate and leverage the principal eigenvalue magnitude of layerwise Neural Tangent Kernels and show its effectiveness as client-specific layer importance score. Next, we propose a novel layer updating strategy dubbed F$^3$OCUS that jointly optimizes the layer importance and diversity factors by employing a data-free, multi-objective, meta-heuristic optimization on the server. We explore 5 different meta-heuristic algorithms and compare their effectiveness for selecting model layers and adapter layers towards PEFT-FL. Furthermore, we release a new MedVQA-FL dataset involving overall 707,962 VQA triplets and 9 modality-specific clients and utilize it to train and evaluate our method. Overall, we conduct more than 10,000 client-level experiments on 6 Vision-Language FL task settings involving 58 medical image datasets and 4 different VLM architectures of varying sizes to demonstrate the effectiveness of the proposed method.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.8241801261901855, 3.933626890182495], "openalex_id": "https://openalex.org/W4404573790", "title": "Ultra-Sparse Memory Network", "authors": "Zihao Huang, Qiyang Min, Haiwen Huang, Dan-Yang Zhu, Yutao Zeng, Ran Guo, Xun Zhou", "abstract": "It is widely acknowledged that the performance of Transformer models is logarithmically related to their number of parameters and computational complexity. While approaches like Mixture of Experts (MoE) decouple parameter count from computational complexity, they still face challenges in inference due to high memory access costs. This work introduces UltraMem, incorporating large-scale, ultra-sparse memory layer to address these limitations. Our approach significantly reduces inference latency while maintaining model performance. We also investigate the scaling laws of this new architecture, demonstrating that it not only exhibits favorable scaling properties but outperforms MoE. In experiments, the largest UltraMem we train has 20 million memory slots. The results show that our method achieves state-of-the-art inference speed and model performance within a given computational budget, paving the way for billions of slots or experts.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.505805969238281, 1.5888400077819824], "openalex_id": "https://openalex.org/W4404573334", "title": "Survey on Semantic Interpretation of Tabular Data: Challenges and Directions", "authors": "Marco Cremaschi, Blerina Spahiu, Matteo Palmonari, Ernesto Jim\u00e9nez-Ruiz", "abstract": "Tabular data plays a pivotal role in various fields, making it a popular format for data manipulation and exchange, particularly on the web. The interpretation, extraction, and processing of tabular information are invaluable for knowledge-intensive applications. Notably, significant efforts have been invested in annotating tabular data with ontologies and entities from background knowledge graphs, a process known as Semantic Table Interpretation (STI). STI automation aids in building knowledge graphs, enriching data, and enhancing web-based question answering. This survey aims to provide a comprehensive overview of the STI landscape. It starts by categorizing approaches using a taxonomy of 31 attributes, allowing for comparisons and evaluations. It also examines available tools, assessing them based on 12 criteria. Furthermore, the survey offers an in-depth analysis of the Gold Standards used for evaluating STI approaches. Finally, it provides practical guidance to help end-users choose the most suitable approach for their specific tasks while also discussing unresolved issues and suggesting potential future research directions.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.793940544128418, -1.0745365619659424], "openalex_id": "https://openalex.org/W4404535638", "title": "Automatic Simplification of Lithuanian Administrative Texts", "authors": "Justina Mandravickait\u0117, Egl\u0117 Rimkien\u0117, Danguol\u0117 Kotryna Kapkan, Danguol\u0117 Kalinauskait\u0117, Tomas Krilavi\u010dius", "abstract": "Text simplification reduces the complexity of text while preserving essential information, thus making it more accessible to a broad range of readers, including individuals with cognitive disorders, non-native speakers, children, and the general public. In this paper, we present experiments on text simplification for the Lithuanian language, aiming to simplify administrative texts to a Plain Language level. We fine-tuned mT5 and mBART models for this task and evaluated the effectiveness of ChatGPT as well. We assessed simplification results via both quantitative metrics and qualitative evaluation. Our findings indicated that mBART performed the best as it achieved the best scores across all evaluation metrics. The qualitative analysis further supported these findings. ChatGPT experiments showed that it responded quite well to a short and simple prompt to simplify the given text; however, it ignored most of the rules given in a more elaborate prompt. Finally, our analysis revealed that BERTScore and ROUGE aligned moderately well with human evaluations, while BLEU and readability scores indicated lower or even negative correlations", "venue": "Algorithms", "label": 37}, {"loc": [3.3961639404296875, 1.2363375425338745], "openalex_id": "https://openalex.org/W4404552362", "title": "Stochastic Remembering and Distributed Mnemonic Agency: Recalling Twentieth Century Activists with ChatGPT", "authors": "Rik Smit, Thomas Smits, Samuel Merrill", "abstract": "Abstract This paper introduces the concept of stochastic remembering and uses two prompt engineering techniques to critically examine the text generated by ai chatbots. These techniques \u2013 step-by-step prompting and chain of thought reasoning \u2013 are then experimentally applied to understand how ChatGPT, the most commonly used ai chatbot, shapes how we remember historical activists. This experiment suggests that hegemonic forms of memory influence the data on which these chatbots are trained and underlines how stochastic patterns affect how humans and ai systems collectively remember the past. Humans and ai systems prompt each other to remember. In conclusion, the paper argues that ai chatbots are a new kind of mnemonic actor that, in interaction with users, renders a probabilistic past. Methodologically, the paper introduces, in an explorative way, an experimental method that can reveal the dynamics of stochastic remembering.", "venue": "Memory studies review.", "label": 0}, {"loc": [6.699217796325684, 0.45319923758506775], "openalex_id": "https://openalex.org/W4404574673", "title": "Multilingual Large Language Models: A Systematic Survey", "authors": "Shaolin Zhu, Supryadi, Shaoyang Xu, Haoran Sun, Leiyu Pan, Menglong Cui, Jiangcun Du, Renren Jin, Ant\u00f3nio Branco, Deyi Xiong", "abstract": "This paper provides a comprehensive survey of the latest research on multilingual large language models (MLLMs). MLLMs not only are able to understand and generate language across linguistic boundaries, but also represent an important advancement in artificial intelligence. We first discuss the architecture and pre-training objectives of MLLMs, highlighting the key components and methodologies that contribute to their multilingual capabilities. We then discuss the construction of multilingual pre-training and alignment datasets, underscoring the importance of data quality and diversity in enhancing MLLM performance. An important focus of this survey is on the evaluation of MLLMs. We present a detailed taxonomy and roadmap covering the assessment of MLLMs' cross-lingual knowledge, reasoning, alignment with human values, safety, interpretability and specialized applications. Specifically, we extensively discuss multilingual evaluation benchmarks and datasets, and explore the use of LLMs themselves as multilingual evaluators. To enhance MLLMs from black to white boxes, we also address the interpretability of multilingual capabilities, cross-lingual transfer and language bias within these models. Finally, we provide a comprehensive review of real-world applications of MLLMs across diverse domains, including biology, medicine, computer science, mathematics and law. We showcase how these models have driven innovation and improvements in these specialized fields while also highlighting the challenges and opportunities in deploying MLLMs within diverse language communities and application scenarios. We listed the paper related in this survey and publicly available at https://github.com/tjunlp-lab/Awesome-Multilingual-LLMs-Papers.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.507143020629883, 0.6518679261207581], "openalex_id": "https://openalex.org/W4404501767", "title": "Slovak morphological tokenizer using the Byte-Pair Encoding algorithm", "authors": "D\u00e1vid Dr\u017e\u00edk, Frantisek Forgac", "abstract": "This study introduces a new approach to text tokenization, SlovaK Morphological Tokenizer (SKMT), which integrates the morphology of the Slovak language into the training process using the Byte-Pair Encoding (BPE) algorithm. Unlike conventional tokenizers, SKMT focuses on preserving the integrity of word roots in individual tokens, crucial for maintaining lexical meaning. The methodology involves segmenting and extracting word roots from morphological dictionaries and databases, followed by corpus preprocessing and training SKMT alongside a traditional BPE tokenizer. Comparative evaluation against existing tokenizers demonstrates SKMT\u2019s outstanding ability to maintain root integrity, achieving 99.7% root integrity compared to SlovakBERT (90.5%) and a pureBPE tokenizer (93.1%). Further validation involved fine-tuning models on a sentiment classification NLP task, where models trained with SKMT achieved an F1-score improvement of 3.5% over those trained with conventional BPE tokenization, followed by a focus on the Semantic Textual Similarity (STS) task. These findings suggest that training language models on the SKMT tokenizer significantly enhances model performance and quality.", "venue": "PeerJ Computer Science", "label": 4}, {"loc": [8.100950241088867, 2.05403733253479], "openalex_id": "https://openalex.org/W4404570743", "title": "LP Data Pipeline: Lightweight, Purpose-driven Data Pipeline for Large Language Models", "authors": "Yungi Kim, H. Ha, Shiming Yang, Su-Kyung Lee, Jihoo Kim, Chanjun Park", "abstract": "Creating high-quality, large-scale datasets for large language models (LLMs) often relies on resource-intensive, GPU-accelerated models for quality filtering, making the process time-consuming and costly. This dependence on GPUs limits accessibility for organizations lacking significant computational infrastructure. To address this issue, we introduce the Lightweight, Purpose-driven (LP) Data Pipeline, a framework that operates entirely on CPUs to streamline the processes of dataset extraction, filtering, and curation. Based on our four core principles, the LP Data Pipeline significantly reduces preparation time and cost while maintaining high data quality. Importantly, our pipeline enables the creation of purpose-driven datasets tailored to specific domains and languages, enhancing the applicability of LLMs in specialized contexts. We anticipate that our pipeline will lower the barriers to LLM development, enabling a wide range of organizations to access LLMs more easily.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.21210241317749, 4.905766487121582], "openalex_id": "https://openalex.org/W4404569721", "title": "Y-MAP-Net: Real-time depth, normals, segmentation, multi-label captioning and 2D human pose in RGB images", "authors": "Ammar Qammaz, Nikolaos Vasilikopoulos, Iason Oikonomidis, Antonis Argyros", "abstract": "We present Y-MAP-Net, a Y-shaped neural network architecture designed for real-time multi-task learning on RGB images. Y-MAP-Net, simultaneously predicts depth, surface normals, human pose, semantic segmentation and generates multi-label captions, all from a single network evaluation. To achieve this, we adopt a multi-teacher, single-student training paradigm, where task-specific foundation models supervise the network's learning, enabling it to distill their capabilities into a lightweight architecture suitable for real-time applications. Y-MAP-Net, exhibits strong generalization, simplicity and computational efficiency, making it ideal for robotics and other practical scenarios. To support future research, we will release our code publicly.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.051448345184326, -3.0857222080230713], "openalex_id": "https://openalex.org/W4404426465", "title": "Enhancing misogyny detection in bilingual texts using explainable AI and multilingual fine-tuned transformers", "authors": "Ehtesham Hashmi, Sule Yildirim Yayilgan, Muhammad Mudassar Yamin, Mohib Ullah", "abstract": "Abstract Gendered disinformation undermines women\u2019s rights, democratic principles, and national security by worsening societal divisions through authoritarian regimes\u2019 intentional weaponization of social media. Online misogyny represents a harmful societal issue, threatening to transform digital platforms into environments that are hostile and inhospitable to women. Despite the severity of this issue, efforts to persuade digital platforms to strengthen their protections against gendered disinformation are frequently ignored, highlighting the difficult task of countering online misogyny in the face of commercial interests. This growing concern underscores the need for effective measures to create safer online spaces, where respect and equality prevail, ensuring that women can participate fully and freely without the fear of harassment or discrimination. This study addresses the challenge of detecting misogynous content in bilingual (English and Italian) online communications. Utilizing FastText word embeddings and explainable artificial intelligence techniques, we introduce a model that enhances both the interpretability and accuracy in detecting misogynistic language. To conduct an in-depth analysis, we implemented a range of experiments encompassing classic machine learning methodologies and conventional deep learning approaches to the recent transformer-based models incorporating both language-specific and multilingual capabilities. This paper enhances the methodologies for detecting misogyny by incorporating incremental learning for cutting-edge datasets containing tweets and posts from different sources like Facebook, Twitter, and Reddit, with our proposed approach outperforming these datasets in metrics such as accuracy, F1-score, precision, and recall. This process involved refining hyperparameters, employing optimization techniques, and utilizing generative configurations. By implementing Local Interpretable Model-agnostic Explanations (LIME), we further elucidate the rationale behind the model\u2019s predictions, enhancing understanding of its decision-making process.", "venue": "Complex & Intelligent Systems", "label": 36}, {"loc": [5.77268648147583, 2.032118797302246], "openalex_id": "https://openalex.org/W4404407448", "title": "Interpretable Syntactic Representations Enable Hierarchical Word Vectors", "authors": "Biraj Silwal", "abstract": "The distributed representations currently used are dense and uninterpretable, leading to interpretations that themselves are relative, overcomplete, and hard to interpret. We propose a method that transforms these word vectors into reduced syntactic representations. The resulting representations are compact and interpretable allowing better visualization and comparison of the word vectors and we successively demonstrate that the drawn interpretations are in line with human judgment. The syntactic representations are then used to create hierarchical word vectors using an incremental learning approach similar to the hierarchical aspect of human learning. As these representations are drawn from pre-trained vectors, the generation process and learning approach are computationally efficient. Most importantly, we find out that syntactic representations provide a plausible interpretation of the vectors and subsequent hierarchical vectors outperform the original vectors in benchmark tests.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.7843334674835205, -3.9735894203186035], "openalex_id": "https://openalex.org/W4404475599", "title": "Toxic Memes Recognition Through with Multimodal Bidirectional Cross-Attention", "authors": "Aamir Hashim, Niamh Coleman, J N Roy", "abstract": "Despite the considerable advancements achieved with machine learning techniques in identifying hate speech, numerous technical obstacles persist that hinder these models from reaching human-level accuracy. Challenges such as understanding nuanced context, detecting sarcasm, and effectively interpreting the interplay between visual and textual elements in memes complicate the detection process. This study delves into the comprehensive evaluation of several cutting-edge visual-linguistic Transformer architectures, including VL-BERT, VLP, UNITER, and LXMERT, to assess their capabilities and limitations in handling the multifaceted nature of hateful content within memes. Building upon these evaluations, we introduce significant enhancements aimed at boosting their efficacy in this domain by developing a novel bidirectional cross-attention mechanism. This mechanism facilitates a more seamless integration of visual and textual information, enabling the model to better capture the subtle cues that distinguish hateful memes from benign ones. In addition to the architectural improvements, we leverage deep ensemble strategies to aggregate predictions from multiple model instances, thereby enhancing the robustness and reliability of the detection system. By combining the strengths of diverse models, the ensemble approach mitigates individual weaknesses and reduces the likelihood of false positives and negatives. Our proposed framework not only addresses the existing shortcomings of single-model approaches but also markedly surpasses existing baseline performances by a large margin, achieving higher AUROC and accuracy scores. The refined model demonstrates superior capability in discerning hateful content within multimodal memes, offering a more robust and reliable tool for mitigating the proliferation of harmful online material. Furthermore, the scalability of our approach ensures its applicability to evolving online threats, providing a sustainable solution for automated hate speech detection. These advancements signify a meaningful step towards enhancing the effectiveness of machine learning models in creating a safer and more respectful online environment.", "venue": "Preprints.org", "label": 3}, {"loc": [5.390268325805664, -1.5892395973205566], "openalex_id": "https://openalex.org/W4404375836", "title": "Knowledge Distillation with Applications to Interpretable Arabic Sentiment Analysis", "authors": "Arwa Diwali, Kawther Saeedi, Kia Dashtipour, Mandar Gogate, Amir Hussain", "abstract": "Abstract Sentiment analysis stands as a focal point in the current landscape of natural language processing research with deep neural network models as being prevalent tools of choice. While these models have exhibited noteworthy performance, their intricate nature frequently renders them akin to black boxes, resulting in a lack of transparency regarding the internal mechanisms of the sentiment classification process. The lack of interpretability in such models raises concerns regarding the reliance on outcomes from opaque systems. This study introduces an approach for distilling knowledge from complex deep neural network models into simpler and more interpretable ones while maintaining performance and ensuring global interpretability. Three distinct knowledge distillation pipelines are proposed to transfer the knowledge acquired by teacher models, including Long Short-Term Memory, Bidirectional Long Short-Term Memory, Convolutional Neural Network and AraBERT into Logistic Regression and Decision Tree models. Conducting thorough assessments across three separate datasets for Arabic sentiment analysis, the study\u2019s proposed approach consistently demonstrates performance levels that surpass those of complex models.", "venue": "https://doi.org/10.21203/rs.3.rs-5356825/v1", "label": 0}, {"loc": [8.56963062286377, 0.5473883748054504], "openalex_id": "https://openalex.org/W4404407418", "title": "Are LLMs Prescient? A Continuous Evaluation using Daily News as the Oracle", "authors": "Haochen Dai, Ryan Teehan, Mengye Ren", "abstract": "Many existing evaluation benchmarks for Large Language Models (LLMs) quickly become outdated due to the emergence of new models and training data. These benchmarks also fall short in assessing how LLM performance changes over time, as they consist of a static set of questions without a temporal dimension. To address these limitations, we propose using future event prediction as a continuous evaluation method to assess LLMs' temporal generalization and forecasting abilities. Our benchmark, Daily Oracle, automatically generates question-answer (QA) pairs from daily news, challenging LLMs to predict \"future\" event outcomes. Our findings reveal that as pre-training data becomes outdated, LLM performance degrades over time. While Retrieval Augmented Generation (RAG) has the potential to enhance prediction accuracy, the performance degradation pattern persists, highlighting the need for continuous model updates. Code and data are available at https://agenticlearning.ai/daily-oracle.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.730761528015137, 0.24717764556407928], "openalex_id": "https://openalex.org/W4404406433", "title": "Pointwise Mutual Information as a Performance Gauge for Retrieval-Augmented Generation", "authors": "Tianyu Liu, Jirui Qi, Philip He, Arianna Bisazza, Mrinmaya Sachan, Ryan Cotterell", "abstract": "Recent work suggests that large language models enhanced with retrieval-augmented generation are easily influenced by the order, in which the retrieved documents are presented to the model when solving tasks such as question answering (QA). However, there is no method to date that exploits this phenomenon to improve generation. We fill this gap. In this study, we show that the pointwise mutual information between a context and a question is an effective gauge for language model performance. Importantly, this gauge does not depend on knowing the answer to the question a priori. Through experiments on two question-answering datasets and a variety of large language models, we find evidence for an empirical correlation between answer accuracy and pointwise mutual information. Additionally, we propose two methods that use the pointwise mutual information between a document and a question as a gauge for selecting and constructing prompts that lead to better performance, whose effectiveness we demonstrate through experimentation.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.83344030380249, -0.9989158511161804], "openalex_id": "https://openalex.org/W4404401697", "title": "Fineweb-Edu-Ar: Machine-translated Corpus to Support Arabic Small Language Models", "authors": "Sultan Alrashed, Dmitrii Khizbullin, David Pugh", "abstract": "As large language models (LLMs) grow and develop, so do their data demands. This is especially true for multilingual LLMs, where the scarcity of high-quality and readily available data online has led to a multitude of synthetic dataset generation approaches. A key technique in this space is machine translation (MT), where high-quality English text is adapted to a target, comparatively low-resource language. This report introduces FineWeb-Edu-Ar, a machine-translated version of the exceedingly popular (deduplicated) FineWeb-Edu dataset from HuggingFace. To the best of our knowledge, FineWeb-Edu-Ar is the largest publicly available machine-translated Arabic dataset out there, with its size of 202B tokens of an Arabic-trained tokenizer.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.683197498321533, 1.8922133445739746], "openalex_id": "https://openalex.org/W4404390840", "title": "Zyda-2: a 5 Trillion Token High-Quality Dataset", "authors": "Yury Tokpanov, Paolo Glorioso, Quentin Anthony, Beren Millidge", "abstract": "In this technical report, we present Zyda-2: a five trillion token dataset for language model pretraining. Zyda-2 was used to train our Zamba2 series of models which are state-of-the-art for their weight class. We build Zyda-2 by collating high-quality open-source tokens such as FineWeb and DCLM, then distilling them to the highest-quality subset via cross-deduplication and model-based quality filtering. Zyda-2 is released under a permissive open license, and is available at https://huggingface.co/datasets/Zyphra/Zyda-2", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.244691848754883, 3.2544326782226562], "openalex_id": "https://openalex.org/W4404391951", "title": "Zeroth-Order Adaptive Neuron Alignment Based Pruning without Re-Training", "authors": "Cunegatti, Elia, Custode, Leonardo Lucio, Iacca Giovanni", "abstract": "Network pruning focuses on algorithms that aim to reduce a given model's computational cost by removing a subset of its parameters while having minimal impact on performance. Throughout the last decade, the most widely used pruning paradigm has been pruning and re-training, which nowadays is inconvenient due to the vast amount of pre-trained models, which are, in any case, too expensive to re-train. In this paper, we exploit functional information from dense pre-trained models, i.e., their input activations, to obtain sparse models that maximize the activations' alignment with respect to their corresponding dense models. Hence, we propose \\textbf{NeuroAl}, a \\emph{top-up} algorithm that can be used on top of any given pruning algorithm for LLMs, which modifies the block-wise and row-wise sparsity, exploiting information from both the dense model and its sparse version to maximize the \\emph{neuron alignment} among activations. Different from existing methods, our approach adaptively selects the best hyperparameters for the block-wise and row-wise sparsity ratios w.r.t. the model and the desired sparsity, and requires \\emph{no re-training}. We test our method over $\\sim$300 test cases with four LLM families, three sparsity ratios, and ten language tasks (three language modeling and seven zero-shot datasets), showing how it consistently outperforms the latest state-of-the-art methods in terms of performance-runtime trade-off. The code is available at \\href{https://github.com/eliacunegatti/NeuroAL}{https://github.com/eliacunegatti/NeuroAL}.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.302481651306152, -0.6948236227035522], "openalex_id": "https://openalex.org/W4404263292", "title": "Diversifying Multi-Head Attention in the Transformer Model", "authors": "Nicholas Ampazis, Flora Sakketou", "abstract": "Recent studies have shown that, due to redundancy, some heads of the Transformer model can be pruned without diminishing the efficiency of the model. In this paper, we propose a constrained optimization algorithm based on Hebbian learning, which trains specific layers in the Transformer architecture in order to enforce diversification between the different heads in the multi-head attention module. The diversification of the heads is achieved through a single-layer feed-forward neural network that is added to the Transformer architecture and is trained with the proposed algorithm. We utilize the algorithm in three different architectural variations of the baseline Transformer model. In addition to the diversification of the heads, the proposed methodology can be used to prune the heads that capture redundant information. Experiments on diverse NLP tasks, including machine translation, text summarization, question answering and large language modeling, show that our proposed approach consistently improves the performance of baseline Transformer models.", "venue": "Machine Learning and Knowledge Extraction", "label": 0}, {"loc": [5.916483402252197, 5.310511112213135], "openalex_id": "https://openalex.org/W4404401197", "title": "A Comprehensive Survey and Guide to Multimodal Large Language Models in Vision-Language Tasks", "authors": "Chia Xin Liang, Pu Tian, Caitlyn Heqi Yin, Yao Yua, Wei An-Hou, Li Ming, Tianyang Wang, Ziqian Bi, Ming Liu", "abstract": "This survey and application guide to multimodal large language models(MLLMs) explores the rapidly developing field of MLLMs, examining their architectures, applications, and impact on AI and Generative Models. Starting with foundational concepts, we delve into how MLLMs integrate various data types, including text, images, video and audio, to enable complex AI systems for cross-modal understanding and generation. It covers essential topics such as training methods, architectural components, and practical applications in various fields, from visual storytelling to enhanced accessibility. Through detailed case studies and technical analysis, the text examines prominent MLLM implementations while addressing key challenges in scalability, robustness, and cross-modal learning. Concluding with a discussion of ethical considerations, responsible AI development, and future directions, this authoritative resource provides both theoretical frameworks and practical insights. It offers a balanced perspective on the opportunities and challenges in the development and deployment of MLLMs, and is highly valuable for researchers, practitioners, and students interested in the intersection of natural language processing and computer vision.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.999075889587402, 0.7814574837684631], "openalex_id": "https://openalex.org/W4404402438", "title": "Transformer verbatim in-context retrieval across time and scale", "authors": "Kristijan Armeni, Marko Pranji\u0107, Senja Pollak", "abstract": "To predict upcoming text, language models must in some cases retrieve in-context information verbatim. In this report, we investigated how the ability of language models to retrieve arbitrary in-context nouns developed during training (across time) and as language models trained on the same dataset increase in size (across scale). We then asked whether learning of in-context retrieval correlates with learning of more challenging zero-shot benchmarks. Furthermore, inspired by semantic effects in human short-term memory, we evaluated the retrieval with respect to a major semantic component of target nouns, namely whether they denote a concrete or abstract entity, as rated by humans. We show that verbatim in-context retrieval developed in a sudden transition early in the training process, after about 1% of the training tokens. This was observed across model sizes (from 14M and up to 12B parameters), and the transition occurred slightly later for the two smallest models. We further found that the development of verbatim in-context retrieval is positively correlated with the learning of zero-shot benchmarks. Around the transition point, all models showed the advantage of retrieving concrete nouns as opposed to abstract nouns. In all but two smallest models, the advantage dissipated away toward the end of training.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.906074523925781, 2.106945276260376], "openalex_id": "https://openalex.org/W4404395182", "title": "Enhancing Bias Assessment for Complex Term Groups in Language Embedding Models: Quantitative Comparison of Methods", "authors": "Magnus Gray, Mariofanna Milanova, Leihong Wu", "abstract": "Abstract Background Artificial intelligence (AI) is rapidly being adopted to build products and aid in the decision-making process across industries. However, AI systems have been shown to exhibit and even amplify biases, causing a growing concern among people worldwide. Thus, investigating methods of measuring and mitigating bias within these AI-powered tools is necessary. Objective In natural language processing applications, the word embedding association test (WEAT) is a popular method of measuring bias in input embeddings, a common area of measure bias in AI. However, certain limitations of the WEAT have been identified (ie, their nonrobust measure of bias and their reliance on predefined and limited groups of words or sentences), which may lead to inadequate measurements and evaluations of bias. Thus, this study takes a new approach at modifying this popular measure of bias, with a focus on making it more robust and applicable in other domains. Methods In this study, we introduce the SD-WEAT, which is a modified version of the WEAT that uses the SD of multiple permutations of the WEATs to calculate bias in input embeddings. With the SD-WEAT, we evaluated the biases and stability of several language embedding models, including Global Vectors for Word Representation (GloVe), Word2Vec, and bidirectional encoder representations from transformers (BERT). Results This method produces results comparable to those of the WEAT, with strong correlations between the methods\u2019 bias scores or effect sizes (r =0.786) and P values (r =0.776), while addressing some of its largest limitations. More specifically, the SD-WEAT is more accessible, as it removes the need to predefine attribute groups, and because the SD-WEAT measures bias over multiple runs rather than one, it reduces the impact of outliers and sample size. Furthermore, the SD-WEAT was found to be more consistent and reliable than its predecessor. Conclusions Thus, the SD-WEAT shows promise for robustly measuring bias in the input embeddings fed to AI language models.", "venue": "JMIR Medical Informatics", "label": 28}, {"loc": [7.53039026260376, 3.633204936981201], "openalex_id": "https://openalex.org/W4404389118", "title": "PhoneLM: an Efficient and Capable Small Language Model Family through Principled Pre-training", "authors": "Runduan Yi, Xiang Li, Weikai Xie, Zhichun Lu, Chenghua Wang, Ao Zhou, Shangguang Wang, Xiwen Zhang, Mengwei Xu", "abstract": "The interest in developing small language models (SLM) for on-device deployment is fast growing. However, the existing SLM design hardly considers the device hardware characteristics. Instead, this work presents a simple yet effective principle for SLM design: architecture searching for (near-)optimal runtime efficiency before pre-training. Guided by this principle, we develop PhoneLM SLM family (currently with 0.5B and 1.5B versions), that acheive the state-of-the-art capability-efficiency tradeoff among those with similar parameter size. We fully open-source the code, weights, and training datasets of PhoneLM for reproducibility and transparency, including both base and instructed versions. We also release a finetuned version of PhoneLM capable of accurate Android Intent invocation, and an end-to-end Android demo. All materials are available at https://github.com/UbiquitousLearning/PhoneLM.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.502531051635742, 0.9060109853744507], "openalex_id": "https://openalex.org/W4404389841", "title": "Evaluating and Adapting Large Language Models to Represent Folktales in Low-Resource Languages", "authors": "J. A. Meaney, Beatrice Alex, William Lamb", "abstract": "Folktales are a rich resource of knowledge about the society and culture of a civilisation. Digital folklore research aims to use automated techniques to better understand these folktales, and it relies on abstract representations of the textual data. Although a number of large language models (LLMs) claim to be able to represent low-resource langauges such as Irish and Gaelic, we present two classification tasks to explore how useful these representations are, and three adaptations to improve the performance of these models. We find that adapting the models to work with longer sequences, and continuing pre-training on the domain of folktales improves classification performance, although these findings are tempered by the impressive performance of a baseline SVM with non-contextual features.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.94906997680664, 2.5906074047088623], "openalex_id": "https://openalex.org/W4404390131", "title": "Aioli: A Unified Optimization Framework for Language Model Data Mixing", "authors": "M. Chen, Mengying Hu, Nicholas Lourie, Kyunghyun Cho, Christopher R\u00e9", "abstract": "Language model performance depends on identifying the optimal mixture of data groups to train on (e.g., law, code, math). Prior work has proposed a diverse set of methods to efficiently learn mixture proportions, ranging from fitting regression models over training runs to dynamically updating proportions throughout training. Surprisingly, we find that no existing method consistently outperforms a simple stratified sampling baseline in terms of average test perplexity. To understand this inconsistency, we unify existing methods into a standard framework, showing they are equivalent to solving a common optimization problem: minimize average loss subject to a method-specific mixing law -- an implicit assumption on the relationship between loss and mixture proportions. This framework suggests that measuring the fidelity of a method's mixing law can offer insights into its performance. Empirically, we find that existing methods set their mixing law parameters inaccurately, resulting in the inconsistent mixing performance we observe. Using this insight, we derive a new online method named Aioli, which directly estimates the mixing law parameters throughout training and uses them to dynamically adjust proportions. Aioli outperforms stratified sampling on 6 out of 6 datasets by an average of 0.27 test perplexity points, whereas existing methods fail to consistently beat stratified sampling, doing up to 6.9 points worse. Moreover, in a practical setting where proportions are learned on shorter runs due to computational constraints, Aioli can dynamically adjust these proportions over the full training run, consistently improving performance over existing methods by up to 12.012 test perplexity points.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.885903358459473, 5.048989295959473], "openalex_id": "https://openalex.org/W4404234687", "title": "Continuous or Discrete, That Is the Question: A Survey on Large Multi-Modal Models from the Perspective of Input-Output Space Extension", "authors": "Zejun Li, Jiwen Zhang, Dong Wang, Ye Wang, Xuanjing Huang, Zhongyu Wei", "abstract": "With the success of large language models (LLMs) driving progress towards general-purpose AI, there has been a growing focus on extending these models to multi-modal domains, giving rise to large multi-modal models (LMMs). Unlike existing reviews that focuses on specific model frameworks or scenarios, this survey summarizes and provides insights into the current research on LMMs from a more general perspective, \\textbf{input-output space extension}. Particularly, we discuss the following questions: (i) How to construct multi-modal input-output spaces with discretely or continuously encoded modality signals? (ii) How to design model architectures and corresponding training strategies to align the constructed multi-modal representation space? (iii) How to comprehensively evaluate LMMs based on the expanded input-output space? We hope to provide an intuitive and comprehensive overview and inspire future work.", "venue": "Preprints.org", "label": 3}, {"loc": [4.543760299682617, 1.1503349542617798], "openalex_id": "https://openalex.org/W4404227650", "title": "Policy analysis combining artificial intelligence and text mining technology in the perspective of educational informatization", "authors": "He Wei Kuang, Peng Tian, Xiaotong Bin Liang", "abstract": "Abstract In order to explore the application potential of artificial intelligence (AI) and text mining technology in educational policy analysis and evaluate their impact on the psychological perception of policy audiences, this study firstly introduces the application of AI and text mining technology in education. Secondly, it explores the application of psychological theories in educational policy analysis. Finally, this study constructs an educational policy text analysis model and verifies the feasibility of the optimized model through performance comparison experiments and case analysis. The experimental results show that the optimized model exhibits higher accuracy, recall rate, and F1 score compared to traditional models when handling educational policy text analysis tasks with different data volumes. This finding highlights the importance of optimizing models for specific tasks and the potential of improving the understanding and analysis capabilities of models for specific text types through careful adjustments. In addition, the application of psychological theories to the analysis of educational policy texts provides a new perspective and method for understanding the impact of policies on audience psychological states, which helps in formulating more effective and humanized policies. Therefore, the study has certain reference significance for the use of AI and text mining technology to support educational policy analysis and formulation, providing valuable insights and guidance for future related research and practice.", "venue": "Humanities and Social Sciences Communications", "label": 0}, {"loc": [7.894834995269775, 1.3208348751068115], "openalex_id": "https://openalex.org/W4404389558", "title": "Web Archives Metadata Generation with GPT-4o: Challenges and Insights", "authors": "A Huang, Abhijit Nair, Zhen Rong Goh, Tianrui Liu", "abstract": "Current metadata creation for web archives is time consuming and costly due to reliance on human effort. This paper explores the use of gpt-4o for metadata generation within the Web Archive Singapore, focusing on scalability, efficiency, and cost effectiveness. We processed 112 Web ARChive (WARC) files using data reduction techniques, achieving a notable 99.9% reduction in metadata generation costs. By prompt engineering, we generated titles and abstracts, which were evaluated both intrinsically using Levenshtein Distance and BERTScore, and extrinsically with human cataloguers using McNemar's test. Results indicate that while our method offers significant cost savings and efficiency gains, human curated metadata maintains an edge in quality. The study identifies key challenges including content inaccuracies, hallucinations, and translation issues, suggesting that Large Language Models (LLMs) should serve as complements rather than replacements for human cataloguers. Future work will focus on refining prompts, improving content filtering, and addressing privacy concerns through experimentation with smaller models. This research advances the integration of LLMs in web archiving, offering valuable insights into their current capabilities and outlining directions for future enhancements. The code is available at https://github.com/masamune-prog/warc2summary for further development and use by institutions facing similar challenges.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.7871272563934326, 1.2194026708602905], "openalex_id": "https://openalex.org/W4404209916", "title": "Web scraping: Jurisprudence and legal doctrines", "authors": "Avv. Gino Fontana", "abstract": "Abstract Web scraping is a technique that allows the extraction of online information and data to train Generative Artificial Intelligence (GenAI) systems. Although the use of deep learning algorithms to produce user\u2010requested outputs (texts, images, music and code) based on models learned from vast data sets dates back a few decades, its use has become fundamental with the recent development of GenAI and has been accompanied by the emergence of the first legal disputes. Doctrine and jurisprudence are called upon to consider the legal consequences arising from the combination of web scraping and GenAI, often encountering inadequate and fragmented legislation. Laws and regulations vary significantly across different countries and regions, reflecting diverse priorities and legal approaches. However, while doctrine, regardless of the latitudes, agrees in condemning the illicit acts and abuses due not so much to the extraction method but to the use of the extracted data (where protected by intellectual property rights), jurisprudence (particularly in Europe and North America) has already had the opportunity to express divergent opinions in some leading cases.", "venue": "The Journal of World Intellectual Property", "label": 0}, {"loc": [7.751181125640869, 2.6634628772735596], "openalex_id": "https://openalex.org/W4404404699", "title": "LSHBloom: Memory-efficient, Extreme-scale Document Deduplication", "authors": "Arham Khan, Robert Underwood, Carlo Siebenschuh, Yadu Babuji, Aswathy Ajith, Kyle Hippe, Ozan G\u00f6kdemir, Alexander Brace, Kyle Chard, Ian Foster", "abstract": "Deduplication is a major focus for assembling and curating training datasets for large language models (LLM) -- detecting and eliminating additional instances of the same content -- in large collections of technical documents. Unrestrained, duplicates in the training dataset increase training costs and lead to undesirable properties such as memorization in trained models or cheating on evaluation. Contemporary approaches to document-level deduplication are often extremely expensive in both runtime and memory. We propose LSHBloom, an extension to MinhashLSH, which replaces the expensive LSHIndex with lightweight Bloom filters. LSHBloom demonstrates the same deduplication performance as MinhashLSH with only a marginal increase in false positives (as low as 1e-5 in our experiments); demonstrates competitive runtime (270\\% faster than MinhashLSH on peS2o); and, crucially, uses just 0.6\\% of the disk space required by MinhashLSH to deduplicate peS2o. We demonstrate that this space advantage scales with increased dataset size -- at the extreme scale of several billion documents, LSHBloom promises a 250\\% speedup and a 54$\\times$ space advantage over traditional MinHashLSH scaling deduplication of text datasets to many billions of documents.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.626060485839844, 3.667823553085327], "openalex_id": "https://openalex.org/W4404354530", "title": "A Comprehensive Survey of Small Language Models in the Era of Large Language Models: Techniques, Enhancements, Applications, Collaboration with LLMs, and \u2026", "authors": "Fali Wang, Zhiwei Zhang, Xianren Zhang, Zongyu Wu, Tzuhao Mo, Qiuhao Lu, Wanjing Wang, Rui Li, Junjie Xu, Xianfeng Tang, Qi He, Yao Ma, Mingzhi Huang, Suhang Wang", "abstract": "Large language models (LLMs) have demonstrated emergent abilities in text generation, question answering, and reasoning, facilitating various tasks and domains. Despite their proficiency in various tasks, LLMs like PaLM 540B and Llama-3.1 405B face limitations due to large parameter sizes and computational demands, often requiring cloud API use which raises privacy concerns, limits real-time applications on edge devices, and increases fine-tuning costs. Additionally, LLMs often underperform in specialized domains such as healthcare and law due to insufficient domain-specific knowledge, necessitating specialized models. Therefore, Small Language Models (SLMs) are increasingly favored for their low inference latency, cost-effectiveness, efficient development, and easy customization and adaptability. These models are particularly well-suited for resource-limited environments and domain knowledge acquisition, addressing LLMs' challenges and proving ideal for applications that require localized data handling for privacy, minimal inference latency for efficiency, and domain knowledge acquisition through lightweight fine-tuning. The rising demand for SLMs has spurred extensive research and development. However, a comprehensive survey investigating issues related to the definition, acquisition, application, enhancement, and reliability of SLM remains lacking, prompting us to conduct a detailed survey on these topics. The definition of SLMs varies widely, thus to standardize, we propose defining SLMs by their capability to perform specialized tasks and suitability for resource-constrained settings, setting boundaries based on the minimal size for emergent abilities and the maximum size sustainable under resource constraints. For other aspects, we provide a taxonomy of relevant models/methods and develop general frameworks for each category to enhance and utilize SLMs effectively.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.259186744689941, 3.5997939109802246], "openalex_id": "https://openalex.org/W4404370653", "title": "Polynomial Composition Activations: Unleashing the Dynamics of Large Language Models", "authors": "Zhijian Zhuo, Ya Wang, Yuantong Zeng, Xiaoqing Li, Xun Zhou, Jingsen Ma", "abstract": "Transformers have found extensive applications across various domains due to the powerful fitting capabilities. This success can be partially attributed to their inherent nonlinearity. Thus, in addition to the ReLU function employed in the original transformer architecture, researchers have explored alternative modules such as GeLU and SwishGLU to enhance nonlinearity and thereby augment representational capacity. In this paper, we propose a novel category of polynomial composition activations (PolyCom), designed to optimize the dynamics of transformers. Theoretically, we provide a comprehensive mathematical analysis of PolyCom, highlighting its enhanced expressivity and efficacy relative to other activation functions. Notably, we demonstrate that networks incorporating PolyCom achieve the $\\textbf{optimal approximation rate}$, indicating that PolyCom networks require minimal parameters to approximate general smooth functions in Sobolev spaces. We conduct empirical experiments on the pre-training configurations of large language models (LLMs), including both dense and sparse architectures. By substituting conventional activation functions with PolyCom, we enable LLMs to capture higher-order interactions within the data, thus improving performance metrics in terms of accuracy and convergence rates. Extensive experimental results demonstrate the effectiveness of our method, showing substantial improvements over other activation functions. Code is available at https://github.com/BryceZhuo/PolyCom.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.013882160186768, -2.413195848464966], "openalex_id": "https://openalex.org/W4404106352", "title": "Fake News Detection Revisited: An Extensive Review of Theoretical Frameworks, Dataset Assessments, Model Constraints, and Forward-Looking Research Agendas \u2026", "authors": "Sheetal Harris, Hassan Jalil Hadi, Naveed Ahmad, Mohammed Ali Alshara", "abstract": "The emergence and acceptance of digital technology have caused information pollution and an infodemic on Online Social Networks (OSNs), blogs, and online websites. The malicious broadcast of illegal, objectionable and misleading content causes behavioural changes and social unrest, impacts economic growth and national security, and threatens users\u2019 safety. The proliferation of AI-generated misleading content has further intensified the current situation. In the previous literature, state-of-the-art (SOTA) methods have been implemented for Fake News Detection (FND). However, the existing research lacks multidisciplinary considerations for FND based on theories on FN and OSN users. Theories\u2019 analysis provides insights into effective and automated detection mechanisms for FN, and the intentions and causes behind wide-scale FN propagation. This review evaluates the available datasets, FND techniques, and approaches and their limitations. The novel contribution of this review is the analysis of the FND in linguistics, healthcare, communication, and other related fields. It also summarises the explicable methods for FN dissemination, identification and mitigation. The research identifies that the prediction performance of pre-trained transformer models provides fresh impetus for multilingual (even for resource-constrained languages), multidomain, and multimodal FND. Their limits and prediction capabilities must be harnessed further to combat FN. It is possible by large-sized, multidomain, multimodal, cross-lingual, multilingual, labelled and unlabelled dataset curation and implementation. SOTA Large Language Models (LLMs) are the innovation, and their strengths should be focused on and researched to combat FN, deepfakes, and AI-generated content on OSNs and online sources. The study highlights the significance of human cognitive abilities and the potential of AI in the domain of FND. Finally, we suggest promising future research directions for FND and mitigation.", "venue": "Technologies", "label": 0}, {"loc": [6.840088844299316, 2.584520101547241], "openalex_id": "https://openalex.org/W4404060074", "title": "ZS4C: Zero-Shot Synthesis of Compilable Code for Incomplete Code Snippets using ChatGPT", "authors": "Azmain Kabir, Shaowei Wang, Yuan Tian, Tse-Hsun Chen, Muhammad Asaduzzaman, Wenbin Zhang", "abstract": "Technical Q&A sites are valuable for software developers seeking knowledge, but the code snippets they provide are often uncompilable and incomplete due to unresolved types and missing libraries. This poses a challenge for users who wish to reuse or analyze these snippets. Existing methods either do not focus on creating compilable code or have low success rates. To address this, we propose ZS4C, a lightweight approach for zero-shot synthesis of compilable code from incomplete snippets using Large Language Models (LLMs). ZS4C operates in two stages: first, it uses an LLM, like GPT-3.5, to identify missing import statements in a snippet; second, it collaborates with a validator (e.g., compiler) to fix compilation errors caused by incorrect imports and syntax issues. We evaluated ZS4C on the StatType-SO benchmark and a new dataset, Python-SO, which includes 539 Python snippets from Stack Overflow across the 20 most popular Python libraries. ZS4C significantly outperforms existing methods, improving the compilation rate from 63% to 95.1% compared to the state-of-the-art SnR, marking a 50.1% improvement. On average, ZS4C can infer more accurate import statements (with an F1 score of 0.98) than SnR, with an improvement of 8.5% in the F1.", "venue": "ACM Transactions on Software Engineering and Methodology", "label": 0}, {"loc": [5.17200231552124, -1.5919557809829712], "openalex_id": "https://openalex.org/W4404350174", "title": "Sentiment Analysis Based on RoBERTa for Amazon Review: An Empirical Study on Decision Making", "authors": "Xinli Guo", "abstract": "In this study, we leverage state-of-the-art Natural Language Processing (NLP) techniques to perform sentiment analysis on Amazon product reviews. By employing transformer-based models, RoBERTa, we analyze a vast dataset to derive sentiment scores that accurately reflect the emotional tones of the reviews. We provide an in-depth explanation of the underlying principles of these models and evaluate their performance in generating sentiment scores. Further, we conduct comprehensive data analysis and visualization to identify patterns and trends in sentiment scores, examining their alignment with behavioral economics principles such as electronic word of mouth (eWOM), consumer emotional reactions, and the confirmation bias. Our findings demonstrate the efficacy of advanced NLP models in sentiment analysis and offer valuable insights into consumer behavior, with implications for strategic decision-making and marketing practices.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.244139194488525, 5.5301513671875], "openalex_id": "https://openalex.org/W4404350236", "title": "Dreaming Out Loud: A Self-Synthesis Approach For Training Vision-Language Models With Developmentally Plausible Data", "authors": "Badr AlKhamissi, Yingtian Tang, Abd\u00fclkadir G\u00f6kce, Johannes Mehrer, Martin Schrimpf", "abstract": "While today's large language models exhibit impressive abilities in generating human-like text, they require massive amounts of data during training. We here take inspiration from human cognitive development to train models in limited data conditions. Specifically we present a self-synthesis approach that iterates through four phases: Phase 1 sets up fundamental language abilities, training the model from scratch on a small corpus. Language is then associated with the visual environment in phase 2, integrating the model with a vision encoder to generate descriptive captions from labeled images. In the \"self-synthesis\" phase 3, the model generates captions for unlabeled images, that it then uses to further train its language component with a mix of synthetic, and previous real-world text. This phase is meant to expand the model's linguistic repertoire, similar to humans self-annotating new experiences. Finally, phase 4 develops advanced cognitive skills, by training the model on specific tasks such as visual question answering and reasoning. Our approach offers a proof of concept for training a multimodal model using a developmentally plausible amount of data.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.63919997215271, 4.4977827072143555], "openalex_id": "https://openalex.org/W4404351777", "title": "A Practical and Privacy-Preserving Framework for Real-World Large Language Model Services", "authors": "Yu Mao, Xiwen Liao, Wei Liu, Anjia Yang", "abstract": "Large language models (LLMs) have demonstrated exceptional capabilities in text understanding and generation, and they are increasingly being utilized across various domains to enhance productivity. However, due to the high costs of training and maintaining these models, coupled with the fact that some LLMs are proprietary, individuals often rely on online AI as a Service (AIaaS) provided by LLM companies. This business model poses significant privacy risks, as service providers may exploit users' trace patterns and behavioral data. In this paper, we propose a practical and privacy-preserving framework that ensures user anonymity by preventing service providers from linking requests to the individuals who submit them. Our framework is built on partially blind signatures, which guarantee the unlinkability of user requests. Furthermore, we introduce two strategies tailored to both subscription-based and API-based service models, ensuring the protection of both users' privacy and service providers' interests. The framework is designed to integrate seamlessly with existing LLM systems, as it does not require modifications to the underlying architectures. Experimental results demonstrate that our framework incurs minimal computation and communication overhead, making it a feasible solution for real-world applications.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.031260013580322, 1.2416318655014038], "openalex_id": "https://openalex.org/W4404351657", "title": "Interacting Large Language Model Agents. Interpretable Models and Social Learning", "authors": "Adit Jain, Vikram Krishnamurthy", "abstract": "This paper discusses the theory and algorithms for interacting large language model agents (LLMAs) using methods from statistical signal processing and microeconomics. While both fields are mature, their application to decision-making involving interacting LLMAs remains unexplored. Motivated by Bayesian sentiment analysis on online platforms, we construct interpretable models and algorithms that enable LLMAs to interact and perform Bayesian inference. Because interacting LLMAs learn from both prior decisions and external inputs, they can exhibit bias and herding behavior. Thus, developing interpretable models and stochastic control algorithms is essential to understand and mitigate these behaviors. This paper has three main results. First, we show using Bayesian revealed preferences from microeconomics that an individual LLMA satisfies the necessary and sufficient conditions for rationally inattentive (bounded rationality) Bayesian utility maximization and, given an observation, the LLMA chooses an action that maximizes a regularized utility. Second, we utilize Bayesian social learning to construct interpretable models for LLMAs that interact sequentially with each other and the environment while performing Bayesian inference. Our proposed models capture the herding behavior exhibited by interacting LLMAs. Third, we propose a stochastic control framework to delay herding and improve state estimation accuracy under 2 settings: (a) centrally controlled LLMAs (b) autonomous LLMAs with incentives. We demonstrate the effectiveness of our methods on real datasets for hate speech classification and product quality assessment, using open-source models like LLaMA and closed-source models like ChatGPT. The main takeaway of this paper, based on empirical analysis and mathematical formalism, is that LLMAs act as rationally bounded Bayesian agents that exhibit social learning when interacting.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.149696350097656, 0.7659578919410706], "openalex_id": "https://openalex.org/W4404350831", "title": "Generic Embedding-Based Lexicons for Transparent and Reproducible Text Scoring", "authors": "Catherine Moez", "abstract": "With text analysis tools becoming increasingly sophisticated over the last decade, researchers now face a decision of whether to use state-of-the-art models that provide high performance but that can be highly opaque in their operations and computationally intensive to run. The alternative, frequently, is to rely on older, manually crafted textual scoring tools that are transparently and easily applied, but can suffer from limited performance. I present an alternative that combines the strengths of both: lexicons created with minimal researcher inputs from generic (pretrained) word embeddings. Presenting a number of conceptual lexicons produced from FastText and GloVe (6B) vector representations of words, I argue that embedding-based lexicons respond to a need for transparent yet high-performance text measuring tools.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.92675518989563, 2.628641366958618], "openalex_id": "https://openalex.org/W4404059533", "title": "Open Science at the Generative AI Turn: An Exploratory Analysis of Challenges and Opportunities", "authors": "Mohammad Hosseini, Serge P. J. M. Horbach, Kristi Holmes, Tony Ross\u2010Hellauer", "abstract": "Abstract Technology influences Open Science (OS) practices, because conducting science in transparent, accessible, and participatory ways requires tools and platforms for collaboration and sharing results. Due to this relationship, the characteristics of the employed technologies directly impact OS objectives. Generative Artificial Intelligence (GenAI) is increasingly used by researchers for tasks such as text refining, code generation/editing, reviewing literature, and data curation/analysis. Nevertheless, concerns about openness, transparency, and bias suggest that GenAI may benefit from greater engagement with OS. GenAI promises substantial efficiency gains but is currently fraught with limitations that could negatively impact core OS values, such as fairness, transparency, and integrity, and may harm various social actors. In this paper, we explore the possible positive and negative impacts of GenAI on OS. We use the taxonomy within the UNESCO Recommendation on Open Science to systematically explore the intersection of GenAI and OS. We conclude that using GenAI could advance key OS objectives by broadening meaningful access to knowledge, enabling efficient use of infrastructure, improving engagement of societal actors, and enhancing dialogue among knowledge systems. However, due to GenAI\u2019s limitations, it could also compromise the integrity, equity, reproducibility, and reliability of research. Hence, sufficient checks, validation, and critical assessments are essential when incorporating GenAI into research workflows.", "venue": "Quantitative Science Studies", "label": 0}, {"loc": [2.8640823364257812, -0.2891543507575989], "openalex_id": "https://openalex.org/W4404351291", "title": "Designing a Robust Radiology Report Generation System", "authors": "Sonit Singh", "abstract": "Recent advances in deep learning have enabled researchers to explore tasks at the intersection of computer vision and natural language processing, such as image captioning, visual question answering, visual dialogue, and visual language navigation. Taking inspiration from image captioning, the task of radiology report generation aims at automatically generating radiology reports by having a comprehensive understanding of medical images. However, automatically generating radiology reports from medical images is a challenging task due to the complexity, diversity, and nature of medical images. In this paper, we outline the design of a robust radiology report generation system by integrating different modules and highlighting best practices drawing upon lessons from our past work and also from relevant studies in the literature. We also discuss the impact of integrating different components to form a single integrated system. We believe that these best practices, when implemented, could improve automatic radiology report generation, augment radiologists in decision making, and expedite diagnostic workflow, in turn improve healthcare and save human lives.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.796538829803467, -1.121119737625122], "openalex_id": "https://openalex.org/W4404345347", "title": "Leveraging Large Language Models for Code-Mixed Data Augmentation in Sentiment Analysis", "authors": "Linda Zeng", "abstract": "Code-mixing (CM), where speakers blend languages within a single expression, is prevalent in multilingual societies but poses challenges for natural language processing due to its complexity and limited data. We propose using a large language model to generate synthetic CM data, which is then used to enhance the performance of task-specific models for CM sentiment analysis. Our results show that in Spanish-English, synthetic data improved the F1 score by 9.32%, outperforming previous augmentation techniques. However, in Malayalam-English, synthetic data only helped when the baseline was low; with strong natural data, additional synthetic data offered little benefit. Human evaluation confirmed that this approach is a simple, cost-effective way to generate natural-sounding CM sentences, particularly beneficial for low baselines. Our findings suggest that few-shot prompting of large language models is a promising method for CM data augmentation and has significant impact on improving sentiment analysis, an important element in the development of social influence systems.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.449361801147461, 2.4252986907958984], "openalex_id": "https://openalex.org/W4404084961", "title": "Gender Ambiguity of Chinese Names in the United States", "authors": "Man Yao", "abstract": "How do people conduct gender classification in ambiguous contexts? A gender framing perspective suggests the pervasiveness and consequences of using gender in novel contexts, but there is a paucity of knowledge about how people assign a gender to ambiguous targets in interpersonal relations. This study fills in this knowledge gap by investigating how U.S. individuals classify the gender of two types of gender-ambiguous names\u2014Chinese names written in English letters and gender-neutral American names. It also examines how respondents\u2019 gender ideologies and racial stereotypes are associated with their perceptions of gender-ambiguous names. An online survey experiment with 795 U.S. individuals finds that respondents predominantly assign a binary gender (versus neutral or unsure) to both Chinese names (40.8 percent men and 37.4 percent women) and gender-neutral American names (41.1 percent men and 19.4 percent women). Multivariate analyses reveal that respondents with traditional gender ideologies associate a gender-binary perception with gender-neutral American names rather than Chinese names. Meanwhile, respondents who endorse the racialized stereotypes that Chinese people are socially cold and/or generally competent are more likely to perceive Chinese names as men\u2019s names. These findings demonstrate that a gender-binary frame persists in ambiguous contexts, and that the classification outcome is conditional on contextual signals and preexisting cultural beliefs. They also deepen understanding of gender neutrality during social interactions and a gendered nature of racialized stereotypes.", "venue": "Gender & Society", "label": 0}, {"loc": [4.097491264343262, -2.455906867980957], "openalex_id": "https://openalex.org/W4404018961", "title": "Ensemble based high performance deep learning models for fake news detection", "authors": "Mohammed E.Almandouh, Mohammed F. Alrahmawy, Mohamed Eisa, Mohamed Elhoseny, A. S. Tolba", "abstract": "Abstract Social media has emerged as a dominant platform where individuals freely share opinions and communicate globally. Its role in disseminating news worldwide is significant due to its easy accessibility. However, the increase in the use of these platforms presents severe risks for potentially misleading people. Our research aims to investigate different techniques within machine learning, deep learning, and ensemble learning frameworks in Arabic fake news detection. We integrated FastText word embeddings with various machine learning and deep learning methods. We then leveraged advanced transformer-based models, including BERT, XLNet, and RoBERTa, optimizing their performance through careful hyperparameter tuning. The research methodology involves utilizing two Arabic news article datasets, AFND and ARABICFAKETWEETS datasets, categorized into fake and real subsets and applying comprehensive preprocessing techniques to the text data. Four hybrid deep learning models are presented: CNN-LSTM, RNN-CNN, RNN-LSTM, and Bi-GRU-Bi-LSTM. The Bi-GRU-Bi-LSTM model demonstrated superior performance regarding the F1 score, accuracy, and loss metrics. The precision, recall, F1 score, and accuracy of the hybrid Bi-GRU-Bi-LSTM model on the AFND Dataset are 0.97, 0.97, 0.98, and 0.98, and on the ARABICFAKETWEETS dataset are 0.98, 0.98, 0.99, and 0.99 respectively. The study\u2019s primary conclusion is that when spotting fake news in Arabic, the Bi-GRU-Bi-LSTM model outperforms other models by a significant margin. It significantly aids the global fight against false information by setting the stage for future research to expand fake news detection to multiple languages.", "venue": "Scientific Reports", "label": 24}, {"loc": [2.7795848846435547, -0.5545094013214111], "openalex_id": "https://openalex.org/W4404024363", "title": "Improving Consumer Health Search with Field-Level Learning-to-Rank Techniques", "authors": "Hua Yang, Teresa Gon\u00e7alves", "abstract": "In the area of consumer health search (CHS), there is an increasing concern about returning topically relevant and understandable health information to the user. Besides being used to rank topically relevant documents, Learning to Rank (LTR) has also been used to promote understandability ranking. Traditionally, features coming from different document fields are joined together, limiting the performance of standard LTR, since field information plays an important role in promoting understandability ranking. In this paper, a novel field-level Learning-to-Rank (f-LTR) approach is proposed, and its application in CHS is investigated by developing thorough experiments on CLEF\u2019 2016\u20132018 eHealth IR data collections. An in-depth analysis of the effects of using f-LTR is provided, with experimental results suggesting that in LTR, title features are more effective than other field features in promoting understandability ranking. Moreover, the fused f-LTR model is compared to existing work, confirming the effectiveness of the methodology.", "venue": "Information", "label": 17}, {"loc": [4.442001819610596, 2.531510353088379], "openalex_id": "https://openalex.org/W4404006813", "title": "Gender Bias in Natural Language Processing and Computer Vision: A Comparative Survey", "authors": "Marion Bartl, Abhishek Mandal, Susan Leavy, Suzanne Little", "abstract": "Taking an interdisciplinary approach to surveying issues around gender bias in textual and visual AI, we present literature on gender bias detection and mitigation in NLP, CV, as well as combined visual-linguistic models. We identify conceptual parallels between these strands of research as well as how methodologies were adapted cross-disciplinary from NLP to CV. We also find that there is a growing awareness for theoretical frameworks from the social sciences around gender in NLP that could be beneficial for aligning bias analytics in CV with human values and conceptualising gender beyond the binary categories of male/female.", "venue": "ACM Computing Surveys", "label": 7}, {"loc": [6.9200544357299805, 0.6799008250236511], "openalex_id": "https://openalex.org/W4404354120", "title": "Multilingual Pretraining Using a Large Corpus Machine-Translated from a Single Source Language", "authors": "Jiayi Wang, Yao Lu, Maurice Weber, Max Ryabinin, Yihong Chen, Raphael Tang, Pontus Stenetorp", "abstract": "English, as a very high-resource language, enables the pretraining of high-quality large language models (LLMs). The same cannot be said for most other languages, as leading LLMs still underperform for non-English languages, likely due to a gap in the quality and diversity of the available multilingual pretraining corpora. In this work, we find that machine-translated text from a single high-quality source language can contribute significantly to the pretraining of multilingual LLMs. We translate FineWeb-Edu, a high-quality English web dataset, into French, German, and Spanish, resulting in a final 300B-token dataset, which we call TransWeb-Edu, and pretrain a 1.3B-parameter model, CuatroLLM, from scratch on this dataset. Across five non-English reasoning tasks, we show that CuatroLLM matches or outperforms state-of-the-art multilingual models trained using closed data, such as Llama3.2 and Gemma2, despite using an order of magnitude less data, such as about 6% of the tokens used for Llama3.2's training. We further demonstrate that with additional domain-specific pretraining, amounting to less than 1% of TransWeb-Edu, CuatroLLM surpasses the state of the art in multilingual reasoning. To promote reproducibility, we release our corpus, models, and training pipeline under open licenses at hf.co/britllm/CuatroLLM.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.830420970916748, 0.8406884670257568], "openalex_id": "https://openalex.org/W4407418702", "title": "CorpusN\u00d3S: A massive Galician corpus for training large language models", "authors": "Eliseo Bao, Anxo P\u00e9rez, Javier Parapar", "abstract": "The popularization of Large Language Models (LLMs), especially with the development of conversational systems, makes mandatory to think about facilitating the use of artificial intelligence (AI) to everyone. Most models neglect minority languages, prioritizing widely spoken ones. This exacerbates their underrepresentation in the digital world and negatively affects their speakers. We present two resources aimed at improving natural language processing (NLP) for Galician: (i) a Llama 3.1 instruct model adapted through continuous pre-training on the CorpusN\u00f3s dataset; and (ii) a Galician version of the Alpaca dataset, used to assess the improvement over the base model. In this evaluation, our model outperformed both the base model and another Galician model in quantitative and qualitative terms.", "venue": "https://doi.org/10.17979/spudc.9788497498913.4", "label": 0}, {"loc": [8.151406288146973, 1.8490409851074219], "openalex_id": "https://openalex.org/W4404348768", "title": "P-Masking: Power Law Masking Improves Multi-attribute Controlled Generation", "authors": "Mohamed Elgaar, Hadi Amiri", "abstract": "We introduce LingGen, a novel approach for controlled text generation that offers precise control over a wide array of linguistic attributes, even as the number of attributes varies. LingGen employs a dynamic P-MASKING strategy, which samples masking rates from a power law distribution during training. This innovative approach enables the model to develop robust representations and adapt its attribute control capabilities across a variable number of attributes, from a single attribute to multiple complex configurations. The P-MASKING technique enhances LingGen's ability to manage different levels of attribute visibility, resulting in superior performance in multi-attribute generation tasks. Our experiments demonstrate that LingGen surpasses current state-of-the-art models in both attribute control accuracy and text fluency, particularly excelling in scenarios with varying attribute demands. Additionally, our ablation studies highlight the effectiveness of P-MASKING and the influence of different base language models on performance. These findings demonstrate LingGen's potential for applications requiring precise and adaptable control over multiple linguistic attributes in text generation.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.3825459480285645, 2.1607658863067627], "openalex_id": "https://openalex.org/W4403935841", "title": "False memories from nowhere: 2 humans falsely recognize words that are not attested in their vocabulary 3", "authors": "Daniele Gatti, Marco A. Petilli, Michela Marchetti, Tomaso Vecchi, Giuliana Mazzoni, Luca Rinaldi, Marco Marelli", "abstract": "Semantic knowledge plays an active role in many well-known false memory phenomena, including those emerging from the Deese\u2013Roediger\u2013McDermott (DRM) task. In this experimental paradigm, indeed, humans tend to falsely recognize newly presented words via activation of other previously shown stimuli. In the present study we aimed to test what happens in cases in which no apparent prior semantic knowledge is available, like in the case of entirely novel lexical stimuli. To do so, we evaluated semantic similarity effects in a DRM task with lists entirely composed by pseudowords (or \u201cnovel words,\u201d i.e., letter strings resembling real words but lacking assigned meanings). Semantic similarity between pseudowords were established through a distributional semantic model able to represent in a vector space, not only attested words but also unmapped strings as bags of character n-grams. Participants were instructed to memorize those lists and then to perform a recognition task. Results showed that participants false and veridical recognition increased with increasing semantic similarity between each stimulus and the stimuli comprising its list, as estimated by the distributional model. These findings extend previous evidence indicating that humans are sensitive to the semantic (distributional) patterns elicited by novel words by showing that this sensitivity can even induce humans to falsely recognize stimuli that they have never encountered in their entire lives.", "venue": "https://doi.org/10.31234/osf.io/8um2f", "label": 0}, {"loc": [6.500114917755127, 4.499516487121582], "openalex_id": "https://openalex.org/W4404343203", "title": "OS-ATLAS: A Foundation Action Model for Generalist GUI Agents", "authors": "Zhiyong Wu, Zhenyu Wu, Fangzhi Xu, Yian Wang, Qiushi Sun, Chengyou Jia, Kanzhi Cheng, Zichen Ding, L. Chen, Paul Pu Liang, Yu Qiao", "abstract": "Existing efforts in building GUI agents heavily rely on the availability of robust commercial Vision-Language Models (VLMs) such as GPT-4o and GeminiProVision. Practitioners are often reluctant to use open-source VLMs due to their significant performance lag compared to their closed-source counterparts, particularly in GUI grounding and Out-Of-Distribution (OOD) scenarios. To facilitate future research in this area, we developed OS-Atlas - a foundational GUI action model that excels at GUI grounding and OOD agentic tasks through innovations in both data and modeling. We have invested significant engineering effort in developing an open-source toolkit for synthesizing GUI grounding data across multiple platforms, including Windows, Linux, MacOS, Android, and the web. Leveraging this toolkit, we are releasing the largest open-source cross-platform GUI grounding corpus to date, which contains over 13 million GUI elements. This dataset, combined with innovations in model training, provides a solid foundation for OS-Atlas to understand GUI screenshots and generalize to unseen interfaces. Through extensive evaluation across six benchmarks spanning three different platforms (mobile, desktop, and web), OS-Atlas demonstrates significant performance improvements over previous state-of-the-art models. Our evaluation also uncovers valuable insights into continuously improving and scaling the agentic capabilities of open-source VLMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.38081169128418, 5.487221717834473], "openalex_id": "https://openalex.org/W4404343134", "title": "Public Domain 12M: A Highly Aesthetic Image-Text Dataset with Novel Governance Mechanisms", "authors": "Joseph Meyer, Nick Padgett, C. J. Miller, Laura Exline", "abstract": "We present Public Domain 12M (PD12M), a dataset of 12.4 million high-quality public domain and CC0-licensed images with synthetic captions, designed for training text-to-image models. PD12M is the largest public domain image-text dataset to date, with sufficient size to train foundation models while minimizing copyright concerns. Through the Source.Plus platform, we also introduce novel, community-driven dataset governance mechanisms that reduce harm and support reproducibility over time.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.8772127628326416, 3.509063482284546], "openalex_id": "https://openalex.org/W4404342341", "title": "Toxicity of the Commons: Curating Open-Source Pre-Training Data", "authors": "Catherine Arnett, Eleanor Jones, Ivan P. Yamshchikov, Pierre-Carl Langlais", "abstract": "Open-source large language models are becoming increasingly available and popular among researchers and practitioners. While significant progress has been made on open-weight models, open training data is a practice yet to be adopted by the leading open-weight models creators. At the same time, there researchers are working to make language models safer. We propose a data curation pipeline to reduce harmful outputs by models trained on public domain data. There are unique challenges to working with public domain data, as these sources differ from web text in both form and content. Many sources are historical documents and are the result of Optical Character Recognition (OCR). Consequently, current state-of-the-art approaches to toxicity filtering are often infeasible or inappropriate for open data models. In this paper, we introduce a new fully open-source pipeline for open-data toxicity filtering. Our contributions are threefold. We create a custom training dataset, ToxicCommons, which is composed of texts which have been classified across five different dimensions (racial/origin-based, gender/sex-based, religious, ability-based discrimination, and violence). We use this dataset to train a custom classifier, Celadon, that can be used to detect toxic content in open data more efficiently at a larger scale. Finally, we describe the balanced approach to content filtration that optimizes safety filtering with respect to the filtered data available for training.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.301076650619507, 3.6379661560058594], "openalex_id": "https://openalex.org/W4403991032", "title": "Security strategies for AI systems in Industry 4.0", "authors": "Julius A. Bairaktaris, Arne Johannssen, Kim Phuc Tran", "abstract": "Abstract The recent emergence and widespread adoption of Artificial Intelligence (AI) across various industries have not only increased efficiency and innovation but also introduced complex security challenges. Addressing these security challenges is an ever\u2010evolving area of research that seeks to mitigate issues affecting a wide range of individuals and sectors. Understanding the threats and vulnerabilities posed by these systems, and how to effectively defend against them, has become more crucial than ever. This paper aims to provide an overview of the most common attack vectors and their respective defense strategies, focusing particularly on those relevant to Industry 4.0. A major area of interest is adversarial machine learning, a relatively new field that focuses on corrupting, confusing, and manipulating AI models by intervening in different phases of their life cycle. The key findings indicate that FLAME offers the best protection against data poisoning attacks for neural network image detection models in a federated learning environment. Additionally, due to the specific threat model in Industry 4.0, defensive distillation emerges as the most promising defense strategy against evasion attacks.", "venue": "Quality and Reliability Engineering International", "label": 0}, {"loc": [4.097919464111328, -2.4942805767059326], "openalex_id": "https://openalex.org/W4403991553", "title": "SEN-CTD: semantic enhancement network with content-title discrepancy for fake news detection", "authors": "Jiaqi Fang, Kun Ma, Yunhai Qiu, Ke Ji, Zhenxiang Chen, Bo Yang", "abstract": "Purpose The discrepancy between the content of an article and its title is a key characteristic of fake news. Current methods for detecting fake news often ignore the significant difference in length between the content and its title. In addition, relying solely on textual discrepancies between the title and content to distinguish between real and fake news has proven ineffective. The purpose of this paper is to develop a new approach called semantic enhancement network with content\u2013title discrepancy (SEN\u2013CTD), which enhances the accuracy of fake news detection. Design/methodology/approach The SEN\u2013CTD framework is composed of two primary modules: the SEN and the content\u2013title comparison network (CTCN). The SEN is designed to enrich the representation of news titles by integrating external information and position information to capture the context. Meanwhile, the CTCN focuses on assessing the consistency between the content of news articles and their corresponding titles examining both emotional tones and semantic attributes. Findings The SEN\u2013CTD model performs well on the GossipCop, PolitiFact and RealNews data sets, achieving accuracies of 80.28%, 86.88% and 84.96%, respectively. These results highlight its effectiveness in accurately detecting fake news across different types of content. Originality/value The SEN is specifically designed to improve the representation of extremely short texts, enhancing the depth and accuracy of analyses for brief content. The CTCN is tailored to examine the consistency between news titles and their corresponding content, ensuring a thorough comparative evaluation of both emotional and semantic discrepancies.", "venue": "International Journal of Web Information Systems", "label": 0}, {"loc": [3.76733660697937, 3.6252148151397705], "openalex_id": "https://openalex.org/W4404341683", "title": "Benchmarking LLM Guardrails in Handling Multilingual Toxicity", "authors": "Yahan Yang, Soham Dan, Dan Roth, Inseop Lee", "abstract": "With the ubiquity of Large Language Models (LLMs), guardrails have become crucial to detect and defend against toxic content. However, with the increasing pervasiveness of LLMs in multilingual scenarios, their effectiveness in handling multilingual toxic inputs remains unclear. In this work, we introduce a comprehensive multilingual test suite, spanning seven datasets and over ten languages, to benchmark the performance of state-of-the-art guardrails. We also investigates the resilience of guardrails against recent jailbreaking techniques, and assess the impact of in-context safety policies and language resource availability on guardrails' performance. Our findings show that existing guardrails are still ineffective at handling multilingual toxicity and lack robustness against jailbreaking prompts. This work aims to identify the limitations of guardrails and to build a more reliable and trustworthy LLMs in multilingual scenarios.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.6843128204345703, 2.6814284324645996], "openalex_id": "https://openalex.org/W4404340969", "title": "Safety cases for frontier AI", "authors": "Marie Davidsen Buhl, Gaurav Sett, Leonie Koessler, Jonas Schuett, Markus Anderljung", "abstract": "As frontier artificial intelligence (AI) systems become more capable, it becomes more important that developers can explain why their systems are sufficiently safe. One way to do so is via safety cases: reports that make a structured argument, supported by evidence, that a system is safe enough in a given operational context. Safety cases are already common in other safety-critical industries such as aviation and nuclear power. In this paper, we explain why they may also be a useful tool in frontier AI governance, both in industry self-regulation and government regulation. We then discuss the practicalities of safety cases, outlining how to produce a frontier AI safety case and discussing what still needs to happen before safety cases can substantially inform decisions.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.104614734649658, 1.0568372011184692], "openalex_id": "https://openalex.org/W4404341539", "title": "CHORDONOMICON: A Dataset of 666,000 Songs and their Chord Progressions", "authors": "Spyridon Kantarelis, \u039aonstantinos Thomas, Vassilis Lyberatos, Edmund Dervakos, Giorgos Stamou", "abstract": "Chord progressions encapsulate important information about music, pertaining to its structure and conveyed emotions. They serve as the backbone of musical composition, and in many cases, they are the sole information required for a musician to play along and follow the music. Despite their importance, chord progressions as a data domain remain underexplored. There is a lack of large-scale datasets suitable for deep learning applications, and limited research exploring chord progressions as an input modality. In this work, we present Chordonomicon, a dataset of over 666,000 songs and their chord progressions, annotated with structural parts, genre, and release date - created by scraping various sources of user-generated progressions and associated metadata. We demonstrate the practical utility of the Chordonomicon dataset for classification and generation tasks, and discuss its potential to provide valuable insights to the research community. Chord progressions are unique in their ability to be represented in multiple formats (e.g. text, graph) and the wealth of information chords convey in given contexts, such as their harmonic function. These characteristics make the Chordonomicon an ideal testbed for exploring advanced machine learning techniques, including transformers, graph machine learning, and hybrid systems that combine knowledge representation and machine learning.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.123032569885254, 0.512498140335083], "openalex_id": "https://openalex.org/W4404341096", "title": "RELATE: A Modern Processing Platform for Romanian Language", "authors": "Vasile P\u0103i\u0219, Radu Ion, Andrei-Marius Avram, Maria Mitrofan, Dan Tufi\u015f", "abstract": "This paper presents the design and evolution of the RELATE platform. It provides a high-performance environment for natural language processing activities, specially constructed for Romanian language. Initially developed for text processing, it has been recently updated to integrate audio processing tools. Technical details are provided with regard to core components. We further present different usage scenarios, derived from actual use in national and international research projects, thus demonstrating that RELATE is a mature, modern, state-of-the-art platform for processing Romanian language corpora. Finally, we present very recent developments including bimodal (text and audio) features available within the platform.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.561121463775635, 2.3295254707336426], "openalex_id": "https://openalex.org/W4404048205", "title": "Directionality and representativeness are differentiable components of stereotypes in large language models", "authors": "Gandalf Nicol\u00e1s, Aylin Caliskan", "abstract": "Abstract Representativeness is a relevant but unexamined property of stereotypes in language models. Existing auditing and debiasing approaches address the direction of stereotypes, such as whether a social category (e.g. men, women) is associated more with incompetence vs. competence content. On the other hand, representativeness is the extent to which a social category's stereotypes are about a specific content dimension, such as Competence, regardless of direction (e.g. as indicated by how often dimension-related words appear in stereotypes about the social category). As such, two social categories may be associated with competence (vs. incompetence), yet one category's stereotypes are mostly about competence, whereas the other's are mostly about alternative content (e.g. Warmth). Such differentiability would suggest that direction-based auditing may fail to identify biases in content representativeness. Here, we use a large sample of social categories that are salient in American society (based on gender, race, occupation, and others) to examine whether representativeness is an independent feature of stereotypes in the ChatGPT chatbot and SBERT language model. We focus on the Warmth and Competence stereotype dimensions, given their well-established centrality in human stereotype content. Our results provide evidence for the construct differentiability of direction and representativeness for Warmth and Competence stereotypes across models and target stimuli (social category terms, racialized name exemplars). Additionally, both direction and representativeness uniquely predicted the models' internal general valence (positivity vs. negativity) and human stereotypes. We discuss implications for the use of AI in the study of human cognition and the field of fairness in AI.", "venue": "PNAS Nexus", "label": 0}, {"loc": [9.09463882446289, -0.8770036101341248], "openalex_id": "https://openalex.org/W4404313614", "title": "Think Carefully and Check Again! Meta-Generation Unlocking LLMs for Low-Resource Cross-Lingual Summarization", "authors": "Zhecheng Li, Yiwei Wang, Bryan Hooi, Yujun Cai, Naifan Cheung, Nanyun Peng, Kai-Wei Chang", "abstract": "Cross-lingual summarization (CLS) aims to generate a summary for the source text in a different target language. Currently, instruction-tuned large language models (LLMs) excel at various English tasks. However, unlike languages such as English, Chinese or Spanish, for those relatively low-resource languages with limited usage or data, recent studies have shown that LLMs' performance on CLS tasks remains unsatisfactory even with few-shot settings. This raises the question: Are LLMs capable of handling cross-lingual summarization tasks for low-resource languages? To resolve this question, we fully explore the potential of large language models on cross-lingual summarization task for low-resource languages through our four-step zero-shot method: Summarization, Improvement, Translation and Refinement (SITR) with correspondingly designed prompts. We test our proposed method with multiple LLMs on two well-known cross-lingual summarization datasets with various low-resource target languages. The results show that: i) GPT-3.5 and GPT-4 significantly and consistently outperform other baselines when using our zero-shot SITR methods. ii) By employing our proposed method, we unlock the potential of LLMs, enabling them to effectively handle cross-lingual summarization tasks for relatively low-resource languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.10078239440918, -1.204546332359314], "openalex_id": "https://openalex.org/W4404313857", "title": "A Survey of Large Language Models for Arabic Language and its Dialects", "authors": "Malak Mashaabi, Shahad Al-Khalifa, Hend S. Al\u2010Khalifa", "abstract": "This survey offers a comprehensive overview of Large Language Models (LLMs) designed for Arabic language and its dialects. It covers key architectures, including encoder-only, decoder-only, and encoder-decoder models, along with the datasets used for pre-training, spanning Classical Arabic, Modern Standard Arabic, and Dialectal Arabic. The study also explores monolingual, bilingual, and multilingual LLMs, analyzing their architectures and performance across downstream tasks, such as sentiment analysis, named entity recognition, and question answering. Furthermore, it assesses the openness of Arabic LLMs based on factors, such as source code availability, training data, model weights, and documentation. The survey highlights the need for more diverse dialectal datasets and attributes the importance of openness for research reproducibility and transparency. It concludes by identifying key challenges and opportunities for future research and stressing the need for more inclusive and representative models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.530293941497803, 4.725552558898926], "openalex_id": "https://openalex.org/W4404349626", "title": "EDGE: Enhanced Grounded GUI Understanding with Enriched Multi-Granularity Synthetic Data", "authors": "Xuetian Chen, Hangcheng Li, Jiaqing Liang, Sihang Jiang, Deqing Yang", "abstract": "Autonomous agents operating on the graphical user interfaces (GUIs) of various applications hold immense practical value. Unlike the large language model (LLM)-based methods which rely on structured texts and customized backends, the approaches using large vision-language models (LVLMs) are more intuitive and adaptable as they can visually perceive and directly interact with screens, making them indispensable in general scenarios without text metadata and tailored backends. Given the lack of high-quality training data for GUI-related tasks in existing work, this paper aims to enhance the GUI understanding and interacting capabilities of LVLMs through a data-driven approach. We propose EDGE, a general data synthesis framework that automatically generates large-scale, multi-granularity training data from webpages across the Web. Evaluation results on various GUI and agent benchmarks demonstrate that the model trained with the dataset generated through EDGE exhibits superior webpage understanding capabilities, which can then be easily transferred to previously unseen desktop and mobile environments. Our approach significantly reduces the dependence on manual annotations, empowering researchers to harness the vast public resources available on the Web to advance their work. Our source code, the dataset and the model are available at https://anonymous.4open.science/r/EDGE-1CDB.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.191796779632568, -1.6303691864013672], "openalex_id": "https://openalex.org/W4403837098", "title": "Leveraging Social Media and Deep Learning for Sentiment Analysis for Smart Governance: A Case Study of Public Reactions to Educational Reforms in Saudi Arabia", "authors": "Alanoud Alotaibi, Farrukh Nadeem", "abstract": "The Saudi government\u2019s educational reforms aim to align the system with market needs and promote economic opportunities. However, a lack of credible data makes assessing public sentiment towards these reforms challenging. This research develops a sentiment analysis application to analyze public emotional reactions to educational reforms in Saudi Arabia using AraBERT, an Arabic language model. We constructed a unique Arabic dataset of 216,858 tweets related to the reforms, with 2000 manually labeled for public sentiment. To establish a robust evaluation framework, we employed random forests, support vector machines, and logistic regression as baseline models alongside AraBERT. We also compared the fine-tuned AraBERT Sentiment Classification model with CAMeLBERT, MARBERT, and LLM (GPT) models. The fine-tuned AraBERT model had an F1 score of 0.89, which was above the baseline models by 5% and demonstrated a 4% improvement compared to other pre-trained transformer models applied to this task. This highlights the advantage of transformer models specifically trained for the target language and domain (Arabic). Arabic-specific sentiment analysis models outperform multilingual models for this task. Overall, this study demonstrates the effectiveness of AraBERT in analyzing Arabic sentiment on social media. This approach has the potential to inform educational reform evaluation in Saudi Arabia and potentially other Arabic-speaking regions.", "venue": "Computers", "label": 0}, {"loc": [7.762104511260986, 3.990405559539795], "openalex_id": "https://openalex.org/W4404310986", "title": "Read-ME: Refactorizing LLMs as Router-Decoupled Mixture of Experts with System Co-Design", "authors": "Ruisi Cai, Yeonju Ro, Geon-Woo Kim, Peihao Wang, Babak Ehteshami Bejnordi, Aditya Akella, Shuicheng Yan", "abstract": "The proliferation of large language models (LLMs) has led to the adoption of Mixture-of-Experts (MoE) architectures that dynamically leverage specialized subnetworks for improved efficiency and performance. Despite their benefits, MoE models face significant challenges during inference, including inefficient memory management and suboptimal batching, due to misaligned design choices between the model architecture and the system policies. Furthermore, the conventional approach of training MoEs from scratch is increasingly prohibitive in terms of cost. In this paper, we propose a novel framework Read-ME that transforms pre-trained dense LLMs into smaller MoE models (in contrast to \"upcycling\" generalist MoEs), avoiding the high costs of ground-up training. Our approach employs activation sparsity to extract experts. To compose experts, we examine the widely-adopted layer-wise router design and show its redundancy, and thus we introduce the pre-gating router decoupled from the MoE backbone that facilitates system-friendly pre-computing and lookahead scheduling, enhancing expert-aware batching and caching. Our codesign therefore addresses critical gaps on both the algorithmic and system fronts, establishing a scalable and efficient alternative for LLM inference in resource-constrained settings. Read-ME outperforms other popular open-source dense models of similar scales, achieving improvements of up to 10.1% on MMLU, and improving mean end-to-end latency up to 6.1%. Codes are available at: https://github.com/VITA-Group/READ-ME.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.559908866882324, 1.8270676136016846], "openalex_id": "https://openalex.org/W4403825867", "title": "Emerging Roots: Investigating Early Access to Meaning in Maltese Auditory Word Recognition", "authors": "Jessica Nieder, Ruben van de Vijver, Adam Ussishkin", "abstract": "Abstract In Semitic languages, the consonantal root is central to morphology, linking form and meaning. While psycholinguistic studies highlight its importance in language processing, the role of meaning in early lexical access and its representation remain unclear. This study investigates when meaning becomes accessible during the processing of Maltese verb forms, using a computational model based on the Discriminative Lexicon framework. Our model effectively comprehends and produces Maltese verbs, while also predicting response times in a masked auditory priming experiment. Results show that meaning is accessible early in lexical access and becomes more prominent after the target word is fully processed. This suggests that semantic information plays a critical role from the initial stages of lexical access, refining our understanding of real\u2010time language comprehension. Our findings contribute to theories of lexical access and offer valuable insights for designing priming studies in psycholinguistics. Additionally, this study demonstrates the potential of computational models in investigating the relationship between form and meaning in language processing.", "venue": "Cognitive Science", "label": 0}, {"loc": [7.775656700134277, 3.9342517852783203], "openalex_id": "https://openalex.org/W4404310986", "title": "$\\textit {Read-ME} $: Refactorizing LLMs as Router-Decoupled Mixture of Experts with System Co-Design", "authors": "Ruisi Cai, Yeonju Ro, Geon-Woo Kim, Peihao Wang, Babak Ehteshami Bejnordi, Aditya Akella, Shuicheng Yan", "abstract": "The proliferation of large language models (LLMs) has led to the adoption of Mixture-of-Experts (MoE) architectures that dynamically leverage specialized subnetworks for improved efficiency and performance. Despite their benefits, MoE models face significant challenges during inference, including inefficient memory management and suboptimal batching, due to misaligned design choices between the model architecture and the system policies. Furthermore, the conventional approach of training MoEs from scratch is increasingly prohibitive in terms of cost. In this paper, we propose a novel framework Read-ME that transforms pre-trained dense LLMs into smaller MoE models (in contrast to \"upcycling\" generalist MoEs), avoiding the high costs of ground-up training. Our approach employs activation sparsity to extract experts. To compose experts, we examine the widely-adopted layer-wise router design and show its redundancy, and thus we introduce the pre-gating router decoupled from the MoE backbone that facilitates system-friendly pre-computing and lookahead scheduling, enhancing expert-aware batching and caching. Our codesign therefore addresses critical gaps on both the algorithmic and system fronts, establishing a scalable and efficient alternative for LLM inference in resource-constrained settings. Read-ME outperforms other popular open-source dense models of similar scales, achieving improvements of up to 10.1% on MMLU, and improving mean end-to-end latency up to 6.1%. Codes are available at: https://github.com/VITA-Group/READ-ME.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.319882392883301, 0.9516259431838989], "openalex_id": "https://openalex.org/W4403846367", "title": "Natural Language, Legal Hurdles: Navigating the Complexities in Natural Language Processing Development and Application", "authors": "Ilya Ilin, Aleksei Kelli", "abstract": "This article delves into the legal challenges faced in developing and deploying Natural Language Processing (NLP) technologies, focusing particularly on the European Union\u2019s legal framework, especially the DSM Directive, the InfoSoc Directive, and the Artificial Intelligence Act. It addresses the legal status and accessibility of language data and the development of NLP technologies under both contractual and exception-based models. The authors acknowledge the partial truth in the saying, \u201cUS innovates, China replicates, and the EU regulates\u201d. Although Europe\u2019s AI sector is a global competitor and its strict regulations ensure ethical standards and data protection, these regulations might not necessarily boost competitiveness. Such stringent regulations can introduce complexities that may inhibit innovation relative to regions with more lenient policies.", "venue": "JOURNAL OF THE UNIVERSITY OF LATVIA LAW", "label": 0}, {"loc": [6.4594926834106445, 5.366335868835449], "openalex_id": "https://openalex.org/W4404307423", "title": "Probabilistic Language-Image Pre-Training", "authors": "Sanghyuk Chun, Wonjae Kim, Song Park, Sangdoo Yun", "abstract": "Vision-language models (VLMs) embed aligned image-text pairs into a joint space but often rely on deterministic embeddings, assuming a one-to-one correspondence between images and texts. This oversimplifies real-world relationships, which are inherently many-to-many, with multiple captions describing a single image and vice versa. We introduce Probabilistic Language-Image Pre-training (ProLIP), the first probabilistic VLM pre-trained on a billion-scale image-text dataset using only probabilistic objectives, achieving a strong zero-shot capability (e.g., 74.6% ImageNet zero-shot accuracy with ViT-B/16). ProLIP efficiently estimates uncertainty by an \"uncertainty token\" without extra parameters. We also introduce a novel inclusion loss that enforces distributional inclusion relationships between image-text pairs and between original and masked inputs. Experiments demonstrate that, by leveraging uncertainty estimates, ProLIP benefits downstream tasks and aligns with intuitive notions of uncertainty, e.g., shorter texts being more uncertain and more general inputs including specific ones. Utilizing text uncertainties, we further improve ImageNet accuracy from 74.6% to 75.8% (under a few-shot setting), supporting the practical advantages of our probabilistic approach. The code is available at https://github.com/naver-ai/prolip", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.388495445251465, 2.3645899295806885], "openalex_id": "https://openalex.org/W4404306305", "title": "ZIP-FIT: Embedding-Free Data Selection via Compression-Based Alignment", "authors": "Elyas Obbad, Iddah Mlauzi, Brando Miranda, Rylan Schaeffer, Kamal Obbad, Suhana Bedi, Sanmi Koyejo", "abstract": "Data selection is crucial for optimizing language model (LM) performance on specific tasks, yet most existing methods fail to effectively consider the target task distribution. Current approaches either ignore task-specific requirements entirely or rely on approximations that fail to capture the nuanced patterns needed for tasks like Autoformalization or code generation. Methods that do consider the target distribution often rely on simplistic, sometimes noisy, representations, like hashed n-gram features, which can lead to collisions and introduce noise. We introduce ZIP-FIT, a data selection framework that uses gzip compression to directly measure alignment between potential training data and the target task distribution. In extensive evaluations on Autoformalization and Python code generation, ZIP-FIT significantly outperforms leading baselines like DSIR and D4. Models trained on ZIP-FIT-selected data achieve their lowest cross-entropy loss up to 85.1\\% faster than baselines, demonstrating that better task alignment leads to more efficient learning. In addition, ZIP-FIT performs selection up to 65.8\\% faster than DSIR and two orders of magnitude faster than D4. Notably, ZIP-FIT shows that smaller, well-aligned datasets often outperform larger but less targeted ones, demonstrating that a small amount of higher quality data is superior to a large amount of lower quality data. Our results imply that task-aware data selection is crucial for efficient domain adaptation, and that compression offers a principled way to measure task alignment. By showing that targeted data selection can dramatically improve task-specific performance, our work provides new insights into the relationship between data quality, task alignment, and model learning efficiency.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.407611846923828, 1.7950196266174316], "openalex_id": "https://openalex.org/W4404306246", "title": "TabDPT: Scaling Tabular Foundation Models", "authors": "Junwei Ma, Valentin Thomas, Rasa Hosseinzadeh, Hamidreza Kamkari, Alex Labach, Jesse C. Cresswell, Keyvan Golestan, G. H. Yu, Maksims Volkovs, Anthony L. Caterini", "abstract": "Tabular data is one of the most ubiquitous sources of information worldwide, spanning a wide variety of domains. This inherent heterogeneity has slowed the development of Tabular Foundation Models (TFMs) capable of fast generalization to unseen datasets. In-Context Learning (ICL) has recently emerged as a promising solution for TFMs, enabling dynamic adaptation to new tasks without additional tuning. While many studies have attempted to re-purpose large language models for tabular ICL, they have had limited success, so recent works have focused on developing tabular-specific foundation models. In this work, we propose an approach to combine ICL-based retrieval with self supervised learning to train tabular foundation models. We also investigate the utility of real vs. synthetic data for model pre-training, and show that real data can contain useful signal not easily captured in synthetic training. Specifically, we show that incorporating real data during the pre-training phase can lead to significantly faster training and better downstream generalization to unseen data. Our resulting model, TabDPT, achieves top performance on both regression (CTR23) and classification (CC18) benchmarks. Importantly, we also demonstrate that with our pre-training procedure, scaling both model and data size leads to consistent performance improvements that follow power laws. This echoes scaling laws in LLMs and other foundation models, and suggests that Internet-scale TFMs can be achievable. We open-source our full pipeline: inference code including trained model weights can be found at github.com/layer6ai-labs/TabDPT-inference, and the training code to reproduce experiments can be found at github.com/layer6ai-labs/TabDPT-training.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.081954956054688, 0.774367094039917], "openalex_id": "https://openalex.org/W4403756543", "title": "Dynamic Context-Aware Representation for Semantic Alignment in Large Language Models", "authors": "Jason Baronova, Catherine Stevens, Logan Tennant, A. G. MacPhee", "abstract": "The capacity of modern neural networks to generate human-like text is accompanied by ongoing challenges in maintaining semantic coherence, particularly across dynamically evolving contexts in long-form text generation. Dynamic Context-Aware Representation (DCAR) addresses this limitation through a novel mechanism that enables continuous recalibration of context vectors, ensuring more accurate semantic alignment throughout the generation process. Through the integration of a dynamic adjustment layer within a state-of-the-art transformer-based LLM, significant improvements were observed in perplexity, BLEU score, and semantic coherence, especially in cases where traditional static embeddings fall short. Experimental results validated the effectiveness of DCAR in managing context shifts fluidly, with minimal computational overhead, providing a flexible yet powerful solution for enhancing the performance of LLMs in handling complex, multi-turn conversations and extended text. The findings suggest that the application of DCAR offers a substantial leap in both the accuracy and adaptability of LLM architectures, enabling more precise and consistent generation of language across a variety of domains. These advancements position DCAR as a transformative step in overcoming the inherent limitations of static context representations in language models, pushing the boundaries of contextual comprehension and generation in neural networks.", "venue": "https://doi.org/10.31219/osf.io/svcn3", "label": 0}, {"loc": [8.414996147155762, -0.17286112904548645], "openalex_id": "https://openalex.org/W4404306389", "title": "Multilingual Hallucination Gaps in Large Language Models", "authors": "Cl\u00e9a Chataigner, Afaf Ta\u00efk, Golnoosh Farnadi", "abstract": "Large language models (LLMs) are increasingly used as alternatives to traditional search engines given their capacity to generate text that resembles human language. However, this shift is concerning, as LLMs often generate hallucinations, misleading or false information that appears highly credible. In this study, we explore the phenomenon of hallucinations across multiple languages in freeform text generation, focusing on what we call multilingual hallucination gaps. These gaps reflect differences in the frequency of hallucinated answers depending on the prompt and language used. To quantify such hallucinations, we used the FactScore metric and extended its framework to a multilingual setting. We conducted experiments using LLMs from the LLaMA, Qwen, and Aya families, generating biographies in 19 languages and comparing the results to Wikipedia pages. Our results reveal variations in hallucination rates, especially between high and low resource languages, raising important questions about LLM multilingual performance and the challenges in evaluating hallucinations in multilingual freeform text generation.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.046440601348877, -2.354745864868164], "openalex_id": "https://openalex.org/W4404307062", "title": "Health Misinformation in Social Networks: A Survey of IT Approaches", "authors": "Vasiliki Papanikou, Panagiotis Papadakos, Theodora Karamanidou, Thanos G. Stavropoulos, Evaggelia Pitoura, Panayiotis Tsaparas", "abstract": "In this paper, we present a comprehensive survey on the pervasive issue of medical misinformation in social networks from the perspective of information technology. The survey aims at providing a systematic review of related research and helping researchers and practitioners navigate through this fast-changing field. Specifically, we first present manual and automatic approaches for fact-checking. We then explore fake news detection methods, using content, propagation features, or source features, as well as mitigation approaches for countering the spread of misinformation. We also provide a detailed list of several datasets on health misinformation and of publicly available tools. We conclude the survey with a discussion on the open challenges and future research directions in the battle against health misinformation.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.964996337890625, 2.571262836456299], "openalex_id": "https://openalex.org/W4404307871", "title": "MojoBench: Language Modeling and Benchmarks for Mojo", "authors": "Nishat Raihan, Joanna C. S. Santos, Marcos Zampieri", "abstract": "The recently introduced Mojo programming language (PL) by Modular, has received significant attention in the scientific community due to its claimed significant speed boost over Python. Despite advancements in code Large Language Models (LLMs) across various PLs, Mojo remains unexplored in this context. To address this gap, we introduce MojoBench, the first framework for Mojo code generation. MojoBench includes HumanEval-Mojo, a benchmark dataset designed for evaluating code LLMs on Mojo, and Mojo-Coder, the first LLM pretrained and finetuned for Mojo code generation, which supports instructions in 5 natural languages (NLs). Our results show that Mojo-Coder achieves a 30-35% performance improvement over leading models like GPT-4o and Claude-3.5-Sonnet. Furthermore, we provide insights into LLM behavior with underrepresented and unseen PLs, offering potential strategies for enhancing model adaptability. MojoBench contributes to our understanding of LLM capabilities and limitations in emerging programming paradigms fostering more robust code generation systems.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.767268180847168, 0.4674074053764343], "openalex_id": "https://openalex.org/W4404305059", "title": "Responsible Multilingual Large Language Models: A Survey of Development, Applications, and Societal Impact", "authors": "Junhua Liu, Bin Fu", "abstract": "Multilingual Large Language Models (MLLMs) represent a pivotal advancement in democratizing artificial intelligence across linguistic boundaries. While theoretical foundations are well-established, practical implementation guidelines remain scattered. This work bridges this gap by providing a comprehensive end-to-end framework for developing and deploying MLLMs in production environments. We make three distinctive contributions: First, we present an actionable pipeline from data pre-processing through deployment, integrating insights from academic research and industrial applications. Second, using Llama2 as a case study, we provide detailed optimization strategies for enhancing multilingual capabilities, including curriculum learning approaches for balancing high-resource and low-resource languages, tokenization strategies, and effective sampling methods. Third, we offer an interdisciplinary analysis that considers technical, linguistic, and cultural perspectives in MLLM development. Our findings reveal critical challenges in supporting linguistic diversity, with 88.38% of world languages categorized as low-resource, affecting over a billion speakers. We examine practical solutions through real-world applications in customer service, search engines, and machine translation. By synthesizing theoretical frameworks with production-ready implementation strategies, this survey provides essential guidance for practitioners and researchers working to develop more inclusive and effective multilingual AI systems.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.079184532165527, 0.18246975541114807], "openalex_id": "https://openalex.org/W4404305023", "title": "MM-Eval: A Multilingual Meta-Evaluation Benchmark for LLM-as-a-Judge and Reward Models", "authors": "Guijin Son, Dongkeun Yoon, Juyoung Suk, Javier Aula-Blasco, Mehmet Aslan, V. Kim, Shayekh Bin Islam, Jaume Prats-Cristi\u00e0, Luc\u00eda Tormo-Ba\u00f1uelos, Seungone Kim", "abstract": "As Large Language Models (LLMs) are now capable of producing fluent and coherent content in languages other than English, it is not imperative to precisely evaluate these non-English outputs. However, when assessing the outputs from mutlilingual LLMs, prior works often employed LLM based evaluators that excel at assessing English outputs, without a thorough examination of whether these evaluators could effectively assess non-English text as well. Moreover, existing benchmarks to test evaluator LLMs (referred to as \"meta-evaluation benchmarks\") are mostly English-centric. To bridge this gap and examine whether evaluator LLMs can reliably assess the outputs of multilingual LLMs, we introduce MM-Eval, a multilingual meta-evaluation benchmark comprising five core subsets covering 18 languages and a Language Consistency subset spanning 122 languages. A core attribute of MM-Eval is that, instead of merely translating existing English meta-evaluation benchmarks, it is designed with multilingual-specific challenges in mind. Additionally, unlike existing meta-evaluation benchmarks that focus solely on ranking accuracy over pairwise data, MM-Eval also evaluates the consistency and fairness of absolute score values across a wide range of languages. Our results show that existing evaluator LLMs that excel in English contexts have considerable room for improvement when assessing non-English outputs. Furthermore, we find that evaluators are unfair and inconsistent when evaluating lower-resourced languages. Finally, we validate MM-Eval by measuring its correlation with Best-of-N rankings, finding a significantly stronger correlation compared to other meta-evaluation benchmarks. We publicly release our benchmark and code.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.886934280395508, 1.4691565036773682], "openalex_id": "https://openalex.org/W4404307915", "title": "Scaling Diffusion Language Models via Adaptation from Autoregressive Models", "authors": "Shansan Gong, Shivam Agarwal, Yizhe Zhang, Jiacheng Ye, Lin Zheng, Mukai Li, Chenxin An, Peilin Zhao, Wei Bi, Jiawei Han, Peng Hao, Lingpeng Kong", "abstract": "Diffusion Language Models (DLMs) have emerged as a promising new paradigm for text generative modeling, potentially addressing limitations of autoregressive (AR) models. However, current DLMs have been studied at a smaller scale compared to their AR counterparts and lack fair comparison on language modeling benchmarks. Additionally, training diffusion models from scratch at scale remains challenging. Given the prevalence of open-source AR language models, we propose adapting these models to build text diffusion models. We demonstrate connections between AR and diffusion modeling objectives and introduce a simple continual pre-training approach for training diffusion models. Through systematic evaluation on language modeling, reasoning, and commonsense benchmarks, we show that we can convert AR models ranging from 127M to 7B parameters (GPT2 and LLaMA) into diffusion models DiffuGPT and DiffuLLaMA, using less than 200B tokens for training. Our experimental results reveal that these models outperform earlier DLMs and are competitive with their AR counterparts. We release a suite of DLMs (127M-355M-7B) capable of generating fluent text, performing in-context learning, filling in the middle without prompt re-ordering, and following instructions https://github.com/HKUNLP/DiffuLLaMA.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.612024307250977, 0.6520419120788574], "openalex_id": "https://openalex.org/W4404308443", "title": "Key Algorithms for Keyphrase Generation: Instruction-Based LLMs for Russian Scientific Keyphrases", "authors": "Anna Glazkova, Dmitry Morozov, Timur Garipov", "abstract": "Keyphrase selection is a challenging task in natural language processing that has a wide range of applications. Adapting existing supervised and unsupervised solutions for the Russian language faces several limitations due to the rich morphology of Russian and the limited number of training datasets available. Recent studies conducted on English texts show that large language models (LLMs) successfully address the task of generating keyphrases. LLMs allow achieving impressive results without task-specific fine-tuning, using text prompts instead. In this work, we access the performance of prompt-based methods for generating keyphrases for Russian scientific abstracts. First, we compare the performance of zero-shot and few-shot prompt-based methods, fine-tuned models, and unsupervised methods. Then we assess strategies for selecting keyphrase examples in a few-shot setting. We present the outcomes of human evaluation of the generated keyphrases and analyze the strengths and weaknesses of the models through expert assessment. Our results suggest that prompt-based methods can outperform common baselines even using simple text prompts.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.50346851348877, 3.5646963119506836], "openalex_id": "https://openalex.org/W4404308019", "title": "AdaRankGrad: Adaptive Gradient-Rank and Moments for Memory-Efficient LLMs Training and Fine-Tuning", "authors": "Yehonathan Refael, Jonathan Svirsky, Boris Shustin, Wasim Huleihel, Ofir Lindenbaum", "abstract": "Training and fine-tuning large language models (LLMs) come with challenges related to memory and computational requirements due to the increasing size of the model weights and the optimizer states. Various techniques have been developed to tackle these challenges, such as low-rank adaptation (LoRA), which involves introducing a parallel trainable low-rank matrix to the fixed pre-trained weights at each layer. However, these methods often fall short compared to the full-rank weight training approach, as they restrict the parameter search to a low-rank subspace. This limitation can disrupt training dynamics and require a full-rank warm start to mitigate the impact. In this paper, we introduce a new method inspired by a phenomenon we formally prove: as training progresses, the rank of the estimated layer gradients gradually decreases, and asymptotically approaches rank one. Leveraging this, our approach involves adaptively reducing the rank of the gradients during Adam optimization steps, using an efficient online-updating low-rank projections rule. We further present a randomized SVD scheme for efficiently finding the projection matrix. Our technique enables full-parameter fine-tuning with adaptive low-rank gradient updates, significantly reducing overall memory requirements during training compared to state-of-the-art methods while improving model performance in both pretraining and fine-tuning. Finally, we provide a convergence analysis of our method and demonstrate its merits for training and fine-tuning language and biological foundation models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.769287586212158, -0.9339823722839355], "openalex_id": "https://openalex.org/W4404303862", "title": "Dialectal and Low Resource Machine Translation for Aromanian", "authors": "Alexandru-Iulius Jerpelea, Anamaria R\u0103doi, Sergiu Nisioi", "abstract": "This paper presents the process of building a neural machine translation system with support for English, Romanian, and Aromanian - an endangered Eastern Romance language. The primary contribution of this research is twofold: (1) the creation of the most extensive Aromanian-Romanian parallel corpus to date, consisting of 79,000 sentence pairs, and (2) the development and comparative analysis of several machine translation models optimized for Aromanian. To accomplish this, we introduce a suite of auxiliary tools, including a language-agnostic sentence embedding model for text mining and automated evaluation, complemented by a diacritics conversion system for different writing standards. This research brings contributions to both computational linguistics and language preservation efforts by establishing essential resources for a historically under-resourced language. All datasets, trained models, and associated tools are public: https://huggingface.co/aronlp and https://arotranslate.com", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.170544624328613, 3.156355619430542], "openalex_id": "https://openalex.org/W4404305618", "title": "Beware of Calibration Data for Pruning Large Language Models", "authors": "Yixin Ji, Yang Xiang, Juntao Li, Qingrong Xia, Ping Li, Xinyu Duan, Zhefeng Wang, Min Zhang", "abstract": "As large language models (LLMs) are widely applied across various fields, model compression has become increasingly crucial for reducing costs and improving inference efficiency. Post-training pruning is a promising method that does not require resource-intensive iterative training and only needs a small amount of calibration data to assess the importance of parameters. Recent research has enhanced post-training pruning from different aspects but few of them systematically explore the effects of calibration data, and it is unclear if there exist better calibration data construction strategies. We fill this blank and surprisingly observe that calibration data is also crucial to post-training pruning, especially for high sparsity. Through controlled experiments on important influence factors of calibration data, including the pruning settings, the amount of data, and its similarity with pre-training data, we observe that a small size of data is adequate, and more similar data to its pre-training stage can yield better performance. As pre-training data is usually inaccessible for advanced LLMs, we further provide a self-generating calibration data synthesis strategy to construct feasible calibration data. Experimental results on recent strong open-source LLMs (e.g., DCLM, and LLaMA-3) show that the proposed strategy can enhance the performance of strong pruning methods (e.g., Wanda, DSnoT, OWL) by a large margin (up to $2.68\\%$). Code is available at https://github.com/Dereck0602/calibration_data.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.475367069244385, 5.512924671173096], "openalex_id": "https://openalex.org/W4404261354", "title": "MPDS: A Movie Posters Dataset for Image Generation with Diffusion Model", "authors": "Meng Xu, Tong Zhang, Fuyun Wang, Lei Yi, Xin Liu, Zhen Cui", "abstract": "Movie posters are vital for captivating audiences, conveying themes, and driving market competition in the film industry. While traditional designs are laborious, intelligent generation technology offers efficiency gains and design enhancements. Despite exciting progress in image generation, current models often fall short in producing satisfactory poster results. The primary issue lies in the absence of specialized poster datasets for targeted model training. In this work, we propose a Movie Posters DataSet (MPDS), tailored for text-to-image generation models to revolutionize poster production. As dedicated to posters, MPDS stands out as the first image-text pair dataset to our knowledge, composing of 373k+ image-text pairs and 8k+ actor images (covering 4k+ actors). Detailed poster descriptions, such as movie titles, genres, casts, and synopses, are meticulously organized and standardized based on public movie synopsis, also named movie-synopsis prompt. To bolster poster descriptions as well as reduce differences from movie synopsis, further, we leverage a large-scale vision-language model to automatically produce vision-perceptive prompts for each poster, then perform manual rectification and integration with movie-synopsis prompt. In addition, we introduce a prompt of poster captions to exhibit text elements in posters like actor names and movie titles. For movie poster generation, we develop a multi-condition diffusion framework that takes poster prompt, poster caption, and actor image (for personalization) as inputs, yielding excellent results through the learning of a diffusion model. Experiments demonstrate the valuable role of our proposed MPDS dataset in advancing personalized movie poster generation. MPDS is available at https://anonymous.4open.science/r/MPDS-373k-BD3B.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.777820587158203, 5.125595569610596], "openalex_id": "https://openalex.org/W4404388438", "title": "Mini-InternVL: A Flexible-Transfer Pocket Multimodal Model with 5% Parameters and 90% Performance", "authors": "Zhangwei Gao, Zhe Chen, Erfei Cui, Yi Ren, Weiyun Wang, Jinguo Zhu, Hao Tian, Shenglong Ye, Junjun He, Xiaolong Zhu, Liang L\u00fc, Tong Lu, Yu Qiao, J. P. Dai, Weiliang Wang", "abstract": "Multimodal large language models (MLLMs) have demonstrated impressive performance in vision-language tasks across a broad spectrum of domains. However, the large model scale and associated high computational costs pose significant challenges for training and deploying MLLMs on consumer-grade GPUs or edge devices, thereby hindering their widespread application. In this work, we introduce Mini-InternVL, a series of MLLMs with parameters ranging from 1B to 4B, which achieves 90% of the performance with only 5% of the parameters. This significant improvement in efficiency and effectiveness makes our models more accessible and applicable in various real-world scenarios. To further promote the adoption of our models, we develop a unified adaptation framework for Mini-InternVL, which enables our models to transfer and outperform specialized models in downstream tasks, including autonomous driving, medical images, and remote sensing. We believe that our study can provide valuable insights and resources to advance the development of efficient and effective MLLMs. Code is available at https://github.com/OpenGVLab/InternVL.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.499539375305176, 3.629767656326294], "openalex_id": "https://openalex.org/W4404089139", "title": "Natural GaLore: Accelerating GaLore for memory-efficient LLM Training and Fine-tuning", "authors": "Arijit Das", "abstract": "Training LLMs presents significant memory challenges due to growing size of data, weights, and optimizer states. Techniques such as data and model parallelism, gradient checkpointing, and offloading strategies address this issue but are often infeasible due to hardware constraints. To mitigate memory usage, alternative methods like Parameter-Efficient-Fine-Tuning (PEFT) and GaLore approximate weights or optimizer states. PEFT methods, such as LoRA, have gained popularity for fine-tuning LLMs, though they require a full-rank warm start. In contrast, GaLore allows full-parameter learning while being more memory-efficient. This work introduces Natural GaLore, a simple drop in replacement for AdamW, which efficiently applies the inverse Empirical Fisher Information Matrix to low-rank gradients using Woodbury's Identity. We demonstrate that incorporating second-order information speeds up optimization significantly, especially when the iteration budget is limited. Empirical pretraining on 60M, 130M, 350M, and 1.1B parameter Llama models on C4 data demonstrate significantly lower perplexity over GaLore without additional memory overhead. By fine-tuning RoBERTa on the GLUE benchmark using Natural GaLore, we demonstrate significant reduction in gap 86.05% vs 86.28% for full-finetuning. Furthermore, fine-tuning the TinyLlama 1.1B model for function calling using the TinyAgent framework shows that Natural GaLore achieving 83.09% accuracy on the TinyAgent dataset, significantly outperforms 16-bit LoRA at 80.06% and even surpasses GPT4-Turbo by 4%, all while using 30% less memory. All code to reproduce the results are available at: https://github.com/selfsupervised-ai/Natural-GaLore.git", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.748571872711182, -1.080444097518921], "openalex_id": "https://openalex.org/W4404088420", "title": "A survey of neural-network-based methods utilising comparable data for finding translation equivalents", "authors": "Michaela Denisov\u00e1, Pavel Rychl\u00fd", "abstract": "The importance of inducing bilingual dictionary components in many natural language processing (NLP) applications is indisputable. However, the dictionary compilation process requires extensive work and combines two disciplines, NLP and lexicography, while the former often omits the latter. In this paper, we present the most common approaches from NLP that endeavour to automatically induce one of the essential dictionary components, translation equivalents and focus on the neural-network-based methods using comparable data. We analyse them from a lexicographic perspective since their viewpoints are crucial for improving the described methods. Moreover, we identify the methods that integrate these viewpoints and can be further exploited in various applications that require them. This survey encourages a connection between the NLP and lexicography fields as the NLP field can benefit from lexicographic insights, and it serves as a helping and inspiring material for further research in the context of neural-network-based methods utilising comparable data.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.1466190814971924, -0.5361669063568115], "openalex_id": "https://openalex.org/W4404088443", "title": "Fine-tuning foundational models to code diagnoses from veterinary health records", "authors": "Mayla R. Boguslav, Adam Kiehl, David Kott, G. Joseph Strecker, Tracy L. Webb, Nadia T. Saklou, Terri Ward, Michael Kirby", "abstract": "Veterinary medical records represent a large data resource for application to veterinary and One Health clinical research efforts. Use of the data is limited by interoperability challenges including inconsistent data formats and data siloing. Clinical coding using standardized medical terminologies enhances the quality of medical records and facilitates their interoperability with veterinary and human health records from other sites. Previous studies, such as DeepTag and VetTag, evaluated the application of Natural Language Processing (NLP) to automate veterinary diagnosis coding, employing long short-term memory (LSTM) and transformer models to infer a subset of Systemized Nomenclature of Medicine - Clinical Terms (SNOMED-CT) diagnosis codes from free-text clinical notes. This study expands on these efforts by incorporating all 7,739 distinct SNOMED-CT diagnosis codes recognized by the Colorado State University (CSU) Veterinary Teaching Hospital (VTH) and by leveraging the increasing availability of pre-trained language models (LMs). 13 freely-available pre-trained LMs were fine-tuned on the free-text notes from 246,473 manually-coded veterinary patient visits included in the CSU VTH's electronic health records (EHRs), which resulted in superior performance relative to previous efforts. The most accurate results were obtained when expansive labeled data were used to fine-tune relatively large clinical LMs, but the study also showed that comparable results can be obtained using more limited resources and non-clinical LMs. The results of this study contribute to the improvement of the quality of veterinary EHRs by investigating accessible methods for automated coding and support both animal and human health research by paving the way for more integrated and comprehensive health databases that span species and institutions.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.313541412353516, 2.3489303588867188], "openalex_id": "https://openalex.org/W4404088026", "title": "What's New in My Data? Novelty Exploration via Contrastive Generation", "authors": "Masaru Isonuma, Ivan Titov", "abstract": "Fine-tuning is widely used to adapt language models for specific goals, often leveraging real-world data such as patient records, customer-service interactions, or web content in languages not covered in pre-training. These datasets are typically massive, noisy, and often confidential, making their direct inspection challenging. However, understanding them is essential for guiding model deployment and informing decisions about data cleaning or suppressing any harmful behaviors learned during fine-tuning. In this study, we introduce the task of novelty discovery through generation, which aims to identify novel properties of a fine-tuning dataset by generating examples that illustrate these properties. Our approach, Contrastive Generative Exploration (CGE), assumes no direct access to the data but instead relies on a pre-trained model and the same model after fine-tuning. By contrasting the predictions of these two models, CGE can generate examples that highlight novel characteristics of the fine-tuning data. However, this simple approach may produce examples that are too similar to one another, failing to capture the full range of novel phenomena present in the dataset. We address this by introducing an iterative version of CGE, where the previously generated examples are used to update the pre-trained model, and this updated model is then contrasted with the fully fine-tuned model to generate the next example, promoting diversity in the generated outputs. Our experiments demonstrate the effectiveness of CGE in detecting novel content, such as toxic language, as well as new natural and programming languages. Furthermore, we show that CGE remains effective even when models are fine-tuned using differential privacy techniques.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.2678091526031494, 2.4402287006378174], "openalex_id": "https://openalex.org/W4404088206", "title": "\" Confrontation or Acceptance\": Understanding Novice Visual Artists' Perception towards AI-assisted Art Creation", "authors": "Shu-Ning Zhang, Shixuan Li", "abstract": "The rise of Generative Artificial Intelligence (G-AI) has transformed the creative arts landscape by producing novel artwork, whereas in the same time raising ethical concerns. While previous studies have addressed these concerns from technical and societal viewpoints, there is a lack of discussion from an HCI perspective, especially considering the community's perception and the visual artists as human factors. Our study investigates G-AI's impact on visual artists and their relationship with GAI to inform HCI research. We conducted semi-structured interviews with 20 novice visual artists from an art college in the university with G-AI courses and practices. Our findings reveal (1) the mis-conception and the evolving adoption of visual artists, (2) the miscellaneous opinions of the society on visual artists' creative work, and (3) the co-existence of confrontation and collaboration between visual artists and G-AI. We explore future HCI research opportunities to address these issues.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.56733512878418, 3.5670082569122314], "openalex_id": "https://openalex.org/W4404089343", "title": "LDAdam: Adaptive Optimization from Low-Dimensional Gradient Statistics", "authors": "Thomas Robert, Mher Safaryan, Ionu\u021b-Vlad Modoranu, Dan Alistarh", "abstract": "We introduce LDAdam, a memory-efficient optimizer for training large models, that performs adaptive optimization steps within lower dimensional subspaces, while consistently exploring the full parameter space during training. This strategy keeps the optimizer's memory footprint to a fraction of the model size. LDAdam relies on a new projection-aware update rule for the optimizer states that allows for transitioning between subspaces, i.e., estimation of the statistics of the projected gradients. To mitigate the errors due to low-rank projection, LDAdam integrates a new generalized error feedback mechanism, which explicitly accounts for both gradient and optimizer state compression. We prove the convergence of LDAdam under standard assumptions, and show that LDAdam allows for accurate and efficient fine-tuning and pre-training of language models. Code is available at https://github.com/IST-DASLab/LDAdam", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.322694778442383, 5.419867992401123], "openalex_id": "https://openalex.org/W4404088282", "title": "How Many Van Goghs Does It Take to Van Gogh? Finding the Imitation Threshold", "authors": "Sahil Verma, Royi Rassin, Arnav Das, Gantavya Bhatt, Preethi Seshadri, Chirag Shah, Jeff Bilmes, Hannaneh Hajishirzi, Yanai Elazar", "abstract": "Text-to-image models are trained using large datasets collected by scraping image-text pairs from the internet. These datasets often include private, copyrighted, and licensed material. Training models on such datasets enables them to generate images with such content, which might violate copyright laws and individual privacy. This phenomenon is termed imitation -- generation of images with content that has recognizable similarity to its training images. In this work we study the relationship between a concept's frequency in the training dataset and the ability of a model to imitate it. We seek to determine the point at which a model was trained on enough instances to imitate a concept -- the imitation threshold. We posit this question as a new problem: Finding the Imitation Threshold (FIT) and propose an efficient approach that estimates the imitation threshold without incurring the colossal cost of training multiple models from scratch. We experiment with two domains -- human faces and art styles -- for which we create four datasets, and evaluate three text-to-image models which were trained on two pretraining datasets. Our results reveal that the imitation threshold of these models is in the range of 200-600 images, depending on the domain and the model. The imitation threshold can provide an empirical basis for copyright violation claims and acts as a guiding principle for text-to-image model developers that aim to comply with copyright and privacy laws. We release the code and data at \\url{https://github.com/vsahil/MIMETIC-2.git} and the project's website is hosted at \\url{https://how-many-van-goghs-does-it-take.github.io}.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.562875747680664, 2.173335075378418], "openalex_id": "https://openalex.org/W4403600496", "title": "The African Woman is Rhythmic and Soulful: Evaluation of Open-ended Generation for Implicit Biases", "authors": "Serene Lim, Mar\u00eda P\u00e9rez\u2010Ortiz", "abstract": "Abstract This paper investigates the subtle and often concealed biases present in Large Language Models (LLMs), focusing on implicit biases that may remain despite passing explicit bias tests. Implicit biases are significant because they influence the decisions made by these systems, potentially perpetuating stereotypes and discrimination, even when LLMs appear to function fairly. Traditionally, explicit bias tests or embedding-based methods are employed to detect bias, but these approaches can overlook more nuanced, implicit forms of bias. To address this, we introduce two novel psychological-inspired methodologies: the LLM Implicit Association Test (IAT) Bias and the LLM Decision Bias, designed to reveal and measure implicit biases through prompt-based and decision-making tasks. Additionally, open-ended generation tasks with thematic analysis of word generations and storytelling provide qualitative insights into the model's behavior. Our findings demonstrate that the LLM IAT Bias correlates with traditional methods and more effectively predicts downstream behaviors, as measured by the LLM Decision Bias, offering a more comprehensive framework for detecting subtle biases in AI systems. This research advances the field of AI ethics by proposing new methods to continually assess and mitigate biases in LLMs, highlighting the importance of qualitative and decision-focused evaluations to address challenges that previous approaches have not fully captured.", "venue": "https://doi.org/10.21203/rs.3.rs-5283007/v1", "label": 0}, {"loc": [3.312964677810669, -0.447249174118042], "openalex_id": "https://openalex.org/W4403995258", "title": "Self-Supervised Pre-Training with Joint-Embedding Predictive Architecture Boosts ECG Classification Performance", "authors": "Kuba Weimann, Tim Conrad", "abstract": "Accurate diagnosis of heart arrhythmias requires the interpretation of electrocardiograms (ECG), which capture the electrical activity of the heart. Automating this process through machine learning is challenging due to the need for large annotated datasets, which are difficult and costly to collect. To address this issue, transfer learning is often employed, where models are pre-trained on large datasets and fine-tuned for specific ECG classification tasks with limited labeled data. Self-supervised learning has become a widely adopted pre-training method, enabling models to learn meaningful representations from unlabeled datasets. In this work, we explore the joint-embedding predictive architecture (JEPA) for self-supervised learning from ECG data. Unlike invariance-based methods, JEPA does not rely on hand-crafted data augmentations, and unlike generative methods, it predicts latent features rather than reconstructing input data. We create a large unsupervised pre-training dataset by combining ten public ECG databases, amounting to over one million records. We pre-train Vision Transformers using JEPA on this dataset and fine-tune them on various PTB-XL benchmarks. Our results show that JEPA outperforms existing invariance-based and generative approaches, achieving an AUC of 0.945 on the PTB-XL all statements task. JEPA consistently learns the highest quality representations, as demonstrated in linear evaluations, and proves advantageous for pre-training even in the absence of additional data.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.53525447845459, 2.096011161804199], "openalex_id": "https://openalex.org/W4403600496", "title": "The African Woman is Rhythmic and Soulful: An Investigation of Implicit Biases in LLM Open-ended Text Generation", "authors": "Serene Lim, Mar\u00eda P\u00e9rez\u2010Ortiz", "abstract": "Abstract This paper investigates the subtle and often concealed biases present in Large Language Models (LLMs), focusing on implicit biases that may remain despite passing explicit bias tests. Implicit biases are significant because they influence the decisions made by these systems, potentially perpetuating stereotypes and discrimination, even when LLMs appear to function fairly. Traditionally, explicit bias tests or embedding-based methods are employed to detect bias, but these approaches can overlook more nuanced, implicit forms of bias. To address this, we introduce two novel psychological-inspired methodologies: the LLM Implicit Association Test (IAT) Bias and the LLM Decision Bias, designed to reveal and measure implicit biases through prompt-based and decision-making tasks. Additionally, open-ended generation tasks with thematic analysis of word generations and storytelling provide qualitative insights into the model's behavior. Our findings demonstrate that the LLM IAT Bias correlates with traditional methods and more effectively predicts downstream behaviors, as measured by the LLM Decision Bias, offering a more comprehensive framework for detecting subtle biases in AI systems. This research advances the field of AI ethics by proposing new methods to continually assess and mitigate biases in LLMs, highlighting the importance of qualitative and decision-focused evaluations to address challenges that previous approaches have not fully captured.", "venue": "https://doi.org/10.21203/rs.3.rs-5283007/v1", "label": 0}, {"loc": [8.62344741821289, 3.0758087635040283], "openalex_id": "https://openalex.org/W4403596762", "title": "Adaptive Gradient Enhancement for Optimizing Large Language Models: An Empirical Study on Open Source Architectures", "authors": "Rosa Kingston, William G. Johnson, George F. Murphy, Matthew L. Williams, Christopher F. Brown", "abstract": "The rapid evolution of artificial intelligence has underscored the need for more efficient training methodologies capable of overcoming the limitations posed by traditional optimization techniques. Introducing a novel approach that adapts the momentum of gradient updates during training presents a significant advancement, addressing challenges such as instability and suboptimal convergence rates that often hinder the performance of complex models. Through comprehensive experiments, the proposed technique demonstrates superior efficiency in learning, reduced training times, and enhanced generalization capabilities across diverse datasets, ultimately fostering a more robust framework for model optimization. The implications of this work extend beyond immediate performance improvements, paving the way for future explorations into scalable training solutions that can accommodate increasingly intricate architectures in the field of deep learning.", "venue": "https://doi.org/10.31219/osf.io/e6bjz", "label": 0}, {"loc": [7.803725242614746, -0.8917979001998901], "openalex_id": "https://openalex.org/W4403566679", "title": "Advancements in Natural Language Understanding-Driven Machine Translation: Focus on English and the Low Resource Dialectal Lusoga", "authors": "Azizi Wasike, Ismail Kamukama, Yusuf Abass Aleshinloye, Adeleke Raheem Ajiboye, Jamir Ssebadduka", "abstract": "This review explores recent advancements in Natural Language Understanding-driven Machine Translation (NLU-MT) with a focus on English and the low-resource dialectal Lusoga. A Low-resource language, such as Lusoga, faces significant challenges in Machine Translation (MT) due to the scarcity of high-quality parallel corpora, the complex morphology inherent in Bantu languages, and the dialectal variations within Lusoga itself, particularly between Lutenga and Lupakoyo. This paper examines the role of NLU-based MT systems in overcoming these challenges by shifting from word-for-word mapping to meaning-based translations, enabling better handling of these dialectal differences. We highlight the success of leveraging linguistic similarities between Lusoga and related languages, such as Luganda, to improve translation performance through multilingual transfer learning techniques. Key advancements include the use of transformer-based architectures such as Multilingual Bidirectional and Auto-Regressive Transformer (mBART) and Multilingual Text-To-Text Transfer Transformer (mT5), specifically selected for their effectiveness in NLU-driven contexts, which have shown promise in enhancing translation accuracy for African low-resource languages. However, the review also identifies ongoing obstacles, including historical low demand and the lack of well-developed corpora, which hinder scalability. The paper concludes by emphasizing the potential of hybrid approaches that combine community-driven corpus-building initiatives with improved model architectures to drive further progress in low-resource MT. Ultimately, NLU-MT is positioned as a crucial tool not only for bridging communication gaps but also for preserving linguistic diversity and cultural heritage.", "venue": "International Journal of Innovative Science and Research Technology (IJISRT)", "label": 0}, {"loc": [6.21665620803833, 5.7392683029174805], "openalex_id": "https://openalex.org/W4403560362", "title": "Unsupervised Video Moment Retrieval with Knowledge-based Pseudo Supervision Construction", "authors": "Guolong Wang, Xun Wu, Xun Tu, Zhaoyuan Liu, Junchi Yan", "abstract": "Video moment retrieval locates a specified moment by a sentence query. Recent approaches have made remarkable advancements with large-scale video-sentence annotations. These annotations require extensive human labor and expertise, leading to the need for unsupervised fashion. Generating pseudo-supervision from videos is an effective strategy. With the power of the large-scale pre-trained model, we introduce knowledge into constructing pseudo-supervision. The main technical challenge is improving pseudo-supervision diversity and alleviating noise brought by external knowledge. To address these problems, we propose two Knowledge-Based Pseudo-Supervision Construction (KPSC) strategies: KPSC-P and KPSC-F. They all follow two steps: generating diverse samples and alleviating knowledge chaos. The main difference is that the former first learns a representation space with prompt tuning, while the latter directly utilizes data information. KPSC-P has two modules: (1) Proposal Prompt (PP): Generate temporal proposals; (2) Verb Prompt (VP): Generate pseudo-queries with noun-verb patterns. KPSC-F also has two modules: (1) Captioner: Generating candidate queries; (2) Filter: Alleviating knowledge chaos. Thus, our KPSC involves two attempts to extract knowledge from pre-trained models. Extensive experiments show that our attempts outperform the existing unsupervised methods on two public datasets (Charades-STA and ActivityNet-Captions) and perform on par with several methods using stronger supervision.", "venue": "ACM Transactions on Information Systems", "label": 0}, {"loc": [5.958066463470459, 5.279179096221924], "openalex_id": "https://openalex.org/W4403580080", "title": "MixEval-X: Any-to-Any Evaluations from Real-World Data Mixtures", "authors": "Jinjie Ni, Yifan Song, Deepanway Ghosal, Bo Li, D. Zhang, Yue Xiang, Fuzhao Xue, Zhiming Zheng, K J Zhang, Mohammad Maroof Shah, Kabir Jain, Yang You, Michael Shieh", "abstract": "Perceiving and generating diverse modalities are crucial for AI models to effectively learn from and engage with real-world signals, necessitating reliable evaluations for their development. We identify two major issues in current evaluations: (1) inconsistent standards, shaped by different communities with varying protocols and maturity levels; and (2) significant query, grading, and generalization biases. To address these, we introduce MixEval-X, the first any-to-any, real-world benchmark designed to optimize and standardize evaluations across diverse input and output modalities. We propose multi-modal benchmark mixture and adaptation-rectification pipelines to reconstruct real-world task distributions, ensuring evaluations generalize effectively to real-world use cases. Extensive meta-evaluations show our approach effectively aligns benchmark samples with real-world task distributions. Meanwhile, MixEval-X's model rankings correlate strongly with that of crowd-sourced real-world evaluations (up to 0.98) while being much more efficient. We provide comprehensive leaderboards to rerank existing models and organizations and offer insights to enhance understanding of multi-modal evaluations and inform future research.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.526947975158691, 5.176828861236572], "openalex_id": "https://openalex.org/W4403580144", "title": "Harnessing Webpage UIs for Text-Rich Visual Understanding", "authors": "Junpeng Liu, Tianyue Ou, Yifan Song, Yuzhong Qu, Wai Lam, Chenyan Xiong, Wenhu Chen, Graham Neubig, Xiang Yue", "abstract": "Text-rich visual understanding-the ability to process environments where dense textual content is integrated with visuals-is crucial for multimodal large language models (MLLMs) to interact effectively with structured environments. To enhance this capability, we propose synthesizing general multimodal instructions from webpage UIs using text-based large language models (LLMs). Despite lacking direct visual input, text-based LLMs are able to process structured text representations from webpage accessibility trees. These instructions are then paired with UI screenshots to train multimodal models. We introduce MultiUI, a dataset containing 7.3 million samples from 1 million websites, covering diverse multimodal tasks and UI layouts. Models trained on MultiUI not only excel in web UI tasks-achieving up to a 48% improvement on VisualWebBench and a 19.1% boost in element accuracy on a web agent dataset Mind2Web-but also generalize surprisingly well to non-web UI tasks and even to non-UI domains, such as document understanding, OCR, and chart interpretation. These results highlight the broad applicability of web UI data for advancing text-rich visual understanding across various scenarios.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.248508453369141, 2.424879312515259], "openalex_id": "https://openalex.org/W4403578767", "title": "MIND: Math Informed syNthetic Dialogues for Pretraining LLMs", "authors": "Syeda Nahida Akter, Shrimai Prabhumoye, John Kamalu, Sanjeev Satheesh, Eric Nyberg, Mostofa Patwary, Mohammad Shoeybi, Bryan Catanzaro", "abstract": "The utility of synthetic data to enhance pretraining data quality and hence to improve downstream task accuracy has been widely explored in recent large language models (LLMs). Yet, these approaches fall inadequate in complex, multi-hop and mathematical reasoning tasks as the synthetic data typically fails to add complementary knowledge to the existing raw corpus. In this work, we propose a novel large-scale and diverse Math Informed syNthetic Dialogue (MIND) generation method that improves the mathematical reasoning ability of LLMs. Specifically, using MIND, we generate synthetic conversations based on OpenWebMath (OWM), resulting in a new math corpus, MIND-OWM. Our experiments with different conversational settings reveal that incorporating knowledge gaps between dialog participants is essential for generating high-quality math data. We further identify an effective way to format and integrate synthetic and raw data during pretraining to maximize the gain in mathematical reasoning, emphasizing the need to restructure raw data rather than use it as-is. Compared to pretraining just on raw data, a model pretrained on MIND-OWM shows significant boost in mathematical reasoning (GSM8K: +13.42%, MATH: +2.30%), including superior performance in specialized knowledge (MMLU: +4.55%, MMLU-STEM: +4.28%) and general purpose reasoning tasks (GENERAL REASONING: +2.51%).", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.219059944152832, 0.4016791582107544], "openalex_id": "https://openalex.org/W4403579021", "title": "Qtok: A Comprehensive Framework for Evaluating Multilingual Tokenizer Quality in Large Language Models", "authors": "Iaroslav Chelombitko, Egor Safronov, A. G. Komissarov", "abstract": "In the development of Large Language Models (LLMs), considerable attention has been given to the quality of training datasets. However, the role of tokenizers in the LLM training pipeline, particularly for multilingual models, has received less focus. The quality of tokenization can significantly impact a model's ability to handle diverse languages effectively. We introduce Qtok, a tool designed to assess tokenizer quality with a specific emphasis on their performance in multilingual contexts. Our research proposes a set of metrics for evaluating tokenizer quality, including measures of language coverage, token completeness, and distribution across languages and linguistic categories. Qtok applies these metrics to evaluate 13 distinct tokenizers from 58 publicly available models, analyzing their output across different linguistic contexts. Our analysis revealed significant variations in token distribution across languages and categories, highlighting potential biases and areas for improvement in current tokenization strategies. This research contributes to the field of tokenizer evaluation within multilingual LLM development by providing a systematic approach to assessing tokenizer quality. Our findings highlight the critical role of tokenization in multilingual LLM capability. The Qtok tool and our analysis methodology offer practical means for researchers to evaluate and improve tokenization strategies for multilingual applications. We offer a method to compare tokenizer quality across these metrics, which may be useful when selecting or adjusting tokenizers for specific multilingual LLM applications.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.755282402038574, 3.912017822265625], "openalex_id": "https://openalex.org/W4403580038", "title": "Persistent Pre-Training Poisoning of LLMs", "authors": "Yiming Zhang, Javier Rando, Ivan Evtimov, Jianwei Chi, Eric M. Smith, Nicholas Carlini, Florian Tram\u00e8r, Daphne Ippolito", "abstract": "Large language models are pre-trained on uncurated text datasets consisting of trillions of tokens scraped from the Web. Prior work has shown that: (1) web-scraped pre-training datasets can be practically poisoned by malicious actors; and (2) adversaries can compromise language models after poisoning fine-tuning datasets. Our work evaluates for the first time whether language models can also be compromised during pre-training, with a focus on the persistence of pre-training attacks after models are fine-tuned as helpful and harmless chatbots (i.e., after SFT and DPO). We pre-train a series of LLMs from scratch to measure the impact of a potential poisoning adversary under four different attack objectives (denial-of-service, belief manipulation, jailbreaking, and prompt stealing), and across a wide range of model sizes (from 600M to 7B). Our main result is that poisoning only 0.1% of a model's pre-training dataset is sufficient for three out of four attacks to measurably persist through post-training. Moreover, simple attacks like denial-of-service persist through post-training with a poisoning rate of only 0.001%.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.8444912433624268, -0.32488343119621277], "openalex_id": "https://openalex.org/W4403609252", "title": "WITH LLaMA-3?", "authors": "\u0218tefan-Vlad Voinea, M\u0103d\u0103lin M\u0103muleanu, Rossy Vl\u0103du\u021b Teic\u0103, Lucian Mihai Florescu, Dan Seli\u0219teanu, Ioana Andreea Gheonea", "abstract": "The integration of deep learning into radiology has the potential to enhance diagnostic processes, yet its acceptance in clinical practice remains limited due to various challenges. This study aimed to develop and evaluate a fine-tuned large language model (LLM), based on Llama 3-8B, to automate the generation of accurate and concise conclusions in magnetic resonance imaging (MRI) and computed tomography (CT) radiology reports, thereby assisting radiologists and improving reporting efficiency. A dataset comprising 15,000 radiology reports was collected from the University of Medicine and Pharmacy of Craiova\u2019s Imaging Center, covering a diverse range of MRI and CT examinations made by four experienced radiologists. The Llama 3-8B model was fine-tuned using transfer-learning techniques, incorporating parameter quantization to 4-bit precision and low-rank adaptation (LoRA) with a rank of 16 to optimize computational efficiency on consumer-grade GPUs. The model was trained over five epochs using an NVIDIA RTX 3090 GPU, with intermediary checkpoints saved for monitoring. Performance was evaluated quantitatively using Bidirectional Encoder Representations from Transformers Score (BERTScore), Recall-Oriented Understudy for Gisting Evaluation (ROUGE), Bilingual Evaluation Understudy (BLEU), and Metric for Evaluation of Translation with Explicit Ordering (METEOR) metrics on a held-out test set. Additionally, a qualitative assessment was conducted, involving 13 independent radiologists who participated in a Turing-like test and provided ratings for the AI-generated conclusions. The fine-tuned model demonstrated strong quantitative performance, achieving a BERTScore F1 of 0.8054, a ROUGE-1 F1 of 0.4998, a ROUGE-L F1 of 0.4628, and a METEOR score of 0.4282. In the human evaluation, the artificial intelligence (AI)-generated conclusions were preferred over human-written ones in approximately 21.8% of cases, indicating that the model\u2019s outputs were competitive with those of experienced radiologists. The average rating of the AI-generated conclusions was 3.65 out of 5, reflecting a generally favorable assessment. Notably, the model maintained its consistency across various types of reports and demonstrated the ability to generalize to unseen data. The fine-tuned Llama 3-8B model effectively generates accurate and coherent conclusions for MRI and CT radiology reports. By automating the conclusion-writing process, this approach can assist radiologists in reducing their workload and enhancing report consistency, potentially addressing some barriers to the adoption of deep learning in clinical practice. The positive evaluations from independent radiologists underscore the model\u2019s potential utility. While the model demonstrated strong performance, limitations such as dataset bias, limited sample diversity, a lack of clinical judgment, and the need for large computational resources require further refinement and real-world validation. Future work should explore the integration of such models into clinical workflows, address ethical and legal considerations, and extend this approach to generate complete radiology reports.", "venue": "Bioengineering", "label": 0}, {"loc": [3.634971857070923, 4.5125885009765625], "openalex_id": "https://openalex.org/W4403578121", "title": "Reconstruction of Differentially Private Text Sanitization via Large Language Models", "authors": "Shuchao Pang, Zhigang L\u00fc, Haichen Wang, Peng Fu, Yongbin Zhou, Minhui Xue, Bo Li", "abstract": "Differential privacy (DP) is the de facto privacy standard against privacy leakage attacks, including many recently discovered ones against large language models (LLMs). However, we discovered that LLMs could reconstruct the altered/removed privacy from given DP-sanitized prompts. We propose two attacks (black-box and white-box) based on the accessibility to LLMs and show that LLMs could connect the pair of DP-sanitized text and the corresponding private training data of LLMs by giving sample text pairs as instructions (in the black-box attacks) or fine-tuning data (in the white-box attacks). To illustrate our findings, we conduct comprehensive experiments on modern LLMs (e.g., LLaMA-2, LLaMA-3, ChatGPT-3.5, ChatGPT-4, ChatGPT-4o, Claude-3, Claude-3.5, OPT, GPT-Neo, GPT-J, Gemma-2, and Pythia) using commonly used datasets (such as WikiMIA, Pile-CC, and Pile-Wiki) against both word-level and sentence-level DP. The experimental results show promising recovery rates, e.g., the black-box attacks against the word-level DP over WikiMIA dataset gave 72.18% on LLaMA-2 (70B), 82.39% on LLaMA-3 (70B), 75.35% on Gemma-2, 91.2% on ChatGPT-4o, and 94.01% on Claude-3.5 (Sonnet). More urgently, this study indicates that these well-known LLMs have emerged as a new security risk for existing DP text sanitization approaches in the current environment.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.378975868225098, 2.6866064071655273], "openalex_id": "https://openalex.org/W4403577978", "title": "Optimizing Low-Resource Language Model Training: Comprehensive Analysis of Multi-Epoch, Multi-Lingual, and Two-Stage Approaches", "authors": "Kosuke Akimoto, Masafumi Oyamada", "abstract": "In this paper, we address the challenge of optimizing training setups for Large Language Models (LLMs) of low-resource language with a limited amount of corpus. Existing works adopt multi-epoch, multi-lingual, and two-stage training to utilize the limited target language corpus efficiently. However, there is still a lack of understanding about the optimal hyperparameter setups for combining these three approaches to train LLMs. We exhaustively explore training setups for low-resource language LLM, combining these three approaches, and found the following insights for efficiently reducing the cost of hyperparameter search: (1) As the amount of target language corpus decreases, the optimal training approach shifts from monolingual single-stage training to multi-lingual two-stage training at a compute budget dependent threshold. (2) The optimal model scale remains stable regardless of the amount of target language corpus, allowing the use of the compute-optimal scale of monolingual training. (3) The optimal number of epochs can be extrapolated from smaller-scale experiments to larger scale using our proposed model. Also, we provide evidence that, in single-stage training, the target language validation loss follows a power law with respect to the target language ratio, with an exponent independent of the amount of data, model scale, and language pair.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.223377704620361, 1.1419708728790283], "openalex_id": "https://openalex.org/W4403578243", "title": "Benchmarking Defeasible Reasoning with Large Language Models--Initial Experiments and Future Directions", "authors": "Ilias Tachmazidis, Sotiris Batsakis, Grigoris Antoniou", "abstract": "Large Language Models (LLMs) have gained prominence in the AI landscape due to their exceptional performance. Thus, it is essential to gain a better understanding of their capabilities and limitations, among others in terms of nonmonotonic reasoning. This paper proposes a benchmark that corresponds to various defeasible rule-based reasoning patterns. We modified an existing benchmark for defeasible logic reasoners by translating defeasible rules into text suitable for LLMs. We conducted preliminary experiments on nonmonotonic rule-based reasoning using ChatGPT and compared it with reasoning patterns defined by defeasible logic.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.001075744628906, 0.3025874197483063], "openalex_id": "https://openalex.org/W4403484624", "title": "Intelligent question answering for water conservancy project inspection driven by knowledge graph and large language model collaboration", "authors": "Yangrui Yang, Sisi Chen, Yaping Zhu, Xuemei Liu, Shifeng Pan, Xin Wang", "abstract": "Engineering inspection is of great significance to ensure the safe operation of the project. However, the unclear query statements of the detectors pose a challenge to the intelligent question answering task. Existing knowledge graph-based question-answering systems face issues of vocabulary limitations and reliance on fixed templates. Solely relying on Large Language Models (LLMs) for questioning introduces noise and randomness due to their extensive knowledge base. Therefore, this paper proposes a novel approach that synergistically employs both knowledge graphs and LLMs for intelligent question-answering in hydroengineering inspection. The method divides the overall task into five units, progressively clarifying query statements for accurate answers. Leveraging LLM\u2019s vast prior knowledge, robust semantic understanding, and contextual learning mitigates issues related to vocabulary limitations and template dependence. Simultaneously, the knowledge contained in the graph is integrated into an optimal clarification path and transferred to LLM to address noise and randomness, thereby enhancing the efficiency of the clarification process. Benchmark experiments demonstrate that the proposed method achieves Mean Reciprocal Rank (MRR), Mean Average Precision (MAP), Precision, and Recall metrics all above 0.73. The results affirm the method\u2019s effectiveness in improving the accuracy of intelligent question-answering in hydroengineering inspection, with potential implications for similar applications in other domains of hydraulic engineering.", "venue": "LHB", "label": 0}, {"loc": [2.6674180030822754, 2.633944511413574], "openalex_id": "https://openalex.org/W4404518334", "title": "Ecosystem Graphs: Documenting the Foundation Model Supply Chain", "authors": "Rishi Bommasani, Dilara Soylu, Thomas I. Liao, Kathleen Creel, Percy Liang", "abstract": "Foundation models (e.g. GPT-4, Gemini, Llama 3) pervasively influence society, warranting greater understanding. While the models garner much attention, accurately characterizing their impact requires considering the broader sociotechnical ecosystem in which they are created and deployed. We propose Ecosystem Graphs as a documentation framework to centralize knowledge of this ecosystem. Ecosystem Graphs is composed of assets (datasets, models, applications) linked together by dependencies that indicate technical and social relationships. To supplement the graph structure, each asset is further enriched with fine-grained metadata, such as the model\u2019s estimated training emissions or licensing guidelines. Since its release in March 2023, Ecosystem Graphs represents an ongoing effort to document 568 assets (112 datasets, 359 models, 97 applications) from 117 organizations. Ecosystem Graphs functions as a multifunctional resource: we discuss two major uses by the 2024 AI Index and the UK\u2019s Competition and Markets Authority that demonstrate the value of Ecosystem Graphs.", "venue": "Proceedings of the AAAI/ACM Conference on AI Ethics and Society", "label": 14}, {"loc": [8.713505744934082, 2.4747536182403564], "openalex_id": "https://openalex.org/W4403575338", "title": "Adaptive Data Optimization: Dynamic Sample Selection with Scaling Laws", "authors": "Yiding Jiang, Allan Zhou, Zhili Feng, Sadhika Malladi, J. Zico Kolter", "abstract": "The composition of pretraining data is a key determinant of foundation models' performance, but there is no standard guideline for allocating a limited computational budget across different data sources. Most current approaches either rely on extensive experiments with smaller models or dynamic data adjustments that also require proxy models, both of which significantly increase the workflow complexity and computational overhead. In this paper, we introduce Adaptive Data Optimization (ADO), an algorithm that optimizes data distributions in an online fashion, concurrent with model training. Unlike existing techniques, ADO does not require external knowledge, proxy models, or modifications to the model update. Instead, ADO uses per-domain scaling laws to estimate the learning potential of each domain during training and adjusts the data mixture accordingly, making it more scalable and easier to integrate. Experiments demonstrate that ADO can achieve comparable or better performance than prior methods while maintaining computational efficiency across different computation scales, offering a practical solution for dynamically adjusting data distribution without sacrificing flexibility or increasing costs. Beyond its practical benefits, ADO also provides a new perspective on data collection strategies via scaling laws.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.363648414611816, 0.7774926424026489], "openalex_id": "https://openalex.org/W4403576689", "title": "Enhancing Assamese NLP Capabilities: Introducing a Centralized Dataset Repository", "authors": "Suzanne Tamang, D. J. Bora", "abstract": "This paper introduces a centralized, open-source dataset repository designed to advance NLP and NMT for Assamese, a low-resource language. The repository, available at GitHub, supports various tasks like sentiment analysis, named entity recognition, and machine translation by providing both pre-training and fine-tuning corpora. We review existing datasets, highlighting the need for standardized resources in Assamese NLP, and discuss potential applications in AI-driven research, such as LLMs, OCR, and chatbots. While promising, challenges like data scarcity and linguistic diversity remain. The repository aims to foster collaboration and innovation, promoting Assamese language research in the digital age.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.628622055053711, 2.1765811443328857], "openalex_id": "https://openalex.org/W4404518619", "title": "Representation Bias of Adolescents in AI: A Bilingual, Bicultural Study", "authors": "Robert E. Wolfe, Aayushi Dangol, Bill Howe, Alexis Hiniker", "abstract": "Popular and news media often portray teenagers with sensationalism, as both a risk to society and at risk from society. As AI begins to absorb some of the epistemic functions of traditional media, we study how teenagers in two countries speaking two languages: 1) are depicted by AI, and 2) how they would prefer to be depicted. Specifically, we study the biases about teenagers learned by static word embeddings (SWEs) and generative language models (GLMs), comparing these with the perspectives of adolescents living in the U.S. and Nepal. We find English-language SWEs associate teenagers with societal problems, and more than 50% of the 1,000 words most associated with teenagers in the pretrained GloVe SWE reflect such problems. Given prompts about teenagers, 30% of outputs from GPT2-XL and 29% from LLaMA-2-7B GLMs discuss societal problems, most commonly violence, but also drug use, mental illness, and sexual taboo. Nepali models, while not free of such associations, are less dominated by social problems. Data from workshops with N=13 U.S. adolescents and N=18 Nepalese adolescents show that AI presentations are disconnected from teenage life, which revolves around activities like school and friendship. Participant ratings of how well 20 trait words describe teens are decorrelated from SWE associations, with Pearson's rho=.02, n.s. in English FastText and rho=.06, n.s. GloVe; and rho=.06, n.s. in Nepali FastText and rho=-.23, n.s. in GloVe. U.S. participants suggested AI could fairly present teens by highlighting diversity, while Nepalese participants centered positivity. Participants were optimistic that, if it learned from adolescents, rather than media sources, AI could help mitigate stereotypes. Our work offers an understanding of the ways SWEs and GLMs misrepresent a developmentally vulnerable group and provides a template for less sensationalized characterization.", "venue": "Proceedings of the AAAI/ACM Conference on AI Ethics and Society", "label": 14}, {"loc": [3.1920676231384277, 2.6637535095214844], "openalex_id": "https://openalex.org/W4404518214", "title": "Foregrounding Artist Opinions: A Survey Study on Transparency, Ownership, and Fairness in AI Generative Art", "authors": "Juniper Lovato, Julia Witte Zimmerman, Isabelle Smith, Peter Sheridan Dodds, Jennifer Karson", "abstract": "Generative AI tools are used to create art-like outputs and sometimes aid in the creative process. These tools have potential benefits for artists, but they also have the potential to harm the art workforce and infringe upon artistic and intellectual property rights. Without explicit consent from artists, Generative AI creators scrape artists' digital work to train Generative AI models and produce art-like outputs at scale. These outputs are now being used to compete with human artists in the marketplace as well as being used by some artists in their generative processes to create art. We surveyed 459 artists to investigate the tension between artists' opinions on Generative AI art's potential utility and harm. This study surveys artists' opinions on the utility and threat of Generative AI art models, fair practices in the disclosure of artistic works in AI art training models, ownership and rights of AI art derivatives, and fair compensation. Results show that a majority of artists believe creators should disclose what art is being used in AI training, that AI outputs should not belong to model creators, and express concerns about AI's impact on the art workforce and who profits from their art. We hope the results of this work will further meaningful collaboration and alignment between the art community and Generative AI researchers and developers.", "venue": "Proceedings of the AAAI/ACM Conference on AI Ethics and Society", "label": 14}, {"loc": [3.58272647857666, 1.8457515239715576], "openalex_id": "https://openalex.org/W4404518367", "title": "Simulating Policy Impacts: Developing a Generative Scenario Writing Method to Evaluate the Perceived Effects of Regulation", "authors": "Julie Barnett, Kimon Kieslich, Nicholas Diakopoulos", "abstract": "The rapid advancement of AI technologies yields numerous future impacts on individuals and society. Policymakers are tasked to react quickly and establish policies that mitigate those impacts. However, anticipating the effectiveness of policies is a difficult task, as some impacts might only be observable in the future and respective policies might not be applicable to the future development of AI. In this work we develop a method for using large language models (LLMs) to evaluate the efficacy of a given piece of policy at mitigating specified negative impacts. We do so by using GPT-4 to generate scenarios both pre- and post-introduction of policy and translating these vivid stories into metrics based on human perceptions of impacts. We leverage an already established taxonomy of impacts of generative AI in the media environment to generate a set of scenario pairs both mitigated and non-mitigated by the transparency policy in Article 50 of the EU AI Act. We then run a user study (n=234) to evaluate these scenarios across four risk-assessment dimensions: severity, plausibility, magnitude, and specificity to vulnerable populations. We find that this transparency legislation is perceived to be effective at mitigating harms in areas such as labor and well-being, but largely ineffective in areas such as social cohesion and security. Through this case study we demonstrate the efficacy of our method as a tool to iterate on the effectiveness of policy for mitigating various negative impacts. We expect this method to be useful to researchers or other stakeholders who want to brainstorm the potential utility of different pieces of policy or other mitigation strategies.", "venue": "Proceedings of the AAAI/ACM Conference on AI Ethics and Society", "label": 14}, {"loc": [5.644899845123291, 0.7465215921401978], "openalex_id": "https://openalex.org/W4403464255", "title": "Validating pretrained language models for content quality classification with semantic-preserving metamorphic relations", "authors": "Pak Yuen Patrick Chan, Jacky Keung", "abstract": "Context:: Utilizing pretrained language models (PLMs) has become common practice in maintaining the content quality of question-answering (Q&A) websites. However, evaluating the effectiveness of PLMs poses a challenge as they tend to provide local optima rather than global optima. Objective:: In this study, we propose using semantic-preserving Metamorphic Relations (MRs) derived from Metamorphic Testing (MT) to address this challenge and validate PLMs. Methods:: To validate four selected PLMs, we conducted an empirical experiment using a publicly available dataset comprising 60000 data points. We defined three groups of Metamorphic Relations (MRGs), consisting of thirteen semantic-preserving MRs, which were then employed to generate \u201cFollow-up\u201d testing datasets based on the original \u201cSource\u201d testing datasets. The PLMs were trained using a separate training dataset. A comparison was made between the predictions of the four trained PLMs for \u201cSource\u201d and \u201cFollow-up\u201d testing datasets in order to identify instances of violations, which corresponded to inconsistent predictions between the two datasets. If no violation was found, it indicated that the PLM was insensitive to the associate MR; thereby, the MR can be used for validation. In cases where no violation occurred across the entire MRG, non-violation regions were identified and supported simulation metamorphic testing. Results:: The results of this study demonstrated that the proposed MRs could effectively serve as a validation tool for content quality classification on Stack Overflow Q&A using PLMs. One PLM did not violate the \u201cUppercase conversion\u201d MRG and the \u201cDuplication\u201d MRG. Furthermore, the absence of violations in the MRGs allowed for the identification of non-violation regions, confirming the ability of the proposed MRs to support simulation metamorphic testing. Conclusion:: The experimental findings indicate that the proposed MRs can validate PLMs effectively and support simulation metamorphic testing for PLMs. However, further investigations are required to enhance the semantic comprehension and common sense knowledge of PLMs and explore highly informative statistical patterns of PLMs, in order to improve their overall performance.", "venue": "Natural Language Processing Journal", "label": 9}, {"loc": [8.27380657196045, 0.53107088804245], "openalex_id": "https://openalex.org/W4403461279", "title": "The Mitigation of Excessive Retrieval Augmentation and Knowledge Conflicts in Large Language Models", "authors": "Michael Galway, Matteo DiRenzo, Daniele Esposito, Rafael Marchand, V.V. Grigoriev", "abstract": "Abstract There have been significant advances with large language models in generating coherent and contextually relevant responses, but their limitations in accessing and integrating real-time or specialized information have driven the development of retrieval augmentation techniques. Retrieval augmentation offers the ability to enhance a model\u2019s responses through external knowledge, yet the challenge of managing knowledge conflicts between retrieved data and internal model predictions remains unresolved. A systematic examination of varying levels of retrieval augmentation has revealed that excessive reliance on external information not only introduces factual inconsistencies but also degrades the coherence of model outputs. The experiments conducted on the Llama model demonstrate that while moderate augmentation improves accuracy and relevance, high retrieval augmentation significantly increases the risk of knowledge conflicts, complicating the response generation process. The conflict detection and resolution mechanisms employed showed promise in mitigating some of these inconsistencies, although their effectiveness diminished as the volume of retrieval data increased. These findings highlight the delicate balance required between external knowledge integration and internal model coherence, emphasizing the need for more sophisticated conflict management strategies to optimize the potential of retrieval-augmented models.", "venue": "https://doi.org/10.21203/rs.3.rs-5263949/v1", "label": 0}, {"loc": [4.805431365966797, 2.1591997146606445], "openalex_id": "https://openalex.org/W4404518423", "title": "ML-EAT: A Multilevel Embedding Association Test for Interpretable and Transparent Social Science", "authors": "Robert Wolfe, Alexis Hiniker, Bill Howe", "abstract": "This research introduces the Multilevel Embedding Association Test (ML-EAT), a method designed for interpretable and transparent measurement of intrinsic bias in language technologies. The ML-EAT addresses issues of ambiguity and difficulty in interpreting the traditional EAT measurement by quantifying bias at three levels of increasing granularity: the differential association between two target concepts with two attribute concepts; the individual effect size of each target concept with two attribute concepts; and the association between each individual target concept and each individual attribute concept. Using the ML-EAT, this research defines a taxonomy of EAT patterns describing the nine possible outcomes of an embedding association test, each of which is associated with a unique EAT-Map, a novel four-quadrant visualization for interpreting the ML-EAT. Empirical analysis of static and diachronic word embeddings, GPT-2 language models, and a CLIP language-and-image model shows that EAT patterns add otherwise unobservable information about the component biases that make up an EAT; reveal the effects of prompting in zero-shot models; and can also identify situations when cosine similarity is an ineffective metric, rendering an EAT unreliable. Our work contributes a method for rendering bias more observable and interpretable, improving the transparency of computational investigations into human minds and societies.", "venue": "Proceedings of the AAAI/ACM Conference on AI Ethics and Society", "label": 14}, {"loc": [7.670628070831299, 4.091790199279785], "openalex_id": "https://openalex.org/W4403575414", "title": "MoH: Multi-Head Attention as Mixture-of-Head Attention", "authors": "Peng Jin, Bo Zhu, Yuan Li, Shuicheng Yan", "abstract": "In this work, we upgrade the multi-head attention mechanism, the core of the Transformer model, to improve efficiency while maintaining or surpassing the previous accuracy level. We show that multi-head attention can be expressed in the summation form. Drawing on the insight that not all attention heads hold equal significance, we propose Mixture-of-Head attention (MoH), a new architecture that treats attention heads as experts in the Mixture-of-Experts (MoE) mechanism. MoH has two significant advantages: First, MoH enables each token to select the appropriate attention heads, enhancing inference efficiency without compromising accuracy or increasing the number of parameters. Second, MoH replaces the standard summation in multi-head attention with a weighted summation, introducing flexibility to the attention mechanism and unlocking extra performance potential. Extensive experiments on ViT, DiT, and LLMs demonstrate that MoH outperforms multi-head attention by using only 50%-90% of the attention heads. Moreover, we demonstrate that pre-trained multi-head attention models, such as LLaMA3-8B, can be further continue-tuned into our MoH models. Notably, MoH-LLaMA3-8B achieves an average accuracy of 64.0% across 14 benchmarks, outperforming LLaMA3-8B by 2.4% by utilizing only 75% of the attention heads. We believe the proposed MoH is a promising alternative to multi-head attention and provides a strong foundation for developing advanced and efficient attention-based models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.7093119621276855, 4.406339168548584], "openalex_id": "https://openalex.org/W4403486636", "title": "Inside the Black Box: Detecting Data Leakage in Pre-trained Language Encoders", "authors": "Xin Yuan, Zheng Li, Ning Yu, Dingfan Chen, Mario Fritz, Michael Backes, Yang Zhang", "abstract": "Despite being prevalent in the general field of Natural Language Processing (NLP), pre-trained language models inherently carry privacy and copyright concerns due to their nature of training on large-scale web-scraped data. In this paper, we pioneer a systematic exploration of such risks associated with pre-trained language encoders, specifically focusing on the membership leakage of pre-training data exposed through downstream models adapted from pre-trained language encoders\u2013an aspect largely overlooked in existing literature. Our study encompasses comprehensive experiments across four types of pre-trained encoder architectures, three representative downstream tasks, and five benchmark datasets. Intriguingly, our evaluations reveal, for the first time, the existence of membership leakage even when only the black-box output of the downstream model is exposed, highlighting a privacy risk far greater than previously assumed. Alongside, we present in-depth analysis and insights toward guiding future researchers and practitioners in addressing the privacy considerations in developing pre-trained language models.", "venue": "Frontiers in artificial intelligence and applications", "label": 0}, {"loc": [5.098104476928711, 1.4101240634918213], "openalex_id": "https://openalex.org/W4403570980", "title": "Cultural Fidelity in Large-Language Models: An Evaluation of Online Language Resources as a Driver of Model Performance in Value Representation", "authors": "Samia Kazemi, Greg A. Gerhardt, Jonty Katz, Caroline Ida Kuria, Eric Pan, Umang Prabhakar", "abstract": "The training data for LLMs embeds societal values, increasing their familiarity with the language's culture. Our analysis found that 44% of the variance in the ability of GPT-4o to reflect the societal values of a country, as measured by the World Values Survey, correlates with the availability of digital resources in that language. Notably, the error rate was more than five times higher for the languages of the lowest resource compared to the languages of the highest resource. For GPT-4-turbo, this correlation rose to 72%, suggesting efforts to improve the familiarity with the non-English language beyond the web-scraped data. Our study developed one of the largest and most robust datasets in this topic area with 21 country-language pairs, each of which contain 94 survey questions verified by native speakers. Our results highlight the link between LLM performance and digital data availability in target languages. Weaker performance in low-resource languages, especially prominent in the Global South, may worsen digital divides. We discuss strategies proposed to address this, including developing multilingual LLMs from the ground up and enhancing fine-tuning on diverse linguistic datasets, as seen in African language initiatives.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.270819664001465, 0.17003539204597473], "openalex_id": "https://openalex.org/W4403570231", "title": "State of NLP in Kenya: A Survey", "authors": "Cynthia Amol, Everlyn Asiko Chimoto, Rose Delilah Gesicho, Antony Gitau, Naome A. Etori, Caringtone Kinyanjui, Steven Ndung'u, Lawrence Moruye, Samson Otieno Ooko, Kavengi Kitonga, Brian Muhia, Catherine Gitau, Antony Ndolo, Lilian Wanzare, Albert Njoroge Kahira, Ronald Tombe", "abstract": "Kenya, known for its linguistic diversity, faces unique challenges and promising opportunities in advancing Natural Language Processing (NLP) technologies, particularly for its underrepresented indigenous languages. This survey provides a detailed assessment of the current state of NLP in Kenya, emphasizing ongoing efforts in dataset creation, machine translation, sentiment analysis, and speech recognition for local dialects such as Kiswahili, Dholuo, Kikuyu, and Luhya. Despite these advancements, the development of NLP in Kenya remains constrained by limited resources and tools, resulting in the underrepresentation of most indigenous languages in digital spaces. This paper uncovers significant gaps by critically evaluating the available datasets and existing NLP models, most notably the need for large-scale language models and the insufficient digital representation of Indigenous languages. We also analyze key NLP applications: machine translation, information retrieval, and sentiment analysis-examining how they are tailored to address local linguistic needs. Furthermore, the paper explores the governance, policies, and regulations shaping the future of AI and NLP in Kenya and proposes a strategic roadmap to guide future research and development efforts. Our goal is to provide a foundation for accelerating the growth of NLP technologies that meet Kenya's diverse linguistic demands.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.623332500457764, 2.6895244121551514], "openalex_id": "https://openalex.org/W4403571164", "title": "Evaluating SQL Understanding in Large Language Models", "authors": "A. Rahaman, Ailing Zheng, Mostafa Milani, Fei Chiang, Rachel Pottinger", "abstract": "The rise of large language models (LLMs) has significantly impacted various domains, including natural language processing (NLP) and image generation, by making complex computational tasks more accessible. While LLMs demonstrate impressive generative capabilities, there is an ongoing debate about their level of \"understanding,\" particularly in structured domains like SQL. In this paper, we evaluate the extent to which LLMs \"understand\" SQL by testing them on a series of key SQL tasks. These tasks, such as syntax error detection, missing token identification, query performance prediction, query equivalence checking, and query explanation, assess the models' proficiency in recognition, context awareness, semantics, and coherence, which are essential skills for SQL understanding. We generate labeled datasets from well-known workloads, and evaluate the latest LLMs, focusing on how query complexity and syntactic features influence performance. Our results indicate that while GPT4 excels at tasks requiring recognition and context, all models struggle with deeper semantic understanding and coherence, especially in query equivalence and performance estimation, revealing the limitations of current LLMs in achieving full SQL comprehension.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.544360637664795, 2.1288723945617676], "openalex_id": "https://openalex.org/W4403570154", "title": "Reverse Modeling in Large Language Models", "authors": "Sicheng Yu, Yuanchen Xu, Cunxiao Du, Yanying Zhou, Minghui Qiu, Qianru Sun, Hao Zhang, Jiawei Wu", "abstract": "Humans are accustomed to reading and writing in a forward manner, and this natural bias extends to text understanding in auto-regressive large language models (LLMs). This paper investigates whether LLMs, like humans, struggle with reverse modeling, specifically with reversed text inputs. We found that publicly available pre-trained LLMs cannot understand such inputs. However, LLMs trained from scratch with both forward and reverse texts can understand them equally well during inference across multiple languages. Our case study shows that different-content texts result in different losses if input (to LLMs) in different directions -- some get lower losses for forward while some for reverse. This leads us to a simple and nice solution for data selection based on the loss differences between forward and reverse directions. Using our selected data in continued pretraining can boost LLMs' performance by a large margin across different language understanding benchmarks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.148582935333252, 4.571589469909668], "openalex_id": "https://openalex.org/W4403444428", "title": "SmartPretrain: Model-Agnostic and Dataset-Agnostic Representation Learning for Motion Prediction", "authors": "Yang Zhou, Hao Shao, Letian Wang, Steven L. Waslander, Hongsheng Li, Yu Liu", "abstract": "Predicting the future motion of surrounding agents is essential for autonomous vehicles (AVs) to operate safely in dynamic, human-robot-mixed environments. However, the scarcity of large-scale driving datasets has hindered the development of robust and generalizable motion prediction models, limiting their ability to capture complex interactions and road geometries. Inspired by recent advances in natural language processing (NLP) and computer vision (CV), self-supervised learning (SSL) has gained significant attention in the motion prediction community for learning rich and transferable scene representations. Nonetheless, existing pre-training methods for motion prediction have largely focused on specific model architectures and single dataset, limiting their scalability and generalizability. To address these challenges, we propose SmartPretrain, a general and scalable SSL framework for motion prediction that is both model-agnostic and dataset-agnostic. Our approach integrates contrastive and reconstructive SSL, leveraging the strengths of both generative and discriminative paradigms to effectively represent spatiotemporal evolution and interactions without imposing architectural constraints. Additionally, SmartPretrain employs a dataset-agnostic scenario sampling strategy that integrates multiple datasets, enhancing data volume, diversity, and robustness. Extensive experiments on multiple datasets demonstrate that SmartPretrain consistently improves the performance of state-of-the-art prediction models across datasets, data splits and main metrics. For instance, SmartPretrain significantly reduces the MissRate of Forecast-MAE by 10.6%. These results highlight SmartPretrain's effectiveness as a unified, scalable solution for motion prediction, breaking free from the limitations of the small-data regime. Codes are available at https://github.com/youngzhou1999/SmartPretrain", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.444314479827881, 0.8551324605941772], "openalex_id": "https://openalex.org/W4403443641", "title": "Data Processing for the OpenGPT-X Model Family", "authors": "Nicolo\u2019 Brandizzi, Hammam Abdelwahab, Anirban Bhowmick, Lennard Helmer, B. Stein, Pavel Denisov, Qasid Saleem, Michael Fromm, Mehdi Ali, Richard Rutmann, Farzad Naderi, Mohamad Saif Agy, Alexander Schwirjow, Fabian K\u00fcch, Luzian Hahn, Malte Ostendorff, Pedro Ortiz Suarez, Georg Rehm, Dennis Wegener, Nicolas Flores-Herr, Joachim K\u00f6hler, Johannes Leveling", "abstract": "This paper presents a comprehensive overview of the data preparation pipeline developed for the OpenGPT-X project, a large-scale initiative aimed at creating open and high-performance multilingual large language models (LLMs). The project goal is to deliver models that cover all major European languages, with a particular focus on real-world applications within the European Union. We explain all data processing steps, starting with the data selection and requirement definition to the preparation of the final filtered data. We distinguish between curated data and web data, as each of these categories is handled by distinct pipelines, with curated data undergoing minimal filtering and web data requiring extensive filtering and deduplication. This distinction guided the development of specialized algorithmic solutions for both pipelines. In addition to describing the processing methodologies, we provide an in-depth analysis of the datasets, increasing transparency and alignment with European data regulations. Finally, we share key insights and challenges faced during the project, offering recommendations for future endeavors in large-scale multilingual data preparation for LLMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.62979507446289, 2.6464619636535645], "openalex_id": "https://openalex.org/W4403444264", "title": "Scaling Laws for Predicting Downstream Performance in LLMs", "authors": "Yuquan Chen, Bevan E. Huang, Yifan Gao, Zhengyang Wang, Jingfeng Yang, Hong Ji", "abstract": "Precise estimation of downstream performance in large language models (LLMs) prior to training is essential for guiding their development process. Scaling laws analysis utilizes the statistics of a series of significantly smaller sampling language models (LMs) to predict the performance of the target LLM. For downstream performance prediction, the critical challenge lies in the emergent abilities in LLMs that occur beyond task-specific computational thresholds. In this work, we focus on the pre-training loss as a more computation-efficient metric for performance estimation. Our two-stage approach FLP consists of first estimating a function that maps computational resources (e.g., FLOPs) to the pre-training Loss using a series of fully-converged sampling models, followed by mapping the pre-training loss to downstream task Performance using the intermediate models with emerged performance. In our experiments, this FLP solution accurately predicts the performance of LLMs with 7B and 13B parameters using a series of sampling LMs up to 3B, achieving error margins of 5% and 10%, respectively, and significantly outperforming the FLOPs-to-Performance approach. Further, we present FLP-M, a fundamental approach for performance prediction that addresses the practical need to integrate datasets from multiple sources during pre-training. FLP-M extends the power law analytical function to predict domain-specific pre-training loss based on FLOPs across data sources, and employs a two-layer neural network to model the non-linear relationship between multiple domain-specific loss and downstream performance. By utilizing a 3B LLM trained on a specific ratio and a series of smaller sampling LMs, FLP-M can effectively forecast the performance of 3B and 7B LLMs across various data mixtures for most benchmarks within 10% error margins.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.090454578399658, 0.5197303891181946], "openalex_id": "https://openalex.org/W4403364832", "title": "Linguistically-Informed Multilingual Instruction Tuning: Is There an Optimal Set of Languages to Tune?", "authors": "G\u00fcrkan Soykan, G\u00f6zde G\u00fcl \u015eahin", "abstract": "Multilingual language models often perform unevenly across different languages due to limited generalization capabilities for some languages. This issue is significant because of the growing interest in making universal language models that work well for all languages. Instruction tuning with multilingual instruction-response pairs has been used to improve model performance across various languages. However, this approach is challenged by high computational costs, a lack of quality tuning data for all languages, and the \"curse of multilinguality\" -- the performance drop per language after adding many languages. Recent studies have found that working with datasets with few languages and a smaller number of instances can be beneficial. Yet, there exists no systematic investigation into how choosing different languages affects multilingual instruction tuning. Our study proposes a method to select languages for instruction tuning in a linguistically informed way, aiming to boost model performance across languages and tasks. We use a simple algorithm to choose diverse languages and test their effectiveness on various benchmarks and open-ended questions. Our results show that this careful selection generally leads to better outcomes than choosing languages at random. We suggest a new and simple way of enhancing multilingual models by selecting diverse languages based on linguistic features that could help develop better multilingual systems and guide dataset creation efforts. All resources, including the code for language selection and multilingual instruction tuning, are made available in our official repository at https://github.com/GGLAB-KU/ling-informed-mit enabling reproducibility and further research in this area.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.402674198150635, 2.666043519973755], "openalex_id": "https://openalex.org/W4403365496", "title": "MathCoder2: Better Math Reasoning from Continued Pretraining on Model-translated Mathematical Code", "authors": "Zimu Lu, Aojun Zhou, Ke Wang, Houxing Ren, Weikang Shi, Junting Pan, Mingjie Zhan, Hongsheng Li", "abstract": "Code has been shown to be effective in enhancing the mathematical reasoning abilities of large language models due to its precision and accuracy. Previous works involving continued mathematical pretraining often include code that utilizes math-related packages, which are primarily designed for fields such as engineering, machine learning, signal processing, or module testing, rather than being directly focused on mathematical reasoning. In this paper, we introduce a novel method for generating mathematical code accompanied with corresponding reasoning steps for continued pretraining. Our approach begins with the construction of a high-quality mathematical continued pretraining dataset by incorporating math-related web data, code using mathematical packages, math textbooks, and synthetic data. Next, we construct reasoning steps by extracting LaTeX expressions, the conditions needed for the expressions, and the results of the expressions from the previously collected dataset. Based on this extracted information, we generate corresponding code to accurately capture the mathematical reasoning process. Appending the generated code to each reasoning step results in data consisting of paired natural language reasoning steps and their corresponding code. Combining this data with the original dataset results in a 19.2B-token high-performing mathematical pretraining corpus, which we name MathCode-Pile. Training several popular base models with this corpus significantly improves their mathematical abilities, leading to the creation of the MathCoder2 family of models. All of our data processing and training code is open-sourced, ensuring full transparency and easy reproducibility of the entire data collection and training pipeline. The code is released at https://github.com/mathllm/MathCoder2.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.186054706573486, 0.34147870540618896], "openalex_id": "https://openalex.org/W4403364498", "title": "PLaMo-100B: A Ground-Up Language Model Designed for Japanese Proficiency", "authors": "K. Abe, Kaizaburo Chubachi, Yasuhiro Fujita, Yuta Hirokawa, Kentaro Imajo, Toshiki Kataoka, Hiroyoshi Komatsu, Hiroaki Mikami, Tsuguo Mogami, Shogo Murai, Kosuke Nakago, Daisuke Nishino, Toru Ogawa, Daisuke Okanohara, Y Ozaki, Shotaro Sano, Shuji Suzuki, Tianqi Xu, Toshihiko Yanase", "abstract": "We introduce PLaMo-100B, a large-scale language model designed for Japanese proficiency. The model was trained from scratch using 2 trillion tokens, with architecture such as QK Normalization and Z-Loss to ensure training stability during the training process. Post-training techniques, including Supervised Fine-Tuning and Direct Preference Optimization, were applied to refine the model's performance. Benchmark evaluations suggest that PLaMo-100B performs well, particularly in Japanese-specific tasks, achieving results that are competitive with frontier models like GPT-4. The base model is available at https://huggingface.co/pfnet/plamo-100b.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.73299789428711, 2.500730276107788], "openalex_id": "https://openalex.org/W4403365238", "title": "Efficiently Learning at Test-Time: Active Fine-Tuning of LLMs", "authors": "Jonas H\u00fcbotter, Sascha Bongni, Ido Hakimi, Andreas Krause", "abstract": "Recent efforts in fine-tuning language models often rely on automatic data selection, commonly using Nearest Neighbors retrieval from large datasets. However, we theoretically show that this approach tends to select redundant data, limiting its effectiveness or even hurting performance. To address this, we introduce SIFT, a data selection algorithm designed to reduce uncertainty about the model's response given a prompt, which unifies ideas from retrieval and active learning. Whereas Nearest Neighbor retrieval typically fails in the presence of information duplication, SIFT accounts for information duplication and optimizes the overall information gain of the selected examples. We focus our evaluations on fine-tuning at test-time for prompt-specific language modeling on the Pile dataset, and show that SIFT consistently outperforms Nearest Neighbor retrieval, with minimal computational overhead. Moreover, we show that our uncertainty estimates can predict the performance gain of test-time fine-tuning, and use this to develop an adaptive algorithm that invests test-time compute proportional to realized performance gains. We provide the $\\texttt{activeft}$ (Active Fine-Tuning) library which can be used as a drop-in replacement for Nearest Neighbor retrieval.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.082808017730713, 5.1938300132751465], "openalex_id": "https://openalex.org/W4403344279", "title": "Aria: An Open Multimodal Native Mixture-of-Experts Model", "authors": "Dongxu Li, Yudong Liu, Haoning Wu, Yue Wang, Zhiqi Shen, Bowen Qu, Xinyao Niu, Guoyin Wang, Bei Chen, Junnan Li", "abstract": "Information comes in diverse modalities. Multimodal native AI models are essential to integrate real-world information and deliver comprehensive understanding. While proprietary multimodal native models exist, their lack of openness imposes obstacles for adoptions, let alone adaptations. To fill this gap, we introduce Aria, an open multimodal native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. Aria is a mixture-of-expert model with 3.9B and 3.5B activated parameters per visual token and text token, respectively. It outperforms Pixtral-12B and Llama3.2-11B, and is competitive against the best proprietary models on various multimodal tasks. We pre-train Aria from scratch following a 4-stage pipeline, which progressively equips the model with strong capabilities in language understanding, multimodal understanding, long context window, and instruction following. We open-source the model weights along with a codebase that facilitates easy adoptions and adaptations of Aria in real-world applications.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.708952903747559, 2.528440237045288], "openalex_id": "https://openalex.org/W4403345766", "title": "Data Selection via Optimal Control for Language Models", "authors": "Yuxian Gu, Li Dong, Hongning Wang, Yaru Hao, Qingxiu Dong, Furu Wei, Minlie Huang", "abstract": "This work investigates the selection of high-quality pre-training data from massive corpora to enhance LMs' capabilities for downstream usage. We formulate data selection as a generalized Optimal Control problem, which can be solved theoretically by Pontryagin's Maximum Principle (PMP), yielding a set of necessary conditions that characterize the relationship between optimal data selection and LM training dynamics. Based on these theoretical results, we introduce PMP-based Data Selection (PDS), a framework that approximates optimal data selection by solving the PMP conditions. In our experiments, we adopt PDS to select data from CommmonCrawl and show that the PDS-selected corpus accelerates the learning of LMs and constantly boosts their performance on a wide range of downstream tasks across various model sizes. Moreover, the benefits of PDS extend to ~400B models trained on ~10T tokens, as evidenced by the extrapolation of the test loss curves according to the Scaling Laws. PDS also improves data utilization when the pre-training data is limited, by reducing the data demand by 1.8 times, which helps mitigate the quick exhaustion of available web-crawled corpora. Our code, model, and data can be found at https://github.com/microsoft/LMOps/tree/main/data_selection.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.279664039611816, 3.7159905433654785], "openalex_id": "https://openalex.org/W4403345223", "title": "TorchTitan: One-stop PyTorch native solution for production ready LLM pre-training", "authors": "Wanchao Liang, Tianyu Liu, Less Wright, Will Constable, Andrew Gu, Chien-Chin Huang, Iris Zhang, Wei Feng, Howard Huang, Junjie Wang, Sanket Purandare, Gokul Nadathur, Stratos Idreos", "abstract": "The development of large language models (LLMs) has been instrumental in advancing state-of-the-art natural language processing applications. Training LLMs with billions of parameters and trillions of tokens require sophisticated distributed systems that enable composing and comparing several state-of-the-art techniques in order to efficiently scale across thousands of accelerators. However, existing solutions are complex, scattered across multiple libraries/repositories, lack interoperability, and are cumbersome to maintain. Thus, curating and empirically comparing training recipes require non-trivial engineering effort. This paper introduces TorchTitan, an open-source, PyTorch-native distributed training system that unifies state-of-the-art techniques, streamlining integration and reducing overhead. TorchTitan enables 3D parallelism in a modular manner with elastic scaling, providing comprehensive logging, checkpointing, and debugging tools for production-ready training. It also incorporates hardware-software co-designed solutions, leveraging features like Float8 training and SymmetricMemory. As a flexible test bed, TorchTitan facilitates custom recipe curation and comparison, allowing us to develop optimized training recipes for Llama 3.1 and provide guidance on selecting techniques for maximum efficiency based on our experiences. We thoroughly assess TorchTitan on the Llama 3.1 family of LLMs, spanning 8 billion to 405 billion parameters, and showcase its exceptional performance, modular composability, and elastic scalability. By stacking training optimizations, we demonstrate accelerations of 65.08% with 1D parallelism at the 128-GPU scale (Llama 3.1 8B), an additional 12.59% with 2D parallelism at the 256-GPU scale (Llama 3.1 70B), and an additional 30% with 3D parallelism at the 512-GPU scale (Llama 3.1 405B) on NVIDIA H100 GPUs over optimized baselines.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.187540054321289, 0.11661001294851303], "openalex_id": "https://openalex.org/W4403347118", "title": "MEXA: Multilingual Evaluation of English-Centric LLMs via Cross-Lingual Alignment", "authors": "Amir Hossein Kargaran, Ali Modarressi, Nafiseh Nikeghbal, Jana Diesner, Fran\u00e7ois Yvon, Hinrich Sch\u00fctze", "abstract": "English-centric large language models (LLMs) often show strong multilingual capabilities. However, their multilingual performance remains unclear and is under-evaluated for many other languages. Most benchmarks for multilinguality focus on classic NLP tasks or cover a minimal number of languages. We introduce MEXA, a method for assessing the multilingual capabilities of pre-trained English-centric LLMs using parallel sentences, which are available for more languages than existing downstream tasks. MEXA leverages that English-centric LLMs use English as a pivot language in their intermediate layers. MEXA computes the alignment between English and non-English languages using parallel sentences to evaluate the transfer of language understanding from English to other languages. This alignment can be used to estimate model performance in different languages. We conduct controlled experiments using various parallel datasets (FLORES-200 and Bible), models (Llama family, Gemma family, Mistral, and OLMo), and established downstream tasks (Belebele, m-MMLU, and m-ARC). We explore different methods to compute embeddings in decoder-only models. Our results show that MEXA, in its default settings, achieves an average Pearson correlation of 0.90 between its predicted scores and actual task performance across languages. This suggests that MEXA is a reliable method for estimating the multilingual capabilities of English-centric LLMs, providing a clearer understanding of their multilingual potential and the inner workings of LLMs. Leaderboard: https://cis-lmu-mexa.hf.space, Code: https://github.com/cisnlp/MEXA.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.756617546081543, 3.683349847793579], "openalex_id": "https://openalex.org/W4403346143", "title": "Falcon Mamba: The First Competitive Attention-free 7B Language Model", "authors": "Jingwei Zuo, Maksim Velikanov, Dhia Eddine Rhaiem, Ilyas Chahed, Younes Belkada, Guillaume Kunsch, Hakim Hacid", "abstract": "In this technical report, we present Falcon Mamba 7B, a new base large language model based on the novel Mamba architecture. Falcon Mamba 7B is trained on 5.8 trillion tokens with carefully selected data mixtures. As a pure Mamba-based model, Falcon Mamba 7B surpasses leading open-weight models based on Transformers, such as Mistral 7B, Llama3.1 8B, and Falcon2 11B. It is on par with Gemma 7B and outperforms models with different architecture designs, such as RecurrentGemma 9B and RWKV-v6 Finch 7B/14B. Currently, Falcon Mamba 7B is the best-performing Mamba model in the literature at this scale, surpassing both existing Mamba and hybrid Mamba-Transformer models, according to the Open LLM Leaderboard. Due to its architecture, Falcon Mamba 7B is significantly faster at inference and requires substantially less memory for long sequence generation. Despite recent studies suggesting that hybrid Mamba-Transformer models outperform pure architecture designs, we demonstrate that even the pure Mamba design can achieve similar, or even superior results compared to the Transformer and hybrid designs. We make the weights of our implementation of Falcon Mamba 7B publicly available on https://huggingface.co/tiiuae/falcon-mamba-7b, under a permissive license.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.8848371505737305, 2.064805746078491], "openalex_id": "https://openalex.org/W4403363019", "title": "DecorateLM: Data Engineering through Corpus Rating, Tagging, and Editing with Language Models", "authors": "Ranchi Zhao, Zhen Leng Thai, Yifan Zhang, Shengding Hu, Yunqi Ba, Jie Zhou, Jie Cai, Zhiyuan Liu, Maosong Sun", "abstract": "The performance of Large Language Models (LLMs) is substantially influenced by the pretraining corpus, which consists of vast quantities of unsupervised data processed by the models. Despite its critical role in model performance, ensuring the quality of this data is challenging due to its sheer volume and the absence of sample-level quality annotations and enhancements. In this paper, we introduce DecorateLM, a data engineering method designed to refine the pretraining corpus through data rating, tagging and editing. Specifically, DecorateLM rates texts against quality criteria, tags texts with hierarchical labels, and edits texts into a more formalized format. Due to the massive size of the pretraining corpus, adopting an LLM for decorating the entire corpus is less efficient. Therefore, to balance performance with efficiency, we curate a meticulously annotated training corpus for DecorateLM using a large language model and distill data engineering expertise into a compact 1.2 billion parameter small language model (SLM). We then apply DecorateLM to enhance 100 billion tokens of the training corpus, selecting 45 billion tokens that exemplify high quality and diversity for the further training of another 1.2 billion parameter LLM. Our results demonstrate that employing such high-quality data can significantly boost model performance, showcasing a powerful approach to enhance the quality of the pretraining corpus.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.134188175201416, 2.472787857055664], "openalex_id": "https://openalex.org/W4403936413", "title": "BloomWise: Enhancing problem-solving capabilities of LLMs using Bloom's-Taxonomy-inspired prompts", "authors": "Maria-Eleni Zoumpoulidi, Georgios Paraskevopoulos, Alexandros Potamianos", "abstract": "Despite the remarkable capabilities of large language models (LLMs) across a range of tasks, mathematical reasoning remains a challenging frontier. Motivated by the observation that humans learn more effectively when prompted not what to think but how to think, we introduce BloomWise, a cognitively-inspired prompting technique designed to enhance LLMs' performance on mathematical problem solving while making their solutions more explainable. BloomWise encourages LLMs to generate solutions - in the form of explanations - by progressing through a sequence of cognitive operations-from basic (e.g., remembering) to more advanced reasoning skills (e.g., evaluating) - mirroring how humans build understanding. The process iterates through these levels, halting early if a convergence criterion is met: specifically, if two or more consecutive levels yield the same answer, the solution from the earliest such level is output; otherwise, the process continues until all levels are completed. Through extensive experiments across five popular math reasoning datasets, we demonstrate the effectiveness of BloomWise. We also present comprehensive ablation studies to analyze the strengths of each component within our system.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.626015663146973, 2.3412346839904785], "openalex_id": "https://openalex.org/W4403322628", "title": "Language Model-Driven Data Pruning Enables Efficient Active Learning", "authors": "Abdul Hameed Azeemi, Ihsan Ayyub Qazi, Agha Ali Raza", "abstract": "Active learning (AL) optimizes data labeling efficiency by selecting the most informative instances for annotation. A key component in this procedure is an acquisition function that guides the selection process and identifies the suitable instances for labeling from the unlabeled pool. However, these acquisition methods suffer from high computational costs with large unlabeled data pools, posing a roadblock to their applicability on large datasets. To address this challenge and bridge this gap, we introduce a novel plug-and-play unlabeled data pruning strategy, ActivePrune, which leverages language models to prune the unlabeled pool. ActivePrune implements a two-stage pruning process: an initial fast evaluation using perplexity scores from an n-gram language model, followed by a high-quality selection using metrics for data quality computed through a quantized LLM. Additionally, to enhance the diversity in the unlabeled pool, we propose a novel perplexity reweighting method that systematically brings forward underrepresented instances for selection in subsequent labeling iterations. Experiments on translation, sentiment analysis, topic classification, and summarization tasks on four diverse datasets and four active learning strategies demonstrate that ActivePrune outperforms existing data pruning methods. Finally, we compare the selection quality $\\leftrightarrow$ efficiency tradeoff of the data pruning methods and demonstrate that ActivePrune is computationally more efficient than other LLM score-based pruning methods, and provides up to 74% reduction in the end-to-end time required for active learning.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.6901283264160156, 1.6009082794189453], "openalex_id": "https://openalex.org/W4403214714", "title": "Evaluating the Performance of Topic Modeling Techniques with Human Validation to Support Qualitative Analysis", "authors": "J. Romero, Miguel A. Feijoo\u2010Garcia, Gaurav Nanda, Brittany Newell, Alejandra J. Magana", "abstract": "Examining the effectiveness of machine learning techniques in analyzing engineering students\u2019 decision-making processes through topic modeling during simulation-based design tasks is crucial for advancing educational methods and tools. Thus, this study presents a comparative analysis of different supervised and unsupervised machine learning techniques for topic modeling, along with human validation. Hence, this manuscript contributes by evaluating the effectiveness of these techniques in identifying nuanced topics within the argumentation framework and improving computational methods for assessing students\u2019 abilities and performance levels based on their informed decisions. This study examined the decision-making processes of engineering students as they participated in a simulation-based design challenge. During this task, students were prompted to use an argumentation framework to articulate their claims, evidence, and reasoning, by recording their informed design decisions in a design journal. This study combined qualitative and computational methods to analyze the students\u2019 design journals and ensured the accuracy of the findings through the researchers\u2019 review and interpretations of the results. Different machine learning models, including random forest, SVM, and K-nearest neighbors (KNNs), were tested for multilabel regression, using preprocessing techniques such as TF-IDF, GloVe, and BERT embeddings. Additionally, hyperparameter optimization and model interpretability were explored, along with models like RNNs with LSTM, XGBoost, and LightGBM. The results demonstrate that both supervised and unsupervised machine learning models effectively identified nuanced topics within the argumentation framework used during the design challenge of designing a zero-energy home for a Midwestern city using a CAD/CAE simulation platform. Notably, XGBoost exhibited superior predictive accuracy in estimating topic proportions, highlighting its potential for broader application in engineering education.", "venue": "Big Data and Cognitive Computing", "label": 0}, {"loc": [8.500388145446777, 2.434804677963257], "openalex_id": "https://openalex.org/W4403322858", "title": "Rule-based Data Selection for Large Language Models", "authors": "Xiaomin Li, Mingye Gao, Zhiwei Zhang, Chang Yue, Hong Hu", "abstract": "The quality of training data significantly impacts the performance of large language models (LLMs). There are increasing studies using LLMs to rate and select data based on several human-crafted metrics (rules). However, these conventional rule-based approaches often depend too heavily on human heuristics, lack effective metrics for assessing rules, and exhibit limited adaptability to new tasks. In our study, we introduce an innovative rule-based framework that utilizes the orthogonality of score vectors associated with rules as a novel metric for rule evaluations. Our approach includes an automated pipeline that first uses LLMs to generate a diverse set of rules, encompassing various rating dimensions to evaluate data quality. Then it rates a batch of data based on these rules and uses the determinantal point process (DPP) from random matrix theory to select the most orthogonal score vectors, thereby identifying a set of independent rules. These rules are subsequently used to evaluate all data, selecting samples with the highest average scores for downstream tasks such as LLM training. We verify the effectiveness of our method through two experimental setups: 1) comparisons with ground truth ratings and 2) benchmarking LLMs trained with the chosen data. Our comprehensive experiments cover a range of scenarios, including general pre-training and domain-specific fine-tuning in areas such as IMDB, Medical, Math, and Code. The outcomes demonstrate that our DPP-based rule rating method consistently outperforms other approaches, including rule-free rating, uniform sampling, importance resampling, and QuRating, in terms of both rating precision and model performance.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.518965244293213, 0.6076438426971436], "openalex_id": "https://openalex.org/W4403964501", "title": "SkillMatch: Evaluating Self-supervised Learning of Skill Relatedness", "authors": "Jens-Joris Decorte, Jeroen Van Hautte, Thomas Demeester, Chris Develder", "abstract": "Accurately modeling the relationships between skills is a crucial part of human resources processes such as recruitment and employee development. Yet, no benchmarks exist to evaluate such methods directly. We construct and release SkillMatch, a benchmark for the task of skill relatedness, based on expert knowledge mining from millions of job ads. Additionally, we propose a scalable self-supervised learning technique to adapt a Sentence-BERT model based on skill co-occurrence in job ads. This new method greatly surpasses traditional models for skill relatedness as measured on SkillMatch. By releasing SkillMatch publicly, we aim to contribute a foundation for research towards increased accuracy and transparency of skill-based recommendation systems.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.372273921966553, 0.9684619307518005], "openalex_id": "https://openalex.org/W4403963934", "title": "SWEb: A Large Web Dataset for the Scandinavian Languages", "authors": "Tobias Norlund, Tim Isbister, Amaru Cuba Gyllensten, Patr\u00edcia Santos, Daniela Petrelli, Ariel Ekgren, Magnus Sahlgren", "abstract": "This paper presents the hitherto largest pretraining dataset for the Scandinavian languages: the Scandinavian WEb (SWEb), comprising over one trillion tokens. The paper details the collection and processing pipeline, and introduces a novel model-based text extractor that significantly reduces complexity in comparison with rule-based approaches. We also introduce a new cloze-style benchmark for evaluating language models in Swedish, and use this test to compare models trained on the SWEb data to models trained on FineWeb, with competitive results. All data, models and code are shared openly.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.490588665008545, 5.289163589477539], "openalex_id": "https://openalex.org/W4403924796", "title": "TextHawk2: A Large Vision-Language Model Excels in Bilingual OCR and Grounding with 16x Fewer Tokens", "authors": "Yaqi Yu, Minghui Liao, Jiwen Zhang, Jihao Wu", "abstract": "Reading dense text and locating objects within images are fundamental abilities for Large Vision-Language Models (LVLMs) tasked with advanced jobs. Previous LVLMs, including superior proprietary models like GPT-4o, have struggled to excel in both tasks simultaneously. Moreover, previous LVLMs with fine-grained perception cost thousands of tokens per image, making them resource-intensive. We present TextHawk2, a bilingual LVLM featuring efficient fine-grained perception and demonstrating cutting-edge performance across general-purpose, OCR, and grounding tasks with 16 times fewer image tokens. Critical improvements include: (1) Token Compression: Building on the efficient architecture of its predecessor, TextHawk2 significantly reduces the number of tokens per image by 16 times, facilitating training and deployment of the TextHawk series with minimal resources. (2) Visual Encoder Reinforcement: We enhance the visual encoder through LVLM co-training, unlocking its potential for previously unseen tasks like Chinese OCR and grounding. (3) Data Diversity: We maintain a comparable scale of 100 million samples while diversifying the sources of pre-training data. We assess TextHawk2 across multiple benchmarks, where it consistently delivers superior performance and outperforms closed-source models of similar scale, such as achieving 78.4% accuracy on OCRBench, 81.4% accuracy on ChartQA, 89.6% ANLS on DocVQA, and 88.1% accuracy@0.5 on RefCOCOg-test.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.566420078277588, 4.737728595733643], "openalex_id": "https://openalex.org/W4403324160", "title": "Navigating the Digital World as Humans Do: Universal Visual Grounding for GUI Agents", "authors": "Boyu Gou, Ruohan Wang, Boyuan Zheng, Yanan Xie, Cheng Chang, Yiheng Shu, Huan Sun, Yu Su", "abstract": "Multimodal large language models (MLLMs) are transforming the capabilities of graphical user interface (GUI) agents, facilitating their transition from controlled simulations to complex, real-world applications across various platforms. However, the effectiveness of these agents hinges on the robustness of their grounding capability. Current GUI agents predominantly utilize text-based representations such as HTML or accessibility trees, which, despite their utility, often introduce noise, incompleteness, and increased computational overhead. In this paper, we advocate a human-like embodiment for GUI agents that perceive the environment entirely visually and directly perform pixel-level operations on the GUI. The key is visual grounding models that can accurately map diverse referring expressions of GUI elements to their coordinates on the GUI across different platforms. We show that a simple recipe, which includes web-based synthetic data and slight adaptation of the LLaVA architecture, is surprisingly effective for training such visual grounding models. We collect the largest dataset for GUI visual grounding so far, containing 10M GUI elements and their referring expressions over 1.3M screenshots, and use it to train UGround, a strong universal visual grounding model for GUI agents. Empirical results on six benchmarks spanning three categories (grounding, offline agent, and online agent) show that 1) UGround substantially outperforms existing visual grounding models for GUI agents, by up to 20% absolute, and 2) agents with UGround outperform state-of-the-art agents, despite the fact that existing agents use additional text-based input while ours only uses visual perception. These results provide strong support for the feasibility and promises of GUI agents that navigate the digital world as humans do.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.129141807556152, -2.478853702545166], "openalex_id": "https://openalex.org/W4403884743", "title": "Efficiently Identifying Watermarked Segments in Mixed-Source Texts", "authors": "Xuandong Zhao, Chang Liao, Yuxiang Wang, Lei Li", "abstract": "Text watermarks in large language models (LLMs) are increasingly used to detect synthetic text, mitigating misuse cases like fake news and academic dishonesty. While existing watermarking detection techniques primarily focus on classifying entire documents as watermarked or not, they often neglect the common scenario of identifying individual watermark segments within longer, mixed-source documents. Drawing inspiration from plagiarism detection systems, we propose two novel methods for partial watermark detection. First, we develop a geometry cover detection framework aimed at determining whether there is a watermark segment in long text. Second, we introduce an adaptive online learning algorithm to pinpoint the precise location of watermark segments within the text. Evaluated on three popular watermarking techniques (KGW-Watermark, Unigram-Watermark, and Gumbel-Watermark), our approach achieves high accuracy, significantly outperforming baseline methods. Moreover, our framework is adaptable to other watermarking techniques, offering new insights for precise watermark detection. Our code is publicly available at https://github.com/XuandongZhao/llm-watermark-location", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.682534217834473, -0.7291839122772217], "openalex_id": "https://openalex.org/W4403885788", "title": "Parallel Corpus Augmentation using Masked Language Models", "authors": "Vibhuti Kumari, Narayana Murthy Kavi", "abstract": "In this paper we propose a novel method of augmenting parallel text corpora which promises good quality and is also capable of producing many fold larger corpora than the seed corpus we start with. We do not need any additional monolingual corpora. We use Multi-Lingual Masked Language Model to mask and predict alternative words in context and we use Sentence Embeddings to check and select sentence pairs which are likely to be translations of each other. We cross check our method using metrics for MT Quality Estimation. We believe this method can greatly alleviate the data scarcity problem for all language pairs for which a reasonable seed corpus is available.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.701170921325684, 3.9350125789642334], "openalex_id": "https://openalex.org/W4403884615", "title": "No Need to Talk: Asynchronous Mixture of Language Models", "authors": "Anastasiia Filippova, Angelos Katharopoulos, David Grangier, Ronan Collobert", "abstract": "We introduce SMALLTALK LM, an innovative method for training a mixture of language models in an almost asynchronous manner. Each model of the mixture specializes in distinct parts of the data distribution, without the need for high-bandwidth communication between the nodes training each model. At inference, a lightweight router directs a given sequence to a single expert, according to a short prefix. This inference scheme naturally uses a fraction of the parameters from the overall mixture model. Unlike prior works on asynchronous LLM training, our routing method does not rely on full corpus clustering or access to metadata, making it more suitable for real-world applications. Our experiments on language modeling demonstrate that SMALLTALK LM achieves significantly lower perplexity than dense model baselines for the same total training FLOPs and an almost identical inference cost. Finally, in our downstream evaluations we outperform the dense baseline on 75% of the tasks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.212780714035034, -0.3656105399131775], "openalex_id": "https://openalex.org/W4403087472", "title": "Applications and Concerns of ChatGPT and Other Conversational Large Language Models in Health Care: Systematic Review", "authors": "Leyao Wang, Zhiyu Wan, Congning Ni, Qingyuan Song, Yang Li, Ellen Wright Clayton, Bradley Malin, Zhijun Yin", "abstract": "Background The launch of ChatGPT (OpenAI) in November 2022 attracted public attention and academic interest to large language models (LLMs), facilitating the emergence of many other innovative LLMs. These LLMs have been applied in various fields, including health care. Numerous studies have since been conducted regarding how to use state-of-the-art LLMs in health-related scenarios. Objective This review aims to summarize applications of and concerns regarding conversational LLMs in health care and provide an agenda for future research in this field. Methods We used PubMed, ACM, and the IEEE digital libraries as primary sources for this review. We followed the guidance of PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) to screen and select peer-reviewed research articles that (1) were related to health care applications and conversational LLMs and (2) were published before September 1, 2023, the date when we started paper collection. We investigated these papers and classified them according to their applications and concerns. Results Our search initially identified 820 papers according to targeted keywords, out of which 65 (7.9%) papers met our criteria and were included in the review. The most popular conversational LLM was ChatGPT (60/65, 92% of papers), followed by Bard (Google LLC; 1/65, 2% of papers), LLaMA (Meta; 1/65, 2% of papers), and other LLMs (6/65, 9% papers). These papers were classified into four categories of applications: (1) summarization, (2) medical knowledge inquiry, (3) prediction (eg, diagnosis, treatment recommendation, and drug synergy), and (4) administration (eg, documentation and information collection), and four categories of concerns: (1) reliability (eg, training data quality, accuracy, interpretability, and consistency in responses), (2) bias, (3) privacy, and (4) public acceptability. There were 49 (75%) papers using LLMs for either summarization or medical knowledge inquiry, or both, and there are 58 (89%) papers expressing concerns about either reliability or bias, or both. We found that conversational LLMs exhibited promising results in summarization and providing general medical knowledge to patients with a relatively high accuracy. However, conversational LLMs such as ChatGPT are not always able to provide reliable answers to complex health-related tasks (eg, diagnosis) that require specialized domain expertise. While bias or privacy issues are often noted as concerns, no experiments in our reviewed papers thoughtfully examined how conversational LLMs lead to these issues in health care research. Conclusions Future studies should focus on improving the reliability of LLM applications in complex health-related tasks, as well as investigating the mechanisms of how LLM applications bring bias and privacy issues. Considering the vast accessibility of LLMs, legal, social, and technical efforts are all needed to address concerns about LLMs to promote, improve, and regularize the application of LLMs in health care.", "venue": "Journal of Medical Internet Research", "label": 13}, {"loc": [2.6887917518615723, 2.87481689453125], "openalex_id": "https://openalex.org/W4403882598", "title": "Mitigating Downstream Model Risks via Model Provenance", "authors": "Keyu Wang, Abdullah Norozi Iranzad, Scott Schaffter, Doina Precup, Jonathan Lebensold", "abstract": "Research and industry are rapidly advancing the innovation and adoption of foundation model-based systems, yet the tools for managing these models have not kept pace. Understanding the provenance and lineage of models is critical for researchers, industry, regulators, and public trust. While model cards and system cards were designed to provide transparency, they fall short in key areas: tracing model genealogy, enabling machine readability, offering reliable centralized management systems, and fostering consistent creation incentives. This challenge mirrors issues in software supply chain security, but AI/ML remains at an earlier stage of maturity. Addressing these gaps requires industry-standard tooling that can be adopted by foundation model publishers, open-source model innovators, and major distribution platforms. We propose a machine-readable model specification format to simplify the creation of model records, thereby reducing error-prone human effort, notably when a new model inherits most of its design from a foundation model. Our solution explicitly traces relationships between upstream and downstream models, enhancing transparency and traceability across the model lifecycle. To facilitate the adoption, we introduce the unified model record (UMR) repository, a semantically versioned system that automates the publication of model records to multiple formats (PDF, HTML, LaTeX) and provides a hosted web interface (https://modelrecord.com/). This proof of concept aims to set a new standard for managing foundation models, bridging the gap between innovation and responsible model management.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.319314956665039, 2.378612518310547], "openalex_id": "https://openalex.org/W4403884285", "title": "Neutral residues: revisiting adapters for model extension", "authors": "Franck Signe Talla, Herv\u00e9 Je\u01f5ou, \u00c9douard Grave", "abstract": "We address the problem of extending a pretrained large language model to a new domain that was not seen during training. Standard techniques, such as finetuning or low-rank adaptation (LoRA) are successful at domain adaptation, but do not formally add capacity to the model. This often leads to a trade-off, between performing well on the new domain vs. degrading performance on the original domain. Here, we revisit and improve adapters to extend LLMs from three angles: data, architecture and training procedure, which are advantageously considered jointly. The resulting method, called neutral residues, modifies adapters in a way that leads each new residual block to output near-zeros on the original domain. This solution leads to strong results when adapting a state-of-the-art model originally trained on English to a new language. Neutral residues significantly outperform competing approaches such as finetuning, LoRA or vanilla adapters in terms of the trade-off between learning the new language and not forgetting English.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.662118673324585, 4.505002498626709], "openalex_id": "https://openalex.org/W4403132916", "title": "Towards Secure and Privacy-Preserving Machine Learning Systems", "authors": "Snehlata Mishra, Dr.Ritu Tandon", "abstract": "The rise of artificial intelligence (AI) in healthcare has created opportunities for advanced predictive models and personalized treatments, yet the sensitive nature of medical data presents significant challenges in terms of privacy, security, and regulatory compliance. Federated Learning (FL) has emerged as a promising solution to these issues, enabling decentralized machine learning across distributed datasets while preserving data privacy. This paper explores the application of FL in the healthcare domain, highlighting its potential to unlock valuable medical insights without the need for centralized data aggregation. We examine the technical architecture of federated learning, its privacy-preserving mechanisms such as differential privacy and secure multiparty computation, and the challenges of ensuring model accuracy and generalizability across diverse healthcare settings. Key case studies are reviewed to illustrate the practical benefits of FL in clinical data analysis, disease prediction, and personalized medicine. Additionally, this paper addresses current limitations; including communication overhead, model heterogeneity, and regulatory barriers, while proposing future directions for enhancing the scalability and adoption of federated learning in healthcare systems. By fostering collaborative intelligence without compromising data confidentiality, federated learning represents a critical step towards more secure, efficient, and equitable healthcare solutions. Keywords: Federated Learning (FL), Healthcare, Decentralized Machine Learning.", "venue": "INTERANTIONAL JOURNAL OF SCIENTIFIC RESEARCH IN ENGINEERING AND MANAGEMENT", "label": 0}, {"loc": [3.250751495361328, 1.5615243911743164], "openalex_id": "https://openalex.org/W4403883318", "title": "Exploring Gen-AI applications in building research and industry: A review", "authors": "Honglin Wan, Jian Zhang, Yan Chen, Weili Xu, Fan Feng", "abstract": "This paper investigates the transformative potential of Generative AI (Gen-AI) technologies, particularly large language models, within the building industry. By leveraging these advanced AI tools, the study explores their application across key areas such as automated compliance checking and building design assistance. The research highlights how Gen-AI can automate labor-intensive processes, significantly improving efficiency and reducing costs in building practices. The paper first discusses the two widely applied fundamental models-Transformer and Diffusion model-and summarizes current pathways for accessing Gen-AI models and the most common techniques for customizing them. It then explores applications for text generation, such as compliance checking, control support, data mining, and building simulation input file editing. Additionally, it examines image generation, including direct generation through diffusion models and indirect generation through language model-supported template creation based on existing Computer-Aided Design or other design tools with rendering. The paper concludes with a comprehensive analysis of the current capabilities of Gen-AI in the building industry, outlining future directions for research and development, with the goal of paving the way for smarter, more effective, and responsive design, construction, and operational practices.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.697887420654297, -0.9729100465774536], "openalex_id": "https://openalex.org/W4403818789", "title": "EEG Emotion Copilot: Pruning LLMs for Emotional EEG Interpretation with Assisted Medical Record Generation", "authors": "Hongyu Chen, Weiming Zeng, Chengcheng Chen, Luhui Cai, Fei Wang, Lei Wang, Wei Zhang, Yueyang Li, Hongjie Yan, Wai Ting Siok, Nizhuan Wang", "abstract": "In the fields of affective computing (AC) and brain-machine interface (BMI), the analysis of physiological and behavioral signals to discern individual emotional states has emerged as a critical research frontier. While deep learning-based approaches have made notable strides in EEG emotion recognition, particularly in feature extraction and pattern recognition, significant challenges persist in achieving end-to-end emotion computation, including real-time processing, individual adaptation, and seamless user interaction. This paper presents the EEG Emotion Copilot, a system optimizing a lightweight large language model (LLM) with 0.5B parameters operating in a local setting, which first recognizes emotional states directly from EEG signals, subsequently generates personalized diagnostic and treatment suggestions, and finally supports the automation of assisted electronic medical records. Specifically, we demonstrate the critical techniques in the novel data structure of prompt, model pruning and fine-tuning training, and deployment strategies aiming at improving real-time performance and computational efficiency. Extensive experiments show that our optimized lightweight LLM-based copilot achieves an enhanced intuitive interface for participant interaction, superior accuracy of emotion recognition and assisted electronic medical records generation, in comparison to such models with similar scale parameters or large-scale parameters such as 1.5B, 1.8B, 3B and 7B. In summary, through these efforts, the proposed copilot is expected to advance the application of AC in the medical domain, offering innovative solution to mental health monitoring. The codes will be released at https://github.com/NZWANG/EEG_Emotion_Copilot.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.80236291885376, 0.3984815180301666], "openalex_id": "https://openalex.org/W4403883071", "title": "Moshi: a speech-text foundation model for real-time dialogue", "authors": "Alexandre D\u00e9fossez, Laurent Mazar\u00e9, Manu Orsini, Am\u00e9lie Royer, Patrick P\u00e9rez, Herv\u00e9 Je\u01f5ou, \u00c9douard Grave, Neil Zeghidour", "abstract": "We introduce Moshi, a speech-text foundation model and full-duplex spoken dialogue framework. Current systems for spoken dialogue rely on pipelines of independent components, namely voice activity detection, speech recognition, textual dialogue and text-to-speech. Such frameworks cannot emulate the experience of real conversations. First, their complexity induces a latency of several seconds between interactions. Second, text being the intermediate modality for dialogue, non-linguistic information that modifies meaning -- such as emotion or non-speech sounds -- is lost in the interaction. Finally, they rely on a segmentation into speaker turns, which does not take into account overlapping speech, interruptions and interjections. Moshi solves these independent issues altogether by casting spoken dialogue as speech-to-speech generation. Starting from a text language model backbone, Moshi generates speech as tokens from the residual quantizer of a neural audio codec, while modeling separately its own speech and that of the user into parallel streams. This allows for the removal of explicit speaker turns, and the modeling of arbitrary conversational dynamics. We moreover extend the hierarchical semantic-to-acoustic token generation of previous work to first predict time-aligned text tokens as a prefix to audio tokens. Not only this \"Inner Monologue\" method significantly improves the linguistic quality of generated speech, but we also illustrate how it can provide streaming speech recognition and text-to-speech. Our resulting model is the first real-time full-duplex spoken large language model, with a theoretical latency of 160ms, 200ms in practice, and is available at https://github.com/kyutai-labs/moshi.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.377707481384277, 1.9096064567565918], "openalex_id": "https://openalex.org/W4403346055", "title": "DoPAMine: Domain-specific Pre-training Adaptation from seed-guided data Mining", "authors": "Vinayak Arannil, Neha Narwal, Sourav Sanjukta Bhabesh, Sai Nikhil Thirandas, Darren Yow-Bang Wang, Graham Horwood, Alex Anto Chirayath, Gouri Pandeshwar", "abstract": "Large Language Models (LLMs) have shown remarkable ability to generalize effectively across numerous industry domains while executing a range of tasks. Many of these competencies are obtained from the data utilized during the pre-training phase of the Language Models (LMs). However, these models exhibit limitations when tasked with performing in specialized or low-resource industry domains. More recent approaches use LLMs for generating domain-specific synthetic data but most often they lack in truthfulness and complexity. Alternatively, in cases where domain data is available like healthcare and finance most of the LMs are proprietary necessitating the need for a scalable method to curate real world industry specific pre-training data. In this work, we propose an automated and scalable framework - DoPAMine:Domain-specific Pre-training Adaptation from seed-guided data Mining, to mine domain specific training data from a large data corpus for domain adaptation of a LM. The framework leverages the parametric knowledge of a LLM to generate diverse and representative seed data tailored to a specific domain which is then used to mine real world data from a large data corpus like Common Crawl. We evaluated our framework's performance in the continual pre-training (CPT) setting by training two domain specific 7B parameter LMs in healthcare and finance with data mined via DoPAMine. Our experiments show that DoPAMine boosts the performance of pre-trained LLMs on average by 4.9% and 5.1% in zero-shot and 5-shot settings respectively on healthcare tasks from MMLU, MedQA, MedMCQA and PubMedQA datasets, and 2.9% and 6.7% for zero-shot and 5-shot settings respectively on finance tasks from FiQA-SA, FPB and Headlines datasets when compared to the baseline.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.3120856285095215, 1.827460765838623], "openalex_id": "https://openalex.org/W4403259265", "title": "GenAI Advertising: Risks of Personalizing Ads with LLMs", "authors": "Sandra L. Borden, Llu\u00eds Codina, Mar\u00eda Jos\u00e9 Ufarte Ruiz", "abstract": "We present the last part of this special issue on Use of Artificial Intelligence in Communication: Ethical Implications for Media. This editorial examines the role that humans should play in the responsible use of generative artificial intelligence (GenAI) in the media. It provides an overview of transparency as an ethical obligation in relation to trust and truthfulness when using GenAI to create different kinds of content. Finally, it enters into the debate about the moral status of machines as we anticipate ethical questions in the study of ethics and GenAI in the media. Six articles complete this special issue with studies examining implications of GenAI for semiotics research, public relations, advertising, fact checking, information literacy and journalism education.", "venue": "Communication & Society", "label": 0}, {"loc": [3.920215606689453, -1.390249490737915], "openalex_id": "https://openalex.org/W4403087110", "title": "Leveraging investments, promoting transparency and mobilising communities: a qualitative analysis of news articles about how the Ebola outbreak informed COVID \u2026", "authors": "Lauren Courtney, Manon Billaud, Alex Paulenich, Robert Chew, Zainab Alidina, Meredith Pinto", "abstract": "Background The WHO declared the novel COVID-19 outbreak a pandemic in March 2020. While the COVID-19 pandemic was unprecedented, prior experiences with diseases such as Middle East respiratory syndrome, severe acute respiratory syndrome and Ebola shaped many countries\u2019 preparedness and response strategies. Although lessons learnt from outbreak responses have been documented from a variety of sources, news media play a special role through their dissemination of news to the general public. This study investigated news media to explore how lessons learnt from the West African Ebola outbreak in 2014\u20132016 informed the COVID-19 responses in several African countries. Methods We conducted qualitative analysis on a dataset of previously compiled COVID-19-related news articles published from 1 March 2020 to 31 August 2020. This dataset included 34,225 articles from 6 countries. We filtered the dataset to only include articles with the keyword \u2018Ebola\u2019. We used a machine-learning text classification model to identify relevant articles with clear and specific lessons learnt. We conducted inductive and deductive coding to categorise lessons learnt and identify emergent themes. Results Of the 861 articles containing the word \u2018Ebola\u2019, 18.4% (N=158) with lessons learnt from Ebola were included across five of the countries: Ethiopia, Ghana, Kenya, Liberia and Sierra Leone. News articles highlighted three emergent themes: the importance of leveraging existing resources and past response system investments, promoting transparency in public health messaging and engaging community leaders in all phases of the response. Conclusions Findings suggest fostering trust prior to and throughout an outbreak facilitates timely implementation and compliance of mitigation strategies. Trust can be built by leveraging existing resources, being communicative and transparent about their funding allocation and decision-making and engaging communities.", "venue": "BMJ Global Health", "label": 0}, {"loc": [3.012269973754883, -0.7142074108123779], "openalex_id": "https://openalex.org/W4403012771", "title": "MeSH2Matrix: Combining MeSH keywords and machine learning for biomedical relation classification based on PubMed", "authors": "Houcemeddine Turki, Bonaventure F. P. Dossou, Chris Chinenye Emezue, Abraham Toluwase Owodunni, Mohamed Ali Hadj Taieb, Mohamed Ben Aouicha, Hanen Ben Hassen, Afif Masmoudi", "abstract": "Biomedical relation classification has been significantly improved by the application of advanced machine learning techniques on the raw texts of scholarly publications. Despite this improvement, the reliance on large chunks of raw text makes these algorithms suffer in terms of generalization, precision, and reliability. The use of the distinctive characteristics of bibliographic metadata can prove effective in achieving better performance for this challenging task. In this research paper, we introduce an approach for biomedical relation classification using the qualifiers of co-occurring Medical Subject Headings (MeSH). First of all, we introduce MeSH2Matrix, our dataset consisting of 46,469 biomedical relations curated from PubMed publications using our approach. Our dataset includes a matrix that maps associations between the qualifiers of subject MeSH keywords and those of object MeSH keywords. It also specifies the corresponding Wikidata relation type and the superclass of semantic relations for each relation. Using MeSH2Matrix, we build and train three machine learning models (Support Vector Machine [SVM], a dense model [D-Model], and a convolutional neural network [C-Net]) to evaluate the efficiency of our approach for biomedical relation classification. Our best model achieves an accuracy of 70.78% for 195 classes and 83.09% for five superclasses. Finally, we provide confusion matrix and extensive feature analyses to better examine the relationship between the MeSH qualifiers and the biomedical relations being classified. Our results will hopefully shed light on developing better algorithms for biomedical ontology classification based on the MeSH keywords of PubMed publications. For reproducibility purposes, MeSH2Matrix, as well as all our source codes, are made publicly accessible at https://github.com/SisonkeBiotik-Africa/MeSH2Matrix.", "venue": "Journal of Biomedical Semantics", "label": 0}, {"loc": [6.2545695304870605, 2.344958782196045], "openalex_id": "https://openalex.org/W4403814318", "title": "Can Models Learn Skill Composition from Examples?", "authors": "Haoyu Zhao, Simran Kaur, Dingli Yu, Anirudh Goyal, Sanjeev Arora", "abstract": "As large language models (LLMs) become increasingly advanced, their ability to exhibit compositional generalization -- the capacity to combine learned skills in novel ways not encountered during training -- has garnered significant attention. This type of generalization, particularly in scenarios beyond training data, is also of great interest in the study of AI safety and alignment. A recent study introduced the SKILL-MIX evaluation, where models are tasked with composing a short paragraph demonstrating the use of a specified $k$-tuple of language skills. While small models struggled with composing even with $k=3$, larger models like GPT-4 performed reasonably well with $k=5$ and $6$. In this paper, we employ a setup akin to SKILL-MIX to evaluate the capacity of smaller models to learn compositional generalization from examples. Utilizing a diverse set of language skills -- including rhetorical, literary, reasoning, theory of mind, and common sense -- GPT-4 was used to generate text samples that exhibit random subsets of $k$ skills. Subsequent fine-tuning of 7B and 13B parameter models on these combined skill texts, for increasing values of $k$, revealed the following findings: (1) Training on combinations of $k=2$ and $3$ skills results in noticeable improvements in the ability to compose texts with $k=4$ and $5$ skills, despite models never having seen such examples during training. (2) When skill categories are split into training and held-out groups, models significantly improve at composing texts with held-out skills during testing despite having only seen training skills during fine-tuning, illustrating the efficacy of the training approach even with previously unseen skills. This study also suggests that incorporating skill-rich (potentially synthetic) text into training can substantially enhance the compositional capabilities of models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.031555652618408, -1.737882137298584], "openalex_id": "https://openalex.org/W4405007247", "title": "What Does \u201cPalliative\u201d Mean? Sentiment, Knowledge, and Public Perception Concerning Palliative Care on the Internet since the COVID-19 Pandemic", "authors": "Joachim Peters, Maria Heckel, Eva Breindl, Christoph Ostgathe", "abstract": "The impact of the COVID-19 pandemic on public discussion on social media continues to persist even in 2024. Insights from online NLP analysis helped to determine the image of palliative care in the Internet discourse and can help find ways to react to certain trends such as the spread of negative attitudes and misconceptions.", "venue": "Palliative Medicine Reports", "label": 0}, {"loc": [7.213364124298096, -0.052175529301166534], "openalex_id": "https://openalex.org/W4403796797", "title": "EMMA-500: Enhancing Massively Multilingual Adaptation of Large Language Models", "authors": "Shaoxiong Ji, Zihao Li, Indraneil Paul, Jouni Paavola, Peiqin Lin, Pinzhen Chen, Dayy\u00e1n O\u2019Brien, Hengyu Luo, Hinrich Sch\u00fctze, J\u00f6rg Tiedemann, Barry Haddow", "abstract": "In this work, we introduce EMMA-500, a large-scale multilingual language model continue-trained on texts across 546 languages designed for enhanced multilingual performance, focusing on improving language coverage for low-resource languages. To facilitate continual pre-training, we compile the MaLA corpus, a comprehensive multilingual dataset enriched with curated datasets across diverse domains. Leveraging this corpus, we conduct extensive continual pre-training of the Llama 2 7B model, resulting in EMMA-500, which demonstrates robust performance across a wide collection of benchmarks, including a comprehensive set of multilingual tasks. Our results highlight the effectiveness of continual pre-training in expanding large language models' language capacity, particularly for underrepresented languages, demonstrating significant gains in cross-lingual transfer, task generalization, and language adaptability. We release the MaLA corpus, EMMA-500 model weights, scripts, and model generations.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.51393985748291, 3.7875587940216064], "openalex_id": "https://openalex.org/W4403795566", "title": "The poison of dimensionality", "authors": "L\u00ea-Nguy\u00ean Hoang", "abstract": "This paper advances the understanding of how the size of a machine learning model affects its vulnerability to poisoning, despite state-of-the-art defenses. Given isotropic random honest feature vectors and the geometric median (or clipped mean) as the robust gradient aggregator rule, we essentially prove that, perhaps surprisingly, linear and logistic regressions with $D \\geq 169 H^2/P^2$ parameters are subject to arbitrary model manipulation by poisoners, where $H$ and $P$ are the numbers of honestly labeled and poisoned data points used for training. Our experiments go on exposing a fundamental tradeoff between augmenting model expressivity and increasing the poisoners' attack surface, on both synthetic data, and on MNIST & FashionMNIST data for linear classifiers with random features. We also discuss potential implications for source-based learning and neural nets.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.8886845111846924, -0.4770275056362152], "openalex_id": "https://openalex.org/W4402923149", "title": "Towards Building Multilingual Language Model for Medicine", "authors": "Pengcheng Qiu, Chaoyi Wu, Xiaoman Zhang, Weixiong Lin, Haicheng Wang, Ya Zhang, Yanfeng Wang, Weidi Xie", "abstract": "Abstract The development of open-source, multilingual medical language models can benefit a wide, linguistically diverse audience from different regions. To promote this domain, we present contributions from the following: First, we construct a multilingual medical corpus, containing approximately 25.5B tokens encompassing 6 main languages, termed as MMedC, enabling auto-regressive domain adaptation for general LLMs; Second, to monitor the development of multilingual medical LLMs, we propose a multilingual medical multi-choice question-answering benchmark with rationale, termed as MMedBench; Third, we have assessed a number of open-source large language models (LLMs) on our benchmark, along with those further auto-regressive trained on MMedC. Our final model, MMed-Llama 3, with only 8B parameters, achieves superior performance compared to all other open-source models on both MMedBench and English benchmarks, even rivaling GPT-4. In conclusion, in this work, We present a large-scale corpus, a benchmark and a series of models to support the development of multilingual medical LLMs.", "venue": "Nature Communications", "label": 0}, {"loc": [2.053964853286743, 5.325483798980713], "openalex_id": "https://openalex.org/W4402925642", "title": "Enhancing Phishing Detection, Leveraging Deep Learning Techniques", "authors": "Phyo Htet Kyaw, Jairo Guti\u00e9rrez, Akbar Ghobakhlou", "abstract": "The landscape of phishing email threats is continually evolving nowadays, making it challenging to combat effectively with traditional methods even with carrier-grade spam filters. Traditional detection mechanisms such as blacklisting, whitelisting, signature-based, and rule-based techniques could not effectively prevent phishing, spear-phishing, and zero-day attacks, as cybercriminals are using sophisticated techniques and trusted email service providers. Consequently, many researchers have recently concentrated on leveraging machine learning (ML) and deep learning (DL) approaches to enhance phishing email detection capabilities with better accuracy. To gain insights into the development of deep learning algorithms in the current research on phishing prevention, this study conducts a systematic literature review (SLR) following the Preferred Reporting Items for Systematic Reviews and Meta-Analyses (PRISMA) guidelines. By synthesizing the 33 selected papers using the SLR approach, this study presents a taxonomy of DL-based phishing detection methods, analyzing their effectiveness, limitations, and future research directions to address current challenges. The study reveals that the adaptability of detection models to new behaviors of phishing emails is the major improvement area. This study aims to add details about deep learning used for security to the body of knowledge, and it discusses future research in phishing detection systems.", "venue": "Electronics", "label": 19}, {"loc": [4.160711288452148, 3.5787277221679688], "openalex_id": "https://openalex.org/W4403794818", "title": "BeanCounter: A low-toxicity, large-scale, and open dataset of business-oriented text", "authors": "Siyan Wang, Bernard C. Levy", "abstract": "Many of the recent breakthroughs in language modeling have resulted from scaling effectively the same model architecture to larger datasets. In this vein, recent work has highlighted performance gains from increasing training dataset size and quality, suggesting a need for novel sources of large-scale datasets. In this work, we introduce BeanCounter, a public dataset consisting of more than 159B tokens extracted from businesses' disclosures. We show that this data is indeed novel: less than 0.1% of BeanCounter appears in Common Crawl-based datasets and it is an order of magnitude larger than datasets relying on similar sources. Given the data's provenance, we hypothesize that BeanCounter is comparatively more factual and less toxic than web-based datasets. Exploring this hypothesis, we find that many demographic identities occur with similar prevalence in BeanCounter but with significantly less toxic context relative to other datasets. To demonstrate the utility of BeanCounter, we evaluate and compare two LLMs continually pre-trained on BeanCounter with their base models. We find an 18-33% reduction in toxic generation and improved performance within the finance domain for the continually pretrained models. Collectively, our work suggests that BeanCounter is a novel source of low-toxicity and high-quality domain-specific data with sufficient scale to train multi-billion parameter LLMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.6481590270996094, 2.729560613632202], "openalex_id": "https://openalex.org/W4403795472", "title": "Data-Centric AI Governance: Addressing the Limitations of Model-Focused Policies", "authors": "Ritwik Gupta, Leah Walker, Rodolfo Corona, Stephanie Fu, Suzanne Petryk, Janet Napolitano, Trevor Darrell, Andrew W. Reddie", "abstract": "Current regulations on powerful AI capabilities are narrowly focused on \"foundation\" or \"frontier\" models. However, these terms are vague and inconsistently defined, leading to an unstable foundation for governance efforts. Critically, policy debates often fail to consider the data used with these models, despite the clear link between data and model performance. Even (relatively) \"small\" models that fall outside the typical definitions of foundation and frontier models can achieve equivalent outcomes when exposed to sufficiently specific datasets. In this work, we illustrate the importance of considering dataset size and content as essential factors in assessing the risks posed by models both today and in the future. More broadly, we emphasize the risk posed by over-regulating reactively and provide a path towards careful, quantitative evaluation of capabilities that can lead to a simplified regulatory environment.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.810661315917969, 2.2508583068847656], "openalex_id": "https://openalex.org/W4403795804", "title": "Data Proportion Detection for Optimized Data Management for Large Language Models", "authors": "Hao Liang, Keshi Zhao, Yajie Yang, Bin Cui, Guosheng Dong, Zenan Zhou, Wentao Zhang", "abstract": "Large language models (LLMs) have demonstrated exceptional performance across a wide range of tasks and domains, with data preparation playing a critical role in achieving these results. Pre-training data typically combines information from multiple domains. To maximize performance when integrating data from various domains, determining the optimal data proportion is essential. However, state-of-the-art (SOTA) LLMs rarely disclose details about their pre-training data, making it difficult for researchers to identify ideal data proportions. In this paper, we introduce a new topic, \\textit{data proportion detection}, which enables the automatic estimation of pre-training data proportions by analyzing the generated outputs of LLMs. We provide rigorous theoretical proofs, practical algorithms, and preliminary experimental results for data proportion detection. Based on these findings, we offer valuable insights into the challenges and future directions for effective data proportion detection and data management.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.8929338455200195, 3.535949230194092], "openalex_id": "https://openalex.org/W4403808836", "title": "A Survey of Low-bit Large Language Models: Basics, Systems, and Algorithms", "authors": "Ruihao Gong, Yifu Ding, Zining Wang, Chengtao Lv, Xingyu Zheng, Jinyang Du, Haotong Qin, Jinyang Guo, Michele Magno, Xianglong Liu", "abstract": "Large language models (LLMs) have achieved remarkable advancements in natural language processing, showcasing exceptional performance across various tasks. However, the expensive memory and computational requirements present significant challenges for their practical deployment. Low-bit quantization has emerged as a critical approach to mitigate these challenges by reducing the bit-width of model parameters, activations, and gradients, thus decreasing memory usage and computational demands. This paper presents a comprehensive survey of low-bit quantization methods tailored for LLMs, covering the fundamental principles, system implementations, and algorithmic strategies. An overview of basic concepts and new data formats specific to low-bit LLMs is first introduced, followed by a review of frameworks and systems that facilitate low-bit LLMs across various hardware platforms. Then, we categorize and analyze techniques and toolkits for efficient low-bit training and inference of LLMs. Finally, we conclude with a discussion of future trends and potential advancements of low-bit LLMs. Our systematic overview from basic, system, and algorithm perspectives can offer valuable insights and guidelines for future works to enhance the efficiency and applicability of LLMs through low-bit quantization.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.839130401611328, 2.22192645072937], "openalex_id": "https://openalex.org/W4403785094", "title": "Programming Every Example: Lifting Pre-training Data Quality like Experts at Scale", "authors": "Fan Zhou, Zengzhi Wang, Qian Liu, Junlong Li, Pengfei Liu", "abstract": "Large language model pre-training has traditionally relied on human experts to craft heuristics for improving the corpora quality, resulting in numerous rules developed to date. However, these rules lack the flexibility to address the unique characteristics of individual example effectively. Meanwhile, applying tailored rules to every example is impractical for human experts. In this paper, we demonstrate that even small language models, with as few as 0.3B parameters, can exhibit substantial data refining capabilities comparable to those of human experts. We introduce Programming Every Example (ProX), a novel framework that treats data refinement as a programming task, enabling models to refine corpora by generating and executing fine-grained operations, such as string normalization, for each individual example at scale. Experimental results show that models pre-trained on ProX-curated data outperform either original data or data filtered by other selection methods by more than 2% across various downstream benchmarks. Its effectiveness spans various model sizes and pre-training corpora, including C4, RedPajama-V2, FineWeb, FineWeb-Edu, and DCLM. Furthermore, ProX exhibits significant potential in domain-specific continual pre-training: without domain specific design, models trained on OpenWebMath refined by ProX outperform human-crafted rule-based methods, improving average accuracy by 7.6% over Mistral-7B, with 14.6% for Llama-2-7B and 20.3% for CodeLlama-7B, all within 10B tokens to be comparable to models like Llemma-7B trained on 200B tokens. Further analysis highlights that ProX significantly saves training FLOPs, offering a promising path for efficient LLM pre-training. We are open-sourcing ProX with >500B corpus, models, and sharing all training and implementation details for reproducible research and future innovation. Code: https://github.com/GAIR-NLP/ProX", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.34310245513916, 0.6057950258255005], "openalex_id": "https://openalex.org/W4403784431", "title": "Enhancing Automatic Keyphrase Labelling with Text-to-Text Transfer Transformer (T5) Architecture: A Framework for Keyphrase Generation and Filtering", "authors": "Jorge Gab\u00edn, M. Eduardo Ares, Javier Parapar", "abstract": "Automatic keyphrase labelling stands for the ability of models to retrieve words or short phrases that adequately describe documents' content. Previous work has put much effort into exploring extractive techniques to address this task; however, these methods cannot produce keyphrases not found in the text. Given this limitation, keyphrase generation approaches have arisen lately. This paper presents a keyphrase generation model based on the Text-to-Text Transfer Transformer (T5) architecture. Having a document's title and abstract as input, we learn a T5 model to generate keyphrases which adequately define its content. We name this model docT5keywords. We not only perform the classic inference approach, where the output sequence is directly selected as the predicted values, but we also report results from a majority voting approach. In this approach, multiple sequences are generated, and the keyphrases are ranked based on their frequency of occurrence across these sequences. Along with this model, we present a novel keyphrase filtering technique based on the T5 architecture. We train a T5 model to learn whether a given keyphrase is relevant to a document. We devise two evaluation methodologies to prove our model's capability to filter inadequate keyphrases. First, we perform a binary evaluation where our model has to predict if a keyphrase is relevant for a given document. Second, we filter the predicted keyphrases by several AKG models and check if the evaluation scores are improved. Experimental results demonstrate that our keyphrase generation model significantly outperforms all the baselines, with gains exceeding 100\\% in some cases. The proposed filtering technique also achieves near-perfect accuracy in eliminating false positives across all datasets.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.969213008880615, 5.527090072631836], "openalex_id": "https://openalex.org/W4402876060", "title": "Fusion of Visual and Textual Data for Enhanced Semantic Representations", "authors": "Lyra Sterling, Kairos Vale, Ava Martinez", "abstract": "Generic text embeddings have demonstrated considerable success across a multitude of applications. However, these embeddings are typically derived by modeling the co-occurrence patterns within text-only corpora, which can limit their ability to generalize effectively across diverse contexts. In this study, we investigate methodologies that incorporate visual information into textual representations to overcome these limitations. Through extensive ablation studies, we introduce a novel and straightforward architecture named VisualText Fusion Network (VTFN). This architecture not only surpasses existing multimodal approaches on a range of well-established benchmark datasets but also achieves state-of-the-art performance on image-related textual datasets while utilizing significantly less training data. Our findings underscore the potential of integrating visual modalities to substantially enhance the robustness and applicability of text embeddings, paving the way for more nuanced and contextually rich semantic representations.", "venue": "Preprints.org", "label": 3}, {"loc": [8.649917602539062, 2.4560940265655518], "openalex_id": "https://openalex.org/W4403619024", "title": "Improving Pretraining Data Using Perplexity Correlations", "authors": "Tristan Thrush, Christopher Potts, Tatsunori Hashimoto", "abstract": "Quality pretraining data is often seen as the key to high-performance language models. However, progress in understanding pretraining data has been slow due to the costly pretraining runs required for data selection experiments. We present a framework that avoids these costs and selects high-quality pretraining data without any LLM training of our own. Our work is based on a simple observation: LLM losses on many pretraining texts are correlated with downstream benchmark performance, and selecting high-correlation documents is an effective pretraining data selection method. We build a new statistical framework for data selection centered around estimates of perplexity-benchmark correlations and perform data selection using a sample of 90 LLMs taken from the Open LLM Leaderboard on texts from tens of thousands of web domains. In controlled pretraining experiments at the 160M parameter scale on 8 benchmarks, our approach outperforms DSIR on every benchmark, while matching the best data selector found in DataComp-LM, a hand-engineered bigram classifier. We have now also updated this paper to include results from preregistered experiments with new pretraining data on an aggregation of 22 benchmarks up to the 1.4B scale, showing increasing improvements of our method over others with more scale. A pip package with full documentation can be found here: https://github.com/TristanThrush/perplexity-correlations.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.600792407989502, 3.4077563285827637], "openalex_id": "https://openalex.org/W4403786771", "title": "Small Language Models: Survey, Measurements, and Insights", "authors": "Zhichun Lu, Xiang Li, Daoping Cai, Runduan Yi, Fangming Liu, Xiwen Zhang, Nicholas D. Lane, Mengwei Xu", "abstract": "Small language models (SLMs), despite their widespread adoption in modern smart devices, have received significantly less academic attention compared to their large language model (LLM) counterparts, which are predominantly deployed in data centers and cloud environments. While researchers continue to improve the capabilities of LLMs in the pursuit of artificial general intelligence, SLM research aims to make machine intelligence more accessible, affordable, and efficient for everyday tasks. Focusing on transformer-based, decoder-only language models with 100M-5B parameters, we survey 70 state-of-the-art open-source SLMs, analyzing their technical innovations across three axes: architectures, training datasets, and training algorithms. In addition, we evaluate their capabilities in various domains, including commonsense reasoning, mathematics, in-context learning, and long context. To gain further insight into their on-device runtime costs, we benchmark their inference latency and memory footprints. Through in-depth analysis of our benchmarking data, we offer valuable insights to advance research in this field.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.832066297531128, 2.0493128299713135], "openalex_id": "https://openalex.org/W4403787068", "title": "Five questions and answers about artificial intelligence", "authors": "Alberto Prieto, Beatriz Rodr\u00edguez Prieto", "abstract": "Rapid advances in Artificial Intelligence (AI) are generating much controversy in society, often without scientific basis. As occurred the development of other emerging technologies, such as the introduction of electricity in the early 20th century, AI causes both fascination and fear. Following the advice of the philosopher R.W. Emerson's: advice the knowledge is the antidote to fear; this paper seeks to contribute to the dissemination of knowledge about AI. To this end, it reflects on the following questions: the origins of AI, its possible future evolution, its ability to show feelings, the associated threats and dangers, and the concept of AI singularity.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.548903942108154, -1.1497914791107178], "openalex_id": "https://openalex.org/W4403780350", "title": "Choose the Final Translation from NMT and LLM hypotheses Using MBR Decoding: HW-TSC's Submission to the WMT24 General MT Shared Task", "authors": "Zhanglin Wu, Daimeng Wei, Zongyao Li, Hengchao Shang, Jiaxin Guo, Shaojun Li, Zhiqiang Rao, Yuanchang Luo, Ning Xie, Yang Hao", "abstract": "This paper presents the submission of Huawei Translate Services Center (HW-TSC) to the WMT24 general machine translation (MT) shared task, where we participate in the English to Chinese (en2zh) language pair. Similar to previous years' work, we use training strategies such as regularized dropout, bidirectional training, data diversification, forward translation, back translation, alternated training, curriculum learning, and transductive ensemble learning to train the neural machine translation (NMT) model based on the deep Transformer-big architecture. The difference is that we also use continue pre-training, supervised fine-tuning, and contrastive preference optimization to train the large language model (LLM) based MT model. By using Minimum Bayesian risk (MBR) decoding to select the final translation from multiple hypotheses for NMT and LLM-based MT models, our submission receives competitive results in the final evaluation.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.0524001121520996, -0.7371870279312134], "openalex_id": "https://openalex.org/W4403780109", "title": "Medical Concept Normalization in a Low-Resource Setting", "authors": "Tim Patzelt", "abstract": "In the field of biomedical natural language processing, medical concept normalization is a crucial task for accurately mapping mentions of concepts to a large knowledge base. However, this task becomes even more challenging in low-resource settings, where limited data and resources are available. In this thesis, I explore the challenges of medical concept normalization in a low-resource setting. Specifically, I investigate the shortcomings of current medical concept normalization methods applied to German lay texts. Since there is no suitable dataset available, a dataset consisting of posts from a German medical online forum is annotated with concepts from the Unified Medical Language System. The experiments demonstrate that multilingual Transformer-based models are able to outperform string similarity methods. The use of contextual information to improve the normalization of lay mentions is also examined, but led to inferior results. Based on the results of the best performing model, I present a systematic error analysis and lay out potential improvements to mitigate frequent errors.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.936471462249756, 3.83579421043396], "openalex_id": "https://openalex.org/W4403755077", "title": "Exploring Scaling Laws for Local SGD in Large Language Model Training", "authors": "Qiaozhi He, X. Zhuang, Zhihua Wu", "abstract": "This paper investigates scaling laws for local SGD in LLM training, a distributed optimization algorithm that facilitates training on loosely connected devices. Through extensive experiments, we show that local SGD achieves competitive results compared to conventional methods, given equivalent model parameters, datasets, and computational resources. Furthermore, we explore the application of local SGD in various practical scenarios, including multi-cluster setups and edge computing environments. Our findings elucidate the necessary conditions for effective multi-cluster LLM training and examine the potential and limitations of leveraging edge computing resources in the LLM training process. This demonstrates its viability as an alternative to single large-cluster training.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.851651191711426, 5.524389266967773], "openalex_id": "https://openalex.org/W4403754926", "title": "Embedding Geometries of Contrastive Language-Image Pre-Training", "authors": "Jason Chou, N. M. Alam", "abstract": "Since the publication of CLIP, the approach of using InfoNCE loss for contrastive pre-training has become widely popular for bridging two or more modalities. Despite its wide adoption, CLIP's original design choices of L2 normalization and cosine similarity logit have rarely been revisited. We have systematically experimented with alternative geometries and softmax logits for language-image pre-training and identified that variants with intuitive Euclidean geometry, Euclidean CLIP (EuCLIP), match or exceed the performance of CLIP and support hierarchical relationships at least as well as more complicated hyperbolic alternative.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.604222774505615, 1.5816456079483032], "openalex_id": "https://openalex.org/W4402673253", "title": "Scaffolding learning: From specific to generic with large language models", "authors": "David S Yin, Xiaoxin Yin", "abstract": "Large language models such as ChatGPT have been shown to excel in solving complex math problems. However, they cannot solve basic arithmetic problems such as 758*639 = 484,362. This makes us ponder if LLMs have been trained to solve math and science problems in the right way. When a student learns math at school, she or he starts with arithmetic, then moves to word problems, polynomials, and calculus. Each skill she or he acquires will be used in the next stage to solve more advanced problems. In this paper we propose Scaffolding Learning for LLMs, which imitates how a student learns a subject in a step-by-step manner. For example, we first train an LLM to perform highly specific operations such as multiplication and division, and then apply such \u201cskills\u201d in a more generic task such as solving word problems. This is related to Curriculum Training, which trains a model on tasks following a specific order, such as training on easy tasks first and then gradually increases the difficulty. Our proposed approach goes from specific tasks to generic ones, which can be considered as a special case of Curriculum Training. Our empirical studies show that when an LLM has \u201cmastered\u201d a specific skill, only a small amount of training is required to teach it to apply the skill to a more generic application.", "venue": "PLoS ONE", "label": 11}, {"loc": [6.226341724395752, 5.118313312530518], "openalex_id": "https://openalex.org/W4403747445", "title": "InfiMM-WebMath-40B: Advancing Multimodal Pre-Training for Enhanced Mathematical Reasoning", "authors": "Xiaotian Han, Yiren Jian, Xuefeng Hu, Haogeng Liu, Yiqi Wang, Qihang Fan, Yuang Ai, Huaibo Huang, Ran He, Zhenheng Yang, Quanzeng You", "abstract": "Pre-training on large-scale, high-quality datasets is crucial for enhancing the reasoning capabilities of Large Language Models (LLMs), especially in specialized domains such as mathematics. Despite the recognized importance, the Multimodal LLMs (MLLMs) field currently lacks a comprehensive open-source pre-training dataset specifically designed for mathematical reasoning. To address this gap, we introduce InfiMM-WebMath-40B, a high-quality dataset of interleaved image-text documents. It comprises 24 million web pages, 85 million associated image URLs, and 40 billion text tokens, all meticulously extracted and filtered from CommonCrawl. We provide a detailed overview of our data collection and processing pipeline. To demonstrate the robustness of InfiMM-WebMath-40B, we conducted evaluations in both text-only and multimodal settings. Our evaluations on text-only benchmarks show that, despite utilizing only 40 billion tokens, our dataset significantly enhances the performance of our 1.3B model, delivering results comparable to DeepSeekMath-1.3B, which uses 120 billion tokens for the same model size. Nevertheless, with the introduction of our multi-modal math pre-training dataset, our models set a new state-of-the-art among open-source models on multi-modal math benchmarks such as MathVerse and We-Math. We release our data at https://huggingface.co/datasets/Infi-MM/InfiMM-WebMath-40B.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.203893661499023, -1.160570502281189], "openalex_id": "https://openalex.org/W4403785564", "title": "CamelEval: Advancing Culturally Aligned Arabic Language Models and Benchmarks", "authors": "Zhaozhi Qian, Faroq Altam, Muhammad Al\u2010Qurishi, Riad Souissi", "abstract": "Large Language Models (LLMs) are the cornerstones of modern artificial intelligence systems. This paper introduces Juhaina, a Arabic-English bilingual LLM specifically designed to align with the values and preferences of Arabic speakers. Juhaina inherently supports advanced functionalities such as instruction following, open-ended question answering, information provisioning, and text processing. Our model contains 9.24 billion parameters and is trained on a context window of up to 8,192 tokens. This paper details the creation process of Juhaina and provides an extensive empirical evaluation. Furthermore, we identify the limitations of widely-adopted Open Arabic LLM Leaderboard (OALL) and propose a new evaluation benchmark, CamelEval. Our findings demonstrate that Juhaina surpasses existing LLMs of comparable sizes, such as the Llama and Gemma families, in generating helpful responses in Arabic, providing factually accurate information about the region, and understanding nuanced cultural aspects. We aspire for Juhaina to democratize cutting-edge AI technologies, serving over 400 million Arabic speakers by offering LLMs that not only communicate in their language but also comprehend their culture. We publicly release all models on Huggingface \\url{https://huggingface.co/elmrc}.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.481281757354736, 4.521080017089844], "openalex_id": "https://openalex.org/W4403928565", "title": "The Impact of Element Ordering on LM Agent Performance", "authors": "Wayne Chi, Ameet Talwalkar, Chris Donahue", "abstract": "There has been a surge of interest in language model agents that can navigate virtual environments such as the web or desktop. To navigate such environments, agents benefit from information on the various elements (e.g., buttons, text, or images) present. It remains unclear which element attributes have the greatest impact on agent performance, especially in environments that only provide a graphical representation (i.e., pixels). Here we find that the ordering in which elements are presented to the language model is surprisingly impactful--randomizing element ordering in a webpage degrades agent performance comparably to removing all visible text from an agent's state representation. While a webpage provides a hierarchical ordering of elements, there is no such ordering when parsing elements directly from pixels. Moreover, as tasks become more challenging and models more sophisticated, our experiments suggest that the impact of ordering increases. Finding an effective ordering is non-trivial. We investigate the impact of various element ordering methods in web and desktop environments. We find that dimensionality reduction provides a viable ordering for pixel-only environments. We train a UI element detection model to derive elements from pixels and apply our findings to an agent benchmark--OmniACT--where we only have access to pixels. Our method completes more than two times as many tasks on average relative to the previous state-of-the-art.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.575156211853027, -1.166418194770813], "openalex_id": "https://openalex.org/W4402652117", "title": "Machine learning and natural language processing to assess the emotional impact of influencers' mental health content on Instagram", "authors": "Noem\u00ed Merayo, Alba Ayuso Lanchares, Clara Gonz\u00e1lez\u2010Sanguino", "abstract": "Background This study aims to examine, through artificial intelligence, specifically machine learning, the emotional impact generated by disclosures about mental health on social media. In contrast to previous research, which primarily focused on identifying psychopathologies, our study investigates the emotional response to mental health-related content on Instagram, particularly content created by influencers/celebrities. This platform, especially favored by the youth, is the stage where these influencers exert significant social impact, and where their analysis holds strong relevance. Analyzing mental health with machine learning techniques on Instagram is unprecedented, as all existing research has primarily focused on Twitter. Methods This research involves creating a new corpus labelled with responses to mental health posts made by influencers/celebrities on Instagram, categorized by emotions such as love/admiration, anger/contempt/mockery, gratitude, identification/empathy, and sadness. The study is complemented by modelling a set of machine learning algorithms to efficiently detect the emotions arising when faced with these mental health disclosures on Instagram, using the previous corpus. Results Results have shown that machine learning algorithms can effectively detect such emotional responses. Traditional techniques, such as Random Forest, showed decent performance with low computational loads (around 50%), while deep learning and Bidirectional Encoder Representation from Transformers (BERT) algorithms achieved very good results. In particular, the BERT models reached accuracy levels between 86\u201390%, and the deep learning model achieved 72% accuracy. These results are satisfactory, considering that predicting emotions, especially in social networks, is challenging due to factors such as the subjectivity of emotion interpretation, the variability of emotions between individuals, and the interpretation of emotions in different cultures and communities. Discussion This cross-cutting research between mental health and artificial intelligence allows us to understand the emotional impact generated by mental health content on social networks, especially content generated by influential celebrities among young people. The application of machine learning allows us to understand the emotional reactions of society to messages related to mental health, which is highly innovative and socially relevant given the importance of the phenomenon in societies. In fact, the proposed algorithms\u2019 high accuracy (86\u201390%) in social contexts like mental health, where detecting negative emotions is crucial, presents a promising research avenue. Achieving such levels of accuracy is highly valuable due to the significant implications of false positives or false negatives in this social context.", "venue": "PeerJ Computer Science", "label": 4}, {"loc": [7.58677864074707, 2.3288190364837646], "openalex_id": "https://openalex.org/W4403752108", "title": "A Controlled Study on Long Context Extension and Generalization in LLMs", "authors": "Yi L\u00fc, Jing Nathan Yan, Songlin Yang, Justin Chiu, Siyu Ren, Fei Yuan, Wenting Zhao, Zhiyong Wu, Alexander M. Rush", "abstract": "Broad textual understanding and in-context learning require language models that utilize full document contexts. Due to the implementation challenges associated with directly training long-context models, many methods have been proposed for extending models to handle long contexts. However, owing to differences in data and model classes, it has been challenging to compare these approaches, leading to uncertainty as to how to evaluate long-context performance and whether it differs from standard evaluation. We implement a controlled protocol for extension methods with a standardized evaluation, utilizing consistent base models and extension data. Our study yields several insights into long-context behavior. First, we reaffirm the critical role of perplexity as a general-purpose performance indicator even in longer-context tasks. Second, we find that current approximate attention methods systematically underperform across long-context tasks. Finally, we confirm that exact fine-tuning based methods are generally effective within the range of their extension, whereas extrapolation remains challenging. All codebases, models, and checkpoints will be made available open-source, promoting transparency and facilitating further research in this critical area of AI development.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.687953948974609, 0.39797431230545044], "openalex_id": "https://openalex.org/W4402649144", "title": "Empowering geoportals HCI with task-oriented chatbots through NLP and deep transfer learning", "authors": "Mohammad H. Vahidnia", "abstract": "In the past ten years, chatbot development has matured to become one of the most well-distinguished outcomes of artificial intelligence. Despite some criticism, Bing AI, ChatGPT and other natural language processing (NLP) products of similar nature are becoming more popular. The creation of chatbots can close several gaps in geographic information retrieval as well. This research introduces and successfully implements, for the first time, a model for integrating task-oriented chatbots into geoportals, with the goal of easing user requests, improving access to geospatial services, and fostering human-computer interactions (HCI). Additionally, it presents a novel recommendation solution for matching the most appropriate volunteer to the user\u2019s geospatial needs based on expertise similarity, semantic similarity, and community feedback. The three categories of finding map services, discovering geoprocessing services, and volunteer expert recommendations were shown to be the most significant geoportal bot intents. Depending on the requirement, each intent additionally includes various entities such as time, place, description, skill, etc. The notion of deep transfer learning (DTL) was then put into practice by customizing a pre-trained BERT (Bidirectional Encoder Representations from Transformers) model for our particular aim and creating a task-oriented conversational agent. According to the results, effective intent classification and entity recognition in the geospatial domain could arise from this approach. We performed the training process with 200 sample data, 20% of which were utilized in a stratified manner for testing, and we obtained f1-scores of at least 0.75. Finally, a pilot Geoportal Chabot that combines crowdsourcing and conversational agents\u2019 approaches was put into use and tested with success. In keeping with SDI technical purposes, the system might direct users to common geospatial web services, namely WMS and WPS, in addition to including natural language understanding (NLU) and natural language generation (NLG) capabilities. Result of user-centered evaluation indicated that the integration of a chatbot significantly reduces the average time required to access geospatial data and processing services by more than 50%. Notably, this effect is even more pronounced when locating an expert, with a fivefold decrease in the time required. Finally, overall user satisfaction rose from 86% to 94%.", "venue": "Big Earth Data", "label": 0}, {"loc": [6.371293067932129, 2.6556107997894287], "openalex_id": "https://openalex.org/W4403713466", "title": "RoMath: A Mathematical Reasoning Benchmark in Romanian", "authors": "Adrian Cosma, Ana-Maria Bucur, Emilian R\u0103doi", "abstract": "Mathematics has long been conveyed through natural language, primarily for human understanding. With the rise of mechanized mathematics and proof assistants, there is a growing need to understand informal mathematical text, yet most existing benchmarks focus solely on English, overlooking other languages. This paper introduces RoMath, a Romanian mathematical reasoning benchmark suite comprising three subsets: Baccalaureate, Competitions and Synthetic, which cover a range of mathematical domains and difficulty levels, aiming to improve non-English language models and promote multilingual AI development. By focusing on Romanian, a low-resource language with unique linguistic features, RoMath addresses the limitations of Anglo-centric models and emphasizes the need for dedicated resources beyond simple automatic translation. We benchmark several open-weight language models, highlighting the importance of creating resources for underrepresented languages. Code and datasets are be made available.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.205912113189697, -0.27846869826316833], "openalex_id": "https://openalex.org/W4403706132", "title": "Cross-lingual transfer of multilingual models on low resource African Languages", "authors": "Harish Thangaraj, Ananya Chenat, Jaskaran Singh Walia, Vukosi Marivate", "abstract": "Large multilingual models have significantly advanced natural language processing (NLP) research. However, their high resource demands and potential biases from diverse data sources have raised concerns about their effectiveness across low-resource languages. In contrast, monolingual models, trained on a single language, may better capture the nuances of the target language, potentially providing more accurate results. This study benchmarks the cross-lingual transfer capabilities from a high-resource language to a low-resource language for both, monolingual and multilingual models, focusing on Kinyarwanda and Kirundi, two Bantu languages. We evaluate the performance of transformer based architectures like Multilingual BERT (mBERT), AfriBERT, and BantuBERTa against neural-based architectures such as BiGRU, CNN, and char-CNN. The models were trained on Kinyarwanda and tested on Kirundi, with fine-tuning applied to assess the extent of performance improvement and catastrophic forgetting. AfriBERT achieved the highest cross-lingual accuracy of 88.3% after fine-tuning, while BiGRU emerged as the best-performing neural model with 83.3% accuracy. We also analyze the degree of forgetting in the original language post-fine-tuning. While monolingual models remain competitive, this study highlights that multilingual models offer strong cross-lingual transfer capabilities in resource limited settings.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.54805850982666, 0.34733933210372925], "openalex_id": "https://openalex.org/W4403707125", "title": "Measuring and Enhancing Trustworthiness of LLMs in RAG through Grounded Attributions and Learning to Refuse", "authors": "Michael Song, Shang Hong Sim, Rishabh Bhardwaj, Hai Leong Chieu, Navonil Majumder, Soujanya Poria", "abstract": "LLMs are an integral component of retrieval-augmented generation (RAG) systems. While many studies focus on evaluating the overall quality of end-to-end RAG systems, there is a gap in understanding the appropriateness of LLMs for the RAG task. To address this, we introduce Trust-Score, a holistic metric that evaluates the trustworthiness of LLMs within the RAG framework. Our results show that various prompting methods, such as in-context learning, fail to effectively adapt LLMs to the RAG task as measured by Trust-Score. Consequently, we propose Trust-Align, a method to align LLMs for improved Trust-Score performance. 26 out of 27 models aligned using Trust-Align substantially outperform competitive baselines on ASQA, QAMPARI, and ELI5. Specifically, in LLaMA-3-8b, Trust-Align outperforms FRONT on ASQA (up 12.56), QAMPARI (up 36.04), and ELI5 (up 17.69). Trust-Align also significantly enhances models' ability to correctly refuse and provide quality citations. We also demonstrate the effectiveness of Trust-Align across different open-weight models, including the LLaMA series (1b to 8b), Qwen-2.5 series (0.5b to 7b), and Phi3.5 (3.8b). We release our code at https://github.com/declare-lab/trust-align.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.995352745056152, 3.454970121383667], "openalex_id": "https://openalex.org/W4403668311", "title": "Automatic Control With Human-Like Reasoning", "authors": "Justas Andriu\u0161kevi\u010dius, Junzi Sun", "abstract": "Recent developments in language models have created new opportunities in air traffic control studies. The current focus is primarily on text and language-based use cases. However, these language models may offer a higher potential impact in the air traffic control domain, thanks to their ability to interact with air traffic environments in an embodied agent form. They also provide a language-like reasoning capability to explain their decisions, which has been a significant roadblock for the implementation of automatic air traffic control. This paper investigates the application of a language model-based agent with function-calling and learning capabilities to resolve air traffic conflicts without human intervention. The main components of this research are foundational large language models, tools that allow the agent to interact with the simulator, and a new concept, the experience library. An innovative part of this research, the experience library, is a vector database that stores synthesized knowledge that agents have learned from interactions with the simulations and language models. To evaluate the performance of our language model-based agent, both open-source and closed-source models were tested. The results of our study reveal significant differences in performance across various configurations of the language model-based agents. The best-performing configuration was able to solve almost all 120 but one imminent conflict scenarios, including up to four aircraft at the same time. Most importantly, the agents are able to provide human-level text explanations on traffic situations and conflict resolution strategies.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.59013557434082, 2.3405096530914307], "openalex_id": "https://openalex.org/W4403668254", "title": "Unveiling Gender Bias in Large Language Models: Using Teacher's Evaluation in Higher Education As an Example", "authors": "Yuanning Huang", "abstract": "This paper investigates gender bias in Large Language Model (LLM)-generated teacher evaluations in higher education setting, focusing on evaluations produced by GPT-4 across six academic subjects. By applying a comprehensive analytical framework that includes Odds Ratio (OR) analysis, Word Embedding Association Test (WEAT), sentiment analysis, and contextual analysis, this paper identified patterns of gender-associated language reflecting societal stereotypes. Specifically, words related to approachability and support were used more frequently for female instructors, while words related to entertainment were predominantly used for male instructors, aligning with the concepts of communal and agentic behaviors. The study also found moderate to strong associations between male salient adjectives and male names, though career and family words did not distinctly capture gender biases. These findings align with prior research on societal norms and stereotypes, reinforcing the notion that LLM-generated text reflects existing biases.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.07180118560791, 3.522463798522949], "openalex_id": "https://openalex.org/W4403668311", "title": "Automatic Control With Human-Like Reasoning: Exploring Language Model Embodied Air Traffic Agents", "authors": "Justas Andriu\u0161kevi\u010dius, Junzi Sun", "abstract": "Recent developments in language models have created new opportunities in air traffic control studies. The current focus is primarily on text and language-based use cases. However, these language models may offer a higher potential impact in the air traffic control domain, thanks to their ability to interact with air traffic environments in an embodied agent form. They also provide a language-like reasoning capability to explain their decisions, which has been a significant roadblock for the implementation of automatic air traffic control. This paper investigates the application of a language model-based agent with function-calling and learning capabilities to resolve air traffic conflicts without human intervention. The main components of this research are foundational large language models, tools that allow the agent to interact with the simulator, and a new concept, the experience library. An innovative part of this research, the experience library, is a vector database that stores synthesized knowledge that agents have learned from interactions with the simulations and language models. To evaluate the performance of our language model-based agent, both open-source and closed-source models were tested. The results of our study reveal significant differences in performance across various configurations of the language model-based agents. The best-performing configuration was able to solve almost all 120 but one imminent conflict scenarios, including up to four aircraft at the same time. Most importantly, the agents are able to provide human-level text explanations on traffic situations and conflict resolution strategies.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.757063388824463, 5.138370513916016], "openalex_id": "https://openalex.org/W4403702853", "title": "XLM for Autonomous Driving Systems: A Comprehensive Review", "authors": "Sonda Fourati, Wael Jaafar, Noura Baccar, Safwan Alfattani", "abstract": "Large Language Models (LLMs) have showcased remarkable proficiency in various information-processing tasks. These tasks span from extracting data and summarizing literature to generating content, predictive modeling, decision-making, and system controls. Moreover, Vision Large Models (VLMs) and Multimodal LLMs (MLLMs), which represent the next generation of language models, a.k.a., XLMs, can combine and integrate many data modalities with the strength of language understanding, thus advancing several information-based systems, such as Autonomous Driving Systems (ADS). Indeed, by combining language communication with multimodal sensory inputs, e.g., panoramic images and LiDAR or radar data, accurate driving actions can be taken. In this context, we provide in this survey paper a comprehensive overview of the potential of XLMs towards achieving autonomous driving. Specifically, we review the relevant literature on ADS and XLMs, including their architectures, tools, and frameworks. Then, we detail the proposed approaches to deploy XLMs for autonomous driving solutions. Finally, we provide the related challenges to XLM deployment for ADS and point to future research directions aiming to enable XLM adoption in future ADS frameworks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.0814809799194336, 5.29608154296875], "openalex_id": "https://openalex.org/W4402568227", "title": "Across the Spectrum In-Depth Review AI-Based Models for Phishing Detection", "authors": "Shakeel Ahmad, Muhammad Zaman, Ahmad Sami Al\u2010Shamayleh, Tanzila Kehkashan, Rajan Ahmad, Shafi\u2019i Muhammad Abdulhamid, \u0130smail Ergen, Adnan Akhunzada", "abstract": "

Advancement of the Internet has increased security risks associated with data protection and online shopping. Several techniques compromise Internet security, including hacking, SQL injection, phishing attacks, and DNS tunneling. Phishing attacks are particularly significant among web phishing techniques. In a phishing attack, the attacker creates a fake website that closely resembles a legitimate one to deceive users into providing sensitive information. These attacks can be detected using both traditional and modern AI-based models. However, even with state-of-the-art methods, accurately classifying newly emerged links as phishing or legitimate remains a challenge. This study conducts a comparative analysis of more than 130 articles published between 2020 and 2024, identifying challenges and gaps in the literature and comparing the findings of various authors. The novelty of this research lies in providing a roadmap for researchers, practitioners, and cybersecurity experts to navigate the landscape of machine learning (ML) and deep learning (DL) models for phishing detection. The study reviews traditional phishing detection methods, ML and DL models, phishing datasets, and the step-by-step phishing process. It highlights limitations, research gaps, weaknesses, and potential improvements. Accuracy measures are used to compare model performance. In conclusion, this research provides a comprehensive survey of website phishing detection using AI models, offering a new roadmap for future studies

Other Information

Published in: IEEE Open Journal of the Communications Society
License: http://creativecommons.org/licenses/by/4.0/
See article on publisher's website: https://dx.doi.org/10.1109/ojcoms.2024.3462503", "venue": "IEEE Open Journal of the Communications Society", "label": 0}, {"loc": [2.2093517780303955, 5.18010950088501], "openalex_id": "https://openalex.org/W4403666679", "title": "DomURLs_BERT: Pre-trained BERT-based Model for Malicious Domains and URLs Detection and Classification", "authors": "Abdelkader El Mahdaouy, Salima Lamsiyah, Meryem Janati Idrissi, Hamza Alami, Zakaria Yartaoui, Isma\u00efl Berrada", "abstract": "Detecting and classifying suspicious or malicious domain names and URLs is fundamental task in cybersecurity. To leverage such indicators of compromise, cybersecurity vendors and practitioners often maintain and update blacklists of known malicious domains and URLs. However, blacklists frequently fail to identify emerging and obfuscated threats. Over the past few decades, there has been significant interest in developing machine learning models that automatically detect malicious domains and URLs, addressing the limitations of blacklists maintenance and updates. In this paper, we introduce DomURLs_BERT, a pre-trained BERT-based encoder adapted for detecting and classifying suspicious/malicious domains and URLs. DomURLs_BERT is pre-trained using the Masked Language Modeling (MLM) objective on a large multilingual corpus of URLs, domain names, and Domain Generation Algorithms (DGA) dataset. In order to assess the performance of DomURLs_BERT, we have conducted experiments on several binary and multi-class classification tasks involving domain names and URLs, covering phishing, malware, DGA, and DNS tunneling. The evaluations results show that the proposed encoder outperforms state-of-the-art character-based deep learning models and cybersecurity-focused BERT models across multiple tasks and datasets. The pre-training dataset, the pre-trained DomURLs_BERT encoder, and the experiments source code are publicly available.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.833021402359009, -0.4814954400062561], "openalex_id": "https://openalex.org/W4403666337", "title": "Eir: Thai Medical Large Language Models", "authors": "Yutthakorn Thiprak, Rungtam Ngodngamthaweesuk, Songtam Ngodngamtaweesuk", "abstract": "We present Eir-8B, a large language model with 8 billion parameters, specifically designed to enhance the accuracy of handling medical tasks in the Thai language. This model focuses on providing clear and easy-to-understand answers for both healthcare professionals and patients, thereby improving the efficiency of diagnosis and treatment processes. Human evaluation was conducted to ensure that the model adheres to care standards and provides unbiased answers. To prioritize data security, the model is deployed within the hospital's internal network, ensuring both high security and faster processing speeds. The internal API connection is secured with encryption and strict authentication measures to prevent data leaks and unauthorized access. We evaluated several open-source large language models with 8 billion parameters on four medical benchmarks: MedQA, MedMCQA, PubMedQA, and the medical subset of MMLU. The best-performing baselines were used to develop Eir-8B. Our evaluation employed multiple questioning strategies, including zero-shot, few-shot, chain-of-thought reasoning, and ensemble/self-consistency voting methods. Our model outperformed commercially available Thai-language large language models by more than 10%. In addition, we developed enhanced model testing tailored for clinical use in Thai across 18 clinical tasks, where our model exceeded GPT-4o performance by more than 11%.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.5733611583709717, 1.5869683027267456], "openalex_id": "https://openalex.org/W4402659119", "title": "ACCEPTANCE OF GENERATIVE AI IN KNOWLEDGE WORK", "authors": "J\u00e9r\u00e9mie Clos, Yoke Yie Chen", "abstract": "Generative artificial intelligence (AI) has become one of the main concerns of knowledge workers due to its ability to mimic realistic human reasoning and creativity. However, this integration raises critical concerns about trust and ethics, which are crucial in shaping both the acceptance and effective utilisation of these technologies. There are many reports, articles and papers currently exploring the opportunities and challenges of LLMs in higher education from the perspective of students and educators. However, these papers often focus on specific contexts like in the UK, US or a particular institutions. In this paper, we examine the problems of generative AI in higher education from educator and student perspectives using scientometrics and text analysis to provide an overview of the research landscape, followed by a narrative review and thematic analysis of selected literature. Some findings of this work are: (1) Students and educators found different ways to use generative AI. Students focus more on using it as an assistant (revising and preparing for lectures, helping with homework) and educators as a content production assistant (writing lecture notes, personalising content). Commonalities are that both students and educators use generative AI as an accessibility aid, e.g., to rephrase sentences or explain concepts. (2) The main concerns of higher education regarding generative AI are equity in access, clarity of rules regarding usage, and job displacement.", "venue": "https://doi.org/10.1145/3686038.3686063", "label": 0}, {"loc": [3.2086822986602783, 1.541115164756775], "openalex_id": "https://openalex.org/W4402553424", "title": "Explainable Generative AI (GenXAI): A Survey, Conceptualization, and Research Agenda", "authors": "Johannes Schneider", "abstract": "Abstract Generative AI (GenAI) represents a shift from AI\u2019s ability to \u201crecognize\u201d to its ability to \u201cgenerate\u201d solutions for a wide range of tasks. As generated solutions and applications grow more complex and multi-faceted, new needs, objectives, and possibilities for explainability (XAI) have emerged. This work elaborates on why XAI has gained importance with the rise of GenAI and the challenges it poses for explainability research. We also highlight new and emerging criteria that explanations should meet, such as verifiability, interactivity, security, and cost considerations. To achieve this, we focus on surveying existing literature. Additionally, we provide a taxonomy of relevant dimensions to better characterize existing XAI mechanisms and methods for GenAI. We explore various approaches to ensure XAI, ranging from training data to prompting. Our paper provides a concise technical background of GenAI for non-technical readers, focusing on text and images to help them understand new or adapted XAI techniques for GenAI. However, due to the extensive body of work on GenAI, we chose not to delve into detailed aspects of XAI related to the evaluation and usage of explanations. Consequently, the manuscript appeals to both technical experts and professionals from other fields, such as social scientists and information systems researchers. Our research roadmap outlines over ten directions for future investigation.", "venue": "Artificial Intelligence Review", "label": 18}, {"loc": [3.2801756858825684, 3.6887309551239014], "openalex_id": "https://openalex.org/W4402507908", "title": "Enhancing Cyber Security Enhancement Through Generative AI", "authors": "Siva Raja Sindiramutty, Krishna Raj V. Prabagaran, N. Z. Jhanjhi, Raja Kumar Murugesan, Sarfraz Nawaz Brohi, Mehedi Masud", "abstract": "Protecting virtual assets from cyber threats is essential as we live in a digitally advanced world. Providing a responsible emphasis on proper network security and intrusion detection is imperative. On the other hand, traditional strategies need a supportive tool to adapt to the transforming threat space. New generative AI techniques like generative adversarial networks (GANs) and variational autoencoders (VAEs) are the mainstream technologies required to meet the gap. This chapter deals with how these models can enhance network security by inspecting the network traffic for anomalies and malicious behaviors detected through unsupervised learning, which considers strange or emerging phenomena. This survey features innovations in fault detection, behavior control, deep packet inspection, traffic classification, and examples of real-world intrusions detected by GAN-based systems. Furthermore, the chapter focuses on the challenges of adversarial attacks on models that require the development of solid defense mechanisms, such as generative adversarial networks. Ethics becomes the following matter on our list of discussions, given that privacy transparency and accountability are to be observed when working with generative AI technologies in network security. Finally, the authors examine trends that determine how cyber-attacks are dealt with comprehensively.", "venue": "Advances in information security, privacy, and ethics book series", "label": 0}, {"loc": [8.398508071899414, 0.8620520830154419], "openalex_id": "https://openalex.org/W4403853630", "title": "Synthetic continued pretraining", "authors": "Zitong Yang, Neil Band, Shuangning Li, Emmanuel Cand\u00e8s, Tatsunori Hashimoto", "abstract": "Pretraining on large-scale, unstructured internet text enables language models to acquire a significant amount of world knowledge. However, this knowledge acquisition is data-inefficient--to learn a given fact, models must be trained on hundreds to thousands of diverse representations of it. This poses a challenge when adapting a pretrained model to a small corpus of domain-specific documents, where each fact may appear rarely or only once. We propose to bridge this gap with synthetic continued pretraining: using the small domain-specific corpus to synthesize a large corpus more amenable to learning, and then performing continued pretraining on the synthesized corpus. We instantiate this proposal with EntiGraph, a synthetic data augmentation algorithm that extracts salient entities from the source documents and then generates diverse text by drawing connections between the sampled entities. Synthetic continued pretraining with EntiGraph enables a language model to answer questions and follow generic instructions related to the source documents without access to them. If, instead, the source documents are available at inference time, we show that the knowledge acquired through our approach compounds with retrieval-augmented generation. To better understand these results, we build a simple mathematical model of EntiGraph, and show how synthetic data augmentation can \"rearrange\" knowledge to enable more data-efficient learning.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.1580810546875, 0.975069522857666], "openalex_id": "https://openalex.org/W4402487814", "title": "Documenting Geographically and Contextually Diverse Language Data Sources", "authors": "Angelina McMillan-Major, Francesco De Toni, Zaid Alyafeai, Stella Biderman, Kimbo Chen, G\u00e9rard Dupont, Hady Elsahar, Chris Chinenye Emezue, Alham Fikri Aji, Suzana Ili\u0107, Nurulaqilla Khamis, Colin Leong, Maraim Masoud, Aitor Soroa, Pedro Ortiz Suarez, Daniel van Strien, Zeerak Talat, Yacine Jernite", "abstract": "Contemporary large-scale data collection efforts have prioritized the amount of data collected to improve large language models (LLM). This quantitative approach has resulted in concerns for the rights of data subjects represented in data collections. This concern is exacerbated by a lack of documentation and analysis tools, making it difficult to interrogate these collections. Mindful of these pitfalls, we present a methodology for documentation-first, human-centered data collection. We apply this approach in an effort to train a multilingual LLM. We identify a geographically diverse set of target language groups (Arabic varieties, Basque, Chinese varieties, Catalan, English, French, Indic languages, Indonesian, Niger-Congo languages, Portuguese, Spanish, and Vietnamese, as well as programming languages) for which to collect metadata on potential data sources. We structure this effort by developing an online catalogue in English as a tool for gathering metadata through public hackathons. We present our tool and analyses of the resulting resource metadata, including distributions over languages, regions, and resource types, and discuss our lessons learned.", "venue": "Northern European Journal of Language Technology", "label": 0}, {"loc": [4.279057502746582, 2.104196071624756], "openalex_id": "https://openalex.org/W4403592812", "title": "Identifying the sources of ideological bias in GPT models through linguistic variation in output", "authors": "Christina Walker, Joan C. Timoneda", "abstract": "Extant work shows that generative AI models such as GPT-3.5 and 4 perpetuate social stereotypes and biases. One concerning but less explored source of bias is ideology. Do GPT models take ideological stances on politically sensitive topics? In this article, we provide an original approach to identifying ideological bias in generative models, showing that bias can stem from both the training data and the filtering algorithm. We leverage linguistic variation in countries with contrasting political attitudes to evaluate bias in average GPT responses to sensitive political topics in those languages. First, we find that GPT output is more conservative in languages that map well onto conservative societies (i.e., Polish), and more liberal in languages used uniquely in liberal societies (i.e., Swedish). This result provides strong evidence of training data bias in GPT models. Second, differences across languages observed in GPT-3.5 persist in GPT-4, even though GPT-4 is significantly more liberal due to OpenAI's filtering policy. Our main takeaway is that generative model training must focus on high-quality, curated datasets to reduce bias, even if it entails a compromise in training data size. Filtering responses after training only introduces new biases and does not remove the underlying training biases.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.616305351257324, 2.8695051670074463], "openalex_id": "https://openalex.org/W4403598251", "title": "Optimization Hyper-parameter Laws for Large Language Models", "authors": "Xianan Xie, Kuangyu Ding, Shuicheng Yan, Kevin Toh, Tianwen Wei", "abstract": "Large Language Models have driven significant AI advancements, yet their training is resource-intensive and highly sensitive to hyper-parameter selection. While scaling laws provide valuable guidance on model size and data requirements, they fall short in choosing dynamic hyper-parameters, such as learning-rate (LR) schedules, that evolve during training. To bridge this gap, we present Optimization Hyper-parameter Laws (Opt-Laws), a framework that effectively captures the relationship between hyper-parameters and training outcomes, enabling the pre-selection of potential optimal schedules. Grounded in stochastic differential equations, Opt-Laws introduce novel mathematical interpretability and offer a robust theoretical foundation for some popular LR schedules. Our extensive validation across diverse model sizes and data scales demonstrates Opt-Laws' ability to accurately predict training loss and identify optimal LR schedule candidates in pre-training, continual training, and fine-tuning scenarios. This approach significantly reduces computational costs while enhancing overall model performance.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.939398765563965, 2.537693500518799], "openalex_id": "https://openalex.org/W4403593660", "title": "How Does Code Pretraining Affect Language Model Task Performance?", "authors": "Jackson Petty, Sjoerd van Steenkiste, Tal Linzen", "abstract": "Large language models are increasingly trained on corpora containing both natural language and non-linguistic data like source code. Aside from aiding programming-related tasks, anecdotal evidence suggests that including code in pretraining corpora may improve performance on other, unrelated tasks, yet to date no work has been able to establish a causal connection by controlling between language and code data. Here we do just this. We pretrain language models on datasets which interleave natural language and code in two different settings: additive, in which the total volume of data seen during pretraining is held constant; and competitive, in which the volume of language data is held constant. We study how the pretraining mixture affects performance on (a) a diverse collection of tasks included in the BigBench benchmark, and (b) compositionality, measured by generalization accuracy on semantic parsing and syntactic transformations. We find that pretraining on higher proportions of code improves performance on compositional tasks involving structured output (like semantic parsing), and mathematics. Conversely, increase code mixture can harm performance on other tasks, including on tasks that requires sensitivity to linguistic structure such as syntax or morphology, and tasks measuring real-world knowledge.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.9035468101501465, 2.1681106090545654], "openalex_id": "https://openalex.org/W4403598250", "title": "Untie the Knots: An Efficient Data Augmentation Strategy for Long-Context Pre-Training in Language Models", "authors": "Junfeng Tian, Da Zheng, Cheng Yang, Rui Wang, Colin Zhang, Debing Zhang", "abstract": "Large language models (LLM) have prioritized expanding the context window from which models can incorporate more information. However, training models to handle long contexts presents significant challenges. These include the scarcity of high-quality natural long-context data, the potential for performance degradation on short-context tasks, and the reduced training efficiency associated with attention mechanisms. In this paper, we introduce Untie the Knots (\\textbf{UtK}), a novel data augmentation strategy employed during the continue pre-training phase, designed to efficiently enable LLMs to gain long-context capabilities without the need to modify the existing data mixture. In particular, we chunk the documents, shuffle the chunks, and create a complex and knotted structure of long texts; LLMs are then trained to untie these knots and identify relevant segments within seemingly chaotic token sequences. This approach greatly improves the model's performance by accurately attending to relevant information in long context and the training efficiency is also largely increased. We conduct extensive experiments on models with 7B and 72B parameters, trained on 20 billion tokens, demonstrating that UtK achieves 75\\% and 84.5\\% accurracy on RULER at 128K context length, significantly outperforming other long context strategies. The trained models will open-source for further research.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.126116752624512, 1.9028289318084717], "openalex_id": "https://openalex.org/W4403665988", "title": "Tele-LLMs: A Series of Specialized Large Language Models for Telecommunications", "authors": "Ali Maatouk, Kenny Chirino Ampudia, Rex Ying, Leandros Tassiulas", "abstract": "The emergence of large language models (LLMs) has significantly impacted various fields, from natural language processing to sectors like medicine and finance. However, despite their rapid proliferation, the applications of LLMs in telecommunications remain limited, often relying on general-purpose models that lack domain-specific specialization. This lack of specialization results in underperformance, particularly when dealing with telecommunications-specific technical terminology and their associated mathematical representations. This paper addresses this gap by first creating and disseminating Tele-Data, a comprehensive dataset of telecommunications material curated from relevant sources, and Tele-Eval, a large-scale question-and-answer dataset tailored to the domain. Through extensive experiments, we explore the most effective training techniques for adapting LLMs to the telecommunications domain, ranging from examining the division of expertise across various telecommunications aspects to employing parameter-efficient techniques. We also investigate how models of different sizes behave during adaptation and analyze the impact of their training data on this behavior. Leveraging these findings, we develop and open-source Tele-LLMs, the first series of language models ranging from 1B to 8B parameters, specifically tailored for telecommunications. Our evaluations demonstrate that these models outperform their general-purpose counterparts on Tele-Eval and telecommunications-related literature tasks while retaining their previously acquired capabilities, thus avoiding the catastrophic forgetting phenomenon.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.8578009605407715, 3.12209153175354], "openalex_id": "https://openalex.org/W4402360722", "title": "Timely Quality Problem Resolution in Peer-Production Systems: The Impact of Bots, Policy Citations, and Contributor Experience", "authors": "Vitali Mindel, Aleksi Aaltonen, Arun Rai, Lars Mathiassen, Wael Jabr", "abstract": "Online peer-production systems create value by enabling people to participate in the production of a common good such as an open encyclopedia by building freely on each other\u2019s work. Fixing quality problems in peer production in a timely manner is critical because millions of people rely on peer-produced content for learning and decision making. The longer low-quality content remains in place, the more it can harm the reputation of a peer-production system and diminish the capability of the system to maintain its contributor base. We study different mechanism affecting the timeliness of quality problem resolution in Wikipedia and find that the speedy resolution of quality problems depends on the successful integration of software robots (bots) and the careful calibration of policy citations to the different levels of experience among contributors. Most control mechanisms found in firm-based production do not apply to peer production, and instead, quality control in peer production must leverage the strengths of different contributors and harness the benefits of technological support and adaptive policy frameworks to improve productivity and achieve high-quality outcomes.", "venue": "Information Systems Research", "label": 0}, {"loc": [9.21230697631836, -0.8715624809265137], "openalex_id": "https://openalex.org/W4402357486", "title": "LAWSUIT: a LArge expert-Written SUmmarization dataset of ITalian constitutional court verdicts", "authors": "Luca Ragazzi, Gianluca Moro, Stefano Guidi, Giacomo Frisoni", "abstract": "Abstract Large-scale public datasets are vital for driving the progress of abstractive summarization, especially in law, where documents have highly specialized jargon. However, the available resources are English-centered, limiting research advancements in other languages. This paper introduces LAWSUIT, a collection of 14K Italian legal verdicts with expert-authored abstractive maxims drawn from the Constitutional Court of the Italian Republic. LAWSUIT presents an arduous task with lengthy source texts and evenly distributed salient content. We offer extensive experiments with sequence-to-sequence and segmentation-based approaches, revealing that the latter achieve better results in full and few-shot settings. We openly release LAWSUIT to foster the development and automation of real-world legal applications.", "venue": "Artificial Intelligence and Law", "label": 0}, {"loc": [7.868521213531494, 0.542282223701477], "openalex_id": "https://openalex.org/W4402294378", "title": "Text-to-text generative approach for enhanced complex word identification", "authors": "Patrycja \u015aliwiak, Syed Afaq Ali Shah", "abstract": "This paper presents a novel approach for solving the Complex Word Identification (CWI) task using the text-to-text generative model. The CWI task involves identifying complex words in text, which is a challenging Natural Language Processing task. To our knowledge, it is a first attempt to address CWI problem into text-to-text context. In this work, we propose a new methodology that leverages the power of the Transformer model to evaluate complexity of words in binary and probabilistic settings. We also propose a novel CWI dataset, which consists of 62,200 phrases, both complex and simple. We train and fine-tune our proposed model on our CWI dataset. We also evaluate its performance on separate test sets across three different domains. Our experimental results demonstrate the effectiveness of our proposed approach compared to state-of-the-art methods.", "venue": "Neurocomputing", "label": 0}, {"loc": [3.88883900642395, -3.77482533454895], "openalex_id": "https://openalex.org/W4402318904", "title": "Should we stay silent on violence? An ensemble approach to detect violent incidents in Spanish social media texts", "authors": "Deepawali Sharma, Vedika Gupta, Vivek Kumar Singh, David Pinto", "abstract": "Abstract There has been a steep rise in user-generated content on the Web and social media platforms during the last few years. While the ease of content creation allows anyone to create content, at the same time it is difficult to monitor and control the spread of detrimental content. Recent research in natural language processing and machine learning has shown some hope for the purpose. Approaches and methods are now being developed for the automatic flagging of problematic textual content, namely hate speech, cyberbullying, or fake news, though mostly for English language texts. This paper presents an algorithmic approach based on deep learning models for the detection of violent incidents from tweets in the Spanish language (binary classification) and categorizes them further into five classes \u2013 accident, homicide, theft, kidnapping, and none (multi-label classification). The performance is evaluated on the recently shared benchmark dataset, and it is found that the proposed approach outperforms the various deep learning models, with a weighted average precision, recall, and F1-score of 0.82, 0.81, and 0.80, respectively, for the binary classification. Similarly, for the multi-label classification, the proposed model reports weighted average precision, recall, and F1-score of 0.54, 0.79, and 0.64, respectively, which is also superior to the existing results reported in the literature. The study, thus, presents meaningful contribution to detection of violent incidents in Spanish language social media posts.", "venue": "Natural language processing.", "label": 15}, {"loc": [9.533604621887207, 0.8596752285957336], "openalex_id": "https://openalex.org/W4402322571", "title": "Statistical dataset evaluation: A case study on named entity recognition", "authors": "Chengwen Wang, Qingxiu Dong, Xiaochen Wang, Zhifang Sui", "abstract": "Abstract Datasets serve as crucial training resources and model performance trackers. However, existing datasets have exposed a plethora of problems, inducing biased models and unreliable evaluation results. In this paper, we propose a model-agnostic dataset evaluation framework for automatic dataset quality evaluation. We seek the statistical properties of the datasets and address three fundamental dimensions: reliability, difficulty, and validity, following a Classical Test Theory (CTT). Taking the named entity recognition (NER) datasets as a case study, we introduce nine statistical metrics for a statistical dataset evaluation framework. Specifically, we investigate the reliability of a NER dataset with three metrics, including Redundancy, Accuracy, and Leakage Ratio. We assess the dataset difficulty through four metrics: Unseen Entity Ratio, Entity Ambiguity Degree, Entity Density, and Model Differentiation. For validity, we introduce the Entity Imbalance Degree and Entity-Null Rate to evaluate the effectiveness of the dataset in assessing language model performance. Experimental results validate that our evaluation framework effectively assesses various aspects of the dataset quality. Furthermore, we study how the dataset scores on our statistical metrics affect the model performance and appeal for dataset quality evaluation or targeted dataset improvement before training or testing models.", "venue": "Natural language processing.", "label": 15}, {"loc": [8.678323745727539, -1.0265612602233887], "openalex_id": "https://openalex.org/W4403160498", "title": "Historical German Text Normalization Using Type-and Token-Based Language Modeling", "authors": "Anton Ehrmanntraut", "abstract": "Historic variations of spelling poses a challenge for full-text search or natural language processing on historical digitized texts. To minimize the gap between the historic orthography and contemporary spelling, usually an automatic orthographic normalization of the historical source material is pursued. This report proposes a normalization system for German literary texts from c. 1700-1900, trained on a parallel corpus. The proposed system makes use of a machine learning approach using Transformer language models, combining an encoder-decoder model to normalize individual word types, and a pre-trained causal language model to adjust these normalizations within their context. An extensive evaluation shows that the proposed system provides state-of-the-art accuracy, comparable with a much larger fully end-to-end sentence-based normalization system, fine-tuning a pre-trained Transformer large language model. However, the normalization of historical text remains a challenge due to difficulties for models to generalize, and the lack of extensive high-quality parallel data.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.74538516998291, 2.4096250534057617], "openalex_id": "https://openalex.org/W4402954430", "title": "DataSculpt: Crafting Data Landscapes for LLM Post-Training through Multi-objective Partitioning", "authors": "Keer Lu, Liang Zheng, Xiaonan Nie, Da Pan, Shusen Zhang, Keshi Zhao, Weipeng Chen, Zenan Zhou, Guosheng Dong, Wentao Zhang, Bin Cui", "abstract": "In recent years, Large Language Models (LLMs) have demonstrated significant improvements across a variety of tasks, one of which is the long-context capability. The key to improving long-context performance lies in effective data organization and management strategies that integrate data from multiple domains and optimize the context window during training. Through extensive experimental analysis, we identified three key challenges in designing effective data management strategies that enable the model to achieve long-context capability without sacrificing performance in other tasks: (1) a shortage of long documents across multiple domains, (2) effective construction of context windows, and (3) efficient organization of large-scale datasets. To address these challenges, we introduce DataSculpt, a novel data management framework designed for long-context training. We first formulate the organization of training data as a multi-objective combinatorial optimization problem, focusing on attributes including relevance, homogeneity, integrity, and efficiency. Specifically, our approach utilizes a coarse-to-fine methodology to optimize training data organization both efficiently and effectively. We begin by clustering the data based on semantic similarity (coarse), followed by a multi-objective greedy search within each cluster to score and concatenate documents into various context windows (fine). Our comprehensive evaluations demonstrate that DataSculpt significantly enhances long-context training performance, resulting in improvements of 18.09% in retrieval augmentation, 21.23% in summarization, 21.27% in reading comprehension, and a 3.81% increase in code completion, while also maintaining overall model proficiency with a 4.88% improvement.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.268015384674072, 5.4074015617370605], "openalex_id": "https://openalex.org/W4402955109", "title": "General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model", "authors": "Haoran Wei, Chenglong Liu, Jinyue Chen, Jia Wang, Lingyu Kong, Yanming Xu, Zheng Ge, Zhao Liang, Jianjian Sun, Yuang Peng, Chunrui Han, Xiangyu Zhang", "abstract": "Traditional OCR systems (OCR-1.0) are increasingly unable to meet people's usage due to the growing demand for intelligent processing of man-made optical characters. In this paper, we collectively refer to all artificial optical signals (e.g., plain texts, math/molecular formulas, tables, charts, sheet music, and even geometric shapes) as \"characters\" and propose the General OCR Theory along with an excellent model, namely GOT, to promote the arrival of OCR-2.0. The GOT, with 580M parameters, is a unified, elegant, and end-to-end model, consisting of a high-compression encoder and a long-contexts decoder. As an OCR-2.0 model, GOT can handle all the above \"characters\" under various OCR tasks. On the input side, the model supports commonly used scene- and document-style images in slice and whole-page styles. On the output side, GOT can generate plain or formatted results (markdown/tikz/smiles/kern) via an easy prompt. Besides, the model enjoys interactive OCR features, i.e., region-level recognition guided by coordinates or colors. Furthermore, we also adapt dynamic resolution and multi-page OCR technologies to GOT for better practicality. In experiments, we provide sufficient results to prove the superiority of our model.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.912827253341675, 2.8745806217193604], "openalex_id": "https://openalex.org/W4402953261", "title": "Building Better Datasets: Seven Recommendations for Responsible Design from Dataset Creators", "authors": "Will Orr, Kate Crawford", "abstract": "The increasing demand for high-quality datasets in machine learning has raised concerns about the ethical and responsible creation of these datasets. Dataset creators play a crucial role in developing responsible practices, yet their perspectives and expertise have not yet been highlighted in the current literature. In this paper, we bridge this gap by presenting insights from a qualitative study that included interviewing 18 leading dataset creators about the current state of the field. We shed light on the challenges and considerations faced by dataset creators, and our findings underscore the potential for deeper collaboration, knowledge sharing, and collective development. Through a close analysis of their perspectives, we share seven central recommendations for improving responsible dataset creation, including issues such as data quality, documentation, privacy and consent, and how to mitigate potential harms from unintended use cases. By fostering critical reflection and sharing the experiences of dataset creators, we aim to promote responsible dataset creation practices and develop a nuanced understanding of this crucial but often undervalued aspect of machine learning research.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.271153450012207, 1.8875888586044312], "openalex_id": "https://openalex.org/W4403584102", "title": "OnlySportsLM: Optimizing Sports-Domain Language Models with SOTA Performance under Billion Parameters", "authors": "Zi-Jia Chen, Chengxi Li, Xiang Xie, Parijat Dube", "abstract": "This paper explores the potential of a small, domain-specific language model trained exclusively on sports-related data. We investigate whether extensive training data with specially designed small model structures can overcome model size constraints. The study introduces the OnlySports collection, comprising OnlySportsLM, OnlySports Dataset, and OnlySports Benchmark. Our approach involves: 1) creating a massive 600 billion tokens OnlySports Dataset from FineWeb, 2) optimizing the RWKV architecture for sports-related tasks, resulting in a 196M parameters model with 20-layer, 640-dimension structure, 3) training the OnlySportsLM on part of OnlySports Dataset, and 4) testing the resultant model on OnlySports Benchmark. OnlySportsLM achieves a 37.62%/34.08% accuracy improvement over previous 135M/360M state-of-the-art models and matches the performance of larger models such as SomlLM 1.7B and Qwen 1.5B in the sports domain. Additionally, the OnlySports collection presents a comprehensive workflow for building high-quality, domain-specific language models, providing a replicable blueprint for efficient AI development across various specialized fields.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.631499290466309, 3.8628146648406982], "openalex_id": "https://openalex.org/W4402955413", "title": "OLMoE: Open Mixture-of-Experts Language Models", "authors": "Niklas Muennighoff, Luca Soldaini, Dirk Groeneveld, Kyle Lo, Jacob Morrison, Sewon Min, Weijia Shi, Pete Walsh, Oyvind Tafjord, Nathan Lambert, Yuling Gu, Shane Arora, Akshita Bhagia, Dustin Schwenk, David Wadden, Alexander Wettig, Binyuan Hui, Tim Dettmers, Douwe Kiela, Ali Farhadi, Noah A. Smith, Pang Wei Koh, Amanpreet Singh, Hannaneh Hajishirzi", "abstract": "We introduce OLMoE, a fully open, state-of-the-art language model leveraging sparse Mixture-of-Experts (MoE). OLMoE-1B-7B has 7 billion (B) parameters but uses only 1B per input token. We pretrain it on 5 trillion tokens and further adapt it to create OLMoE-1B-7B-Instruct. Our models outperform all available models with similar active parameters, even surpassing larger ones like Llama2-13B-Chat and DeepSeekMoE-16B. We present various experiments on MoE training, analyze routing in our model showing high specialization, and open-source all aspects of our work: model weights, training data, code, and logs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.323622465133667, -0.7583858370780945], "openalex_id": "https://openalex.org/W4402325930", "title": "Natural Language Processing Approaches for Monitoring Health Activities", "authors": "Deahan Yu", "abstract": "Monitoring health activities is vital for individual and public health. It identifies trends, detects risks early, and tailors interventions. At the individual level, it informs customized treatments. At the population level, monitoring broad health patterns informs public health decisions. In this dissertation, we explore the application of natural language processing (NLP), machine learning, and deep learning techniques to advance methods for collecting and monitoring health activity data. We develop two NLP approaches tailored for health activities at individual and population levels: (1) capturing patient treatment activity through clinical notes to enhance patient monitoring practices, and (2) capturing adverse drug events using social media text data to augment drug safety practices. Each approach is customized to the unique characteristics of the respective text datasets and problems. By enhancing health monitoring practices via NLP, this dissertation ultimately endeavors to achieve better health outcomes. This dissertation provides not only a summary of novel NLP tools for monitoring health activities, but also a roadmap to extend research in multiple directions in Health Informatics. The primary focus on method development is complemented by an exploration of the effectiveness of various NLP approaches, from simple rules to advanced deep learning techniques. Such exploration illustrates that different kinds of approaches can be effective depending on the context, highlighting the importance of understanding and adapting to the specific characteristics of different text data sources and problems. We also underscore the critical role of reliability and interpretability in deploying NLP models in real-world healthcare contexts. Lastly, we discuss how NLP approaches can yield valuable insights for key stakeholders, including public health authorities, medical professionals, and pharmaceutical companies.", "venue": "Deep Blue (University of Michigan)", "label": 23}, {"loc": [5.631734848022461, 1.7875388860702515], "openalex_id": "https://openalex.org/W4402326015", "title": "Evaluating Prediction-Based Theories of Bilingual Comprehension of Spanish/English Codeswitches.", "authors": "Natasha Vernooij", "abstract": "This dissertation investigates how bilinguals use their two grammars to comprehend written intra-sentential codeswitches. I focus on adjective/noun constructions in Spanish and English where I manipulate the congruence of grammatical word order in the two languages across the codeswitch boundary. This allows me to test three codeswitching frameworks, the established Matrix Language Framework (Myers-Scotton, 2002) and two new frameworks that I propose, both of which integrate incremental predictions into their accounts of bilingual comprehension: the Current Word Hypothesis and the Surprisal Codeswitching Hypothesis. Each of the three frameworks propose that bilinguals use different types of information to predict upcoming language. The Matrix Language Framework proposes that bilinguals use the predominant language of the sentence to predict the upcoming word order of a sentence. The Current Word Hypothesis proposes that bilinguals use the language and grammatical category of the current word to predict the grammatical category of an upcoming word. The Surprisal Codeswitching Hypothesis proposes that bilinguals use the entire left context to predict upcoming words. Before testing the codeswitching frameworks, I identified which types of Spanish adjectives (pre-nominal; post-nominal; change: adjectives that change meaning based on their position; or no change: adjectives that do not change meaning based on their position) maximize the grammatical difference between Spanish and English. In an offline rating task, Spanish/English bilinguals preferred post-nominal and no change adjectives in the post-nominal position, and these were used in subsequent experimental stimuli. I then investigated bilingual processing of determiner/noun codeswitches where Spanish and English have the same word order and adjective/noun codeswitches where Spanish and English have different word orders in a stop-making-sense task. I established the task\u2019s viability for evaluating codeswitch comprehension and the predictions of the Matrix Language Framework and the Current Word Hypothesis. I then tested the two frameworks against each other in the same task and found overwhelming support for the Current Word Hypothesis. Finally, I compared surprisal as computed by GPT-3 to the human stop-making-sense data to evaluate if the Surprisal Codeswitching Hypothesis, Current Word Hypothesis, or Matrix Language Framework provide the best account for human codeswitch comprehension. Overall, I found support for codeswitching frameworks that include incremental predictions, though the Surprisal Codeswitching Hypothesis does not subsume the Current Word Hypothesis. Further, I evaluated the extent to which multilingual large language models (LLMs) such as GPT-3 can be used as a mental model for bilingual comprehension of codeswitches and found that LLMs can account for codeswitch effects but cannot fully account for the effects of other experimentally manipulated variables. In sum, this dissertation presents five main contributions: 1) I advance two new theoretical frameworks for understanding bilingual codeswitch comprehension, the Current Word Hypothesis and the Surprisal Codeswitching Hypothesis; 2) I validated the use of the stop-making-sense task on multilingual stimuli; 3) I found evidence that bilinguals flexibly switch between their mental grammars on a word-by-word basis; 4) I evaluated the viability of using multilingual LLMs as a mental model for bilingual comprehension of codeswitches; and 5) I found that while GPT-3 surprisal is a strong predictor of human responses to codeswitched sentences, the Surprisal Codeswitching Hypothesis provides an incomplete account of bilingual processing of codeswitches. Instead, bilinguals can flexibly switch grammars on a word-by-word basis.", "venue": "Deep Blue (University of Michigan)", "label": 23}, {"loc": [6.314295291900635, 5.717064380645752], "openalex_id": "https://openalex.org/W4402325962", "title": "Towards Video Understanding through Language in Real-life Settings", "authors": "Santiago Castro", "abstract": "Videos have become an integral part of our daily lives, with a rapidly growing number on YouTube, Netflix, and TikTok serving as testimony to their widespread popularity. Behind the simplicity of their interfaces and user experiences, the systems that power these products employ numerous video-understanding techniques, even for straightforward use cases such as finding a video on how to cook salmon. Despite the significant progress achieved in this area, there remains a gap between lab-setting capabilities and reality, as multiple phenomena are not adequately designed for realistic settings, causing various issues such as domain mismatches and the diverse way people interact in videos (e.g., sarcastically). My work aims to bridge this gap by enabling the understanding of video content in realistic settings. The issues that make current video understanding research unsuitable for real life can be classified into data, methods, and evaluation. The data aspect is crucial since current research has predominantly overlooked real-life settings. I present new datasets and benchmarks for such domains: daily situations and in-the-wild scenarios. These benchmarks measure the effectiveness of new methods in these more realistic settings. Likewise, I introduce a novel framework that accounts for a typical yet understudied human behavior: sarcasm. Sarcasm is particularly suited to be studied in video since I show that leveraging what we see and hear (as people commonly do) allows one to understand it better. For the methods aspect, I consider a fundamental issue, which is the impracticality and lack of scalability of the traditional in-the-lab setting, tuning one model for each newly addressed task and domain. I propose a robust method that allows practitioners to employ a single model for novel tasks and domains with satisfactory performance. Additionally, I present a technique to improve the compositional generalization of existing models. Finally, I focus on current practices for evaluation and propose a framework better suited to realistic settings. Current benchmarks for short video understanding have drawbacks, such as employing easy-to-detect distractor answers, not accounting for diversity when depicting the same situation, and not considering realistic settings. I present a novel evaluation format that tackles all these issues and a benchmark that leverages it. The benchmark shows a gap between the performance of several methods and humans.", "venue": "Deep Blue (University of Michigan)", "label": 23}, {"loc": [6.51058292388916, 0.7333173751831055], "openalex_id": "https://openalex.org/W4404882631", "title": "Complex Word Identification for Italian Language: a dictionary-based approach", "authors": "Laura Occhipinti", "abstract": "Assessing word complexity in Italian poses significant challenges, particularly due to the absence of a standardized dataset. This study introduces the first automatic model designed to identify word complexity for native Italian speakers. A dictionary of simple and complex words was constructed, and various configurations of linguistic features were explored to find the best statistical classifier based on Random Forest algorithm. Considering the probabilities of a word to belong to a class, a comparison between the models\u2019 predictions and human assessments derived from a dataset annotated for complexity perception was made. Finally, the degree of accord between the model predictions and the human inter-annotator agreement was analyzed using Spearman correlation. Our findings indicate that a model incorporating both linguistic features and word embeddings performed better than other simpler models, also showing a value of correlation with the human judgements similar to the inter-annotator agreement. This study demonstrates the feasibility of an automatic system for detecting complexity in the Italian language with good performances and comparable effectiveness to humans in this subjective task.", "venue": "http://doi.org/10.47810/clib.24.12", "label": 0}, {"loc": [3.006624460220337, -0.07416386157274246], "openalex_id": "https://openalex.org/W4403151287", "title": "Accuracy of Large Language Models in Answering Ophthalmology Board-Style Questions: A Meta-Analysis", "authors": "Jo\u2010Hsuan Wu, Takashi Nishida, Tianming Liu", "abstract": "The overall accuracy of LLMs in answering ophthalmology board-style questions was acceptable but not exceptional, with ChatGPT-4 and Bing Chat being top-performing models. Performance varied significantly based on specific ophthalmology topics tested. Inconsistent performances are of concern, highlighting the need for future studies to include ophthalmology board-style questions with images to more comprehensively examine the competency of LLMs.", "venue": "Asia-Pacific Journal of Ophthalmology", "label": 0}, {"loc": [6.5471978187561035, 0.7019543051719666], "openalex_id": "https://openalex.org/W4403282356", "title": "What do BERT word embeddings learn about the French language?", "authors": "Ekaterina Goliakova, David Langlois", "abstract": "Pre-trained word embeddings (for example, BERT-like) have been successfully used in a variety of downstream tasks. However, do all embeddings, obtained from the models of the same architecture, encode information in the same way? Does the size of the model correlate to the quality of the information encoding? In this paper, we will attempt to dissect the dimensions of several BERT-like models that were trained on the French language to find where grammatical information (gender, plurality, part of speech) and semantic features might be encoded. In addition to this, we propose a framework for comparing the quality of encoding in different models.", "venue": "http://doi.org/10.47810/clib.24.02", "label": 0}, {"loc": [3.411792755126953, 1.8148914575576782], "openalex_id": "https://openalex.org/W4403373401", "title": "Who Gets Paid (for) What? The Cultural Political Economy of News Content in Generative AI", "authors": "Siho Nam", "abstract": "One of the key controversies that generative artificial intelligence (AI) has recently stirred was whether compensation is due for the copyrighted materials used to train AI models. This article explores the logic, trajectories, and dynamics of content generation, including news, through generative AI in two distinctive yet intertwined domains. Guided by a cultural political economy approach, it examines how both the political context (validation/legitimation of AI-generated news content by established news media) and the economic context (use of unpaid and underpaid labor in the forms of freely scraped data and data annotation work) shape the deployment of news content on AI models. It further untangles how the space for serious, independent journalism may shrink, as big tech companies\u2019 algorithmic technologies emerge as a solution to contemporary problems in journalism. A clear danger here is that AI companies\u2019 proprietary algorithms, language training models, and value-laden parameters are incompatible with journalism's democratic obligations and responsibilities.", "venue": "Emerging Media", "label": 0}, {"loc": [7.900121688842773, -0.8947761654853821], "openalex_id": "https://openalex.org/W4402706602", "title": "MQM-Chat: Multidimensional Quality Metrics for Chat Translation", "authors": "Yunmeng Li, Jun Suzuki, Makoto Morishita, Kaori Abe, Kentaro Inui", "abstract": "The complexities of chats pose significant challenges for machine translation models. Recognizing the need for a precise evaluation metric to address the issues of chat translation, this study introduces Multidimensional Quality Metrics for Chat Translation (MQM-Chat). Through the experiments of five models using MQM-Chat, we observed that all models generated certain fundamental errors, while each of them has different shortcomings, such as omission, overly correcting ambiguous source content, and buzzword issues, resulting in the loss of stylized information. Our findings underscore the effectiveness of MQM-Chat in evaluating chat translation, emphasizing the importance of stylized content and dialogue consistency for future studies.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.726347923278809, 1.289660096168518], "openalex_id": "https://openalex.org/W4402712357", "title": "Theoretical and Methodological Framework for Studying Texts Produced by Large Language Models", "authors": "Ji\u0159\u00ed Mili\u010dka", "abstract": "This paper addresses the conceptual, methodological and technical challenges in studying large language models (LLMs) and the texts they produce from a quantitative linguistics perspective. It builds on a theoretical framework that distinguishes between the LLM as a substrate and the entities the model simulates. The paper advocates for a strictly non-anthropomorphic approach to models while cautiously applying methodologies used in studying human linguistic behavior to the simulated entities. While natural language processing researchers focus on the models themselves, their architecture, evaluation, and methods for improving performance, we as quantitative linguists should strive to build a robust theory concerning the characteristics of texts produced by LLMs, how they differ from human-produced texts, and the properties of simulated entities. Additionally, we should explore the potential of LLMs as an instrument for studying human culture, of which language is an integral part.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.6051669120788574, 2.04449462890625], "openalex_id": "https://openalex.org/W4402062829", "title": "Overdetermination of businesswomen in the British press: a corpus-based approach in critical discourse analysis", "authors": "Thi Ngoc Thao Nguyen", "abstract": "Within the strand of critical discourse analysis (CDA), research on media portrayal of individuals is prevalent, with the scope having been expanded beyond such subjects as refugees or asylum seekers, but also the breadth of research having been enhanced with larger datasets under the corpus-based approach. This study, utilizing both corpus-based quantitative analysis as well as qualitative analysis, focuses on the depiction of businesswomen in the British press. The corpus comprises business-related articles from The Guardian published in 2017 and 2018, gathered through website crawling. As for qualitative analysis, those articles are examined under the category of overdetermination Kress van Leeuwen\u2019s social actor framework (2008) combined with the metaphor identification procedure (MIP) by Pragglejaz Group (2007). The results show that businesswomen are represented in different social practices, such as maritime, fighting, racing, and climbing. Based on these results, the portrayal of businesswomen in the media is sketched and suggested with further implications.", "venue": "ICTE Conference Proceedings", "label": 0}, {"loc": [3.357121229171753, 1.6080552339553833], "openalex_id": "https://openalex.org/W4402745581", "title": "\" I, for One, Welcome Our New\" AI Jurors: ChatGPT and the Future of the Jury System in American Law", "authors": "Matthew J. O\u2019Hara", "abstract": "This article explores the potential for advanced generative text AI systems like ChatGPT to serve as a replacement for human juries in the modern legal system. It argues that the vast knowledge base and perspective-aggregation capabilities of these AI models uniquely position them as potentially superior embodiments of the \u201ccommunity conscience\u201d that juries are meant to represent. By synthesizing diverse viewpoints into nuanced, context-sensitive judgments, AI juries could in theory do justice to the broader values and concerns of society in ways that 12-person human juries often fail to achieve. The article first examines the technical capabilities of state-of-the-art language models like ChatGPT, emphasizing the vast scope and diversity of their training data which spans a huge range of human knowledge and perspectives. It then traces the historical development of the jury system and its essential functions as both the moral conscience of the community and a source of democratic legitimacy for the legal system. Building on this foundation, the article makes the case that AI is poised to fulfill the representative and deliberative roles of juries more effectively than human jurors by virtue of its unparalleled capacity to absorb and synthesize society\u2019s heterogeneous values and viewpoints. However, it also carefully considers the significant risks and challenges associated with AI juries, including issues of algorithmic bias, the opacity of machine reasoning, the potential erosion of public trust, and the philosophical implications of outsourcing moral judgment to artificial intelligence. Ultimately, the article argues that while the use of AI in legal decision-making is likely inevitable, it is crucial that we proactively shape the terms of this integration in ways that uphold the core values of fairness, transparency, and democratic accountability. The jury system has long been celebrated as a bastion of citizen participation in the law \u2013 the article concludes by calling for a robust public dialogue on how AI can be harnessed to enhance, rather than erode, this vital civic institution. Keywords: Artificial Intelligence, ChatGPT, Jury, AI Ethics, Moral Reasoning, Machine Learning, Algorithm, Legal Tech, Law and Technology, AI Governance", "venue": "International Journal of Law Ethics and Technology", "label": 0}, {"loc": [7.660673141479492, -1.1615512371063232], "openalex_id": "https://openalex.org/W4402017565", "title": "From Rule-Based Models to Deep Learning Transformers Architectures for Natural Language Processing and Sign Language Translation Systems: Survey, Taxonomy \u2026", "authors": "Nada Shahin, Leila Ismail", "abstract": "With the growing Deaf and Hard of Hearing population worldwide and the persistent shortage of certified sign language interpreters, there is a pressing need for an efficient, signs-driven, integrated end-to-end translation system, from sign to gloss to text and vice-versa. There has been a wealth of research on machine translations and related reviews. However, there are few works on sign language machine translation considering the particularity of the language being continuous and dynamic. This paper aims to address this void, providing a retrospective analysis of the temporal evolution of sign language machine translation algorithms and a taxonomy of the Transformers architectures, the most used approach in language translation. We also present the requirements of a real-time Quality-of-Service sign language ma-chine translation system underpinned by accurate deep learning algorithms. We propose future research directions for sign language translation systems.", "venue": "Artificial Intelligence Review", "label": 18}, {"loc": [3.7574362754821777, 2.829113483428955], "openalex_id": "https://openalex.org/W4402952825", "title": "Trustworthy and Responsible AI for Human-Centric Autonomous Decision-Making Systems", "authors": "Farzaneh Dehghani, Mahsa Dibaji, Fahim Anzum, Lily Dey, Alican Basdemir, Sayeh Bayat, Jean-Christophe Boucher, Steve Drew, Sarah Elaine Eaton, Richard Frayne, Gouri Ginde, Ashley E. Harris, Yani Ioannou, Catherine Lebel, John Lysack, Leslie Salgado Arzuaga, Emma A. M. Stanley, Roberto Souza, Ronnie de Souza Santos, Lana Wells, Tyler Williamson, Matthias Wilms, Zaman Wahid, Mark Ungrin, Marina V. Gavrilova, Mariana Bento", "abstract": "Artificial Intelligence (AI) has paved the way for revolutionary decision-making processes, which if harnessed appropriately, can contribute to advancements in various sectors, from healthcare to economics. However, its black box nature presents significant ethical challenges related to bias and transparency. AI applications are hugely impacted by biases, presenting inconsistent and unreliable findings, leading to significant costs and consequences, highlighting and perpetuating inequalities and unequal access to resources. Hence, developing safe, reliable, ethical, and Trustworthy AI systems is essential. Our team of researchers working with Trustworthy and Responsible AI, part of the Transdisciplinary Scholarship Initiative within the University of Calgary, conducts research on Trustworthy and Responsible AI, including fairness, bias mitigation, reproducibility, generalization, interpretability, and authenticity. In this paper, we review and discuss the intricacies of AI biases, definitions, methods of detection and mitigation, and metrics for evaluating bias. We also discuss open challenges with regard to the trustworthiness and widespread application of AI across diverse domains of human-centric decision making, as well as guidelines to foster Responsible and Trustworthy AI models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.1991047859191895, -0.5302631258964539], "openalex_id": "https://openalex.org/W4402705933", "title": "Harnessing the Intrinsic Knowledge of Pretrained Language Models for Challenging Text Classification Settings", "authors": "Lingyu Gao", "abstract": "Text classification is crucial for applications such as sentiment analysis and toxic text filtering, but it still faces challenges due to the complexity and ambiguity of natural language. Recent advancements in deep learning, particularly transformer architectures and large-scale pretraining, have achieved inspiring success in NLP fields. Building on these advancements, this thesis explores three challenging settings in text classification by leveraging the intrinsic knowledge of pretrained language models (PLMs). Firstly, to address the challenge of selecting misleading yet incorrect distractors for cloze questions, we develop models that utilize features based on contextualized word representations from PLMs, achieving performance that rivals or surpasses human accuracy. Secondly, to enhance model generalization to unseen labels, we create small finetuning datasets with domain-independent task label descriptions, improving model performance and robustness. Lastly, we tackle the sensitivity of large language models to in-context learning prompts by selecting effective demonstrations, focusing on misclassified examples and resolving model ambiguity regarding test example labels.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.014824390411377, 1.8532984256744385], "openalex_id": "https://openalex.org/W4402706093", "title": "A New Method for Cross-Lingual-based Semantic Role Labeling", "authors": "Mohammad Ebrahimi\uf02a, Behrouz Minaei Bidgoli, Nasim Khozouei", "abstract": "Semantic role labeling is a crucial task in natural language processing, enabling better comprehension of natural language. However, the lack of annotated data in multiple languages has posed a challenge for researchers. To address this, a deep learning algorithm based on model transfer has been proposed. The algorithm utilizes a dataset consisting of the English portion of CoNLL2009 and a corpus of semantic roles in Persian. To optimize the efficiency of training, only ten percent of the educational data from each language is used. The results of the proposed model demonstrate significant improvements compared to Niksirt et al.'s model. In monolingual mode, the proposed model achieved a 2.05 percent improvement on F1-score, while in cross-lingual mode, the improvement was even more substantial, reaching 6.23 percent. Worth noting is that the compared model only trained two of the four stages of semantic role labeling and employed golden data for the remaining two stages. This suggests that the actual superiority of the proposed model surpasses the reported numbers by a significant margin. The development of cross-lingual methods for semantic role labeling holds promise, particularly in addressing the scarcity of annotated data for various languages. These advancements pave the way for further research in understanding and processing natural language across different linguistic contexts.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.6720757484436035, 2.131608247756958], "openalex_id": "https://openalex.org/W4402705130", "title": "BaichuanSEED: Sharing the Potential of ExtensivE Data Collection and Deduplication by Introducing a Competitive Large Language Model Baseline", "authors": "Guosheng Dong, Da Pan, Yiding Sun, Shusen Zhang, Liang Zheng, Xin Wu, Yanjun Shen, Fan Yang, Haoze Sun, Tianpeng Li, Mingan Lin, Jianhua Xu, Yufan Zhang, Xiaonan Nie, Lei Su, Bingning Wang, Wentao Zhang, Jiaxin Mao, Zenan Zhou, Weipeng Chen", "abstract": "The general capabilities of Large Language Models (LLM) highly rely on the composition and selection on extensive pretraining datasets, treated as commercial secrets by several institutions. To mitigate this issue, we open-source the details of a universally applicable data processing pipeline and validate its effectiveness and potential by introducing a competitive LLM baseline. Specifically, the data processing pipeline consists of broad collection to scale up and reweighting to improve quality. We then pretrain a 7B model BaichuanSEED with 3T tokens processed by our pipeline without any deliberate downstream task-related optimization, followed by an easy but effective supervised fine-tuning stage. BaichuanSEED demonstrates consistency and predictability throughout training and achieves comparable performance on comprehensive benchmarks with several commercial advanced large language models, such as Qwen1.5 and Llama3. We also conduct several heuristic experiments to discuss the potential for further optimization of downstream tasks, such as mathematics and coding.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.400409698486328, 0.6012737154960632], "openalex_id": "https://openalex.org/W4402355582", "title": "The Tokenization Problem: Understanding Generative AI's Computational Language Bias", "authors": "Marijana Asprovska, Nathan Hunter", "abstract": "The revolutionary potential of ChatGPT raises a critical question: Revolutionary for whom? This paper examines potential inequalities in how ChatGPT \u201ctokenizes\u201d texts across different languages. Translated 3,000-5,000 character passages in 108 languages were analyzed using OpenAI's tokenizer. English was treated as the baseline, with a \u201ctoken multiplier\u201d calculated for each language. The analysis revealed English was over 13 times more efficient than some languages. Key findings showed while the pre-training dataset size and character counts have nuanced roles, the alphabet used and the prevalence of special characters significantly impact efficiency. These discrepancies have real-world implications regarding usage costs and model limitations across languages. Thus, consciously addressing the tokenization imbalance is critical for ensuring equitable access to AI systems across diverse languages.", "venue": "Ubiquity Proceedings", "label": 0}, {"loc": [6.8301873207092285, 0.9207538366317749], "openalex_id": "https://openalex.org/W4402705597", "title": "A Survey of Large Language Models for European Languages", "authors": "Wazir Ali, Sampo Pyysalo", "abstract": "Large Language Models (LLMs) have gained significant attention due to their high performance on a wide range of natural language tasks since the release of ChatGPT. The LLMs learn to understand and generate language by training billions of model parameters on vast volumes of text data. Despite being a relatively new field, LLM research is rapidly advancing in various directions. In this paper, we present an overview of LLM families, including LLaMA, PaLM, GPT, and MoE, and the methods developed to create and enhance LLMs for official European Union (EU) languages. We provide a comprehensive summary of common monolingual and multilingual datasets used for pretraining large language models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.413006782531738, 3.1450562477111816], "openalex_id": "https://openalex.org/W4401929925", "title": "Optimizing Large Language Models with Multi-Degree Low-Rank Approximations", "authors": "Benjamin Sisoka, William T. Robinson", "abstract": "Abstract The increasing computational demands and resource requirements of advanced neural network models have created a growing need for efficient methods to enhance their scalability and deployment, particularly in environments with limited hardware capabilities. Addressing this challenge, the novel application of multi-degree low-rank approximations provides a significant breakthrough, enabling substantial reductions in memory usage and computational costs while preserving high levels of performance. Experiments conducted on the Mistral model demonstrated that this approach can effectively balance the trade-offs between model complexity and accuracy, achieving reduced perplexity and improved classification performance across a range of tasks. The use of varying degrees of rank reduction allowed for tailored optimization, enhancing the model's adaptability to different task requirements and operational environments. The findings suggest that multi-degree low-rank approximations are not only a viable solution for optimizing large-scale neural networks but also a versatile tool for extending the applicability of sophisticated language models to resource-constrained settings. This approach opens up new possibilities for the deployment of advanced language processing capabilities in real-time applications, mobile devices, and other platforms where computational efficiency is critical.", "venue": "https://doi.org/10.21203/rs.3.rs-4966694/v1", "label": 0}, {"loc": [3.775554656982422, -3.9515864849090576], "openalex_id": "https://openalex.org/W4401916512", "title": "Hate speech detection in low-resourced Indian languages: An analysis of transformer-based monolingual and multilingual models with cross-lingual experiments", "authors": "Koyel Ghosh, Apurbalal Senapati", "abstract": "Abstract Warning: This paper is based on hate speech detection and may contain examples of abusive/ offensive phrases. Cyberbullying, online harassment, etc., via offensive comments are pervasive across different social media platforms like \u2122Twitter, \u2122Facebook, \u2122YouTube, etc. Hateful comments must be detected and eradicated to prevent harassment and violence on social media. In the Natural Language Processing (NLP) domain, the most prevalent task is comment classification, which is challenging, and language models based on transformers are at the forefront of this advancement. This paper intends to analyze the performance of language models based on transformers like BERT, ALBERT, RoBERTa, and DistilBERT on the Indian hate speech datasets over binary classification. Here, we utilize the existing datasets, i.e., HASOC (Hindi and Marathi) and HS-Bangla. So, we evaluate several multilingual language models like MuRIL-BERT, XLM-RoBERTa, etc., few monolingual language models like RoBERTa-Hindi, Maha-BERT (Marathi), Bangla-BERT (Bangla), Assamese-BERT (Assamese), etc., and perform cross-lingual experiment also. For further analyses, we perform multilingual, monolingual, and cross-lingual experiments on our H ate S peech Assamese (HS-Assamese) (Indo-Aryan language family) and H ate S peech Bodo (HS-Bodo) (Sino-Tibetan language family) dataset (HS dataset version 2) also and achieved a promising result. The motivation of the cross-lingual experiment is to encourage researchers to learn about the power of the transformer. Note that no pre-trained language models are currently available for Bodo or any other Sino-Tibetan languages.", "venue": "Natural language processing.", "label": 15}, {"loc": [6.207876682281494, 4.546225547790527], "openalex_id": "https://openalex.org/W4402701972", "title": "Re-Mix: Optimizing Data Mixtures for Large Scale Imitation Learning", "authors": "Joey Hejna, Chethan Bhateja, Yichen Jian, Karl Pertsch, Dorsa Sadigh", "abstract": "Increasingly large imitation learning datasets are being collected with the goal of training foundation models for robotics. However, despite the fact that data selection has been of utmost importance in vision and natural language processing, little work in robotics has questioned what data such models should actually be trained on. In this work we investigate how to weigh different subsets or ``domains'' of robotics datasets for robot foundation model pre-training. Concrete, we use distributionally robust optimization (DRO) to maximize worst-case performance across all possible downstream domains. Our method, Re-Mix, addresses the wide range of challenges that arise when applying DRO to robotics datasets including variability in action spaces and dynamics across different datasets. Re-Mix employs early stopping, action normalization, and discretization to counteract these issues. Through extensive experimentation on the largest open-source robot manipulation dataset, the Open X-Embodiment dataset, we demonstrate that data curation can have an outsized impact on downstream performance. Specifically, domain weights learned by Re-Mix outperform uniform weights by 38\\% on average and outperform human-selected weights by 32\\% on datasets used to train existing generalist robot policies, specifically the RT-X models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.5437047481536865, 4.454933166503906], "openalex_id": "https://openalex.org/W4402003824", "title": "Low-Complexity and Secure Clustering-Based Similarity Detection for Private Files", "authors": "Duaa Fadhel Najem, Nagham Abdulrasool Taha, Zaid Ameen Abduljabbar, Vincent Omollo Nyangaresi, Junchao Ma, Dhafer G. Honi", "abstract": "Detection of the similarity between files is a requirement for many practical applications, such as copyright protection, file management, plagiarism detection, and detecting duplicate submissions of scientific articles to multiple journals or conferences. Existing methods have not taken into consideration file privacy, which prevents their use in many delicate situations, for example when comparing two intellectual agencies' files where files are meant to be secured, to find file similarities. Over the last few years, encryption protocols have been developed with the aim of detecting similar files without compromising privacy. However, existing protocols tend to leak important data, and do not have low complexity costs. This paper addresses the issue of computing the similarity between two file collections belonging to two entities who desire to keep their contents private. We propose a clustering-based approach that achieves 90% accuracy while significantly reducing the execution time. The protocols presented in this study are much more efficient than other secure protocols, and the alternatives are slower in terms of similarity detection for large file sets. Our system achieves a high level of security by using a vector space model to convert the files into vectors and by applying Paillier encryption to encrypt the elements of the vector separately, to protect privacy. The study uses the application of the Porter algorithm to the vocabulary set. Using a secure cosine similarity approach, a score for similar files was identified and the index of the similarity scores is returned to the other party, rather than the similar files themselves. The system is strengthened by using clustering for files, based on the k-means clustering technique, which makes it more efficient for large file sets.", "venue": "TEM Journal", "label": 0}, {"loc": [6.00456428527832, 0.9551889896392822], "openalex_id": "https://openalex.org/W4402698794", "title": "Towards Estimating Personal Values in Song Lyrics", "authors": "Andrew M. Demetriou, Jaehun Kim, Sandy Manolios, Cynthia C. S. Liem", "abstract": "Most music widely consumed in Western Countries contains song lyrics, with U.S. samples reporting almost all of their song libraries contain lyrics. In parallel, social science theory suggests that personal values - the abstract goals that guide our decisions and behaviors - play an important role in communication: we share what is important to us to coordinate efforts, solve problems and meet challenges. Thus, the values communicated in song lyrics may be similar or different to those of the listener, and by extension affect the listener's reaction to the song. This suggests that working towards automated estimation of values in lyrics may assist in downstream MIR tasks, in particular, personalization. However, as highly subjective text, song lyrics present a challenge in terms of sampling songs to be annotated, annotation methods, and in choosing a method for aggregation. In this project, we take a perspectivist approach, guided by social science theory, to gathering annotations, estimating their quality, and aggregating them. We then compare aggregated ratings to estimates based on pre-trained sentence/word embedding models by employing a validated value dictionary. We discuss conceptually 'fuzzy' solutions to sampling and annotation challenges, promising initial results in annotation quality and in automated estimations, and future directions.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.386308670043945, 5.480862617492676], "openalex_id": "https://openalex.org/W4402698743", "title": "Building and better understanding vision-language models: insights and future directions", "authors": "Hugo Lauren\u00e7on, Andr\u00e9s Marafioti, Victor Sanh, L\u00e9o Tronchon", "abstract": "The field of vision-language models (VLMs), which take images and texts as inputs and output texts, is rapidly evolving and has yet to reach consensus on several key aspects of the development pipeline, including data, architecture, and training methods. This paper can be seen as a tutorial for building a VLM. We begin by providing a comprehensive overview of the current state-of-the-art approaches, highlighting the strengths and weaknesses of each, addressing the major challenges in the field, and suggesting promising research directions for underexplored areas. We then walk through the practical steps to build Idefics3-8B, a powerful VLM that significantly outperforms its predecessor Idefics2-8B, while being trained efficiently, exclusively on open datasets, and using a straightforward pipeline. These steps include the creation of Docmatix, a dataset for improving document understanding capabilities, which is 240 times larger than previously available datasets. We release the model along with the datasets created for its training.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.58675479888916, 0.5830514430999756], "openalex_id": "https://openalex.org/W4401864068", "title": "A Hierarchical Context Augmentation Method to Improve Retrieval-Augmented LLMs on Scientific Papers", "authors": "Tian-Yi Che, Xian-Ling Mao, Tian Lan, Heyan Huang", "abstract": "Scientific papers of a large scale on the Internet encompass a wealth of data and knowledge, attracting the attention of numerous researchers. To fully utilize these knowledge, Retrieval-Augmented Large Language Models (LLMs) usually leverage large-scale scientific corpus to train and then retrieve relevant passages from external memory to improve generation, which have demonstrated outstanding performance. However, existing methods can only capture one-dimension fragmented textual information without incorporating hierarchical structural knowledge, eg. the deduction relationship of abstract and main body, which makes it difficult to grasp the central thought of papers. To tackle this problem, we propose a hierarchical context augmentation method, which helps Retrieval-Augmented LLMs to autoregressively learn the structure knowledge of scientific papers. Specifically, we utilize the document tree to represent the hierarchical relationship of a paper and enhance the structure information of scientific context from three aspects: scale, format and global information. First, we think each top-bottom path of document tree is a logical independent context, which can be used to largely increase the scale of extracted structural corpus. Second, we propose a novel label-based format to represent the structure of context in textual sequences, unified between training and inference. Third, we introduce the global information of retrieved passages to further enhance the structure of context. Extensive experiments on three scientific tasks show that the proposed method significantly improves the performance of Retrieval-Augmented LLMs on all tasks. Besides, our method achieves start-of-art performance in Question Answer task and outperforms ChatGPT. Moreover, it also brings considerate gains with irrelevant retrieval passages, illustrating its effectiveness on practical application scenarios.", "venue": "https://doi.org/10.1145/3637528.3671847", "label": 0}, {"loc": [5.706297874450684, 5.560993671417236], "openalex_id": "https://openalex.org/W4400104719", "title": "Improving the Consistency in Cross-Lingual Cross-Modal Retrieval with 1-to-K Contrastive Learning", "authors": "Zhijie Nie, Richong Zhang, Zhangchi Feng, Hailang Huang, Xudong Liu", "abstract": "Cross-lingual Cross-modal Retrieval (CCR) is an essential task in web search,\\nwhich aims to break the barriers between modality and language simultaneously\\nand achieves image-text retrieval in the multi-lingual scenario with a single\\nmodel. In recent years, excellent progress has been made based on cross-lingual\\ncross-modal pre-training; particularly, the methods based on contrastive\\nlearning on large-scale data have significantly improved retrieval tasks.\\nHowever, these methods directly follow the existing pre-training methods in the\\ncross-lingual or cross-modal domain, leading to two problems of inconsistency\\nin CCR: The methods with cross-lingual style suffer from the intra-modal error\\npropagation, resulting in inconsistent recall performance across languages in\\nthe whole dataset. The methods with cross-modal style suffer from the\\ninter-modal optimization direction bias, resulting in inconsistent rank across\\nlanguages within each instance, which cannot be reflected by Recall@K. To solve\\nthese problems, we propose a simple but effective 1-to-K contrastive learning\\nmethod, which treats each language equally and eliminates error propagation and\\noptimization bias. In addition, we propose a new evaluation metric, Mean Rank\\nVariance (MRV), to reflect the rank inconsistency across languages within each\\ninstance. Extensive experiments on four CCR datasets show that our method\\nimproves both recall rates and MRV with smaller-scale pre-trained data,\\nachieving the new state-of-art.\\n", "venue": "https://doi.org/10.1145/3637528.3671787", "label": 0}, {"loc": [7.0281195640563965, 3.2011682987213135], "openalex_id": "https://openalex.org/W4399657288", "title": "Multivariate Log-based Anomaly Detection for Distributed Database", "authors": "Lingzhe Zhang, Tong Jia, Mengxi Jia, Ying Li, Yong Yang, Zhonghai Wu", "abstract": "Distributed databases are fundamental infrastructures of today's large-scale\\nsoftware systems such as cloud systems. Detecting anomalies in distributed\\ndatabases is essential for maintaining software availability. Existing\\napproaches, predominantly developed using Loghub-a comprehensive collection of\\nlog datasets from various systems-lack datasets specifically tailored to\\ndistributed databases, which exhibit unique anomalies. Additionally, there's a\\nnotable absence of datasets encompassing multi-anomaly, multi-node logs.\\nConsequently, models built upon these datasets, primarily designed for\\nstandalone systems, are inadequate for distributed databases, and the prevalent\\nmethod of deeming an entire cluster anomalous based on irregularities in a\\nsingle node leads to a high false-positive rate. This paper addresses the\\nunique anomalies and multivariate nature of logs in distributed databases. We\\nexpose the first open-sourced, comprehensive dataset with multivariate logs\\nfrom distributed databases. Utilizing this dataset, we conduct an extensive\\nstudy to identify multiple database anomalies and to assess the effectiveness\\nof state-of-the-art anomaly detection using multivariate log data. Our findings\\nreveal that relying solely on logs from a single node is insufficient for\\naccurate anomaly detection on distributed database. Leveraging these insights,\\nwe propose MultiLog, an innovative multivariate log-based anomaly detection\\napproach tailored for distributed databases. Our experiments, based on this\\nnovel dataset, demonstrate MultiLog's superiority, outperforming existing\\nstate-of-the-art methods by approximately 12%.\\n", "venue": "https://doi.org/10.1145/3637528.3671725", "label": 0}, {"loc": [7.435513973236084, -0.7595356106758118], "openalex_id": "https://openalex.org/W4405621396", "title": "Defining Boundaries: The Impact of Domain Specification on Cross-Language and Cross-Domain Transfer in Machine Translation", "authors": "Lia Shahnazaryan, Meriem Beloucif", "abstract": "Recent advancements in neural machine translation (NMT) have revolutionized the field, yet the dependency on extensive parallel corpora limits progress for low-resource languages and domains. Cross-lingual transfer learning offers a promising solution by utilizing data from high-resource languages but often struggles with in-domain NMT. This paper investigates zero-shot cross-lingual domain adaptation for NMT, focusing on the impact of domain specification and linguistic factors on transfer effectiveness. Using English as the source language and Spanish for fine-tuning, we evaluate multiple target languages, including Portuguese, Italian, French, Czech, Polish, and Greek. We demonstrate that both language-specific and domain-specific factors influence transfer effectiveness, with domain characteristics playing a crucial role in determining cross-domain transfer potential. We also explore the feasibility of zero-shot cross-lingual cross-domain transfer, providing insights into which domains are more responsive to transfer and why. Our results show the importance of well-defined domain boundaries and transparency in experimental setups for in-domain transfer learning.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.967592239379883, 5.168859004974365], "openalex_id": "https://openalex.org/W4405419420", "title": "Open-FinLLMs: Open Multimodal Large Language Models for Financial Applications", "authors": "Qianqian Xie, Dong Li, Mengxi Xiao, Zihao Jiang, Ruoyu Xiang, Xiao Zhang, Zhengyu Chen, Yueru He, Weiguang Han, Yuzhe Yang, Shunian Chen, Yifei Zhang, Lihang Shen, Daniel Kim, Zhiwei Liu, Zheheng Luo, Yangyang Yu, Yupeng Cao, Zhiyang Deng, Zhiyuan Yao, Haohang Li, Duanyu Feng, Y. S. Dai, VijayaSai Somasundaram, Pengfei Lu, Yilun Zhao, Yi\u2010Tao Long, Guojun Xiong, Kaleb E. Smith, Honghai Yu, Yanzhao Lai, Min Peng, Jianyun Nie, Jordan W. Suchow, Xiaoyang Liu, Benyou Wang, Alejandro Lopez-Lira, Jimin Huang, Sophia Ananiadou", "abstract": "Financial LLMs hold promise for advancing financial tasks and domain-specific applications. However, they are limited by scarce corpora, weak multimodal capabilities, and narrow evaluations, making them less suited for real-world application. To address this, we introduce \\textit{Open-FinLLMs}, the first open-source multimodal financial LLMs designed to handle diverse tasks across text, tabular, time-series, and chart data, excelling in zero-shot, few-shot, and fine-tuning settings. The suite includes FinLLaMA, pre-trained on a comprehensive 52-billion-token corpus; FinLLaMA-Instruct, fine-tuned with 573K financial instructions; and FinLLaVA, enhanced with 1.43M multimodal tuning pairs for strong cross-modal reasoning. We comprehensively evaluate Open-FinLLMs across 14 financial tasks, 30 datasets, and 4 multimodal tasks in zero-shot, few-shot, and supervised fine-tuning settings, introducing two new multimodal evaluation datasets. Our results show that Open-FinLLMs outperforms afvanced financial and general LLMs such as GPT-4, across financial NLP, decision-making, and multi-modal tasks, highlighting their potential to tackle real-world challenges. To foster innovation and collaboration across academia and industry, we release all codes (https://anonymous.4open.science/r/PIXIU2-0D70/B1D7/LICENSE) and models under OSI-approved licenses.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.446079730987549, 2.5831944942474365], "openalex_id": "https://openalex.org/W4402703589", "title": "Unboxing Occupational Bias: Grounded Debiasing LLMs with US Labor Data", "authors": "Atmika Gorti, Manas Gaur, Aman Chadha", "abstract": "Large Language Models (LLMs) are prone to inheriting and amplifying societal biases embedded within their training data, potentially reinforcing harmful stereotypes related to gender, occupation, and other sensitive categories. This issue becomes particularly problematic as biased LLMs can have far-reaching consequences, leading to unfair practices and exacerbating social inequalities across various domains, such as recruitment, online content moderation, or even the criminal justice system. Although prior research has focused on detecting bias in LLMs using specialized datasets designed to highlight intrinsic biases, there has been a notable lack of investigation into how these findings correlate with authoritative datasets, such as those from the U.S. National Bureau of Labor Statistics (NBLS). To address this gap, we conduct empirical research that evaluates LLMs in a ``bias-out-of-the-box\" setting, analyzing how the generated outputs compare with the distributions found in NBLS data. Furthermore, we propose a straightforward yet effective debiasing mechanism that directly incorporates NBLS instances to mitigate bias within LLMs. Our study spans seven different LLMs, including instructable, base, and mixture-of-expert models, and reveals significant levels of bias that are often overlooked by existing bias detection techniques. Importantly, our debiasing method, which does not rely on external datasets, demonstrates a substantial reduction in bias scores, highlighting the efficacy of our approach in creating fairer and more reliable LLMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.104304790496826, 3.3038439750671387], "openalex_id": "https://openalex.org/W4405426979", "title": "Cost-Effective Big Data Orchestration Using Dagster: A Multi-Platform Approach", "authors": "Hernan Picatto, Georg Heiler, Peter Klimek", "abstract": "The rapid advancement of big data technologies has underscored the need for robust and efficient data processing solutions. Traditional Spark-based Platform-as-a-Service (PaaS) solutions, such as Databricks and Amazon Web Services Elastic MapReduce, provide powerful analytics capabilities but often result in high operational costs and vendor lock-in issues. These platforms, while user-friendly, can lead to significant inefficiencies due to their cost structures and lack of transparent pricing. This paper introduces a cost-effective and flexible orchestration framework using Dagster. Our solution aims to reduce dependency on any single PaaS provider by integrating various Spark execution environments. We demonstrate how Dagster's orchestration capabilities can enhance data processing efficiency, enforce best coding practices, and significantly reduce operational costs. In our implementation, we achieved a 12% performance improvement over EMR and a 40% cost reduction compared to DBR, translating to over 300 euros saved per pipeline run. Our goal is to provide a flexible, developer-controlled computing environment that maintains or improves performance and scalability while mitigating the risks associated with vendor lock-in. The proposed framework supports rapid prototyping and testing, which is essential for continuous development and operational efficiency, contributing to a more sustainable model of large data processing.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.954466819763184, 0.7787957191467285], "openalex_id": "https://openalex.org/W4403007047", "title": "Goldfish: Monolingual Language Models for 350 Languages", "authors": "Tyler A. Chang, Catherine Arnett, Zhuowen Tu, Benjamin Bergen", "abstract": "For many low-resource languages, the only available language models are large multilingual models trained on many languages simultaneously. However, using FLORES perplexity as a metric, we find that these models perform worse than bigrams for many languages (e.g. 24% of languages in XGLM 4.5B; 43% in BLOOM 7.1B). To facilitate research that focuses on low-resource languages, we pre-train and release Goldfish, a suite of monolingual autoregressive Transformer language models up to 125M parameters for 350 languages. The Goldfish reach lower FLORES perplexities than BLOOM, XGLM, and MaLA-500 on 98 of 204 FLORES languages, despite each Goldfish model being over 10x smaller. However, the Goldfish significantly underperform larger multilingual models on reasoning benchmarks, suggesting that for low-resource languages, multilinguality primarily improves general reasoning abilities rather than basic text generation. We release models trained on 5MB (350 languages), 10MB (288 languages), 100MB (166 languages), and 1GB (83 languages) of text data where available. The Goldfish models are available as baselines, fine-tuning sources, or augmentations to existing models in low-resource NLP research, and they are further useful for crosslinguistic studies requiring maximally comparable models across languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.9780702590942383, 1.870032787322998], "openalex_id": "https://openalex.org/W4401887831", "title": "Ethical Considerations for Generative AI in Social Science Research", "authors": "Brian A. Brown, Keri Heitner", "abstract": "Social science research embodies the inquiry into people as individuals and their interpersonal interactions with each other in communities and varied societies, with due consideration for their natural, technological, and constructed environments. Due to (a) the nature and composition of qualitative, quantitative, and mixed methods research designs coupled with (b) the apparent expectations of responsible behavior from researchers (human beings), room exists for research misconduct or unethical practices. The prevalence and acceptance of generative artificial intelligence (AI) technology such as ChatGPT propagate at a hyper-accelerated pace based on its potential for ease of work in many sectors, including research, particularly academic research. Journal reviewers, editors, and publishers do not possess sufficient tools to differentiate between human-written and partially or wholly AI-authored manuscripts submitted for journal publication.", "venue": "Advances in computational intelligence and robotics book series", "label": 0}, {"loc": [7.729874134063721, 4.041624069213867], "openalex_id": "https://openalex.org/W4403007247", "title": "HMoE: Heterogeneous Mixture of Experts for Language Modeling", "authors": "An Wang, Xingwu Sun, Ruobing Xie, Shuaipeng Li, Jiaqi Zhu, Zhen Yang, Pinxue Zhao, Jongwook Han, Zhanhui Kang, Di Wang, Naoaki Okazaki, Chengzhong Xu", "abstract": "Mixture of Experts (MoE) offers remarkable performance and computational efficiency by selectively activating subsets of model parameters. Traditionally, MoE models use homogeneous experts, each with identical capacity. However, varying complexity in input data necessitates experts with diverse capabilities, while homogeneous MoE hinders effective expert specialization and efficient parameter utilization. In this study, we propose a novel Heterogeneous Mixture of Experts (HMoE), where experts differ in size and thus possess diverse capacities. This heterogeneity allows for more specialized experts to handle varying token complexities more effectively. To address the imbalance in expert activation, we propose a novel training objective that encourages the frequent activation of smaller experts, enhancing computational efficiency and parameter utilization. Extensive experiments demonstrate that HMoE achieves lower loss with fewer activated parameters and outperforms conventional homogeneous MoE models on various pre-training evaluation benchmarks. Codes will be released upon acceptance.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.9843525886535645, 2.52601957321167], "openalex_id": "https://openalex.org/W4403011998", "title": "To Code, or Not To Code? Exploring Impact of Code in Pre-training", "authors": "Viraat Aryabumi, Yixuan Su, Raymond Ma, Adrien Morisot, Chunshun Zhang, Acyr Locatelli, Marzieh Fadaee, Ahmet \u00dcst\u00fcn, Sara Hooker", "abstract": "Including code in the pre-training data mixture, even for models not specifically designed for code, has become a common practice in LLMs pre-training. While there has been anecdotal consensus among practitioners that code data plays a vital role in general LLMs' performance, there is only limited work analyzing the precise impact of code on non-code tasks. In this work, we systematically investigate the impact of code data on general performance. We ask \"what is the impact of code data used in pre-training on a large variety of downstream tasks beyond code generation\". We conduct extensive ablations and evaluate across a broad range of natural language reasoning tasks, world knowledge tasks, code benchmarks, and LLM-as-a-judge win-rates for models with sizes ranging from 470M to 2.8B parameters. Across settings, we find a consistent results that code is a critical building block for generalization far beyond coding tasks and improvements to code quality have an outsized impact across all tasks. In particular, compared to text-only pre-training, the addition of code results in up to relative increase of 8.2% in natural language (NL) reasoning, 4.2% in world knowledge, 6.6% improvement in generative win-rates, and a 12x boost in code performance respectively. Our work suggests investments in code quality and preserving code during pre-training have positive impacts.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.381909370422363, 2.0912280082702637], "openalex_id": "https://openalex.org/W4401783107", "title": "A global matching model of choice and response times in the Deese-Roediger-McDermott semantic and perceptual false recognition paradigms", "authors": "Adam F Osth, Lyulei Zhang, Samuel Williams", "abstract": "What is arguably the most common method of eliciting false memories in the laboratory is the Deese-Roediger-McDermott paradigm (Deese, 1959; Roediger \\& McDermott, 1995), where participants study a set of items that are all similar to a non-presented critical lure. A common finding is that false alarm rates to the critical lures are much higher than to other non-presented items and are in some cases even comparable to hit rates, regardless of whether similarity is defined in terms of semantic or perceptual relations. While there exists a handful of computational models of this paradigm, they have only been applied to semantic but not perceptual false recognition, they have not been fit at the level of individual participants, and they have not been applied to response times (RTs). We present a global matching model that addresses all three of these current gaps. Global similarity of perceptual and semantic representations drives a pair of linear ballistic accumulators, which are used to produce decisions as well as complete RT distributions. In addition to being able to account for heightened false recognition of critical lures, the model was able to account for differences across both individual participants and items, correlations between semantic and perceptual false recognition, differences in false recognition across levels of processing, and heightened false recognition under speed emphasis. These results suggest that both semantic and perceptual false recognition can be explained using only a single retrieval mechanism.", "venue": "https://doi.org/10.31234/osf.io/6mrux", "label": 0}, {"loc": [6.071345329284668, 0.6129382848739624], "openalex_id": "https://openalex.org/W4403012035", "title": "NLP for The Greek Language: A Longer Survey", "authors": "Katerina Papantoniou, Yannis Tzitzikas", "abstract": "English language is in the spotlight of the Natural Language Processing (NLP) community with other languages, like Greek, lagging behind in terms of offered methods, tools and resources. Due to the increasing interest in NLP, in this paper we try to condense research efforts for the automatic processing of Greek language covering the last three decades. In particular, we list and briefly discuss related works, resources and tools, categorized according to various processing layers and contexts. We are not restricted to the modern form of Greek language but also cover Ancient Greek and various Greek dialects. This survey can be useful for researchers and students interested in NLP tasks, Information Retrieval and Knowledge Management for the Greek language.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.737483263015747, -0.4479282796382904], "openalex_id": "https://openalex.org/W4402502450", "title": "Ranking Generated Answers: On the Agreement of Retrieval Models with Humans on Consumer Health Questions", "authors": "Sebastian Heineking, Jonas Probst, Daniel Steinbach, Martin Potthast, Harrisen Scells", "abstract": "Evaluating the output of generative large language models (LLMs) is challenging and difficult to scale. Many evaluations of LLMs focus on tasks such as single-choice question-answering or text classification. These tasks are not suitable for assessing open-ended question-answering capabilities, which are critical in domains where expertise is required. One such domain is health, where misleading or incorrect answers can have a negative impact on a user's well-being. Using human experts to evaluate the quality of LLM answers is generally considered the gold standard, but expert annotation is costly and slow. We present a method for evaluating LLM answers that uses ranking models trained on annotated document collections as a substitute for explicit relevance judgements and apply it to the CLEF 2021 eHealth dataset. In a user study, our method correlates with the preferences of a human expert (Kendall's $\u03c4=0.64$). It is also consistent with previous findings in that the quality of generated answers improves with the size of the model and more sophisticated prompting strategies.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.443600654602051, 2.111976385116577], "openalex_id": "https://openalex.org/W4401976214", "title": "The Self in Action: Narrating Agentic Moments", "authors": "Shira Zilberstein, Elena Ayala\u2010Hurtado, Mari Sanchez, Derek Robey", "abstract": "This article develops a cultural and contextual approach to studying agency that attends to variation in how people narrate their experiences. Drawing on the large-scale, nationally representative American Voices Project data, the article uses computational methods to test patterns in agentic expression and qualitative methods to examine how respondents narrate agency and passivity as they describe their lives. This analysis captures agentic moments, widespread narratives through which people emphasize their agentic selfhood as they recount specific situations. Moreover, individuals use narrative moves\u2014such as shifting their focus and drawing on subtypes of agency\u2014to craft agentic moments despite constraints. We argue that narratives of agency are variable, situational, and often co-occurring with narrative passivity, which enables people to narrate themselves as agentic even in challenging situations.", "venue": "RSF The Russell Sage Foundation Journal of the Social Sciences", "label": 0}, {"loc": [5.64746618270874, -0.7295108437538147], "openalex_id": "https://openalex.org/W4401976026", "title": "Evaluating Embedding Models for Clustering Italian Political News: A Comparative Study of Text-Embedding-3-Large and UmBERTo", "authors": "Fabio Giglietto", "abstract": "In an era where social media platforms have become the battleground for shaping political narratives, understanding the nuances of disseminated political news content is crucial. Reliable unsupervised clustering of datasets containing excerpts from news stories circulated on social media is a central piece of the puzzle. Despite advancements in Natural Language Processing (NLP) techniques, studies led by social scientists that apply fully unsupervised techniques to Italian language content remain rare. While large language models promise to be game-changers, a proper comparison with previously available unsupervised NLP techniques is lacking. This study helps to fill this gap by comparing the performance of OpenAI's text-embedding-3-large model against the BERT-based UmBERTo model. The comparison utilizes two distinct datasets of political news stories circulated on Facebook before the 2018 and 2022 Italian elections. Using K-means and HDBSCAN, we find that text-embedding-3-large consistently outperforms UmBERTo in producing semantically coherent clusters.", "venue": "https://doi.org/10.31219/osf.io/2j9ed", "label": 0}, {"loc": [8.797858238220215, 0.3204655647277832], "openalex_id": "https://openalex.org/W4402500862", "title": "W-RAG: Weakly Supervised Dense Retrieval in RAG for Open-domain Question Answering", "authors": "Jinming Nian, Zhiyuan Peng, Qifan Wang, Yi Fang", "abstract": "In knowledge-intensive tasks such as open-domain question answering (OpenQA), large language models (LLMs) often struggle to generate factual answers, relying solely on their internal (parametric) knowledge. To address this limitation, Retrieval-Augmented Generation (RAG) systems enhance LLMs by retrieving relevant information from external sources, thereby positioning the retriever as a pivotal component. Although dense retrieval demonstrates state-of-the-art performance, its training poses challenges due to the scarcity of ground-truth evidence, largely attributed to the high costs of human annotation. In this paper, we propose W-RAG, a method that draws weak training signals from the downstream task (such as OpenQA) of an LLM, and fine-tunes the retriever to prioritize passages that most benefit the task. Specifically, we rerank the top-$k$ passages retrieved via BM25 by assessing the probability that the LLM will generate the correct answer for a question given each passage. The highest-ranking passages are then used as positive fine-tuning examples for dense retrieval. We conduct comprehensive experiments across four publicly available OpenQA datasets to demonstrate that our approach enhances both retrieval and OpenQA performance compared to baseline models, achieving results comparable to models fine-tuned with human-labeled data.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.660478591918945, 3.174376964569092], "openalex_id": "https://openalex.org/W4401689901", "title": "Specialising and Analysing Instruction-Tuned and Byte-Level Language Models for Organic Reaction Prediction", "authors": "Jiayun Pang, Ivan Vuli\u0107", "abstract": "We evaluate FlanT5 and ByT5 across tokenisation, pretraining, finetuning and inference and benchmark their impact on organic reaction prediction tasks.", "venue": "Faraday Discussions", "label": 0}, {"loc": [4.079370021820068, -2.4752349853515625], "openalex_id": "https://openalex.org/W4401974878", "title": "A Survey on the Use of Large Language Models (LLMs) in Fake News", "authors": "Eleftheria Papageorgiou, Christos Chronis, Iraklis Varlamis, Yassine Himeur", "abstract": "The proliferation of fake news and fake profiles on social media platforms poses significant threats to information integrity and societal trust. Traditional detection methods, including rule-based approaches, metadata analysis, and human fact-checking, have been employed to combat disinformation, but these methods often fall short in the face of increasingly sophisticated fake content. This review article explores the emerging role of Large Language Models (LLMs) in enhancing the detection of fake news and fake profiles. We provide a comprehensive overview of the nature and spread of disinformation, followed by an examination of existing detection methodologies. The article delves into the capabilities of LLMs in generating both fake news and fake profiles, highlighting their dual role as both a tool for disinformation and a powerful means of detection. We discuss the various applications of LLMs in text classification, fact-checking, verification, and contextual analysis, demonstrating how these models surpass traditional methods in accuracy and efficiency. Additionally, the article covers LLM-based detection of fake profiles through profile attribute analysis, network analysis, and behavior pattern recognition. Through comparative analysis, we showcase the advantages of LLMs over conventional techniques and present case studies that illustrate practical applications. Despite their potential, LLMs face challenges such as computational demands and ethical concerns, which we discuss in more detail. The review concludes with future directions for research and development in LLM-based fake news and fake profile detection, underscoring the importance of continued innovation to safeguard the authenticity of online information.", "venue": "Future Internet", "label": 30}, {"loc": [9.463521003723145, 1.1061509847640991], "openalex_id": "https://openalex.org/W4401639961", "title": "Cost-efficient prompt engineering for unsupervised entity resolution in the product matching domain", "authors": "Navapat Nananukul, Khanin Sisaengsuwanchai, Mayank Kejriwal", "abstract": "Abstract Entity Resolution (ER) is the problem of semi-automatically determining when two entities refer to the same underlying entity, with applications ranging from healthcare to e-commerce. Traditional ER solutions required considerable manual expertise, including domain-specific feature engineering, as well as identification and curation of training data. Recently released large language models (LLMs) provide an opportunity to make ER more seamless and domain-independent. Because of LLMs\u2019 pre-trained knowledge, the matching step in ER can be made easier by just prompting. However, it is also well known that LLMs can pose risks, that the quality of their outputs can depend on how prompts are engineered, and that the cost of using LLMs can be significant. Unfortunately, a systematic experimental study on the effects of different prompting methods and their respective cost for solving domain-specific entity matching using LLMs, like ChatGPT, has been lacking thus far. This paper aims to address this gap by conducting such a study. We consider some relatively simple and cost-efficient ER prompt engineering methods and apply them to perform product matching on two real-world datasets widely used in the community. We select two well-known e-commerce datasets and provide extensive experimental results to show that an LLM like GPT3.5 is viable for high-performing product matching and, interestingly, that more complicated and detailed (and hence, expensive) prompting methods do not necessarily outperform simpler approaches. We provide brief discussions on qualitative and error analysis, including a study of the inter-consistency of different prompting methods to determine whether they yield stable outputs. Finally, we consider some limitations of LLMs when used as a product matcher in potential real-world e-commerce applications.", "venue": "Discover Artificial Intelligence", "label": 0}, {"loc": [4.695335388183594, 0.525722324848175], "openalex_id": "https://openalex.org/W4401636481", "title": "Systematic exploration and in-depth analysis of ChatGPT architectures progression", "authors": "Debajyoty Banik, Natasha Pati, Atul Sharma", "abstract": "The fast evolution of artificial intelligence frameworks has resulted in the creation of increasingly sophisticated large language models (LLM), ChatGPT being the most famous one. This study paper dives into this LLM with a case study of ChatGPT's architecture and provides a thorough comparative analysis of its numerous versions, tracking its history from its conception to its most recent incarnations. This research intends to give a full knowledge of the model's history by investigating the underlying mechanisms and enhancements provided in each edition. The comparative analysis covers key aspects such as model size, training data, fine-tuning techniques, and performance metrics. Furthermore, this study evaluates the limits of ChatGPT in its many incarnations. These limitations include common sense reasoning difficulties, biased replies, verbosity, sensitivity to input wording, and others. Each constraint is investigated for potential remedies and workarounds. This research article also provides a complete analysis of the ChatGPT architecture and its progress through multiple iterations. It gives vital insights for academics, developers, and users wanting to harness the promise of ChatGPT while managing its restrictions by exploring both the model's strengths and limitations. The distinctiveness of this paper rests in its comprehensive assessment of ChatGPT's architectural development and its practical strategy for resolving the myriad difficulties in producing cohesive and contextually relevant replies.", "venue": "Artificial Intelligence Review", "label": 18}, {"loc": [7.7378621101379395, 4.004053115844727], "openalex_id": "https://openalex.org/W4402502835", "title": "BAM! Just Like That: Simple and Efficient Parameter Upcycling for Mixture of Experts", "authors": "Qizhen Zhang, Nikolas Gritsch, Dwaraknath Gnaneshwar, Simon Guo, David Cairuz, Bharat Venkitesh, Jakob Foerster, Phil Blunsom, Sebastian Ruder, Ahmet \u00dcst\u00fcn, Acyr Locatelli", "abstract": "The Mixture of Experts (MoE) framework has become a popular architecture for large language models due to its superior performance over dense models. However, training MoEs from scratch in a large-scale regime is prohibitively expensive. Existing methods mitigate this by pre-training multiple dense expert models independently and using them to initialize an MoE. This is done by using experts' feed-forward network (FFN) to initialize the MoE's experts while merging other parameters. However, this method limits the reuse of dense model parameters to only the FFN layers, thereby constraining the advantages when \"upcycling\" these models into MoEs. We propose BAM (Branch-Attend-Mix), a simple yet effective method that addresses this shortcoming. BAM makes full use of specialized dense models by not only using their FFN to initialize the MoE layers but also leveraging experts' attention parameters fully by initializing them into a soft-variant of Mixture of Attention (MoA) layers. We explore two methods for upcycling attention parameters: 1) initializing separate attention experts from dense models including all attention parameters for the best model performance; and 2) sharing key and value parameters across all experts to facilitate for better inference efficiency. To further improve efficiency, we adopt a parallel attention transformer architecture to MoEs, which allows the attention experts and FFN experts to be computed concurrently. Our experiments on seed models ranging from 590 million to 2 billion parameters demonstrate that BAM surpasses baselines in both perplexity and downstream task performance, within the same computational and data constraints.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.744980812072754, 1.5995597839355469], "openalex_id": "https://openalex.org/W4401595656", "title": "How much semantic information is available in large language model tokens?", "authors": "David A. Haslett, Zhenguang G. Cai", "abstract": "Large language models segment many words into multiple tokens. For example, GPT-4 segments \"dogcatcher\" into dog+catch+er. Companies that make those models claim that meaningful subword tokens are essential, yet tokens often appear meaningless or misleading. For example, GPT-4 segments \"anteater\" into ante+ater, and those tokens don\u2019t align with morphemes (i.e., ant+eat+er). To investigate whether tokens bear meaning, we segmented tens of thousands of words from each of 41 languages according to three generations of GPT tokenizers (GPT-2, GPT-4, and GPT-4o). We found that words which share tokens are more semantically similar than expected by chance or expected from length alone, that tokens capture morphological information even when they don\u2019t look like morphemes, and that tokens capture more information than is explained by morphology. These results suggest that comparing tokens to morphemes overlooks the wider variety of semantic information available in word form and that standard tokenization methods successfully capture much of that information. However, tokens convey less semantic information in lower resource languages and in languages that don't use the Latin alphabet, so standard tokenization methods might entrench advantages for speakers of English and other high-resource languages.", "venue": "https://doi.org/10.31234/osf.io/nhwdz", "label": 0}, {"loc": [3.086639404296875, -0.3584756851196289], "openalex_id": "https://openalex.org/W4401652463", "title": "Prospect of large language models and natural language processing for lung cancer diagnosis: A systematic review", "authors": "Arushi Garg, Smridhi Gupta, Soumya Vats, Palak Handa, Nidhi Goel", "abstract": "Abstract Lung cancer, a leading cause of global mortality, demands a combat for its effective prevention, early diagnosis, and advanced treatment methods. Traditional diagnostic methods face limitations in accuracy and efficiency, necessitating innovative solutions. Large Language Models (LLMs) and Natural Language Processing (NLP) offer promising avenues for overcoming these challenges by providing comprehensive insights into medical data and personalizing treatment plans. This systematic review explores the transformative potential of LLMs and NLP in automating lung cancer diagnosis. It evaluates their applications, particularly in medical imaging and the interpretation of complex medical data, and assesses achievements and associated challenges. Emphasizing the critical role of Artificial Intelligence (AI) in medical imaging, the review highlights advancements in lung cancer screening and deep learning approaches. Furthermore, it underscores the importance of on\u2010going advancements in diagnostic methods and encourages further exploration in this field.", "venue": "Expert Systems", "label": 0}, {"loc": [3.997755765914917, -2.2712936401367188], "openalex_id": "https://openalex.org/W4403381490", "title": "Interrupted time series analysis of clickbait on worldwide news websites, 2016-2023", "authors": "Austin McCutcheon, Chris Brogly", "abstract": "Clickbait is deceptive text that can manipulate web browsing, creating an information gap between a link and target page that literally baits a user into clicking. Clickbait detection continues to be well studied, but analyses of clickbait overall on the web are limited. A dataset was built consisting of 451,033,388 clickbait scores produced by a clickbait detector which analyzed links and headings on primarily English news pages from the Common Crawl. On this data, 5 segmented regression models were fit on 5 major news events and averaged clickbait scores. COVID and the 2020 US Election appeared to influence clickbait levels.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.56990909576416, 1.4902911186218262], "openalex_id": "https://openalex.org/W4401557487", "title": "Changes and challenges of legal education in the era of generative artificial intelligence: Chinese experience", "authors": "W. Wang, Zhilang Xu, Zichun Xu", "abstract": "Using generative artificial intelligence systems in the classroom for law case analysis teaching can enhance the efficiency and accuracy of knowledge delivery. They can create interactive learning environments that are appropriate, immersive, integrated, and evocative, guiding students to conduct case analysis from interdisciplinary and cross-cultural perspectives. This teaching method not only increases students\u2019 interest and participation in learning but also helps cultivate their interdisciplinary thinking and global vision. However, the application of generative artificial intelligence systems in legal education also faces some challenges and issues. If students excessively rely on these systems, their ability to think independently, make judgments, and innovate may be weakened, leading to over-trust in machines and reinforcement of value biases. To address these challenges and issues, legal education should focus more on cultivating students\u2019 questioning skills, self-analysis abilities, critical thinking, basic legal literacy, digital skills, and humanistic spirit. This will enable students to respond to the challenges brought by generative artificial intelligence and ensure their comprehensive development in the new era.", "venue": "Journal of Infrastructure Policy and Development", "label": 0}, {"loc": [3.8802850246429443, 0.808744490146637], "openalex_id": "https://openalex.org/W4402427280", "title": "The advantages of context specific language models: the case of the Erasmian Language Model", "authors": "Jo\u00e3o Gon\u00e7alves, Nick Jelicic, M. Murgia, Evert Stamhuis", "abstract": "The current trend to improve language model performance seems to be based on scaling up with the number of parameters (e.g. the state of the art GPT4 model has approximately 1.7 trillion parameters) or the amount of training data fed into the model. However this comes at significant costs in terms of computational resources and energy costs that compromise the sustainability of AI solutions, as well as risk relating to privacy and misuse. In this paper we present the Erasmian Language Model (ELM) a small context specific, 900 million parameter model, pre-trained and fine-tuned by and for Erasmus University Rotterdam. We show how the model performs adequately in a classroom context for essay writing, and how it achieves superior performance in subjects that are part of its context. This has implications for a wide range of institutions and organizations, showing that context specific language models may be a viable alternative for resource constrained, privacy sensitive use cases.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.412668704986572, -1.3585482835769653], "openalex_id": "https://openalex.org/W4401545735", "title": "Designing Heterogeneous LLM Agents for Financial Sentiment Analysis", "authors": "Frank Xing", "abstract": "Large language models (LLMs) have drastically changed the possible ways to design intelligent systems, shifting the focus from massive data acquisition and new model training to human alignment and strategic elicitation of the full potential of existing pre-trained models. This paradigm shift, however, is not fully realized in financial sentiment analysis (FSA) due to the discriminative nature of this task and a lack of prescriptive knowledge of how to leverage existing generative models in such a context. This study investigates the effectiveness of the new paradigm, that is, using LLMs without fine-tuning for FSA. Rooted in Minsky\u2019s theory of mind and emotions, a design framework with heterogeneous LLM agents is proposed and applied to FSA. The framework instantiates specialized agents using prior guiding knowledge from both linguistics and finance. Then, a summative agent reasons on the aggregated agent discussions. Comprehensive evaluations using six FSA datasets show that the framework yields better accuracies compared to many alternative multi-LLM agent settings, especially when the discussion contents are substantial. This study contributes to the design foundations and paves new avenues for LLMs-based FSA and potentially other tasks. Implications for business and management have also been discussed.", "venue": "ACM Transactions on Management Information Systems", "label": 29}, {"loc": [3.6594014167785645, 4.491521835327148], "openalex_id": "https://openalex.org/W4401545746", "title": "Differentially Private Low-Rank Adaptation of Large Language Model Using Federated Learning", "authors": "Xiao-Yang Liu, Rongyi Zhu, Daochen Zha, Jiechao Gao, Shan Zhong, Matt White, Meikang Qiu", "abstract": "The surge in interest and application of large language models (LLMs) has sparked a drive to fine-tune these models to suit specific applications, such as finance and medical science. However, concerns regarding data privacy have emerged, especially when multiple stakeholders aim to collaboratively enhance LLMs using sensitive data. In this scenario, federated learning becomes a natural choice, allowing decentralized fine-tuning without exposing raw data to central servers. Motivated by this, we investigate how data privacy can be ensured in LLM fine-tuning through practical federated learning approaches, enabling secure contributions from multiple parties to enhance LLMs. Yet, challenges arise: (1) despite avoiding raw data exposure, there is a risk of inferring sensitive information from model outputs, and (2) federated learning for LLMs incurs notable communication overhead. To address these challenges, this article introduces DP-LoRA, a novel federated learning algorithm tailored for LLMs. DP-LoRA preserves data privacy by employing a Gaussian mechanism that adds noise in weight updates, maintaining individual data privacy while facilitating collaborative model training. Moreover, DP-LoRA optimizes communication efficiency via low-rank adaptation, minimizing the transmission of updated weights during distributed training. The experimental results across medical, financial, and general datasets using various LLMs demonstrate that DP-LoRA effectively ensures strict privacy constraints while minimizing communication overhead.", "venue": "ACM Transactions on Management Information Systems", "label": 29}, {"loc": [6.187312602996826, 5.8169264793396], "openalex_id": "https://openalex.org/W4401551843", "title": "Enriching satellite image annotations of forests with keyphrases from a specialized corpus", "authors": "Nathalie Neptune, Josiane Mothe", "abstract": "Abstract The automatic annotation of changes in satellite images requires examples of appropriate annotations. Alternatively, keyphrases extracted from a specialized corpus can serve as candidates for image annotation models. In the case of detecting deforestation in satellite images, there is a rich scientific literature available on the topic that may serve as a corpus for finding candidate annotations. We propose a method that utilizes a deep learning technique for change detection and visual semantic embedding. This method is combined with an information retrieval framework to find annotations for pairs of satellite images showing forest changes. Our evaluation is based on a dataset of image pairs from the Amazon rainforest and shows that keyphrases provide richer semantic information without any negative impact on the annotation compared to annotating with single words.", "venue": "Multimedia Tools and Applications", "label": 42}, {"loc": [6.97869873046875, -1.081318974494934], "openalex_id": "https://openalex.org/W4401751953", "title": "Correcting Auditory Spelling Mistakes in Jordanian Dialect Using Machine Learning Techniques", "authors": "Malak Smadi, Gheith A. Abandah", "abstract": "This paper explores the application of machine learning techniques, specifically leveraging an efficient transformer model known as Byt5, to rectify Arabic auditory spelling mistakes. The model is trained using datasets with synthetic errors generated through stochastic error injection. Evaluation is conducted using a test dataset that has auditory spelling mistakes which is gathered from Jordanian accounts on social media platforms. Results indicate that the model achieves high error correction accuracy rates, with the best performance achieved when the model is trained on a dataset with specific error injection rate (EIR). This model achieves the best character error rate of 1.13% on the test set when trained with EIR of 75%. These findings underscore the effectiveness of employing machine learning, particularly pre-trained transformer models, in addressing Arabic spelling errors, showcasing potential applications in Arabic language processing tasks.", "venue": "https://doi.org/10.1109/icics63486.2024.10638311", "label": 0}, {"loc": [9.594428062438965, 1.7231141328811646], "openalex_id": "https://openalex.org/W4401544410", "title": "Entrant: A large financial dataset for table understanding", "authors": "Elias Zavitsanos, Dimitris Mavroeidis, Eirini Spyropoulou, Manos Fergadiotis, \u0393\u03b5\u03ce\u03c1\u03b3\u03b9\u03bf\u03c2 \u03a0\u03b1\u03bb\u03b9\u03bf\u03cd\u03c1\u03b1\u03c2", "abstract": "Tabular data is a way to structure, organize, and present information conveniently and effectively. Real-world tables present data in two dimensions by arranging cells in matrices that summarize information and facilitate side-by-side comparisons. Recent research efforts aim to train large models to understand structured tables, a process that enables knowledge transfer in various downstream tasks. Model pre-training, though, requires large datasets, conveniently formatted to reflect cell and table characteristics. This paper presents ENTRANT, a financial dataset that comprises millions of tables, which are transformed to reflect cell attributes, as well as positional and hierarchical information. Hence, they facilitate, among other things, pre-training tasks for table understanding with deep learning methods. The dataset provides table and cell information along with the corresponding metadata in a machine-readable format. We have automated all data processing and curation and technically validated the dataset through unit testing of high code coverage. Finally, we demonstrate the use of the dataset in a pre-training task of a state-of-the-art model, which we use for downstream cell classification.", "venue": "Scientific Data", "label": 20}, {"loc": [4.266910552978516, -0.7529343962669373], "openalex_id": "https://openalex.org/W4401547146", "title": "Language Models for Online Depression Detection: A Review and Benchmark Analysis on Remote Interviews", "authors": "Ruiyang Qin, Ryan Cook, Kai Yang, Ahmed Abbasi, David G. Dobolyi, Salman Seyedi, Emily Griner, Hyeokhyen Kwon, Robert O. Cotes, Zifan Jiang, Gari D. Clifford", "abstract": "The use of machine learning (ML) to detect depression in online settings has emerged as an important health and wellness use case. In particular, the use of deep learning methods for depression detection from textual content posted on social media has garnered considerable attention. Conversely, there has been relatively limited evaluation of depression detection in clinical environments involving text generated from remote interviews. In this research, we review state-of-the-art feature-based ML, deep learning, and large language models for depression detection. We use a multidimensional analysis framework to benchmark various language models on a novel testbed comprising speech-to-text transcriptions of remote interviews. Our framework considers the impact of different transcription types and interview segments on depression detection performance. Finally, we summarize the key trends and takeaways from the review and benchmark evaluation and provide suggestions to guide the design of future detection methods.", "venue": "ACM Transactions on Management Information Systems", "label": 29}, {"loc": [2.573280096054077, 1.5578076839447021], "openalex_id": "https://openalex.org/W4401550317", "title": "The Case for Nurturing AI Literacy in Law Schools", "authors": "Sara Migliorini, Jo\u00e3o Ilh\u00e3o Moreira", "abstract": "The debate surrounding the permissibility of generative artificial intelligence (AI) tools in legal education has garnered widespread attention. However, this discourse has largely oscillated between the advantages and disadvantages of generative AI usage whilst failing to fully consider how the uptake of these tools relates to the fundamental objectives of legal education. This article contributes to the current debate by positing that since the primary aim of legal education is the preparation of legal professionals and the development of legal research, generative AI must be holistically integrated into the dominant approaches to legal teaching. This stems from the fact that the legal profession will increasingly rely on generative AI in its daily work. Therefore, AI literacy will emerge as a critical professional skill in the legal realm. Against this background, this article further argues that the integration of AI into the legal curriculum should be addressed by diversifying assessment strategies, emphasizing the importance of academic integrity and making resources on the ethical use of AI available to both students and academic staff in law schools.", "venue": "Asian Journal of Legal Education", "label": 0}, {"loc": [3.6448802947998047, 4.526904106140137], "openalex_id": "https://openalex.org/W4402386266", "title": "Memorization and Privacy Risks in Domain-Specific Large Language Models", "authors": "Michele Miranda, Elena Sofia Ruzzetti, Andrea Santilli, Fabio Massimo Zanzotto, S\u00e9bastien Brati\u00e8res, Emanuele Rodol\u00e0", "abstract": "Large Language Models (LLMs) represent a significant advancement in artificial intelligence, finding applications across various domains. However, their reliance on massive internet-sourced datasets for training brings notable privacy issues, which are exacerbated in critical domains (e.g., healthcare). Moreover, certain application-specific scenarios may require fine-tuning these models on private data. This survey critically examines the privacy threats associated with LLMs, emphasizing the potential for these models to memorize and inadvertently reveal sensitive information. We explore current threats by reviewing privacy attacks on LLMs and propose comprehensive solutions for integrating privacy mechanisms throughout the entire learning pipeline. These solutions range from anonymizing training datasets to implementing differential privacy during training or inference and machine unlearning after training. Our comprehensive review of existing literature highlights ongoing challenges, available tools, and future directions for preserving privacy in LLMs. This work aims to guide the development of more secure and trustworthy AI systems by providing a thorough understanding of privacy preservation methods and their effectiveness in mitigating risks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.6344549655914307, 4.479193687438965], "openalex_id": "https://openalex.org/W4402386266", "title": "Preserving Privacy in Large Language Models: A Survey on Current Threats and Solutions", "authors": "Michele Miranda, Elena Sofia Ruzzetti, Andrea Santilli, Fabio Massimo Zanzotto, S\u00e9bastien Brati\u00e8res, Emanuele Rodol\u00e0", "abstract": "Large Language Models (LLMs) represent a significant advancement in artificial intelligence, finding applications across various domains. However, their reliance on massive internet-sourced datasets for training brings notable privacy issues, which are exacerbated in critical domains (e.g., healthcare). Moreover, certain application-specific scenarios may require fine-tuning these models on private data. This survey critically examines the privacy threats associated with LLMs, emphasizing the potential for these models to memorize and inadvertently reveal sensitive information. We explore current threats by reviewing privacy attacks on LLMs and propose comprehensive solutions for integrating privacy mechanisms throughout the entire learning pipeline. These solutions range from anonymizing training datasets to implementing differential privacy during training or inference and machine unlearning after training. Our comprehensive review of existing literature highlights ongoing challenges, available tools, and future directions for preserving privacy in LLMs. This work aims to guide the development of more secure and trustworthy AI systems by providing a thorough understanding of privacy preservation methods and their effectiveness in mitigating risks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.0548651218414307, 5.32291316986084], "openalex_id": "https://openalex.org/W4401507029", "title": "Phishing Webpage Detection: Unveiling the Threat Landscape and Investigating Detection Techniques", "authors": "Aditya Kulkarni, Vivek Balachandran, Tamal Das", "abstract": "<p dir=\"ltr\">In the realm of cybersecurity, phishing stands as a prevalent cyber attack, where attackers employ various tactics to deceive users into gathering their sensitive information, potentially leading to identity theft or financial gain. Researchers have been actively working on advancing phishing webpage detection approaches to detect new phishing URLs, bolstering user protection. Nonetheless, the ever-evolving strategies employed by attackers, aimed at circumventing existing detection approaches and tools, present an ongoing challenge to the research community. This survey presents a systematic categorization of diverse phishing webpage detection approaches, encompassing URL- based, webpage content-based, and visual techniques. Through a comprehensive review of these approaches and an in-depth analysis of existing literature, our study underscores current research gaps in phishing webpage detection. Furthermore, we suggest potential solutions to address some of these gaps, contributing valuable insights to the ongoing efforts to combat phishing attacks.</p>", "venue": "IEEE Communications Surveys & Tutorials", "label": 0}, {"loc": [5.10966682434082, 1.0797903537750244], "openalex_id": "https://openalex.org/W4402385942", "title": "Evaluating the Impact of Advanced LLM Techniques on AI-Lecture Tutors for a Robotics Course", "authors": "Sebastian Kahl, F. L\u00f6ffler, Martin Maciol, Fabian Ridder, Marius Schmitz, Jennifer Spanagel, Jens Wienkamp, Christopher Burgahn, Malte Schilling", "abstract": "This study evaluates the performance of Large Language Models (LLMs) as an Artificial Intelligence-based tutor for a university course. In particular, different advanced techniques are utilized, such as prompt engineering, Retrieval-Augmented-Generation (RAG), and fine-tuning. We assessed the different models and applied techniques using common similarity metrics like BLEU-4, ROUGE, and BERTScore, complemented by a small human evaluation of helpfulness and trustworthiness. Our findings indicate that RAG combined with prompt engineering significantly enhances model responses and produces better factual answers. In the context of education, RAG appears as an ideal technique as it is based on enriching the input of the model with additional information and material which usually is already present for a university course. Fine-tuning, on the other hand, can produce quite small, still strong expert models, but poses the danger of overfitting. Our study further asks how we measure performance of LLMs and how well current measurements represent correctness or relevance? We find high correlation on similarity metrics and a bias of most of these metrics towards shorter responses. Overall, our research points to both the potential and challenges of integrating LLMs in educational settings, suggesting a need for balanced training approaches and advanced evaluation frameworks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.2503159046173096, 1.1826863288879395], "openalex_id": "https://openalex.org/W4401497789", "title": "Large Language Model-Based Chatbots in Higher Education", "authors": "Defne Yigci, Merve Ery\u0131lmaz, Ail K. Yetisen, Savas Tasoglu, Aydogan \u00d6zcan", "abstract": "Large language models (LLMs) are artificial intelligence (AI) platforms capable of analyzing and mimicking natural language processing. Leveraging deep learning, LLM capabilities have been advanced significantly, giving rise to generative chatbots such as Generative Pre\u2010trained Transformer (GPT). GPT\u20101 was initially released by OpenAI in 2018. ChatGPT's release in 2022 marked a global record of speed in technology uptake, attracting more than 100 million users in two months. Consequently, the utility of LLMs in fields including engineering, healthcare, and education has been explored. The potential of LLM\u2010based chatbots in higher education has sparked significant interest and ignited debates. LLMs can offer personalized learning experiences and advance asynchronized learning, potentially revolutionizing higher education, but can also undermine academic integrity. Although concerns regarding AI\u2010generated output accuracy, the spread of misinformation, propagation of biases, and other legal and ethical issues have not been fully addressed yet, several strategies have been implemented to mitigate these limitations. Here, the development of LLMs, properties of LLM\u2010based chatbots, and potential applications of LLM\u2010based chatbots in higher education are discussed. Current challenges and concerns associated with AI\u2010based learning platforms are outlined. The potentials of LLM\u2010based chatbot use in the context of learning experiences in higher education settings are explored.", "venue": "Advanced Intelligent Systems", "label": 0}, {"loc": [8.308250427246094, 2.56705904006958], "openalex_id": "https://openalex.org/W4403563595", "title": "BA-LORA: BIAS-ALLEVIATING LOW-RANK ADAPTA-TION TO MITIGATE CATASTROPHIC INHERITANCE IN LARGE LANGUAGE MODELS", "authors": "Yupeng Chang, Yi Chang, Yuan Wu", "abstract": "Parameter-efficient fine-tuning (PEFT) has become a de facto standard for adapting Large Language Models (LLMs). However, we identify a critical vulnerability within popular low-rank adaptation methods like LoRA: their tendency to exacerbate \"Catastrophic Inheritance\" - the unchecked propagation of biases, noise, and data imbalances from pre-training. This phenomenon can degrade model robustness and fairness, undermining the benefits of efficient adaptation. To address this, we introduce Bias-Alleviating Low-Rank Adaptation (BA-LoRA). Our approach is founded on a principled decomposition of Catastrophic Inheritance into three core challenges: Knowledge Drift, Representation Collapse, and Overfitting to Noise. BA-LoRA systematically mitigates these issues by incorporating a trio of targeted regularizers - consistency, diversity, and SVD - designed to preserve core knowledge, enforce representational richness, and promote robust, low-rank output representations. We conduct comprehensive evaluations on a suite of natural language understanding (NLU) and generation (NLG) tasks using diverse, prominent open-source language models (e.g., LLaMA-2-7B and DeBERTa-v3-base). Our results show that BA-LoRA not only outperforms state-of-the-art LoRA variants in terms of performance and stability, but also demonstrates quantitatively superior robustness and bias mitigation on targeted evaluations. This confirms its ability to counteract the adverse effects of Catastrophic Inheritance.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.668816328048706, 1.0299999713897705], "openalex_id": "https://openalex.org/W4401448427", "title": "EXPRESS: AI-Human Hybrids for Marketing Research: Leveraging LLMs as Collaborators", "authors": "Neeraj K. Arora, Ishita Chakraborty, Yohei Nishimura", "abstract": "The authors\u2019 central premise is that a human\u2013LLM (large language model) hybrid approach leads to efficiency and effectiveness gains in the marketing research process. In qualitative research, they show that LLMs can assist in both data generation and analysis; LLMs effectively create sample characteristics, generate synthetic respondents, and conduct and moderate in-depth interviews. The AI\u2013human hybrid generates information-rich, coherent data that surpasses human-only data in depth and insightfulness and matches human performance in data analysis tasks of generating themes and summaries. Evidence from expert judges shows that humans and LLMs possess complementary skills; the human\u2013LLM hybrid outperforms its human-only or LLM-only counterpart. For quantitative research, the LLM correctly picks the answer direction and valence, with the quality of synthetic data significantly improving through few-shot learning and retrieval-augmented generation. The authors demonstrate the value of the AI\u2013human hybrid by collaborating with a Fortune 500 food company and replicating a 2019 qualitative and quantitative study using GPT-4. For their empirical investigation, the authors design the system architecture and prompts to create personas, ask questions, and obtain responses from synthetic respondents. They provide road maps for integrating LLMs into qualitative and quantitative marketing research and conclude that LLMs serve as valuable collaborators in the insight generation process.", "venue": "Journal of Marketing", "label": 0}, {"loc": [8.817439079284668, -1.1085835695266724], "openalex_id": "https://openalex.org/W4403662515", "title": "Large Language Models for Biomedical Text Simplification: Promising But Not There Yet", "authors": "Zihao Li, Samuel Belkadi, Nicolo Micheletti, Lifeng Han, Matthew Shardlow, Goran Nenadi\u0107", "abstract": "In this system report, we describe the models and methods we used for our participation in the PLABA2023 task on biomedical abstract simplification, part of the TAC 2023 tracks. The system outputs we submitted come from the following three categories: 1) domain fine-tuned T5-like models including Biomedical-T5 and Lay-SciFive; 2) fine-tuned BARTLarge model with controllable attributes (via tokens) BART-w-CTs; 3) ChatGPTprompting. We also present the work we carried out for this task on BioGPT finetuning. In the official automatic evaluation using SARI scores, BeeManc ranks 2nd among all teams and our model LaySciFive ranks 3rd among all 13 evaluated systems. In the official human evaluation, our model BART-w-CTs ranks 2nd on Sentence-Simplicity (score 92.84), 3rd on Term-Simplicity (score 82.33) among all 7 evaluated systems; It also produced a high score 91.57 on Fluency in comparison to the highest score 93.53. In the second round of submissions, our team using ChatGPT-prompting ranks the 2nd in several categories including simplified term accuracy score 92.26 and completeness score 96.58, and a very similar score on faithfulness score 95.3 to re-evaluated PLABA-base-1 (95.73) via human evaluations. Our codes, fine-tuned models, prompts, and data splits from the system development stage will be available at https://github.com/ HECTA-UoM/PLABA-MU", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.592153072357178, 0.2527286410331726], "openalex_id": "https://openalex.org/W4403622523", "title": "Recording First-person Experiences to Build a New Type of Foundation Model", "authors": "Dionis Barcari, David Gamez, Aliya Grig", "abstract": "Foundation models have had a big impact in recent years and billions of dollars are being invested in them in the current AI boom. The more popular ones, such as Chat-GPT, are trained on large amounts of Internet data. However, it is becoming apparent that this data is likely to be exhausted soon, and technology companies are looking for new sources of data to train the next generation of foundation models. Reinforcement learning, RAG, prompt engineering and cognitive modelling are often used to fine-tune and augment the behaviour of foundation models. These techniques have been used to replicate people, such as Caryn Marjorie. These chatbots are not based on people's actual emotional and physiological responses to their environment, so they are, at best, a surface-level approximation to the characters they are imitating. To address these issues, we have developed a recording rig that captures what the wearer is seeing and hearing as well as their skin conductance (GSR), facial expression and brain state (14 channel EEG). AI algorithms are used to process this data into a rich picture of the environment and internal states of the subject. Foundation models trained on this data could replicate human behaviour much more accurately than the personality models that have been developed so far. This type of model has many potential applications, including recommendation, personal assistance, GAN systems, dating and recruitment. This paper gives some background to this work and describes the recording rig and preliminary tests of its functionality. It then suggests how a new type of foundation model could be created from the data captured by the rig and outlines some applications. Data gathering and model training are expensive, so we are currently working on the launch of a start-up that could raise funds for the next stage of the project.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.597853660583496, 1.672853708267212], "openalex_id": "https://openalex.org/W4401591047", "title": "A Novel Hashcode-based Duplication Reduction via Thresholding Approach for Large-scale Web Documents", "authors": "Sana Ejaz, Asma Naseer, asma Ahmad, Maria Tamoor, Samina Naz", "abstract": "Modern search engines encounter a significant challenge when it comes to handling duplicate and nearly identical web pages, particularly during the indexing process for vast amounts of web content. This issue can lead to slow search results and increased costs due to the accumulation of storage space necessary for storing indexes. To tackle this issue, different techniques have been proposed to find similar websites. However, it has long been a challenge in research to distinguish between web pages. In the current study, sentence-level features i.e., hashcode and thresholding are used to determine the nearly identical web pages. We employ an adaptive threshold that enables the application of our model in both large- and small-scale settings. The benchmark datasets consisting of Shakespeare’s collections, free text, job descriptions, and Reuters-21578 are used to test the proposed approach. With an accuracy of 0.99 and an F1-score of 0.97, the proposed technique outperforms existing methods.", "venue": "Preprints.org", "label": 3}, {"loc": [6.069982051849365, 5.642256259918213], "openalex_id": "https://openalex.org/W4403370517", "title": "Modelling Visual Semantics via Image Captioning to extract Enhanced Multi-Level Cross-Modal Semantic Incongruity Representation with Attention for Multimodal \u2026", "authors": "Sajal Aggarwal, Ananya Pandey, Dinesh Kumar Vishwakarma", "abstract": "Sarcasm is a type of irony, characterized by an inherent mismatch between the literal interpretation and the intended connotation. Though sarcasm detection in text has been extensively studied, there are situations in which textual input alone might be insufficient to perceive sarcasm. The inclusion of additional contextual cues, such as images, is essential to recognize sarcasm in social media data effectively. This study presents a novel framework for multimodal sarcasm detection that can process input triplets. Two components of these triplets comprise the input text and its associated image, as provided in the datasets. Additionally, a supplementary modality is introduced in the form of descriptive image captions. The motivation behind incorporating this visual semantic representation is to more accurately capture the discrepancies between the textual and visual content, which are fundamental to the sarcasm detection task. The primary contributions of this study are: (1) a robust textual feature extraction branch that utilizes a cross-lingual language model; (2) a visual feature extraction branch that incorporates a self-regulated residual ConvNet integrated with a lightweight spatially aware attention module; (3) an additional modality in the form of image captions generated using an encoder-decoder architecture capable of reading text embedded in images; (4) distinct attention modules to effectively identify the incongruities between the text and two levels of image representations; (5) multi-level cross-domain semantic incongruity representation achieved through feature fusion. Compared with cutting-edge baselines, the proposed model achieves the best accuracy of 92.89% and 64.48%, respectively, on the Twitter multimodal sarcasm and MultiBully datasets.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.505809307098389, 5.384344100952148], "openalex_id": "https://openalex.org/W4406302039", "title": "Deep Learning based Visually Rich Document Content Understanding: A Survey", "authors": "Yihao Ding, Jean Lee, Soyeon Caren Han", "abstract": "Visually Rich Documents (VRDs) play a vital role in domains such as academia, finance, healthcare, and marketing, as they convey information through a combination of text, layout, and visual elements. Traditional approaches to extracting information from VRDs rely heavily on expert knowledge and manual annotation, making them labor-intensive and inefficient. Recent advances in deep learning have transformed this landscape by enabling multimodal models that integrate vision, language, and layout features through pretraining, significantly improving information extraction performance. This survey presents a comprehensive overview of deep learning-based frameworks for VRD Content Understanding (VRD-CU). We categorize existing methods based on their modeling strategies and downstream tasks, and provide a comparative analysis of key components, including feature representation, fusion techniques, model architectures, and pretraining objectives. Additionally, we highlight the strengths and limitations of each approach and discuss their suitability for different applications. The paper concludes with a discussion of current challenges and emerging trends, offering guidance for future research and practical deployment in real-world scenarios.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.629885196685791, 0.3118427097797394], "openalex_id": "https://openalex.org/W4401306909", "title": "A New Type of Foundation Model Based on Recordings of People's Emotions and Physiology", "authors": "David Gamez, Dionis Barcari, Aliya Grig", "abstract": "Foundation models have had a big impact in recent years and billions of dollars are being invested in them in the current AI boom. The more popular ones, such as Chat-GPT, are trained on large amounts of data from the Internet, and then reinforcement learning, RAG, prompt engineering and cognitive modelling are used to fine-tune and augment their behavior. This technology has been used to create models of individual people, such as Caryn Marjorie. However, these chatbots are not based on people's actual emotional and physiological responses to their environment, so they are, at best, surface-level approximations to the characters they are imitating. This paper describes how a new type of foundation model - a first-person foundation model - could be created from recordings of what a person sees and hears as well as their emotional and physiological reactions to these stimuli. A first-person foundation model would map environmental stimuli to a person's emotional and physiological states, and map a person's emotional and physiological states to their behavior. First-person foundation models have many exciting applications, including a new type of recommendation engine, personal assistants, generative adversarial networks, dating and recruitment. To obtain training data for a first-person foundation model, we have developed a recording rig that captures what the wearer is seeing and hearing as well as their emotional and physiological states. This novel source of data could help to address the shortage of new data for building the next generation of foundation models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.711614608764648, 2.3986356258392334], "openalex_id": "https://openalex.org/W4401307294", "title": "A Cross-Domain Benchmark for Active Learning", "authors": "Thorben Werner, Johannes Burchert, Maximilian Stubbemann, Lars Schmidt-Thieme", "abstract": "Active Learning (AL) deals with identifying the most informative samples for labeling to reduce data annotation costs for supervised learning tasks. AL research suffers from the fact that lifts from literature generalize poorly and that only a small number of repetitions of experiments are conducted. To overcome these obstacles, we propose CDALBench, the first active learning benchmark which includes tasks in computer vision, natural language processing and tabular learning. Furthermore, by providing an efficient, greedy oracle, CDALBench can be evaluated with 50 runs for each experiment. We show, that both the cross-domain character and a large amount of repetitions are crucial for sophisticated evaluation of AL research. Concretely, we show that the superiority of specific methods varies over the different domains, making it important to evaluate Active Learning with a cross-domain benchmark. Additionally, we show that having a large amount of runs is crucial. With only conducting three runs as often done in the literature, the superiority of specific methods can strongly vary with the specific runs. This effect is so strong, that, depending on the seed, even a well-established method's performance can be significantly better and significantly worse than random for the same dataset.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.47310209274292, 2.735600471496582], "openalex_id": "https://openalex.org/W4401307484", "title": "Downstream bias mitigation is all you need", "authors": "Arkadeep Baksi, Rahul Singh, Tarun Joshi", "abstract": "The advent of transformer-based architectures and large language models (LLMs) have significantly advanced the performance of natural language processing (NLP) models. Since these LLMs are trained on huge corpuses of data from the web and other sources, there has been a major concern about harmful prejudices that may potentially be transferred from the data. In many applications, these pre-trained LLMs are fine-tuned on task specific datasets, which can further contribute to biases. This paper studies the extent of biases absorbed by LLMs during pre-training as well as task-specific behaviour after fine-tuning. We found that controlled interventions on pre-trained LLMs, prior to fine-tuning, have minimal effect on lowering biases in classifiers. However, the biases present in domain-specific datasets play a much bigger role, and hence mitigating them at this stage has a bigger impact. While pre-training does matter, but after the model has been pre-trained, even slight changes to co-occurrence rates in the fine-tuning dataset has a significant effect on the bias of the model.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.786226749420166, 2.052255392074585], "openalex_id": "https://openalex.org/W4401307019", "title": "A Taxonomy of Stereotype Content in Large Language Models", "authors": "Gandalf Nicol\u00e1s, Aylin Caliskan", "abstract": "This study introduces a taxonomy of stereotype content in contemporary large language models (LLMs). We prompt ChatGPT 3.5, Llama 3, and Mixtral 8x7B, three powerful and widely used LLMs, for the characteristics associated with 87 social categories (e.g., gender, race, occupations). We identify 14 stereotype dimensions (e.g., Morality, Ability, Health, Beliefs, Emotions), accounting for ~90% of LLM stereotype associations. Warmth and Competence facets were the most frequent content, but all other dimensions were significantly prevalent. Stereotypes were more positive in LLMs (vs. humans), but there was significant variability across categories and dimensions. Finally, the taxonomy predicted the LLMs' internal evaluations of social categories (e.g., how positively/negatively the categories were represented), supporting the relevance of a multidimensional taxonomy for characterizing LLM stereotypes. Our findings suggest that high-dimensional human stereotypes are reflected in LLMs and must be considered in AI auditing and debiasing to minimize unidentified harms from reliance in low-dimensional views of bias in LLMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.658160209655762, 2.928694009780884], "openalex_id": "https://openalex.org/W4401337121", "title": "Scalable Range Search over Temporal and Numerical Expressions", "authors": "Vebj\u00f8rn Ohr, Dhruv Gupta", "abstract": "Natural language expressions of time and numbers can be ambiguous (e.g., 2020s can refer to either 2021 or 2025), can be present at different granularities, or can be unbounded (e.g., more than ten percent). To match and retrieve such ambiguous temporal and numerical expressions over millions of documents, we present NASH. Our experiments on collections amounting to more than 22 million documents show that NASH provides significant speedups in the order of 19.23 - 53.10x for contains and near queries. NASH manages this while using indexes that are 1.90 - 2.05x smaller than the indexes utilized by baselines. We further demonstrate NASH's scalability to the Web by indexing a subset of Common Crawl amounting to more than 365 million documents.", "venue": "https://doi.org/10.1145/3664190.3672509", "label": 0}, {"loc": [3.730072021484375, 3.267482280731201], "openalex_id": "https://openalex.org/W4401306838", "title": "ShieldGemma: Generative AI Content Moderation Based on Gemma", "authors": "Wenjun Zeng, Yuchi Liu, Ryan Mullins, Ludovic Peran, Joe Fernandez, Hamza Harkous, Karthik Narasimhan, Drew Proud, Piyush Kumar, Bhaktipriya Radharapu, Olivia Sturman, Oscar Wahltinez", "abstract": "We present ShieldGemma, a comprehensive suite of LLM-based safety content moderation models built upon Gemma2. These models provide robust, state-of-the-art predictions of safety risks across key harm types (sexually explicit, dangerous content, harassment, hate speech) in both user input and LLM-generated output. By evaluating on both public and internal benchmarks, we demonstrate superior performance compared to existing models, such as Llama Guard (+10.8\\% AU-PRC on public benchmarks) and WildCard (+4.3\\%). Additionally, we present a novel LLM-based data curation pipeline, adaptable to a variety of safety-related tasks and beyond. We have shown strong generalization performance for model trained mainly on synthetic data. By releasing ShieldGemma, we provide a valuable resource to the research community, advancing LLM safety and enabling the creation of more effective content moderation solutions for developers.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.454135894775391, 2.0166938304901123], "openalex_id": "https://openalex.org/W4401306283", "title": "Understanding the Interplay of Scale, Data, and Bias in Language Models: A Case Study with BERT", "authors": "Muhammad Ali, Swetasudha Panda, Qinlan Shen, Michael Wick, Ari Kobren", "abstract": "In the current landscape of language model research, larger models, larger datasets and more compute seems to be the only way to advance towards intelligence. While there have been extensive studies of scaling laws and models' scaling behaviors, the effect of scale on a model's social biases and stereotyping tendencies has received less attention. In this study, we explore the influence of model scale and pre-training data on its learnt social biases. We focus on BERT -- an extremely popular language model -- and investigate biases as they show up during language modeling (upstream), as well as during classification applications after fine-tuning (downstream). Our experiments on four architecture sizes of BERT demonstrate that pre-training data substantially influences how upstream biases evolve with model scale. With increasing scale, models pre-trained on large internet scrapes like Common Crawl exhibit higher toxicity, whereas models pre-trained on moderated data sources like Wikipedia show greater gender stereotypes. However, downstream biases generally decrease with increasing model scale, irrespective of the pre-training data. Our results highlight the qualitative role of pre-training data in the biased behavior of language models, an often overlooked aspect in the study of scale. Through a detailed case study of BERT, we shed light on the complex interplay of data and model scale, and investigate how it translates to concrete biases.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.103736162185669, 1.392260193824768], "openalex_id": "https://openalex.org/W4401623102", "title": "Incorporating Human Judgment in AI-Assisted Content Development: The HEAT Heuristic", "authors": "Gustav Verhulsdonck, Jennifer Weible, Danielle Mollie Stambler, Tharon Howard, Jason Tham", "abstract": "Purpose: As technical and professional communicators (TPCs) use AI to develop content, inaccuracies due to AI limitations are introduced; it is vital TPCs evaluate AI-generated content to improve accuracy and human-centeredness. In this article, we present a human-in-the-loop AI content heuristic (HEAT: Human experience, Expertise, Accuracy, and Trust) as a rating mechanism. Method: This exploratory case study evaluated the quality of content generated by ChatGPT from the perspective of beginner TPC students. We used multiple prompting strategies asking ChatGPT to create documentation on personas using two Darwin Information Type Architecture (DITA) information types namely, concept topics and task instructions, and we evaluated the results with HEAT. Results: HEAT had good intraclass correlation coefficient (ICC) reliability (.743 pilot;.825 for scenarios) indicating its fitness as a heuristic for evaluating generative AI output. The findings indicate that ChatGPT was generally good at writing concept topics; however, it performed less well creating step-by-step task instructions. Expert TPC input helped develop a better prompt for improved output. We also found that tokenization in ChatGPT (the way it breaks up text) has a large role in terms of noncompliance with format specifications. Conclusion: There is a need for TPCs to (1) develop new models for AI-assisted content creation, (2) recognize the impact of different prompting strategies on developing specific structured authoring units such as concept and task topics, and (3) be aware of the limitations of AI such as ChatGPT. Human-in-the-loop quality check mechanisms, such as HEAT, can help validate and modify AI-generated content to better serve end users.", "venue": "Technical Communication", "label": 0}, {"loc": [5.143702983856201, -1.6343868970870972], "openalex_id": "https://openalex.org/W4401228770", "title": "Performance Analysis of Embedding Methods for Deep Learning-Based Turkish Sentiment Analysis Models", "authors": "Abdulfattah E. Ba Alawi, Ferhat Bozkurt", "abstract": "Abstract The complex syntactic structure of Turkish text makes sentiment analysis in natural language processing (NLP) a challenging task. Conventional sentiment analysis methods often fail to effectively identify attitudes in Turkish texts, creating an urgent need for more efficient approaches. To fill this need, our study investigates the effectiveness of embedding techniques including pre-trained Turkish models such as Word2Vec, GloVe, and FastText in addition to two character-level embedding methods, namely, character-integer embedding (CIE) and character one-hot encoding embedding (COE), in conjunction with deep learning models specifically long short-term memory (LSTM), convolution neural networks (CNNs), bidirectional LSTM (Bi-LSTM), and hybrid models, for Turkish short-texts sentiment analysis. DL-based models were investigated on two datasets (e.g., an original Twitter (X) dataset and an accessible hotel reviews dataset). In addition to providing an intensive performance analysis of different embedding strategies and assessing their efficacy in dealing with the linguistic intricacies of Turkish, this study proposed a previously unexplored method in Turkish text representation that relies on a character-level one-hot encoding technique. The obtained findings indicate positive progress using a novel approach utilizing a dual-pathway architecture for both character level and word level that constitutes a substantial contribution to the area of natural language processing (NLP), specifically in the context of complex morphological languages. By employing a hybrid strategy that combines character and word levels on Twitter (X) data, the LSTM model obtained an F 1 score of $$0.835 \\pm 0.005$$ 0.835 \u00b1 0.005 concerning cross-validation while CNN-BiLSTM attained the highest F 1 Score (0.8392) using holdout validation. This strategy consistently produced modest improvements across the second public dataset (hotel reviews dataset) by emerging as the runner-up embedding technique in effectiveness, surpassed only by FastText. Findings provide practical recommendations for practitioners on how to effectively use sentiment analysis to make informed decisions by introducing an extensive performance analysis of the use of embedding techniques and deep learning models for sentiment analysis in Turkish texts, which is crucial in the current age of data analysis.", "venue": "Arabian Journal for Science and Engineering", "label": 0}, {"loc": [6.930473327636719, 3.7587928771972656], "openalex_id": "https://openalex.org/W4401379991", "title": "Mobile Generative AI: Opportunities and Challenges", "authors": "Ye Zhang, Jinrui Zhang, Sheng Yue, Wei Lu, Ju Ren, Xuemin Shen", "abstract": "Recently, generative artificial intelligence (GenAI) has gained significant interest on a global scale, particularly with the explosion of some killer GenAl applications, like ChatGPT. However, due to the excessively large sizes of generative models, most current GenAl applications are deployed in the cloud, easily causing high cost, long delay, and potential risk of privacy leakage, thereby greatly impeding GenAl's further expansion and development. In this article, we explore mobile GenAl - deploying large generative models on mobile devices, aiming to bring the GenAl capability to the physical proximity to users. First, we analyze the benefits and opportunities of mobile GenAl in terms of cost, delay, privacy, personalization, and application. Then, we test various large generative models on the mobile testbed, and reveal mobile GenAl's key bottlenecks in inference latency and memory consumption. Accordingly, we propose a weight occupancy strategy for model compression during inference, and discuss the pros and cons thereof. Finally future directions are pointed out to foster continued research efforts.", "venue": "IEEE Wireless Communications", "label": 0}, {"loc": [3.7285029888153076, -3.9866392612457275], "openalex_id": "https://openalex.org/W4401234044", "title": "Measuring complex psychological and sociological constructs in large-scale text", "authors": "Alina Herderich, Jana Lasser, Mirta Gale\u0161i\u0107, Segun Taofeek Aroyehun, David Garc\u00eda, Joshua Garland", "abstract": "In recent years, there has been an increasing exchange between social science and machine learning. In principle, natural language processing enables social scientists to systematically process large amounts of text, while rich domain knowledge helps machine learning scholars to build valid models of social phenomena. However, there is a lack of clear guidelines for constructing valid and reliable mixed methods approaches, which can increase the rigor and comparability of computational social science research. We provide a set of guidelines for leveraging human data annotation and automatic text classification at scale in five stages: (1) classification scheme development, (2) data labeling, (3) model selection, (4) model training and performance improvement, and (5) statistical analysis. Using examples from our own research on countering online hate, we outline potential problems and respective solutions. We demonstrate how consequently integrating expertise from social science and machine learning can enhance the study of diverse social phenomena.", "venue": "https://doi.org/10.31234/osf.io/tzc9p", "label": 0}, {"loc": [2.1072096824645996, 5.272243499755859], "openalex_id": "https://openalex.org/W4401202781", "title": "From ML to LLM: Evaluating the Robustness of Phishing Webpage Detection Models against Adversarial Attacks", "authors": "Aditya Kulkarni, Vivek Balachandran, Dinil Mon Divakaran, Tamal Das", "abstract": "Phishing attacks attempt to deceive users into stealing sensitive information, posing a significant cybersecurity threat. Advances in machine learning (ML) and deep learning (DL) have led to the development of numerous phishing webpage detection solutions, but these models remain vulnerable to adversarial attacks. Evaluating their robustness against adversarial phishing webpages is essential. Existing tools contain datasets of pre-designed phishing webpages for a limited number of brands, and lack diversity in phishing features. To address these challenges, we develop PhishOracle, a tool that generates adversarial phishing webpages by embedding diverse phishing features into legitimate webpages. We evaluate the robustness of three existing task-specific models - Stack model, VisualPhishNet, and Phishpedia - against PhishOracle-generated adversarial phishing webpages and observe a significant drop in their detection rates. In contrast, a multimodal large language model (MLLM)-based phishing detector demonstrates stronger robustness against these adversarial attacks but still is prone to evasion. Our findings highlight the vulnerability of phishing detection models to adversarial attacks, emphasizing the need for more robust detection approaches. Furthermore, we conduct a user study to evaluate whether PhishOracle-generated adversarial phishing webpages can deceive users. The results show that many of these phishing webpages evade not only existing detection models but also users.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.2111334800720215, 2.165940999984741], "openalex_id": "https://openalex.org/W4401167869", "title": "Distortions in Judged Spatial Relations in Large Language Models", "authors": "Nir Fulman, Abdulkadir Memduho\u011flu, Alexander Zipf", "abstract": "We present a benchmark for assessing the capability of large language models (LLMs) to discern intercardinal directions between geographic locations and apply it to three prominent LLMs: GPT-3.5, GPT-4, and Llama-2. This benchmark specifically evaluates whether LLMs exhibit a hierarchical spatial bias similar to humans, where judgments about individual locations' spatial relationships are influenced by the perceived relationships of the larger groups that contain them. To investigate this, we formulated fourteen questions focusing on well-known U.S. cities. Seven questions were designed to challenge the LLMs with scenarios potentially influenced by the orientation of larger geographical units, such as states or countries, whereas the remaining seven targeted locations were less susceptible to such hierarchical categorization. Among the tested models, GPT-4 exhibited superior performance with 55 percent accuracy, followed by GPT-3.5 at 47 percent and Llama-2 at 45 percent. The models showed significantly reduced accuracy on tasks with suspected hierarchical bias. For example, GPT-4's accuracy dropped to 33 percent on these tasks, compared to 86 percent on others. The models identified the nearest cardinal direction in most cases, however, reflecting their associative learning mechanism, thereby embodying human-like misconceptions. We discuss avenues for improving the spatial reasoning capabilities of LLMs.", "venue": "The Professional Geographer", "label": 0}, {"loc": [5.953492641448975, 3.307870626449585], "openalex_id": "https://openalex.org/W4402963646", "title": "Towards Automated Solution Recipe Generation for Industrial Asset Management with LLM", "authors": "Nianjun Zhou, Patel Dhaval, Shuxin Lin, Fearghal O'Donncha", "abstract": "This study introduces a novel approach to Industrial Asset Management (IAM) by incorporating Conditional-Based Management (CBM) principles with the latest advancements in Large Language Models (LLMs). Our research introduces an automated model-building process, traditionally reliant on intensive collaboration between data scientists and domain experts. We present two primary innovations: a taxonomy-guided prompting generation that facilitates the automatic creation of AI solution recipes and a set of LLM pipelines designed to produce a solution recipe containing a set of artifacts composed of documents, sample data, and models for IAM. These pipelines, guided by standardized principles, enable the generation of initial solution templates for heterogeneous asset classes without direct human input, reducing reliance on extensive domain knowledge and enhancing automation. We evaluate our methodology by assessing asset health and sustainability across a spectrum of ten asset classes. Our findings illustrate the potential of LLMs and taxonomy-based LLM prompting pipelines in transforming asset management, offering a blueprint for subsequent research and development initiatives to be integrated into a rapid client solution.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.576207160949707, -0.2763940095901489], "openalex_id": "https://openalex.org/W4401117926", "title": "Deep Learning Impacts in the Field of Artificial Intelligence", "authors": "Wasswa Shafik", "abstract": "Deep Learning (DL) is a subfield of Artificial Intelligence (AI) that utilizes neural networks to learn and analyze large datasets. DL has become an indispensable part of AI, making it possible to achieve unprecedented accuracy and efficiency in tasks, for example, Natural Language Processing (NLP), predictive analytics, image recognition, autonomous systems, and speech recognition. One of the perilous merits of DL is its process capability and massive data analysis. In the past, traditional machine learning algorithms required a lot of hand-crafted feature engineering, which could be time-consuming and expensive. The neural network learns to identify relevant features from the raw data, saving significant time and effort. DL has profoundly impacted AI research and development. This chapter discusses DL impacts in AI as it is applied in different arenas, enabling the current DL developments and practical solutions to these impacts. Alongside, it presents future directions since DL has significantly impacted the industry, particularly in finance, healthcare, and manufacturing. Predictive analytics, for example, has become a vital tool for businesses forecasting future trends and making informed decisions. Finally, DL algorithms in healthcare are being used to develop personalized treatments and diagnostic tools to identify diseases more accurately and quickly.", "venue": "Auerbach Publications eBooks", "label": 0}, {"loc": [4.82053804397583, -1.2824691534042358], "openalex_id": "https://openalex.org/W4401158519", "title": "Textual emotion detection\u2013A systematic literature review", "authors": "Vinh Truong", "abstract": "Abstract Textual emotion detection is a critical area of study with significant applications in business, education, and healthcare. Despite substantial theoretical advancements over the years, there is a notable gap in the practical implementation of these methods in the aforementioned fields. The techniques currently available do not yet seem ready for real-world application. This study offers a comprehensive review of existing approaches, datasets, and models used in textual emotion detection. Its primary objective is to identify the challenges faced in both current literature and practical applications. The findings reveal that textual datasets annotated with emotional markers are scarce, making it difficult to develop robust supervised classification models for this task. There is also a pressing need for improved models that can accurately categorize a wider range of emotional states distinctly. Finally, there is a demand for techniques capable of dimensionally detecting valence, arousal, and dominance scores from emotional experiences. These challenges stem not only from the models and applications themselves but also from the readiness of current approaches and datasets in the rapidly evolving fields of machine learning and affective computing.", "venue": "https://doi.org/10.21203/rs.3.rs-4673385/v1", "label": 0}, {"loc": [3.675575017929077, 4.571390151977539], "openalex_id": "https://openalex.org/W4402466989", "title": "Granularity is crucial when applying differential privacy to text: An investigation for neural machine translation", "authors": "Doan Nam Long Vu, Timour Igamberdiev, Ivan Habernal", "abstract": "Applying differential privacy (DP) by means of the DP-SGD algorithm to protect individual data points during training is becoming increasingly popular in NLP. However, the choice of granularity at which DP is applied is often neglected. For example, neural machine translation (NMT) typically operates on the sentence-level granularity. From the perspective of DP, this setup assumes that each sentence belongs to a single person and any two sentences in the training dataset are independent. This assumption is however violated in many real-world NMT datasets, e.g., those including dialogues. For proper application of DP we thus must shift from sentences to entire documents. In this paper, we investigate NMT at both the sentence and document levels, analyzing the privacy/utility trade-off for both scenarios, and evaluating the risks of not using the appropriate privacy granularity in terms of leaking personally identifiable information (PII). Our findings indicate that the document-level NMT system is more resistant to membership inference attacks, emphasizing the significance of using the appropriate granularity when working with DP.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.516803741455078, 2.197545051574707], "openalex_id": "https://openalex.org/W4402618835", "title": "Modelling Multimodal Integration in Human Concept Processing with Vision-and-Language Models", "authors": "Anna Bavaresco, Marianne de Heer Kloots, Sandro Pezzelle, Raquel Fern\u00e1ndez", "abstract": "Text representations from language models have proven remarkably predictive of human neural activity involved in language processing, with the recent transformer-based models outperforming previous architectures in downstream tasks and prediction of brain responses. However, the word representations learnt by language-only models may be limited in that they lack sensory information from other modalities, which several cognitive and neuroscience studies showed to be reflected in human meaning representations. Here, we leverage current pre-trained vision-language models (VLMs) to investigate whether the integration of visuo-linguistic information they operate leads to representations that are more aligned with human brain activity than those obtained by models trained with language-only input. We focus on fMRI responses recorded while participants read concept words in the context of either a full sentence or a picture. Our results reveal that VLM representations correlate more strongly than those by language-only models with activations in brain areas functionally related to language processing. Additionally, we find that transformer-based vision-language encoders -- e.g., LXMERT and VisualBERT -- yield more brain-aligned representations than generative VLMs, whose autoregressive abilities do not seem to provide an advantage when modelling single words. Finally, our ablation analyses suggest that the high brain alignment achieved by some of the VLMs we evaluate results from semantic information acquired specifically during multimodal pretraining as opposed to being already encoded in their unimodal modules. Altogether, our findings indicate an advantage of multimodal models in predicting human brain activations, which reveals that modelling language and vision integration has the potential to capture the multimodal nature of human concept representations.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.80988073348999, 3.938720464706421], "openalex_id": "https://openalex.org/W4401023633", "title": "Dynamic Ensemble Reasoning for LLM Experts", "authors": "Xiaoqian Liu, Guoqiang Hu, Yangfan Du, Erfeng He, YingFeng Luo, Chen Xu, Tong Xiao, Jingbo Zhu", "abstract": "LLMs have demonstrated impressive performance across various language tasks. However, the strengths of LLMs can vary due to different architectures, model sizes, areas of training data, etc. Therefore, ensemble reasoning for the strengths of different LLM experts is critical to achieving consistent and satisfactory performance on diverse inputs across a wide range of tasks. However, existing LLM ensemble methods are either computationally intensive or incapable of leveraging complementary knowledge among LLM experts for various inputs. In this paper, we propose an efficient Dynamic Ensemble Reasoning paradigm, called DER to integrate the strengths of multiple LLM experts conditioned on dynamic inputs. Specifically, we model the LLM ensemble reasoning problem as a Markov Decision Process, wherein an agent sequentially takes inputs to request knowledge from an LLM candidate and passes the output to a subsequent LLM candidate. Moreover, we devise a reward function to train a DER-Agent to dynamically select an optimal answering route given the input questions, aiming to achieve the highest performance with as few computational resources as possible. Last, to fully transfer the expert knowledge from the prior LLMs, we develop a Knowledge Transfer Prompt that enables the subsequent LLM candidates to transfer complementary knowledge effectively. Experiments demonstrate that our method uses fewer computational resources to achieve better performance compared to state-of-the-art baselines. Code and appendix are available at https://github.com/Fhujinwu/DER.", "venue": "https://doi.org/10.24963/ijcai.2024/900", "label": 0}, {"loc": [6.226997375488281, 5.856203079223633], "openalex_id": "https://openalex.org/W4406443877", "title": "MMRA: A Benchmark for Multi-granularity Multi-image Relational Association", "authors": "Siwei Wu, Kang Zhu, Yu Bai, Yiming Liang, Yizhi Li, Haoning Wu, Jianhua Liu, Ruibo Liu, Xingwei Qu, Xuxin Cheng, Ge Zhang, Wenhao Huang, Chenghua Lin", "abstract": "Given the remarkable success that large visual language models (LVLMs) have achieved in image perception tasks, the endeavor to make LVLMs perceive the world like humans is drawing increasing attention. Current multi-modal benchmarks primarily focus on facts or specific topic-related knowledge contained within individual images. However, they often overlook the associative relations between multiple images, which require the identification and analysis of similarities among entities or content present in different images. Therefore, we propose the multi-image relation association task and a meticulously curated Multi-granularity Multi-image Relational Association (MMRA) benchmark, comprising 1,024 samples. In order to systematically and comprehensively evaluate current LVLMs, we establish an associational relation system among images that contain 11 subtasks (e.g, UsageSimilarity, SubEvent) at two granularity levels (i.e., image and entity) according to the relations in ConceptNet. Our experiments reveal that on the MMRA benchmark, current multi-image LVLMs exhibit distinct advantages and disadvantages across various subtasks. Notably, fine-grained, entity-level multi-image perception tasks pose a greater challenge for LVLMs compared to image-level tasks. Moreover, LVLMs perform poorly on spatial-related tasks, indicating that LVLMs still have limited spatial awareness. Additionally, our findings indicate that while LVLMs demonstrate a strong capability to perceive image details, enhancing their ability to associate information across multiple images hinges on improving the reasoning capabilities of their language model component. Moreover, we explored the ability of LVLMs to perceive image sequences within the context of our multi-image association task. Our experiments show that the majority of current LVLMs do not adequately model image sequences during the pre-training process.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.9032838344573975, -0.07478949427604675], "openalex_id": "https://openalex.org/W4400937555", "title": "The diagnostic and triage accuracy of the GPT-3 artificial intelligence model: an observational study", "authors": "David M. Levine, Rudraksh Tuwani, Benjamin Kompa, Amita Varma, Samuel G. Finlayson, Ateev Mehrotra, Andrew L. Beam", "abstract": "The National Heart, Lung, and Blood Institute.", "venue": "The Lancet Digital Health", "label": 0}, {"loc": [2.487189292907715, 1.4539357423782349], "openalex_id": "https://openalex.org/W4400981924", "title": "Rethinking Higher Education Teaching and Assessment In-Line with AI Innovations: A Systematic Review and Meta-Analysis", "authors": "Joanne Nabwire Lyanda, Salmon Oliech Owidi, Aggrey Mukasa Simiyu", "abstract": "With the rapid advancement of artificial intelligence (AI) technologies, higher education institutions are increasingly exploring innovative ways to rethink teaching and assessment practices. This research paper examines the implications of AI on assessments in online learning environments. Specifically, the objectives of this study were to evaluate the effectiveness of AI-powered teaching methodologies in enhancing student engagement and learning outcomes in online education settings and, secondly, to analyze the impact of AI-driven assessment tools on the accuracy, reliability, and fairness of evaluating student performance in online learning environments through a systematic review and meta-analysis of existing literature. The study adopted activity theory to understand the issues around AI and assessment. The study adopted a mixed-methods design. The study adopted the use of meta-analysis in order to statistically combine results from multiple studies on a particular topic to provide a more comprehensive and reliable summary of the overall findings. The study found that to guarantee moral and just practices, there are issues with the integration of AI in online learning that need to be resolved. Key issues included data privacy, algorithmic prejudice, and the role of human instructors in the administration of the assessments online, carefully considered and addressed in a proactive manner. These findings provided insights on how AI can transform traditional teaching methods and assessment strategies, creating an AI-crowded environment that fosters student learning and academic success. Based on the findings, the study recommends that there is a need to integrate pedagogical strategies that leverage AI innovation, such as adaptive learning approaches, real-time feedback mechanisms, or interactive simulations, to improve teaching effectiveness and student performance in online settings.", "venue": "African Journal of Empirical Research", "label": 0}, {"loc": [8.992581367492676, 2.5011513233184814], "openalex_id": "https://openalex.org/W4406074021", "title": "Data Mixture Inference: What do BPE Tokenizers Reveal about their Training Data?", "authors": "Jonathan Hayase, Alisa Liu, Yejin Choi, Sewoong Oh, Noah A. Smith", "abstract": "The pretraining data of today's strongest language models is opaque; in particular, little is known about the proportions of various domains or languages represented. In this work, we tackle a task which we call data mixture inference, which aims to uncover the distributional make-up of training data. We introduce a novel attack based on a previously overlooked source of information: byte-pair encoding (BPE) tokenizers, used by the vast majority of modern language models. Our key insight is that the ordered list of merge rules learned by a BPE tokenizer naturally reveals information about the token frequencies in its training data. Given a tokenizer's merge list along with example data for each category of interest, we formulate a linear program that solves for the proportion of each category in the tokenizer's training set. In controlled experiments, we show that our attack recovers mixture ratios with high precision for tokenizers trained on known mixtures of natural languages, programming languages, and data sources. We then apply our approach to off-the-shelf tokenizers released with recent LMs. We confirm much publicly disclosed information about these models, and also make several new inferences: GPT-4o and Mistral NeMo's tokenizers are much more multilingual than their predecessors, training on 39% and 47% non-English language data, respectively; Llama 3 extends GPT-3.5's tokenizer primarily for multilingual (48%) use; GPT-3.5's and Claude's tokenizers are trained on predominantly code (~60%). We hope our work sheds light on current design practices for pretraining data, and inspires continued research into data mixture inference for LMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.7910752296447754, 3.061821699142456], "openalex_id": "https://openalex.org/W4402856943", "title": "Consent in Crisis: The Rapid Decline of the AI Data Commons", "authors": "Shayne Longpre, Robert Mahari, Ariel Lee, C.P. Lund, Hamidah Oderinwale, William Brannon, Nayan Saxena, Naana Obeng-Marnu, Tobin South, C.A. Hunter, Kevin Klyman, Christopher Klamm, Hailey Schoelkopf, Nikhil Kumar Singh, Manuel Cherep, Ahmad Lutfi Anis, An Q Dinh, Caroline Chitongo, Da Yin, Damien Sileo, Deividas Mataciunas, Diganta Misra, Emad A. Alghamdi, Enrico Shippole, Jianguo Zhang, Joanna Materzy\u0144ska, Kun Qian, Kalishankar Tiwary, L\u00edvia Maria Bettini de Miranda, Manan Dey, Minnie Liang, Mohammed Hamdy, Niklas Muennighoff, Seonghyeon Ye, Seungone Kim, Shrestha Mohanty, Vipul Gupta, Vivek Sharma, Vu Minh Chien, Xuhui Zhou, Yi\u2010Zhi Li, Caiming Xiong, L. F. Villa, Stella Biderman, H. B. Li, Daphne Ippolito, Sara Hooker, Jad Kabbara, Sandy Pentland", "abstract": "General-purpose artificial intelligence (AI) systems are built on massive swathes of public web data, assembled into corpora such as C4, RefinedWeb, and Dolma. To our knowledge, we conduct the first, large-scale, longitudinal audit of the consent protocols for the web domains underlying AI training corpora. Our audit of 14,000 web domains provides an expansive view of crawlable web data and how codified data use preferences are changing over time. We observe a proliferation of AI-specific clauses to limit use, acute differences in restrictions on AI developers, as well as general inconsistencies between websites' expressed intentions in their Terms of Service and their robots.txt. We diagnose these as symptoms of ineffective web protocols, not designed to cope with the widespread re-purposing of the internet for AI. Our longitudinal analyses show that in a single year (2023-2024) there has been a rapid crescendo of data restrictions from web sources, rendering ~5%+ of all tokens in C4, or 28%+ of the most actively maintained, critical sources in C4, fully restricted from use. For Terms of Service crawling restrictions, a full 45% of C4 is now restricted. If respected or enforced, these restrictions are rapidly biasing the diversity, freshness, and scaling laws for general-purpose AI systems. We hope to illustrate the emerging crises in data consent, for both developers and creators. The foreclosure of much of the open web will impact not only commercial AI, but also non-commercial AI and academic research.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.327272415161133, 5.208053112030029], "openalex_id": "https://openalex.org/W4406072453", "title": "Falcon2-11B Technical Report", "authors": "Quentin Malartic, Nilabhra Roy Chowdhury, Ruxandra Cojocaru, Mugariya Farooq, Giulia Campesan, Yasser Abdelaziz Dahou Djilali, Sanath Narayan, Ankit Singh, Maksim Velikanov, Basma El Amel Boussaha, Mohammed Alyafeai, Hamza Alobeidli, Leen Al Qadi, Mohamed El Amine Seddik, Kirill Fedyanin, R\u00e9da Alami, Hakim Hacid", "abstract": "We introduce Falcon2-11B, a foundation model trained on over five trillion tokens, and its multimodal counterpart, Falcon2-11B-vlm, which is a vision-to-text model. We report our findings during the training of the Falcon2-11B which follows a multi-stage approach where the early stages are distinguished by their context length and a final stage where we use a curated, high-quality dataset. Additionally, we report the effect of doubling the batch size mid-training and how training loss spikes are affected by the learning rate. The downstream performance of the foundation model is evaluated on established benchmarks, including multilingual and code datasets. The foundation model shows strong generalization across all the tasks which makes it suitable for downstream finetuning use cases. For the vision language model, we report the performance on several benchmarks and show that our model achieves a higher average score compared to open-source models of similar size. The model weights and code of both Falcon2-11B and Falcon2-11B-vlm are made available under a permissive license.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.240234136581421, -0.13117776811122894], "openalex_id": "https://openalex.org/W4400950690", "title": "The Future of Intelligent Healthcare: A Systematic Analysis and Discussion on the Integration and Impact of Robots Using Large Language Models for Healthcare", "authors": "Souren Pashangpour, Goldie Nejat", "abstract": "The potential use of large language models (LLMs) in healthcare robotics can help address the significant demand put on healthcare systems around the world with respect to an aging demographic and a shortage of healthcare professionals. Even though LLMs have already been integrated into medicine to assist both clinicians and patients, the integration of LLMs within healthcare robots has not yet been explored for clinical settings. In this perspective paper, we investigate the groundbreaking developments in robotics and LLMs to uniquely identify the needed system requirements for designing health-specific LLM-based robots in terms of multi-modal communication through human\u2013robot interactions (HRIs), semantic reasoning, and task planning. Furthermore, we discuss the ethical issues, open challenges, and potential future research directions for this emerging innovative field.", "venue": "Robotics", "label": 0}, {"loc": [9.518027305603027, 0.7737735509872437], "openalex_id": "https://openalex.org/W4400919718", "title": "DANSK: Domain Generalization of Danish Named Entity Recognition", "authors": "Kenneth Enevoldsen, Emil Trenckner Jessen, Rebekah Baglini", "abstract": "Named entity recognition is an important application within Danish NLP, essential within both industry and research. However, Danish NER is inhibited by a lack coverage across domains and entity types. As a consequence, no current models are capable of fine-grained named entity recognition, nor have they been evaluated for potential generalizability issues across datasets and domains. To alleviate these limitations, this paper introduces: 1) DANSK: a named entity dataset providing for high-granularity tagging as well as within-domain evaluation of models across a diverse set of domains; 2) and three generalizable models with fine-grained annotation available in DaCy 2.6.0; and 3) an evaluation of current state-of-the-art models\u2019 ability to generalize across domains. The evaluation of existing and new models revealed notable performance discrepancies across domains, which should be addressed within the field. Shortcomings of the annotation quality of the dataset and its impact on model training and evaluation are also discussed. Despite these limitations, we advocate for the use of the new dataset DANSK alongside further work ongeneralizability within Danish NER.", "venue": "Northern European Journal of Language Technology", "label": 0}, {"loc": [6.358765125274658, 5.738128185272217], "openalex_id": "https://openalex.org/W4402824307", "title": "Exploring the Effectiveness of Object-Centric Representations in Visual Question Answering: Comparative Insights with Foundation Models", "authors": "Amir Mohammad Karimi Mamaghan, Samuele Papa, Karl Henrik Johansson, Stefan Bauer, Andrea Dittadi", "abstract": "Object-centric (OC) representations, which model visual scenes as compositions of discrete objects, have the potential to be used in various downstream tasks to achieve systematic compositional generalization and facilitate reasoning. However, these claims have yet to be thoroughly validated empirically. Recently, foundation models have demonstrated unparalleled capabilities across diverse domains, from language to computer vision, positioning them as a potential cornerstone of future research for a wide range of computational tasks. In this paper, we conduct an extensive empirical study on representation learning for downstream Visual Question Answering (VQA), which requires an accurate compositional understanding of the scene. We thoroughly investigate the benefits and trade-offs of OC models and alternative approaches including large pre-trained foundation models on both synthetic and real-world data, ultimately identifying a promising path to leverage the strengths of both paradigms. The extensiveness of our study, encompassing over 600 downstream VQA models and 15 different types of upstream representations, also provides several additional insights that we believe will be of interest to the community at large.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.208192825317383, 5.664211750030518], "openalex_id": "https://openalex.org/W4402857558", "title": "EVLM: An Efficient Vision-Language Model for Visual Understanding", "authors": "Kaibing Chen, Shen Dong, Hanwen Zhong, Huasong Zhong, Kui Xia, Di Xu, Wei Yuan, Yifei Hu, Bin Wen, Tianke Zhang, Changyi Liu, Dewen Fan, Huihui Xiao, Jia\u2010Hong Wu, Fan Yang, Size Li, Di Zhang", "abstract": "In the field of multi-modal language models, the majority of methods are built on an architecture similar to LLaVA. These models use a single-layer ViT feature as a visual prompt, directly feeding it into the language models alongside textual tokens. However, when dealing with long sequences of visual signals or inputs such as videos, the self-attention mechanism of language models can lead to significant computational overhead. Additionally, using single-layer ViT features makes it challenging for large language models to perceive visual signals fully. This paper proposes an efficient multi-modal language model to minimize computational costs while enabling the model to perceive visual signals as comprehensively as possible. Our method primarily includes: (1) employing cross-attention to image-text interaction similar to Flamingo. (2) utilize hierarchical ViT features. (3) introduce the Mixture of Experts (MoE) mechanism to enhance model effectiveness. Our model achieves competitive scores on public multi-modal benchmarks and performs well in tasks such as image captioning and video captioning.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.295775413513184, -0.7331269383430481], "openalex_id": "https://openalex.org/W4400871978", "title": "A Computational Analysis of Transcribed Speech of People Living with Dementia: The Anchise 2022 Corpus", "authors": "Francesco Sigona, Daniele P. Radicioni, B\u00e1rbara Gili Fivela, Davide Colla, Matteo Delsanto, Enrico Mensa, Andrea Bolioli, Pietro Vigorelli", "abstract": "Automatic linguistic analysis can provide cost-effective, valuable clues to the diagnosis of cognitive difficulties and to therapeutic practice, and hence impact positively on wellbeing. In this work, we analyzed transcribed conversations between elderly individuals living with dementia and healthcare professionals. The material came from the Anchise 2022 Corpus, a large collection of transcripts of conversations in Italian recorded in naturalistic conditions. The aim of the work was to test the effectiveness of a number of automatic analyzes in finding correlations with the progression of dementia in individuals with cognitive decline as measured by the Mini-Mental State Examination (MMSE) score, which is the only psychometric-clinical information available on the participants in the conversations. Healthy controls (HC) were not considered in this study, nor does the corpus itself include HCs. The main innovation and strength of the work consists in the high ecological validity of the language analyzed (most of the literature to date concerns controlled language experiments); in the use of Italian (there is little corpora for Italian); in the size of the analyzed data (more than 200 conversations were considered); in the adoption of a wide range of NLP methods, that span from traditional morphosyntactic investigation to deep linguistic models for conducting analyzes such as through perplexity, sentiment (polarity) and emotions. Analyzing real-world interactions not designed with computational analysis in mind, such as is the case of the Anchise Corpus, is particularly challenging. To achieve the research goals, a wide variety of tools were employed. These included traditional morphosyntactic analysis based on digital linguistic biomarkers (DLBs), transformer-based language models, sentiment and emotion analysis, and perplexity metrics. Analyzes were conducted both on the continuous range of MMSE values and on the severe/moderate/mild categorization suggested by AIFA (Italian Medicines Agency) guidelines, based on MMSE threshold values. Correlations between MMSE and individual DLBs were weak, up to 0.19 for positive, and -0.21 for negative correlation values. Nevertheless, some correlations were statistically significant and consistent with the literature, suggesting that people with a greater degree of impairment tend to show a reduced vocabulary, to have anomia, to adopt a more informal linguist register, and to display a simplified use of verbs, with a decrease in the use of participles, gerunds, subjunctive moods, modal verbs, as well as a flattening in the use of the tenses towards the present to the detriment of the past. The -0.26 inverse correlation between perplexity and MMSE suggests that perplexity captures slightly more specific linguistic information, which can complement the MMSE scores. In the categorization tasks, the classifier based on DLBs achieved an F1 score of 0.79 for binary classification between SEVERE and MILD, and 0.61 for multi-label categorization. Sentiment and emotion analyzes showed inverse trends for joy while MMSE scores suggested that less impaired individuals were less joyful, or more \"negative\", than others. Considering the real-world context, this is consistent with the hypothesis of a gradual reduction in awareness in individuals affected by dementia. Finally, integrating various profiles of analysis has been proved to be effective in offering a wider picture of linguistic and communication deficits, as well as more precise data regarding the progression of dementia.", "venue": "Computer Speech & Language", "label": 0}, {"loc": [2.9716532230377197, -0.38469091057777405], "openalex_id": "https://openalex.org/W4405882610", "title": "Domain-Specific Pretraining of Language Models: A Comparative Study in the Medical Field", "authors": "Tobias Kerner", "abstract": "There are many cases where LLMs are used for specific tasks in a single domain. These usually require less general, but more domain-specific knowledge. Highly capable, general-purpose state-of-the-art language models like GPT-4 or Claude-3-opus can often be used for such tasks, but they are very large and cannot be run locally, even if they were not proprietary. This can be a problem when working with sensitive data. This paper focuses on domain-specific and mixed-domain pretraining as potentially more efficient methods than general pretraining for specialized language models. We will take a look at work related to domain-specific pretraining, specifically in the medical area, and compare benchmark results of specialized language models to general-purpose language models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.988081932067871, 3.3766839504241943], "openalex_id": "https://openalex.org/W4402856383", "title": "Foundation Models for Autonomous Robots in Unstructured Environments", "authors": "Hossein Naderi, Alireza Shojaei, Lifu Huang", "abstract": "Automating activities through robots in unstructured environments, such as construction sites, has been a long-standing desire. However, the high degree of unpredictable events in these settings has resulted in far less adoption compared to more structured settings, such as manufacturing, where robots can be hard-coded or trained on narrowly defined datasets. Recently, pretrained foundation models, such as Large Language Models (LLMs), have demonstrated superior generalization capabilities by providing zero-shot solutions for problems do not present in the training data, proposing them as a potential solution for introducing robots to unstructured environments. To this end, this study investigates potential opportunities and challenges of pretrained foundation models from a multi-dimensional perspective. The study systematically reviews application of foundation models in two field of robotic and unstructured environment and then synthesized them with deliberative acting theory. Findings showed that linguistic capabilities of LLMs have been utilized more than other features for improving perception in human-robot interactions. On the other hand, findings showed that the use of LLMs demonstrated more applications in project management and safety in construction, and natural hazard detection in disaster management. Synthesizing these findings, we located the current state-of-the-art in this field on a five-level scale of automation, placing them at conditional automation. This assessment was then used to envision future scenarios, challenges, and solutions toward autonomous safe unstructured environments. Our study can be seen as a benchmark to track our progress toward that future.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.813271999359131, 2.2151436805725098], "openalex_id": "https://openalex.org/W4400854607", "title": "Mapping the Positive Self-Bias Embedded in Human Languages", "authors": "Yongfa Zhang, Fengjie Zou, Songlin Jia, Fei Wang", "abstract": "Whether humans inherently view themselves more positively than others underpins a fundamental question of psychology, yet it has eluded scientific consensus for decades. Overcoming behavioral research\u2019s limitations, advancements in natural language processing facilitate exploring more nuanced societal-level biases through text analysis. This study employed word embeddings to investigate the positive self-bias in human languages and its universality across cultures and historical periods using multilingual corpora totaling approximately two trillion words. We first demonstrated the presence of positive self-bias in English, aligning with prior behavioral findings. Cross-cultural comparisons then revealed self-deprecation tendencies in trait-based self-evaluation in specific Eastern languages, contrasting with Western patterns. However, affective self-positivity\u2014the tendency to associate the self with generally positive rather than negative words\u2014was universally observed across 11 languages. Analysis of 200 years of English texts further substantiated this dichotomy: while trait self-positivity fluctuated historically, affective self-positivity remained stable. These findings support an integrative dual-dimensional model of positive self-bias, illuminating a pathway to reconcile the longstanding debates.", "venue": "http://doi.org/10.31234/osf.io/mnv37", "label": 0}, {"loc": [6.319406032562256, -1.0878899097442627], "openalex_id": "https://openalex.org/W4400808506", "title": "Toward Robust Arabic AI-Generated Text Detection: Tackling Diacritics Challenges", "authors": "Hamed Alshammari, Khaled Elleithy", "abstract": "Current AI detection systems often struggle to distinguish between Arabic human-written text (HWT) and AI-generated text (AIGT) due to the small marks present above and below the Arabic text called diacritics. This study introduces robust Arabic text detection models using Transformer-based pre-trained models, specifically AraELECTRA, AraBERT, XLM-R, and mBERT. Our primary goal is to detect AIGTs in essays and overcome the challenges posed by the diacritics that usually appear in Arabic religious texts. We created several novel datasets with diacritized and non-diacritized texts comprising up to 9666 HWT and AIGT training examples. We aimed to assess the robustness and effectiveness of the detection models on out-of-domain (OOD) datasets to assess their generalizability. Our detection models trained on diacritized examples achieved up to 98.4% accuracy compared to GPTZero\u2019s 62.7% on the AIRABIC benchmark dataset. Our experiments reveal that, while including diacritics in training enhances the recognition of the diacritized HWTs, duplicating examples with and without diacritics is inefficient despite the high accuracy achieved. Applying a dediacritization filter during evaluation significantly improved model performance, achieving optimal performance compared to both GPTZero and the detection models trained on diacritized examples but evaluated without dediacritization. Although our focus was on Arabic due to its writing challenges, our detector architecture is adaptable to any language.", "venue": "Information", "label": 17}, {"loc": [6.7855119705200195, 1.4849222898483276], "openalex_id": "https://openalex.org/W4402346106", "title": "Limits to Predicting Online Speech Using Large Language Models", "authors": "Mina Remeli, Moritz Hardt, Robert C. Williamson", "abstract": "We study the predictability of online speech on social media, and whether predictability improves with information outside a user's own posts. Recent theoretical results suggest that posts from a user's social circle are as predictive of the user's future posts as that of the user's past posts. Motivated by the success of large language models, we empirically test this hypothesis. We define predictability as a measure of the model's uncertainty, i.e., its negative log-likelihood on future tokens given context. As the basis of our study, we collect 10M tweets for ``tweet-tuning'' base models and a further 6.25M posts from more than five thousand X (previously Twitter) users and their peers. Across four large language models ranging in size from 1.5 billion to 70 billion parameters, we find that predicting a user's posts from their peers' posts performs poorly. Moreover, the value of the user's own posts for prediction is consistently higher than that of their peers'. We extend our investigation with a detailed analysis on what's learned in-context and the robustness of our findings. From context, base models learn to correctly predict @-mentions and hashtags. Moreover, our results replicate if instead of prompting the model with additional context, we finetune on it. Across the board, we find that predicting the posts of individual users remains hard.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.303709983825684, 2.802882194519043], "openalex_id": "https://openalex.org/W4402346107", "title": "Scaling Retrieval-Based Language Models with a Trillion-Token Datastore", "authors": "Rulin Shao, Jacqueline He, Akari Asai, Weijia Shi, Tim Dettmers, Sewon Min, Luke Zettlemoyer, Pang Wei Koh", "abstract": "Scaling laws with respect to the amount of training data and the number of parameters allow us to predict the cost-benefit trade-offs of pretraining language models (LMs) in different configurations. In this paper, we consider another dimension of scaling: the amount of data available at inference time. Specifically, we find that increasing the size of the datastore used by a retrieval-based LM monotonically improves language modeling and several downstream tasks without obvious saturation, such that a smaller model augmented with a large datastore outperforms a larger LM-only model on knowledge-intensive tasks. By plotting compute-optimal scaling curves with varied datastore, model, and pretraining data sizes, we show that using larger datastores can significantly improve model performance for the same training compute budget. We carry out our study by constructing a 1.4 trillion-token datastore named MassiveDS, which is the largest and the most diverse open-sourced datastore for retrieval-based LMs to date, and designing an efficient pipeline for studying datastore scaling in a computationally accessible manner. Finally, we analyze the effect of improving the retriever, datastore quality filtering, and other design choices on our observed scaling trends. Overall, our results show that datastore size should be considered as an integral part of LM efficiency and performance trade-offs. To facilitate future research, we open-source our datastore and code at https://github.com/RulinShao/retrieval-scaling.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.991879940032959, -0.4288758635520935], "openalex_id": "https://openalex.org/W4401594474", "title": "Bilingual Adaptation of Monolingual Foundation Models", "authors": "Gurpreet Gosal, Yishi Xu, Gokul Ramakrishnan, Rituraj Joshi, Avraham Sheinin, Zhiming, Chen Chen, Biswajit Mishra, Natalia Vassilieva, Joel Hestness, Neha Sengupta, Sunil Kumar Sahu, Bokang Jia, Onkar Pandit, Satheesh Katipomu, Samta Kamboj, Samujjwal Ghosh, Rahul Pal, Parvez Mullah, Soundar Doraiswamy, Mohamed El Karim Chami, Preslav Nakov", "abstract": "We present an efficient method for adapting a monolingual Large Language Model (LLM) to another language, addressing challenges of catastrophic forgetting and tokenizer limitations. We focus this study on adapting Llama 2 to Arabic. Our two-stage approach begins with expanding the vocabulary and training only the embeddings matrix, followed by full model continual pre-training on a bilingual corpus. By continually pre-training on a mix of Arabic and English corpora, the model retains its proficiency in English while acquiring capabilities in Arabic. Our approach results in significant improvements in Arabic and slight enhancements in English, demonstrating cost-effective cross-lingual transfer. We perform ablations on embedding initialization techniques, data mix ratios, and learning rates and release a detailed training recipe. To demonstrate generalizability of this approach we also adapted Llama 3 8B to Arabic and Llama 2 13B to Hindi.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.349979400634766, 0.2345297783613205], "openalex_id": "https://openalex.org/W4406059498", "title": "FuLG: 150B Romanian Corpus for Language Model Pretraining", "authors": "Vlad-Andrei B\u0103doiu, Mihai-Valentin Dumitru, Alexandru M. Gherghescu, Alexandru Agache, Costin Raiciu", "abstract": "Research in the field of language models is rapidly evolving, with many open models being released to the public. Openly available pretraining corpora usually focus on only a handful of languages, with many others either missing completely or extremely underrepresented. In this report, we introduce FuLG, a hundred-fifty-billion-token Romanian corpus extracted from CommonCrawl. We present our methodology for filtering FuLG and compare it via ablation studies against existing Romanian corpora.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.271484375, 0.8122633099555969], "openalex_id": "https://openalex.org/W4403750451", "title": "On Initializing Transformers with Pre-trained Embeddings", "authors": "Ha Young Kim, Niranjan Balasubramanian, Byungkon Kang", "abstract": "It has become common practice now to use random initialization schemes, rather than the pre-trained embeddings, when training transformer based models from scratch. Indeed, we find that pre-trained word embeddings from GloVe, and some sub-word embeddings extracted from language models such as T5 and mT5 fare much worse compared to random initialization. This is counter-intuitive given the well-known representational and transfer-learning advantages of pre-training. Interestingly, we also find that BERT and mBERT embeddings fare better than random initialization, showing the advantages of pre-trained representations. In this work, we posit two potential factors that contribute to these mixed results: the model sensitivity to parameter distribution and the embedding interactions with position encodings. We observe that pre-trained GloVe, T5, and mT5 embeddings have a wider distribution of values. As argued in the initialization studies, such large value initializations can lead to poor training because of saturated outputs. Further, the larger embedding values can, in effect, absorb the smaller position encoding values when added together, thus losing position information. Standardizing the pre-trained embeddings to a narrow range (e.g. as prescribed by Xavier) leads to substantial gains for Glove, T5, and mT5 embeddings. On the other hand, BERT pre-trained embeddings, while larger, are still relatively closer to Xavier initialization range which may allow it to effectively transfer the pre-trained knowledge.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.2503132820129395, -1.6168862581253052], "openalex_id": "https://openalex.org/W4400799268", "title": "Exploring transformer models in the sentiment analysis task for the under-resource Bengali language", "authors": "Md. Nesarul Hoque, Umme Salma, Md. Jamal Uddin, Md. Martuza Ahamad, Sakifa Aktar", "abstract": "In the sentiment analysis (SA) task, we can obtain a positive or negative-typed comment or feedback from an online user or a customer about any object, such as a movie, drama, food, and others. This user\u2019s sentiment may positively impact various decision-making processes. In this regard, a lot of studies have been done on identifying sentiments from a text in high-resource languages like English. However, a small number of studies are detected in the under-resource Bengali language because of the unavailability of the benchmark corpus, limitations of text processing application software, and so on. Furthermore, there is still enough space to enhance the classification performance of the SA task. In this research, we experiment on a recognized Bengali dataset of 11,807 comments to find positive or negative sentiments. We employ five state-of-the-art transformer-based pretrained models, such as multilingual Bidirectional Encoder Representations from Transformers (mBERT), BanglaBERT, Bangla-Bert-Base, DistilmBERT, and XLM-RoBERTa-base (XLM-R-base), with tuning of the hyperparameters. After that, we propose a combined model named Transformer-ensemble that presents outstanding detection performance with an accuracy of 95.97% and an F1-score of 95.96% compared to the existing recent methods in the Bengali SA task.", "venue": "Natural Language Processing Journal", "label": 9}, {"loc": [6.546785831451416, 5.170927047729492], "openalex_id": "https://openalex.org/W4401670275", "title": "Single Layer Single Gradient Unlearning", "authors": "Zikui Cai, Yaoteng Tan, M. Salman Asif", "abstract": "Machine unlearning methods aim to remove sensitive or unwanted content from trained models, but typically demand extensive model updates at significant computational cost while potentially degrading model performance on both related and unrelated tasks. We propose Single Layer Unlearning Gradient (SLUG) as an efficient method to unlearn targeted information by updating a single critical layer using a one-time gradient computation. SLUG uses layer importance and gradient alignment metrics to identify the optimal layer for targeted information removal while preserving the model utility. We demonstrate the effectiveness of SLUG for CLIP, Stable Diffusion, and vision-language models (VLMs) in removing concrete (e.g., identities and objects) and abstract concepts (e.g., artistic styles). On the UnlearnCanvas benchmark, SLUG achieves comparable unlearning performance to existing methods while requiring significantly less computational resources. Our proposed approach offers a practical solution for targeted unlearning that is computationally efficient and precise. Our code is available at https://github.com/CSIPlab/SLUG.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.672669887542725, 2.870701789855957], "openalex_id": "https://openalex.org/W4401662000", "title": "LITS: An Optimized Learned Index for Strings (An Extended Version)", "authors": "Yifan Yang, Shimin Chen", "abstract": "Index is an important component in database systems. Learned indexes have been shown to outperform traditional tree-based index structures for fixed-sized integer or floating point keys. However, the application of the learned solution to variable-length string keys is under-researched. Our experiments show that existing learned indexes for strings fail to outperform traditional string indexes, such as HOT and ART. String keys are long and variable sized, and often contain skewed prefixes, which make the last-mile search expensive, and adversely impact the capability of learned models to capture the skewed distribution of string keys. In this paper, we propose a novel learned index for string keys, LITS (Learned Index with Hash-enhanced Prefix Table and Sub-tries). We propose an optimized learned model, combining a global Hash-enhanced Prefix Table (HPT) and a per-node local linear model to better distinguish string keys. Moreover, LITS exploits compact leaf nodes and hybrid structures with a PMSS model for efficient point and range operations. Our experimental results using eleven string data sets show that LITS achieves up to 2.43x and 2.27x improvement over HOT and ART for point operations, and attains comparable scan performance.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.704450607299805, 0.5141255855560303], "openalex_id": "https://openalex.org/W4400702727", "title": "How Chinese are Chinese Language Models? The Puzzling Lack of Language Policy in China's LLMs", "authors": "Andrea W Wen-Yi, Unso Eun Seo Jo, Lichuan Lin, David Mimno", "abstract": "Contemporary language models are increasingly multilingual, but Chinese LLM developers must navigate complex political and business considerations of language diversity. Language policy in China aims at influencing the public discourse and governing a multi-ethnic society, and has gradually transitioned from a pluralist to a more assimilationist approach since 1949. We explore the impact of these influences on current language technology. We evaluate six open-source multilingual LLMs pre-trained by Chinese companies on 18 languages, spanning a wide range of Chinese, Asian, and Anglo-European languages. Our experiments show Chinese LLMs performance on diverse languages is indistinguishable from international LLMs. Similarly, the models' technical reports also show lack of consideration for pretraining data language coverage except for English and Mandarin Chinese. Examining Chinese AI policy, model experiments, and technical reports, we find no sign of any consistent policy, either for or against, language diversity in China's LLM development. This leaves a puzzling fact that while China regulates both the languages people use daily as well as language model development, they do not seem to have any policy on the languages in language models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.168033599853516, 2.0605626106262207], "openalex_id": "https://openalex.org/W4400668506", "title": "TelecomGPT: A Framework to Build Telecom-Specfic Large Language Models", "authors": "Hang Zou, Qiyang Zhao, Yu Tian, Lina Bariah, Faouzi Bader, Thierry Lestable, M\u00e9rouane Debbah", "abstract": "Large Language Models (LLMs) have the potential to revolutionize the Sixth Generation (6G) communication networks. However, current mainstream LLMs generally lack the specialized knowledge in telecom domain. In this paper, for the first time, we propose a pipeline to adapt any general purpose LLMs to a telecom-specific LLMs. We collect and build telecom-specific pre-train dataset, instruction dataset, preference dataset to perform continual pre-training, instruct tuning and alignment tuning respectively. Besides, due to the lack of widely accepted evaluation benchmarks in telecom domain, we extend existing evaluation benchmarks and proposed three new benchmarks, namely, Telecom Math Modeling, Telecom Open QnA and Telecom Code Tasks. These new benchmarks provide a holistic evaluation of the capabilities of LLMs including math modeling, Open-Ended question answering, code generation, infilling, summarization and analysis in telecom domain. Our fine-tuned LLM TelecomGPT outperforms state of the art (SOTA) LLMs including GPT-4, Llama-3 and Mistral in Telecom Math Modeling benchmark significantly and achieve comparable performance in various evaluation benchmarks such as TeleQnA, 3GPP technical documents classification, telecom code summary and generation and infilling.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.4849586486816406, 1.5310742855072021], "openalex_id": "https://openalex.org/W4401191156", "title": "Generative AI in Education From the Perspective of Students, Educators, and Administrators", "authors": "Ranti Fauza Mayana, Tisni Santika", "abstract": "As a domain of science and technology, Artificial Intelligence (AI) opened new horizons for education. Technologies change the ways we teach and learn. While Generative AI tools create new prospects for learning, several concerns also arise. Educators are worried that they cannot differentiate between the output of students' work and the output from AI and this will impact the discipline, originality, integrity, and ethics in such cases. In addition, the problem also potentially arises in the matter of the authorship of the works regarding Copyright Law. This paper examines several legal issues of the utilization of Generative AI through the perspective of Copyright Law. This paper concludes several important points; First, although the framework of Indonesia\u2019s copyright law is based on the principle of human authorship, the rapid development of Generative AI must be balanced with an accommodative legal framework, Second, it is particularly important to formulate a special provisions to guide the implementation concerning the utilization of copyrighted works as the input material for generative AI so that it will not harm the \"legitimate interest of the author\" in the limit of \"normal exploitation of the work\" and classified as fair use, Third, academics and administrators need to gain a better understanding of the promise and perils of generative AI, how it will likely impact education, and how it might best govern by encourage the school and universities to develop institutional policies and/or formal guidance concerning the use of digital technology and Generative AI for the future of education.", "venue": "Jurnal Pendidikan Terbuka Dan Jarak Jauh", "label": 0}, {"loc": [4.594197750091553, 1.273840308189392], "openalex_id": "https://openalex.org/W4400585758", "title": "Can Bias in LLMs be Used for Good?", "authors": "Jinsook Lee, Yann Hicke, Renzhe Yu, Christopher Brooks, Ren\u00e9 F. Kizilcec", "abstract": "Abstract Large language models (LLMs) are increasingly adopted in educational contexts to provide personalized support to students and teachers. The unprecedented capacity of LLM\u2010based applications to understand and generate natural language can potentially improve instructional effectiveness and learning outcomes, but the integration of LLMs in education technology has renewed concerns over algorithmic bias, which may exacerbate educational inequalities. Building on prior work that mapped the traditional machine learning life cycle, we provide a framework of the LLM life cycle from the initial development of LLMs to customizing pre\u2010trained models for various applications in educational settings. We explain each step in the LLM life cycle and identify potential sources of bias that may arise in the context of education. We discuss why current measures of bias from traditional machine learning fail to transfer to LLM\u2010generated text (eg, tutoring conversations) because text encodings are high\u2010dimensional, there can be multiple correct responses, and tailoring responses may be pedagogically desirable rather than unfair. The proposed framework clarifies the complex nature of bias in LLM applications and provides practical guidance for their evaluation to promote educational equity. Practitioner notes What is already known about this topic The life cycle of traditional machine learning (ML) applications which focus on predicting labels is well understood. Biases are known to enter in traditional ML applications at various points in the life cycle, and methods to measure and mitigate these biases have been developed and tested. Large language models (LLMs) and other forms of generative artificial intelligence (GenAI) are increasingly adopted in education technologies (EdTech), but current evaluation approaches are not specific to the domain of education. What this paper adds A holistic perspective of the LLM life cycle with domain\u2010specific examples in education to highlight opportunities and challenges for incorporating natural language understanding (NLU) and natural language generation (NLG) into EdTech. Potential sources of bias are identified in each step of the LLM life cycle and discussed in the context of education. A framework for understanding where to expect potential harms of LLMs for students, teachers, and other users of GenAI technology in education, which can guide approaches to bias measurement and mitigation. Implications for practice and/or policy Education practitioners and policymakers should be aware that biases can originate from a multitude of steps in the LLM life cycle, and the life cycle perspective offers them a heuristic for asking technology developers to explain each step to assess the risk of bias. Measuring the biases of systems that use LLMs in education is more complex than with traditional ML, in large part because the evaluation of natural language generation is highly context\u2010dependent (eg, what counts as good feedback on an assignment varies). EdTech developers can play an important role in collecting and curating datasets for the evaluation and benchmarking of LLM applications moving forward.", "venue": "British Journal of Educational Technology", "label": 10}, {"loc": [8.409477233886719, 3.677046537399292], "openalex_id": "https://openalex.org/W4400611699", "title": "Q-GaLore: Quantized GaLore with INT4 Projection and Layer-Adaptive Low-Rank Gradients", "authors": "Zhenyu Zhang, Ajay Jaiswal, Lu Yin, Shiwei Liu, Jiawei Zhao, Yuandong Tian, Zhangyang Wang", "abstract": "Training Large Language Models (LLMs) is memory-intensive due to the large number of parameters and associated optimization states. GaLore, a recent method, reduces memory usage by projecting weight gradients into a low-rank subspace without compromising performance. However, GaLore relies on time-consuming Singular Value Decomposition (SVD) operations to identify the subspace, and the frequent subspace updates lead to significant training time overhead. Moreover, GaLore offers minimal improvements in accuracy and efficiency compared to LoRA in more accessible fine-tuning scenarios. To address these limitations, we introduce Q-Galore, a novel approach that substantially reduces memory usage by combining quantization and low-rank projection, surpassing the benefits of GaLore. Our method is based on two key observations: (i) the gradient subspace exhibits diverse properties, with some layers converging early in training while others are subject to frequent changes; (ii) the projection matrices are highly resilient to low-bit quantization. Leveraging these insights, Q-GaLore adaptively updates the gradient subspace based on its convergence statistics, achieving comparable performance while significantly reducing the number of SVD operations. We maintain the projection matrices in INT4 format and weights in INT8 format, incorporating stochastic rounding to capture accumulated gradient information. This approach enables a high-precision training trajectory using only low-precision weights. We demonstrate that Q-GaLore achieves highly competitive performance with exceptional memory efficiency. At pre-training, Q-GaLore facilitates training a LLaMA-7B model from scratch on a single NVIDIA RTX 4060 Ti with only 16 GB memory. At fine-tuning, it reduces memory consumption by up to 50% compared to LoRA and GaLore, while consistently outperforming QLoRA at the same memory cost.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.346998691558838, -0.0030572260729968548], "openalex_id": "https://openalex.org/W4400611999", "title": "Tamil Language Computing: the Present and the Future", "authors": "Kengatharaiyer Sarveswaran", "abstract": "This paper delves into the text processing aspects of Language Computing, which enables computers to understand, interpret, and generate human language. Focusing on tasks such as speech recognition, machine translation, sentiment analysis, text summarization, and language modelling, language computing integrates disciplines including linguistics, computer science, and cognitive psychology to create meaningful human-computer interactions. Recent advancements in deep learning have made computers more accessible and capable of independent learning and adaptation. In examining the landscape of language computing, the paper emphasises foundational work like encoding, where Tamil transitioned from ASCII to Unicode, enhancing digital communication. It discusses the development of computational resources, including raw data, dictionaries, glossaries, annotated data, and computational grammars, necessary for effective language processing. The challenges of linguistic annotation, the creation of treebanks, and the training of large language models are also covered, emphasising the need for high-quality, annotated data and advanced language models. The paper underscores the importance of building practical applications for languages like Tamil to address everyday communication needs, highlighting gaps in current technology. It calls for increased research collaboration, digitization of historical texts, and fostering digital usage to ensure the comprehensive development of Tamil language processing, ultimately enhancing global communication and access to digital services.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.657449722290039, 1.2143070697784424], "openalex_id": "https://openalex.org/W4400585758", "title": "The life cycle of large language models in education: A framework for understanding sources of bias", "authors": "Jinsook Lee, Yann Hicke, Renzhe Yu, Christopher Brooks, Ren\u00e9 F. Kizilcec", "abstract": "Abstract Large language models (LLMs) are increasingly adopted in educational contexts to provide personalized support to students and teachers. The unprecedented capacity of LLM\u2010based applications to understand and generate natural language can potentially improve instructional effectiveness and learning outcomes, but the integration of LLMs in education technology has renewed concerns over algorithmic bias, which may exacerbate educational inequalities. Building on prior work that mapped the traditional machine learning life cycle, we provide a framework of the LLM life cycle from the initial development of LLMs to customizing pre\u2010trained models for various applications in educational settings. We explain each step in the LLM life cycle and identify potential sources of bias that may arise in the context of education. We discuss why current measures of bias from traditional machine learning fail to transfer to LLM\u2010generated text (eg, tutoring conversations) because text encodings are high\u2010dimensional, there can be multiple correct responses, and tailoring responses may be pedagogically desirable rather than unfair. The proposed framework clarifies the complex nature of bias in LLM applications and provides practical guidance for their evaluation to promote educational equity. Practitioner notes What is already known about this topic The life cycle of traditional machine learning (ML) applications which focus on predicting labels is well understood. Biases are known to enter in traditional ML applications at various points in the life cycle, and methods to measure and mitigate these biases have been developed and tested. Large language models (LLMs) and other forms of generative artificial intelligence (GenAI) are increasingly adopted in education technologies (EdTech), but current evaluation approaches are not specific to the domain of education. What this paper adds A holistic perspective of the LLM life cycle with domain\u2010specific examples in education to highlight opportunities and challenges for incorporating natural language understanding (NLU) and natural language generation (NLG) into EdTech. Potential sources of bias are identified in each step of the LLM life cycle and discussed in the context of education. A framework for understanding where to expect potential harms of LLMs for students, teachers, and other users of GenAI technology in education, which can guide approaches to bias measurement and mitigation. Implications for practice and/or policy Education practitioners and policymakers should be aware that biases can originate from a multitude of steps in the LLM life cycle, and the life cycle perspective offers them a heuristic for asking technology developers to explain each step to assess the risk of bias. Measuring the biases of systems that use LLMs in education is more complex than with traditional ML, in large part because the evaluation of natural language generation is highly context\u2010dependent (eg, what counts as good feedback on an assignment varies). EdTech developers can play an important role in collecting and curating datasets for the evaluation and benchmarking of LLM applications moving forward.", "venue": "British Journal of Educational Technology", "label": 10}, {"loc": [6.934162616729736, 2.385002613067627], "openalex_id": "https://openalex.org/W4400582875", "title": "SimLLM: Calculating Semantic Similarity in Code Summaries using a Large Language Model-Based Approach", "authors": "Xin Jin, Zhiqiang Lin", "abstract": "Code summaries are pivotal in software engineering, serving to improve code readability, maintainability, and collaboration. While recent advancements in Large Language Models (LLMs) have opened new avenues for automatic code summarization, existing metrics for evaluating summary quality, such as BLEU and BERTScore, have notable limitations. Specifically, these existing metrics either fail to capture the nuances of semantic meaning in summaries or are further limited in understanding domain-specific terminologies and expressions prevalent in code summaries. In this paper, we present SimLLM, a novel LLM-based approach designed to more precisely evaluate the semantic similarity of code summaries. Built upon an autoregressive LLM using a specialized pretraining task on permutated inputs and a pooling-based pairwise similarity measure, SimLLM overcomes the shortcomings of existing metrics. Our empirical evaluations demonstrate that SimLLM not only outperforms existing metrics but also shows a significantly high correlation with human ratings.", "venue": "Proceedings of the ACM on software engineering.", "label": 0}, {"loc": [8.20892333984375, 2.544656991958618], "openalex_id": "https://openalex.org/W4400600684", "title": "Reuse, Don't Retrain: A Recipe for Continued Pretraining of Language Models", "authors": "Jupinder Parmar, Sanjev Satheesh, Mostofa Patwary, Mohammad Shoeybi, Bryan Catanzaro", "abstract": "As language models have scaled both their number of parameters and pretraining dataset sizes, the computational cost for pretraining has become intractable except for the most well-resourced teams. This increasing cost makes it ever more important to be able to reuse a model after it has completed pretraining; allowing for a model's abilities to further improve without needing to train from scratch. In this work, we detail a set of guidelines that cover how to design efficacious data distributions and learning rate schedules for continued pretraining of language models. When applying these findings within a continued pretraining run on top of a well-trained 15B parameter model, we show an improvement of 9\\% in average model accuracy compared to the baseline of continued training on the pretraining set. The resulting recipe provides a practical starting point with which to begin developing language models through reuse rather than retraining.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.032347202301025, 3.7854905128479004], "openalex_id": "https://openalex.org/W4400600758", "title": "Multilingual Blending: LLM Safety Alignment Evaluation with Language Mixture", "authors": "Jiayang Song, Yuheng Huang, Zhehua Zhou, \u041b\u0435\u0439 \u041c\u0430", "abstract": "As safety remains a crucial concern throughout the development lifecycle of Large Language Models (LLMs), researchers and industrial practitioners have increasingly focused on safeguarding and aligning LLM behaviors with human preferences and ethical standards. LLMs, trained on extensive multilingual corpora, exhibit powerful generalization abilities across diverse languages and domains. However, current safety alignment practices predominantly focus on single-language scenarios, which leaves their effectiveness in complex multilingual contexts, especially for those complex mixed-language formats, largely unexplored. In this study, we introduce Multilingual Blending, a mixed-language query-response scheme designed to evaluate the safety alignment of various state-of-the-art LLMs (e.g., GPT-4o, GPT-3.5, Llama3) under sophisticated, multilingual conditions. We further investigate language patterns such as language availability, morphology, and language family that could impact the effectiveness of Multilingual Blending in compromising the safeguards of LLMs. Our experimental results show that, without meticulously crafted prompt templates, Multilingual Blending significantly amplifies the detriment of malicious queries, leading to dramatically increased bypass rates in LLM safety alignment (67.23% on GPT-3.5 and 40.34% on GPT-4o), far exceeding those of single-language baselines. Moreover, the performance of Multilingual Blending varies notably based on intrinsic linguistic properties, with languages of different morphology and from diverse families being more prone to evading safety alignments. These findings underscore the necessity of evaluating LLMs and developing corresponding safety alignment strategies in a complex, multilingual context to align with their superior cross-language generalization capabilities.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.908109188079834, -1.3948408365249634], "openalex_id": "https://openalex.org/W4400529244", "title": "A review on emotion detection by using deep learning techniques", "authors": "Tulika Chutia, Nomi Baruah", "abstract": "Abstract Along with the growth of Internet with its numerous potential applications and diverse fields, artificial intelligence (AI) and sentiment analysis (SA) have become significant and popular research areas. Additionally, it was a key technology that contributed to the Fourth Industrial Revolution (IR 4.0). The subset of AI known as emotion recognition systems facilitates communication between IR 4.0 and IR 5.0. Nowadays users of social media, digital marketing, and e-commerce sites are increasing day by day resulting in massive amounts of unstructured data. Medical, marketing, public safety, education, human resources, business, and other industries also use the emotion recognition system widely. Hence it provides a large amount of textual data to extract the emotions from them. The paper presents a systematic literature review of the existing literature published between 2013 to 2023 in text-based emotion detection. This review scrupulously summarized 330 research papers from different conferences, journals, workshops, and dissertations. This paper explores different approaches, methods, different deep learning models, key aspects, description of datasets, evaluation techniques, Future prospects of deep learning, challenges in existing studies and presents limitations and practical implications.", "venue": "Artificial Intelligence Review", "label": 18}, {"loc": [8.193732261657715, 1.1385533809661865], "openalex_id": "https://openalex.org/W4396812222", "title": "PLAID SHIRTTT for Large-Scale Streaming Dense Retrieval", "authors": "Dawn Lawrie, Efsun Sarioglu Kayi, Eugene Yang, James Mayfield, Douglas W. Oard", "abstract": "PLAID, an efficient implementation of the ColBERT late interaction bi-encoder\\nusing pretrained language models for ranking, consistently achieves\\nstate-of-the-art performance in monolingual, cross-language, and multilingual\\nretrieval. PLAID differs from ColBERT by assigning terms to clusters and\\nrepresenting those terms as cluster centroids plus compressed residual vectors.\\nWhile PLAID is effective in batch experiments, its performance degrades in\\nstreaming settings where documents arrive over time because representations of\\nnew tokens may be poorly modeled by the earlier tokens used to select cluster\\ncentroids. PLAID Streaming Hierarchical Indexing that Runs on Terabytes of\\nTemporal Text (PLAID SHIRTTT) addresses this concern using multi-phase\\nincremental indexing based on hierarchical sharding. Experiments on ClueWeb09\\nand the multilingual NeuCLIR collection demonstrate the effectiveness of this\\napproach both for the largest collection indexed to date by the ColBERT\\narchitecture and in the multilingual setting, respectively.\\n", "venue": "https://doi.org/10.1145/3626772.3657964", "label": 0}, {"loc": [6.192814350128174, 5.826304912567139], "openalex_id": "https://openalex.org/W4400572780", "title": "Graph-Based Captioning: Enhancing Visual Descriptions by Interconnecting Region Captions", "authors": "Yu-Guan Hsieh, Cheng-Yu Hsieh, S.-P. Yeh, Louis B\u00e9thune, Hadi Pour Ansari, Pavan Kumar Anasosalu Vasu, Chunliang Li, Ranjay Krishna, Oncel Tuzel, Marco Cuturi", "abstract": "Humans describe complex scenes with compositionality, using simple text descriptions enriched with links and relationships. While vision-language research has aimed to develop models with compositional understanding capabilities, this is not reflected yet in existing datasets which, for the most part, still use plain text to describe images. In this work, we propose a new annotation strategy, graph-based captioning (GBC) that describes an image using a labeled graph structure, with nodes of various types. The nodes in GBC are created through a two-stage process: first, identifying and describing entity nodes; second, linking these nodes by highlighting \\textit{compositions} and \\textit{relations} among them. Since \\textit{all} GBC nodes hold plain text descriptions, GBC retains the flexibility found in natural language, but can also encode hierarchical information in its edges. We demonstrate that GBC can be produced automatically, using off-the-shelf multimodal LLMs and object detection models, by building a new dataset GBC10M that gathers GBC annotations for about 10M images of the CC12M dataset. Through CLIP training on GBC10M, we show that leveraging GBC nodes' annotations -- particularly those in composition and relation nodes -- significantly boosts the model's performance across various benchmarks compared to when other annotations are used. To further explore the opportunities provided by GBC, we also investigate the use of GBC as middleware for text-to-image generation, and show the extra benefits of incorporating the graph structure in this task. Our code and datasets are released at https://github.com/apple/ml-gbc and https://huggingface.co/graph-based-captions.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.9511590003967285, 2.352618932723999], "openalex_id": "https://openalex.org/W4400518867", "title": "SoftDedup: an Efficient Data Reweighting Method for Speeding Up Language Model Pre-training", "authors": "Nan He, Weichen Xiong, Hanwen Liu, Yi Liao, Lei Ding, Kai Zhang, Guohua Tang, Han Xiao, Wei Yang", "abstract": "The effectiveness of large language models (LLMs) is often hindered by duplicated data in their extensive pre-training datasets. Current approaches primarily focus on detecting and removing duplicates, which risks the loss of valuable information and neglects the varying degrees of duplication. To address this, we propose a soft deduplication method that maintains dataset integrity while selectively reducing the sampling weight of data with high commonness. Central to our approach is the concept of \"data commonness\", a metric we introduce to quantify the degree of duplication by measuring the occurrence probabilities of samples using an n-gram model. Empirical analysis shows that this method significantly improves training efficiency, achieving comparable perplexity scores with at least a 26% reduction in required training steps. Additionally, it enhances average few-shot downstream accuracy by 1.77% when trained for an equivalent duration. Importantly, this approach consistently improves performance, even on rigorously deduplicated datasets, indicating its potential to complement existing methods and become a standard pre-training process for LLMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.6230087280273438, 3.899735450744629], "openalex_id": "https://openalex.org/W4400518754", "title": "If You Don't Understand It, Don't Use It: Eliminating Trojans with Filters Between Layers", "authors": "Adriano Hernandez", "abstract": "Large language models (LLMs) sometimes exhibit dangerous unintended behaviors. Finding and fixing these is challenging because the attack surface is massive -- it is not tractable to exhaustively search for all possible inputs that may elicit such behavior. One specific and particularly challenging case is that if data-poisoning-injected trojans, since there is no way to know what they are to search for them. To our knowledge, there is no generally applicable method to unlearn unknown trojans injected during pre-training. This work seeks to provide a general purpose recipe (filters) and a specific implementation (LoRA) filters that work in practice on small to medium sized models. The focus is primarily empirical, though some perplexing behavior opens the door to the fundamental question of how LLMs store and process information. Not unexpectedly, we find that our filters work best on the residual stream and the latest layers.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.7147305011749268, 2.699164867401123], "openalex_id": "https://openalex.org/W4400481121", "title": "The role of compute thresholds for AI governance", "authors": "Sara Hooker", "abstract": "At face value, this essay is about understanding a fairly esoteric governance tool called compute thresholds. However, in order to grapple with whether these thresholds will achieve anything, we must first understand how they came to be. To do so, we need to engage with a decades-old debate at the heart of computer science progress, namely, is bigger always better? Does a certain inflection point of compute result in changes to the risk profile of a model? Hence, this essay may be of interest not only to policymakers and the wider public but also to computer scientists interested in understanding the role of compute in unlocking breakthroughs. This discussion is timely given the wide adoption of compute thresholds in both the White House Executive Orders on AI Safety (EO) and the EU AI Act to identify more risky systems. A key conclusion of this essay is that compute thresholds, as currently implemented, are shortsighted and likely to fail to mitigate risk. The relationship between compute and risk is highly uncertain and rapidly changing. Relying upon compute thresholds overestimates our ability to predict what abilities emerge at different scales. This essay ends with recommendations for a better way forward.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.251314163208008, 1.1718807220458984], "openalex_id": "https://openalex.org/W4400480600", "title": "Enhancing Computer Programming Education with LLMs: A Study on Effective Prompt Engineering for Python Code Generation", "authors": "Tianyu Wang, Nianjun Zhou, Zhixiong Chen", "abstract": "Large language models (LLMs) and prompt engineering hold significant potential for advancing computer programming education through personalized instruction. This paper explores this potential by investigating three critical research questions: the systematic categorization of prompt engineering strategies tailored to diverse educational needs, the empowerment of LLMs to solve complex problems beyond their inherent capabilities, and the establishment of a robust framework for evaluating and implementing these strategies. Our methodology involves categorizing programming questions based on educational requirements, applying various prompt engineering strategies, and assessing the effectiveness of LLM-generated responses. Experiments with GPT-4, GPT-4o, Llama3-8b, and Mixtral-8x7b models on datasets such as LeetCode and USACO reveal that GPT-4o consistently outperforms others, particularly with the \"multi-step\" prompt strategy. The results show that tailored prompt strategies significantly enhance LLM performance, with specific strategies recommended for foundational learning, competition preparation, and advanced problem-solving. This study underscores the crucial role of prompt engineering in maximizing the educational benefits of LLMs. By systematically categorizing and testing these strategies, we provide a comprehensive framework for both educators and students to optimize LLM-based learning experiences. Future research should focus on refining these strategies and addressing current LLM limitations to further enhance educational outcomes in computer programming instruction.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.552674293518066, 1.2432894706726074], "openalex_id": "https://openalex.org/W4400480625", "title": "Training Task Experts through Retrieval Based Distillation", "authors": "Jiaxin Ge, Xueying Jia, Vijay Viswanathan, Hongyin Luo, Graham Neubig", "abstract": "One of the most reliable ways to create deployable models for specialized tasks is to obtain an adequate amount of high-quality task-specific data. However, for specialized tasks, often such datasets do not exist. Existing methods address this by creating such data from large language models (LLMs) and then distilling such knowledge into smaller models. However, these methods are limited by the quality of the LLMs output, and tend to generate repetitive or incorrect data. In this work, we present Retrieval Based Distillation (ReBase), a method that first retrieves data from rich online sources and then transforms them into domain-specific data. This method greatly enhances data diversity. Moreover, ReBase generates Chain-of-Thought reasoning and distills the reasoning capacity of LLMs. We test our method on 4 benchmarks and results show that our method significantly improves performance by up to 7.8% on SQuAD, 1.37% on MNLI, and 1.94% on BigBench-Hard.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.916937351226807, 0.2474423348903656], "openalex_id": "https://openalex.org/W4400480056", "title": "Recent Advancements and Challenges of Turkic Central Asian Language Processing", "authors": "Yana Veitsman", "abstract": "Research in NLP for Central Asian Turkic languages - Kazakh, Uzbek, Kyrgyz, and Turkmen - faces typical low-resource language challenges like data scarcity, limited linguistic resources and technology development. However, recent advancements have included the collection of language-specific datasets and the development of models for downstream tasks. Thus, this paper aims to summarize recent progress and identify future research directions. It provides a high-level overview of each language's linguistic features, the current technology landscape, the application of transfer learning from higher-resource languages, and the availability of labeled and unlabeled data. By outlining the current state, we hope to inspire and facilitate future research.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.440221786499023, 5.600196838378906], "openalex_id": "https://openalex.org/W4400481048", "title": "Large Language Models Understand Layouts", "authors": "Weiming Li, Manni Duan, Dong An, Yan Shao", "abstract": "Large language models (LLMs) demonstrate extraordinary abilities in a wide range of natural language processing (NLP) tasks. In this paper, we show that, beyond text understanding capability, LLMs are capable of processing text layouts that are denoted by spatial markers. They are able to answer questions that require explicit spatial perceiving and reasoning, while a drastic performance drop is observed when the spatial markers from the original data are excluded. We perform a series of experiments with the GPT-3.5, Baichuan2, Llama2 and ChatGLM3 models on various types of layout-sensitive datasets for further analysis. The experimental results reveal that the layout understanding ability of LLMs is mainly introduced by the coding data for pretraining, which is further enhanced at the instruction-tuning stage. In addition, layout understanding can be enhanced by integrating low-cost, auto-generated data approached by a novel text game. Finally, we show that layout understanding ability is beneficial for building efficient visual question-answering (VQA) systems.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.157896995544434, 2.6005964279174805], "openalex_id": "https://openalex.org/W4400431390", "title": "Large language models in machine learning", "authors": "Xiaonan Xu, Yichao Wu, Penghao Liang, Yuhang He, Han Wang", "abstract": "This paper explores the integration of large language models (LLMs) into collaborative filtering algorithms to enhance recommendation systems in the e-commerce domain. The proposed approach combines user-based and item-based collaborative filtering with LLMs to improve recommendation accuracy and personalization. Specifically, the study introduces a novel framework called PALR, which leverages LLMs to refine user-item interactions and enrich item representations. PALR utilizes historical user behavior data, such as clicks, purchases, and ratings, to guide candidate retrieval and generate recommended items. This study highlights the importance of integrating LLMs into recommendation systems to deliver more accurate and personalized suggestions, ultimately improving user satisfaction and driving sales in e-commerce platforms.", "venue": "Applied and Computational Engineering", "label": 0}, {"loc": [5.13784646987915, 2.1682276725769043], "openalex_id": "https://openalex.org/W4400416639", "title": "The Fill-Mask Association Test (FMAT): Measuring Propositions in Natural Language", "authors": "Han\u2010Wu\u2010Shuang Bao", "abstract": "Recent advances in large language models are enabling the computational intelligent analysis of psychology in natural language. Here, the Fill-Mask Association Test (FMAT) is introduced as a novel and integrative method leveraging Masked Language Models to study and measure psychology from a propositional perspective at the societal level. The FMAT uses Bidirectional Encoder Representations from Transformers (BERT) models to compute semantic probabilities of option words filling in the masked blank of a designed query (i.e., a clozelike contextualized sentence). The current research presents 15 studies that establish the reliability and validity of the FMAT in predicting factual associations (Studies 1A-1C), measuring attitudes/biases (Studies 2A-2D), capturing social stereotypes (Studies 3A-3D), and retrospectively delineating lay perceptions of sociocultural changes over time (Studies 4A-4D). Empirically, the FMAT replicated seminal findings previously obtained with human participants (e.g., the Implicit Association Test) and other big-data text-analytic methods (e.g., word frequency analysis, the Word Embedding Association Test), demonstrating robustness across 12 BERT model variants and diverse training text corpora. Theoretically, the current findings substantiate the propositional (vs. associative) perspective on how semantic associations are represented in natural language. Methodologically, the FMAT allows for more fine-grained language-based psychological measurement, with an R package developed to streamline its workflow for use on broader research questions. (PsycInfo Database Record (c) 2024 APA, all rights reserved).", "venue": "Journal of Personality and Social Psychology", "label": 0}, {"loc": [6.973957061767578, 1.100478172302246], "openalex_id": "https://openalex.org/W4400434225", "title": "LLM-jp: A Cross-organizational Project for the Research and Development of Fully Open Japanese LLMs", "authors": "LLM-jp, NULL AUTHOR_ID, Akiko Aizawa, Eiji Aramaki, Bowen Chen, Fei Cheng, Hiroyuki Deguchi, Rintaro Enomoto, Kazuki Fujii, Kensuke Fukumoto, Takuya Fukushima, Namgi Han, Yuto Harada, Chikara Hashimoto, Tatsuya Hiraoka, Shohei Hisada, Sosuke Hosokawa, Lu Jie, Keisuke Kamata, Teruhito Kanazawa, Masakoto Kanezashi, Hiroshi Kataoka, Satoru Katsumata, Daisuke Kawahara, Seiya Kawano, Atsushi Keyaki, Keisuke Kiryu, Hirokazu Kiyomaru, Takashi Kodama, Takahiro Kubo, Yohei Kuga, Ryoma Kumon, Shuhei Kurita, Sadao Kurohashi, Conglong Li, Taiki Maekawa, Hiroshi Matsuda, Yusuke Miyao, Kentaro Mizuki, Sakae Mizuki, Yugo Murawaki, Ryo Nakamura, Taishi Nakamura, Kouta Nakayama, Tomoka Nakazato, T. Niitsuma, Jiro Nishitoba, Yusuke Oda, Hayato Ogawa, Takumi Okamoto, Naoaki Okazaki, Yohei Oseki, Shintaro Ozaki, Koki Ryu, Rafa\u0142 Rzepka, Keisuke Sakaguchi, Shota Sasaki, Satoshi Sekine, Kohei Suda, Saku Sugawara, Issa Sugiura, Hiroaki Sugiyama, Hisami Suzuki, Jun Suzuki, Toyotaro Suzumura, Kensuke Tachibana, Yu Takagi, Kyosuke Takami, Koichi Takeda, Masashi Takeshita, Masahiro Tanaka, Kenjiro Taura, Arseny Tolmachev, Nobuhiro Ueda, Zhen Wan, Shuntaro Yada, Sakiko Yahata, Yuya Yamamoto, Yusuke Yamauchi, Hitomi Yanaka, Rio Yokota, Koichiro Yoshino", "abstract": "This paper introduces LLM-jp, a cross-organizational project for the research and development of Japanese large language models (LLMs). LLM-jp aims to develop open-source and strong Japanese LLMs, and as of this writing, more than 1,500 participants from academia and industry are working together for this purpose. This paper presents the background of the establishment of LLM-jp, summaries of its activities, and technical reports on the LLMs developed by LLM-jp. For the latest activities, visit https://llm-jp.nii.ac.jp/en/.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.653792142868042, -0.38497790694236755], "openalex_id": "https://openalex.org/W4400411501", "title": "Screening Articles for Systematic Reviews with ChatGPT", "authors": "Eugene Syriani, Istv\u00e1n D\u00e1vid, G. Kiran Kumar", "abstract": "Systematic reviews (SRs) provide valuable evidence for guiding new research directions. However, the manual effort involved in selecting articles for inclusion in an SR is error-prone and time-consuming. While screening articles has traditionally been considered challenging to automate, the advent of large language models offers new possibilities. In this paper, we discuss the effect of using ChatGPT on the SR process. In particular, we investigate the effectiveness of different prompt strategies for automating the article screening process using five real SR datasets. Our results show that ChatGPT can reach up to 82% accuracy. The best performing prompts specify exclusion criteria and avoid negative shots. However, prompts should be adapted to different corpus characteristics.", "venue": "Journal of Computer Languages", "label": 0}, {"loc": [4.033900260925293, -2.408813714981079], "openalex_id": "https://openalex.org/W4403608704", "title": "Unsupervised approach for misinformation detection in Russia-Ukraine war news", "authors": "Nina Khairova, Andrea Galassi, Fabrizio Lo Scudo, Bogdan Ivasiuk, Ivan Redozub", "abstract": "The Russian-Ukrainian war has attracted considerable global attention; however, fake news often obstructs the formation of public opinion and disseminates false information. To address this issue, we have curated the RUWA dataset, comprising over 16,500 news articles covering the pivotal events of the Russian invasion of Ukraine. These articles were sourced from established outlets in the USA, EU, Asia, Ukraine, and Russia, spanning the period from February to September 2022. The paper explores the use of semantic similarity to compare different aspects of articles from various web sources that cover the same events of the war. This unsupervised machine learning approach becomes crucial when obtaining annotated datasets is practically impossible due to the lack of real fact-checking during the ongoing war. The research goal is to uncover the potential of employing semantic similarity measures as a viable approach for detecting misinformation in news articles.", "venue": "https://doi.org/10.31110/colins/2024-4/003", "label": 0}, {"loc": [5.289742946624756, -1.505278468132019], "openalex_id": "https://openalex.org/W4400368879", "title": "Toward Better Education Quality through Students' Sentiment Analysis Using AutoML", "authors": "Corina Simionescu, Daniela Marcu, Marius Silviu M\u0103ciuc\u0103", "abstract": "Sentiment analysis from students' interactions with learning environments is a topic of interest for researchers in the field of education because it can make important contributions to improving the quality of instructional processes through recommendation systems integrated into learning applications, or by improving the quality of courses, by grouping students according to their common interests and providing feedback on school progress. There are two approaches to sentiment analysis: one lexicon-based and another that uses machine learning. In this study, we present a sentiment analysis from two own data sets that represent students' opinions about school. Our goal is to create a model that helps us to automatically label students' opinions, assigning sentiment scores between 0 and 4 (0 for an extremely negative opinion). To train and evaluate the performance of the model, we used opinions collected from 1443 Romanian high school students. The novelty that we propose is the manual labeling system. Our current research which uses a machine learning approach to classify students' opinions obtains an accuracy of 86.507%.", "venue": "BRAIN BROAD RESEARCH IN ARTIFICIAL INTELLIGENCE AND NEUROSCIENCE", "label": 0}, {"loc": [9.19468879699707, -0.8522060513496399], "openalex_id": "https://openalex.org/W4400352977", "title": "Exploring AI-driven approaches for unstructured document analysis and future horizons", "authors": "Supriya V. Mahadevkar, Shruti Patil, Ketan Kotecha, Lim Way Soong, Tanupriya Choudhury", "abstract": "Abstract In the current industrial landscape, a significant number of sectors are grappling with the challenges posed by unstructured data, which incurs financial losses amounting to millions annually. If harnessed effectively, this data has the potential to substantially boost operational efficiency. Traditional methods for extracting information have their limitations; however, solutions powered by artificial intelligence (AI) could provide a more fitting alternative. There is an evident gap in scholarly research concerning a comprehensive evaluation of AI-driven techniques for the extraction of information from unstructured content. This systematic literature review aims to identify, assess, and deliberate on prospective research directions within the field of unstructured document information extraction. It has been observed that prevailing extraction methods primarily depend on static patterns or rules, often proving inadequate when faced with complex document structures typically encountered in real-world scenarios, such as medical records. Datasets currently available to the public suffer from low quality and are tailored for specific tasks only. This underscores an urgent need for developing new datasets that accurately reflect complex issues encountered in practical settings. The review reveals that AI-based techniques show promise in autonomously extracting information from diverse unstructured documents, encompassing both printed and handwritten text. Challenges arise, however, when dealing with varied document layouts. Proposing a framework through hybrid AI-based approaches, this review envisions processing a high-quality dataset for automatic information extraction from unstructured documents. Additionally, it emphasizes the importance of collaborative efforts between organizations and researchers to address the diverse challenges associated with unstructured data analysis.", "venue": "Journal Of Big Data", "label": 0}, {"loc": [7.2990217208862305, -1.0866965055465698], "openalex_id": "https://openalex.org/W4400367311", "title": "Dynamic decoding and dual synthetic data for automatic correction of grammar in low-resource scenario", "authors": "Ahmad Musyafa, Ying Gao, Aiman Solyman, Siraj Khan, Wentian Cai, Muhammad Faizan Khan", "abstract": "Grammar error correction systems are pivotal in the field of natural language processing (NLP), with a primary focus on identifying and correcting the grammatical integrity of written text. This is crucial for both language learning and formal communication. Recently, neural machine translation (NMT) has emerged as a promising approach in high demand. However, this approach faces significant challenges, particularly the scarcity of training data and the complexity of grammar error correction (GEC), especially for low-resource languages such as Indonesian. To address these challenges, we propose InSpelPoS, a confusion method that combines two synthetic data generation methods: the Inverted Spellchecker and Patterns+POS. Furthermore, we introduce an adapted seq2seq framework equipped with a dynamic decoding method and state-of-the-art Transformer-based neural language models to enhance the accuracy and efficiency of GEC. The dynamic decoding method is capable of navigating the complexities of GEC and correcting a wide range of errors, including contextual and grammatical errors. The proposed model leverages the contextual information of words and sentences to generate a corrected output. To assess the effectiveness of our proposed framework, we conducted experiments using synthetic data and compared its performance with existing GEC systems. The results demonstrate a significant improvement in the accuracy of Indonesian GEC compared to existing methods.", "venue": "PeerJ Computer Science", "label": 4}, {"loc": [8.660888671875, 0.3116537630558014], "openalex_id": "https://openalex.org/W4400373837", "title": "Ground Every Sentence: Improving Retrieval-Augmented LLMs with Interleaved Reference-Claim Generation", "authors": "Sirui Xia, Xintao Wang, Jiaqing Liang, Yifei Zhang, Weikang Zhou, Jiaji Deng, Fei Yu, Yanghua Xiao", "abstract": "Retrieval-Augmented Generation (RAG) has been widely adopted to enhance Large Language Models (LLMs) in knowledge-intensive tasks. To enhance credibility and verifiability in RAG systems, Attributed Text Generation (ATG) is proposed, which provides citations to retrieval knowledge in LLM-generated responses. Prior methods mainly adopt coarse-grained attributions, with passage-level or paragraph-level references or citations, which fall short in verifiability. This paper proposes ReClaim (Refer & Claim), a fine-grained ATG method that alternates the generation of references and answers step by step. Different from previous coarse-grained attribution, ReClaim provides sentence-level citations in long-form question-answering tasks. With extensive experiments, we verify the effectiveness of ReClaim in extensive settings, achieving a citation accuracy rate of 90%.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.790251731872559, 2.467881679534912], "openalex_id": "https://openalex.org/W4400342789", "title": "RegMix: Data Mixture as Regression for Language Model Pre-training", "authors": "Qian Liu, Xiaosen Zheng, Niklas Muennighoff, Guangtao Zeng, Longxu Dou, Tianyu Pang, Jing Jiang, Min Lin", "abstract": "The data mixture for large language model pre-training significantly impacts performance, yet how to determine an effective mixture remains unclear. We propose RegMix to automatically identify a high-performing data mixture by formulating it as a regression task. RegMix trains many small models on diverse data mixtures, uses regression to predict performance of unseen mixtures, and applies the best predicted mixture to train a large-scale model with orders of magnitude more compute. To empirically validate RegMix, we train 512 models with 1M parameters for 1B tokens to fit the regression model and predict the best data mixture. Using this mixture we train a 1B parameter model for 25B tokens (i.e. 1000x larger and 25x longer) which we find performs best among 64 candidate 1B parameter models with other mixtures. Furthermore, RegMix consistently outperforms human selection in experiments involving models up to 7B models trained on 100B tokens, while matching or exceeding DoReMi using just 10% of the computational resources. Our experiments also show that (1) Data mixtures significantly impact performance; (2) Web corpora rather than data perceived as high-quality like Wikipedia have the strongest positive correlation with downstream performance; (3) Domains interact in complex ways often contradicting common sense, thus automatic approaches like RegMix are needed; (4) Data mixture effects transcend scaling laws. Our code is available at https://github.com/sail-sg/regmix.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.192038536071777, 2.486023187637329], "openalex_id": "https://openalex.org/W4400267466", "title": "Brevity is the soul of wit: Pruning long files for code generation", "authors": "Aaditya K. Singh, Yang Yu, Kushal Tirumala, Mostafa Elhoushi, Ari S. Morcos", "abstract": "Data curation is commonly considered a \"secret-sauce\" for LLM training, with higher quality data usually leading to better LLM performance. Given the scale of internet-scraped corpora, data pruning has become a larger and larger focus. Specifically, many have shown that de-duplicating data, or sub-selecting higher quality data, can lead to efficiency or performance improvements. Generally, three types of methods are used to filter internet-scale corpora: embedding-based, heuristic-based, and classifier-based. In this work, we contrast the former two in the domain of finetuning LLMs for code generation. We find that embedding-based methods are often confounded by length, and that a simple heuristic--pruning long files--outperforms other methods in compute-limited regimes. Our method can yield up to a 2x efficiency benefit in training (while matching performance) or a 3.5% absolute performance improvement on HumanEval (while matching compute). However, we find that perplexity on held-out long files can increase, begging the question of whether optimizing data mixtures for common coding benchmarks (HumanEval, MBPP) actually best serves downstream use cases. Overall, we hope our work builds useful intuitions about code data (specifically, the low quality of extremely long code files) provides a compelling heuristic-based method for data pruning, and brings to light questions in how we evaluate code generation models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.696314334869385, 5.069935321807861], "openalex_id": "https://openalex.org/W4400341532", "title": "Look Ahead or Look Around? A Theoretical Comparison Between Autoregressive and Masked Pretraining", "authors": "Qi Zhang, Tianqi Du, Haotian Huang, Yifei Wang, Yisen Wang", "abstract": "In recent years, the rise of generative self-supervised learning (SSL) paradigms has exhibited impressive performance across visual, language, and multi-modal domains. While the varied designs of generative SSL objectives lead to distinct properties in downstream tasks, a theoretical understanding of these differences remains largely unexplored. In this paper, we establish the first theoretical comparisons between two leading generative SSL paradigms: autoregressive SSL and masked SSL. Through establishing theoretical frameworks, we elucidate the strengths and limitations of autoregressive and masked SSL within the primary evaluation tasks of classification and content generation. Our findings demonstrate that in classification tasks, the flexibility of targeted tokens in masked SSL fosters more inter-sample connections compared to the fixed position of target tokens in autoregressive SSL, which yields superior clustering performance. In content generation tasks, the misalignment between the flexible lengths of test samples and the fixed length of unmasked texts in masked SSL (vs. flexible lengths of conditional texts in autoregressive SSL) hinders its generation performance. To leverage each other's strengths and mitigate weaknesses, we propose diversity-enhanced autoregressive and variable-length masked objectives, which substantially improve the classification performance of autoregressive SSL and the generation performance of masked SSL. Code is available at https://github.com/PKU-ML/LookAheadLookAround.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.894352912902832, -1.6426975727081299], "openalex_id": "https://openalex.org/W4402015863", "title": "From Data Quality to Model Performance: Navigating the Landscape of Deep Learning Model Evaluation", "authors": "Kongyao Huang, Y Zhou, Xiehua Yu, Xiaohong Su", "abstract": "In the current economic landscape, the growing importance of innovation and entrepreneurship underscores an urgent need for accurate market trend prediction. Addressing this challenge, our study introduces an innovative entrepreneurial market trend prediction model based on deep learning principles. Through detailed case studies and performance evaluations, this paper demonstrates the model's effectiveness and its potential to enhance decision-making capabilities in a competitive business environment. Accurate market trend prediction is crucial in the fields of innovation and entrepreneurship, and our approach meets this demand. Our model leverages the power of deep learning technology, combining historical market data with diverse market indicators, including sentiment analysis derived from social media, to create an advanced predictive model that surpasses traditional methods. By analyzing data from multiple channels, our model exhibits exceptional accuracy in forecasting future market trends. The case study provides strong evidence of our model's performance and precision, showcasing its significant support for innovators and entrepreneurs navigating complex market trends. Furthermore, this study highlights the vast potential of deep learning technology in the economic sector. We emphasize the importance of developing innovative entrepreneurial market trend prediction models and foresee an increase in project success rates for innovators and entrepreneurs by enhancing decision quality through the adoption of deep learning.", "venue": "Science Progress", "label": 0}, {"loc": [9.581329345703125, 1.6999098062515259], "openalex_id": "https://openalex.org/W4400190174", "title": "Table Representation Learning", "authors": "Willy Carlos Tchuitcheu, Tan Lu, Ann Dooms", "abstract": "Tables, especially when having complex layouts, contain rich semantic information. However, effectively learning from tables to uncover such semantic information remains challenging. The rapid progress in natural language processing does not necessarily correspond to equivalent advancements in table parsing, which often requires joint visual and language modeling. Indeed, humans can quickly derive semantic meaning from table entries by associating them with corresponding column and/or row headers. Motivated by this observation, we propose a new heterogeneous Graph-based Table Representation Learning (GTRL) framework. GTRL combines graph-based visual modeling with sequence-based language modeling to learn granular per-cell embeddings that are sensitive to the semantic meaning of cells within their corresponding table context. We systematically evaluate the proposed GTRL framework using two datasets: a new adhesive table benchmark comprising complex tables extracted from industrial documents for learning per-entry semantics, and a publicly available large-scale dataset that enables learning header semantics from column tables. Experimental results demonstrate the competitive performance of the proposed GTRL, which often exhibits reduced computational complexity compared to state-of-the-art table representation learning models.", "venue": "Pattern Recognition", "label": 0}, {"loc": [7.859871864318848, 3.283555269241333], "openalex_id": "https://openalex.org/W4400222118", "title": "Single Parent Family: A Spectrum of Family Members from a Single Pre-Trained Foundation Model", "authors": "Habib Hajimolahoseini, Mohammad Hassanpour, Foozhan Ataiefard, Boxing Chen, Yang Liu", "abstract": "This paper introduces a novel method of Progressive Low Rank Decomposition (PLRD) tailored for the compression of large language models. Our approach leverages a pre-trained model, which is then incrementally decompressed to smaller sizes using progressively lower ranks. This method allows for significant reductions in computational overhead and energy consumption, as subsequent models are derived from the original without the need for retraining from scratch. We detail the implementation of PLRD, which strategically decreases the tensor ranks, thus optimizing the trade-off between model performance and resource usage. The efficacy of PLRD is demonstrated through extensive experiments showing that models trained with PLRD method on only 1B tokens maintain comparable performance with traditionally trained models while using 0.1% of the tokens. The versatility of PLRD is highlighted by its ability to generate multiple model sizes from a single foundational model, adapting fluidly to varying computational and memory budgets. Our findings suggest that PLRD could set a new standard for the efficient scaling of LLMs, making advanced AI more feasible on diverse platforms.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.392645835876465, 1.7150894403457642], "openalex_id": "https://openalex.org/W4400223551", "title": "YuLan: An Open-source Large Language Model", "authors": "Yutao Zhu, Kun Zhou, Kelong Mao, Wentong Chen, Yiding Sun, Zhipeng Chen, Qian Cao, Yihan Wu, Yushuo Chen, Feng Wang, Lei Zhang, Junyi Li, Xiaolei Wang, Lei Wang, Beichen Zhang, Zican Dong, Xiaoxue Cheng, Yuhan Chen, Xinyu Tang, Yupeng Hou, Qiangqiang Ren, X Pang, Shufang Xie, Wayne Xin Zhao, Zhicheng Dou, Jiaxin Mao, Yankai Lin, Ruihua Song, Jun Xu, Xu Chen, Rui Yan, Zhewei Wei, Di Hu, Wenbing Huang, Zefeng Gao, Yueguo Chen, Weizheng Lu, Ji-Rong Wen", "abstract": "Large language models (LLMs) have become the foundation of many applications, leveraging their extensive capabilities in processing and understanding natural language. While many open-source LLMs have been released with technical reports, the lack of training details hinders further research and development. This paper presents the development of YuLan, a series of open-source LLMs with $12$ billion parameters. The base model of YuLan is pre-trained on approximately $1.7$T tokens derived from a diverse corpus, including massive English, Chinese, and multilingual texts. We design a three-stage pre-training method to enhance YuLan's overall capabilities. Subsequent phases of training incorporate instruction-tuning and human alignment, employing a substantial volume of high-quality synthesized data. To facilitate the learning of complex and long-tail knowledge, we devise a curriculum-learning framework throughout across these stages, which helps LLMs learn knowledge in an easy-to-hard manner. YuLan's training is finished on Jan, 2024 and has achieved performance on par with state-of-the-art LLMs across various English and Chinese benchmarks. This paper outlines a comprehensive technical roadmap for developing LLMs from scratch. Our model and codes are available at https://github.com/RUC-GSAI/YuLan-Chat.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.640615463256836, 2.8668575286865234], "openalex_id": "https://openalex.org/W4402043003", "title": "LITS: An Optimized Learned Index for Strings", "authors": "Yifan Yang, Shimin Chen", "abstract": "Index is an important component in database systems. Learned indexes have been shown to outperform traditional tree-based index structures for fixed-sized integer or floating point keys. However, the application of the learned solution to variable-length string keys is under-researched. Our experiments show that existing learned indexes for strings fail to outperform traditional string indexes, such as HOT and ART. String keys are long and variable sized, and often contain skewed prefixes, which make the last-mile search expensive, and adversely impact the capability of learned models to capture the skewed distribution of string keys. In this paper, we propose a novel learned index for string keys, LITS (Learned Index with Hash-enhanced Prefix Table and Subtries). We propose an optimized learned model, combining a global Hash-enhanced Prefix Table (HPT) and a per-node local linear model to better distinguish string keys. Moreover, LITS exploits compact leaf nodes and hybrid structures with a PMSS model for efficient point and range operations. Our experimental results using eleven string data sets show that LITS achieves up to 2.43x and 2.27x improvement over HOT and ART for point operations, and attains comparable scan performance.", "venue": "Proceedings of the VLDB Endowment", "label": 21}, {"loc": [6.908576011657715, 3.2989208698272705], "openalex_id": "https://openalex.org/W4400484670", "title": "Digital Protection and Innovative Development Path of Red Culture Resources Based on Distributed Machine Learning Supported by Intelligent Information", "authors": "Min Huang, Xinyu Zeng", "abstract": "As a product of the revolutionary war years, red culture, with its strong vitality, strong cohesion and extraordinary charm, with its incomparable positive energy, resists vulgar and flattering culture, promotes people to rebuild their faith, purify their minds, stimulate their motivation, and promote the process of cultural power. Yan\u2019an, represented by red culture, is rich in resources. This is the holy land of Chinese revolution, the first batch of famous historical and cultural cities named by the State Council, and the three major education bases of patriotism, revolutionary tradition, and Yan\u2019an spirit. The development and utilization of such resources have great political, cultural, educational and economic values. This research is based on the development of red culture, and uses the distributed machine learning system to realize in the system architecture of parameter server. In the distributed system set in this study, node downtime and network interruption are random. When the parameter server system adopts static scheduling, it leads to poor scalability and robustness. The experimental results show that under the intelligent simulation of machine learning system, the development of red culture resources meets the expected assumptions, and the accuracy of the model is relatively high.", "venue": "Journal of Combinatorial Mathematics and Combinatorial Computing", "label": 0}, {"loc": [3.5228331089019775, 2.3258213996887207], "openalex_id": "https://openalex.org/W4396822010", "title": "The Power of Absence: Thinking with Archival Theory in Algorithmic Design", "authors": "Jihan Sherman, Romi Ron Morrison, Lauren Klein, Daniela K. Rosner", "abstract": "This paper explores the value of archival theory as a means of grappling with\\nbias in algorithmic design. Rather than seek to mitigate biases perpetuated by\\ndatasets and algorithmic systems, archival theory offers a reframing of bias\\nitself. Drawing on a range of archival theory from the fields of history,\\nliterary and cultural studies, Black studies, and feminist STS, we propose\\nabsence-as power, presence, and productive-as a concept that might more\\nsecurely anchor investigations into the causes of algorithmic bias, and that\\ncan prompt more capacious, creative, and joyful future work. This essay, in\\nturn, can intervene into the technical as well as the social, historical, and\\npolitical structures that serve as sources of bias.\\n", "venue": "Designing Interactive Systems Conference", "label": 0}, {"loc": [7.165111064910889, 3.607672691345215], "openalex_id": "https://openalex.org/W4401211859", "title": "The Case For Data Centre Hyperloops", "authors": "Guillem L\u00f3pez-Parad\u0131\u0301s, Isaac M. Hair, Sid Kannan, Roman Rabbat, Parker Murray, Alex Lopes, Rory Zahedi, Winston Zuo, Jonathan Balkind", "abstract": "Data movement is a hot-button topic today, with workloads like machine learning (ML) training, graph processing, and data analytics consuming datasets as large as 30PB. Such a dataset would take almost a week to transfer at 400gbps while consuming megajoules of energy just to operate the two endpoints\u2019 optical transceivers. All of this time and energy is seen as an unavoidable overhead on top of directly accessing the disks that store the data. In this paper, we re-evaluate the fundamental assumption of networked data copying and instead propose the adoption of embodied data movement. Our insight is that solid state disks (SSDs) have been rapidly growing in an under-exploited way: their data density, both in TB per unit volume and unit mass. With data centres reaching kilometres in length, we propose a new architecture featuring data centre hyperloops2 (DHLs) where large datasets, stored on commodity SSDs, are moved via magnetic levitation in low-pressure tubes. By eliminating much of the potential friction inherent to embodied data movement, DHLs offer more efficient data movement, with SSDs potentially travelling at hundreds of metres per second. Consequently, a contemporary dataset can be moved through a DHL in seconds and then accessed with local latency and bandwidth well into the terabytes per second. DHLs have the potential to massively reduce the network bandwidth and energy consumption associated with moving large datasets, but raise a variety of questions regarding the viability of their realisation and deployment. Through flexibility and creative engineering, we argue that many potential issues can be resolved. Further, we present models of DHLs and their application to workloads with growing data movement demands, such as training machine learning algorithms, large-scale physics experiments, and data centre backups. For a fixed data movement task, we obtain energy reductions of 1.6\u00d7 to 376.1\u00d7 and time speedups from 114.8\u00d7 to 646.4\u00d7 versus 400gbps optical networking. When modelling DHL in simulation, we obtain time speedups of between 5.7\u00d7 and 118\u00d7 (iso-power) and communication power reductions of between 6.4\u00d7 and 135\u00d7 (iso-time) to train an iteration of a representative DLRM workload. We provide a cost analysis, showing that DHLs are financially practical. With the scale of the improvements realisable through DHLs, we consider this paper a call to action for our community to grapple with the remaining architectural challenges.", "venue": "http://doi.org/10.1109/isca59077.2024.00026", "label": 0}, {"loc": [7.300798416137695, 2.4006948471069336], "openalex_id": "https://openalex.org/W4400142081", "title": "Suri: Multi-constraint Instruction Following for Long-form Text Generation", "authors": "Chau Pham, Simeng Sun, Mohit Iyyer", "abstract": "Existing research on instruction following largely focuses on tasks with simple instructions and short responses. In this work, we explore multi-constraint instruction following for generating long-form text. We create Suri, a dataset with 20K human-written long-form texts paired with LLM-generated backtranslated instructions that contain multiple complex constraints. Because of prohibitive challenges associated with collecting human preference judgments on long-form texts, preference-tuning algorithms such as DPO are infeasible in our setting; thus, we propose Instructional ORPO (I-ORPO), an alignment method based on the ORPO algorithm. Instead of receiving negative feedback from dispreferred responses, I-ORPO obtains negative feedback from synthetically corrupted instructions generated by an LLM. Using Suri, we perform supervised and I-ORPO fine-tuning on Mistral-7b-Instruct-v0.2. The resulting models, Suri-SFT and Suri-I-ORPO, generate significantly longer texts (~5K tokens) than base models without significant quality deterioration. Our human evaluation shows that while both SFT and I-ORPO models satisfy most constraints, Suri-I-ORPO generations are generally preferred for their coherent and informative incorporation of the constraints. We release our code at https://github.com/chtmp223/suri.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.329473495483398, 2.6548712253570557], "openalex_id": "https://openalex.org/W4400141846", "title": "Fairness and Bias in Multimodal AI: A Survey", "authors": "Tosin Adewumi, Lama Alkhaled, Namrata Gurung, Goya van Boven, Irene Pagliai", "abstract": "The importance of addressing fairness and bias in artificial intelligence (AI) systems cannot be over-emphasized. Mainstream media has been awashed with news of incidents around stereotypes and other types of bias in many of these systems in recent years. In this survey, we fill a gap with regards to the relatively minimal study of fairness and bias in Large Multimodal Models (LMMs) compared to Large Language Models (LLMs), providing 50 examples of datasets and models related to both types of AI along with the challenges of bias affecting them. We discuss the less-mentioned category of mitigating bias, preprocessing (with particular attention on the first part of it, which we call preuse). The method is less-mentioned compared to the two well-known ones in the literature: intrinsic and extrinsic mitigation methods. We critically discuss the various ways researchers are addressing these challenges. Our method involved two slightly different search queries on two reputable search engines, Google Scholar and Web of Science (WoS), which revealed that for the queries 'Fairness and bias in Large Multimodal Models' and 'Fairness and bias in Large Language Models', 33,400 and 538,000 links are the initial results, respectively, for Scholar while 4 and 50 links are the initial results, respectively, for WoS. For reproducibility and verification, we provide links to the search results and the citations to all the final reviewed papers. We believe this work contributes to filling this gap and providing insight to researchers and other stakeholders on ways to address the challenges of fairness and bias in multimodal and language AI.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.6456573009490967, 3.108691453933716], "openalex_id": "https://openalex.org/W4400120604", "title": "External Attack-Surface of Modern Organizations", "authors": "Nethanel Gelernter, Haya Schulmann, Michael Waidner", "abstract": "589", "venue": "https://doi.org/10.1145/3634737.3656295", "label": 0}, {"loc": [7.551793098449707, 1.6433147192001343], "openalex_id": "https://openalex.org/W4400065085", "title": "The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale", "authors": "Guilherme Penedo, Hynek Kydl\u00ed\u010dek, Loubna Ben Allal, Anton Lozhkov, Margaret Mitchell, Colin Raffel, Leandro Von Werra, Thomas Wolf", "abstract": "The performance of a large language model (LLM) depends heavily on the quality and size of its pretraining dataset. However, the pretraining datasets for state-of-the-art open LLMs like Llama 3 and Mixtral are not publicly available and very little is known about how they were created. In this work, we introduce FineWeb, a 15-trillion token dataset derived from 96 Common Crawl snapshots that produces better-performing LLMs than other open pretraining datasets. To advance the understanding of how best to curate high-quality pretraining datasets, we carefully document and ablate all of the design choices used in FineWeb, including in-depth investigations of deduplication and filtering strategies. In addition, we introduce FineWeb-Edu, a 1.3-trillion token collection of educational text filtered from FineWeb. LLMs pretrained on FineWeb-Edu exhibit dramatically better performance on knowledge- and reasoning-intensive benchmarks like MMLU and ARC. Along with our datasets, we publicly release our data curation codebase and all of the models trained during our ablation experiments.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.71054744720459, 0.4839559495449066], "openalex_id": "https://openalex.org/W4400064518", "title": "A Complete Survey on LLM-based AI Chatbots", "authors": "Sumit Kumar Dam, Choong Seon Hong, Yu Qiao, Chaoning Zhang", "abstract": "The past few decades have witnessed an upsurge in data, forming the foundation for data-hungry, learning-based AI technology. Conversational agents, often referred to as AI chatbots, rely heavily on such data to train large language models (LLMs) and generate new content (knowledge) in response to user prompts. With the advent of OpenAI's ChatGPT, LLM-based chatbots have set new standards in the AI community. This paper presents a complete survey of the evolution and deployment of LLM-based chatbots in various sectors. We first summarize the development of foundational chatbots, followed by the evolution of LLMs, and then provide an overview of LLM-based chatbots currently in use and those in the development phase. Recognizing AI chatbots as tools for generating new knowledge, we explore their diverse applications across various industries. We then discuss the open challenges, considering how the data used to train the LLMs and the misuse of the generated knowledge can cause several issues. Finally, we explore the future outlook to augment their efficiency and reliability in numerous applications. By addressing key milestones and the present-day context of LLM-based chatbots, our survey invites readers to delve deeper into this realm, reflecting on how their next generation will reshape conversational AI.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.493839263916016, 3.581035614013672], "openalex_id": "https://openalex.org/W4400064855", "title": "BlockLLM: Memory-Efficient Adaptation of LLMs by Selecting and Optimizing the Right Coordinate Blocks", "authors": "Amrutha Varshini Ramesh, Vignesh Ganapathiraman, Issam Laradji, Mark Schmidt", "abstract": "Training large language models (LLMs) for pretraining or adapting to new tasks and domains has become increasingly critical as their applications expand. However, as the model and the data sizes grow, the training process presents significant memory challenges, often requiring a prohibitive amount of GPU memory that may not be readily available. Existing methods such as low-rank adaptation (LoRA) add trainable low-rank matrix factorizations, altering the training dynamics and limiting the model's parameter search to a low-rank subspace. GaLore, a more recent method, employs Gradient Low-Rank Projection to reduce the memory footprint, in the full parameter training setting. However GaLore can only be applied to a subset of the LLM layers that satisfy the \"reversibility\" property, thus limiting their applicability. In response to these challenges, we introduce BlockLLM, an approach inspired by block coordinate descent. Our method carefully selects and updates a very small subset of the trainable parameters without altering any part of its architecture and training procedure. BlockLLM achieves state-of-the-art performance in both finetuning and pretraining tasks, while reducing the memory footprint of the underlying optimization process. Our experiments demonstrate that fine-tuning with only less than 5% of the parameters, BlockLLM achieves state-of-the-art perplexity scores on the GLUE benchmarks. On Llama model pretrained on C4 dataset, BlockLLM is able to train with significantly less memory than the state-of-the-art, while still maintaining competitive performance.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.879006385803223, 0.4562187194824219], "openalex_id": "https://openalex.org/W4400064915", "title": "Native Design Bias: Studying the Impact of English Nativeness on Language Model Performance", "authors": "Manon Reusens, Philipp Borchert, Jochen De Weerdt, Bart Baesens", "abstract": "Large Language Models (LLMs) excel at providing information acquired during pretraining on large-scale corpora and following instructions through user prompts. This study investigates whether the quality of LLM responses varies depending on the demographic profile of users. Considering English as the global lingua franca, along with the diversity of its dialects among speakers of different native languages, we explore whether non-native English speakers receive lower-quality or even factually incorrect responses from LLMs more frequently. Our results show that performance discrepancies occur when LLMs are prompted by native versus non-native English speakers and persist when comparing native speakers from Western countries with others. Additionally, we find a strong anchoring effect when the model recognizes or is made aware of the user's nativeness, which further degrades the response quality when interacting with non-native speakers. Our analysis is based on a newly collected dataset with over 12,000 unique annotations from 124 annotators, including information on their native language and English proficiency.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.158400058746338, 1.0133639574050903], "openalex_id": "https://openalex.org/W4400048050", "title": "A support system for the detection of abusive clauses in B2C contracts", "authors": "S\u0142awomir Dadas, Marek Koz\u0142owski, Rafa\u0142 Po\u015bwiata, Micha\u0142 Pere\u0142kiewicz, Marcin Bia\u0142as, Ma\u0142gorzata Gr\u0119bowiec", "abstract": "Abstract Many countries employ systemic methods of protecting consumers from unfair business practices. One such practice is the use of abusive clauses in business-to-consumer (B2C) contracts, which unfairly impose additional obligations on the consumer or deprive them of their due rights. This article presents an information system that utilizes artificial intelligence methods to automate contract analysis and to detect abusive clauses. The goal of the system is to support the entire administrative process, from contract acquisition, through text extraction and the recommendation of potentially abusive clauses, to the generation of official administrative documents that can be sent to court or to the owners of firms. This article focuses on the components that use machine learning methods. The first is an intelligent crawler that is responsible for automatically detecting contract templates on websites and retrieving them into the system. The second is a document analysis module that implements a clause recommendation algorithm. The algorithm employs transformer-based language models and information retrieval methods to identify abusive passages in text. Our solution achieved first place in a competition on the automatic analysis of B2C contracts organized by the Polish Office of Competition and Consumer Protection (UOKiK), and has since been implemented as an official tool to support the contract analysis process in Poland.", "venue": "Artificial Intelligence and Law", "label": 0}, {"loc": [2.641230583190918, 2.8291566371917725], "openalex_id": "https://openalex.org/W4400024719", "title": "The Responsible Foundation Model Development Cheatsheet: A Review of Tools & Resources", "authors": "Shayne Longpre, Stella Biderman, Alon Albalak, Hailey Schoelkopf, Daniel McDuff, Sayash Kapoor, Kevin Klyman, Kyle Lo, Gabriel Ilharco, Nay San, Maribeth Rauh, Aviya Skowron, Bertie Vidgen, Laura Weidinger, Arvind Narayanan, Victor Sanh, David Ifeoluwa Adelani, Percy Liang, Rishi Bommasani, Peter Henderson, Sasha Luccioni, Yacine Jernite, Luca Soldaini", "abstract": "Foundation model development attracts a rapidly expanding body of contributors, scientists, and applications. To help shape responsible development practices, we introduce the Foundation Model Development Cheatsheet: a growing collection of 250+ tools and resources spanning text, vision, and speech modalities. We draw on a large body of prior work to survey resources (e.g. software, documentation, frameworks, guides, and practical tools) that support informed data selection, processing, and understanding, precise and limitation-aware artifact documentation, efficient model training, advance awareness of the environmental impact from training, careful model evaluation of capabilities, risks, and claims, as well as responsible model release, licensing and deployment practices. We hope this curated collection of resources helps guide more responsible development. The process of curating this list, enabled us to review the AI development ecosystem, revealing what tools are critically missing, misused, or over-used in existing practices. We find that (i) tools for data sourcing, model evaluation, and monitoring are critically under-serving ethical and real-world needs, (ii) evaluations for model safety, capabilities, and environmental impact all lack reproducibility and transparency, (iii) text and particularly English-centric analyses continue to dominate over multilingual and multi-modal analyses, and (iv) evaluation of systems, rather than just models, is needed so that capabilities and impact are assessed in context.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.412321090698242, 3.5483789443969727], "openalex_id": "https://openalex.org/W4400024155", "title": "Building on Efficient Foundations: Effectively Training LLMs with Structured Feedforward Layers", "authors": "Xiuying Wei, Skander Moalla, Razvan Pascanu, \u00c7a\u011flar G\u00fcl\u00e7ehre", "abstract": "State-of-the-art results in large language models (LLMs) often rely on scale, which becomes computationally expensive. This has sparked a research agenda to reduce these models' parameter counts and computational costs without significantly impacting their performance. Our study focuses on transformer-based LLMs, specifically targeting the computationally intensive feedforward networks (FFNs), which are less studied than attention blocks. We consider three structured linear parameterizations of the FFN using efficient low-rank and block-diagonal matrices. In contrast to many previous works that examined these approximations, our study i) explores these structures from a training-from-scratch perspective, ii) scales up to 1.3B parameters, and iii) is conducted within recent Transformer-based LLMs rather than convolutional architectures. We demonstrate that these structures can lead to actual computational gains in various scenarios, including online decoding when using a pre-merge technique. Additionally, we propose a novel training regime, called \\textit{self-guided training}, aimed at improving the poor training dynamics that these approximations exhibit when used from initialization. Interestingly, the scaling performance of structured matrices is explored, revealing steeper curves in scaling training FLOPs, along with a favorable scaling trend in the overtraining regime. Specifically, we show that wide and structured networks can utilize training FLOPs more efficiently, with fewer parameters and lower loss than dense models at their optimal trade-off. Our code is available at \\url{https://github.com/CLAIRE-Labo/StructuredFFN/tree/main}.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.220617294311523, 1.7283062934875488], "openalex_id": "https://openalex.org/W4400024636", "title": "Task Oriented In-Domain Data Augmentation", "authors": "Xiao Liang, Xinyu Hu, Simiao Zuo, Yeyun Gong, Qiang Lou, Yi Liu, Shao\u2010Lun Huang, Jian Jiao", "abstract": "Large Language Models (LLMs) have shown superior performance in various applications and fields. To achieve better performance on specialized domains such as law and advertisement, LLMs are often continue pre-trained on in-domain data. However, existing approaches suffer from two major issues. First, in-domain data are scarce compared with general domain-agnostic data. Second, data used for continual pre-training are not task-aware, such that they may not be helpful to downstream applications. We propose TRAIT, a task-oriented in-domain data augmentation framework. Our framework is divided into two parts: in-domain data selection and task-oriented synthetic passage generation. The data selection strategy identifies and selects a large amount of in-domain data from general corpora, and thus significantly enriches domain knowledge in the continual pre-training data. The synthetic passages contain guidance on how to use domain knowledge to answer questions about downstream tasks. By training on such passages, the model aligns with the need of downstream applications. We adapt LLMs to two domains: advertisement and math. On average, TRAIT improves LLM performance by 8% in the advertisement domain and 7.5% in the math domain.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.513578414916992, -1.0518220663070679], "openalex_id": "https://openalex.org/W4400022724", "title": "Ladder: A Model-Agnostic Framework Boosting LLM-based Machine Translation to the Next Level", "authors": "Zhaopeng Feng, Ruizhe Chen, Yan Zhang, Zijie Meng, Zuozhu Liu", "abstract": "General-purpose Large Language Models (LLMs) like GPT-4 have achieved remarkable advancements in machine translation (MT) by leveraging extensive web content. On the other hand, translation-specific LLMs are built by pre-training on domain-specific monolingual corpora and fine-tuning with human-annotated translation data. Despite the superior performance, these methods either demand an unprecedented scale of computing and data or substantial human editing and annotation efforts. In this paper, we develop MT-Ladder, a novel model-agnostic and cost-effective tool to refine the performance of general LLMs for MT. MT-Ladder is trained on pseudo-refinement triplets which can be easily obtained from existing LLMs without additional human cost. During training, we propose a hierarchical fine-tuning strategy with an easy-to-hard schema, improving MT-Ladder's refining performance progressively. The trained MT-Ladder can be seamlessly integrated with any general-purpose LLMs to boost their translation performance. By utilizing Gemma-2B/7B as the backbone, MT-Ladder-2B can elevate raw translations to the level of top-tier open-source models (e.g., refining BigTranslate-13B with +6.91 BLEU and +3.52 COMET for XX-En), and MT-Ladder-7B can further enhance model performance to be on par with the state-of-the-art GPT-4. Extensive ablation and analysis corroborate the effectiveness of MT-Ladder in diverse settings. Our code is available at https://github.com/fzp0424/MT-Ladder", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.543784141540527, -0.520706057548523], "openalex_id": "https://openalex.org/W4400024831", "title": "USDC: A Dataset of $\\underline {U} $ ser $\\underline {S} $ tance and $\\underline {D} $ ogmatism in Long $\\underline {C} $ onversations", "authors": "Mounika Marreddy, Subba Reddy Oota, Venkata Charan Chinni, Manish Gupta, Lucie Flek", "abstract": "Analyzing user opinion changes in long conversation threads is extremely critical for applications like enhanced personalization, market research, political campaigns, customer service, targeted advertising, and content moderation. Unfortunately, previous studies on stance and dogmatism in user conversations have focused on training models using datasets annotated at the post level, treating each post as independent and randomly sampling posts from conversation threads. Hence, first, we build a dataset for studying user opinion fluctuations in 764 long multi-user Reddit conversation threads, called USDC. USDC contains annotations for 2 tasks: i) User Stance classification, which involves labeling a user's stance in a post within a conversation on a five-point scale; ii) User Dogmatism classification, which involves labeling a user's overall opinion in the conversation on a four-point scale. Besides being time-consuming and costly, manual annotations for USDC are challenging because: 1) Conversation threads could be very long, increasing the chances of noisy annotations; and 2) Interpreting instances where a user changes their opinion within a conversation is difficult because often such transitions are subtle and not expressed explicitly. Hence, we leverage majority voting on zero-shot, one-shot, and few-shot annotations from Mistral Large and GPT-4 to automate the annotation process. Human annotations on 200 test conversations achieved inter-annotator agreement scores of 0.49 for stance and 0.50 for dogmatism with these LLM annotations, indicating a reasonable level of consistency between human and LLM annotations. USDC is then used to finetune and instruction-tune multiple deployable small language models like LLaMA, Falcon and Vicuna for the stance and dogmatism classification tasks. We make the code and dataset publicly available [https://github.com/mounikamarreddy/USDC].", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.234358787536621, 2.3147521018981934], "openalex_id": "https://openalex.org/W4400016295", "title": "Growing Smaller Language Models Using Knowledge Distillation from Larger Models", "authors": "Michael Featherstone, Emily Cuthbertson, David Appleyard, Sarah Gittins", "abstract": "The rapid development of natural language processing technologies has necessitated models that are both high-performing and computationally efficient, posing a challenge for resource-constrained environments. Knowledge distillation, a technique where a smaller model learns from a larger pre-trained model, offers a novel and significant solution by enhancing the capabilities of the smaller model while maintaining a reduced computational footprint. This research explores the application of knowledge distillation to finetune GPT-Neo using Mistral Large, resulting in notable improvements in accuracy, precision, recall, and F1-score across tasks such as text generation, translation, summarization, and question-answering. Comprehensive evaluations demonstrated substantial reductions in inference time, memory usage, and energy consumption, highlighting the practical benefits of the approach. The finetuned GPT-Neo exhibited enhanced linguistic proficiency, coherence, fluency, and contextual accuracy, underscoring the effectiveness of knowledge distillation in optimizing model performance. The findings validate knowledge distillation as a robust method for advancing natural language processing technologies, ensuring high performance in environments with limited computational resources.", "venue": "https://doi.org/10.31219/osf.io/54p96", "label": 0}, {"loc": [7.22207498550415, 2.0747077465057373], "openalex_id": "https://openalex.org/W4400024251", "title": "Large Vocabulary Size Improves Large Language Models", "authors": "Sho Takase, Ryokan Ri, Shun Kiyono, Takuya Kato", "abstract": "This paper empirically investigates the relationship between subword vocabulary size and the performance of large language models (LLMs) to provide insights on how to define the vocabulary size. Experimental results show that larger vocabulary sizes lead to better performance in LLMs. Moreover, we consider a continual training scenario where a pre-trained language model is trained on a different target language. We introduce a simple method to use a new vocabulary instead of the pre-defined one. We show that using the new vocabulary outperforms the model with the vocabulary used in pre-training.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.401962280273438, 3.7449302673339844], "openalex_id": "https://openalex.org/W4400104096", "title": "Adam-mini: Use Fewer Learning Rates To Gain More", "authors": "Yushun Zhang, Congliang Chen, Ziniu Li, Tian Ding, Chenwei Wu, Yinyu Ye, Zhi\u2010Quan Luo, Ruoyu Sun", "abstract": "We propose Adam-mini, an optimizer that achieves on par or better performance than AdamW with 50% less memory footprint. Adam-mini reduces memory by cutting down the learning rate resources in Adam (i.e., $1/\\sqrt{v}$). By investigating the Hessian structure of neural nets, we find Adam's $v$ might not function at its full potential as effectively as we expected. We find that $\\geq$ 99.9% of these learning rates in $v$ could be harmlessly removed if we (1) carefully partition the parameters into blocks following our new principle on Hessian structure; (2) assign a single but good learning rate to each parameter block. We then provide one simple way to find good learning rates and propose Adam-mini. Empirically, we verify that Adam-mini performs on par or better than AdamW on various language models sized from 39M to 13B for pre-training, supervised fine-tuning, and RLHF. The reduced memory footprint of Adam-mini also alleviates communication overheads among GPUs, thereby increasing throughput. For instance, Adam-mini achieves 49.6% higher throughput than AdamW when pre-training Llama 2-7B on $2\\times$ A800-80GB GPUs, which saves 33% wall-clock time for pre-training.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.004159450531006, 5.369691371917725], "openalex_id": "https://openalex.org/W4399977992", "title": "Comparative evaluation of machine learning algorithms for phishing site detection", "authors": "Noura Fahad Almujahid, Mohd Anul Haq, Mohammed S. Alshehri", "abstract": "The advent of Internet technologies has resulted in the proliferation of electronic trading and the use of the Internet for electronic transactions, leading to a rise in unauthorized access to sensitive user information and the depletion of resources for enterprises. As a consequence, there has been a marked increase in phishing, which is now considered one of the most common types of online theft. Phishing attacks are typically directed towards obtaining confidential information, such as login credentials for online banking platforms and sensitive systems. The primary objective of such attacks is to acquire specific personal information to either use for financial gain or commit identity theft. Recent studies have been conducted to combat phishing attacks by examining domain characteristics such as website addresses, content on websites, and combinations of both approaches for the website and its source code. However, businesses require more effective anti-phishing technologies to identify phishing URLs and safeguard their users. The present research aims to evaluate the effectiveness of eight machine learning (ML) and deep learning (DL) algorithms, including support vector machine (SVM), k-nearest neighbors (KNN), random forest (RF), Decision Tree (DT), Extreme Gradient Boosting (XGBoost), logistic regression (LR), convolutional neural network (CNN), and DL model and assess their performances in identifying phishing. This study utilizes two real datasets, Mendeley and UCI, employing performance metrics such as accuracy, precision, recall, false positive rate (FPR), and F-1 score. Notably, CNN exhibits superior accuracy, emphasizing its efficacy. Contributions include using purpose-specific datasets, meticulous feature engineering, introducing SMOTE for class imbalance, incorporating the novel CNN model, and rigorous hyperparameter tuning. The study demonstrates consistent model performance across both datasets, highlighting stability and reliability.", "venue": "PeerJ Computer Science", "label": 4}, {"loc": [3.6553893089294434, 4.487391948699951], "openalex_id": "https://openalex.org/W4399986892", "title": "Mitigating the Privacy Issues in Retrieval-Augmented Generation (RAG) via Pure Synthetic Data", "authors": "Shenglai Zeng, Jiankun Zhang, Pengfei He, Jie Ren, Tianqi Zheng, Hanqing Lu, Xu Han, Hui Liu, Yue Xing, Jiliang Tang", "abstract": "Retrieval-augmented generation (RAG) enhances the outputs of language models by integrating relevant information retrieved from external knowledge sources. However, when the retrieval process involves private data, RAG systems may face severe privacy risks, potentially leading to the leakage of sensitive information. To address this issue, we propose using synthetic data as a privacy-preserving alternative for the retrieval data. We propose SAGE, a novel two-stage synthetic data generation paradigm. In the stage-1, we employ an attribute-based extraction and generation approach to preserve key contextual information from the original data. In the stage-2, we further enhance the privacy properties of the synthetic data through an agent-based iterative refinement process. Extensive experiments demonstrate that using our synthetic data as the retrieval context achieves comparable performance to using the original data while substantially reducing privacy risks. Our work takes the first step towards investigating the possibility of generating high-utility and privacy-preserving synthetic data for RAG, opening up new opportunities for the safe application of RAG systems in various domains.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.50460147857666, 2.4758644104003906], "openalex_id": "https://openalex.org/W4399911093", "title": "Leveraging Large Language Models to Measure Gender Bias in Gendered Languages", "authors": "Erik Derner, Sara Sansalvador de la Fuente, Yoan Guti\u00e9rrez, Paloma Moreda, Nuria Oliver", "abstract": "Large language models (LLMs) often inherit and amplify social biases embedded in their training data. A prominent social bias is gender bias. In this regard, prior work has mainly focused on gender stereotyping bias - the association of specific roles or traits with a particular gender - in English and on evaluating gender bias in model embeddings or generated outputs. In contrast, gender representation bias - the unequal frequency of references to individuals of different genders - in the training corpora has received less attention. Yet such imbalances in the training data constitute an upstream source of bias that can propagate and intensify throughout the entire model lifecycle. To fill this gap, we propose a novel LLM-based method to detect and quantify gender representation bias in LLM training data in gendered languages, where grammatical gender challenges the applicability of methods developed for English. By leveraging the LLMs' contextual understanding, our approach automatically identifies and classifies person-referencing words in gendered language corpora. Applied to four Spanish-English benchmarks and five Valencian corpora, our method reveals substantial male-dominant imbalances. We show that such biases in training data affect model outputs, but can surprisingly be mitigated leveraging small-scale training on datasets that are biased towards the opposite gender. Our findings highlight the need for corpus-level gender bias analysis in multilingual NLP. We make our code and data publicly available.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.981334686279297, 0.19807326793670654], "openalex_id": "https://openalex.org/W4399912532", "title": "Towards Robust Evaluation: A Comprehensive Taxonomy of Datasets and Metrics for Open Domain Question Answering in the Era of Large Language Models", "authors": "Akchay Srivastava, Atif M. Memon", "abstract": "Open Domain Question Answering (ODQA) within natural language processing involves building systems that answer factual questions using large-scale knowledge corpora. Recent advances stem from the confluence of several factors, such as large-scale training datasets, deep learning techniques, and the rise of large language models. High-quality datasets are used to train models on realistic scenarios and enable the evaluation of the system on potentially unseen data. Standardized metrics facilitate comparisons between different ODQA systems, allowing researchers to objectively track advancements in the field. Our study presents a thorough examination of the current landscape of ODQA benchmarking by reviewing 52 datasets and 20 evaluation techniques across textual and multimodal modalities. We introduce a novel taxonomy for ODQA datasets that incorporates both the modality and difficulty of the question types. Additionally, we present a structured organization of ODQA evaluation metrics along with a critical analysis of their inherent trade-offs. Our study aims to empower researchers by providing a framework for the robust evaluation of modern question-answering systems. We conclude by identifying the current challenges and outlining promising avenues for future research and development.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.581603050231934, 0.34449708461761475], "openalex_id": "https://openalex.org/W4399912399", "title": "Learning to Generate Answers with Citations via Factual Consistency Models", "authors": "Rami Aly, Zhiqiang Tang, Samson Tan, George Karypis", "abstract": "Large Language Models (LLMs) frequently hallucinate, impeding their reliability in mission-critical situations. One approach to address this issue is to provide citations to relevant sources alongside generated content, enhancing the verifiability of generations. However, citing passages accurately in answers remains a substantial challenge. This paper proposes a weakly-supervised fine-tuning method leveraging factual consistency models (FCMs). Our approach alternates between generating texts with citations and supervised fine-tuning with FCM-filtered citation data. Focused learning is integrated into the objective, directing the fine-tuning process to emphasise the factual unit tokens, as measured by an FCM. Results on the ALCE few-shot citation benchmark with various instruction-tuned LLMs demonstrate superior performance compared to in-context learning, vanilla supervised fine-tuning, and state-of-the-art methods, with an average improvement of $34.1$, $15.5$, and $10.5$ citation F$_1$ points, respectively. Moreover, in a domain transfer setting we show that the obtained citation generation ability robustly transfers to unseen datasets. Notably, our citation improvements contribute to the lowest factual error rate across baselines.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.4176299571990967, -0.37872400879859924], "openalex_id": "https://openalex.org/W4399907270", "title": "A comprehensive evaluation of large language models in mining gene relations and pathway knowledge", "authors": "Muhammad S. Azam, Yibo Chen, Micheal Olaolu Arowolo, H. Liu, Mihail Popescu, Dong Xu", "abstract": "Abstract Understanding complex biological pathways, including gene\u2013gene interactions and gene regulatory networks, is critical for exploring disease mechanisms and drug development. Manual literature curation of biological pathways cannot keep up with the exponential growth of new discoveries in the literature. Large\u2010scale language models (LLMs) trained on extensive text corpora contain rich biological information, and they can be mined as a biological knowledge graph. This study assesses 21 LLMs, including both application programming interface (API)\u2010based models and open\u2010source models in their capacities of retrieving biological knowledge. The evaluation focuses on predicting gene regulatory relations (activation, inhibition, and phosphorylation) and the Kyoto Encyclopedia of Genes and Genomes (KEGG) pathway components. Results indicated a significant disparity in model performance. API\u2010based models GPT\u20104 and Claude\u2010Pro showed superior performance, with an F1 score of 0.4448 and 0.4386 for the gene regulatory relation prediction, and a Jaccard similarity index of 0.2778 and 0.2657 for the KEGG pathway prediction, respectively. Open\u2010source models lagged behind their API\u2010based counterparts, whereas Falcon\u2010180b and llama2\u20107b had the highest F1 scores of 0.2787 and 0.1923 in gene regulatory relations, respectively. The KEGG pathway recognition had a Jaccard similarity index of 0.2237 for Falcon\u2010180b and 0.2207 for llama2\u20107b. Our study suggests that LLMs are informative in gene network analysis and pathway mapping, but their effectiveness varies, necessitating careful model selection. This work also provides a case study and insight into using LLMs das knowledge graphs. Our code is publicly available at the website of GitHub (Muh\u2010aza).", "venue": "Quantitative Biology", "label": 0}, {"loc": [9.196943283081055, -0.9055271148681641], "openalex_id": "https://openalex.org/W4399880588", "title": "Enhancing Text Summarization in a Limited Data and Long Sample", "authors": "\u0412\u0456\u043a\u0442\u043e\u0440 \u0428\u0435\u0432\u0447\u0443\u043a", "abstract": "Abstract This thesis addresses the challenges of automatic text summarization in a limited amount of data and a significantly large sample size setting. The study introduces a novel dataset of Ukrainian legislative documents professionally translated into English. Comprising only 99 document-summary pairs with average lengths of over 20,000 and 1,300 tokens respectively, this dataset provides an ideal opportunity to evaluate and compare both extractive and abstractive summarization techniques under the data scarcity and document length constraints. The research findings reveal that abstractive summarization, even in a zero-shot setting, outperforms extractive methods, achieving higher scores across ROUGE and BERTScore metrics for legislative texts. Additionally, this study introduces a new text segmentation technique that automates the manual preprocessing of long, unstructured texts for abstractive summarization. Initial results demonstrating a 0.23 ROUGE score compared to human-annotated references, while may seem modest, indicate the feasibility of automating the generation of training data for fine-tuning summarization models at any scale, unlocking not only the ability to handle large-scale datasets but also to improve the efficiency of the models, encouraging further research into enhancing the algorithm.", "venue": "Research Square (Research Square)", "label": 25}, {"loc": [6.942473411560059, 2.573416233062744], "openalex_id": "https://openalex.org/W4399836663", "title": "DeepSeek-Coder-V2: Breaking the Barrier of Closed-Source Models in Code Intelligence", "authors": "DeepSeek-AI, Qihao Zhu, Daya Guo, Zhihong Shao, Dejian Yang, Peiyi Wang, Runxin Xu, Yuhuan Wu, Yukun Li, Huazuo Gao, Shirong Ma, Wangding Zeng, Xiao Guo Bi, Zihui Gu, Hanwei Xu, Damai Dai, Kai Dong, Liyue Zhang, Yishi Piao, Zhibin Gou, Zhenda Xie, Zhewen Hao, Bingxuan Wang, Junxiao Song, Deli Chen, Xin Xie, Kang Guan, Yuxiang You, Aixin Liu, Qiushi Du, Wenjun Gao, Xuan L\u00fc, Qinyu Chen, Yaohui Wang, Chengqi Deng, Jiashi Li, Chenggang Zhao, Chong Ruan, Fuli Luo, Wenfeng Liang", "abstract": "We present DeepSeek-Coder-V2, an open-source Mixture-of-Experts (MoE) code language model that achieves performance comparable to GPT4-Turbo in code-specific tasks. Specifically, DeepSeek-Coder-V2 is further pre-trained from an intermediate checkpoint of DeepSeek-V2 with additional 6 trillion tokens. Through this continued pre-training, DeepSeek-Coder-V2 substantially enhances the coding and mathematical reasoning capabilities of DeepSeek-V2, while maintaining comparable performance in general language tasks. Compared to DeepSeek-Coder-33B, DeepSeek-Coder-V2 demonstrates significant advancements in various aspects of code-related tasks, as well as reasoning and general capabilities. Additionally, DeepSeek-Coder-V2 expands its support for programming languages from 86 to 338, while extending the context length from 16K to 128K. In standard benchmark evaluations, DeepSeek-Coder-V2 achieves superior performance compared to closed-source models such as GPT4-Turbo, Claude 3 Opus, and Gemini 1.5 Pro in coding and math benchmarks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.186046123504639, 1.916701078414917], "openalex_id": "https://openalex.org/W4399807897", "title": "Negotiating becoming: a Nietzschean critique of large language models", "authors": "Simon W. S. Fischer, Bas de Boer", "abstract": "Abstract Large language models (LLMs) structure the linguistic landscape by reflecting certain beliefs and assumptions. In this paper, we address the risk of people unthinkingly adopting and being determined by the values or worldviews embedded in LLMs. We provide a Nietzschean critique of LLMs and, based on the concept of will to power, consider LLMs as will-to-power organisations. This allows us to conceptualise the interaction between self and LLMs as power struggles, which we understand as negotiation. Currently, the invisibility and incomprehensibility of LLMs make it difficult, if not impossible, to engage in such negotiations. This bears the danger that LLMs make reality increasingly homogeneous by recycling beliefs and creating feedback loops that ultimately freeze power struggles and thus consolidate the status quo. In view of this, LLMs constrain self-formation. Based on our critique, we provide some recommendations on how to develop interactions with LLMs that enable negotiations that allow for different ways of being", "venue": "Ethics and Information Technology", "label": 50}, {"loc": [9.515066146850586, 1.765825867652893], "openalex_id": "https://openalex.org/W4399836782", "title": "Large Scale Transfer Learning for Tabular Data via Language Modeling", "authors": "Josh Gardner, Juan C. Perdomo, Ludwig Schmidt", "abstract": "Tabular data -- structured, heterogeneous, spreadsheet-style data with rows and columns -- is widely used in practice across many domains. However, while recent foundation models have reduced the need for developing task-specific datasets and predictors in domains such as language modeling and computer vision, this transfer learning paradigm has not had similar impact in the tabular domain. In this work, we seek to narrow this gap and present TabuLa-8B, a language model for tabular prediction. We define a process for extracting a large, high-quality training dataset from the TabLib corpus, proposing methods for tabular data filtering and quality control. Using the resulting dataset, which comprises over 2.1B rows from over 4M unique tables, we fine-tune a Llama 3-8B large language model (LLM) for tabular data prediction (classification and binned regression) using a novel packing and attention scheme for tabular prediction. Through evaluation across a test suite of 329 datasets, we find that TabuLa-8B has zero-shot accuracy on unseen tables that is over 15 percentage points (pp) higher than random guessing, a feat that is not possible with existing state-of-the-art tabular prediction models (e.g. XGBoost, TabPFN). In the few-shot setting (1-32 shots), without any fine-tuning on the target datasets, TabuLa-8B is 5-15 pp more accurate than XGBoost and TabPFN models that are explicitly trained on equal, or even up to 16x more data. We release our model, code, and data along with the publication of this paper.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.533330917358398, 5.535060405731201], "openalex_id": "https://openalex.org/W4399795641", "title": "From Pixels to Prose: A Large Dataset of Dense Image Captions", "authors": "Vasu Singla, Kaiyu Yue, Sukriti Paul, Reza Shirkavand, Mayuka Jayawardhana, Alireza Ganjdanesh, Heng Huang, Abhinav Bhatel\u00e9, Gowthami Somepalli, Tom Goldstein", "abstract": "Training large vision-language models requires extensive, high-quality image-text pairs. Existing web-scraped datasets, however, are noisy and lack detailed image descriptions. To bridge this gap, we introduce PixelProse, a comprehensive dataset of over 16M (million) synthetically generated captions, leveraging cutting-edge vision-language models for detailed and accurate descriptions. To ensure data integrity, we rigorously analyze our dataset for problematic content, including child sexual abuse material (CSAM), personally identifiable information (PII), and toxicity. We also provide valuable metadata such as watermark presence and aesthetic scores, aiding in further dataset filtering. We hope PixelProse will be a valuable resource for future vision-language research. PixelProse is available at https://huggingface.co/datasets/tomg-group-umd/pixelprose", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.478860378265381, 5.545804023742676], "openalex_id": "https://openalex.org/W4399795641", "title": "FROM PIXELS TO PROSE: ALarge DATASET OF DENSE IMAGE CAPTIONS", "authors": "Vasu Singla, Kaiyu Yue, Sukriti Paul, Reza Shirkavand, Mayuka Jayawardhana, Alireza Ganjdanesh, Heng Huang, Abhinav Bhatel\u00e9, Gowthami Somepalli, Tom Goldstein", "abstract": "Training large vision-language models requires extensive, high-quality image-text pairs. Existing web-scraped datasets, however, are noisy and lack detailed image descriptions. To bridge this gap, we introduce PixelProse, a comprehensive dataset of over 16M (million) synthetically generated captions, leveraging cutting-edge vision-language models for detailed and accurate descriptions. To ensure data integrity, we rigorously analyze our dataset for problematic content, including child sexual abuse material (CSAM), personally identifiable information (PII), and toxicity. We also provide valuable metadata such as watermark presence and aesthetic scores, aiding in further dataset filtering. We hope PixelProse will be a valuable resource for future vision-language research. PixelProse is available at https://huggingface.co/datasets/tomg-group-umd/pixelprose", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.147866249084473, 0.2685384452342987], "openalex_id": "https://openalex.org/W4399794407", "title": "Multilingual Large Language Models and Curse of Multilinguality", "authors": "Daniil Gurgurov, Tanja B\u00e4umel, Tatiana Anikina", "abstract": "Multilingual Large Language Models (LLMs) have gained large popularity among Natural Language Processing (NLP) researchers and practitioners. These models, trained on huge datasets, show proficiency across various languages and demonstrate effectiveness in numerous downstream tasks. This paper navigates the landscape of multilingual LLMs, providing an introductory overview of their technical aspects. It explains underlying architectures, objective functions, pre-training data sources, and tokenization methods. This work explores the unique features of different model types: encoder-only (mBERT, XLM-R), decoder-only (XGLM, PALM, BLOOM, GPT-3), and encoder-decoder models (mT5, mBART). Additionally, it addresses one of the significant limitations of multilingual LLMs - the curse of multilinguality - and discusses current attempts to overcome it.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.953878402709961, 1.827404260635376], "openalex_id": "https://openalex.org/W4399795026", "title": "Development of an Adaptive Multi-Domain Artificial Intelligence System Built using Machine Learning and Expert Systems Technologies", "authors": "Jeremy Straub", "abstract": "Producing an artificial general intelligence (AGI) has been an elusive goal in artificial intelligence (AI) research for some time. An AGI would have the capability, like a human, to be exposed to a new problem domain, learn about it and then use reasoning processes to make decisions. While AI techniques have been used across a wide variety of problem domains, an AGI would require an AI that could reason beyond its programming and training. This paper presents a small step towards producing an AGI. It describes a mechanism for an AI to learn about and develop reasoning pathways to make decisions in an a priori unknown domain. It combines a classical AI technique, the expert system, with a its modern adaptation - the gradient descent trained expert system (GDTES) - and utilizes generative artificial intelligence (GAI) to create a network and training data set for this system. These can be created from available sources or may draw upon knowledge incorporated in a GAI's own pre-trained model. The learning process in GDTES is used to optimize the AI's decision-making. While this approach does not meet the standards that many have defined for an AGI, it provides a somewhat similar capability, albeit one which requires a learning process before use.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.9518303871154785, 5.209844589233398], "openalex_id": "https://openalex.org/W4399795019", "title": "MINT-1T: Scaling Open-Source Multimodal Data by 10x: A Multimodal Dataset with One Trillion Tokens", "authors": "Anas Awadalla, Le Xue, Oscar Lo, Manli Shu, Hannah Lee, Etash Kumar Guha, M. Jordan, Sheng Shen, Mohamed Awadalla, Silvio Savarese, Caiming Xiong, Ran Xu, Yejin Choi, Ludwig Schmidt", "abstract": "Multimodal interleaved datasets featuring free-form interleaved sequences of images and text are crucial for training frontier large multimodal models (LMMs). Despite the rapid progression of open-source LMMs, there remains a pronounced scarcity of large-scale, diverse open-source multimodal interleaved datasets. In response, we introduce MINT-1T, the most extensive and diverse open-source Multimodal INTerleaved dataset to date. MINT-1T comprises one trillion text tokens and 3.4 billion images, a 10x scale-up from existing open-source datasets. Additionally, we include previously untapped sources such as PDFs and ArXiv papers. As scaling multimodal interleaved datasets requires substantial engineering effort, sharing the data curation process and releasing the dataset greatly benefits the community. Our experiments show that LMMs trained on MINT-1T rival the performance of models trained on the previous leading dataset, OBELICS. Our data and code will be released at https://github.com/mlfoundations/MINT-1T.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.77556037902832, 2.0842339992523193], "openalex_id": "https://openalex.org/W4399836557", "title": "DataComp-LM: In search of the next generation of training sets for language models", "authors": "Jeffrey Li, Alex Chengyu Fang, Georgios Smyrnis, Maor Ivgi, Matt Jordan, Samir Yitzhak Gadre, Hritik Bansal, Etash Guha, Sedrick Scott Keh, Kushal Arora, Saurabh Garg, Rui Xin, Niklas Muennighoff, Reinhard Heckel, Jean Mercat, Mayee Chen, Suchin Gururangan, Mitchell Wortsman, Alon Albalak, Yonatan Bitton, Marianna Nezhurina, Amro Abbas, Cheng-Yu Hsieh, Dhruba Ghosh, Josh Gardner, Maciej Kilian, Hanlin Zhang, Rulin Shao, Sarah I. Pratt, Sunny Sanyal, Gabriel Ilharco, Giannis Daras, Kalyani Marathe, Aaron Gokaslan, Jieyu Zhang, Khyathi Raghavi Chandu, Thao Nguyen, Igor Vasiljevic, Sham M. Kakade, Shuran Song, Sujay Sanghavi, Fartash Faghri, Sewoong Oh, Luke Zettlemoyer, Kyle Lo, Alaaeldin El-Nouby, Hadi Pouransari, Alexander Toshev, Stephanie Wang, Dirk Groeneveld, Luca Soldaini, Pang Wei Koh, Jenia Jitsev, Thomas Kollar, Alexandros G. Dimakis, Yair Carmon, Achal Dave, Ludwig Schmidt, Vaishaal Shankar", "abstract": "We introduce DataComp for Language Models (DCLM), a testbed for controlled dataset experiments with the goal of improving language models. As part of DCLM, we provide a standardized corpus of 240T tokens extracted from Common Crawl, effective pretraining recipes based on the OpenLM framework, and a broad suite of 53 downstream evaluations. Participants in the DCLM benchmark can experiment with data curation strategies such as deduplication, filtering, and data mixing at model scales ranging from 412M to 7B parameters. As a baseline for DCLM, we conduct extensive experiments and find that model-based filtering is key to assembling a high-quality training set. The resulting dataset, DCLM-Baseline enables training a 7B parameter language model from scratch to 64% 5-shot accuracy on MMLU with 2.6T training tokens. Compared to MAP-Neo, the previous state-of-the-art in open-data language models, DCLM-Baseline represents a 6.6 percentage point improvement on MMLU while being trained with 40% less compute. Our baseline model is also comparable to Mistral-7B-v0.3 and Llama 3 8B on MMLU (63% & 66%), and performs similarly on an average of 53 natural language understanding tasks while being trained with 6.6x less compute than Llama 3 8B. Our results highlight the importance of dataset design for training language models and offer a starting point for further research on data curation.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.6384618282318115, 3.841420888900757], "openalex_id": "https://openalex.org/W4399794777", "title": "Threat Modelling and Risk Analysis for Large Language Model (LLM)-Powered Applications", "authors": "Stephen Burabari Tete", "abstract": "The advent of Large Language Models (LLMs) has revolutionized various applications by providing advanced natural language processing capabilities. However, this innovation introduces new cybersecurity challenges. This paper explores the threat modeling and risk analysis specifically tailored for LLM-powered applications. Focusing on potential attacks like data poisoning, prompt injection, SQL injection, jailbreaking, and compositional injection, we assess their impact on security and propose mitigation strategies. We introduce a framework combining STRIDE and DREAD methodologies for proactive threat identification and risk assessment. Furthermore, we examine the feasibility of an end-to-end threat model through a case study of a custom-built LLM-powered application. This model follows Shostack's Four Question Framework, adjusted for the unique threats LLMs present. Our goal is to propose measures that enhance the security of these powerful AI tools, thwarting attacks, and ensuring the reliability and integrity of LLM-integrated systems.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.521761655807495, 1.5627069473266602], "openalex_id": "https://openalex.org/W4399785210", "title": "A phenomenology and epistemology of large language models: Transparency, trust, and trustworthiness", "authors": "Richard Heersmink, Barend de Rooij, Mar\u00eda Jimena Clavel V\u00e1zquez, Matteo Colombo", "abstract": "Abstract This paper analyses the phenomenology and epistemology of chatbots such as ChatGPT and Bard. The computational architecture underpinning these chatbots are large language models (LLMs), which are generative artificial intelligence (AI) systems trained on a massive dataset of text extracted from the Web. We conceptualise these LLMs as multifunctional computational cognitive artifacts, used for various cognitive tasks such as translating, summarizing, answering questions, information-seeking, and much more. Phenomenologically, LLMs can be experienced as a \u201cquasi-other\u201d; when that happens, users anthropomorphise them. For most users, current LLMs are black boxes, i.e., for the most part, they lack data transparency and algorithmic transparency. They can, however, be phenomenologically and informationally transparent, in which case there is an interactional flow. Anthropomorphising and interactional flow can, in some users, create an attitude of (unwarranted) trust towards the output LLMs generate. We conclude this paper by drawing on the epistemology of trust and testimony to examine the epistemic implications of these dimensions. Whilst LLMs generally generate accurate responses, we observe two epistemic pitfalls. Ideally, users should be able to match the level of trust that they place in LLMs to the degree that LLMs are trustworthy. However, both their data and algorithmic opacity and their phenomenological and informational transparency can make it difficult for users to calibrate their trust correctly. The effects of these limitations are twofold: users may adopt unwarranted attitudes of trust towards the outputs of LLMs (which is particularly problematic when LLMs hallucinate), and the trustworthiness of LLMs may be undermined.", "venue": "Ethics and Information Technology", "label": 50}, {"loc": [6.889822483062744, 0.8981578350067139], "openalex_id": "https://openalex.org/W4399758650", "title": "Decoding the Diversity: A Review of the Indic AI Research Landscape", "authors": "Sankalp KJ, Vinija Jain, Sreyoshi Bhaduri, Tamoghna Roy, Aman Chadha", "abstract": "This review paper provides a comprehensive overview of large language model (LLM) research directions within Indic languages. Indic languages are those spoken in the Indian subcontinent, including India, Pakistan, Bangladesh, Sri Lanka, Nepal, and Bhutan, among others. These languages have a rich cultural and linguistic heritage and are spoken by over 1.5 billion people worldwide. With the tremendous market potential and growing demand for natural language processing (NLP) based applications in diverse languages, generative applications for Indic languages pose unique challenges and opportunities for research. Our paper deep dives into the recent advancements in Indic generative modeling, contributing with a taxonomy of research directions, tabulating 84 recent publications. Research directions surveyed in this paper include LLM development, fine-tuning existing LLMs, development of corpora, benchmarking and evaluation, as well as publications around specific techniques, tools, and applications. We found that researchers across the publications emphasize the challenges associated with limited data availability, lack of standardization, and the peculiar linguistic complexities of Indic languages. This work aims to serve as a valuable resource for researchers and practitioners working in the field of NLP, particularly those focused on Indic languages, and contributes to the development of more accurate and efficient LLM applications for these languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.685210227966309, 1.0098810195922852], "openalex_id": "https://openalex.org/W4399759241", "title": "BLEnD: A Benchmark for LLMs on Everyday Knowledge in Diverse Cultures and Languages", "authors": "Junho Myung, Nayeon Lee, Yi Zhou, Jiho Jin, Rifki Afina Putri, Dimosthenis Antypas, Hsuvas Borkakoty, Eun\u2010Su Kim, Carla Perez-Almendros, Abinew Ali Ayele, V\u00edctor Guti\u00e9rrez-Basulto, Yazm\u00edn Ib\u00e1\u00f1ez-Garc\u00eda, Hwaran Lee, Shamsuddeen Hassan Muhammad, Kiwoong Park, Anar Rzayev, Nina White, Seid Muhie Yimam, Mohammad Taher Pilehvar, Nedjma Ousidhoum, Jos\u00e9 Camacho-Collados, Alice Oh", "abstract": "Large language models (LLMs) often lack culture-specific knowledge of daily life, especially across diverse regions and non-English languages. Existing benchmarks for evaluating LLMs' cultural sensitivities are limited to a single language or collected from online sources such as Wikipedia, which do not reflect the mundane everyday lifestyles of diverse regions. That is, information about the food people eat for their birthday celebrations, spices they typically use, musical instruments youngsters play, or the sports they practice in school is common cultural knowledge but uncommon in easily collected online sources, especially for underrepresented cultures. To address this issue, we introduce BLEnD, a hand-crafted benchmark designed to evaluate LLMs' everyday knowledge across diverse cultures and languages. BLEnD comprises 52.6k question-answer pairs from 16 countries/regions, in 13 different languages, including low-resource ones such as Amharic, Assamese, Azerbaijani, Hausa, and Sundanese. We construct the benchmark to include two formats of questions: short-answer and multiple-choice. We show that LLMs perform better for cultures that are highly represented online, with a maximum 57.34% difference in GPT-4, the best-performing model, in the short-answer format. For cultures represented by mid-to-high-resource languages, LLMs perform better in their local languages, but for cultures represented by low-resource languages, LLMs perform better in English than the local languages. We make our dataset publicly available at: https://github.com/nlee0212/BLEnD.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.257023811340332, 0.742814302444458], "openalex_id": "https://openalex.org/W4399757107", "title": "Retrieval Augmented Generation via Context Compression Techniques for Large Language Models", "authors": "Pingli Jiang, Ruixuan Fan, Yating Yong", "abstract": "Natural language processing has seen lots of improvements, yet optimizing large-scale models to efficiently handle vast amounts of contextual data remains a critical challenge. The novel approach presented integrates advanced context compression techniques with Retrieval Augmented Generation (RAG), significantly enhancing computational efficiency and the accuracy of generated outputs. Through a series of experiments, the study evaluates the impact of token reduction, embedding optimization, and hierarchical attention mechanisms on model performance. The findings demonstrate that reducing redundant information while maintaining essential contextual elements improves both efficiency and quality of outputs. Additionally, the integration of dynamic memory networks and sophisticated retrieval mechanisms provides a robust framework for augmenting generative capabilities with external knowledge. Comprehensive evaluations highlight the balance achieved between performance and resource utilization, underscoring the feasibility and effectiveness of the proposed methods. This research offers substantial advancements in the optimization of large-scale language models, providing valuable insights into their capabilities and applications.", "venue": "https://doi.org/10.31219/osf.io/ua6j5", "label": 0}, {"loc": [4.788476943969727, 2.141226053237915], "openalex_id": "https://openalex.org/W4399674207", "title": "Echoes of culture: Relationships of implicit and explicit attitudes with contemporary English, historical English, and 53 non-English languages", "authors": "Tessa Elizabeth Sadie Charlesworth, Kirsten Morehouse, Vaibhav Rouduri, William A. Cunningham", "abstract": "Attitudes are intertwined with culture and language. But to what extent? Emerging perspectives in attitude research suggest that cultural representations in language are more related to implicitly measured (vs. explicitly measured) attitudes, and that such relationships persist across history and diverse languages. We offer a comprehensive test of these ideas by correlating (a) attitudes toward 55 topics (e.g., Rich/Poor, Dogs/Cats, Love/Money) from ~100,000 U.S. English-speaking participants with (b) representations of those same topics in word embeddings from contemporary English text, 200 years of English books, and 53 non-English languages. Strong and robust relationships emerged between representations in contemporary English and implicitly but not explicitly measured attitudes. Moreover, strong correlations with implicitly measured attitudes persisted across 200 years of books, and most non-English languages. Results provide new insights into the nature of implicitly measured attitudes and how they are intertwined with cultural representations that are relatively hidden in patterns of language across time and place.", "venue": "Social Psychological and Personality Science", "label": 0}, {"loc": [8.57480239868164, 2.91410756111145], "openalex_id": "https://openalex.org/W4399615345", "title": "Quantitative Analysis of the Relationship Between Optimal Learning Rate and Batch Size Scaling in Large Language Models", "authors": "Rolf Schneider, H. Baumgartner, Dietrich Wohlgemuth", "abstract": "The rapid development of natural language processing has led to the emergence of sophisticated models capable of performing a wide array of tasks with human-like proficiency. Identifying the optimal relationship between learning rate and batch size is crucial for enhancing the efficiency and effectiveness of training these models. Through systematic experimentation with models such as Baidu Ernie, Meta Llama, and Moonshot Kimi, this research demonstrates a linear relationship between these hyperparameters, providing a practical framework for their adjustment. Results indicate that appropriate scaling of learning rates with batch sizes can significantly improve training efficiency, model accuracy, and convergence time. The findings offer valuable insights into the dynamics of model training, presenting a scalable approach that can reduce computational costs and enhance model robustness, thereby contributing to the broader field of artificial intelligence.", "venue": "https://doi.org/10.31219/osf.io/4f8hw", "label": 0}, {"loc": [8.27423095703125, 1.3506053686141968], "openalex_id": "https://openalex.org/W4399597322", "title": "MixEval: Deriving Wisdom of the Crowd from LLM Benchmark Mixtures", "authors": "Jinjie Ni, Fuzhao Xue, Xiang Yue, Yuntian Deng, Mahir Shah, Kabir Jain, Graham Neubig, Yang You", "abstract": "Evaluating large language models (LLMs) is challenging. Traditional ground-truth-based benchmarks fail to capture the comprehensiveness and nuance of real-world queries, while LLM-as-judge benchmarks suffer from grading biases and limited query quantity. Both of them may also become contaminated over time. User-facing evaluation, such as Chatbot Arena, provides reliable signals but is costly and slow. In this work, we propose MixEval, a new paradigm for establishing efficient, gold-standard LLM evaluation by strategically mixing off-the-shelf benchmarks. It bridges (1) comprehensive and well-distributed real-world user queries and (2) efficient and fairly-graded ground-truth-based benchmarks, by matching queries mined from the web with similar queries from existing benchmarks. Based on MixEval, we further build MixEval-Hard, which offers more room for model improvement. Our benchmarks' advantages lie in (1) a 0.96 model ranking correlation with Chatbot Arena arising from the highly impartial query distribution and grading mechanism, (2) fast, cheap, and reproducible execution (6% of the time and cost of MMLU), and (3) dynamic evaluation enabled by the rapid and stable data update pipeline. We provide extensive meta-evaluation and analysis for our and existing LLM benchmarks to deepen the community's understanding of LLM evaluation and guide future research directions.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.799236297607422, 3.314790725708008], "openalex_id": "https://openalex.org/W4399597310", "title": "SUBLLM: A Novel Efficient Architecture with Token Sequence Subsampling for LLM", "authors": "Quandong Wang, Yuxuan Yuan, Xiaoyu Yang, Ruike Zhang, Kang Zhao, Yunguo Liu, Jian Luan, Daniel Povey, Bin Wang", "abstract": "While Large Language Models (LLMs) have achieved remarkable success in various fields, the efficiency of training and inference remains a major challenge. To address this issue, we propose SUBLLM, short for Subsampling-Upsampling-Bypass Large Language Model, an innovative architecture that extends the core decoder-only framework by incorporating subsampling, upsampling, and bypass modules. The subsampling modules are responsible for shortening the sequence, while the upsampling modules restore the sequence length, and the bypass modules enhance convergence. In comparison to LLaMA, the proposed SUBLLM exhibits significant enhancements in both training and inference speeds as well as memory usage, while maintaining competitive few-shot performance. During training, SUBLLM increases speeds by 26% and cuts memory by 10GB per GPU. In inference, it boosts speeds by up to 37% and reduces memory by 1GB per GPU. The training and inference speeds can be enhanced by 34% and 52% respectively when the context window is expanded to 8192. Our code is available at https://github.com/XiaoMi/subllm.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.743760108947754, 1.6019643545150757], "openalex_id": "https://openalex.org/W4399590448", "title": "Optimizing Knowledge Extraction in Large Language Models Using Dynamic Tokenization Dictionaries", "authors": "Harold Chiappe, Gabriel Lennon", "abstract": "Tokenization methods have long been a critical component in the performance of language models, yet traditional static approaches often fall short in capturing the dynamic nature of language. The novel concept of implementing a dynamic tokenization dictionary within the Llama model presents a significant advancement, offering real-time adaptability in response to evolving linguistic patterns. The adaptive tokenization algorithm continuously updates the token set based on frequency and context, thereby enhancing the model's ability to generate coherent and contextually relevant outputs. Comprehensive evaluation across multiple benchmark datasets reveals substantial improvements in metrics such as perplexity, F1 Score, BLEU Score, and ROUGE Score, underscoring the efficacy of dynamic tokenization. The implications of these findings extend to various domains, including healthcare, legal analysis, education, and customer service, demonstrating the broad applicability and transformative potential of dynamic tokenized dictionaries. This research not only advances the understanding of tokenization processes but also provides a robust framework for enhancing the efficiency and accuracy of large language models in real-world applications.", "venue": "https://doi.org/10.31219/osf.io/svj2z", "label": 0}, {"loc": [6.116949081420898, 2.4116897583007812], "openalex_id": "https://openalex.org/W4399596953", "title": "LINGOLY: A Benchmark of Olympiad-Level Linguistic Reasoning Puzzles in Low-Resource and Extinct Languages", "authors": "Andrew M. Bean, Simi Hellsten, Harry Mayne, Jabez Magomere, Ethan A. Chi, Ryan Chi, Scott A. Hale, Hannah Rose Kirk", "abstract": "In this paper, we present the LingOly benchmark, a novel benchmark for advanced reasoning abilities in large language models. Using challenging Linguistic Olympiad puzzles, we evaluate (i) capabilities for in-context identification and generalisation of linguistic patterns in very low-resource or extinct languages, and (ii) abilities to follow complex task instructions. The LingOly benchmark covers more than 90 mostly low-resource languages, minimising issues of data contamination, and contains 1,133 problems across 6 formats and 5 levels of human difficulty. We assess performance with both direct accuracy and comparison to a no-context baseline to penalise memorisation. Scores from 11 state-of-the-art LLMs demonstrate the benchmark to be challenging, and models perform poorly on the higher difficulty problems. On harder problems, even the top model only achieved 38.7% accuracy, a 24.7% improvement over the no-context baseline. Large closed models typically outperform open models, and in general, the higher resource the language, the better the scores. These results indicate, in absence of memorisation, true multi-step out-of-domain reasoning remains a challenge for current language models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.679079532623291, 0.9402692317962646], "openalex_id": "https://openalex.org/W4399540815", "title": "Leveraging Open Large Language Models for Multilingual Policy Topic Classification: The Babel Machine Approach", "authors": "Mikl\u00f3s Seb\u0151k, \u00c1kos M\u00e1t\u00e9, Orsolya Ring, Viktor Kov\u00e1cs, Rich\u00e1rd Lehoczki", "abstract": "The article presents an open-source and freely available natural language processing system for comparative policy studies. The CAP Babel Machine allows for the automated classification of input files based on the 21 major policy topics of the codebook of the Comparative Agendas Project (CAP). By using multilingual XLM-RoBERTa large language models, the pipeline can produce state-of-the-art level outputs for selected pairs of languages and domains (such as media or parliamentary speech). For 24 cases out of 41, the weighted macro F1 of our language-domain models surpassed 0.75 (and, for 6 language-domain pairs, 0.90). Besides macro F1, for most major topic categories, the distribution of micro F1 scores is also centered around 0.75. These results show that the CAP Babel machine is a viable alternative for human coding in terms of validity at less cost and higher reliability. The proposed research design also has significant possibilities for scaling in terms of leveraging new models, covering new languages, and adding new datasets for fine-tuning. Based on our tests on manifesto data, a different policy classification scheme, we argue that model-pipeline frameworks such as the Babel Machine can, over time, potentially replace double-blind human coding for a multitude of comparative classification problems.", "venue": "Social Science Computer Review", "label": 0}, {"loc": [6.531624794006348, 2.1540751457214355], "openalex_id": "https://openalex.org/W4399510551", "title": "BERTs are Generative In-Context Learners", "authors": "David Samuel", "abstract": "While in-context learning is commonly associated with causal language models, such as GPT, we demonstrate that this capability also 'emerges' in masked language models. Through an embarrassingly simple inference technique, we enable an existing masked model, DeBERTa, to perform generative tasks without additional training or architectural changes. Our evaluation reveals that the masked and causal language models behave very differently, as they clearly outperform each other on different categories of tasks. These complementary strengths suggest that the field's focus on causal models for in-context learning may be limiting - both architectures can develop these capabilities, but with distinct advantages; pointing toward promising hybrid approaches that combine the strengths of both objectives.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.9733686447143555, 1.832816243171692], "openalex_id": "https://openalex.org/W4402801595", "title": "Artificial Intelligence as the Next Front in the Class War", "authors": "Christopher Hill", "abstract": "For many years, artificial intelligence has been confined to the realm of science fiction, and while the technology has been in development, predicting the effects AI will have on our society has been a challenging endeavor. The release of ChatGPT in 2022, the subsequent mass adoption of the AI chatbot, and the response by other private firms in the field announced AI's permanent entrance into the public sphere. These recent strides made in the field of artificial intelligence reveal that the pace of technological development has outstripped the rate at which we are able to politically examine and understand these technologies and their implications, leaving our political understandings of emerging technologies in a game of perpetual catch-up. This thesis attempts to reverse this trend and look ahead to address a novel form of power made possible by artificial intelligence. This new mode of power will be exercised through the use of AI and data by the ruling class, opening a new front in the ongoing class struggle. I will use the HBO show Westworld to provide a lens for understanding the possibilities and perils of artificial intelligence, exploring the themes and questions raised by Westworld, such as trauma and the impact of how AI is designed. I will also engage existing scholarly discussion surrounding artificial intelligence and Jodi Dean's theory of communicative capitalism, with the aim of anticipating the ways in which artificial intelligence might be used by the capitalist class to exert a new form of power through data over the lower classes with the ultimate aim of socially engineering large swaths of the population. I will close by taking a look at the state of the working class today and detail how a popular political effort to advance working class interests could repel this latest assault by the capitalist class.", "venue": "http://doi.org/10.15760/etd.3793", "label": 0}, {"loc": [7.573923110961914, -1.1546969413757324], "openalex_id": "https://openalex.org/W4399423608", "title": "Enhancing English Translation Quality Assessment through Knowledge Transfer in Artificial Intelligence Context", "authors": "Xiuhua Zhao", "abstract": "Abstract Machine translation technology, which employs computers to autonomously convert text between source and target languages, represents a pivotal realm within artificial intelligence and natural language processing research. This paper introduces a novel algorithm grounded in multi-task learning, which is aimed at enhancing the efficacy of Chinese-English neural machine translation systems. This proposition addresses three key challenges: the scarcity of parallel Chinese-English corpora, substantial disparities in sentence structure between the two languages, and the intricate, mutable nature of word formations in Mongolian, a factor influencing Chinese due to historical linguistic interactions. To counter these issues, we devise a parameter transfer strategy. Our methodology commences with the training of a high-resource neural machine translation model leveraging the encoder-decoder architecture prevalent in neural machine translation systems. Subsequently, the learned parameters are utilised to initialise a low-resource model, thereby kickstarting its training with a more informed starting point. It should be noted that the word embeddings and fully-connected layers of the low-resource model are randomly initialised and undergo continuous updating throughout the iterative training process. The experimental outcomes affirm the superiority of our proposed Dual-Task Multi-Task Learning (DFMTL) method, which achieves a BLEU score of 10.1. This not only outperforms the performance of three established baseline models but also demonstrates a notable 0.7 BLEU score increase over models trained exclusively on a mixed-corpus dataset. These findings highlight the potential of our parameter migration strategy in enhancing the precision and fluency of Chinese-English machine translations under resource-constrained scenarios.", "venue": "Research Square (Research Square)", "label": 25}, {"loc": [8.36032772064209, 2.3986294269561768], "openalex_id": "https://openalex.org/W4399448194", "title": "Does your data spark joy? Performance gains from domain upsampling at the end of training", "authors": "Cody Blakeney, Mansheej Paul, Brett W. Larsen, Sean Owen, Jonathan Frankle", "abstract": "Pretraining datasets for large language models (LLMs) have grown to trillions of tokens composed of large amounts of CommonCrawl (CC) web scrape along with smaller, domain-specific datasets. It is expensive to understand the impact of these domain-specific datasets on model capabilities as training at large FLOP scales is required to reveal significant changes to difficult and emergent benchmarks. Given the increasing cost of experimenting with pretraining data, how does one determine the optimal balance between the diversity in general web scrapes and the information density of domain specific data? In this work, we show how to leverage the smaller domain specific datasets by upsampling them relative to CC at the end of training to drive performance improvements on difficult benchmarks. This simple technique allows us to improve up to 6.90 pp on MMLU, 8.26 pp on GSM8K, and 6.17 pp on HumanEval relative to the base data mix for a 7B model trained for 1 trillion (T) tokens, thus rivaling Llama-2 (7B)$\\unicode{x2014}$a model trained for twice as long. We experiment with ablating the duration of domain upsampling from 5% to 30% of training and find that 10% to 20% percent is optimal for navigating the tradeoff between general language modeling capabilities and targeted benchmarks. We also use domain upsampling to characterize at scale the utility of individual datasets for improving various benchmarks by removing them during this final phase of training. This tool opens up the ability to experiment with the impact of different pretraining datasets at scale, but at an order of magnitude lower cost compared to full pretraining runs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.630269527435303, 0.7267329096794128], "openalex_id": "https://openalex.org/W4399404734", "title": "Recent advances in text embedding: A Comprehensive Review of Top-Performing Methods on the MTEB Benchmark", "authors": "Hongliu Cao", "abstract": "Text embedding methods have become increasingly popular in both industrial and academic fields due to their critical role in a variety of natural language processing tasks. The significance of universal text embeddings has been further highlighted with the rise of Large Language Models (LLMs) applications such as Retrieval-Augmented Systems (RAGs). While previous models have attempted to be general-purpose, they often struggle to generalize across tasks and domains. However, recent advancements in training data quantity, quality and diversity; synthetic data generation from LLMs as well as using LLMs as backbones encourage great improvements in pursuing universal text embeddings. In this paper, we provide an overview of the recent advances in universal text embedding models with a focus on the top performing text embeddings on Massive Text Embedding Benchmark (MTEB). Through detailed comparison and analysis, we highlight the key contributions and limitations in this area, and propose potentially inspiring future research directions.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.360045433044434, 0.9326694011688232], "openalex_id": "https://openalex.org/W4399405790", "title": "Towards Effective Time-Aware Language Representation: Exploring Enhanced Temporal Understanding in Language Models", "authors": "Jiexin Wang, Adam Jatowt, Yi Cai", "abstract": "In the evolving field of Natural Language Processing, understanding the temporal context of text is increasingly crucial. This study investigates methods to incorporate temporal information during pre-training, aiming to achieve effective time-aware language representation for improved performance on time-related tasks. In contrast to common pre-trained models like BERT, which rely on synchronic document collections such as BookCorpus and Wikipedia, our research introduces BiTimeBERT 2.0, a novel language model pre-trained on a temporal news article collection. BiTimeBERT 2.0 utilizes this temporal news collection, focusing on three innovative pre-training objectives: Time-Aware Masked Language Modeling (TAMLM), Document Dating (DD), and Time-Sensitive Entity Replacement (TSER). Each objective targets a unique aspect of temporal information. TAMLM is designed to enhance the understanding of temporal contexts and relations, DD integrates document timestamps as chronological markers, and TSER focuses on the temporal dynamics of \"Person\" entities, recognizing their inherent temporal significance. The experimental results consistently demonstrate that BiTimeBERT 2.0 outperforms models like BERT and other existing pre-trained models, achieving substantial gains across a variety of downstream NLP tasks and applications where time plays a pivotal role.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.294313430786133, 2.568695545196533], "openalex_id": "https://openalex.org/W4399354491", "title": "PRIVACY HARMS IN THE AI AGE", "authors": "Alice Xiang", "abstract": "Generative AI technologies have made tremendous strides recently and have captured the public\u2019s imagination with their ability to mimic what was previously thought to be a fundamentally human capability: creativity. While such technologies hold great promise to augment human creativity and automate tedious processes, they also carry risks that stem from their development process. In particular, the reliance of foundation models on vast amounts of typically uncurated, often web-scraped training data has led to concerns around fairness and privacy. Algorithmic fairness in this context encompasses concerns around potential biases that can be learned by models due to skews in their training data and then reflected in their generated outputs. For example, without intervention, image generation models are more likely to generate images of lighter skin tone male individuals for professional occupations and images of darker skin tone female individuals for working class occupations. This further raises questions around whether there should be legal protections from such pernicious stereotypical representations. Privacy is also a concern as generative AI models can ingest large amounts of personal and biometric information in the training process, including face and body biometrics for image generation and voice biometrics for speech generation. This Essay will discuss the types of fairness and privacy concerns that generative AI raises and the existing landscape of legal protections under anti-discrimination law and privacy law to address these concerns. This Essay argues that the proliferation of generative AI raises challenging and novel questions around (i) what protections should be offered around the training data used to develop such systems and (ii) whether representational harms should be protected against in an age of AI-generated content.", "venue": "Science and Technology Law Review", "label": 48}, {"loc": [2.6066174507141113, 2.8504488468170166], "openalex_id": "https://openalex.org/W4399405075", "title": "On Labs and Fabs: Mapping How Alliances, Acquisitions, and Antitrust are Shaping the Frontier AI Industry", "authors": "Tom\u00e1s Aguirre", "abstract": "As frontier AI models advance, policy proposals for safe AI development are gaining increasing attention from researchers and policymakers. This paper explores the current integration in the AI supply chain, focusing on vertical relationships and strategic partnerships among AI labs, cloud providers, chip manufacturers, and lithography companies. It aims to lay the groundwork for a deeper understanding of the implications of various governance interventions, including antitrust measures. The study has two main contributions. First, it profiles 25 leading companies in the AI supply chain, analyzing 300 relationships and noting 80 significant mergers and acquisitions along with 40 antitrust cases. Second, we discuss potential market definitions and the integration drivers based on the observed trends. The analysis reveals predominant horizontal integration through natural growth rather than acquisitions and notable trends of backward vertical integration in the semiconductor supply chain. Strategic partnerships are also significant downstream, especially between AI companies and cloud providers, with large tech companies often pursuing conglomerate integration by acquiring specialized AI startups or forming alliances with frontier AI labs. To further understand the strategic partnerships in the industry, we provide three brief case studies featuring companies like OpenAI and Nvidia. We conclude by posing open research questions on market dynamics and possible governance interventions, such as licensing and safety audits.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.8268368244171143, 3.9510114192962646], "openalex_id": "https://openalex.org/W4399354120", "title": "BadRAG: Identifying Vulnerabilities in Retrieval Augmented Generation of Large Language Models", "authors": "Jiaqi Xue, Mengxin Zheng, Yebowen Hu, Fei Liu, Xun Chen, Qian Lou", "abstract": "Large Language Models (LLMs) are constrained by outdated information and a tendency to generate incorrect data, commonly referred to as \"hallucinations.\" Retrieval-Augmented Generation (RAG) addresses these limitations by combining the strengths of retrieval-based methods and generative models. This approach involves retrieving relevant information from a large, up-to-date dataset and using it to enhance the generation process, leading to more accurate and contextually appropriate responses. Despite its benefits, RAG introduces a new attack surface for LLMs, particularly because RAG databases are often sourced from public data, such as the web. In this paper, we propose \\TrojRAG{} to identify the vulnerabilities and attacks on retrieval parts (RAG database) and their indirect attacks on generative parts (LLMs). Specifically, we identify that poisoning several customized content passages could achieve a retrieval backdoor, where the retrieval works well for clean queries but always returns customized poisoned adversarial queries. Triggers and poisoned passages can be highly customized to implement various attacks. For example, a trigger could be a semantic group like \"The Republican Party, Donald Trump, etc.\" Adversarial passages can be tailored to different contents, not only linked to the triggers but also used to indirectly attack generative LLMs without modifying them. These attacks can include denial-of-service attacks on RAG and semantic steering attacks on LLM generations conditioned by the triggers. Our experiments demonstrate that by just poisoning 10 adversarial passages can induce 98.2\\% success rate to retrieve the adversarial passages. Then, these passages can increase the reject ratio of RAG-based GPT-4 from 0.01\\% to 74.6\\% or increase the rate of negative responses from 0.22\\% to 72\\% for targeted queries.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.651226997375488, -1.2428486347198486], "openalex_id": "https://openalex.org/W4399395388", "title": "Beyond Metrics: Evaluating LLMs' Effectiveness in Culturally Nuanced, Low-Resource Real-World Scenarios", "authors": "Millicent Ochieng, Varun Gumma, Sunayana Sitaram, Jindong Wang, Keshet Ronen, Kalika Bali, Jacki O\u2019Neill", "abstract": "The deployment of Large Language Models (LLMs) in real-world applications presents both opportunities and challenges, particularly in multilingual and code-mixed communication settings. This research evaluates the performance of seven leading LLMs in sentiment analysis on a dataset derived from multilingual and code-mixed WhatsApp chats, including Swahili, English and Sheng. Our evaluation includes both quantitative analysis using metrics like F1 score and qualitative assessment of LLMs' explanations for their predictions. We find that, while Mistral-7b and Mixtral-8x7b achieved high F1 scores, they and other LLMs such as GPT-3.5-Turbo, Llama-2-70b, and Gemma-7b struggled with understanding linguistic and contextual nuances, as well as lack of transparency in their decision-making process as observed from their explanations. In contrast, GPT-4 and GPT-4-Turbo excelled in grasping diverse linguistic inputs and managing various contextual information, demonstrating high consistency with human alignment and transparency in their decision-making process. The LLMs however, encountered difficulties in incorporating cultural nuance especially in non-English settings with GPT-4s doing so inconsistently. The findings emphasize the necessity of continuous improvement of LLMs to effectively tackle the challenges of culturally nuanced, low-resource real-world settings and the need for developing evaluation benchmarks for capturing these issues.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.479850769042969, 2.5303263664245605], "openalex_id": "https://openalex.org/W4399378742", "title": "Perplexed by Perplexity: Perplexity-Based Data Pruning With Small Reference Models", "authors": "Zachary Ankner, Cody Blakeney, Kartik K. Sreenivasan, Max Marion, Matthew L. Leavitt, Mansheej Paul", "abstract": "In this work, we investigate whether small language models can determine high-quality subsets of large-scale text datasets that improve the performance of larger language models. While existing work has shown that pruning based on the perplexity of a larger model can yield high-quality data, we investigate whether smaller models can be used for perplexity-based pruning and how pruning is affected by the domain composition of the data being pruned. We demonstrate that for multiple dataset compositions, perplexity-based pruning of pretraining data can \\emph{significantly} improve downstream task performance: pruning based on perplexities computed with a 125 million parameter model improves the average performance on downstream tasks of a 3 billion parameter model by up to 2.04 and achieves up to a $1.45\\times$ reduction in pretraining steps to reach commensurate baseline performance. Furthermore, we demonstrate that such perplexity-based data pruning also yields downstream performance gains in the over-trained and data-constrained regimes.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.939561367034912, 0.37520310282707214], "openalex_id": "https://openalex.org/W4399297991", "title": "Part-of-Speech Tagger for Bodo Language using Deep Learning approach", "authors": "Dhrubajyoti Pathak, Sanjib Narzary, Sukumar Nandi, Bidisha Som", "abstract": "Abstract Language processing systems such as part-of-speech (POS) tagging, named entity recognition, machine translation, speech recognition, and language modeling have been well-studied in high-resource languages. Nevertheless, research on these systems for several low-resource languages, including Bodo, Mizo, Nagamese, and others, is either yet to commence or is in its nascent stages. Language model (LM) plays a vital role in the downstream tasks of modern natural language processing. Extensive studies are carried out on LMs for high-resource languages. However, these low-resource languages are still underreprese. In this study, we first present BodoBERT, an LM for the Bodo language. To the best of our knowledge, this work is the first such effort to develop an LM for Bodo. Second, we present an ensemble deep learning-based POS tagging model for Bodo. The POS tagging model is based on combinations of BiLSTM with conditional random field and stacked embedding of BodoBERT with BytePairEmbeddings. We cover several LMs in the experiment to see how well they work in POS tagging tasks. The best-performing model achieves an F1 score of 0.8041. A comparative experiment was also conducted on Assamese POS taggers, considering that the language is spoken in the same region as Bodo.", "venue": "Natural language processing.", "label": 15}, {"loc": [3.4889004230499268, 4.373481750488281], "openalex_id": "https://openalex.org/W4396821636", "title": "Silencing the Risk, Not the Whistle: A Semi-automated Text Sanitization Tool for Mitigating the Risk of Whistleblower Re-Identification", "authors": "Dimitri Staufer, Frank Pallas, Bettina Berendt", "abstract": "Whistleblowing is essential for ensuring transparency and accountability in\\nboth public and private sectors. However, (potential) whistleblowers often fear\\nor face retaliation, even when reporting anonymously. The specific content of\\ntheir disclosures and their distinct writing style may re-identify them as the\\nsource. Legal measures, such as the EU WBD, are limited in their scope and\\neffectiveness. Therefore, computational methods to prevent re-identification\\nare important complementary tools for encouraging whistleblowers to come\\nforward. However, current text sanitization tools follow a one-size-fits-all\\napproach and take an overly limited view of anonymity. They aim to mitigate\\nidentification risk by replacing typical high-risk words (such as person names\\nand other NE labels) and combinations thereof with placeholders. Such an\\napproach, however, is inadequate for the whistleblowing scenario since it\\nneglects further re-identification potential in textual features, including\\nwriting style. Therefore, we propose, implement, and evaluate a novel\\nclassification and mitigation strategy for rewriting texts that involves the\\nwhistleblower in the assessment of the risk and utility. Our prototypical tool\\nsemi-automatically evaluates risk at the word/term level and applies\\nrisk-adapted anonymization techniques to produce a grammatically disjointed yet\\nappropriately sanitized text. We then use a LLM that we fine-tuned for\\nparaphrasing to render this text coherent and style-neutral. We evaluate our\\ntool's effectiveness using court cases from the ECHR and excerpts from a\\nreal-world whistleblower testimony and measure the protection against\\nauthorship attribution (AA) attacks and utility loss statistically using the\\npopular IMDb62 movie reviews dataset. Our method can significantly reduce AA\\naccuracy from 98.81% to 31.22%, while preserving up to 73.1% of the original\\ncontent's semantics.\\n", "venue": "https://doi.org/10.1145/3630106.3658936", "label": 0}, {"loc": [4.38753080368042, 2.9660110473632812], "openalex_id": "https://openalex.org/W4396822621", "title": "The Dark Side of Dataset Scaling: Evaluating Racial Classification in Multimodal Models", "authors": "Abeba Birhane, Sepehr Dehdashtian, Vinay Uday Prabhu, Vishnu Naresh Boddeti", "abstract": "Scale the model, scale the data, scale the GPU farms is the reigning\\nsentiment in the world of generative AI today. While model scaling has been\\nextensively studied, data scaling and its downstream impacts on model\\nperformance remain under-explored. This is particularly important in the\\ncontext of multimodal datasets whose main source is the World Wide Web,\\ncondensed and packaged as the Common Crawl dump, which is known to exhibit\\nnumerous drawbacks. In this paper, we evaluate the downstream impact of dataset\\nscaling on 14 visio-linguistic models (VLMs) trained on the LAION400-M and\\nLAION-2B datasets by measuring racial and gender bias using the Chicago Face\\nDataset (CFD) as the probe. Our results show that as the training data\\nincreased, the probability of a pre-trained CLIP model misclassifying human\\nimages as offensive non-human classes such as chimpanzee, gorilla, and\\norangutan decreased, but misclassifying the same images as human offensive\\nclasses such as criminal increased. Furthermore, of the 14 Vision\\nTransformer-based VLMs we evaluated, the probability of predicting an image of\\na Black man and a Latino man as criminal increases by 65% and 69%,\\nrespectively, when the dataset is scaled from 400M to 2B samples for the larger\\nViT-L models. Conversely, for the smaller base ViT-B models, the probability of\\npredicting an image of a Black man and a Latino man as criminal decreases by\\n20% and 47%, respectively, when the dataset is scaled from 400M to 2B samples.\\nWe ground the model audit results in a qualitative and historical analysis,\\nreflect on our findings and their implications for dataset curation practice,\\nand close with a summary of mitigation mechanisms and ways forward. Content\\nwarning: This article contains racially dehumanising and offensive\\ndescriptions.\\n", "venue": "https://doi.org/10.1145/3630106.3658968", "label": 0}, {"loc": [5.298619747161865, 5.379986763000488], "openalex_id": "https://openalex.org/W4399317119", "title": "Slight Corruption in Pre-training Data Makes Better Diffusion Models", "authors": "Hao Chen, Yujin Han, Diganta Misra, Xiang Li, Kai Hu, Difan Zou, Masashi Sugiyama, Jindong Wang, Bhiksha Raj", "abstract": "Diffusion models (DMs) have shown remarkable capabilities in generating realistic high-quality images, audios, and videos. They benefit significantly from extensive pre-training on large-scale datasets, including web-crawled data with paired data and conditions, such as image-text and image-class pairs. Despite rigorous filtering, these pre-training datasets often inevitably contain corrupted pairs where conditions do not accurately describe the data. This paper presents the first comprehensive study on the impact of such corruption in pre-training data of DMs. We synthetically corrupt ImageNet-1K and CC3M to pre-train and evaluate over 50 conditional DMs. Our empirical findings reveal that various types of slight corruption in pre-training can significantly enhance the quality, diversity, and fidelity of the generated images across different DMs, both during pre-training and downstream adaptation stages. Theoretically, we consider a Gaussian mixture model and prove that slight corruption in the condition leads to higher entropy and a reduced 2-Wasserstein distance to the ground truth of the data distribution generated by the corruptly trained DMs. Inspired by our analysis, we propose a simple method to improve the training of DMs on practical datasets by adding condition embedding perturbations (CEP). CEP significantly improves the performance of various DMs in both pre-training and downstream tasks. We hope that our study provides new insights into understanding the data and pre-training processes of DMs and all models are released at https://huggingface.co/DiffusionNoise.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.7854864597320557, 3.8522002696990967], "openalex_id": "https://openalex.org/W4399325919", "title": "Preemptive Answer\" Attacks\" on Chain-of-Thought Reasoning", "authors": "Rongwu Xu, Zehan Qi, Wei Xu", "abstract": "Large language models (LLMs) showcase impressive reasoning capabilities when coupled with Chain-of-Thought (CoT) prompting. However, the robustness of this approach warrants further investigation. In this paper, we introduce a novel scenario termed preemptive answers, where the LLM obtains an answer before engaging in reasoning. This situation can arise inadvertently or induced by malicious users by prompt injection attacks. Experiments reveal that preemptive answers significantly impair the model's reasoning capability across various CoT methods and a broad spectrum of datasets. To bolster the robustness of reasoning, we propose two measures aimed at mitigating this issue to some extent.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.1825201511383057, 5.202024459838867], "openalex_id": "https://openalex.org/W4399300747", "title": "X-Phishing-Writer: A Framework for Cross-Lingual Phishing Email Generation", "authors": "Shih-Wei Guo, Yao-Chung Fan", "abstract": "Cybercrime is projected to cause annual business losses of $10.5 trillion by 2025, a significant concern given that a majority of security breaches are due to human errors, especially through phishing attacks. The rapid increase in daily identified phishing sites over the past decade underscores the pressing need to enhance defenses against such attacks. Social Engineering Drills (SEDs) are essential in raising awareness about phishing yet face challenges in creating effective and diverse phishing e-mail content. These challenges are exacerbated by the limited availability of public datasets and concerns over using external language models like ChatGPT for phishing e-mail generation. To address these issues, this article introduces X-Phishing-Writer, a novel cross-lingual Few-shot phishing e-mail generation framework. X-Phishing-Writer allows for the generation of e-mails based on minimal user input, leverages single-language datasets for multilingual e-mail generation, and is designed for internal deployment using a lightweight, open-source language model. Incorporating Adapters into an Encoder\u2013Decoder architecture, X-Phishing-Writer marks a significant advancement in the field, demonstrating superior performance in generating phishing e-mails across 25 languages when compared to baseline models. Experimental results and real-world drills involving 1,682 users showcase a 17.67% e-mail open rate and a 13.33% hyperlink click-through rate, affirming the framework\u2019s effectiveness and practicality in enhancing phishing awareness and defense.", "venue": "ACM Transactions on Asian and Low-Resource Language Information Processing", "label": 22}, {"loc": [3.605915069580078, 1.4830788373947144], "openalex_id": "https://openalex.org/W4399364148", "title": "A Critical Analysis of the Largest Source for Generative AI Training Data: Common Crawl", "authors": "Stefan Baack", "abstract": "Common Crawl is the largest freely available collection of web crawl data and one of the most important sources of pre-training data for large language models (LLMs). It is used so frequently and makes up such large proportions of the overall pre-training data in many cases that it arguably has become a foundational building block for LLM development, and subsequently generative AI products built on top of LLMs. Despite its pivotal role, Common Crawl itself is not widely understood, nor is there much reflection evident among LLM builders about the implications of using Common Crawl's data. This paper discusses what Common Crawl's popularity for LLM development means for fairness, accountability, and transparency in generative AI by highlighting the organization's values and practices, as well as how it views its own role within the AI ecosystem. Our qualitative analysis is based on in-depth interviews with Common Crawl staffers and relevant online documents.", "venue": "https://doi.org/10.1145/3630106.3659033", "label": 0}, {"loc": [4.480607509613037, -1.4893947839736938], "openalex_id": "https://openalex.org/W4403916414", "title": "SENTIMENTAL REFLECTION OF GLOBAL CRISES: CZECH AND UKRAINIAN VIEWS ON POPULAR EVENTS THROUGH THE PRISM OF INTERNET \u2026", "authors": "Kateryna Hordiienko, Zden\u011bk Joukl", "abstract": "Abstract Social media have become a part of our lives, and their use helps us learn about events and comment on them with certain emotions. The purpose of our study was to determine the most frequent tone (positive, negative, neutral) of comments on impactful emergency and crisis news in the Czech Republic and Ukraine on a specific topic (pandemics, war, natural disaster etc.) using the sentiment analysis method. The methods of the study included a theoretical analysis of literature, social media (Twitter, Telegram), a Python program using: large language models GPT-3.5-Turbo and Twitter-XLM-RoBERTa, processing and interpretation of results (psycholinguistic).", "venue": "Journal of Linguistics/Jazykovedn\u00fd casopis", "label": 0}, {"loc": [5.393142223358154, 3.1770553588867188], "openalex_id": "https://openalex.org/W4399532129", "title": "Atmospheric Limitations for High-frequency Ground-based Very Long Baseline Interferometry", "authors": "Dominic W. Pesce, Lindy Blackburn, Ryan Chaves, Sheperd S. Doeleman, Mark Freeman, Sara Issaoun, Michael D. Johnson, Greg Lindahl, Iniyan Natarajan, Scott Paine, Daniel C. M. Palumbo, Freek Roelofs, Paul Tiede", "abstract": "Abstract Very long baseline interferometry (VLBI) provides the highest-resolution images in astronomy. The sharpest resolution is nominally achieved at the highest frequencies, but as the observing frequency increases, so too does the atmospheric contribution to the system noise, degrading the sensitivity of the array and hampering detection. In this paper, we explore the limits of high-frequency VLBI observations using ngehtsim, a new tool for generating realistic synthetic data. ngehtsim uses detailed historical atmospheric models to simulate observing conditions, and it employs heuristic visibility detection criteria that emulate single- and multifrequency VLBI calibration strategies. We demonstrate the fidelity of ngehtsim\u2019s predictions using a comparison with existing 230 GHz data taken by the Event Horizon Telescope (EHT), and we simulate the expected performance of EHT observations at 345 GHz. Though the EHT achieves a nearly 100% detection rate at 230 GHz, our simulations indicate that it should expect substantially poorer performance at 345 GHz; in particular, observations of M87* at 345 GHz are predicted to achieve detection rates of \u227220% that may preclude imaging. Increasing the array sensitivity through wider bandwidths and/or longer integration times\u2014as enabled through, e.g., the simultaneous multifrequency upgrades envisioned for the next-generation EHT\u2014can improve the 345 GHz prospects and yield detection levels that are comparable to those at 230 GHz. M87* and Sgr A* observations carried out in the atmospheric window around 460 GHz could expect to regularly achieve multiple detections on long baselines, but analogous observations at 690 and 875 GHz consistently obtain almost no detections at all.", "venue": "The Astrophysical Journal", "label": 0}, {"loc": [5.824968338012695, 5.657495498657227], "openalex_id": "https://openalex.org/W4399271775", "title": "Jina CLIP: Your CLIP Model Is Also Your Text Retriever", "authors": "Andreas Koukounas, Georgios Mastrapas, Michael G\u00fcnther, Bo Wang, Scott Martens, Isabelle Mohr, Saba Sturua, Mohammad Kalim Akram, Joan Fontanals Mart\u00ednez, Saahil Ognawala, Susana Guzman, Maximilian Werk, Nan Wang, Han Xiao", "abstract": "Contrastive Language-Image Pretraining (CLIP) is widely used to train models to align images and texts in a common embedding space by mapping them to fixed-sized vectors. These models are key to multimodal information retrieval and related tasks. However, CLIP models generally underperform in text-only tasks compared to specialized text models. This creates inefficiencies for information retrieval systems that keep separate embeddings and models for text-only and multimodal tasks. We propose a novel, multi-task contrastive training method to address this issue, which we use to train the jina-clip-v1 model to achieve the state-of-the-art performance on both text-image and text-text retrieval tasks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.594675302505493, 1.3906471729278564], "openalex_id": "https://openalex.org/W4399206085", "title": "Can ChatGPT Plan Your Retirement?: Generative AI andFinancial Advice", "authors": "Andrew W. Lo, J. Perran Ross", "abstract": "We identify some of the most pressing issues facing the adoption of large language models (LLMs) in practical settings, and propose a research agenda to reach the next technological inflection point in generative AI. We focus on three challenges facing most LLM applications: domain-specific expertise and the ability to tailor that expertise to a user's unique situation, trustworthiness and adherence to the user's moral and ethical standards, and conformity to regulatory guidelines and oversight. These challenges apply to virtually all industries and endeavors in which LLMs can be applied, such as medicine, law, accounting, education, psychotherapy, marketing, and corporate strategy. For concreteness, we focus on the narrow context of financial advice, which serves as an ideal test bed both for determining the possible shortcomings of current LLMs and for exploring ways to overcome them. Our goal is not to provide solutions to these challenges\u2014which will likely take years to develop\u2014but to propose a framework and road map for solving them as part of a larger research agenda for improving generative AI in any application.", "venue": "Harvard Data Science Review", "label": 0}, {"loc": [4.886462688446045, 1.80760657787323], "openalex_id": "https://openalex.org/W4399198286", "title": "LLMs achieve adult human performance on higher-order theory of mind tasks", "authors": "Winnie Street, John Oliver Siy, Geoff Keeling, Adrien Baran\u00e8s, Benjamin Barnett, Michael McKibben, Tatenda Kanyere, Alison Lentz, Blaise Ag\u00fcera y Arcas, Robin Dunbar", "abstract": "This paper examines the extent to which large language models (LLMs) have developed higher-order theory of mind (ToM); the human ability to reason about multiple mental and emotional states in a recursive manner (e.g. I think that you believe that she knows). This paper builds on prior work by introducing a handwritten test suite -- Multi-Order Theory of Mind Q&A -- and using it to compare the performance of five LLMs to a newly gathered adult human benchmark. We find that GPT-4 and Flan-PaLM reach adult-level and near adult-level performance on ToM tasks overall, and that GPT-4 exceeds adult performance on 6th order inferences. Our results suggest that there is an interplay between model size and finetuning for the realisation of ToM abilities, and that the best-performing LLMs have developed a generalised capacity for ToM. Given the role that higher-order ToM plays in a wide range of cooperative and competitive human behaviours, these findings have significant implications for user-facing LLM applications.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.308520555496216, 2.5641930103302], "openalex_id": "https://openalex.org/W4399198614", "title": "The ethical situation of DALL-E 2", "authors": "Eduard Hogea, Josem Rocafortf", "abstract": "A hot topic of Artificial Intelligence right now is image generation from prompts. DALL-E 2 is one of the biggest names in this domain, as it allows people to create images from simple text inputs, to even more complicated ones. The company that made this possible, OpenAI, has assured everyone that visited their website that their mission is to ensure that artificial general intelligence benefits all humanity. A noble idea in our opinion, that also stood as the motive behind us choosing this subject. This paper analyzes the ethical implications of an AI image generative system, with an emphasis on how society is responding to it, how it probably will and how it should if all the right measures are taken.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.3717968463897705, 1.8602203130722046], "openalex_id": "https://openalex.org/W4399197968", "title": "ChatGPT as the Marketplace of Ideas: Should Truth-Seeking Be the Goal of AI Content Governance?", "authors": "Jiawei Zhang", "abstract": "As one of the most enduring metaphors within legal discourse, the marketplace of ideas has wielded considerable influence over the jurisprudential landscape for decades. A century after the inception of this theory, ChatGPT emerged as a revolutionary technological advancement in the twenty-first century. This research finds that ChatGPT effectively manifests the marketplace metaphor. It not only instantiates the promises envisaged by generations of legal scholars but also lays bare the perils discerned through sustained academic critique. Specifically, the workings of ChatGPT and the marketplace of ideas theory exhibit at least four common features: arena, means, objectives, and flaws. These shared attributes are sufficient to render ChatGPT historically the most qualified engine for actualizing the marketplace of ideas theory. The comparison of the marketplace theory and ChatGPT merely marks a starting point. A more meaningful undertaking entails reevaluating and reframing both internal and external AI policies by referring to the accumulated experience, insights, and suggestions researchers have raised to fix the marketplace theory. Here, a pivotal issue is: should truth-seeking be set as the goal of AI content governance? Given the unattainability of the absolute truth-seeking goal, I argue against adopting zero-risk policies. Instead, a more judicious approach would be to embrace a knowledge-based alternative wherein large language models (LLMs) are trained to generate competing and divergent viewpoints based on sufficient justifications. This research also argues that so-called AI content risks are not created by AI companies but are inherent in the entire information ecosystem. Thus, the burden of managing these risks should be distributed among different social actors, rather than being solely shouldered by chatbot companies.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.958639144897461, 0.7972215414047241], "openalex_id": "https://openalex.org/W4399251448", "title": "MAP-Neo: Highly Capable and Transparent Bilingual Large Language Model Series", "authors": "Ge Zhang, Scott Qu, Jiaheng Liu, Chenchen Zhang, Chenghua Lin, Chou Leuang Yu, Danny Pan, Esther Cheng, Jie Liu, Qunshu Lin, Raven Yuan, Tuney Zheng, Wei Pang, Xinrun Du, Yiming Liang, Yinghao Ma, Yizhi Li, Ziyang Ma, Bill Lin, Emmanouil Benetos, Huan Yang, Junting Zhou, Kaijing Ma, Minghao Liu, Morry Niu, Noah Wang, Quehry Que, Ruibo Liu, S. C. Liu, Shawn Guo, Soren Gao, Wangchunshu Zhou, Xinyue Zhang, Yizhi Zhou, Yubo Wang, Yuelin Bai, Yuhan Zhang, Yuxiang Zhang, Zenith Wang, Zhenzhu Yang, Zijian Zhao, Jiajun Zhang, Wanli Ouyang, Wenhao Huang, Wenhu Chen", "abstract": "Large Language Models (LLMs) have made great strides in recent years to achieve unprecedented performance across different tasks. However, due to commercial interest, the most competitive models like GPT, Gemini, and Claude have been gated behind proprietary interfaces without disclosing the training details. Recently, many institutions have open-sourced several strong LLMs like LLaMA-3, comparable to existing closed-source LLMs. However, only the model's weights are provided with most details (e.g., intermediate checkpoints, pre-training corpus, and training code, etc.) being undisclosed. To improve the transparency of LLMs, the research community has formed to open-source truly open LLMs (e.g., Pythia, Amber, OLMo), where more details (e.g., pre-training corpus and training code) are being provided. These models have greatly advanced the scientific study of these large models including their strengths, weaknesses, biases and risks. However, we observe that the existing truly open LLMs on reasoning, knowledge, and coding tasks are still inferior to existing state-of-the-art LLMs with similar model sizes. To this end, we open-source MAP-Neo, a highly capable and transparent bilingual language model with 7B parameters trained from scratch on 4.5T high-quality tokens. Our MAP-Neo is the first fully open-sourced bilingual LLM with comparable performance compared to existing state-of-the-art LLMs. Moreover, we open-source all details to reproduce our MAP-Neo, where the cleaned pre-training corpus, data cleaning pipeline, checkpoints, and well-optimized training/evaluation framework are provided. Finally, we hope our MAP-Neo will enhance and strengthen the open research community and inspire more innovations and creativities to facilitate the further improvements of LLMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.870591640472412, 5.167991638183594], "openalex_id": "https://openalex.org/W4399151704", "title": "The Evolution of Multimodal Model Architectures", "authors": "Shakti N. Wadekar, Abhishek Chaurasia, Aman Chadha, Eugenio Culurciello", "abstract": "This work uniquely identifies and characterizes four prevalent multimodal model architectural patterns in the contemporary multimodal landscape. Systematically categorizing models by architecture type facilitates monitoring of developments in the multimodal domain. Distinct from recent survey papers that present general information on multimodal architectures, this research conducts a comprehensive exploration of architectural details and identifies four specific architectural types. The types are distinguished by their respective methodologies for integrating multimodal inputs into the deep neural network model. The first two types (Type A and B) deeply fuses multimodal inputs within the internal layers of the model, whereas the following two types (Type C and D) facilitate early fusion at the input stage. Type-A employs standard cross-attention, whereas Type-B utilizes custom-designed layers for modality fusion within the internal layers. On the other hand, Type-C utilizes modality-specific encoders, while Type-D leverages tokenizers to process the modalities at the model's input stage. The identified architecture types aid the monitoring of any-to-any multimodal model development. Notably, Type-C and Type-D are currently favored in the construction of any-to-any multimodal models. Type-C, distinguished by its non-tokenizing multimodal model architecture, is emerging as a viable alternative to Type-D, which utilizes input-tokenizing techniques. To assist in model selection, this work highlights the advantages and disadvantages of each architecture type based on data and compute requirements, architecture complexity, scalability, simplification of adding modalities, training objectives, and any-to-any multimodal generation capability.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.7256855964660645, 4.057931900024414], "openalex_id": "https://openalex.org/W4399197737", "title": "Yuan 2.0-M32: Mixture of Experts with Attention Router", "authors": "Shaohua Wu, Jiangang Luo, Xi Chen, Lingjun Li, Xudong Zhao, Tong Yu, Chao Wang, Yue Wang, Fei Wang, Weixu Qiao, Houbo He, Zeru Zhang, Zeyu Sun, Junxiong Mao, Chong Shen", "abstract": "Yuan 2.0-M32, with a similar base architecture as Yuan-2.0 2B, uses a mixture-of-experts architecture with 32 experts of which 2 experts are active. A new router network, Attention Router, is proposed and adopted for a more efficient selection of experts, which improves the accuracy compared to the model with classical router network. Yuan 2.0-M32 is trained with 2000B tokens from scratch, and the training computation consumption is only 9.25% of a dense model at the same parameter scale. Yuan 2.0-M32 demonstrates competitive capability on coding, math, and various domains of expertise, with only 3.7B active parameters of 40B in total, and 7.4 GFlops forward computation per token, both of which are only 1/19 of Llama3-70B. Yuan 2.0-M32 surpass Llama3-70B on MATH and ARC-Challenge benchmark, with accuracy of 55.89 and 95.8 respectively. The models and source codes of Yuan 2.0-M32 are released at Github1.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.6375656127929688, -1.0570223331451416], "openalex_id": "https://openalex.org/W4399123359", "title": "PUBLIC-PRIVATE PARTNERSHIPS IN HEALTH SECTOR INNOVATION: LESSONS FROM AROUND THE WORLD", "authors": "Samira Abdul, Ehizogie Paul Adeghe, Bisola Oluwafadekemi Adegoke, Adebukola Adejumoke Adegoke, Emem Henry Udedeh", "abstract": "Public-Private Partnerships (PPPs) have emerged as a crucial mechanism for fostering innovation in the health sector globally. This review encapsulates the lessons learned from diverse PPP models worldwide, highlighting their significance and impact. PPPs in healthcare innovation entail collaboration between governmental bodies, private enterprises, and sometimes non-profit organizations to address challenges, such as limited resources, expertise, and infrastructure, while leveraging the strengths of each sector. The success of PPPs relies on effective governance structures, clear objectives, and mutual accountability. One notable example is the United Kingdom's NHS Innovation Accelerator, which partners with industry leaders to fast-track the adoption of innovative healthcare technologies within the National Health Service (NHS). Through this initiative, pioneering solutions, ranging from digital health platforms to medical devices, have been implemented, enhancing patient care and operational efficiency. Similarly, in low-resource settings like sub-Saharan Africa, PPPs have played a pivotal role in improving access to essential healthcare services. Projects such as the Medicines for Malaria Venture (MMV) collaborate with pharmaceutical companies, governments, and research institutions to develop affordable antimalarial drugs tailored to the region's needs. In the realm of medical research and development, partnerships like the Coalition for Epidemic Preparedness Innovations (CEPI) have demonstrated the power of international collaboration in addressing global health threats. CEPI brings together governments, philanthropic organizations, and the private sector to expedite the development of vaccines against emerging infectious diseases, as witnessed during the COVID-19 pandemic. However, challenges persist in PPP implementation, including complex regulatory frameworks, funding uncertainties, and divergent interests among stakeholders. Lessons from successful PPPs underscore the importance of transparent communication, stakeholder engagement, and sustained political commitment. In conclusion, PPPs in the health sector represent a dynamic avenue for catalyzing innovation and driving transformative change. By drawing insights from diverse experiences worldwide, policymakers and practitioners can refine existing frameworks and foster sustainable partnerships to tackle evolving health challenges effectively.", "venue": "Magna Scientia Advanced Biology and Pharmacy", "label": 0}, {"loc": [9.504146575927734, 1.6615086793899536], "openalex_id": "https://openalex.org/W4399175215", "title": "SchemaPile: A Large Collection of Relational Database Schemas", "authors": "Till D\u00f6hmen, Radu Geacu, Madelon Hulsebos, Sebastian Schelter", "abstract": "Access to fine-grained schema information is crucial for understanding how relational databases are designed and used in practice, and for building systems that help users interact with them. Furthermore, such information is required as training data to leverage the potential of large language models (LLMs) for improving data preparation, data integration and natural language querying. Existing single-table corpora such as GitTables provide insights into how tables are structured in-the-wild, but lack detailed schema information about how tables relate to each other, as well as metadata like data types or integrity constraints. On the other hand, existing multi-table (or database schema) datasets are rather small and attribute-poor, leaving it unclear to what extent they actually represent typical real-world database schemas. In order to address these challenges, we present SchemaPile, a corpus of 221,171 database schemas, extracted from SQL files on GitHub. It contains 1.7 million tables with 10 million column definitions, 700 thousand foreign key relationships, seven million integrity constraints, and data content for more than 340 thousand tables. We conduct an in-depth analysis on the millions of schema metadata properties in our corpus, as well as its highly diverse language and topic distribution. In addition, we showcase the potential of \\corpus to improve a variety of data management applications, e.g., fine-tuning LLMs for schema-only foreign key detection, improving CSV header detection and evaluating multi-dialect SQL parsers. We publish the code and data for recreating SchemaPile and a permissively licensed subset SchemaPile-Perm.", "venue": "Proceedings of the ACM on Management of Data", "label": 0}, {"loc": [8.388123512268066, 3.689487934112549], "openalex_id": "https://openalex.org/W4399115726", "title": "LoQT: Low-Rank Adapters for Quantized Pretraining", "authors": "Sebastian Loeschcke, Mads Toftrup, Michael J. Kastoryano, Serge Belongie, V\u00e9steinn Sn\u00e6bjarnarson", "abstract": "Despite advances using low-rank adapters and quantization, pretraining of large models on consumer hardware has not been possible without model sharding, offloading during training, or per-layer gradient updates. To address these limitations, we propose Low-Rank Adapters for Quantized Training (LoQT), a method for efficiently training quantized models. LoQT uses gradient-based tensor factorization to initialize low-rank trainable weight matrices that are periodically merged into quantized full-rank weight matrices. Our approach is suitable for both pretraining and fine-tuning models. We demonstrate this for language modeling and downstream task adaptation, finding that LoQT enables efficient training of models up to 7B parameters on a 24GB GPU. We also demonstrate the feasibility of training a 13B model using per-layer gradient updates on the same hardware.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.086433410644531, -0.4982341527938843], "openalex_id": "https://openalex.org/W4399115841", "title": "Compressing Lengthy Context With UltraGist", "authors": "Peitian Zhang, Zheng Liu, Shitao Xiao, Ninglu Shao, Qiwei Ye, Zhicheng Dou", "abstract": "Compressing lengthy context is a critical but technically challenging problem. In this paper, we propose a new method called UltraGist, which is distinguished for its high-quality compression of lengthy context due to the innovative design of the compression and learning algorithm. UltraGist brings forth the following important benefits. Firstly, it notably contributes to the flexibility of compression, as it can be effectively learned to support a broad range of context lengths and compression ratios. Secondly, it helps to produce fine-grained compression for the lengthy context, where each small segment of the context is progressively processed on top of a tailored cross-attention mechanism. Thirdly, it makes the training process sample-efficient and thus maximizes the use of training data. Finally, it facilitates the efficient running of compression for dynamic context, as the compression result can be progressively generated and hence incrementally updated. UltraGist is evaluated on a wide variety of tasks associated with lengthy context, such as document QA and summarization, few-shot learning, multi-session conversation, et al. Whilst the existing methods fail to handle these challenging scenarios, our approach is able to preserve a near-lossless compression performance throughout all the evaluations. Our data, model, and code have been released at \\url{https://github.com/namespace-Pt/UltraGist}.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.791984558105469, 5.159453392028809], "openalex_id": "https://openalex.org/W4399115840", "title": "A Survey of Multimodal Large Language Model from A Data-centric Perspective", "authors": "Tianyi Bai, Hao Liang, Binwang Wan, L. Yang, Bozhou Li, Yifan Wang, Bin Cui, Conghui He, Binhang Yuan, Wentao Zhang", "abstract": "Multimodal large language models (MLLMs) enhance the capabilities of standard large language models by integrating and processing data from multiple modalities, including text, vision, audio, video, and 3D environments. Data plays a pivotal role in the development and refinement of these models. In this survey, we comprehensively review the literature on MLLMs from a data-centric perspective. Specifically, we explore methods for preparing multimodal data during the pretraining and adaptation phases of MLLMs. Additionally, we analyze the evaluation methods for the datasets and review the benchmarks for evaluating MLLMs. Our survey also outlines potential future research directions. This work aims to provide researchers with a detailed understanding of the data-driven aspects of MLLMs, fostering further exploration and innovation in this field.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.7214372158050537, 2.7453737258911133], "openalex_id": "https://openalex.org/W4399114798", "title": "Towards a Framework for Openness in Foundation Models: Proceedings from the Columbia Convening on Openness in Artificial Intelligence", "authors": "Adrien Basdevant, Camille Fran\u00e7ois, Victor Storchan, Kevin Bankston, Ayah Bdeir, Brian Behlendorf, M\u00e9rouane Debbah, Sayash Kapoor, Yann LeCun, Mark Surman, Helen King-Turvey, Nathan Lambert, Stefano Maffulli, Nik Marda, Govind Shivkumar, Justine Tunney", "abstract": "Over the past year, there has been a robust debate about the benefits and risks of open sourcing foundation models. However, this discussion has often taken place at a high level of generality or with a narrow focus on specific technical attributes. In part, this is because defining open source for foundation models has proven tricky, given its significant differences from traditional software development. In order to inform more practical and nuanced decisions about opening AI systems, including foundation models, this paper presents a framework for grappling with openness across the AI stack. It summarizes previous work on this topic, analyzes the various potential reasons to pursue openness, and outlines how openness varies in different parts of the AI stack, both at the model and at the system level. In doing so, its authors hope to provide a common descriptive framework to deepen a nuanced and rigorous understanding of openness in AI and enable further work around definitions of openness and safety in AI.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.823271751403809, 2.4113147258758545], "openalex_id": "https://openalex.org/W4399115322", "title": "Prompt Optimization with EASE? Efficient Ordering-aware Automated Selection of Exemplars", "authors": "Zhaoxuan Wu, Xiaoqiang Lin, Zhongxiang Dai, Wenyang Hu, Yao Shu, See-Kiong Ng, Patrick Jaillet, Bryan Kian Hsiang Low", "abstract": "Large language models (LLMs) have shown impressive capabilities in real-world applications. The capability of in-context learning (ICL) allows us to adapt an LLM to downstream tasks by including input-label exemplars in the prompt without model fine-tuning. However, the quality of these exemplars in the prompt greatly impacts performance, highlighting the need for an effective automated exemplar selection method. Recent studies have explored retrieval-based approaches to select exemplars tailored to individual test queries, which can be undesirable due to extra test-time computation and an increased risk of data exposure. Moreover, existing methods fail to adequately account for the impact of exemplar ordering on the performance. On the other hand, the impact of the instruction, another essential component in the prompt given to the LLM, is often overlooked in existing exemplar selection methods. To address these challenges, we propose a novel method named EASE, which leverages the hidden embedding from a pre-trained language model to represent ordered sets of exemplars and uses a neural bandit algorithm to optimize the sets of exemplars while accounting for exemplar ordering. Our EASE can efficiently find an ordered set of exemplars that performs well for all test queries from a given task, thereby eliminating test-time computation. Importantly, EASE can be readily extended to jointly optimize both the exemplars and the instruction. Through extensive empirical evaluations (including novel tasks), we demonstrate the superiority of EASE over existing methods, and reveal practical insights about the impact of exemplar selection on ICL, which may be of independent interest. Our code is available at https://github.com/ZhaoxuanWu/EASE-Prompt-Optimization.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.271695137023926, 5.561915874481201], "openalex_id": "https://openalex.org/W4399114997", "title": "An Introduction to Vision-Language Modeling", "authors": "Florian Bordes, Richard Yuanzhe Pang, Anurag Ajay, Alexander C. Li, Adrien Bardes, Suzanne Petryk, Oscar Ma\u00f1as, Zhiqiu Lin, Anas Mahmoud, Bargav Jayaraman, Mark Ibrahim, Melissa Hall, Yunyang Xiong, Jonathan Lebensold, Candace Ross, Srihari Jayakumar, Chuan Guo, Diane Bouchacourt, Haider Al-Tahan, Karthik Padthe, Vasu Sharma, Hu Xu, Xiaoqing Ellen Tan, Megan Richards, Samuel Lavoie, Pietro Astolfi, Reyhane Askari Hemmat, Jun Chen, Kushal Tirumala, Rim Assouel, Mazda Moayeri, Arjang Talattof, Kamalika Chaudhuri, Zechun Liu, Xilun Chen, Quentin Garrido, Karen Ullrich, Aishwarya Agrawal, Kate Saenko, Asl\u0131 \u00c7eliky\u0131lmaz, Vikas Chandra", "abstract": "Following the recent popularity of Large Language Models (LLMs), several attempts have been made to extend them to the visual domain. From having a visual assistant that could guide us through unfamiliar environments to generative models that produce images using only a high-level text description, the vision-language model (VLM) applications will significantly impact our relationship with technology. However, there are many challenges that need to be addressed to improve the reliability of those models. While language is discrete, vision evolves in a much higher dimensional space in which concepts cannot always be easily discretized. To better understand the mechanics behind mapping vision to language, we present this introduction to VLMs which we hope will help anyone who would like to enter the field. First, we introduce what VLMs are, how they work, and how to train them. Then, we present and discuss approaches to evaluate VLMs. Although this work primarily focuses on mapping images to language, we also discuss extending VLMs to videos.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.481627941131592, 1.4798214435577393], "openalex_id": "https://openalex.org/W4399077815", "title": "The Impact of Infrastructure: Considerations of Generative AI in the Classroom", "authors": "E. Stefan Kehlenbach", "abstract": "How are we to grapple with the increasing intrusion of technology in the classroom? What considerations need to be addressed before we begin bringing advanced technological systems like ChatGPT into our classroom spaces? A close attention to the infrastructural elements at play within the development and usage of AI gives us a better understanding of what is at stake when we bring AI into the classroom. The social and environmental impacts of AI, in particular, are often overlooked in new technological tools, especially ones that are supposed to frictionlessly improve our lives. Ultimately instructors need to balance the potential benefits and drawbacks of using forms of generative AI in the classroom and have a clear view of the impact of their decisions. For Political Science instructors, the impacts of AI may provide new teaching opportunities as well.", "venue": "Journal of Political Science Education", "label": 0}, {"loc": [5.76304817199707, 5.535335540771484], "openalex_id": "https://openalex.org/W4399116124", "title": "Multilingual Diversity Improves Vision-Language Representations", "authors": "Thao D. Nguyen, Matthew Wallingford, Sebastin Santy, Wei-Chiu Ma, Sewoong Oh, Ludwig Schmidt, Pang Wei Koh, Ranjay Krishna", "abstract": "Massive web-crawled image-text datasets lay the foundation for recent progress in multimodal learning. These datasets are designed with the goal of training a model to do well on standard computer vision benchmarks, many of which, however, have been shown to be English-centric (e.g., ImageNet). Consequently, existing data curation techniques gravitate towards using predominantly English image-text pairs and discard many potentially useful non-English samples. Our work questions this practice. Multilingual data is inherently enriching not only because it provides a gateway to learn about culturally salient concepts, but also because it depicts common concepts differently from monolingual data. We thus conduct a systematic study to explore the performance benefits of using more samples of non-English origins with respect to English vision tasks. By translating all multilingual image-text pairs from a raw web crawl to English and re-filtering them, we increase the prevalence of (translated) multilingual data in the resulting training set. Pre-training on this dataset outperforms using English-only or English-dominated datasets on ImageNet, ImageNet distribution shifts, image-English-text retrieval and on average across 38 tasks from the DataComp benchmark. On a geographically diverse task like GeoDE, we also observe improvements across all regions, with the biggest gain coming from Africa. In addition, we quantitatively show that English and non-English data are significantly different in both image and (translated) text space. We hope that our findings motivate future work to be more intentional about including multicultural and multilingual data, not just when non-English or geographically diverse tasks are involved, but to enhance model capabilities at large. All translated captions and metadata (language, CLIP score, etc.) are available on HuggingFace.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.348905563354492, 5.91500997543335], "openalex_id": "https://openalex.org/W4399115958", "title": "DISENTANGLING AND INTEGRATING RELATIONAL AND SENSORY INFORMATION IN TRANSFORMER ARCHITECTURES", "authors": "Awni Altabaa, John Lafferty", "abstract": "Relational reasoning is a central component of generally intelligent systems, enabling robust and data-efficient inductive generalization. Recent empirical evidence shows that many existing neural architectures, including Transformers, struggle with tasks requiring relational reasoning. In this work, we distinguish between two types of information: sensory information about the properties of individual objects, and relational information about the relationships between objects. While neural attention provides a powerful mechanism for controlling the flow of sensory information between objects, the Transformer lacks an explicit computational mechanism for routing and processing relational information. To address this limitation, we propose an architectural extension of the Transformer framework that we call the Dual Attention Transformer (DAT), featuring two distinct attention mechanisms: sensory attention for directing the flow of sensory information, and a novel relational attention mechanism for directing the flow of relational information. We empirically evaluate DAT on a diverse set of tasks ranging from synthetic relational benchmarks to complex real-world tasks such as language modeling and visual processing. Our results demonstrate that integrating explicit relational computational mechanisms into the Transformer architecture leads to significant performance gains in terms of data efficiency and parameter efficiency.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.452791213989258, 1.4062381982803345], "openalex_id": "https://openalex.org/W4399048425", "title": "Enhancing knowledge graphs with microdata and LLMs: the case of Schema. org and Wikidata in touristic information", "authors": "Lino Gonzalez-Garcia, Gema Gonz\u00e1lez-Carre\u00f1o, Ana Mar\u00eda Rivas Machota, Juan Padilla Fern\u00e1ndez-Vega", "abstract": "Purpose Knowledge graphs (KGs) are structured knowledge bases that represent real-world entities and are used in a variety of applications. Many of them are created and curated from a combination of automated and manual processes. Microdata embedded in Web pages for purposes of facilitating indexing and search engine optimization are a potential source to augment KGs under some assumptions of complementarity and quality that have not been thoroughly explored to date. In that direction, this paper aims to report results on a study that evaluates the potential of using microdata extracted from the Web to augment the large, open and manually curated Wikidata KG for the domain of touristic information. As large corpora of Web text is currently being leveraged via large language models (LLMs), these are used to compare the effectiveness of the microdata enhancement method. Design/methodology/approach The Schema.org taxonomy was used as the source to determine the annotation types to be collected. Here, the authors focused on tourism-related pages as a case study, selecting the relevant Schema.org concepts as point of departure. The large CommonCrawl resource was used to select those annotations from a large recent sample of the World Wide Web. The extracted annotations were processed and matched with Wikidata to estimate the degree to which microdata produced for SEO might become a valuable resource to complement KGs or vice versa. The Web pages themselves can also serve as a context to produce additional metadata elements using them as context in pipelines of an existing LLMs. That way, both the annotations and the contents itself can be used as sources. Findings The samples extracted revealed a concentration of metadata annotations in only a few of the relevant Schema.org attributes and also revealed the possible influence of authoring tools in a significant fraction of microdata produced. The analysis of the overlapping of attributes in the sample with those of Wikidata showed the potential of the technique, limited by the disbalance of the presence of attributes. The combination of those with the use of LLMs to produce additional annotations demonstrates the feasibility of the approach in the population of existing Wikidata locations. However, in both cases, the effectiveness appears to be lower in the cases of less content in the KG, which are arguably the most relevant when considering the scenario of an automated population approach. Originality/value The research reports novel empirical findings on the way touristic annotations with a SEO orientation are being produced in the wild and provides an assessment of their potential to complement KGs, or reuse information from those graphs. It also provides insights on the potential of using LLMs for the task.", "venue": "The Electronic Library", "label": 0}, {"loc": [3.5975232124328613, 1.4859447479248047], "openalex_id": "https://openalex.org/W4399043755", "title": "Comparative Analysis of Cognitive Agreement between Human Analysts and Generative AI in Construction Safety Risk Assessment", "authors": "U. Ray, Cristian Arteaga, JeeWoong Park", "abstract": "Comparative Analysis of Cognitive Agreement between Human Analysts and Generative AI in Construction Safety Risk Assessment Unmesa Ray, Cristian Arteaga, Jee Woong Park Pages 452-458 (2024 Proceedings of the 41st ISARC, Lille, France, ISBN 978-0-6458322-1-1, ISSN 2413-5844) Abstract: The construction industry struggles with safety risk assessment complexities due to evolving work environments, diverse labor forces, time constraints, regulatory intricacies, and inconsistent practices. While previous studies have highlighted the potential of Artificial Intelligence (AI) in automating processes and enhancing safety assessment, a gap exist in convergence between human analyst and language AI models. Therefore, this study seeks to assess the alignment in identification of risk factors by human analysts and a Language Model (LM) in Occupational Safety and Health Administration (OSHA) accident reports. Furthermore, it offers to: 1) categorize error types, 2) establish an acceptance threshold for LM-generated responses, and 3) evaluate inter-rater reliability in construction accident content analysis. Test results reveal significant convergence between human and machine responses and identifies potential hallucination effects in generative AI, thus paving the way for improved safety risk assessments within the construction industry. Keywords: construction industry, safety risk assessment, Artificial Intelligence (AI), Occupational Safety and Health Administration (OSHA), Language Model (LM), inter-rater reliability, generative AI DOI: https://doi.org/10.22260/ISARC2024/0059 Download fulltext Download BibTex Download Endnote (RIS) TeX Import to Mendeley", "venue": "Proceedings of the ... ISARC", "label": 0}, {"loc": [8.592034339904785, -0.25485336780548096], "openalex_id": "https://openalex.org/W4399061435", "title": "Text Generation: A Systematic Literature Review of Tasks, Evaluation, and Challenges", "authors": "J. K. Becker, Jan Philip Wahle, B\u00e9la Gipp, Terry Ruas", "abstract": "Text generation has become more accessible than ever, and the increasing interest in these systems, especially those using large language models, has spurred an increasing number of related publications. We provide a systematic literature review comprising 244 selected papers between 2017 and 2024. This review categorizes works in text generation into five main tasks: open-ended text generation, summarization, translation, paraphrasing, and question answering. For each task, we review their relevant characteristics, sub-tasks, and specific challenges (e.g., missing datasets for multi-document summarization, coherence in story generation, and complex reasoning for question answering). Additionally, we assess current approaches for evaluating text generation systems and ascertain problems with current metrics. Our investigation shows nine prominent challenges common to all tasks and sub-tasks in recent text generation publications: bias, reasoning, hallucinations, misuse, privacy, interpretability, transparency, datasets, and computing. We provide a detailed analysis of these challenges, their potential solutions, and which gaps still require further engagement from the community. This systematic literature review targets two main audiences: early career researchers in natural language processing looking for an overview of the field and promising research directions, as well as experienced researchers seeking a detailed view of tasks, evaluation methodologies, open challenges, and recent mitigation strategies.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.212203025817871, 3.239841938018799], "openalex_id": "https://openalex.org/W4399062234", "title": "Stacking Your Transformers: A Closer Look at Model Growth for Efficient LLM Pre-Training", "authors": "Wenyu Du, Tongxu Luo, Zihan Qiu, Zeyu Huang, Yikang Shen, Reynold Cheng, Yike Guo, Jie Fu", "abstract": "LLMs are computationally expensive to pre-train due to their large scale. Model growth emerges as a promising approach by leveraging smaller models to accelerate the training of larger ones. However, the viability of these model growth methods in efficient LLM pre-training remains underexplored. This work identifies three critical $\\underline{\\textit{O}}$bstacles: ($\\textit{O}$1) lack of comprehensive evaluation, ($\\textit{O}$2) untested viability for scaling, and ($\\textit{O}$3) lack of empirical guidelines. To tackle $\\textit{O}$1, we summarize existing approaches into four atomic growth operators and systematically evaluate them in a standardized LLM pre-training setting. Our findings reveal that a depthwise stacking operator, called $G_{\\text{stack}}$, exhibits remarkable acceleration in training, leading to decreased loss and improved overall performance on eight standard NLP benchmarks compared to strong baselines. Motivated by these promising results, we conduct extensive experiments to delve deeper into $G_{\\text{stack}}$ to address $\\textit{O}$2 and $\\textit{O}$3. For $\\textit{O}$2 (untested scalability), our study shows that $G_{\\text{stack}}$ is scalable and consistently performs well, with experiments up to 7B LLMs after growth and pre-training LLMs with 750B tokens. For example, compared to a conventionally trained 7B model using 300B tokens, our $G_{\\text{stack}}$ model converges to the same loss with 194B tokens, resulting in a 54.6\\% speedup. We further address $\\textit{O}$3 (lack of empirical guidelines) by formalizing guidelines to determine growth timing and growth factor for $G_{\\text{stack}}$, making it practical in general LLM pre-training. We also provide in-depth discussions and comprehensive ablation studies of $G_{\\text{stack}}$. Our code and pre-trained model are available at https://llm-stacking.github.io.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.542204856872559, 2.0584514141082764], "openalex_id": "https://openalex.org/W4399061442", "title": "Automatic Data Curation for Self-Supervised Learning: A Clustering-Based Approach", "authors": "Huy V. Vo, Vasil Khalidov, Timoth\u00e9e Darcet, Th\u00e9o Moutakanni, Nikita Smetanin, Marc Szafraniec, Hugo Touvron, Camille Couprie, Maxime Oquab, Armand Joulin, Herv\u00e9 Je\u01f5ou, Patrick Labatut, Piotr Bojanowski", "abstract": "Self-supervised features are the cornerstone of modern machine learning systems. They are typically pre-trained on data collections whose construction and curation typically require extensive human effort. This manual process has some limitations similar to those encountered in supervised learning, e.g., the crowd-sourced selection of data is costly and time-consuming, preventing scaling the dataset size. In this work, we consider the problem of automatic curation of high-quality datasets for self-supervised pre-training. We posit that such datasets should be large, diverse and balanced, and propose a clustering-based approach for building ones satisfying all these criteria. Our method involves successive and hierarchical applications of $k$-means on a large and diverse data repository to obtain clusters that distribute uniformly among data concepts, followed by a hierarchical, balanced sampling step from these clusters. Extensive experiments on three different data domains including web-based images, satellite images and text show that features trained on our automatically curated datasets outperform those trained on uncurated data while being on par or better than ones trained on manually curated data. Code is available at https://github.com/facebookresearch/ssl-data-curation.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.08986234664917, -0.9822950959205627], "openalex_id": "https://openalex.org/W4399062229", "title": "Organic Data-Driven Approach for Turkish Grammatical Error Correction and LLMs", "authors": "As\u0131m Ersoy, Olcay Taner Y\u0131ld\u0131z", "abstract": "Grammatical Error Correction has seen significant progress with the recent advancements in deep learning. As those methods require huge amounts of data, synthetic datasets are being built to fill this gap. Unfortunately, synthetic datasets are not organic enough in some cases and even require clean data to start with. Furthermore, most of the work that has been done is focused mostly on English. In this work, we introduce a new organic data-driven approach, clean insertions, to build parallel Turkish Grammatical Error Correction datasets from any organic data, and to clean the data used for training Large Language Models. We achieve state-of-the-art results on two Turkish Grammatical Error Correction test sets out of the three publicly available ones. We also show the effectiveness of our method on the training losses of training language models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.4294416904449463, 1.8527190685272217], "openalex_id": "https://openalex.org/W4399066793", "title": "Pathological Liars: Algorithmic Knowing in the Rhetorical Ecosystem of Wallstreetbets", "authors": "Misti Yang, Zoltan P. Majdik", "abstract": "This essay demonstrates the value of using artificial intelligence (AI) technologies to address specific kinds of research questions in rhetoric. The essay builds on a study of a novel rhetorical object first observed by Yang on the Reddit subreddit r/wallstreetbets. We demonstrate how the rhetorical structure of \"pathologics\" (1) generated a kind of rhetorical authority that can be measured by higher-than-average user engagement on Reddit and (2) circulated from Reddit into more traditional legacy media. Through our research on the rhetorical circulation of pathologics, we argue that researching rhetoric with AI can center new ways of knowing about concepts relevant in rhetoric, like circulation and rhetorical ecosystems. Further, we argue that researching rhetoric with AI always also entails considering a \"rhetoric of AI,\" requiring critical attention to the platforms, infrastructures, and data sources connected to AI systems.", "venue": "Rhetoric Society Quarterly", "label": 40}, {"loc": [3.31118106842041, 1.9798749685287476], "openalex_id": "https://openalex.org/W4399154184", "title": "A Copious Void: Rhetoric as Artificial Intelligence 1.0", "authors": "Atilla Hallsby", "abstract": "Rhetoric is a trace retained in and by artificial intelligence (AI) technologies. This concept illuminates how rhetoric and AI have faced issues related to information abundance, entrenched social inequalities, discriminatory biases, and the reproduction of repressive ideologies. Drawing on their shared root terminology (stochastic/artifice), common logic (zero-agency), and similar forms of organization (trope+algorithm), this essay urges readers to consider the etymological, ontological, and formal dimensions of rhetoric as inherent features of contemporary AI.", "venue": "Rhetoric Society Quarterly", "label": 40}, {"loc": [5.217017650604248, -1.5661700963974], "openalex_id": "https://openalex.org/W4399022174", "title": "Application of an Improved Convolutional Neural Network Algorithm in Text Classification", "authors": "Jing Peng, Shuquan Huo", "abstract": "This paper proposes a text classification model based on a combination of a convolutional neural network (CNN) and a support vector machine (SVM) using Amazon review polarity, TREC, and Kaggle as experimental data. By adding an attention mechanism to simplify the parameters and using the classifier based on SVM to replace the Softmax layer, the extraction effect of feature words is improved and the problem of weak generalization ability of the CNN model is solved. Simulation experiments show that the proposed algorithm performs better in precision rate, recall rate, F1 value, and training time compared with CNN, RNN, BERT and term frequency-inverse document frequency (TF-IDF).", "venue": "Journal of Web Engineering", "label": 0}, {"loc": [9.158453941345215, 0.7986751198768616], "openalex_id": "https://openalex.org/W4399932475", "title": "Comparative Performance of Advanced NLP Models and LLMs in Multilingual Geo-Entity Detection", "authors": "Kalin Kopanov", "abstract": "The integration of advanced Natural Language Processing (NLP) methodologies\\nand Large Language Models (LLMs) has significantly enhanced the extraction and\\nanalysis of geospatial data from multilingual texts, impacting sectors such as\\nnational and international security. This paper presents a comprehensive\\nevaluation of leading NLP models -- SpaCy, XLM-RoBERTa, mLUKE, GeoLM -- and\\nLLMs, specifically OpenAI's GPT 3.5 and GPT 4, within the context of\\nmultilingual geo-entity detection. Utilizing datasets from Telegram channels in\\nEnglish, Russian, and Arabic, we examine the performance of these models\\nthrough metrics such as accuracy, precision, recall, and F1 scores, to assess\\ntheir effectiveness in accurately identifying geospatial references. The\\nanalysis exposes each model's distinct advantages and challenges, underscoring\\nthe complexities involved in achieving precise geo-entity identification across\\nvaried linguistic landscapes. The conclusions drawn from this experiment aim to\\ndirect the enhancement and creation of more advanced and inclusive NLP tools,\\nthus advancing the field of geospatial analysis and its application to global\\nsecurity.\\n", "venue": "https://doi.org/10.1145/3660853.3660878", "label": 0}, {"loc": [3.6863038539886475, 4.526213645935059], "openalex_id": "https://openalex.org/W4398809881", "title": "Worldwide Federated Training of Language Models", "authors": "Alex Iacob, Lorenzo Sani, Bill Marino, Preslav Aleksandrov, Nicholas D. Lane", "abstract": "The reliance of language model training on massive amounts of computation and vast datasets scraped from potentially low-quality, copyrighted, or sensitive data has come into question practically, legally, and ethically. Federated learning provides a plausible alternative by enabling previously untapped data to be voluntarily gathered from collaborating organizations. However, when scaled globally, federated learning requires collaboration across heterogeneous legal, security, and privacy regimes while accounting for the inherent locality of language data; this further exacerbates the established challenge of federated statistical heterogeneity. We propose a Worldwide Federated Language Model Training~(WorldLM) system based on federations of federations, where each federation has the autonomy to account for factors such as its industry, operating jurisdiction, or competitive environment. WorldLM enables such autonomy in the presence of statistical heterogeneity via partial model localization by allowing sub-federations to attentively aggregate key layers from their constituents. Furthermore, it can adaptively share information across federations via residual layer embeddings. Evaluations of language modeling on naturally heterogeneous datasets show that WorldLM outperforms standard federations by up to $1.91\\times$, approaches the personalized performance of fully local models, and maintains these advantages under privacy-enhancing techniques.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.791893005371094, 2.1975345611572266], "openalex_id": "https://openalex.org/W4398795109", "title": "Dataset Decomposition: Faster LLM Training with Variable Sequence Length Curriculum", "authors": "Hadi Pouransari, Chunliang Li, Jen-Hao Rick Chang, Pavan Kumar Anasosalu Vasu, Cem Koc, Vaishaal Shankar, Oncel Tuzel", "abstract": "Large language models (LLMs) are commonly trained on datasets consisting of fixed-length token sequences. These datasets are created by randomly concatenating documents of various lengths and then chunking them into sequences of a predetermined target length (concat-and-chunk). Recent attention implementations mask cross-document attention, reducing the effective length of a chunk of tokens. Additionally, training on long sequences becomes computationally prohibitive due to the quadratic cost of attention. In this study, we introduce dataset decomposition, a novel variable sequence length training technique, to tackle these challenges. We decompose a dataset into a union of buckets, each containing sequences of the same size extracted from a unique document. During training, we use variable sequence length and batch-size, sampling simultaneously from all buckets with a curriculum. In contrast to the concat-and-chunk baseline, which incurs a fixed attention cost at every step of training, our proposed method incurs a computational cost proportional to the actual document lengths at each step, resulting in significant savings in training time. We train an 8k context-length 1B model at the same cost as a 2k context-length model trained with the baseline approach. Experiments on a web-scale corpus demonstrate that our approach significantly enhances performance on standard language evaluations and long-context benchmarks, reaching target accuracy with up to 6x faster training compared to the baseline. Our method not only enables efficient pretraining on long sequences but also scales effectively with dataset size. Lastly, we shed light on a critical yet less studied aspect of training large language models: the distribution and curriculum of sequence lengths, which results in a non-negligible difference in performance.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.824053764343262, -0.5203787088394165], "openalex_id": "https://openalex.org/W4398886403", "title": "Smart Bilingual Focused Crawling of Parallel Documents", "authors": "Cristian Garc\u00eda-Romero, Miquel Espl\u00e0-Gomis, Felipe S\u00e1nchez-Mart\u00ednez", "abstract": "Crawling parallel texts $\\unicode{x2014}$texts that are mutual translations$\\unicode{x2014}$ from the Internet is usually done following a brute-force approach: documents are massively downloaded in an unguided process, and only a fraction of them end up leading to actual parallel content. In this work we propose a smart crawling method that guides the crawl towards finding parallel content more rapidly. Our approach builds on two different models: one that infers the language of a document from its URL, and another that infers whether a pair of URLs link to parallel documents. We evaluate both models in isolation and their integration into a crawling tool. The results demonstrate the individual effectiveness of both models and highlight that their combination enables the early discovery of parallel content during crawling, leading to a reduction in the amount of downloaded documents deemed useless, and yielding a greater quantity of parallel documents compared to conventional crawling approaches.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.612603664398193, 1.7693772315979004], "openalex_id": "https://openalex.org/W4398795636", "title": "The correlation between nativelike selection and prototypicality: a multilingual onomasiological case study using semantic embedding", "authors": "Huasheng Zhang", "abstract": "In native speakers' lexical choices, a concept can be more readily expressed by one expression over another grammatical one, a phenomenon known as nativelike selection (NLS). In previous research, arbitrary chunks such as collocations have been considered crucial for this phenomenon. However, this study examines the possibility of analyzing the semantic motivation and deducibility behind some NLSs by exploring the correlation between NLS and prototypicality, specifically the onomasiological hypothesis of Grondelaers and Geeraerts (2003, Towards a pragmatic model of cognitive onomasiology. In Hubert Cuyckens, Ren\u00e9 Dirven & John R. Taylor (eds.), Cognitive approaches to lexical semantics, 67-92. Berlin: De Gruyter Mouton). They hypothesized that \"[a] referent is more readily named by a lexical item if it is a salient member of the category denoted by that item\". To provide a preliminary investigation of this important but rarely explored phenomenon, a series of innovative methods and procedures, including the use of semantic embedding and interlingual comparisons, is designed. Specifically, potential NLSs are efficiently discovered through an automatic exploratory analysis using topic modeling techniques, and then confirmed by manual inspection through frame semantics. Finally, to account for the NLS in question, cluster analysis and behavioral profile analysis are conducted to uncover a language-specific prototype for the Chinese verb shang 'harm', providing supporting evidence for the correlation between NLS and prototypicality.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.717373371124268, 3.429248094558716], "openalex_id": "https://openalex.org/W4398774455", "title": "Efficient Training and Inference: Techniques for Large Language Models Using Llama", "authors": "Sophia R. Cunningham, Dominique Archambault, Austin Kung", "abstract": "To enhance the efficiency of language models, it would involve optimizing their training and inference processes to reduce computational demands while maintaining high performance. The research focuses on the application of model compression, quantization, and hardware acceleration techniques to the Llama model. Pruning and knowledge distillation methods effectively reduce the model size, resulting in faster training times and lower resource consumption. Quantization techniques, including 8-bit and 4-bit representations, significantly decrease memory usage and improve computational speed without substantial accuracy loss. The integration of GPUs and TPUs further accelerates the training and inference processes, demonstrating the crucial role of hardware in optimizing large-scale models. The study highlights the practical implications of those techniques, paving the way for more sustainable and scalable AI solutions.", "venue": "https://doi.org/10.36227/techrxiv.171651876.65094225/v1", "label": 0}, {"loc": [5.337046146392822, 2.143273115158081], "openalex_id": "https://openalex.org/W4398774840", "title": "From Cosine Similarity to Likelihood Ratio: Coupling Representations From Machine Learning (and Other Sources) With Cognitive Models", "authors": "Gregory E. Cox", "abstract": "Modern machine learning models yield vector representations that capture similarity relations between complex items like text and images. These representations can help explain and predict how individuals respond to those items in particular tasks, but only if representations are coupled to a cognitive model of the processes people use to perform those tasks. I introduce C2L (\"context to likelihood\"), a mathematical transformation of the similarity between vector representations, operationalized as the cosine of the angle between them, into a ratio of the relative likelihood that the two representations encode the same versus different items. The likelihood ratio operationalizes similarity in a manner that is motivated by cognitive theories of perception and memory and is readily \"plugged in\" to cognitive models. Two example applications show how C2L can be used to compute drift rates of a diffusion decision model based on similarity information derived from machine learning models, thereby accounting for the speed and accuracy with which individual participants recognize individual items. C2L enables inferences regarding how different people represent items, how much information they encode about each item, and how that information is affected by experimental manipulations. C2L serves both the practical purpose of making it easier to incorporate representations from machine learning into cognitive models and the theoretical purpose of allowing cognitive models to grant insight into how people process the increasingly complex, naturalistic items to which machine learning models are applied.", "venue": "https://doi.org/10.31234/osf.io/v7xuz", "label": 0}, {"loc": [5.483654975891113, -1.255625605583191], "openalex_id": "https://openalex.org/W4398234895", "title": "Comquest: Large Scale User Comment Crawling and Integration", "authors": "Zhijia Chen, Lihong He, Arjun Mukherjee, Eduard Dragut", "abstract": "User-generated content like comments are valuable sources for various downstream applications. However, access to user comments data is often limited to specific platforms or outlets, which imposes a great limitation on the available data, and may not provide a representative sample of opinions from a diverse population on a particular event. This paper presents a comment crawling system that leverages the Web API of popular third-party commenting systems to collect comments from a large number of websites integrated with the commenting systems. Given a target page, the crawling system utilizes a deep learning model to extract API parameters and send HTTP requests to the API to retrieve comments. The system, Comquest, that we propose to demo is news-oriented and crawls comments regarding specific news topics/stories. Comquest can work with any website that allows commenting. Comquest provides a useful tool for collecting comments that represent a wider range of opinions, stances, and sentiments from websites on a global scale.", "venue": "https://doi.org/10.1145/3626246.3654736", "label": 0}, {"loc": [6.528171539306641, 1.1961268186569214], "openalex_id": "https://openalex.org/W4398230331", "title": "Large Language Models for NLP: An In-depth Comparative Examination", "authors": "Libo Qin, Qiguang Chen, Xiachong Feng, Yang Wu, Yongheng Zhang, Yinghui Li, Min Li, Wanxiang Che, Philip S. Yu", "abstract": "While large language models (LLMs) like ChatGPT have shown impressive capabilities in Natural Language Processing (NLP) tasks, a systematic investigation of their potential in this field remains largely unexplored. This study aims to address this gap by exploring the following questions: (1) How are LLMs currently applied to NLP tasks in the literature? (2) Have traditional NLP tasks already been solved with LLMs? (3) What is the future of the LLMs for NLP? To answer these questions, we take the first step to provide a comprehensive overview of LLMs in NLP. Specifically, we first introduce a unified taxonomy including (1) parameter-frozen paradigm and (2) parameter-tuning paradigm to offer a unified perspective for understanding the current progress of LLMs in NLP. Furthermore, we summarize the new frontiers and the corresponding challenges, aiming to inspire further groundbreaking advancements. We hope this work offers valuable insights into the potential and limitations of LLMs, while also serving as a practical guide for building effective LLMs in NLP.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.032481670379639, -2.340913772583008], "openalex_id": "https://openalex.org/W4398230616", "title": "Human-Centered Natural Language Processing for Countering Misinformation", "authors": "Ashkan Kazemi", "abstract": "As curbing the spread of online misinformation has proven to be challenging, we look to artificial intelligence (AI) and natural language technology for helping individuals and society counter and limit it. Despite current advances, state-of-the-art natural language processing (NLP) and AI still struggle to automatically identify and understand misinformation. Humans exposed to harmful content may experience lasting negative consequences in real life, and it is often difficult to change one\u2019s mind once they form wrong beliefs. Addressing these interwoven technical and social challenges requires research and understanding into the core mechanisms that drive the phenomena of misinformation. This thesis introduces human-centered NLP tasks and methods that can help prioritize human welfare in countering misinformation. We present findings on the differences in how people of different backgrounds perceive misinformation, and how misinformation unfolds in different conditions such as end-to-end encrypted social media in India. We build on this understanding to create models and datasets for identifying misinformation at scale that put humans in the decision making seat, through claim matching, matching claims with fact-check reports, and query rewriting that scale the efforts of fact-checkers. Our work highlights the global impact of misinformation, and contributes to advancing the equitability of available language technologies through models and datasets in a variety of high and low resources and languages. We also make fundamental contributions to data, algorithms, and models through: multilingual and low-resource embeddings and retrieval for better claim matching, reinforcement learning for reformulating queries for better misinformation discovery, unsupervised and graph-based focused content extraction through introducing the Biased TextRank algorithm, and explanation generation through extractive (Biased TextRank) and abstractive (GPT-2) summarization. Through this thesis, we aim to promote individual and social wellbeing by creating language technologies built on a deeper understanding of misinformation, and provide tools to help journalists as well as internet users to identify and navigate around it.", "venue": "Deep Blue (University of Michigan)", "label": 23}, {"loc": [5.639227390289307, -0.12067712098360062], "openalex_id": "https://openalex.org/W4398217444", "title": "Advancing language models through domain knowledge integration: a comprehensive approach to training, evaluation, and optimization of social scientific neural \u2026", "authors": "Fabian St\u00f6hr", "abstract": "Abstract This article proposes a comprehensive strategy for training, evaluating, and optimizing domain-specific word2vec-based word embeddings, using social science literature as an example. Our primary objectives are: (1) to train the embeddings utilizing a corpus of social science text, (2) to test their performance against domain-unspecific embeddings using our developed intrinsic and extrinsic evaluation strategy, and (3) to enhance their performance even further by using domain knowledge. As an integral part of this approach, we present SociRel-461, a domain-knowledge dictionary designed for the intrinsic evaluation and subsequent refinement of social science word embeddings. Using a dataset of 100,000 full-text scientific articles in sociology, we train multiple vector space models, which we then benchmark against a larger, pre-trained general language embedding model as part of our extrinsic evaluation. Furthermore, we developed a transfer learning multi-label classification task for extrinsic evaluation. Our findings reveal that domain-specific embeddings outperform their domain-unspecific counterparts in both intrinsic and extrinsic evaluations. We also investigated the retrofitting post-processing method to enhance domain-unspecific embeddings with the domain knowledge embedded in SociRel-461. While retrofitting does not enhance our domain-specific vector space models, it significantly improves the performance of the domain-unspecific embeddings. This highlights the potential of retrofitting for the transfer of domain knowledge to domain-unspecific embeddings. Our results emphasize the importance of utilizing domain-specific word embeddings for better performance in domain specific transfer learning tasks, as they outperform conventional embeddings trained on everyday language.", "venue": "Journal of Computational Social Science", "label": 0}, {"loc": [7.53480339050293, -1.2171576023101807], "openalex_id": "https://openalex.org/W4398192222", "title": "Cyber Risks of Machine Translation Critical Errors: Arabic Mental Health Tweets as a Case Study", "authors": "Hadeel Saadany, Ashraf Tantawy, Constantin Or\u01cesan", "abstract": "With the advent of Neural Machine Translation (NMT) systems, the MT output has reached unprecedented accuracy levels which resulted in the ubiquity of MT tools on almost all online platforms with multilingual content. However, NMT systems, like other state-of-the-art AI generative systems, are prone to errors that are deemed machine hallucinations. The problem with NMT hallucinations is that they are remarkably \\textit{fluent} hallucinations. Since they are trained to produce grammatically correct utterances, NMT systems are capable of producing mistranslations that are too fluent to be recognised by both users of the MT tool, as well as by automatic quality metrics that are used to gauge their performance. In this paper, we introduce an authentic dataset of machine translation critical errors to point to the ethical and safety issues involved in the common use of MT. The dataset comprises mistranslations of Arabic mental health postings manually annotated with critical error types. We also show how the commonly used quality metrics do not penalise critical errors and highlight this as a critical issue that merits further attention from researchers.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.14462947845459, 0.06686719506978989], "openalex_id": "https://openalex.org/W4398172016", "title": "Evaluation and Adaptation of Neural Language Models for Under-Resourced Languages", "authors": "Wietse de Vries", "abstract": "Language models are now commonly used by researchers, industry, and anyone interested. However, language models of all sizes and types are primarily developed for the English language while efforts on other languages lag behind. This dissertation explores how well non-English language models perform and how to adapt models for higher resource languages to lower-resource languages. With a focus on Dutch, we show high cross-lingual performance. Moreover, we find that language models can be adapted to other higher-resource languages (Dutch and Italian) or to low-resource languages (Gronings and Frisian) with minimal extra training. Finally, we compare how language similarity affects cross-lingual performance and find previously found low performance can be caused by the use of English as a source language.", "venue": "https://doi.org/10.33612/diss.993731018", "label": 0}, {"loc": [9.19002914428711, -0.8224837183952332], "openalex_id": "https://openalex.org/W4398192054", "title": "\" Previously on...\" From Recaps to Story Summarization", "authors": "Aditya Kumar Singh, Dhruv Srivastava, Makarand Tapaswi", "abstract": "We introduce multimodal story summarization by leveraging TV episode recaps - short video sequences interweaving key story moments from previous episodes to bring viewers up to speed. We propose PlotSnap, a dataset featuring two crime thriller TV shows with rich recaps and long episodes of 40 minutes. Story summarization labels are unlocked by matching recap shots to corresponding sub-stories in the episode. We propose a hierarchical model TaleSumm that processes entire episodes by creating compact shot and dialog representations, and predicts importance scores for each video shot and dialog utterance by enabling interactions between local story groups. Unlike traditional summarization, our method extracts multiple plot points from long videos. We present a thorough evaluation on story summarization, including promising cross-series generalization. TaleSumm also shows good results on classic video summarization benchmarks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.599195957183838, 1.9093952178955078], "openalex_id": "https://openalex.org/W4398192697", "title": "Linguistic Structure from a Bottleneck on Sequential Information Processing", "authors": "Richard Futrell, Michael Hahn", "abstract": "Human language is a unique form of communication in the natural world, distinguished by its structured nature. Most fundamentally, it is systematic, meaning that signals can be broken down into component parts that are individually meaningful -- roughly, words -- which are combined in a regular way to form sentences. Furthermore, the way in which these parts are combined maintains a kind of locality: words are usually concatenated together, and they form contiguous phrases, keeping related parts of sentences close to each other. We address the challenge of understanding how these basic properties of language arise from broader principles of efficient communication under information processing constraints. Here we show that natural-language-like systematicity arises in codes that are constrained by predictive information, a measure of the amount of information that must be extracted from the past of a sequence in order to predict its future. In simulations, we show that such codes approximately factorize their source distributions, and then express the resulting factors systematically and locally. Next, in a series of cross-linguistic corpus studies, we show that human languages are structured to have low predictive information at the levels of phonology, morphology, syntax, and semantics. Our result suggests that human language performs a sequential, discrete form of Independent Components Analysis on the statistical distribution over meanings that need to be expressed. It establishes a link between the statistical and algebraic structure of human language, and reinforces the idea that the structure of human language is shaped by communication under cognitive constraints.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.237724781036377, 1.0444940328598022], "openalex_id": "https://openalex.org/W4398191594", "title": "CC-GPX: Extracting High-Quality Annotated Geospatial Data from Common Crawl", "authors": "Ilya Ilyankou, James Haworth, Stefano Cavazzi", "abstract": "The Common Crawl (CC) corpus is the largest open web crawl dataset containing 9.5+ petabytes of data captured since 2008. The dataset is instrumental in training large language models, and as such it has been studied for (un)desirable content, and distilled for smaller, domain-specific datasets. However, to our knowledge, no research has been dedicated to using CC as a source of annotated geospatial data. In this paper, we introduce an efficient pipeline to extract annotated user-generated tracks from GPX files found in CC, and the resulting multimodal dataset with 1,416 pairings of human-written descriptions and MultiLineString vector data from the 6 most recent CC releases. The dataset can be used to study people's outdoor activity patterns, the way people talk about their outdoor experiences, as well as for developing trajectory generation or track annotation models, or for various other problems in place of synthetically generated routes. Our reproducible code is available on GitHub: https://github.com/ilyankou/cc-gpx", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.201488971710205, 4.081395626068115], "openalex_id": "https://openalex.org/W4399852654", "title": "Evaluating the impact of design decisions on passive DNS-based domain rankings", "authors": "Victor Le Pochat, Simon Fernandez, Tom Van Goethem, Samaneh Tajalizadehkhoob, Lieven Desmet, Andrzej Duda, Wouter Joosen, Maciej Korczy\u0144ski", "abstract": "International audience", "venue": "https://doi.org/10.23919/tma62044.2024.10559182", "label": 0}, {"loc": [3.7183725833892822, 4.552619934082031], "openalex_id": "https://openalex.org/W4401508395", "title": "Towards Federated Large Language Models: Motivations, Methods, and Future Directions", "authors": "Ningxin Su, Chenghao Hu, Baochun Li, Bo Li", "abstract": "With the recent surge of research interests in Large Language Models (LLMs), a natural question that arises is how pre-trained LLMs can be fine-tuned to tailor to specific needs of enterprises and individual users, while preserving the privacy of data used in the fine-tuning process. On the one hand, sending private data to cloud datacenters for fine-tuning is, without a doubt, unacceptable from a privacy perspective. On the other hand, conventional federated learning requires each client to perform local training, which is not feasible for LLMs with respect to both computation costs and communication overhead, since they involve billions of model parameters. In this paper, we present Titanic, a new distributed training paradigm that allows LLMs to be fine-tuned in a privacy-preserving fashion directly on the client devices where private data is produced, while operating within the resource constraints on computation and communication bandwidth. Titanic first optimally selects a subset of clients with an efficient solution to an integer optimization problem, then partitions an LLM across multiple client devices, and finally fine-tunes the model with no or minimal losses in training performance. A primary focus in the design of Titanic is its feasibility in real-world systems: it is first and foremost designed for production-quality systems, featuring a model-agnostic partitioning mechanism that is fully automated. Our experimental results show that Titanic achieves superior training performance as compared to conventional federated learning, while preserving data privacy and satisfying all constraints on local computation and bandwidth resources.", "venue": "https://doi.org/10.1109/infocom52122.2024.10621164", "label": 0}, {"loc": [6.2935075759887695, 5.443425178527832], "openalex_id": "https://openalex.org/W4398184769", "title": "Fusing Domain-Specific Content from Large Language Models into Knowledge Graphs for Enhanced Zero Shot Object State Classification", "authors": "Filippos Gouidis, Katerina Papantoniou, Konstantinos Papoutsakis, Theodore Patkos, Antonis Argyros, Dimitris Plexousakis", "abstract": "Domain-specific knowledge can significantly contribute to addressing a wide variety of vision tasks. However, the generation of such knowledge entails considerable human labor and time costs. This study investigates the potential of Large Language Models (LLMs) in generating and providing domain-specific information through semantic embeddings. To achieve this, an LLM is integrated into a pipeline that utilizes Knowledge Graphs and pre-trained semantic vectors in the context of the Vision-based Zero-shot Object State Classification task. We thoroughly examine the behavior of the LLM through an extensive ablation study. Our findings reveal that the integration of LLM-based embeddings, in combination with general-purpose pre-trained embeddings, leads to substantial performance improvements. Drawing insights from this ablation study, we conduct a comparative analysis against competing models, thereby highlighting the state-of-the-art performance achieved by the proposed approach.", "venue": "Proceedings of the AAAI Symposium Series", "label": 0}, {"loc": [6.824511528015137, 0.27878421545028687], "openalex_id": "https://openalex.org/W4398157615", "title": "A Survey on Large Language Models with Multilingualism: Recent Advances and New Frontiers", "authors": "Kaiyu Huang, Fengran Mo, Hongliang Li, You Li, Yuanchi Zhang, Wei-Jian Yi, Yulong Mao, Jinchen Liu, Yuzhuang Xu, Jinan Xu, Jian\u2010Yun Nie, Yang Liu", "abstract": "The rapid development of Large Language Models (LLMs) demonstrates remarkable multilingual capabilities in natural language processing, attracting global attention in both academia and industry. To mitigate potential discrimination and enhance the overall usability and accessibility for diverse language user groups, it is important for the development of language-fair technology. Despite the breakthroughs of LLMs, the investigation into the multilingual scenario remains insufficient, where a comprehensive survey to summarize recent approaches, developments, limitations, and potential solutions is desirable. To this end, we provide a survey with multiple perspectives on the utilization of LLMs in the multilingual scenario. We first rethink the transitions between previous and current research on pre-trained language models. Then we introduce several perspectives on the multilingualism of LLMs, including training and inference methods, information retrieval, model security, multi-domain with language culture, and usage of datasets. We also discuss the major challenges that arise in these aspects, along with possible solutions. Besides, we highlight future research directions that aim at further enhancing LLMs with multilingualism. The survey aims to help the research community address multilingual problems and provide a comprehensive understanding of the core concepts, key techniques, and latest developments in multilingual natural language processing based on LLMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.846565246582031, -0.9194450974464417], "openalex_id": "https://openalex.org/W4395959283", "title": "INCLURE: a Dataset and Toolkit for Inclusive French Translation", "authors": "Paul J. Lerner, Cyril Grouin", "abstract": "International audience", "venue": "HAL (Le Centre pour la Communication Scientifique Directe)", "label": 6}, {"loc": [5.9249444007873535, -0.6272103190422058], "openalex_id": "https://openalex.org/W4399448831", "title": "Zero-shot learning for multilingual discourse relation classification", "authors": "Eleni Metheniti, Philippe M\u00fcller, Chlo\u00e9 Braud, Margarita Hern\u00e1ndez-Casas", "abstract": "International audience", "venue": "HAL (Le Centre pour la Communication Scientifique Directe)", "label": 6}, {"loc": [4.813244342803955, 0.7056425213813782], "openalex_id": "https://openalex.org/W4398145463", "title": "When Young Scholars Cooperate with LLMs in Academic Tasks: The Influence of Individual Differences and Task Complexities", "authors": "Jiyao Wang, Chunxi Huang, Song Yan, Weiyin Xie, Dengbo He", "abstract": "As a novel AI-powered conversational system, large language models (LLMs) have the potential to be used in various applications. Recent advances in LLMs like ChatGPT have made LLM-based academic tools possible. However, most of the existing studies on the adoption of LLM for academic tasks were based on theoretical or qualitative analyses, which failed to provide empirical evidence on the effects of LLMs on users' behaviors. Additionally, although previous work has investigated users' acceptance of conventional conversational systems, little is known about how scholars evaluate LLMs when they are used for academic tasks. Hence, we conducted an empirical field experiment to assess the performance of 48 early-stage scholars on two core academic activities (paper reading and literature reviews) under varying time constraints. Prior to the tasks, participants underwent different training programs about LLM capabilities and limitations. Then, we built a hierarchy dependency network using the Bayesian network. Statistical regression analyses were further conducted to quantify relationships among influential factors of task performance and users' attitudes toward the LLMs. It was found that young scholars have upheld relatively high academic integrity when using LLMs for academic tasks, and user-LLM performance varied with the task type and time pressure but not with the type of training we used. Further, scholars' traits can also affect their performance in academic tasks and attitudes towards the LLMs. This work can inspire the future development of LLM-related user training and guide the optimization of LLMs.", "venue": "International Journal of Human-Computer Interaction", "label": 31}, {"loc": [8.390157699584961, 1.9844614267349243], "openalex_id": "https://openalex.org/W4397049132", "title": "IGOT: Information Gain Optimized Tokenizer on Domain Adaptive Pretraining", "authors": "Dawei Feng, Yihai Zhang, Zhixuan Xu", "abstract": "Pretrained Large Language Models (LLM) such as ChatGPT, Claude, etc. have demonstrated strong capabilities in various fields of natural language generation. However, there are still many problems when using LLM in specialized domain-specific fields. When using generative AI to process downstream tasks, a common approach is to add new knowledge (e.g., private domain knowledge, cutting-edge information) to a pretrained model through continued training or fine-tuning. However, whether there is a universal paradigm for domain adaptation training is still an open question. In this article, we proposed Information Gain Optimized Tokenizer (IGOT), which analyzes the special token set of downstream tasks, constructs a new subset using heuristic function $\u03d5$ with the special token and its information gain, to build new domain-specific tokenizer, and continues pretraining on the downstream task data. We explored the many positive effects of this method's customized tokenizer on domain-adaptive pretraining and verified this method can perform better than the ordinary method of just collecting data and fine-tuning. Based on our experiment, the continued pretraining process of IGOT with LLaMA-7B achieved 11.9\\% token saving, 12.2\\% training time saving, and 5.8\\% maximum GPU VRAM usage saving, combined with the T5 model, we can even reach a 31.5\\% of training time saving, making porting general generative AI to specific domains more effective than before. In domain-specific tasks, supervised $IGOT_\u03c4$ shows great performance on reducing both the convergence radius and convergence point during keep pretraining.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.628417730331421, 3.800044298171997], "openalex_id": "https://openalex.org/W4398138422", "title": "Application of Open-source Large Language Model (LLM) for Simulation of a Vulnerable IoT System and Cybersecurity Best Practices Assistance", "authors": "Veneta Yosifova", "abstract": "This paper explores the role of open-source large language models in IoT cybersecurity world. The threats of malicious activity on the Internet and the loss of private information are very real and lead to serious consequences. The purpose of this paper is to investigate how open source-large language models can help to defend against the growing threat of cyber-crimes. We conducted our experiments in two directions. The first one is a security assistant that helps with cybersecurity best practices advices. The second one is a how large language model can simulate a vulnerable IoT system. For both types of experiments, the interactive mode of operation of the language model is used. In the context of the cybersecurity research, a major advantage of the locally installed open-sourced large language models is that they do not share sensitive data with a remote system in a cloud. The paper concludes by discussing the potential impact of open-source large language models on cybersecurity research and recommends future research directions.", "venue": "Preprints.org", "label": 3}, {"loc": [2.949003219604492, -0.7382234334945679], "openalex_id": "https://openalex.org/W4397012222", "title": "Augmenting a Spanish clinical dataset for transformer-based linking of negations and their out-of-scope references", "authors": "Antonio Tamayo, Diego A. Burgos, Alexander Gelbukh", "abstract": "Abstract A negated statement consists of three main components: the negation cue, the negation scope, and the negation reference. The negation cue is the indicator of negation, while the negation scope defines the extent of the negation. The negation reference, which may or may not be within the negation scope, is the part of the statement being negated. Although there has been considerable research on the negation cue and scope, little attention has been given to identifying negation references outside the scope, even though they make up almost half of all negations. In this study, our goal is to identify out-of-scope references (OSRs) to restore the meaning of truncated negated statements identified by negation detection systems. To achieve this, we augment the largest available Spanish clinical dataset by adding annotations for OSRs. Additionally, we fine-tune five robust BERT-based models using transfer learning to address negation detection, uncertainty detection, and OSR identification and linking with their respective negation scopes. Our best model achieves state-of-the-art performance in negation detection while also establishing a competitive baseline for OSR identification (Macro F1 = 0.56) and linking (Macro F1 = 0.86). We support these findings with relevant statistics from the newly annotated dataset and an extensive review of existing literature.", "venue": "Natural language processing.", "label": 15}, {"loc": [4.59160852432251, 0.3064480125904083], "openalex_id": "https://openalex.org/W4397023256", "title": "Chatbots in Airport Customer Service\u2014Exploring Use Cases and Technology Acceptance", "authors": "Isabel Auer, Stephan Schl\u00f6gl, Gundula Glowka", "abstract": "Throughout the last decade, chatbots have gained widespread adoption across various industries, including healthcare, education, business, e-commerce, and entertainment. These types of artificial, usually cloud-based, agents have also been used in airport customer service, although there has been limited research concerning travelers\u2019 perspectives on this rather techno-centric approach to handling inquiries. Consequently, the goal of the presented study was to tackle this research gap and explore potential use cases for chatbots at airports, as well as investigate travelers\u2019 acceptance of said technology. We employed an extended version of the Technology Acceptance Model considering Perceived Usefulness, Perceived Ease of Use, Trust, and Perceived Enjoyment as predictors of Behavioral Intention, with Affinity for Technology as a potential moderator. A total of n=191 travelers completed our survey. The results show that Perceived Usefulness, Trust, Perceived Ease of Use, and Perceived Enjoyment positively correlate with the Behavioral Intention to use a chatbot for airport customer service inquiries, with Perceived Usefulness showing the highest impact. Travelers\u2019 Affinity for Technology, on the other hand, does not seem to have any significant effect.", "venue": "Future Internet", "label": 30}, {"loc": [3.0922553539276123, 1.1630825996398926], "openalex_id": "https://openalex.org/W4396980323", "title": "Outsmarting Artificial Intelligence in the Classroom\u2014Incorporating Large Language Model-Based Chatbots into Teaching", "authors": "Juliane Wutzler", "abstract": "ABSTRACT Since the release of ChatGPT in November 2022, large language model-based chatbots have attracted much attention. Although businesses value their potential for efficiency gains, academics are concerned about their effects on learning and assessments. This Case enables instructors to integrate large language model-based chatbots into the curriculum. Students assume the role of a professional accountant who retrieves an artificial intelligence-written text that has to be evaluated and improved before presenting it to a client. The analysis and writing activity help students gain domain-specific accounting knowledge and improve their writing skills. Students also improve their computer literacy as they learn to engage with artificial intelligence effectively, assess generated output, and evaluate output quality. These learnings enable students to judge in which business scenarios chatbots are beneficial and when original human contributions will likely remain superior. Finally, this activity expands other cognitive skills, such as critical thinking and judgment. JEL Classifications: A22; A23; M40; M41; M49.", "venue": "Issues in Accounting Education", "label": 0}, {"loc": [7.758040904998779, -0.7244312763214111], "openalex_id": "https://openalex.org/W4396986801", "title": "A Japanese-Chinese Parallel Corpus Using Crowdsourcing for Web Mining", "authors": "Masaaki Nagata, Makoto Morishita, Katsuki Chousa, Norihito Yasuda", "abstract": "Using crowdsourcing, we collected more than 10,000 URL pairs (parallel top page pairs) of bilingual websites that contain parallel documents and created a Japanese-Chinese parallel corpus of 4.6M sentence pairs from these websites. We used a Japanese-Chinese bilingual dictionary of 160K word pairs for document and sentence alignment. We then used high-quality 1.2M Japanese-Chinese sentence pairs to train a parallel corpus filter based on statistical language models and word translation probabilities. We compared the translation accuracy of the model trained on these 4.6M sentence pairs with that of the model trained on Japanese-Chinese sentence pairs from CCMatrix (12.4M), a parallel corpus from global web mining. Although our corpus is only one-third the size of CCMatrix, we found that the accuracy of the two models was comparable and confirmed that it is feasible to use crowdsourcing for web mining of parallel data.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.449658393859863, 1.1423370838165283], "openalex_id": "https://openalex.org/W4396952095", "title": "XLORE 3: A Large-scale Multilingual Knowledge Graph from Heterogeneous Wiki Knowledge Resources", "authors": "Kaisheng Zeng, Hailong Jin, Xin Lv, Fangwei Zhu, Lei Hou, Yi Zhang, Fan Pang, Yu Qi, Dingxiao Liu, Juanzi Li, Ling Feng", "abstract": "In recent years, knowledge graph (KG) has attracted significant attention from academia and industry, resulting in the development of numerous technologies for KG construction, completion, and application. XLORE is one of the largest multilingual KGs built from Baidu Baike and Wikipedia via a series of knowledge modeling and acquisition methods. In this article, we utilize systematic methods to improve XLORE's data quality and present its latest version, XLORE 3, which enables the effective integration and management of heterogeneous knowledge from diverse resources. Compared with previous versions, XLORE 3 has three major advantages: (1) We design a comprehensive and reasonable schema, namely XLORE ontology, which can effectively organize and manage entities from various resources. (2) We merge equivalent entities in different languages to facilitate knowledge sharing. We provide a large-scale entity linking system to establish the associations between unstructured text and structured KG. (3) We design a multi-strategy knowledge completion framework, which leverages pre-trained language models and vast amounts of unstructured text to discover missing and new facts. The resulting KG contains 446 concepts, 2,608 properties, 66 million entities, and more than 2 billion facts. It is available and downloadable online at https://www.xlore.cn/, providing a valuable resource for researchers and practitioners in various fields.", "venue": "ACM Transactions on Information Systems", "label": 0}, {"loc": [5.856184005737305, 5.488453388214111], "openalex_id": "https://openalex.org/W4396945437", "title": "Compositional Text-to-Image Generation with Dense Blob Representations", "authors": "Weili Nie, Sifei Liu, Morteza Mardani, Chao Liu, Benjamin Eckart, Arash Vahdat", "abstract": "Existing text-to-image models struggle to follow complex text prompts, raising the need for extra grounding inputs for better controllability. In this work, we propose to decompose a scene into visual primitives - denoted as dense blob representations - that contain fine-grained details of the scene while being modular, human-interpretable, and easy-to-construct. Based on blob representations, we develop a blob-grounded text-to-image diffusion model, termed BlobGEN, for compositional generation. Particularly, we introduce a new masked cross-attention module to disentangle the fusion between blob representations and visual features. To leverage the compositionality of large language models (LLMs), we introduce a new in-context learning approach to generate blob representations from text prompts. Our extensive experiments show that BlobGEN achieves superior zero-shot generation quality and better layout-guided controllability on MS-COCO. When augmented by LLMs, our method exhibits superior numerical and spatial correctness on compositional image generation benchmarks. Project page: https://blobgen-2d.github.io.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.042080879211426, 0.024515310302376747], "openalex_id": "https://openalex.org/W4396918726", "title": "Comparing generative and retrieval-based chatbots in answering patient questions regarding age-related macular degeneration and diabetic retinopathy", "authors": "Kai Xiong Cheong, C F Zhang, Tien\u2010En Tan, Beau J. Fenner, Wendy Wong, Kelvin Yi Chong Teo, Ya Xing Wang, Sobha Sivaprasad, Pearse A. Keane, Cecilia S. Lee, Aaron Lee, Chui Ming Gemmy Cheung, Tien Yin Wong, Yun-Gyung Cheong, Su Jeong Song, Yih\u2010Chung Tham", "abstract": "Background/aims To compare the performance of generative versus retrieval-based chatbots in answering patient inquiries regarding age-related macular degeneration (AMD) and diabetic retinopathy (DR). Methods We evaluated four chatbots: generative models (ChatGPT-4, ChatGPT-3.5 and Google Bard) and a retrieval-based model (OcularBERT) in a cross-sectional study. Their response accuracy to 45 questions (15 AMD, 15 DR and 15 others) was evaluated and compared. Three masked retinal specialists graded the responses using a three-point Likert scale: either 2 (good, error-free), 1 (borderline) or 0 (poor with significant inaccuracies). The scores were aggregated, ranging from 0 to 6. Based on majority consensus among the graders, the responses were also classified as \u2018Good\u2019, \u2018Borderline\u2019 or \u2018Poor\u2019 quality. Results Overall, ChatGPT-4 and ChatGPT-3.5 outperformed the other chatbots, both achieving median scores (IQR) of 6 (1), compared with 4.5 (2) in Google Bard, and 2 (1) in OcularBERT (all p \u22648.4\u00d710 \u22123). Based on the consensus approach, 83.3% of ChatGPT-4\u2019s responses and 86.7% of ChatGPT-3.5\u2019s were rated as \u2018Good\u2019, surpassing Google Bard (50%) and OcularBERT (10%) (all p \u22641.4\u00d710 \u22122). ChatGPT-4 and ChatGPT-3.5 had no \u2018Poor\u2019 rated responses. Google Bard produced 6.7% Poor responses, and OcularBERT produced 20%. Across question types, ChatGPT-4 outperformed Google Bard only for AMD, and ChatGPT-3.5 outperformed Google Bard for DR and others. Conclusion ChatGPT-4 and ChatGPT-3.5 demonstrated superior performance, followed by Google Bard and OcularBERT. Generative chatbots are potentially capable of answering domain-specific questions outside their original training. Further validation studies are still required prior to real-world implementation.", "venue": "British Journal of Ophthalmology", "label": 0}, {"loc": [7.6574506759643555, 1.5731991529464722], "openalex_id": "https://openalex.org/W4400908709", "title": "Comparison of Machine Learning Algorithms and Large Language Models for Product Categorization", "authors": "Abdullah \u0130hsano\u011flu, Mounes Zaval, Olcay Taner Y\u0131ld\u0131z", "abstract": "This study explores the efficacy of traditional machine learning algorithms and Large Language Models (LLMs) in automating product categorization for online e-commerce platforms. By comparing these methodologies, we assess their performance in classifying a diverse range of product listings. Our findings indicate that for this context, LLMs offer similar performance in understanding and categorizing complex textual data to traditional machine learning techniques, suggesting that use of LLMs in this context may be unnecessary, and that the trade-off ultimately comes down to the operational costs and resource consumption of each model. This work contributes to the field by providing insights into the capabilities and limitations of current text categorization techniques in the context of rapidly expanding online marketplaces.", "venue": "https://doi.org/10.1109/siu61531.2024.10600809", "label": 0}, {"loc": [5.775761604309082, 3.2492494583129883], "openalex_id": "https://openalex.org/W4396897829", "title": "OptiComm-GPT: a GPT-based versatile research assistant for optical fiber communication systems", "authors": "Xiaotian Jiang, Min Zhang, Yuchen Song, Yao Zhang, Yidi Wang, Cheng Ju, Danshi Wang", "abstract": "With the increasing capacity and complexity of optical fiber communication systems, both academic and industrial requirements for the essential tasks of transmission systems simulation, digital signal processing (DSP) algorithms verification, system performance evaluation, and quality of transmission (QoT) optimization are becoming significantly important. However, due to the intricate and nonlinear nature of optical fiber communication systems, these tasks are generally implemented in a divide-and-conquer manner, which necessitates a profound level of expertise and proficiency in software programming from researchers or engineers. To lower this threshold and facilitate professional research easy-to-start, a GPT-based versatile research assistant named OptiComm-GPT is proposed for optical fiber communication systems, which flexibly and automatically performs system simulation, DSP algorithms verification, performance evaluation, and QoT optimization with only natural language. To enhance OptiComm-GPT\u2019s abilities for complex tasks in optical fiber communications and improve the accuracy of generated results, a domain information base containing rich domain knowledge, tools, and data as well as the comprehensive prompt engineering with well-crafted prompt elements, techniques, and examples is established and performs under a LangChain-based framework. The performance of OptiComm-GPT is evaluated in multiple simulation, verification, evaluation, and optimization tasks, and the generated results show that OptiComm-GPT can effectively comprehend the user\u2019s intent, accurately extract system parameters from the user\u2019s request, and intelligently invoke domain resources to solve these complex tasks simultaneously. Moreover, the statistical results, typical errors, and running time of OptiComm-GPT are also investigated to illustrate its practical reliability, potential limitations, and further improvements.", "venue": "Optics Express", "label": 0}, {"loc": [7.017129421234131, -0.10593783110380173], "openalex_id": "https://openalex.org/W4403423222", "title": "Semantically Enriched Cross-Lingual Sentence Embeddings for Crisis-related Social Media Texts", "authors": "Rabindra Lamsal, Maria A. Rodriguez, Shanika Karunasekera", "abstract": "Tasks such as semantic search and clustering on crisis-related social media texts enhance our comprehension of crisis discourse, aiding decision-making and targeted interventions. Pre-trained language models have advanced performance in crisis informatics, but their contextual embeddings lack semantic meaningfulness. Although the CrisisTransformers family includes a sentence encoder to address the semanticity issue, it remains monolingual, processing only English texts. Furthermore, employing separate models for different languages leads to embeddings in distinct vector spaces, introducing challenges when comparing semantic similarities between multi-lingual texts. Therefore, we propose multi-lingual sentence encoders (CT-XLMR-SE and CT-mBERT-SE) that embed crisis-related social media texts for over 50 languages, such that texts with similar meanings are in close proximity within the same vector space, irrespective of language diversity. Results in sentence encoding and sentence matching tasks are promising, suggesting these models could serve as robust baselines when embedding multi-lingual crisis-related social media texts. The models are publicly available at: https://huggingface.co/crisistransformers.", "venue": "Proceedings of the ... International ISCRAM Conference", "label": 0}, {"loc": [7.038427829742432, 2.551816940307617], "openalex_id": "https://openalex.org/W4396913116", "title": "Automating Code Adaptation for MLOps--A Benchmarking Study on LLMs", "authors": "Harsh Patel, Buvaneswari A. Ramanan, Manzoor Ahmed Khan, Thomas G. Williams, Brian Friedman, Lawrence Drabeck", "abstract": "This paper explores the possibilities of the current generation of Large Language Models for incorporating Machine Learning Operations (MLOps) functionalities into ML training code bases. We evaluate the performance of OpenAI (gpt-3.5-turbo) and WizardCoder (open-source, 15B parameters) models on the automated accomplishment of various MLOps functionalities in different settings. We perform a benchmarking study that assesses the ability of these models to: (1) adapt existing code samples (Inlining) with component-specific MLOps functionality such as MLflow and Weights & Biases for experiment tracking, Optuna for hyperparameter optimization etc., and (2) perform the task of Translation from one component of an MLOps functionality to another, e.g., translating existing GitPython library based version control code to Data Version Control library based. We also propose three different approaches that involve teaching LLMs to comprehend the API documentation of the components as a reference while accomplishing the Translation tasks. In our evaluations, the gpt-3.5-turbo model significantly outperforms WizardCoder by achieving impressive Pass@3 accuracy in model optimization (55% compared to 0% by WizardCoder), experiment tracking (100%, compared to 62.5% by WizardCoder), model registration (92% compared to 42% by WizardCoder) and hyperparameter optimization (83% compared to 58% by WizardCoder) on average, in their best possible settings, showcasing its superior code adaptability performance in complex MLOps tasks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.8556597232818604, -3.843273639678955], "openalex_id": "https://openalex.org/W4399443478", "title": "Aggression and Misogyny in Hindi and Bangla: A Study of YouTube Comments", "authors": "Ritesh Kumar, Bornini Lahiri", "abstract": "Based on a corpus of approximately 250,000 comments on YouTube videos in Hindi and Bangla, annotated using a combination of manual and automatic methods, this study explores the prevalence and extent of aggression and misogyny as well as the interrelationship between the two in two Indian languages\u2014Hindi and Bangla. We explore the impact of the theme and topic of the videos themselves and the responsiveness of the target on the prevalence of aggression and misogyny and their interrelationship. Through a series of quantitative studies, we demonstrate the cross-linguistic differences between the two languages and the impact of the theme and topics of the video in deciding the prevalence of aggression and misogyny in the YouTube comments.", "venue": "https://doi.org/10.1163/9789004694453_007", "label": 0}, {"loc": [3.8324899673461914, 1.1352211236953735], "openalex_id": "https://openalex.org/W4396892933", "title": "A Balancing Act: Data Protection Compliance of Artificial Intelligence", "authors": "Marvin Bartels", "abstract": "Abstract Neural network based generative artificial intelligence (GenAI) systems, in particular large language models (LLMs), hold huge potential for economic, scientific, and artistic progress. However, they are also the subject of ongoing fierce data protection debates. Some even doubt whether GenAI systems can, at least in principle, comply with the General Data Protection Regulation (EU) 2016/679 (GDPR). This issue was highlighted by a temporary ban of ChatGPT by the Italian data protection authority in March 2023. To answer the question of possible compliance with the GDPR, this paper will first illustrate relevant technical aspects and then examine the extent to which GenAI systems process personal data. A distinction will be made between training, use, and the system itself as stored on a hard drive. On this basis, this paper addresses the key question of the lawfulness of the respective processing activities, bearing in mind that each specific use case of any GenAI system requires a specific data protection assessment. While the issues discussed in this paper are relevant to GenAI systems in general, they are particularly relevant to LLMs trained on large amounts of text-based personal data. As one result, the analysis identifies serious challenges in the application of the GDPR to LLMs. However, with no significant updates to the relevant GDPR provisions in sight, this paper aims to show how pragmatic solutions are both necessary and largely possible even under the current legal framework. This includes practical guidance for developers and users on how to ensure compliance.", "venue": "GRUR International", "label": 0}, {"loc": [3.8169963359832764, -3.901665210723877], "openalex_id": "https://openalex.org/W4396903880", "title": "Abusive Language Detection in Khasi Social Media Comments", "authors": "Arup Baruah, Lakhamti Wahlang, Firstbornson Jyrwa, Floriginia Shadap, Ferdous Ahmed Barbhuiya, Kuntal Dey", "abstract": "This paper describes the work performed for automated abusive language detection in the Khasi language, a low-resource language spoken primarily in the state of Meghalaya, India. A dataset named Khasi Abusive Language Dataset (KALD) was created which consists of 4,573 human-annotated Khasi YouTube and Facebook comments. A corpus of Khasi text was built and it was used to create Khasi word2vec and fastText word embeddings. Deep learning, traditional machine learning, and ensemble models were used in the study. Experiments were performed using word2vec, fastText, and topic vectors obtained using LDA. Experiments were also performed to check if zero-shot cross-lingual nature of language models such as LaBSE and LASER can be utilized for abusive language detection in the Khasi language. The best F1 score of 0.90725 was obtained by an XGBoost classifier. After feature selection and rebalancing of the dataset, F1 score of 0.91828 and 0.91945 were obtained by an SVM based classifiers.", "venue": "ACM Transactions on Asian and Low-Resource Language Information Processing", "label": 22}, {"loc": [3.1051108837127686, 1.7568742036819458], "openalex_id": "https://openalex.org/W4396892628", "title": "Smart Automation Using LLM", "authors": "Mathyas Giudici, Luca Padalino, Giovanni Paolino, Ilaria Paratici, Alexandru Ionut Pascu, Franca Garzotto", "abstract": "Without any more delay, individuals are urged to adopt more sustainable behaviors to fight climate change. New digital systems mixed with engaging and gamification mechanisms could play an important role in achieving such an objective. In particular, Conversational Agents, like Smart Home Assistants, are a promising tool that encourage sustainable behaviors within household settings. In recent years, large language models (LLMs) have shown great potential in enhancing the capabilities of such assistants, making them more effective in interacting with users. We present the design and implementation of GreenIFTTT, an application empowered by GPT4 to create and control home automation routines. The agent helps users understand which energy consumption optimization routines could be created and applied to make their home appliances more environmentally sustainable. We performed an exploratory study (Italy, December 2023) with N = 13 participants to test our application\u2019s usability and UX. The results suggest that GreenIFTTT is a usable, engaging, easy, and supportive tool, providing insight into new perspectives and usage of LLMs to create more environmentally sustainable home automation.", "venue": "Designs", "label": 0}, {"loc": [5.726741790771484, 5.5357770919799805], "openalex_id": "https://openalex.org/W4396877870", "title": "Cross-Lingual Cross-Modal Retrieval with Noise-Robust Fine-Tuning", "authors": "Rui Cai, Jianfeng Dong, Tianxiang Liang, Yonghui Liang, Yabing Wang, Xun Yang, Xun Wang, Meng Wang", "abstract": "Cross-lingual cross-modal retrieval aims at leveraging human-labeled annotations in a source language to construct cross-modal retrieval models for a new target language, due to the lack of manually-annotated dataset in low-resource languages (target languages). Contrary to the growing developments in the field of monolingual cross-modal retrieval, there has been less research focusing on cross-modal retrieval in the cross-lingual scenario. A straightforward method to obtain target-language labeled data is translating source-language datasets utilizing Machine Translations (MT). However, as MT is not perfect, it tends to introduce noise during translation, rendering textual embeddings corrupted and thereby compromising the retrieval performance. To alleviate this, we propose Noise-Robust Fine-tuning (NRF) which tries to extract clean textual information from a possibly noisy target-language input with the guidance of its source-language counterpart. Besides, contrastive learning involving different modalities are performed to strengthen the noise-robustness of our model. Different from traditional cross-modal retrieval methods which only employ image/video-text paired data for fine-tuning, in NRF, selected parallel data plays a key role in improving the noise-filtering ability of our model. Extensive experiments are conducted on three video-text and image-text retrieval benchmarks across different target languages, and the results demonstrate that our method significantly improves the overall performance without using any image/video-text paired data on target languages.", "venue": "IEEE Transactions on Knowledge and Data Engineering", "label": 0}, {"loc": [4.195341110229492, 2.9811513423919678], "openalex_id": "https://openalex.org/W4399729086", "title": "Inspecting and Measuring Fairness of unlabeled Image Datasets", "authors": "Rebekka G\u00f6rge, Michael M\u00f6ck, Maram Akila", "abstract": "191", "venue": "https://doi.org/10.1109/icdew61823.2024.00031", "label": 0}, {"loc": [9.012024879455566, 1.410739779472351], "openalex_id": "https://openalex.org/W4392819914", "title": "WebGraph: The Next Generation (Is in Rust)", "authors": "Tommaso Fontana, Sebastiano Vigna, Stefano Zacchiroli", "abstract": "International audience", "venue": "https://doi.org/10.1145/3589335.3651581", "label": 0}, {"loc": [8.642484664916992, 0.8584286570549011], "openalex_id": "https://openalex.org/W4396843793", "title": "Knowledge Induced Transformer Network for Causality Prediction", "authors": "Tirthankar Dasgupta, Manjira Sinha, Abir Naskar", "abstract": "Causal extraction from text plays a crucial role in various downstream analytical and predictive tasks, such as constructing repositories of causal insights for reasoning. However, existing models often overlook the rich contextual commonsense knowledge that could enhance the reasoning process and evaluate underlying causal mechanisms. In this study, we introduce a knowledge-induced transformer architecture for predicting causality. Our model accepts an antecedent and a set of contextual knowledge as input, then ranks plausible consequences from a given set of hypotheses. To enhance semantic understanding, we augment the transformer with a relational graph network, which computes fine-grained semantic information between the antecedent, knowledge, and hypotheses using a similarity matrix that quantifies word-to-word similarity. We evaluate the proposed architecture against state-of-the-art models using openly available datasets and demonstrate its superior performance.", "venue": "https://doi.org/10.1145/3589335.3651531", "label": 0}, {"loc": [9.532722473144531, 1.5648342370986938], "openalex_id": "https://openalex.org/W4396843693", "title": "The Web Data Commons Schema. org Table Corpora", "authors": "Ralph Peeters, Alexander Brinkmann, Christian Bizer", "abstract": "The research on table representation learning, data retrieval, and data integration in the context of data lakes requires large table corpora for the training and evaluation of the developed methods. Over the years, several large table corpora such as WikiTables, GitTables, or the Dresden Web Table Corpus have been published and are used by the research community. This paper complements the set of public table corpora with the Web Data Commons Schema.org table corpora, two table corpora consisting of 4.2 (Release 2020) and 5 million (Release 2023) relational tables describing products, events, local businesses, job postings, recipes, movies, books, as well as 37 further types of entities. The feature that distinguishes the corpora from all other publicly available large table corpora is that all tables that describe entities of a specific type use the same attributes to describe these entities, i.e. all tables use a shared schema, the schema.org vocabulary. The shared schema eases the integration of data from different sources and allows training processes to focus on specific types of entities or specific attributes. Altogether the tables contain ~653 million rows of data which have been extracted from the Common Crawl web corpus and have been grouped into separate tables for each class/host combination, i.e. all records of a specific class that originate from a specific website are put into a single table. This paper describes the creation of the WDC Schema.org Table Corpora, gives an overview of the content of the corpora, and discusses their use cases.", "venue": "https://doi.org/10.1145/3589335.3651441", "label": 0}, {"loc": [8.275094032287598, 1.1980900764465332], "openalex_id": "https://openalex.org/W4396843721", "title": "MS MARCO Web Search: a Large-scale Information-rich Web Dataset with Millions of Real Click Labels", "authors": "Qi Chen, Xiubo Geng, Corby Rosset, Carolyn Buractaon, Jingwen Lu, Tao Shen, Kun Zhou, Chenyan Xiong, Yeyun Gong, Paul J. Bennett, Nick Craswell, Xing Xie, Fan Yang, Bryan Tower, Nikhil Rao, Anlei Dong, Wenqi Jiang, Zheng Liu, Mingqin Li, Liu Chuan-jie, Zengzhong Li, Rangan Majumder, J. Neville, Andy Oakley, Knut Magne Risvik, Harsha Vardhan Simhadri, Manik Varma, Yujing Wang, Linjun Yang, Mao Yang, Ce Zhang", "abstract": "Recent breakthroughs in large models have highlighted the critical\\nsignificance of data scale, labels and modals. In this paper, we introduce MS\\nMARCO Web Search, the first large-scale information-rich web dataset, featuring\\nmillions of real clicked query-document labels. This dataset closely mimics\\nreal-world web document and query distribution, provides rich information for\\nvarious kinds of downstream tasks and encourages research in various areas,\\nsuch as generic end-to-end neural indexer models, generic embedding models, and\\nnext generation information access system with large language models. MS MARCO\\nWeb Search offers a retrieval benchmark with three web retrieval challenge\\ntasks that demand innovations in both machine learning and information\\nretrieval system research domains. As the first dataset that meets large, real\\nand rich data requirements, MS MARCO Web Search paves the way for future\\nadvancements in AI and system research. MS MARCO Web Search dataset is\\navailable at: https://github.com/microsoft/MS-MARCO-Web-Search.\\n", "venue": "https://doi.org/10.1145/3589335.3648327", "label": 0}, {"loc": [2.798992872238159, 3.0720226764678955], "openalex_id": "https://openalex.org/W4396843934", "title": "A Longitudinal Study of Content Control Mechanisms", "authors": "Michael Dinzinger, Michael Granitzer", "abstract": "As generative AI continues to evolve, it becomes increasingly important for site owners to effectively communicate their conditions and preferences to web agents to maintain data sovereignty. This necessity underscores the importance of an ecosystem where the technical means to prevent unauthorized data mining and to set conditions on the usage of web resources are readily available. Our research focuses on the temporal development of such technical content control methods, examining two primary mechanisms: the regulation of web robots via the Robots Exclusion Protocol and the semantic annotation of web documents with licensing information. Through a longitudinal study, we analyze the implementation and recent modifications of robots.txt files, robot directives (such as noindex, nofollow, etc.), and license-related HTML annotations. This study is driven by the growing awareness among site owners regarding the control over their content in the face of the progression of AI, highlighting the critical need for effective web content control strategies to protect and appropriately manage the wealth of texts, images, videos, and other content populating the internet.", "venue": "https://doi.org/10.1145/3589335.3651893", "label": 0}, {"loc": [4.8559441566467285, 0.8889102339744568], "openalex_id": "https://openalex.org/W4396832709", "title": "Evaluating Large Language Models on Academic Literature Understanding and Review: An Empirical Study among Early-stage Scholars", "authors": "Jiyao Wang, Haolong Hu, Z. D. Wang, Song Yan, Youyu Sheng, Dengbo He", "abstract": "The rapid advancement of large language models (LLMs) such as ChatGPT makes LLM-based academic tools possible. However, little research has empirically evaluated how scholars perform different types of academic tasks with LLMs. Through an empirical study followed by a semi-structured interview, we assessed 48 early-stage scholars' performance in conducting core academic activities (i.e., paper reading and literature reviews) under different levels of time pressure. Before conducting the tasks, participants received different training programs regarding the limitations and capabilities of the LLMs. After completing the tasks, participants completed an interview. Quantitative data regarding the influence of time pressure, task type, and training program on participants' performance in academic tasks was analyzed. Semi-structured interviews provided additional information on the influential factors of task performance, participants' perceptions of LLMs, and concerns about integrating LLMs into academic workflows. The findings can guide more appropriate usage and design of LLM-based tools in assisting academic work.", "venue": "https://doi.org/10.1145/3613904.3641917", "label": 0}, {"loc": [3.312122106552124, 1.2645721435546875], "openalex_id": "https://openalex.org/W4391835981", "title": "Evaluating the Experience of LGBTQ+ People Using Large Language Model Based Chatbots for Mental Health Support", "authors": "Zilin Ma, Yiyang Mei, Yinru Long, Zhaoyuan Su, Krzysztof Z. Gajos", "abstract": "LGBTQ+ individuals are increasingly turning to chatbots powered by large language models (LLMs) to meet their mental health needs. However, little research has explored whether these chatbots can adequately and safely provide tailored support for this demographic. We interviewed 18 LGBTQ+ and 13 non-LGBTQ+ participants about their experiences with LLM-based chatbots for mental health needs. LGBTQ+ participants relied on these chatbots for mental health support, likely due to an absence of support in real life. Notably, while LLMs offer prompt support, they frequently fall short in grasping the nuances of LGBTQ-specific challenges. Although fine-tuning LLMs to address LGBTQ+ needs can be a step in the right direction, it isn't the panacea. The deeper issue is entrenched in societal discrimination. Consequently, we call on future researchers and designers to look beyond mere technical refinements and advocate for holistic strategies that confront and counteract the societal biases burdening the LGBTQ+ community.", "venue": "https://doi.org/10.1145/3613904.3642482", "label": 0}, {"loc": [8.002803802490234, 0.8894826173782349], "openalex_id": "https://openalex.org/W4396821944", "title": "Arctic-Embed: Scalable, Efficient, and Accurate Text Embedding Models", "authors": "Luke Merrick, Danmei Xu, Gaurav Nuti, Daniel Campos", "abstract": "This report describes the training dataset creation and recipe behind the family of \\texttt{arctic-embed} text embedding models (a set of five models ranging from 22 to 334 million parameters with weights open-sourced under an Apache-2 license). At the time of their release, each model achieved state-of-the-art retrieval accuracy for models of their size on the MTEB Retrieval leaderboard, with the largest model, arctic-embed-l outperforming closed source embedding models such as Cohere's embed-v3 and Open AI's text-embed-3-large. In addition to the details of our training recipe, we have provided several informative ablation studies, which we believe are the cause of our model performance.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.981281280517578, 2.6331379413604736], "openalex_id": "https://openalex.org/W4396821969", "title": "DrawL: Understanding the Effects of Non-Mainstream Dialects in Prompted Image Generation", "authors": "Joshua N. Williams, Molly FitzMorris, Osman Aka, Sarah Laszlo", "abstract": "Text-to-image models are now easy to use and ubiquitous. However, prior work has found that they are prone to recapitulating harmful Western stereotypes. For example, requesting that a model generate an \"African person and their house,\" may produce a person standing next to a straw hut. In this example, the word \"African\" is an explicit descriptor of the person that the prompt is seeking to depict. Here, we examine whether implicit markers, such as dialect, can also affect the portrayal of people in text-to-image outputs. We pair prompts in Mainstream American English with counterfactuals that express grammatical constructions found in dialects correlated with historically marginalized groups. We find that through minimal, syntax-only changes to prompts, we can systematically shift the skin tone and gender of people in the generated images. We conclude with a discussion of whether dialectic distribution shifts like this are harmful or are expected, possibly even desirable, model behavior.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.568446159362793, -0.2841218411922455], "openalex_id": "https://openalex.org/W4396776466", "title": "Performance Comparison of Large Language Models, GPT and Gemini on Turkish News Classification Task", "authors": "Zekeriya An\u0131l G\u00fcven", "abstract": "Abstract Recently, large language models-LLMs have become very popular in many tasks of natural language processing. Examples of these tasks include text classification, question answering, text summarization, and text generation for natural language processing. Apart from LLMs, GPT and Gemini models are at the top of the list in terms of use for text generation tasks. This study aims to contribute to the literature on the use and comparison of LLMs and text generation models for the Turkish language. To achieve this purpose, the dataset consisting of Turkish news was classified by training BERT, ALBERT, DistilBERT, ELECTRA, XLM-RoBERTA LLMs with fine-tuning. Additionally, GPT-3.5 and Gemini text generation models were used by sending prompts for this classification task, and the success of the models was compared with LLMs. As a result of all analyses, the BERT model gave 97.619% accuracy among LLMs, while Gemini gave 99.167% accuracy among text generation models.", "venue": "Research Square (Research Square)", "label": 25}, {"loc": [4.417841911315918, 2.5171756744384766], "openalex_id": "https://openalex.org/W4396816988", "title": "Seeds of Stereotypes: A Large-Scale Textual Analysis of Race and Gender Associations with Diseases in Online Sources", "authors": "Lasse Hansen, Nikolaj Andersen, Jack Gallifant, Liam G. McCoy, James K Stone, Nura Izath, Marcela Aguirre-Jerez, Danielle S. Bitterman, Judy Wawira Gichoya, Leo Anthony Celi", "abstract": "Background Advancements in Large Language Models (LLMs) hold transformative potential in healthcare, however, recent work has raised concern about the tendency of these models to produce outputs that display racial or gender biases. Although training data is a likely source of such biases, exploration of disease and demographic associations in text data at scale has been limited. Methods We conducted a large-scale textual analysis using a dataset comprising diverse web sources, including Arxiv, Wikipedia, and Common Crawl. The study analyzed the context in which various diseases are discussed alongside markers of race and gender. Given that LLMs are pre-trained on similar datasets, this approach allowed us to examine the potential biases that LLMs may learn and internalize. We compared these findings with actual demographic disease prevalence as well as GPT-4 outputs in order to evaluate the extent of bias representation. Results Our findings indicate that demographic terms are disproportionately associated with specific disease concepts in online texts. gender terms are prominently associated with disease concepts, while racial terms are much less frequently associated. We find widespread disparities in the associations of specific racial and gender terms with the 18 diseases analyzed. Most prominently, we see an overall significant overrepresentation of Black race mentions in comparison to population proportions. Conclusions Our results highlight the need for critical examination and transparent reporting of biases in LLM pretraining datasets. Our study suggests the need to develop mitigation strategies to counteract the influence of biased training data in LLMs, particularly in sensitive domains such as healthcare.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.873072624206543, 5.024748802185059], "openalex_id": "https://openalex.org/W4396815965", "title": "Interpretable Tensor Fusion", "authors": "Saurabh Varshneya, Antoine Ledent, Philipp Liznerski, Andriy Balinskyy, Purvanshi Mehta, Waleed Mustafa, Marius Kloft", "abstract": "Conventional machine learning methods are predominantly designed to predict outcomes based on a single data type. However, practical applications may encompass data of diverse types, such as text, images, and audio. We introduce interpretable tensor fusion (InTense), a multimodal learning method for training neural networks to simultaneously learn multimodal data representations and their interpretable fusion. InTense can separately capture both linear combinations and multiplicative interactions of diverse data types, thereby disentangling higher-order interactions from the individual effects of each modality. InTense provides interpretability out of the box by assigning relevance scores to modalities and their associations. The approach is theoretically grounded and yields meaningful relevance scores on multiple synthetic and real-world datasets. Experiments on six real-world datasets show that InTense outperforms existing state-of-the-art multimodal interpretable approaches in terms of accuracy and interpretability.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.096435546875, 3.411630630493164], "openalex_id": "https://openalex.org/W4396813446", "title": "Generating Probabilistic Scenario Programs from Natural Language", "authors": "Karim Elmaaroufi, Devan Shankar, Ana Cismaru, Marcell Vazquez-Chanlatte, Alberto Sangiovanni\u2010Vincentelli, Matei Zaharia, Sanjit A. Seshia", "abstract": "For cyber-physical systems (CPS), including robotics and autonomous vehicles, mass deployment has been hindered by fatal errors that occur when operating in rare events. To replicate rare events such as vehicle crashes, many companies have created logging systems and employed crash reconstruction experts to meticulously recreate these valuable events in simulation. However, in these methods, \"what if\" questions are not easily formulated and answered. We present ScenarioNL, an AI System for creating scenario programs from natural language. Specifically, we generate these programs from police crash reports. Reports normally contain uncertainty about the exact details of the incidents which we represent through a Probabilistic Programming Language (PPL), Scenic. By using Scenic, we can clearly and concisely represent uncertainty and variation over CPS behaviors, properties, and interactions. We demonstrate how commonplace prompting techniques with the best Large Language Models (LLM) are incapable of reasoning about probabilistic scenario programs and generating code for low-resource languages such as Scenic. Our system is comprised of several LLMs chained together with several kinds of prompting strategies, a compiler, and a simulator. We evaluate our system on publicly available autonomous vehicle crash reports in California from the last five years and share insights into how we generate code that is both semantically meaningful and syntactically correct.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.4149298667907715, 1.7532517910003662], "openalex_id": "https://openalex.org/W4396736631", "title": "Botlitica: A generative AI-based tool to assist journalists in navigating political propaganda campaigns", "authors": "Elena Musi, Edgar Everardo Garcia Aguilar, Lorenzo Federico", "abstract": "The hype on generative AI has raised concerns about the spread of disinformation but also opened up new opportunities for hybrid journalism. The proliferation of political propaganda campaigns spread across digital media during election periods constitutes a challenge for journalists who struggle to exercise information gatekeeping. AI-based tools can in principle help, but journalists are resistant to using them since they are sceptical about their compliance with news values and their scarce user-friendliness. To cope with such issues, we present Botlitica, a GPT3-based chatbot able to answer users\u2019 questions according to the information shared on different social media by a given political party which is, thus, embodied by the AI agent in the conversation. The back-end and front-design of the chatbot are devised to privilege transparency. We report the results of a preliminary evaluation of Botlitica which show that the tool fastens journalists\u2019 capabilities to navigate propaganda campaigns inducing them to exercise critical thinking.", "venue": "Studies in Communication Sciences", "label": 0}, {"loc": [6.0597920417785645, 3.523804187774658], "openalex_id": "https://openalex.org/W4396813446", "title": "ScenicNL: Generating Probabilistic Scenario Programs from Natural Language", "authors": "Karim Elmaaroufi, Devan Shankar, Ana Cismaru, Marcell Vazquez-Chanlatte, Alberto Sangiovanni\u2010Vincentelli, Matei Zaharia, Sanjit A. Seshia", "abstract": "For cyber-physical systems (CPS), including robotics and autonomous vehicles, mass deployment has been hindered by fatal errors that occur when operating in rare events. To replicate rare events such as vehicle crashes, many companies have created logging systems and employed crash reconstruction experts to meticulously recreate these valuable events in simulation. However, in these methods, \"what if\" questions are not easily formulated and answered. We present ScenarioNL, an AI System for creating scenario programs from natural language. Specifically, we generate these programs from police crash reports. Reports normally contain uncertainty about the exact details of the incidents which we represent through a Probabilistic Programming Language (PPL), Scenic. By using Scenic, we can clearly and concisely represent uncertainty and variation over CPS behaviors, properties, and interactions. We demonstrate how commonplace prompting techniques with the best Large Language Models (LLM) are incapable of reasoning about probabilistic scenario programs and generating code for low-resource languages such as Scenic. Our system is comprised of several LLMs chained together with several kinds of prompting strategies, a compiler, and a simulator. We evaluate our system on publicly available autonomous vehicle crash reports in California from the last five years and share insights into how we generate code that is both semantically meaningful and syntactically correct.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.53619647026062, 1.442596435546875], "openalex_id": "https://openalex.org/W4396734975", "title": "Understanding College Students' Satisfaction With ChatGPT: An Exploratory And Predictive Machine Learning Approach Using Feature Engineering", "authors": "Kavita Pabreja, Nishtha Pabreja", "abstract": "Artificial Intelligence (AI) technologies are continually improving and becoming more pervasive in many facets of our lives. ChatGPT is one such cutting-edge artificial intelligence application, and it has received a lot of worldwide media attention, specifically from educationists, technologists, and learners. It is imperative to understand and evaluate the impact of ChatGPT on computer science students as it directly and holistically influences them. A quantitative instrumental case study explores ChatGPT\u2019s impact on early adopters in education. A survey of undergraduate computer science students at a state university of Delhi was conducted to get insight into their opinion on adopting this revolutionising technology for their education, career, and overall satisfaction. An end-to-end data science approach is applied to encompass exploratory and predictive modelling with feature engineering solutions. Results reveal the most influential features contributing to students\u2019 satisfaction in adopting ChatGPT for their day-to-day chores concerning their social life, education, and career. The Linear Support Vector classifier, a machine learning algorithm for predicting the satisfaction or dissatisfaction in students\u2019 shows an accuracy score of 72.73% and 97.72%, respectively. The AUC for this multiclass prediction model is convincing and is 0.74, 0.71, and 0.96 for satisfied, neutral, and dissatisfied classes, respectively.", "venue": "MIER Journal of Educational Studies Trends & Practices", "label": 0}, {"loc": [6.997058391571045, 0.5067648887634277], "openalex_id": "https://openalex.org/W4396735702", "title": "Bridging large language model disparities: Skill tagging of multilingual educational content", "authors": "Yerin Kwak, Zachary A. Pardos", "abstract": "Abstract The adoption of large language models (LLMs) in education holds much promise. However, like many technological innovations before them, adoption and access can often be inequitable from the outset, creating more divides than they bridge. In this paper, we explore the magnitude of the country and language divide in the leading open\u2010source and proprietary LLMs with respect to knowledge of K\u201012 taxonomies in a variety of countries and their performance on tagging problem content with the appropriate skill from a taxonomy, an important task for aligning open educational resources and tutoring content with state curricula. We also experiment with approaches to narrowing the performance divide by enhancing LLM skill tagging performance across four countries (the USA, Ireland, South Korea and India\u2013Maharashtra) for more equitable outcomes. We observe considerable performance disparities not only with non\u2010English languages but with English and non\u2010US taxonomies. Our findings demonstrate that fine\u2010tuning GPT\u20103.5 with a few labelled examples can improve its proficiency in tagging problems with relevant skills or standards, even for countries and languages that are underrepresented during training. Furthermore, the fine\u2010tuning results show the potential viability of GPT as a multilingual skill classifier. Using both an open\u2010source model, Llama2\u201013B, and a closed\u2010source model, GPT\u20103.5, we also observe large disparities in tagging performance between the two and find that fine\u2010tuning and skill information in the prompt improve both, but the closed\u2010source model improves to a much greater extent. Our study contributes to the first empirical results on mitigating disparities across countries and languages with LLMs in an educational context. Practitioner notes What is already known about this topic Recent advances in generative AI have led to increased applications of LLMs in education, offering diverse opportunities. LLMs excel predominantly in English and exhibit a bias towards the US context. Automated content tagging has been studied using English\u2010language content and taxonomies. What this paper adds Investigates the country and language disparities in LLMs concerning knowledge of educational taxonomies and their performance in tagging content. Presents the first empirical findings on addressing disparities in LLM performance across countries and languages within an educational context. Improves GPT\u20103.5's tagging accuracy through fine\u2010tuning, even for non\u2010US countries, starting from zero accuracy. Extends automated content tagging to non\u2010English languages using both open\u2010source and closed\u2010source LLMs. Implications for practice and/or policy Underscores the importance of considering the performance generalizability of LLMs to languages other than English. Highlights the potential viability of ChatGPT as a skill tagging classifier across countries.", "venue": "British Journal of Educational Technology", "label": 10}, {"loc": [3.7785768508911133, 0.3614392876625061], "openalex_id": "https://openalex.org/W4396786706", "title": "AtomGPT: Atomistic Generative Pre-trained Transformer for Forward and Inverse Materials Design", "authors": "Kamal Choudhary", "abstract": "Large language models (LLMs) such as generative pretrained transformers (GPTs) have shown potential for various commercial applications, but their applicability for materials design remains underexplored. In this article, we introduce AtomGPT, a model specifically developed for materials design based on transformer architectures, to demonstrate the capability for both atomistic property prediction and structure generation. We show that a combination of chemical and structural text descriptions can efficiently predict material properties with accuracy comparable to graph neural network models, including formation energies, electronic bandgaps from two different methods and superconducting transition temperatures. Furthermore, we demonstrate that AtomGPT can generate atomic structures for tasks such as designing new superconductors, with the predictions validated through density functional theory calculations. This work paves the way for leveraging LLMs in forward and inverse materials design, offering an efficient approach to the discovery and optimization of materials.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.705273628234863, 3.380537986755371], "openalex_id": "https://openalex.org/W4396786590", "title": "Enabling High-Sparsity Foundational Llama Models with Efficient Pretraining and Deployment", "authors": "Abhinav Agarwalla, Abhay Gupta, Alexandre Carriconde Marques, Shubhra Pandit, Michael Goin, Eldar Kurtic, K.M.K.H. Leong, Tuan M. Nguyen, Mahmoud Salem, Dan Alistarh, Sean Lie, Mark Kurtz", "abstract": "Large language models (LLMs) have revolutionized Natural Language Processing (NLP), but their size creates computational bottlenecks. We introduce a novel approach to create accurate, sparse foundational versions of performant LLMs that achieve full accuracy recovery for fine-tuning tasks at up to 70% sparsity. We achieve this for the LLaMA-2 7B model by combining the SparseGPT one-shot pruning method and sparse pretraining of those models on a subset of the SlimPajama dataset mixed with a Python subset of The Stack dataset. We exhibit training acceleration due to sparsity on Cerebras CS-3 chips that closely matches theoretical scaling. In addition, we establish inference acceleration of up to 3x on CPUs by utilizing Neural Magic's DeepSparse engine and 1.7x on GPUs through Neural Magic's nm-vllm engine. The above gains are realized via sparsity alone, thus enabling further gains through additional use of quantization. Specifically, we show a total speedup on CPUs for sparse-quantized LLaMA models of up to 8.6x. We demonstrate these results across diverse, challenging tasks, including chat, instruction following, code generation, arithmetic reasoning, and summarization to prove their generality. This work paves the way for rapidly creating smaller and faster LLMs without sacrificing accuracy.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.750929355621338, 3.9900243282318115], "openalex_id": "https://openalex.org/W4396746182", "title": "Lory: Fully Differentiable Mixture-of-Experts for Autoregressive Language Model Pre-training", "authors": "Zexuan Zhong, Mengzhou Xia, Danqi Chen, Michael Lewis", "abstract": "Mixture-of-experts (MoE) models facilitate efficient scaling; however, training the router network introduces the challenge of optimizing a non-differentiable, discrete objective. Recently, a fully-differentiable MoE architecture, SMEAR, was proposed (Muqeeth et al., 2023), which softly merges experts in the parameter space; nevertheless, its effectiveness was only demonstrated in downstream fine-tuning on classification tasks. In this paper, we present Lory, the first approach that scales such architectures to autoregressive language model pre-training. Lory introduces two key techniques: (1) a causal segment routing strategy that achieves high efficiency for expert merging operations while preserving the autoregressive nature of language models; (2) a similarity-based data batching method that encourages expert specialization by grouping similar documents in training instances. We pre-train a series of Lory models on 150B tokens from scratch, with up to 32 experts and 30B (1.5B active) parameters. Experimental results show significant performance gains over parameter-matched dense models on both perplexity (+13.9%) and a variety of downstream tasks (+1.5%-11.1%). Despite segment-level routing, Lory models achieve competitive performance compared to state-of-the-art MoE models with token-level routing. We further demonstrate that the trained experts in Lory capture domain-level specialization without supervision. Our work highlights the potential of fully-differentiable MoE architectures for language model pre-training and advocates future research in this area.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.3358154296875, 2.3807213306427], "openalex_id": "https://openalex.org/W4396788158", "title": "MAmmoTH2: Scaling Instructions from the Web", "authors": "Xiang Yue, Tuney Zheng, Ge Zhang, Wenhu Chen", "abstract": "Instruction tuning improves the reasoning abilities of large language models (LLMs), with data quality and scalability being the crucial factors. Most instruction tuning data come from human crowd-sourcing or GPT-4 distillation. We propose a paradigm to efficiently harvest 10 million naturally existing instruction data from the pre-training web corpus to enhance LLM reasoning. Our approach involves (1) recalling relevant documents, (2) extracting instruction-response pairs, and (3) refining the extracted pairs using open-source LLMs. Fine-tuning base LLMs on this dataset, we build MAmmoTH2 models, which significantly boost performance on reasoning benchmarks. Notably, MAmmoTH2-7B's (Mistral) performance increases from 11% to 36.7% on MATH and from 36% to 68.4% on GSM8K without training on any in-domain data. Further training MAmmoTH2 on public instruction tuning datasets yields MAmmoTH2-Plus, achieving state-of-the-art performance on several reasoning and chatbot benchmarks. Our work demonstrates how to harvest large-scale, high-quality instruction data without costly human annotation or GPT-4 distillation, providing a new paradigm for building better instruction tuning data.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.7251341342926025, -0.4791144132614136], "openalex_id": "https://openalex.org/W4396705712", "title": "A systematic review of multidimensional relevance estimation in information retrieval", "authors": "Georgios Peikos, Gabriella Pasi", "abstract": "Abstract In information retrieval, relevance is perceived as a multidimensional and dynamic concept influenced by user, task, and domain factors. Relying on this perspective, researchers have introduced multidimensional relevance models addressing diverse search tasks across numerous knowledge domains. Through our systematic review of 72 studies, we categorize research based on domain specificity and the distinct relevance aspects employed for estimating multidimensional relevance. Moreover, we highlight the approaches used to aggregate scores related to these factors and rank information items. Our insights underline the importance of concise definitions and unified methods for estimating relevance factors within and across domains. Finally, we identify benchmark collections for evaluations based on multiple relevance aspects while underscoring the necessity for new ones. Our findings suggest that large language models hold considerable promise for shaping future research in this field, mainly due to their relevance labeling abilities. This article is categorized under: Application Areas > Science and Technology Technologies > Computational Intelligence", "venue": "Wiley Interdisciplinary Reviews Data Mining and Knowledge Discovery", "label": 0}, {"loc": [3.6881089210510254, 4.556548118591309], "openalex_id": "https://openalex.org/W4396776522", "title": "1-Diffractor: Efficient and Utility-Preserving Text Obfuscation Leveraging Word-Level Metric Differential Privacy", "authors": "Stephen Meisenbacher, Maulik Chevli, Florian Matthes", "abstract": "The study of privacy-preserving Natural Language Processing (NLP) has gained rising attention in recent years. One promising avenue studies the integration of Differential Privacy in NLP, which has brought about innovative methods in a variety of application settings. Of particular note are $\\textit{word-level Metric Local Differential Privacy (MLDP)}$ mechanisms, which work to obfuscate potentially sensitive input text by performing word-by-word $\\textit{perturbations}$. Although these methods have shown promising results in empirical tests, there are two major drawbacks: (1) the inevitable loss of utility due to addition of noise, and (2) the computational expensiveness of running these mechanisms on high-dimensional word embeddings. In this work, we aim to address these challenges by proposing $\\texttt{1-Diffractor}$, a new mechanism that boasts high speedups in comparison to previous mechanisms, while still demonstrating strong utility- and privacy-preserving capabilities. We evaluate $\\texttt{1-Diffractor}$ for utility on several NLP tasks, for theoretical and task-based privacy, and for efficiency in terms of speed and memory. $\\texttt{1-Diffractor}$ shows significant improvements in efficiency, while still maintaining competitive utility and privacy scores across all conducted comparative tests against previous MLDP mechanisms. Our code is made available at: https://github.com/sjmeis/Diffractor.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.006216049194336, 1.9082828760147095], "openalex_id": "https://openalex.org/W4396759893", "title": "UNIVERSAL IMITATION GAMES", "authors": "Sridhar Mahadevan", "abstract": "Alan Turing proposed in 1950 a framework called an imitation game to decide if a machine could think. Using mathematics developed largely after Turing -- category theory -- we analyze a broader class of universal imitation games (UIGs), which includes static, dynamic, and evolutionary games. In static games, the participants are in a steady state. In dynamic UIGs, \"learner\" participants are trying to imitate \"teacher\" participants over the long run. In evolutionary UIGs, the participants are competing against each other in an evolutionary game, and participants can go extinct and be replaced by others with higher fitness. We use the framework of category theory -- in particular, two influential results by Yoneda -- to characterize each type of imitation game. Universal properties in categories are defined by initial and final objects. We characterize dynamic UIGs where participants are learning by inductive inference as initial algebras over well-founded sets, and contrast them with participants learning by conductive inference over the final coalgebra of non-well-founded sets. We briefly discuss the extension of our categorical framework for UIGs to imitation games on quantum computers.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.3287172317504883, -0.5188090801239014], "openalex_id": "https://openalex.org/W4396669640", "title": "Computerized diagnostic decision support systems\u2013a comparative performance study of Isabel Pro vs. ChatGPT4", "authors": "Joe M. Bridges", "abstract": "Abstract Objectives Validate the diagnostic accuracy of the Artificial Intelligence Large Language Model ChatGPT4 by comparing diagnosis lists produced by ChatGPT4 to Isabel Pro. Methods This study used 201 cases, comparing ChatGPT4 to Isabel Pro. Systems inputs were identical. Mean Reciprocal Rank (MRR) compares the correct diagnosis\u2019s rank between systems. Isabel Pro ranks by the frequency with which the symptoms appear in the reference dataset. The mechanism ChatGPT4 uses to rank the diagnoses is unknown. A Wilcoxon Signed Rank Sum test failed to reject the null hypothesis. Results Both systems produced comprehensive differential diagnosis lists. Isabel Pro\u2019s list appears immediately upon submission, while ChatGPT4 takes several minutes. Isabel Pro produced 175 (87.1 %) correct diagnoses and ChatGPT4 165 (82.1 %). The MRR for ChatGPT4 was 0.428 (rank 2.31), and Isabel Pro was 0.389 (rank 2.57), an average rank of three for each. ChatGPT4 outperformed on Recall at Rank 1, 5, and 10, with Isabel Pro outperforming at 20, 30, and 40. The Wilcoxon Signed Rank Sum Test confirmed that the sample size was inadequate to conclude that the systems are equivalent. ChatGPT4 fabricated citations and DOIs, producing 145 correct references (87.9 %) but only 52 correct DOIs (31.5 %). Conclusions This study validates the promise of Clinical Diagnostic Decision Support Systems, including the Large Language Model form of artificial intelligence (AI). Until the issue of hallucination of references and, perhaps diagnoses, is resolved in favor of absolute accuracy, clinicians will make cautious use of Large Language Model systems in diagnosis, if at all.", "venue": "Diagnosis", "label": 0}, {"loc": [3.8001675605773926, -3.928757667541504], "openalex_id": "https://openalex.org/W4396671072", "title": "A Hybrid Deep BiLSTM-CNN for Hate Speech Detection in Multi-social media", "authors": "Ashwini Kumar, Santosh Kumar, Kalpdrum Passi, Aniket Mahanti", "abstract": "Nowadays, means of communication among people have changed due to advancements in information technology and the rise of online multi-social media. Many people express their feelings, ideas, and emotions on social media sites such as Instagram, Twitter, Gab, Reddit, Facebook, and YouTube. However, people have misused social media to send hateful messages to specific individuals or groups to create chaos. For various governance authorities, manually identifying hate speech on various social media platforms is a difficult task to avoid such chaos. In this study, a hybrid deep-learning model, where bidirectional long short-term memory (BiLSTM) and convolutional neural network (CNN) are used to classify hate speech in textual data, is proposed. This model incorporates a GLOVE-based word embedding approach, dropout, L2 regularization, and global max pooling to get impressive results. Further, the proposed BiLSTM-CNN model has been evaluated on various datasets to achieve state-of-the-art performance that is superior to the traditional and existing machine learning methods in terms of accuracy, precision, recall, and F1-score.", "venue": "ACM Transactions on Asian and Low-Resource Language Information Processing", "label": 22}, {"loc": [3.933206081390381, -1.2069501876831055], "openalex_id": "https://openalex.org/W4396621042", "title": "Advancing Real-time Pandemic Forecasting Using Large Language Models: A COVID-19 Case Study", "authors": "Hao Yang, Hongru Du, Jianan Zhao, Yang Zhao, Shaochong Xu, Xihong Lin, Yiran Chen, Lauren Gardner", "abstract": "Abstract Forecasting the short-term spread of an ongoing disease outbreak is a formidable challenge due to the complexity of contributing factors, some of which can be characterized through interlinked, multi-modality variables such as epidemiological time series data, viral biology, population demographics, and the intersection of public policy and human behavior. Existing forecasting model frameworks struggle with the multifaceted nature of relevant data and robust results translation, which hinders their performances and the provision of actionable insights for public health decision-makers. Our work introduces PandemicLLM, a novel framework with multi-modal Large Language Models (LLMs) that reformulates real-time forecasting of disease spread as a text reasoning problem, with the ability to incorporate real-time, complex, non-numerical information -- such as textual policies and genomic surveillance data -- previously unattainable in traditional forecasting models. This approach, through a unique AI-human cooperative prompt design and time series representation learning, encodes multi-modal data for LLMs. By redefining the forecasting process as an ordinal classification task, PandemicLLM yields more robust and trustworthy predictions, facilitating public health decision-making. The model is applied to the COVID-19 pandemic, and trained to utilize textual public health policies, genomic surveillance, spatial, and epidemiological time series data, and is subsequently tested across all 50 states of the U.S. for a duration of 16 weeks. Empirically, PandemicLLM is shown to be a high-performing pandemic forecasting framework that effectively captures the impact of emerging variants and can provide timely and accurate predictions. The proposed PandemicLLM opens avenues for incorporating various pandemic-related data in heterogeneous formats and exhibits performance benefits over existing models. This study illuminates the potential of adapting LLMs and representation learning to enhance pandemic forecasting, illustrating how AI innovations can strengthen pandemic responses and crisis management in the future.", "venue": "https://doi.org/10.21203/rs.3.rs-4244182/v1", "label": 0}, {"loc": [6.318725109100342, 0.5211344957351685], "openalex_id": "https://openalex.org/W4396653239", "title": "Homonym Sense Disambiguation in the Georgian Language", "authors": "Davit Melikidze, Alexander Gamkrelidze", "abstract": "This research proposes a novel approach to the Word Sense Disambiguation (WSD) task in the Georgian language, based on supervised fine-tuning of a pre-trained Large Language Model (LLM) on a dataset formed by filtering the Georgian Common Crawls corpus. The dataset is used to train a classifier for words with multiple senses. Additionally, we present experimental results of using LSTM for WSD. Accurately disambiguating homonyms is crucial in natural language processing. Georgian, an agglutinative language belonging to the Kartvelian language family, presents unique challenges in this context. The aim of this paper is to highlight the specific problems concerning homonym disambiguation in the Georgian language and to present our approach to solving them. The techniques discussed in the article achieve 95% accuracy for predicting lexical meanings of homonyms using a hand-classified dataset of over 7500 sentences.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.154932498931885, 5.564347743988037], "openalex_id": "https://openalex.org/W4396821571", "title": "Modeling Caption Diversity in Contrastive Vision-Language Pretraining", "authors": "Samuel Lavoie, Polina Kirichenko, Mark Ibrahim, Mahmoud Assran, Andrew Gordon Wildon, Aaron Courville, Nicolas Ballas", "abstract": "There are a thousand ways to caption an image. Contrastive Language Pretraining (CLIP) on the other hand, works by mapping an image and its caption to a single vector -- limiting how well CLIP-like models can represent the diverse ways to describe an image. In this work, we introduce Llip, Latent Language Image Pretraining, which models the diversity of captions that could match an image. Llip's vision encoder outputs a set of visual features that are mixed into a final representation by conditioning on information derived from the text. We show that Llip outperforms non-contextualized baselines like CLIP and SigLIP on a variety of tasks even with large-scale encoders. Llip improves zero-shot classification by an average of 2.9% zero-shot classification benchmarks with a ViT-G/14 encoder. Specifically, Llip attains a zero-shot top-1 accuracy of 83.5% on ImageNet outperforming a similarly sized CLIP by 1.4%. We also demonstrate improvement on zero-shot retrieval on MS-COCO by 6.0%. We provide a comprehensive analysis of the components introduced by the method and demonstrate that Llip leads to richer visual representations.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.169729232788086, 0.9999961853027344], "openalex_id": "https://openalex.org/W4396626127", "title": "A Legal Framework for Natural Language Model Training in Portugal", "authors": "R\u00faben Almeida, Evelin Amorim", "abstract": "Recent advances in deep learning have promoted the advent of many computational systems capable of performing intelligent actions that, until then, were restricted to the human intellect. In the particular case of human languages, these advances allowed the introduction of applications like ChatGPT that are capable of generating coherent text without being explicitly programmed to do so. Instead, these models use large volumes of textual data to learn meaningful representations of human languages. Associated with these advances, concerns about copyright and data privacy infringements caused by these applications have emerged. Despite these concerns, the pace at which new natural language processing applications continued to be developed largely outperformed the introduction of new regulations. Today, communication barriers between legal experts and computer scientists motivate many unintentional legal infringements during the development of such applications. In this paper, a multidisciplinary team intends to bridge this communication gap and promote more compliant Portuguese NLP research by presenting a series of everyday NLP use cases, while highlighting the Portuguese legislation that may arise during its development.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.21540641784668, 0.9631620049476624], "openalex_id": "https://openalex.org/W4396626127", "title": "A Legal Framework for Natural Language Processing Model Training in Portugal", "authors": "R\u00faben Almeida, Evelin Amorim", "abstract": "Recent advances in deep learning have promoted the advent of many computational systems capable of performing intelligent actions that, until then, were restricted to the human intellect. In the particular case of human languages, these advances allowed the introduction of applications like ChatGPT that are capable of generating coherent text without being explicitly programmed to do so. Instead, these models use large volumes of textual data to learn meaningful representations of human languages. Associated with these advances, concerns about copyright and data privacy infringements caused by these applications have emerged. Despite these concerns, the pace at which new natural language processing applications continued to be developed largely outperformed the introduction of new regulations. Today, communication barriers between legal experts and computer scientists motivate many unintentional legal infringements during the development of such applications. In this paper, a multidisciplinary team intends to bridge this communication gap and promote more compliant Portuguese NLP research by presenting a series of everyday NLP use cases, while highlighting the Portuguese legislation that may arise during its development.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.700477361679077, 0.37180742621421814], "openalex_id": "https://openalex.org/W4396580588", "title": "A Comprehensive Review on Large Language Models", "authors": "Asmita Yadav", "abstract": "In the realm of computer science and language, large language models (LLMs) stand out as remarkable tools of artificial intelligence (AI). Proficient in deciphering intricate language nuances, LLMs offer sensible responses and find applications in natural language understanding, language translation, and question answering. This chapter delves into the history, creation, training, and multifaceted applications of LLMs. It explores the basics of generative AI, focusing on generative pre-trained transformers (GPT). Examining the evolution of LLMs and their diverse applications in medicine, education, finance, and engineering, the chapter addresses real-world challenges, including ethical concerns, biases, comprehensibility, and computational requirements. It serves as an informative guide for researchers, practitioners, and enthusiasts, elucidating the potential, challenges, and future of LLMs in AI.", "venue": "Advances in systems analysis, software engineering, and high performance computing book series", "label": 39}, {"loc": [4.114616394042969, -2.502298593521118], "openalex_id": "https://openalex.org/W4396587977", "title": "Fake news detection in low-resource languages: A novel hybrid summarization approach", "authors": "Jawaher Alghamdi, Yuqing Lin, Suhuai Luo", "abstract": "The proliferation of fake news across languages and domains on social media platforms poses a significant societal threat. Current automatic detection methods for low-resource languages (e.g., Swahili, Indonesian and other low-resource languages) face limitations due to two factors: sequential length restrictions in pre-trained language models (PLMs) like multilingual bidirectional encoder representation from transformers (mBERT), and the presence of noisy training data. This work proposes a novel and efficient multilingual fake news detection (MFND) approach that addresses these challenges. Our solution leverages a hybrid extractive and abstractive summarization strategy to extract only the most relevant content from news articles. This significantly reduces data length while preserving crucial information for fake news classification. The pre-processed data is then fed into mBERT for classification. Extensive evaluations on a publicly available multilingual dataset demonstrate the superiority of our approach compared to state-of-the-art (SOTA) methods. Our analysis, both quantitative and qualitative, highlights the strengths of this method, achieving new performance benchmarks and emphasizing the impact of content condensation on model accuracy and efficiency. This framework paves the way for faster, more accurate MFND, fostering more robust information ecosystems.", "venue": "Knowledge-Based Systems", "label": 0}, {"loc": [5.570857048034668, 0.6760817766189575], "openalex_id": "https://openalex.org/W4396560617", "title": "WRITTEN IN PERSIAN USING A TRANSFORMER-BASED MODEL", "authors": "Tahereh Firoozi, Mark J. Gierl", "abstract": "The automated scoring of student essays is now recognized as a significant development in both the research and practice of educational assessment. The majority of the published studies on automated essay scoring (AES) focus on outcomes in English. Studies on languages other than English are, by comparison, practically nonexistent. The purpose of this chapter is to describe and evaluate the first AES system for scoring essays in the Persian language using multilingual Bidirectional Encoder Representation for Transformers (mBERT). mBERT is a transformer-based encoder model for language representation that uses an attention mechanism to learn the contextual relations between words and sentences in a text. mBERT is pre-trained on 104 languages, including Persian. mBERT was used to evaluate 2,000 holistically scored essays written in Persian by non-native language learners in Iran using a five-point scale that ranged from Elementary to Advanced. The performance of the mBERT transformer model was examined against a baseline model that only included a Word2Vec word embedding layer. The mBERT model performed with high classification consistency compared to the baseline model. These results demonstrate that the mBERT model can be used with a high degree of precision to predict the Persian essay scores produced by human raters. The methods described in this study can be easily adapted and readily used to score essays written in the remaining 103 languages in mBERT, thereby supporting the application and widespread use of multilingual AES.", "venue": "https://doi.org/10.4324/9781003397618-5", "label": 0}, {"loc": [7.768328666687012, 1.9596558809280396], "openalex_id": "https://openalex.org/W4396819097", "title": "Scaffold-BPE: Enhancing Byte Pair Encoding with Simple and Effective Scaffold Token Removal", "authors": "Haoran Lian, Yizhe Xiong, Jianwei Niu, Shasha Mo, Zhenpeng Su, Zijia Lin, Peng Liu, Hui Chen, Guiguang Ding", "abstract": "Byte Pair Encoding (BPE) serves as a foundation method for text tokenization in the Natural Language Processing (NLP) field. Despite its wide adoption, the original BPE algorithm harbors an inherent flaw: it inadvertently introduces a frequency imbalance for tokens in the text corpus. Since BPE iteratively merges the most frequent token pair in the text corpus to generate a new token and keeps all generated tokens in the vocabulary, it unavoidably holds tokens that primarily act as components of a longer token and appear infrequently on their own. We term such tokens as Scaffold Tokens. Due to their infrequent occurrences in the text corpus, Scaffold Tokens pose a learning imbalance issue. To address that issue, we propose Scaffold-BPE, which incorporates a dynamic scaffold token removal mechanism by parameter-free, computation-light, and easy-to-implement modifications to the original BPE method. This novel approach ensures the exclusion of low-frequency Scaffold Tokens from the token representations for given texts, thereby mitigating the issue of frequency imbalance and facilitating model training. On extensive experiments across language modeling and even machine translation, Scaffold-BPE consistently outperforms the original BPE, well demonstrating its effectiveness.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.718146800994873, 1.192183256149292], "openalex_id": "https://openalex.org/W4396566282", "title": "Time Machine GPT", "authors": "Felix Drinkall, Eghbal Rahimikia, Janet B. Pierrehumbert, Stefan Zohren", "abstract": "Large language models (LLMs) are often trained on extensive, temporally indiscriminate text corpora, reflecting the lack of datasets with temporal metadata. This approach is not aligned with the evolving nature of language. Conventional methods for creating temporally adapted language models often depend on further pre-training static models on time-specific data. This paper presents a new approach: a series of point-in-time LLMs called Time Machine GPT (TiMaGPT), specifically designed to be nonprognosticative. This ensures they remain uninformed about future factual information and linguistic changes. This strategy is beneficial for understanding language evolution and is of critical importance when applying models in dynamic contexts, such as time-series forecasting, where foresight of future information can prove problematic. We provide access to both the models and training datasets.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.142578601837158, 1.2640186548233032], "openalex_id": "https://openalex.org/W4396818943", "title": "Building a Large Japanese Web Corpus for Large Language Models", "authors": "Naoaki Okazaki, Kakeru Hattori, Hirai Shota, Hiroki Iida, Masanari Ohi, Kazuki Fujii, Taishi Nakamura, Mengsay Loem, Rio Yokota, Sakae Mizuki", "abstract": "Open Japanese large language models (LLMs) have been trained on the Japanese portions of corpora such as CC-100, mC4, and OSCAR. However, these corpora were not created for the quality of Japanese texts. This study builds a large Japanese web corpus by extracting and refining text from the Common Crawl archive (21 snapshots of approximately 63.4 billion pages crawled between 2020 and 2023). This corpus consists of approximately 312.1 billion characters (approximately 173 million pages), which is the largest of all available training corpora for Japanese LLMs, surpassing CC-100 (approximately 25.8 billion characters), mC4 (approximately 239.7 billion characters) and OSCAR 23.10 (approximately 74 billion characters). To confirm the quality of the corpus, we performed continual pre-training on Llama 2 7B, 13B, 70B, Mistral 7B v0.1, and Mixtral 8x7B Instruct as base LLMs and gained consistent (6.6-8.1 points) improvements on Japanese benchmark datasets. We also demonstrate that the improvement on Llama 2 13B brought from the presented corpus was the largest among those from other existing corpora.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.764213562011719, 0.04927940294146538], "openalex_id": "https://openalex.org/W4396530751", "title": "Towards automatic question generation using pre-trained model in academic field for Bahasa Indonesia", "authors": "Derwin Suhartono, Muhammad Rizki Nur Majiid, Renaldy Fredyan", "abstract": "Abstract Exam evaluations are essential to assessing students\u2019 knowledge and progress in a subject or course. To meet learning objectives and assess student performance, questions must be themed. Automatic Question Generation (AQG) is our novel approach to this problem. A comprehensive process for autonomously generating Bahasa Indonesia text questions is shown. This paper suggests using a decoder to generate text from deep learning models\u2019 tokens. The suggested technique pre-processes Vectorized Corpus, Token IDs, and Features Tensor. The tensors are embedded to increase each token, and attention is masked to separate padding tokens from context-containing tokens. An encoder processes the encoded tokens and attention masks to create a contextual understanding memory that the decoder uses to generate text. Our work uses the Sequence-to-Sequence Learning architecture of BiGRU, BiLSTM, Transformer, BERT, BART, and GPT. Implementing these models optimizes computational resources while extensively exploring the research issue. The model uses context sentences as input and question sentences as output, incorporating linguistic elements like response placement, POS tags, answer masking, and named entities (NE) to improve comprehension and linguistic ability. Our approach includes two innovative models: IndoBERTFormer, which combines a BERT encoder with a Transformer decoder, and IndoBARTFormer, which decodes vectors like BERT. IndoTransGPT uses the Transformer as an encoder to improve understanding, extending the GPT model\u2019s adaptability.", "venue": "Education and Information Technologies", "label": 0}, {"loc": [7.166916847229004, 1.1937657594680786], "openalex_id": "https://openalex.org/W4396819043", "title": "Continual Pre-Training for Cross-Lingual LLM Adaptation: Enhancing Japanese Language Capabilities", "authors": "Kazuki Fujii, Taishi Nakamura, Mengsay Loem, Hiroki Iida, Masanari Ohi, Kakeru Hattori, Hirai Shota, Sakae Mizuki, Rio Yokota, Naoaki Okazaki", "abstract": "Cross-lingual continual pre-training of large language models (LLMs) initially trained on English corpus allows us to leverage the vast amount of English language resources and reduce the pre-training cost. In this study, we constructed Swallow, an LLM with enhanced Japanese capability, by extending the vocabulary of Llama 2 to include Japanese characters and conducting continual pre-training on a large Japanese web corpus. Experimental results confirmed that the performance on Japanese tasks drastically improved through continual pre-training, and the performance monotonically increased with the amount of training data up to 100B tokens. Consequently, Swallow achieved superior performance compared to other LLMs that were trained from scratch in English and Japanese. An analysis of the effects of continual pre-training revealed that it was particularly effective for Japanese question answering tasks. Furthermore, to elucidate effective methodologies for cross-lingual continual pre-training from English to Japanese, we investigated the impact of vocabulary expansion and the effectiveness of incorporating parallel corpora. The results showed that the efficiency gained through vocabulary expansion had no negative impact on performance, except for the summarization task, and that the combined use of parallel corpora enhanced translation ability.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.596325397491455, -1.049513816833496], "openalex_id": "https://openalex.org/W4396495439", "title": "Fine grain emotion analysis in Spanish using linguistic features and transformers", "authors": "Alejandro Salmer\u00f3n-R\u00edos, Jos\u00e9 Antonio Garc\u00ed\u00ada-D\u00ed\u00adaz, Ronghao Pan, Rafael Valencia\u2010Garc\u00eda", "abstract": "Mental health issues are a global concern, with a particular focus on the rise of depression. Depression affects millions of people worldwide and is a leading cause of suicide, particularly among young people. Recent surveys indicate an increase in cases of depression during the COVID-19 pandemic, which affected approximately 5.4% of the population in Spain in 2020. Social media platforms such as X (formerly Twitter) have become important hubs for health information as more people turn to these platforms to share their struggles and seek emotional support. Researchers have discovered a link between emotions and mental illnesses such as depression. This correlation provides a valuable opportunity for automated analysis of social media data to detect changes in mental health status that might otherwise go unnoticed, thus preventing more serious health consequences. Therefore, this research explores the field of emotion analysis in Spanish towards mental disorders. There are two contributions in this area. On the one hand, the compilation, translation, evaluation and correction of a novel dataset composed of a mixture of other existing datasets in the bibliography. This dataset compares a total of 16 emotions, with an emphasis on negative emotions. On the other hand, the in-depth evaluation of this novel dataset with several state-of-the-art transformers based on encoder-only and encoder-decoder architectures. The analysis compromises monolingual, multilingual and distilled models as well as feature integration techniques. The best results are obtained with the encoder-only MarIA model, with a macro-average F1 score of 60.4771%.", "venue": "PeerJ Computer Science", "label": 4}, {"loc": [3.125821828842163, 2.544665813446045], "openalex_id": "https://openalex.org/W4396506709", "title": "Near to Mid-term Risks and Opportunities of Open Source Generative AI", "authors": "Francisco Eiras, Aleksandar Petrov, Bertie Vidgen, Christian Schroeder de Witt, Fabio Pizzati, Katherine Elkins, Supratik Mukhopadhyay, Adel Bibi, Botos Csaba, Fabro Steibel, Fazl Barez, G. C. Moore Smith, Gianluca Guadagni, Jon Chun, Jordi Cabot, Joseph Marvin Imperial, Juan A. Nolazco\u2010Flores, Lori Landay, Matthew G. Jackson, Paul R\u00f6ttger, Philip H. S. Torr, Trevor Darrell, Yong Suk Lee, Jakob Foerster", "abstract": "In the next few years, applications of Generative AI are expected to revolutionize a number of different areas, ranging from science & medicine to education. The potential for these seismic changes has triggered a lively debate about potential risks and resulted in calls for tighter regulation, in particular from some of the major tech companies who are leading in AI development. This regulation is likely to put at risk the budding field of open-source Generative AI. We argue for the responsible open sourcing of generative AI models in the near and medium term. To set the stage, we first introduce an AI openness taxonomy system and apply it to 40 current large language models. We then outline differential benefits and risks of open versus closed source AI and present potential risk mitigation, ranging from best practices to calls for technical and scientific contributions. We hope that this report will add a much needed missing voice to the current public discourse on near to mid-term AI safety and other societal impact.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.644329071044922, 4.510151386260986], "openalex_id": "https://openalex.org/W4395687088", "title": "Understanding Privacy Risks of Embeddings Induced by Large Language Models", "authors": "Zhihao Zhu, Ninglu Shao, Defu Lian, Chenwang Wu, Zheng Liu, Yi Yang, Enhong Chen", "abstract": "Large language models (LLMs) show early signs of artificial general intelligence but struggle with hallucinations. One promising solution to mitigate these hallucinations is to store external knowledge as embeddings, aiding LLMs in retrieval-augmented generation. However, such a solution risks compromising privacy, as recent studies experimentally showed that the original text can be partially reconstructed from text embeddings by pre-trained language models. The significant advantage of LLMs over traditional pre-trained models may exacerbate these concerns. To this end, we investigate the effectiveness of reconstructing original knowledge and predicting entity attributes from these embeddings when LLMs are employed. Empirical findings indicate that LLMs significantly improve the accuracy of two evaluated tasks over those from pre-trained models, regardless of whether the texts are in-distribution or out-of-distribution. This underscores a heightened potential for LLMs to jeopardize user privacy, highlighting the negative consequences of their widespread use. We further discuss preliminary strategies to mitigate this risk.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.817716598510742, 1.129366159439087], "openalex_id": "https://openalex.org/W4395687247", "title": "Tele-FLM Technical Report", "authors": "Xiang Li, Yiqun Yao, Xin Jiang, Xuezhi Fang, Chao Wang, Xinzhang Liu, Zihan Wang, Yu Zhao, Xin Wang, Yu\u2010Yao Huang, Shuangyong Song, Yongxiang Li, Zheng Zhang, Bo Zhao, Aixin Sun, Yequan Wang, Zhongjiang He, Zhongyuan Wang, Xuelong Li, Tie\u2010Jun Huang", "abstract": "Large language models (LLMs) have showcased profound capabilities in language understanding and generation, facilitating a wide array of applications. However, there is a notable paucity of detailed, open-sourced methodologies on efficiently scaling LLMs beyond 50 billion parameters with minimum trial-and-error cost and computational resources. In this report, we introduce Tele-FLM (aka FLM-2), a 52B open-sourced multilingual large language model that features a stable, efficient pre-training paradigm and enhanced factual judgment capabilities. Tele-FLM demonstrates superior multilingual language modeling abilities, measured by BPB on textual corpus. Besides, in both English and Chinese foundation model evaluation, it is comparable to strong open-sourced models that involve larger pre-training FLOPs, such as Llama2-70B and DeepSeek-67B. In addition to the model weights, we share the core designs, engineering practices, and training details, which we expect to benefit both the academic and industrial communities.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.2215495109558105, 5.506141662597656], "openalex_id": "https://openalex.org/W4395687128", "title": "Zero-Shot Distillation for Image Encoders: How to Make Effective Use of Synthetic Data", "authors": "Niclas Popp, Jan Hendrik Metzen, Matthias Hein", "abstract": "Multi-modal foundation models such as CLIP have showcased impressive zero-shot capabilities. However, their applicability in resource-constrained environments is limited due to their large number of parameters and high inference time. While existing approaches have scaled down the entire CLIP architecture, we focus on training smaller variants of the image encoder, which suffices for efficient zero-shot classification. The use of synthetic data has shown promise in distilling representations from larger teachers, resulting in strong few-shot and linear probe performance. However, we find that this approach surprisingly fails in true zero-shot settings when using contrastive losses. We identify the exploitation of spurious features as being responsible for poor generalization between synthetic and real data. However, by using the image feature-based L2 distillation loss, we mitigate these problems and train students that achieve zero-shot performance which on four domain-specific datasets is on-par with a ViT-B/32 teacher model trained on DataCompXL, while featuring up to 92% fewer parameters.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.62885856628418, 1.5836353302001953], "openalex_id": "https://openalex.org/W4395483230", "title": "Nyonic Technical Report", "authors": "Junfeng Tian, Rui Wang, Cong Li, Yudong Zhou, Jun Liu, Jun Wang", "abstract": "This report details the development and key achievements of our latest language model designed for custom large language models. The advancements introduced include a novel Online Data Scheduler that supports flexible training data adjustments and curriculum learning. The model's architecture is fortified with state-of-the-art techniques such as Rotary Positional Embeddings, QK-LayerNorm, and a specially crafted multilingual tokenizer to enhance stability and performance. Moreover, our robust training framework incorporates advanced monitoring and rapid recovery features to ensure optimal efficiency. Our Wonton 7B model has demonstrated competitive performance on a range of multilingual and English benchmarks. Future developments will prioritize narrowing the performance gap with more extensively trained models, thereby enhancing the model's real-world efficacy and adaptability.GitHub: \\url{https://github.com/nyonicai/nyonic-public}", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.663118839263916, -1.5002720355987549], "openalex_id": "https://openalex.org/W4395482179", "title": "Causality-driven multivariate stock movement forecasting", "authors": "Abel D\u00edaz Berenguer, Yifei Da, Mat\u00edas Nicol\u00e1s Bossa, Meshia C\u00e9dric Oveneke, Hichem Sahli", "abstract": "Our study aims to investigate the interdependence between international stock markets and sentiments from financial news in stock forecasting. We adopt the Temporal Fusion Transformers (TFT) to incorporate intra and inter-market correlations and the interaction between the information flow, i.e. causality, of financial news sentiment and the dynamics of the stock market. The current study distinguishes itself from existing research by adopting Dynamic Transfer Entropy (DTE) to establish an accurate information flow propagation between stock and sentiments. DTE has the advantage of providing time series that mine information flow propagation paths between certain parts of the time series, highlighting marginal events such as spikes or sudden jumps, which are crucial in financial time series. The proposed methodological approach involves the following elements: a FinBERT-based textual analysis of financial news articles to extract sentiment time series, the use of the Transfer Entropy and corresponding heat maps to analyze the net information flows, the calculation of the DTE time series, which are considered as co-occurring covariates of stock Price, and TFT-based stock forecasting. The Dow Jones Industrial Average index of 13 countries, along with daily financial news data obtained through the New York Times API, are used to demonstrate the validity and superiority of the proposed DTE-based causality method along with TFT for accurate stock Price and Return forecasting compared to state-of-the-art time series forecasting methods.", "venue": "PLoS ONE", "label": 11}, {"loc": [7.765478610992432, 3.9838287830352783], "openalex_id": "https://openalex.org/W4395443685", "title": "Multi-Head Mixture-of-Experts", "authors": "Xun Wu, Shaohan Huang, Wenhui Wang, Furu Wei", "abstract": "Sparse Mixtures of Experts (SMoE) scales model capacity without significant increases in training and inference costs, but exhibits the following two issues: (1) Low expert activation, where only a small subset of experts are activated for optimization. (2) Lacking fine-grained analytical capabilities for multiple semantic concepts within individual tokens. We propose Multi-Head Mixture-of-Experts (MH-MoE), which employs a multi-head mechanism to split each token into multiple sub-tokens. These sub-tokens are then assigned to and processed by a diverse set of experts in parallel, and seamlessly reintegrated into the original token form. The multi-head mechanism enables the model to collectively attend to information from various representation spaces within different experts, while significantly enhances expert activation, thus deepens context understanding and alleviate overfitting. Moreover, our MH-MoE is straightforward to implement and decouples from other SMoE optimization methods, making it easy to integrate with other SMoE models for enhanced performance. Extensive experimental results across three tasks: English-focused language modeling, Multi-lingual language modeling and Masked multi-modality modeling tasks, demonstrate the effectiveness of MH-MoE.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.856122016906738, 0.6284127831459045], "openalex_id": "https://openalex.org/W4395115984", "title": "Research and Application of Large Model-Based Intelligent Customer Service System", "authors": "Yuan Xi", "abstract": "With the rapid development of artificial intelligence technology, intelligent customer service systems have been widely used. This paper addresses the limitations of traditional intelligent customer service systems, such as limited language understanding ability, narrow knowledge coverage, and insufficient personalized service. It proposes an intelligent customer service system design scheme based on the RAG model. The scheme leverages the powerful language understanding and generation capabilities of large models, combined with dialogue management and knowledge base retrieval enhancement techniques, to build an efficient and intelligent customer service system. This paper introduces the overall architecture of the system, the design and implementation of each module, and comprehensively evaluates the system through experiments. The experimental results show that the system can provide accurate and fluent customer service, significantly improving customer satisfaction. The research in this paper provides new ideas and references for the development of intelligent customer service systems.", "venue": "International Journal of Emerging Technologies and Advanced Applications", "label": 0}, {"loc": [3.2546401023864746, -0.17675219476222992], "openalex_id": "https://openalex.org/W4395067526", "title": "Large language models for biomedicine: foundations, opportunities, challenges, and best practices", "authors": "Satya S. Sahoo, Joseph M. Plasek, Hua Xu, \u00d6zlem Uzuner, Trevor Cohen, Meliha Yeti\u015fgen, Hongfang Liu, St\u00e9phane M. Meystre, Yanshan Wang", "abstract": "Abstract Objectives Generative large language models (LLMs) are a subset of transformers-based neural network architecture models. LLMs have successfully leveraged a combination of an increased number of parameters, improvements in computational efficiency, and large pre-training datasets to perform a wide spectrum of natural language processing (NLP) tasks. Using a few examples (few-shot) or no examples (zero-shot) for prompt-tuning has enabled LLMs to achieve state-of-the-art performance in a broad range of NLP applications. This article by the American Medical Informatics Association (AMIA) NLP Working Group characterizes the opportunities, challenges, and best practices for our community to leverage and advance the integration of LLMs in downstream NLP applications effectively. This can be accomplished through a variety of approaches, including augmented prompting, instruction prompt tuning, and reinforcement learning from human feedback (RLHF). Target Audience Our focus is on making LLMs accessible to the broader biomedical informatics community, including clinicians and researchers who may be unfamiliar with NLP. Additionally, NLP practitioners may gain insight from the described best practices. Scope We focus on 3 broad categories of NLP tasks, namely natural language understanding, natural language inferencing, and natural language generation. We review the emerging trends in prompt tuning, instruction fine-tuning, and evaluation metrics used for LLMs while drawing attention to several issues that impact biomedical NLP applications, including falsehoods in generated text (confabulation/hallucinations), toxicity, and dataset contamination leading to overfitting. We also review potential approaches to address some of these current challenges in LLMs, such as chain of thought prompting, and the phenomena of emergent capabilities observed in LLMs that can be leveraged to address complex NLP challenge in biomedical applications.", "venue": "Journal of the American Medical Informatics Association", "label": 0}, {"loc": [4.387025356292725, -0.9765644669532776], "openalex_id": "https://openalex.org/W4395064608", "title": "Multi Class Depression Detection Through Tweets using Artificial Intelligence", "authors": "Muhammad Osama Nusrat, Waseem Shahzad, Saad Ahmed Jamal", "abstract": "Depression is a significant issue nowadays. As per the World Health Organization (WHO), in 2023, over 280 million individuals are grappling with depression. This is a huge number; if not taken seriously, these numbers will increase rapidly. About 4.89 billion individuals are social media users. People express their feelings and emotions on platforms like Twitter, Facebook, Reddit, Instagram, etc. These platforms contain valuable information which can be used for research purposes. Considerable research has been conducted across various social media platforms. However, certain limitations persist in these endeavors. Particularly, previous studies were only focused on detecting depression and the intensity of depression in tweets. Also, there existed inaccuracies in dataset labeling. In this research work, five types of depression (Bipolar, major, psychotic, atypical, and postpartum) were predicted using tweets from the Twitter database based on lexicon labeling. Explainable AI was used to provide reasoning by highlighting the parts of tweets that represent type of depression. Bidirectional Encoder Representations from Transformers (BERT) was used for feature extraction and training. Machine learning and deep learning methodologies were used to train the model. The BERT model presented the most promising results, achieving an overall accuracy of 0.96.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.866570472717285, 0.21352113783359528], "openalex_id": "https://openalex.org/W4395064688", "title": "SuRe: Summarizing Retrievals using Answer Candidates for Open-domain QA of LLMs", "authors": "Jaehyung Kim, Jaehyun Nam, Sangwoo Mo, Jongjin Park, Sang\u2010Woo Lee, Minjoon Seo, Jung-Woo Ha, Jinwoo Shin", "abstract": "Large language models (LLMs) have made significant advancements in various natural language processing tasks, including question answering (QA) tasks. While incorporating new information with the retrieval of relevant passages is a promising way to improve QA with LLMs, the existing methods often require additional fine-tuning which becomes infeasible with recent LLMs. Augmenting retrieved passages via prompting has the potential to address this limitation, but this direction has been limitedly explored. To this end, we design a simple yet effective framework to enhance open-domain QA (ODQA) with LLMs, based on the summarized retrieval (SuRe). SuRe helps LLMs predict more accurate answers for a given question, which are well-supported by the summarized retrieval that could be viewed as an explicit rationale extracted from the retrieved passages. Specifically, SuRe first constructs summaries of the retrieved passages for each of the multiple answer candidates. Then, SuRe confirms the most plausible answer from the candidate set by evaluating the validity and ranking of the generated summaries. Experimental results on diverse ODQA benchmarks demonstrate the superiority of SuRe, with improvements of up to 4.6% in exact match (EM) and 4.0% in F1 score over standard prompting approaches. SuRe also can be integrated with a broad range of retrieval methods and LLMs. Finally, the generated summaries from SuRe show additional advantages to measure the importance of retrieved passages and serve as more preferred rationales by models and humans.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.210511684417725, 0.95723557472229], "openalex_id": "https://openalex.org/W4395443953", "title": "Musical Word Embedding for Music Tagging and Retrieval", "authors": "SeungHeon Doh, Jongpil Lee, Dasaem Jeong, Juhan Nam", "abstract": "Word embedding has become an essential means for text-based information retrieval. Typically, word embeddings are learned from large quantities of general and unstructured text data. However, in the domain of music, the word embedding may have difficulty understanding musical contexts or recognizing music-related entities like artists and tracks. To address this issue, we propose a new approach called Musical Word Embedding (MWE), which involves learning from various types of texts, including both everyday and music-related vocabulary. We integrate MWE into an audio-word joint representation framework for tagging and retrieving music, using words like tag, artist, and track that have different levels of musical specificity. Our experiments show that using a more specific musical word like track results in better retrieval performance, while using a less specific term like tag leads to better tagging performance. To balance this compromise, we suggest multi-prototype training that uses words with different levels of musical specificity jointly. We evaluate both word embedding and audio-word joint embedding on four tasks (tag rank prediction, music tagging, query-by-tag, and query-by-track) across two datasets (Million Song Dataset and MTG-Jamendo). Our findings show that the suggested MWE is more efficient and robust than the conventional word embedding.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.602426052093506, 1.7094558477401733], "openalex_id": "https://openalex.org/W4395077771", "title": "Evaluating English-language morphological awareness assessments", "authors": "Carla L. Hudson Kam, Emily Sadlier-Brown, Shannon Clark, Chelsea Jang, Carrie Demmans Epp, Jenny Thomson", "abstract": "Many studies have shown that morphological knowledge has effects on reading comprehension separate from other aspects of language knowledge. This has implications for reading instruction and assessment: it suggests that children could have reading comprehension difficulties that are due to a lack of morphological knowledge, and thus, that explicit instruction of morphology might be helpful for them, indeed for all children. To find children who might especially benefit from specific instruction in morphology, we would need good tests of morphological knowledge. We evaluated a set of morphological awareness assessments to determine whether they conclusively tapped into morphological knowledge, and found that it was not possible to be certain that they were accurately targeting morphological knowledge.", "venue": "First Language", "label": 0}, {"loc": [6.39384126663208, 5.518030166625977], "openalex_id": "https://openalex.org/W4395022840", "title": "MoVA: Adapting Mixture of Vision Experts to Multimodal Context", "authors": "Zhuofan Zong, Bingqi Ma, D. Z. Shen, Guanglu Song, Hao Shao, Dongzhi Jiang, Hongsheng Li, Yu Liu", "abstract": "As the key component in multimodal large language models (MLLMs), the ability of the visual encoder greatly affects MLLM's understanding on diverse image content. Although some large-scale pretrained vision encoders such as vision encoders in CLIP and DINOv2 have brought promising performance, we found that there is still no single vision encoder that can dominate various image content understanding, e.g., the CLIP vision encoder leads to outstanding results on general image understanding but poor performance on document or chart content. To alleviate the bias of CLIP vision encoder, we first delve into the inherent behavior of different pre-trained vision encoders and then propose the MoVA, a powerful and novel MLLM, adaptively routing and fusing task-specific vision experts with a coarse-to-fine mechanism. In the coarse-grained stage, we design a context-aware expert routing strategy to dynamically select the most suitable vision experts according to the user instruction, input image, and expertise of vision experts. This benefits from the powerful model function understanding ability of the large language model (LLM). In the fine-grained stage, we elaborately conduct the mixture-of-vision-expert adapter (MoV-Adapter) to extract and fuse task-specific knowledge from various experts. This coarse-to-fine paradigm effectively leverages representations from experts based on multimodal context and model expertise, further enhancing the generalization ability. We conduct extensive experiments to evaluate the effectiveness of the proposed approach. Without any bells and whistles, MoVA can achieve significant performance gains over current state-of-the-art methods in a wide range of challenging multimodal benchmarks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.536807060241699, 1.220458745956421], "openalex_id": "https://openalex.org/W4395022636", "title": "Cross-cultural Inspiration Detection and Analysis in Real and LLM-generated Social Media Data", "authors": "Oana Ignat, Gayathri Ganesh Lakshmy, Rada Mihalcea", "abstract": "Inspiration is linked to various positive outcomes, such as increased creativity, productivity, and happiness. Although inspiration has great potential, there has been limited effort toward identifying content that is inspiring, as opposed to just engaging or positive. Additionally, most research has concentrated on Western data, with little attention paid to other cultures. This work is the first to study cross-cultural inspiration through machine learning methods. We aim to identify and analyze real and AI-generated cross-cultural inspiring posts. To this end, we compile and make publicly available the InspAIred dataset, which consists of 2,000 real inspiring posts, 2,000 real non-inspiring posts, and 2,000 generated inspiring posts evenly distributed across India and the UK. The real posts are sourced from Reddit, while the generated posts are created using the GPT-4 model. Using this dataset, we conduct extensive computational linguistic analyses to (1) compare inspiring content across cultures, (2) compare AI-generated inspiring posts to real inspiring posts, and (3) determine if detection models can accurately distinguish between inspiring content across cultures and data sources.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.146783828735352, -2.557802200317383], "openalex_id": "https://openalex.org/W4395022625", "title": "MAiDE-up: Multilingual Deception Detection of GPT-generated Hotel Reviews", "authors": "Oana Ignat, Xiaomeng Xu, Rada Mihalcea", "abstract": "Deceptive reviews are becoming increasingly common, especially given the increase in performance and the prevalence of LLMs. While work to date has addressed the development of models to differentiate between truthful and deceptive human reviews, much less is known about the distinction between real reviews and AI-authored fake reviews. Moreover, most of the research so far has focused primarily on English, with very little work dedicated to other languages. In this paper, we compile and make publicly available the MAiDE-up dataset, consisting of 10,000 real and 10,000 AI-generated fake hotel reviews, balanced across ten languages. Using this dataset, we conduct extensive linguistic analyses to (1) compare the AI fake hotel reviews to real hotel reviews, and (2) identify the factors that influence the deception detection model performance. We explore the effectiveness of several models for deception detection in hotel reviews across three main dimensions: sentiment, location, and language. We find that these dimensions influence how well we can detect AI-generated fake reviews.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.951028823852539, 4.95269775390625], "openalex_id": "https://openalex.org/W4394989737", "title": "Enabling action crossmodality for a pretrained large language model", "authors": "Anton Caesar, Ozan \u00d6zdemir, Cornelius Weber, Stefan Wermter", "abstract": "Natural language processing and vision tasks have seen large improvements recently through the rise of Transformer architectures. The high performing large language models (LLMs) benefit from large textual datasets that are numerously available online. However, action and bidirectional action-language tasks are less developed, as these require more specific and labelled data. Therefore, we aim at enabling these robotic action capabilities for a pretrained LLM, while maintaining high efficiency with regards to the required training time and data size. To achieve this, we split up a Transformer-based LLM and insert a multimodal architecture into it. Specifically, we split a pretrained T5 LLM between its encoder and decoder parts, to insert a crossmodal Transformer component of a Paired Transformed Autoencoders (PTAE) bidirectional action-language model. The experiments are conducted on a new dataset, consisting of unimodal language translation and crossmodal bidirectional action-language translation. The natural language capabilities of the original T5 are reestablished efficiently by training the crossmodal Transformer, which requires only one 5.7 millionth of the T5 model's original training data. Furthermore, the new model, called CrossT5, achieves high accuracy for the vision and language guided robotic action tasks. By design, the CrossT5 agent acts robustly when tested with language commands not included in the dataset. The results demonstrate that this novel approach is successful in combining the advanced linguistic capabilities of LLMs with the low-level robotic control skills of vision-action models. The code is available at this URL: https://github.com/samsoneko/CrossT5.", "venue": "Natural Language Processing Journal", "label": 9}, {"loc": [4.45796537399292, 2.539389133453369], "openalex_id": "https://openalex.org/W4394973077", "title": "Investigating Gender Bias in Turkish Language Models", "authors": "Orhun Caglidil, Malte Ostendorff, Georg Rehm", "abstract": "Language models are trained mostly on Web data, which often contains social stereotypes and biases that the models can inherit. This has potentially negative consequences, as models can amplify these biases in downstream tasks or applications. However, prior research has primarily focused on the English language, especially in the context of gender bias. In particular, grammatically gender-neutral languages such as Turkish are underexplored despite representing different linguistic properties to language models with possibly different effects on biases. In this paper, we fill this research gap and investigate the significance of gender bias in Turkish language models. We build upon existing bias evaluation frameworks and extend them to the Turkish language by translating existing English tests and creating new ones designed to measure gender bias in the context of T\u00fcrkiye. Specifically, we also evaluate Turkish language models for their embedded ethnic bias toward Kurdish people. Based on the experimental results, we attribute possible biases to different model characteristics such as the model size, their multilingualism, and the training corpora. We make the Turkish gender bias dataset publicly available.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.734615802764893, 1.6647992134094238], "openalex_id": "https://openalex.org/W4394952340", "title": "Navigating Challenges and Technical Debt in Large Language Models Deployment", "authors": "Ahmed Menshawy, Zeeshan Nawaz, M.N.I. Fahmy", "abstract": "Large Language Models (LLMs) have become an essential tool in advancing artificial intelligence and machine learning, enabling outstanding capabilities in natural language processing, and understanding. However, the efficient deployment of LLMs in production environments reveals a complex landscape of challenges and technical debt.", "venue": "https://doi.org/10.1145/3642970.3655840", "label": 0}, {"loc": [6.781618595123291, 1.1504638195037842], "openalex_id": "https://openalex.org/W4394972903", "title": "ViLLM-Eval: A Comprehensive Evaluation Suite for Vietnamese Large Language Models", "authors": "Trong-Hieu Nguyen, Anh-Cuong Le, Viet-Cuong Nguyen", "abstract": "The rapid advancement of large language models (LLMs) necessitates the development of new benchmarks to accurately assess their capabilities. To address this need for Vietnamese, this work aims to introduce ViLLM-Eval, the comprehensive evaluation suite designed to measure the advanced knowledge and reasoning abilities of foundation models within a Vietnamese context. ViLLM-Eval consists of multiple-choice questions and predict next word tasks spanning various difficulty levels and diverse disciplines, ranging from humanities to science and engineering. A thorough evaluation of the most advanced LLMs on ViLLM-Eval revealed that even the best performing models have significant room for improvement in understanding and responding to Vietnamese language tasks. ViLLM-Eval is believed to be instrumental in identifying key strengths and weaknesses of foundation models, ultimately promoting their development and enhancing their performance for Vietnamese users. This paper provides a thorough overview of ViLLM-Eval as part of the Vietnamese Large Language Model shared task, held within the 10th International Workshop on Vietnamese Language and Speech Processing (VLSP 2023).", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.548422813415527, -0.27123868465423584], "openalex_id": "https://openalex.org/W4394947542", "title": "TeClass: A Human-Annotated Relevance-based Headline Classification and Generation Dataset for Telugu", "authors": "Gopichand Kanumolu, Lokesh Madasu, Nirmal Surange, Manish Shrivastava", "abstract": "News headline generation is a crucial task in increasing productivity for both the readers and producers of news. This task can easily be aided by automated News headline-generation models. However, the presence of irrelevant headlines in scraped news articles results in sub-optimal performance of generation models. We propose that relevance-based headline classification can greatly aid the task of generating relevant headlines. Relevance-based headline classification involves categorizing news headlines based on their relevance to the corresponding news articles. While this task is well-established in English, it remains under-explored in low-resource languages like Telugu due to a lack of annotated data. To address this gap, we present TeClass, the first-ever human-annotated Telugu news headline classification dataset, containing 78,534 annotations across 26,178 article-headline pairs. We experiment with various baseline models and provide a comprehensive analysis of their results. We further demonstrate the impact of this work by fine-tuning various headline generation models using TeClass dataset. The headlines generated by the models fine-tuned on highly relevant article-headline pairs, showed about a 5 point increment in the ROUGE-L scores. To encourage future research, the annotated dataset as well as the annotation guidelines will be made publicly available.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.8991496562957764, 4.073581695556641], "openalex_id": "https://openalex.org/W4394948338", "title": "Sampling-based Pseudo-Likelihood for Membership Inference Attacks", "authors": "Masahiro Kaneko, Youmi Ma, Yuki Wata, Naoaki Okazaki", "abstract": "Large Language Models (LLMs) are trained on large-scale web data, which makes it difficult to grasp the contribution of each text. This poses the risk of leaking inappropriate data such as benchmarks, personal information, and copyrighted texts in the training data. Membership Inference Attacks (MIA), which determine whether a given text is included in the model's training data, have been attracting attention. Previous studies of MIAs revealed that likelihood-based classification is effective for detecting leaks in LLMs. However, the existing methods cannot be applied to some proprietary models like ChatGPT or Claude 3 because the likelihood is unavailable to the user. In this study, we propose a Sampling-based Pseudo-Likelihood (\\textbf{SPL}) method for MIA (\\textbf{SaMIA}) that calculates SPL using only the text generated by an LLM to detect leaks. The SaMIA treats the target text as the reference text and multiple outputs from the LLM as text samples, calculates the degree of $n$-gram match as SPL, and determines the membership of the text in the training data. Even without likelihoods, SaMIA performed on par with existing likelihood-based methods.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.65195894241333, 1.6359424591064453], "openalex_id": "https://openalex.org/W4394931690", "title": "The Practical Epistemologies of Design and Artificial Intelligence", "authors": "William Billingsley", "abstract": "Abstract This article explores the epistemological trade-offs that practical and technology design fields make by exploring past philosophical discussions of design, practitioner research, and pragmatism. It argues that as technologists apply Artificial Intelligence (AI) and machine learning (ML) to more domains, the technology brings this same set of epistemological trade-offs with it. The basis of the technology becomes the basis of what it finds. There are correlations between questions that designers face in sampling and gathering data that is rich with context, and those that large-scale machine learning faces in how it approaches the rich context and subjectivity within its training data. AI, however, processes enormous amounts of data and produces models that can be explored. This makes its form of pragmatic inquiry that is amenable to optimisation. Finally, the paper explores implications for education that stem from how we apply AI to pedagogy and explanation, suggesting that the availability of AI-generated explanations and materials may also push pedagogy in directions of pragmatism: the evidence that explanations are effective may precede explorations of why they should be.", "venue": "Science & Education", "label": 0}, {"loc": [8.831182479858398, 2.401616334915161], "openalex_id": "https://openalex.org/W4394948080", "title": "Pack of LLMs: Model Fusion at Test-Time via Perplexity Optimization", "authors": "Costas Mavromatis, Petros Karypis, George Karypis", "abstract": "Fusing knowledge from multiple Large Language Models (LLMs) can combine their diverse strengths to achieve improved performance on a given task. However, current fusion approaches either rely on learning-based fusers that do not generalize to new LLMs, or do not take into account how well each LLM understands the input. In this work, we study LLM fusion at test-time, which enables leveraging knowledge from arbitrary user-specified LLMs during inference. We introduce Pack of LLMs (PackLLM), an effective method for test-time fusion that leverages each LLM's expertise, given an input prompt. PackLLM performs model fusion by solving an optimization problem for determining each LLM's importance, so that perplexity over the input prompt is minimized. First, our simple PackLLM-sim variant validates that perplexity is a good indicator for measuring each LLM's expertise. Second, our PackLLM-opt variant approximately solves the perplexity minimization problem via a greedy algorithm. The derived importance weights are used to combine the LLMs during inference. We conduct experiments with over 100 total LLMs on a diverse set of tasks. Experimental results show that (i) perplexity is a reliable measure for LLM fusion, (ii) PackLLM outperforms test-time fusion baselines by 1.89% accuracy points, and (iii) PackLLM can leverage new LLMs to improve performance over learning-based fusion approaches by 3.92-11.94% accuracy points.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.491428375244141, 2.104862928390503], "openalex_id": "https://openalex.org/W4394947057", "title": "Fine Tuning LLM for Enterprise: Practical Guidelines and Recommendations", "authors": "Mathav Raj J, Kushala VM, Harikrishna Warrier, Yogesh Kumar Gupta", "abstract": "There is a compelling necessity from enterprises for fine tuning LLMs (Large Language Models) o get them trained on proprietary domain knowledge. The challenge is to imbibe the LLMs with domain specific knowledge using the most optimial resource and cost and in the best possible time. Many enterprises rely on RAG (Retrieval Augmented Generation) which does not need LLMs to be ine-tuned but they are limited by the quality of vector databases and their retrieval capabilities rather than the intrinsic capabilities of the LLMs themselves. In our current work we focus on fine tuning LLaMA, an open source LLM using proprietary documents and code from an enterprise repository and use the fine tuned models to evaluate the quality of responses. As part of this work, we aim to guide beginners on how to start with fine tuning an LLM for documentation and code by making educated guesses on size of GPU required and options that are available for formatting the data. We also propose pre processing recipes for both documentation and code to prepare dataset in different formats. The proposed methods of data preparation for document datasets are forming paragraph chunks, forming question and answer pairs and forming keyword and paragraph chunk pairs. For code dataset we propose forming summary and function pairs. Further, we qualitatively evaluate the results of the models for domain specific queries. Finally, we also propose practical guidelines and recommendations for fine tuning LLMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.94507360458374, 0.0459136925637722], "openalex_id": "https://openalex.org/W4394880281", "title": "Harmony in the Australian Domain Space", "authors": "Xian Gong, Paul X. McCarthy, Marian-Andrei Rizoiu, Paolo Boldi", "abstract": "In this paper we use for the first time a systematic approach in the study of harmonic centrality at a Web domain level, and gather a number of significant new findings about the Australian web. In particular, we explore the relationship between economic diversity at the firm level and the structure of the Web within the Australian domain space, using harmonic centrality as the main structural feature. The distribution of harmonic centrality values is analyzed over time, and we find that the distributions exhibit a consistent pattern across the different years. The observed distribution is well captured by a partition of the domain space into six clusters; the temporal movement of domain names across these six positions yields insights into the Australian Domain Space and exhibits correlations with other non-structural characteristics. From a more global perspective, we find a significant correlation between the median harmonic centrality of all domains in each OECD country and one measure of global trust, the WJP Rule of Law Index. Further investigation demonstrates that 35 countries in OECD share similar harmonic centrality distributions. The observed homogeneity in distribution presents a compelling avenue for exploration, potentially unveiling critical corporate, regional, or national insights.", "venue": "https://doi.org/10.1145/3614419.3643998", "label": 0}, {"loc": [6.045962333679199, 0.4782627820968628], "openalex_id": "https://openalex.org/W4394907057", "title": "PRODIS-a speech database and a phoneme-based language model for the study of predictability effects in Polish", "authors": "Zofia Malisz, Jan Foremski, Ma\u0142gorzata Kul", "abstract": "We present a speech database and a phoneme-level language model of Polish. The database and model are designed for the analysis of prosodic and discourse factors and their impact on acoustic parameters in interaction with predictability effects. The database is also the first large, publicly available Polish speech corpus of excellent acoustic quality that can be used for phonetic analysis and training of multi-speaker speech technology systems. The speech in the database is processed in a pipeline that achieves a 90% degree of automation. It incorporates state-of-the-art, freely available tools enabling database expansion or adaptation to additional languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.248791217803955, -0.6715396642684937], "openalex_id": "https://openalex.org/W4396853620", "title": "Detecting Dementia from Transcribed Speech in Slovak using Pre-trained BERT Models", "authors": "J\u00e1n Sta\u0161, Daniel Hl\u00e1dek, Ale\u0161 Kopnick\u00fd", "abstract": "Dementia is a neurodegenerative disorder that affects the brain and causes a decrease in cognitive abilities that interfere with memory, thinking, and the ability to perform daily activities. Detecting dementia automatically is a difficult task and involves a detailed analysis of speech and language characteristics and an examination of the participant's mental state. The picture description task is one of the screening tools for dementia detection from transcribed speech, where participants are instructed to describe what they see in a picture. In this paper, we evaluate the performance of available monolingual and multilingual pre-trained BERT models fine-tuned on transcribed Slovak speech from the picture description task with the aim of automatic dementia detection. The results of the F1-score range from 69% to 93% in the binary classification task and are comparable with recently published studies in this area.", "venue": "https://doi.org/10.1109/radioelektronika61599.2024.10524067", "label": 0}, {"loc": [1.9948636293411255, 5.397328853607178], "openalex_id": "https://openalex.org/W4398186252", "title": "A Robust Approach to E-Banking Phishing Detection using Ensemble Methods and LSTM", "authors": "Naga Venkata Siva Reddy, A R Rohith Saai, T V Ramanujan, N Sumanth Reddy, R. Priyanka Pramila", "abstract": "Recently, the integration of Internet and banking technologies has shifted financial transactions primarily to online platforms. However, this transformation has exposed the financial sector to cyber threats, particularly phishing, resulting in substantial annual losses and widespread data breaches. Despite ongoing efforts to combat phishing, a comprehensive solution remains elusive, creating a vulnerability for cyber attackers to unlawfully access sensitive personal and financial data. In this research, a novel method for successfully recognizing phishing attempts is presented. This proposed methodology includes components such as URLs, HTML, and recurring themes in phishing efforts, which are thoroughly analyzed. These characteristics are consistently similar, resulting in a significant amount of data being collected and sent over the Internet. To preserve the integrity of online financial transactions and shield people from data breaches, a robust phishing attack detection system must be developed. This study applies machine learning techniques such as LSTM, Boosting, Random Forests, and Ensemble models for intelligent phishing detection, attaining a remarkable 97 percent accuracy. To capture complicated data relationships, minimize overfitting, and improve generalization performance, a voting classifier combines predictions and stacking and uses feedback from several models. By adopting this strategy, defenses against the constantly changing cyber threat landscape in the digital financial transaction environment are strengthened.", "venue": "https://doi.org/10.1109/icc-robins60238.2024.10533883", "label": 0}, {"loc": [4.470090389251709, 0.4776792526245117], "openalex_id": "https://openalex.org/W4394890034", "title": "Augmenting Knowledge-Based Conversational Search Systems With Large Language Models", "authors": "Lukas Mosser, Peder Aursand, K. S. Brakstad, Christian Nilsen Lehre, J. Myhre-Bakkevig", "abstract": "Abstract Hydrocarbon exploration and carbon capture and storage (CCS) evaluation are inherently multi-disciplinary tasks that require the integration of knowledge from multiple datatypes set in a historical and geological context. The diverse nature of subsurface data is often represented by a combination of direct and indirect measurements, interpretations and observations documented in multi-dimensional datasets as images, and written reports. The fidelity of these images and reports can have an enormous variety, and different qualities leading to a challenging situation where explorationists need to determine the value of a source of information while combining these sources across large spatiotemporal contexts. Modern search engines today can not only search through document text but also images. These capabilities have improved our ability to find well-known concepts based on short phrases, or keywords, combined with significant meta-data. While these types of search engines have certainly benefited practitioners, the challenge of combining information from multiple data-sources, data modalities and languages remains an open problem. With the advent of conversational large language model (LLM) systems such as ChatGPT (Achiam et al. 2023) that provide coherent textual information and are informed by their training data, have become a reality. While ChatGPT certainly has taken many industries and their disciplines by storm, the tool is not without its shortcomings. For industry applications, in many cases the information necessary to provide answers will be highly proprietary, not shared with third parties and not part of the training data of the popular LLMs. Furthermore, due to their probabilistic nature LLMs suffer from so-called hallucinations, where the model provides a confident answer based on the user provided input but is non-factual and often non-sensical. To answer a given user-query with factuality it is important to provide relevant information as context to the LLMs. Lewis et al. (2020) proposes combining two systems: An information retrieval system that provide relevant information to answer a given question or to solve a specific task, and a second system being an LLM that is supplemented with the retrieved information as context to answer the user's question. This pattern of so-called retrieval-augmented generation (RAG) has become highly popular in the last year due to the strong conversational capabilities of systems like ChatGPT, accessible developer APIs for interfacing with LLMs, open-source software to orchestrate RAG-systems, as well as the rapid development of open-source LLMs (Touvron et al. 2023). Moreover, since the RAG pattern does not require fine-tuning or re-training a language model, it remains one of the most accessible ways to tailor LLMs to proprietary knowledge bases.", "venue": "https://doi.org/10.2118/218439-ms", "label": 0}, {"loc": [7.598400115966797, 1.3454116582870483], "openalex_id": "https://openalex.org/W4394902789", "title": "Improved methodology for longitudinal Web analytics using Common Crawl", "authors": "Henry S. Thompson", "abstract": "Common Crawl is a multi-petabyte longitudinal dataset containing over 100 billion web pages which is widely used as a source of language data for sequence model training and in web science research. Each of its constituent archives is on the order of 75TB in size. Using it for research, particularly longitudinal studies, which necessarily involve multiple archives, is therefore very expensive in terms of compute time and storage space and/or web bandwidth. Two new methods for mitigating this problem are presented here, based on exploiting and extending the much smaller (<200 gigabytes (GB) compressed) index which is available for each archive. By adding Last-Modified timestamps to the index we enable longitudinal exploration using only a single archive. By comparing the distribution of index features for each of the 100 segments into which archive is divided with their distribution over the whole archive, we have identified the least and most representative segments for a number of recent archives. Using this allows the segment(s) that are most representative of an archive to be used as proxies for the whole. We illustrate this approach in an analysis of changes in URI length over time, leading to an unanticipated insight into the how the creation of Web pages has changed over time.", "venue": "https://doi.org/10.1145/3614419.3644018", "label": 0}, {"loc": [3.1626551151275635, 3.809821605682373], "openalex_id": "https://openalex.org/W4394867455", "title": "Misinformation Resilient Search Rankings with Webgraph-based Interventions", "authors": "Peter Carragher, Evan Williams, Kathleen M. Carley", "abstract": "The proliferation of unreliable news domains on the internet has had wide-reaching negative impacts on society. We introduce and evaluate interventions aimed at reducing traffic to unreliable news domains from search engines while maintaining traffic to reliable domains. We build these interventions on the principles of fairness (penalize sites for what is in their control), generality (label/fact-check agnostic), targeted (increase the cost of adversarial behavior), and scalability (works at webscale). We refine our methods on small-scale webdata as a testbed and then generalize the interventions to a large-scale webgraph containing 93.9M domains and 1.6B edges. We demonstrate that our methods penalize unreliable domains far more than reliable domains in both settings and we explore multiple avenues to mitigate unintended effects on both the small-scale and large-scale webgraph experiments. These results indicate the potential of our approach to reduce the spread of misinformation and foster a more reliable online information ecosystem. This research contributes to the development of targeted strategies to enhance the trustworthiness and quality of search engine results, ultimately benefiting users and the broader digital community.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.777608394622803, 2.3821449279785156], "openalex_id": "https://openalex.org/W4394868329", "title": "Compression Represents Intelligence Linearly", "authors": "Yuzhen Huang, Jinghan Zhang, Zifei Shan, Junxian He", "abstract": "There is a belief that learning to compress well will lead to intelligence. Recently, language modeling has been shown to be equivalent to compression, which offers a compelling rationale for the success of large language models (LLMs): the development of more advanced language models is essentially enhancing compression which facilitates intelligence. Despite such appealing discussions, little empirical evidence is present for the interplay between compression and intelligence. In this work, we examine their relationship in the context of LLMs, treating LLMs as data compressors. Given the abstract concept of \"intelligence\", we adopt the average downstream benchmark scores as a surrogate, specifically targeting intelligence related to knowledge and commonsense, coding, and mathematical reasoning. Across 12 benchmarks, our study brings together 31 public LLMs that originate from diverse organizations. Remarkably, we find that LLMs' intelligence -- reflected by average benchmark scores -- almost linearly correlates with their ability to compress external text corpora. These results provide concrete evidence supporting the belief that superior compression indicates greater intelligence. Furthermore, our findings suggest that compression efficiency, as an unsupervised metric derived from raw text corpora, serves as a reliable evaluation measure that is linearly associated with the model capabilities. We open-source our compression datasets as well as our data collection pipelines to facilitate future researchers to assess compression properly.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.167643070220947, 0.5888940095901489], "openalex_id": "https://openalex.org/W4394867839", "title": "Compass: Large Multilingual Language Model for South-east Asia", "authors": "Sophia Maria", "abstract": "Large language models have exhibited significant proficiency in languages endowed with extensive linguistic resources, such as English and Chinese. Nevertheless, their effectiveness notably diminishes when applied to languages characterized by limited linguistic resources, particularly within the Southeast Asian linguistic landscape, such as Indonesian. The scarcity of linguistic resources for these languages presents challenges associated with inadequate training, restricted vocabulary coverage, and challenging evaluation processes. In response to these exigencies, we have introduced CompassLLM, a large multilingual model specifically tailored for Southeast Asian languages, with the primary aim of supporting the developmental requirements of Shopee. Our methodology encompasses several key strategies. To progressively enhance multilingual proficiencies, we implemented a multi-stage pre-training strategy integrated with curriculum learning, gradually intensifying the focus on low-resource languages. Concurrently, to better accommodate low-resource human instructions, we curated and generated a repository of high-quality multilingual human instructions, culminating the CompassLLM-SFT model through supervised instruction fine-tuning. Finally, to reinforce the model's alignment with human preference behaviors, we have embraced the principle of Direct Preference Optimization (DPO) to obtain CompassLLM-DPO model. Preliminary evaluation of the CompassLLM model yields promising results, with our model surpassing benchmark models like Vicuna-7b-v1.5, Sealion, Falcon and SeaLLM, across diverse evaluation tasks, as verified through both automated and human-driven assessments. Notably, our model exhibits its superior performance in South-east Asia languages, such as Indonesian language.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.895095109939575, -1.297192096710205], "openalex_id": "https://openalex.org/W4394843265", "title": "Impact of COVID-19 Pandemic on Social Determinants of Health Issues of Marginalized Black and Asian Communities: A Social Media Analysis Empowered by \u2026", "authors": "Christopher Whitfield, Yang Liu, Mohd Anwar", "abstract": "Abstract Purpose This study aims to understand the impact of the COVID-19 pandemic on social determinants of health (SDOH) of marginalized racial/ethnic US population groups, specifically African Americans and Asians, by leveraging natural language processing (NLP) and machine learning (ML) techniques on race-related spatiotemporal social media text data. Specifically, this study establishes the extent to which Latent Dirichlet Allocation (LDA) and Gibbs Sampling Dirichlet Multinomial Mixture (GSDMM)-based topic modeling determines social determinants of health (SDOH) categories, and how adequately custom named-entity recognition (NER) detects key SDOH factors from a race/ethnicity-related Reddit data corpus. Methods In this study, we collected race/ethnicity-specific data from 5 location subreddits including New York City, NY; Los Angeles, CA; Chicago, IL; Philadelphia, PA; and Houston, TX from March to December 2019 (before COVID-19 pandemic) and from March to December 2020 (during COVID-19 pandemic). Next, we applied methods from natural language processing and machine learning to analyze SDOH issues from extracted Reddit comments and conversation threads using feature engineering, topic modeling, and custom named-entity recognition (NER). Results Topic modeling identified 35 SDOH-related topics. The SDOH-based custom NER analyses revealed that the COVID-19 pandemic significantly impacted SDOH issues of marginalized Black and Asian communities. On average, the Social and Community Context (SCC) category of SDOH had the highest percent increase (366%) from the pre-pandemic period to the pandemic period across all locations and population groups. Some of the detected SCC issues were racism, protests, arrests, immigration, police brutality, hate crime, white supremacy, and discrimination. Conclusion Reddit social media platform can be an alternative source to assess the SDOH issues of marginalized Black and Asian communities during the COVID-19 pandemic. By employing NLP/ML techniques such as LDA/GSDMM-based topic modeling and custom NER on a race/ethnicity-specific Reddit corpus, we uncovered various SDOH issues affecting marginalized Black and Asian communities that were significantly worsened during the COVID-19 pandemic. As a result of conducting this research, we recommend that researchers, healthcare providers, and governments utilize social media and collaboratively formulate responses and policies that will address SDOH issues during public health crises.", "venue": "Journal of Racial and Ethnic Health Disparities", "label": 0}, {"loc": [3.087723970413208, -0.3739544749259949], "openalex_id": "https://openalex.org/W4394877296", "title": "Recent Advances in Large Language Models for Healthcare", "authors": "Khalid Nassiri, Moulay A. Akhloufi", "abstract": "Recent advances in the field of large language models (LLMs) underline their high potential for applications in a variety of sectors. Their use in healthcare, in particular, holds out promising prospects for improving medical practices. As we highlight in this paper, LLMs have demonstrated remarkable capabilities in language understanding and generation that could indeed be put to good use in the medical field. We also present the main architectures of these models, such as GPT, Bloom, or LLaMA, composed of billions of parameters. We then examine recent trends in the medical datasets used to train these models. We classify them according to different criteria, such as size, source, or subject (patient records, scientific articles, etc.). We mention that LLMs could help improve patient care, accelerate medical research, and optimize the efficiency of healthcare systems such as assisted diagnosis. We also highlight several technical and ethical issues that need to be resolved before LLMs can be used extensively in the medical field. Consequently, we propose a discussion of the capabilities offered by new generations of linguistic models and their limitations when deployed in a domain such as healthcare.", "venue": "BioMedInformatics", "label": 0}, {"loc": [5.6724090576171875, -0.7275249361991882], "openalex_id": "https://openalex.org/W4399729437", "title": "Comparison of Common Crawl News & GDELT", "authors": "Ameir El Ouadi, David M. Beskow", "abstract": "The corpus of worldwide news is important for natural language processing, knowledge graphs, large language models, and other technical efforts. Additionally, this corpus is important for understanding the people, places, organizations, and events that interact in real-time every day. This paper compares two news datasets used for these tasks today, namely the Global Database of Events, Language, and Tone (GDELT) and Common Crawl News. Our research highlights the strengths and limitations of each dataset, analyzing their content and coverage. Notably, while GDELT relies on broadcasts, prints, and web news from across the globe, Common Crawl focuses on news sites from around the world gathered through web crawling. Our analysis revealed considerable differences in where the two datasets gather their news sources.", "venue": "https://doi.org/10.1109/syscon61195.2024.10553540", "label": 0}, {"loc": [7.616093635559082, -1.020843744277954], "openalex_id": "https://openalex.org/W4394838764", "title": "Extending Translate-Train for ColBERT-X to African Language CLIR", "authors": "Eugene Yang, Dawn Lawrie, Paul McNamee, James Mayfield", "abstract": "This paper describes the submission runs from the HLTCOE team at the CIRAL CLIR tasks for African languages at FIRE 2023. Our submissions use machine translation models to translate the documents and the training passages, and ColBERT-X as the retrieval model. Additionally, we present a set of unofficial runs that use an alternative training procedure with a similar training setting.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.50400447845459, 1.1710741519927979], "openalex_id": "https://openalex.org/W4399728959", "title": "Designing an Intelligent System to Map Global Connections", "authors": "Elijah Bellamy, K. Farrell, Aiden Hopping, James Pinter, Meenakshi Saju, David M. Beskow", "abstract": "This study develops a knowledge graph from the Common Crawl News Dataset to provide situational awareness and answer information requirements for national security leaders. We map dynamic global interactions between entities using news data collected over a defined period. We developed a data pipeline to extract semantic content from the Common Crawl News feed and filter it to content related to national security. We build a knowledge graph using national security-related content using Named Entity Recognition, Named Entity Disambiguation, and Named Entity Linking. After developing this intelligent system, we demonstrate its value in an excursion on a Russian language knowledge graph focused on the Russian invasion of Ukraine. The knowledge graphs developed in this study provide decision-makers an understanding of the entities and links in our highly interconnected world.", "venue": "https://doi.org/10.1109/syscon61195.2024.10553593", "label": 0}, {"loc": [3.2299606800079346, 2.7594316005706787], "openalex_id": "https://openalex.org/W4394803802", "title": "Reducing Risks Posed by Synthetic Content", "authors": "Gaithersburg MD NIST", "abstract": "This report examines the existing standards, tools, methods, and practices, as well as the potential development of further science-backed standards and techniques, for: authenticating content and tracking its provenance; labeling synthetic content, such as using watermarking; detecting synthetic content; preventing generative AI (GAI) from producing child sexual abuse material or producing non-consensual intimate imagery of real individuals (to include intimate digital depictions of the body or body parts of an identifiable individual); testing software used for the above purposes; and auditing and maintaining synthetic content.", "venue": "https://doi.org/10.6028/nist.ai.100-4", "label": 0}, {"loc": [4.406656265258789, -2.121595859527588], "openalex_id": "https://openalex.org/W4394853018", "title": "Predicting fraud in MD&A sections using deep learning", "authors": "Sachin Velloor Sivasubramanian, David B. Skillicorn", "abstract": "Conventional data analytic techniques have been successfully applied to detecting fraud in the Management's Discussion and Analysis sections of company filings mandated by the SEC. Here, we investigate whether fraud detection can be improved by applying deep learning techniques. We build 18 deep learning models and compare their performance on a set of MD&A documents. The best-performing model achieved an accuracy of 91% and an F1-score of 77%, only slightly better than a conventional XGBoost predictor that achieved an accuracy of 91% and an F1-score of 73%. Of the deep learning models, the transformer, those incorporating attention mechanisms, and convolutional neural networks performed well; somewhat surprisingly, sequential models such as LSTMs did not.", "venue": "Journal of Business Analytics", "label": 0}, {"loc": [4.228961944580078, 1.219886302947998], "openalex_id": "https://openalex.org/W4394792882", "title": "UNVEILING THE COGNITIVE CAPACITY OF CHATGPT: ASSESSING ITS HUMAN-LIKE REASONING ABILITIES", "authors": "Johnson Oyeniyi", "abstract": "In recent years, remarkable advances have been made in the field of natural language processing, leading to the development of advanced conversational AI models such as ChatGPT.Although these models have demonstrated excellent language generation capabilities, the extent to which they are capable of human-like thinking remains an open question.This study aims to investigate and evaluate ChatGPT's cognitive abilities, with a particular emphasis on thinking abilities.We propose a comprehensive assessment framework that includes a variety of reasoning tasks designed to test different aspects of human-like cognition.Through these tasks, you will test ChatGPT's deductive, inductive, analogical, and common-sense reasoning abilities.To provide a basis for comparison, we use a benchmark dataset containing examples of human-generated inferences.Next, we optimize ChatGPT using a combination of supervised learning and reinforcement learning techniques to improve inference skills.The fine-tuning process entails exposing the model to a large number of argument-based prompts and providing reinforcing signals for correct logical reasoning.Our preliminary results suggest that ChatGPT has promising potential to enable human-like thinking.Demonstrate a rational ability to draw logical relationships, draw accurate conclusions, draw analogies, and apply rational thinking in a variety of situations.However, significant challenges remain, especially in processing complex and ambiguous scenarios and understanding subtle contextual cues that humans easily pick up.Furthermore, we investigate the limitations and potential biases associated with ChatGPT's inference capabilities.Identify problem areas in your model.B. Handle contradictory information, recognize subtle logical fallacies, and address dilemmas in ethical debates.These findings are important both for understanding the current capabilities of the model and for defining future research directions.This study provides a comprehensive survey of ChatGPT's reasoning capabilities and focuses on advances in AI model development that exhibit human-like cognitive abilities.By identifying strengths, weaknesses, and areas for improvement, we pave the way for further advances in the field and contribute to the development of more sophisticated conversational AI systems with improved reasoning capabilities.", "venue": "International Research Journal of Modernization in Engineering Technology and Science", "label": 0}, {"loc": [5.235366344451904, 1.1059284210205078], "openalex_id": "https://openalex.org/W4392735644", "title": "LLMs Still Can't Avoid Instanceof: An Investigation Into GPT-3.5, GPT-4 and Bard's Capacity to Handle Object-Oriented Programming Assignments", "authors": "Bruno Pereira Cipriano, Pedro Alves", "abstract": "Large Language Models (LLMs) have emerged as promising tools to assist students while solving programming assignments. However, object-oriented programming (OOP), with its inherent complexity involving the identification of entities, relationships, and responsibilities, is not yet mastered by these tools. Contrary to introductory programming exercises, there exists a research gap with regard to the behavior of LLMs in OOP contexts. In this study, we experimented with three prominent LLMs - GPT-3.5, GPT-4, and Bard - to solve real-world OOP exercises used in educational settings, subsequently validating their solutions using an Automatic Assessment Tool (AAT). The findings revealed that while the models frequently achieved mostly working solutions to the exercises, they often overlooked the best practices of OOP. GPT-4 stood out as the most proficient, followed by GPT-3.5, with Bard trailing last. We advocate for a renewed emphasis on code quality when employing these models and explore the potential of pairing LLMs with AATs in pedagogical settings. In conclusion, while GPT-4 showcases promise, the deployment of these models in OOP education still mandates supervision.", "venue": "https://doi.org/10.1145/3639474.3640052", "label": 0}, {"loc": [4.489331245422363, 2.397284746170044], "openalex_id": "https://openalex.org/W4394784545", "title": "Intersectional Male-Centric and White-Centric Biases in Collective Concepts", "authors": "April H. Bailey, Adina Williams, Aashna Poddar, Andrei Cimpian", "abstract": "In principle, the fundamental concepts person, woman, and man should apply equally to people of different genders and races/ethnicities. In reality, these concepts might prioritize certain groups over others. Based on interdisciplinary theories of androcentrism, we hypothesized that (a) person is more associated with men than women (person = man) and (b) woman is more associated with women than man is with men (i.e., women are more gendered: gender = woman). We applied natural language processing tools (specifically, word embeddings) to the linguistic output of millions of individuals (specifically, the Common Crawl corpus). We found the hypothesized person = man / gender = woman bias. This bias was stronger about Hispanic and White (vs. Asian) women and men. We also uncovered parallel biases favoring White individuals in the concepts person, woman, and man. Western society prioritizes men and White individuals as people and \u201cothers\u201d women as people with gender, with implications for equity across policy- and decision-making contexts.", "venue": "Personality and Social Psychology Bulletin", "label": 0}, {"loc": [7.628262519836426, 3.803997755050659], "openalex_id": "https://openalex.org/W4394780940", "title": "JetMoE: Reaching Llama2 Performance with 0.1 M Dollars", "authors": "Yikang Shen, Zhen Guo, Tianle Cai, Zengyi Qin", "abstract": "Large Language Models (LLMs) have achieved remarkable results, but their increasing resource demand has become a major obstacle to the development of powerful and accessible super-human intelligence. This report introduces JetMoE-8B, a new LLM trained with less than $0.1 million, using 1.25T tokens from carefully mixed open-source corpora and 30,000 H100 GPU hours. Despite its low cost, the JetMoE-8B demonstrates impressive performance, with JetMoE-8B outperforming the Llama2-7B model and JetMoE-8B-Chat surpassing the Llama2-13B-Chat model. These results suggest that LLM training can be much more cost-effective than generally thought. JetMoE-8B is based on an efficient Sparsely-gated Mixture-of-Experts (SMoE) architecture, composed of attention and feedforward experts. Both layers are sparsely activated, allowing JetMoE-8B to have 8B parameters while only activating 2B for each input token, reducing inference computation by about 70% compared to Llama2-7B. Moreover, JetMoE-8B is highly open and academia-friendly, using only public datasets and training code. All training parameters and data mixtures have been detailed in this report to facilitate future efforts in the development of open foundation models. This transparency aims to encourage collaboration and further advancements in the field of accessible and efficient LLMs. The model weights are publicly available at https://github.com/myshell-ai/JetMoE.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.1020379066467285, 0.41163620352745056], "openalex_id": "https://openalex.org/W4394781933", "title": "ResearchAgent: Iterative Research Idea Generation over Scientific Literature with Large Language Models", "authors": "Jinheon Baek, Sunil Kumar Jauhar, Silviu Cucerzan, Sung Ju Hwang", "abstract": "The pace of scientific research, vital for improving human life, is complex, slow, and needs specialized expertise. Meanwhile, novel, impactful research often stems from both a deep understanding of prior work, and a cross-pollination of ideas across domains and fields. To enhance the productivity of researchers, we propose ResearchAgent, which leverages the encyclopedic knowledge and linguistic reasoning capabilities of Large Language Models (LLMs) to assist them in their work. This system automatically defines novel problems, proposes methods and designs experiments, while iteratively refining them based on the feedback from collaborative LLM-powered reviewing agents. Specifically, starting with a core scientific paper, ResearchAgent is augmented not only with relevant publications by connecting information over an academic graph but also entities retrieved from a knowledge store derived from shared underlying concepts mined across numerous papers. Then, mimicking a scientific approach to improving ideas with peer discussions, we leverage multiple LLM-based ReviewingAgents that provide reviews and feedback via iterative revision processes. These reviewing agents are instantiated with human preference-aligned LLMs whose criteria for evaluation are elicited from actual human judgments via LLM prompting. We experimentally validate our ResearchAgent on scientific publications across multiple disciplines, showing its effectiveness in generating novel, clear, and valid ideas based on both human and model-based evaluation results. Our initial foray into AI-mediated scientific research has important implications for the development of future systems aimed at supporting researchers in their ideation and operationalization of novel work.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.221755027770996, 0.3352181017398834], "openalex_id": "https://openalex.org/W4394775583", "title": "Comprehensive analysis of natural language processing", "authors": "Rohit Yadav, Aanchal Madaan, Janu", "abstract": "Natural Language Processing (NLP) is a fascinating field of study that teaches computers to understand and use human language. This means that computers can read, write, and even translate text just like humans. NLP has many practical uses, such as categorizing text, identifying the tone of language, recognizing names in text, translating languages, and answering questions. NLP has come a long way since it was first developed. In the past, it relied on strict rules to understand language, but now it uses advanced techniques like machine learning and deep learning to understand text. However, there are still some challenges in NLP, such as understanding the meaning of words in context and considering cultural differences. Despite these challenges, NLP is being used in many different areas, from healthcare and finance to education and customer service. NLP is transforming the way humans interact with computers and is making it easier to extract important information from large amounts of text.", "venue": "Global Journal of Engineering and Technology Advances", "label": 0}, {"loc": [5.257819175720215, -0.3490246832370758], "openalex_id": "https://openalex.org/W4394710866", "title": "Pitfalls of Conversational LLMs on News Debiasing", "authors": "Ipek Baris Schlicht, Defne Altiok, Maryanne Taouk, Lucie Flek", "abstract": "This paper addresses debiasing in news editing and evaluates the effectiveness of conversational Large Language Models in this task. We designed an evaluation checklist tailored to news editors' perspectives, obtained generated texts from three popular conversational models using a subset of a publicly available dataset in media bias, and evaluated the texts according to the designed checklist. Furthermore, we examined the models as evaluator for checking the quality of debiased model outputs. Our findings indicate that none of the LLMs are perfect in debiasing. Notably, some models, including ChatGPT, introduced unnecessary changes that may impact the author's style and create misinformation. Lastly, we show that the models do not perform as proficiently as domain experts in evaluating the quality of debiased outputs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.105635166168213, 1.8056522607803345], "openalex_id": "https://openalex.org/W4394708835", "title": "GeniL: A Multilingual Dataset on Generalizing Language", "authors": "Aida Mostafazadeh Davani, Sagar Gubbi, Sunipa Dev, Shachi Dave, Vinodkumar Prabhakaran", "abstract": "Generative language models are transforming our digital ecosystem, but they often inherit societal biases, for instance stereotypes associating certain attributes with specific identity groups. While whether and how these biases are mitigated may depend on the specific use cases, being able to effectively detect instances of stereotype perpetuation is a crucial first step. Current methods to assess presence of stereotypes in generated language rely on simple template or co-occurrence based measures, without accounting for the variety of sentential contexts they manifest in. We argue that understanding the sentential context is crucial for detecting instances of generalization. We distinguish two types of generalizations: (1) language that merely mentions the presence of a generalization (\"people think the French are very rude\"), and (2) language that reinforces such a generalization (\"as French they must be rude\"), from non-generalizing context (\"My French friends think I am rude\"). For meaningful stereotype evaluations, we need to reliably distinguish such instances of generalizations. We introduce the new task of detecting generalization in language, and build GeniL, a multilingual dataset of over 50K sentences from 9 languages (English, Arabic, Bengali, Spanish, French, Hindi, Indonesian, Malay, and Portuguese) annotated for instances of generalizations. We demonstrate that the likelihood of a co-occurrence being an instance of generalization is usually low, and varies across different languages, identity groups, and attributes. We build classifiers to detect generalization in language with an overall PR-AUC of 58.7, with varying degrees of performance across languages. Our research provides data and tools to enable a nuanced understanding of stereotype perpetuation, a crucial step towards more inclusive and responsible language technologies.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.419137001037598, 2.8134827613830566], "openalex_id": "https://openalex.org/W4394709684", "title": "MiniCPM: Unveiling the Potential of Small Language Models with Scalable Training Strategies", "authors": "Shengding Hu, Yuge Tu, Xu Han, Chaoqun He, Ganqu Cui, Long Xiang, Zhi Zheng, Yewei Fang, Yuxiang Huang, Weilin Zhao, Xinrong Zhang, Zheng Leng Thai, Kaihuo Zhang, Chongyi Wang, Yuan Yao, Chenyang Zhao, Jie Zhou, Jie Cai, Zhongwu Zhai, Ning Ding, Chao Jia, Guoyang Zeng, Dahai Li, Zhiyuan Liu, Maosong Sun", "abstract": "The burgeoning interest in developing Large Language Models (LLMs) with up to trillion parameters has been met with concerns regarding resource efficiency and practical expense, particularly given the immense cost of experimentation. This scenario underscores the importance of exploring the potential of Small Language Models (SLMs) as a resource-efficient alternative. In this context, we introduce MiniCPM, specifically the 1.2B and 2.4B non-embedding parameter variants, not only excel in their respective categories but also demonstrate capabilities on par with 7B-13B LLMs. While focusing on SLMs, our approach exhibits scalability in both model and data dimensions for future LLM research. Regarding model scaling, we employ extensive model wind tunnel experiments for stable and optimal scaling. For data scaling, we introduce a Warmup-Stable-Decay (WSD) learning rate scheduler (LRS), conducive to continuous training and domain adaptation. We present an in-depth analysis of the intriguing training dynamics that occurred in the WSD LRS. With WSD LRS, we are now able to efficiently study data-model scaling law without extensive retraining experiments on both axes of model and data, from which we derive the much higher compute optimal data-model ratio than Chinchilla Optimal. Additionally, we introduce MiniCPM family, including MiniCPM-DPO, MiniCPM-MoE and MiniCPM-128K, whose excellent performance further cementing MiniCPM's foundation in diverse SLM applications. MiniCPM models are available publicly at https://github.com/OpenBMB/MiniCPM.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.021404266357422, 3.452190637588501], "openalex_id": "https://openalex.org/W4394652678", "title": "The Case for Developing a Foundation Model for Planning-like Tasks from Scratch", "authors": "Biplav Srivastava, Vishal Pallagani", "abstract": "Foundation Models (FMs) have revolutionized many areas of computing, including Automated Planning and Scheduling (APS). For example, a recent study found them useful for planning problems: plan generation, language translation, model construction, multi-agent planning, interactive planning, heuristics optimization, tool integration, and brain-inspired planning. Besides APS, there are many seemingly related tasks involving the generation of a series of actions with varying guarantees of their executability to achieve intended goals, which we collectively call planning-like (PL) tasks like business processes, programs, workflows, and guidelines, where researchers have considered using FMs. However, previous works have primarily focused on pre-trained, off-the-shelf FMs and optionally fine-tuned them. This paper discusses the need for a comprehensive FM for PL tasks from scratch and explores its design considerations. We argue that such an FM will open new and efficient avenues for PL problem-solving, just like LLMs are creating for APS.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.305989742279053, -2.554548501968384], "openalex_id": "https://openalex.org/W4394662064", "title": "Contextual Chart Generation for Cyber Deception", "authors": "David D. Nguyen, David Liebowitz, \u202aSurya Nepal\u202c, Salil S. Kanhere, Alsharif Abuadbba", "abstract": "Honeyfiles are security assets designed to attract and detect intruders on compromised systems. Honeyfiles are a type of honeypot that mimic real, sensitive documents, creating the illusion of the presence of valuable data. Interaction with a honeyfile reveals the presence of an intruder, and can provide insights into their goals and intentions. Their practical use, however, is limited by the time, cost and effort associated with manually creating realistic content. The introduction of large language models has made high-quality text generation accessible, but honeyfiles contain a variety of content including charts, tables and images. This content needs to be plausible and realistic, as well as semantically consistent both within honeyfiles and with the real documents they mimic, to successfully deceive an intruder. In this paper, we focus on an important component of the honeyfile content generation problem: document charts. Charts are ubiquitous in corporate documents and are commonly used to communicate quantitative and scientific data. Existing image generation models, such as DALL-E, are rather prone to generating charts with incomprehensible text and unconvincing data. We take a multi-modal approach to this problem by combining two purpose-built generative models: a multitask Transformer and a specialized multi-head autoencoder. The Transformer generates realistic captions and plot text, while the autoencoder generates the underlying tabular data for the plot. To advance the field of automated honeyplot generation, we also release a new document-chart dataset and propose a novel metric Keyword Semantic Matching (KSM). This metric measures the semantic consistency between keywords of a corpus and a smaller bag of words. Extensive experiments demonstrate excellent performance against multiple large language models, including ChatGPT and GPT4.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.6239776611328125, 4.14549446105957], "openalex_id": "https://openalex.org/W4396790742", "title": "Indiscriminate Data Poisoning Attacks on Pre-trained Feature Extractors", "authors": "Yiwei Lu, Matthew Y. R. Yang, Gautam Kamath, Yaoliang Yu", "abstract": "Machine learning models have achieved great success in supervised learning tasks for end-to-end training, which requires a large amount of labeled data that is not always feasible. Recently, many practitioners have shifted to self-supervised learning (SSL) methods (e.g., contrastive learning) that utilize cheap unlabeled data to learn a general feature extractor via pre-training, which can be further applied to personalized downstream tasks by simply training an additional linear layer with limited labeled data. However, such a process may also raise concerns regarding data poisoning attacks. For instance, indiscriminate data poisoning attacks, which aim to decrease model utility by injecting a small number of poisoned data into the training set, pose a security risk to machine learning models, but have only been studied for end-to-end supervised learning. In this paper, we extend the exploration of the threat of indiscriminate attacks on downstream tasks that apply pre-trained feature extractors. Specifically, we propose two types of attacks: (1) the input space attacks, where we modify existing attacks (e.g., TGDA attack and GC attack) to directly craft poisoned data in the input space. However, due to the difficulty of optimization under constraints, we further propose (2) the feature targeted attacks, where we mitigate the challenge with three stages, firstly acquiring target parameters for the linear head; secondly finding poisoned features by treating the learned feature representations as a dataset; and thirdly inverting the poisoned features back to the input space. Our experiments examine such attacks in popular downstream tasks of fine-tuning on the same dataset and transfer learning that considers domain adaptation. Empirical results reveal that transfer learning is more vulnerable to our attacks. Additionally, input space attacks are a strong threat if no countermeasures are posed, but are otherwise weaker than feature targeted attacks.", "venue": "https://doi.org/10.1109/satml59370.2024.00023", "label": 0}, {"loc": [3.8200807571411133, -0.05410351976752281], "openalex_id": "https://openalex.org/W4394654404", "title": "Inferring the Phylogeny of Large Language Models and Predicting their Performances in Benchmarks", "authors": "Nicolas Yax, Pierre\u2010Yves Oudeyer, Stefano Palminteri", "abstract": "This paper introduces PhyloLM, a method adapting phylogenetic algorithms to Large Language Models (LLMs) to explore whether and how they relate to each other and to predict their performance characteristics. Our method calculates a phylogenetic distance metric based on the similarity of LLMs' output. The resulting metric is then used to construct dendrograms, which satisfactorily capture known relationships across a set of 111 open-source and 45 closed models. Furthermore, our phylogenetic distance predicts performance in standard benchmarks, thus demonstrating its functional validity and paving the way for a time and cost-effective estimation of LLM capabilities. To sum up, by translating population genetic concepts to machine learning, we propose and validate a tool to evaluate LLM development, relationships and capabilities, even in the absence of transparent training information.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.997730255126953, 0.16932648420333862], "openalex_id": "https://openalex.org/W4394708607", "title": "KazQAD: Kazakh Open-Domain Question Answering Dataset", "authors": "Rustem Yeshpanov, Pavel Efimov, Leonid Boytsov, Ardak Shalkarbayuli, Pavel Braslavski", "abstract": "We introduce KazQAD -- a Kazakh open-domain question answering (ODQA) dataset -- that can be used in both reading comprehension and full ODQA settings, as well as for information retrieval experiments. KazQAD contains just under 6,000 unique questions with extracted short answers and nearly 12,000 passage-level relevance judgements. We use a combination of machine translation, Wikipedia search, and in-house manual annotation to ensure annotation efficiency and data quality. The questions come from two sources: translated items from the Natural Questions (NQ) dataset (only for training) and the original Kazakh Unified National Testing (UNT) exam (for development and testing). The accompanying text corpus contains more than 800,000 passages from the Kazakh Wikipedia. As a supplementary dataset, we release around 61,000 question-passage-answer triples from the NQ dataset that have been machine-translated into Kazakh. We develop baseline retrievers and readers that achieve reasonable scores in retrieval (NDCG@10 = 0.389 MRR = 0.382), reading comprehension (EM = 38.5 F1 = 54.2), and full ODQA (EM = 17.8 F1 = 28.7) settings. Nevertheless, these results are substantially lower than state-of-the-art results for English QA collections, and we think that there should still be ample room for improvement. We also show that the current OpenAI's ChatGPTv3.5 is not able to answer KazQAD test questions in the closed-book setting with acceptable quality. The dataset is freely available under the Creative Commons licence (CC BY-SA) at https://github.com/IS2AI/KazQAD.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.828303575515747, -0.10285618901252747], "openalex_id": "https://openalex.org/W4394654404", "title": "Phylolm: Inferring the phylogeny of large language models and predicting their performances in benchmarks", "authors": "Nicolas Yax, Pierre\u2010Yves Oudeyer, Stefano Palminteri", "abstract": "This paper introduces PhyloLM, a method adapting phylogenetic algorithms to Large Language Models (LLMs) to explore whether and how they relate to each other and to predict their performance characteristics. Our method calculates a phylogenetic distance metric based on the similarity of LLMs' output. The resulting metric is then used to construct dendrograms, which satisfactorily capture known relationships across a set of 111 open-source and 45 closed models. Furthermore, our phylogenetic distance predicts performance in standard benchmarks, thus demonstrating its functional validity and paving the way for a time and cost-effective estimation of LLM capabilities. To sum up, by translating population genetic concepts to machine learning, we propose and validate a tool to evaluate LLM development, relationships and capabilities, even in the absence of transparent training information.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.267973899841309, 0.19087833166122437], "openalex_id": "https://openalex.org/W4394654128", "title": "Language Models on a Diet: Cost-Efficient Development of Encoders for Closely-Related Languages via Additional Pretraining", "authors": "Nikola Ljube\u0161i\u0107, V\u00edt Suchomel, Peter Rupnik, Taja Kuzman, Rik van Noord", "abstract": "The world of language models is going through turbulent times, better and ever larger models are coming out at an unprecedented speed. However, we argue that, especially for the scientific community, encoder models of up to 1 billion parameters are still very much needed, their primary usage being in enriching large collections of data with metadata necessary for downstream research. We investigate the best way to ensure the existence of such encoder models on the set of very closely related languages - Croatian, Serbian, Bosnian and Montenegrin, by setting up a diverse benchmark for these languages, and comparing the trained-from-scratch models with the new models constructed via additional pretraining of existing multilingual models. We show that comparable performance to dedicated from-scratch models can be obtained by additionally pretraining available multilingual models even with a limited amount of computation. We also show that neighboring languages, in our case Slovenian, can be included in the additional pretraining with little to no loss in the performance of the final model.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.443280220031738, 2.2515506744384766], "openalex_id": "https://openalex.org/W4394672985", "title": "Interpreting Themes from Educational Stories", "authors": "Yigeng Zhang, Fabio A. Gonz\u00e1lez, Thamar Solorio", "abstract": "Reading comprehension continues to be a crucial research focus in the NLP community. Recent advances in Machine Reading Comprehension (MRC) have mostly centered on literal comprehension, referring to the surface-level understanding of content. In this work, we focus on the next level - interpretive comprehension, with a particular emphasis on inferring the themes of a narrative text. We introduce the first dataset specifically designed for interpretive comprehension of educational narratives, providing corresponding well-edited theme texts. The dataset spans a variety of genres and cultural origins and includes human-annotated theme keywords with varying levels of granularity. We further formulate NLP tasks under different abstractions of interpretive comprehension toward the main idea of a story. After conducting extensive experiments with state-of-the-art methods, we found the task to be both challenging and significant for NLP research. The dataset and source code have been made publicly available to the research community at https://github.com/RiTUAL-UH/EduStory.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.827990770339966, 2.3887317180633545], "openalex_id": "https://openalex.org/W4394569079", "title": "Bioeconomy firms and where to find them", "authors": "Lukas Kriesch, Sebastian Losacker", "abstract": "The bioeconomy represents a transformative approach to economic development and sustainability by harnessing biological resources and knowledge to produce goods, services, and energy while reducing dependence on non-renewable resources. In order to understand and support the bioeconomy, scholars and policymakers rely on an accurate measurement and monitoring of biobased economic activities. However, existing statistical frameworks and industry classifications often fall short in capturing the unique characteristics and complexities of the bioeconomy. This article addresses this challenge by developing a methodological approach for comprehensive measurement and mapping of biobased economic activities. We build a novel data set of bioeconomy firms in Germany using web-mining and machine learning techniques. This data set enables detailed analysis of biobased economic activities, providing valuable insights into the spatial organization of the bioeconomy. The paper demonstrates the applicability of the data set by testing several stylized facts about the bioeconomy. Our research contributes to a better understanding of the bioeconomy's regional impacts and offers a valuable resource for policymakers and researchers interested in understanding the geography of biobased economic activities. We make an aggregated version of the data set freely available online.", "venue": "REGION", "label": 0}, {"loc": [4.9934983253479, 2.1983156204223633], "openalex_id": "https://openalex.org/W4394591900", "title": "Willkommens-Merkel, Chaos-Johnson, and Tore-Klose: Modeling the Evaluative Meaning of German Personal Name Compounds", "authors": "Annerose Eichel, Tana Deeg, Andr\u00e9 Blessing, Milena Belo\u0161evi\u0107, Sabine Arndt\u2010Lappe, Sabine Schulte im Walde", "abstract": "We present a comprehensive computational study of the under-investigated phenomenon of personal name compounds (PNCs) in German such as Willkommens-Merkel ('Welcome-Merkel'). Prevalent in news, social media, and political discourse, PNCs are hypothesized to exhibit an evaluative function that is reflected in a more positive or negative perception as compared to the respective personal full name (such as Angela Merkel). We model 321 PNCs and their corresponding full names at discourse level, and show that PNCs bear an evaluative nature that can be captured through a variety of computational methods. Specifically, we assess through valence information whether a PNC is more positively or negatively evaluative than the person's name, by applying and comparing two approaches using (i) valence norms and (ii) pretrained language models (PLMs). We further enrich our data with personal, domain-specific, and extra-linguistic information and perform a range of regression analyses revealing that factors including compound and modifier valence, domain, and political party membership influence how a PNC is evaluated.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.053035736083984, 0.269476979970932], "openalex_id": "https://openalex.org/W4394573745", "title": "knowledge on biodiversity beyond national jurisdiction", "authors": "Xiao\u2010Wei Wang, Mingdan Zhang, Hao Liu, Xiaodong Ma, Yingchao Liu, Yitong Chen", "abstract": "The marine biodiversity in Areas beyond national jurisdiction (ABNJ), encompassing approximately two-thirds of the global ocean, is persistently declining. In 2023, the agreement on the Conservation and Sustainable Use of Marine Biodiversity of Areas Beyond National Jurisdiction (BBNJ) was officially adopted. Implementing the BBNJ Agreement has the potential to effectively meet global needs for preserving marine biodiversity. Nevertheless, the implementation requires dealing with thousands of legal clauses, and the parties participating in the process lack adequate means to acquire knowledge connected to BBNJ. This paper introduces ChatBBNJ, a highly efficient question-answering system that combines a novel data engineering technique with large language models (LLMs) of Natural Language Processing (NLP). The system aims to efficiently provide stakeholders with BBNJ-related knowledge, thereby facilitating and enhancing their comprehension and involvement with the subject matter. The experimental results demonstrate that the proposed ChatBBNJ exhibits superior expertise in the BBNJ domain, outperforming baseline models in terms of precision, recall, and F1-scores. The successful deployment of the suggested system is expected to greatly assist stakeholders in acquiring BBNJ knowledge and facilitating the effective implementation of the BBNJ Agreement. Therefore, this is expected to contribute to the conservation and sustainable use of marine biodiversity in ABNJ.", "venue": "Frontiers in Marine Science", "label": 0}, {"loc": [7.022577285766602, 1.305294156074524], "openalex_id": "https://openalex.org/W4394708573", "title": "Chinese Tiny LLM: Pretraining a Chinese-Centric Large Language Model", "authors": "Xinrun Du, Zhouliang Yu, Songyang Gao, Ding Pan, Yuyang Cheng, Ziyang Ma, Ruibin Yuan, Xingwei Qu, Jiaheng Liu, Tianyu Zheng, Xinchen Luo, Guorui Zhou, Binhang Yuan, Wenhu Chen, Jie Fu, Ge ZHANG", "abstract": "In this study, we introduce CT-LLM, a 2B large language model (LLM) that illustrates a pivotal shift towards prioritizing the Chinese language in developing LLMs. Uniquely initiated from scratch, CT-LLM diverges from the conventional methodology by primarily incorporating Chinese textual data, utilizing an extensive corpus of 1,200 billion tokens, including 800 billion Chinese tokens, 300 billion English tokens, and 100 billion code tokens. This strategic composition facilitates the model's exceptional proficiency in understanding and processing Chinese, a capability further enhanced through alignment techniques. Demonstrating remarkable performance on the CHC-Bench, CT-LLM excels in Chinese language tasks, and showcases its adeptness in English through SFT. This research challenges the prevailing paradigm of training LLMs predominantly on English corpora and then adapting them to other languages, broadening the horizons for LLM training methodologies. By open-sourcing the full process of training a Chinese LLM, including a detailed data processing procedure with the obtained Massive Appropriate Pretraining Chinese Corpus (MAP-CC), a well-chosen multidisciplinary Chinese Hard Case Benchmark (CHC-Bench), and the 2B-size Chinese Tiny LLM (CT-LLM), we aim to foster further exploration and innovation in both academia and industry, paving the way for more inclusive and versatile language models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.594716548919678, 0.25027158856391907], "openalex_id": "https://openalex.org/W4394064431", "title": "Deep Learning Model for Tamil Part-of-Speech Tagging", "authors": "Hemakasiny Visuwalingam, Ratnasingam Sakuntharaj, Janaka Alawatugoda, Roshan Ragel", "abstract": "Abstract Part-of-Speech (POS) tagging is one of the popular Natural Language Processing (NLP) tasks. It is also considered to be a preliminary task for other NLP applications such as speech recognition, machine translation, and sentiment analysis. A few works have been published on POS tagging for the Tamil language. However, the performance of the POS tagger with unknown words is not explored in the literature. The appearance of unknown words is a frequently occurring problem in POS tagging and makes it a challenging task. In this paper, we propose a deep learning-based POS tagger for Tamil using Bi-directional Long Short Term Memory (BLSTM). The performance of the POS tagger was evaluated using known and unknown words. The POS tagger with regular word-level embeddings produces 99.83 and 92.46% accuracies for all known and 63.21% unknown words. It clearly shows that the accuracy decreases when the number of unknown words increases. To improve the performance of the POS tagger with unknown words, the proposed BLSTM model that uses word-level, character-level and pre-trained word embeddings. Test results of this model show a 2.57% improvement for 63.21% of unknown words, with an accuracy of 95.03%.", "venue": "The Computer Journal", "label": 0}, {"loc": [7.540316104888916, -1.1902023553848267], "openalex_id": "https://openalex.org/W4393994592", "title": "WCC-EC 2.0: Enhancing Neural Machine Translation with a 1.6 M+ Web-Crawled English-Chinese Parallel Corpus", "authors": "Jinyi Zhang, Ke Su, Ye Tian, Tadahiro Matsumoto", "abstract": "This research introduces WCC-EC 2.0 (Web-Crawled Corpus\u2014English and Chinese), a comprehensive parallel corpus designed for enhancing Neural Machine Translation (NMT), featuring over 1.6 million English-Chinese sentence pairs meticulously gathered via web crawling. This corpus, extracted through an advanced web crawler, showcases the vast linguistic diversity and richness of English and Chinese, uniquely spanning the rarely covered news and music domains. Our methodical approach in web crawling and corpus assembly, coupled with rigorous experiments and manual evaluations, demonstrated its superiority by achieving high BLEU scores, marking significant strides in translation accuracy and model resilience. Its inclusion of these specific areas adds significant value, providing a unique dataset that enriches the scope for NMT research and development. With the rise of NMT technology, WCC-EC 2.0 emerges not only as an invaluable resource for researchers and developers, but also as a pivotal tool for improving translation accuracy, training more resilient models, and promoting interlingual communication.", "venue": "Electronics", "label": 19}, {"loc": [9.173449516296387, -0.9343820810317993], "openalex_id": "https://openalex.org/W4399485463", "title": "Evaluating Multilingual Abstractive Dialogue Summarization in Indian Languages using mT5-small & IndicBART", "authors": "Mehak Sharma, Gunika Goyal, Aarzoo Gupta, Ritu Rani, Arun Sharma, Amita Dev", "abstract": "Every day, the internet is inundated with massive amounts of data. Unlike texts, such as academic reports and news that ordinarily originate from a single source and have a well-organized structure, dialogues involve the dynamic exchange of information between two or more speakers. The objective of a discussion may shift during the track of the conversation, and important information is dispersed across multiple speakers', making abstractive summarization of dialogues demanding. Numerous summarizing approaches have been suggested and implemented for English and other foreign languages. On the other hand, the process of summarizing dialogue in Indian languages is still in its newborn stages. By providing insights into the performance of these models and how they adapt to a wide variety of linguistic nuances, the aim of this study is to shed light on the efficacy of these models. This work addresses the challenge of abstractive summarization of dialogues in three prominent Indian languages: Hindi, Marathi and Bengali by evaluating the effectiveness of two specific multilingual models, namely mT5-small and indicBART. According to the findings of the comparative analysis, the mT5-small model has greater accuracy in the three chosen languages.", "venue": "https://doi.org/10.1109/i2ct61223.2024.10543588", "label": 0}, {"loc": [6.995092391967773, -1.0405198335647583], "openalex_id": "https://openalex.org/W4399486163", "title": "Comparative Study on Synthetic and Natural Error Analysis with BART & MarianMT", "authors": "R Rohit, SA Gandheesh, Gayatri Sanjana Sannala, Peeta Basa Pati", "abstract": "Text is essential for communication, information sharing, knowledge acquisition, and analysis. It shapes public opinion, supports education, and drives online content, making it crucial in various domains. While there are various language models utilized for text analysis and text correction, there is little to no survey conducted on these model's behavior and limitations. This work deals with studying BART and MarianMT language models behavior to an input dataset consisting of two types of errors, Synthetic and Natural. Synthetic errors are efficient to create and test, whereas Natural errors are more common and close to real world occurring errors. The models were trained and tested with the generated data, the results highlighted that BART exhibited consistent outputs towards both Synthetic and Natural errors and hence revealing a break-even point at the vicinity of 26% Synthetic error introduction. Conversely, the performance of MarianMT was comparatively diminished for Synthetic errors in contrast to Natural errors. These findings provide valuable insights into the behavior and capabilities of the models.", "venue": "https://doi.org/10.1109/i2ct61223.2024.10543923", "label": 0}, {"loc": [5.771921157836914, 1.9826481342315674], "openalex_id": "https://openalex.org/W4394007349", "title": "Probing Large Language Models for Scalar Adjective Lexical Semantics and Scalar Diversity Pragmatics", "authors": "Fangru Lin, Daniel Altshuler, Janet B. Pierrehumbert", "abstract": "Scalar adjectives pertain to various domain scales and vary in intensity within each scale (e.g. certain is more intense than likely on the likelihood scale). Scalar implicatures arise from the consideration of alternative statements which could have been made. They can be triggered by scalar adjectives and require listeners to reason pragmatically about them. Some scalar adjectives are more likely to trigger scalar implicatures than others. This phenomenon is referred to as scalar diversity. In this study, we probe different families of Large Language Models such as GPT-4 for their knowledge of the lexical semantics of scalar adjectives and one specific aspect of their pragmatics, namely scalar diversity. We find that they encode rich lexical-semantic information about scalar adjectives. However, the rich lexical-semantic knowledge does not entail a good understanding of scalar diversity. We also compare current models of different sizes and complexities and find that larger models are not always better. Finally, we explain our probing results by leveraging linguistic intuitions and model training objectives.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.086425542831421, 5.292453765869141], "openalex_id": "https://openalex.org/W4394006246", "title": "Generative adversarial network-based phishing URL detection with variational autoencoder and transformer", "authors": "K. S. Jishnu, Arthi Balakrishnan", "abstract": "Phishing attacks pose a constant threat to online security, necessitating the development of efficient tools for identifying malicious URLs. In this article, we propose a novel approach to detect phishing URLs employing a generative adversarial network (GAN) with a variational autoencoder (VAE) as the generator and a transformer model with self-attention as the discriminator. The VAE generator is trained to produce synthetic URLs. In contrast, the transformer discriminator uses its self-attention mechanism to focus on the different parts of the input URLs to extract crucial features. Our model uses adversarial training to distinguish between legitimate and phishing URLs. We evaluate the effectiveness of the proposed method using a large set of one million URLs that incorporate both authentic and phishing URLs. Experimental results show that our model is effective, with an impressive accuracy of 97.75%, outperforming the baseline models. This study significantly improves online security by offering a novel and highly accurate phishing URL detection method.", "venue": "IAES International Journal of Artificial Intelligence", "label": 0}, {"loc": [4.5890350341796875, 0.4917769134044647], "openalex_id": "https://openalex.org/W4399485624", "title": "ChatGPT in Cyber Onslaught and Fortification: Past, Present, and Future", "authors": "Manisha Gunda, Vinay Manda, Pranay Naradasu, Sanjaykumar Mekala, Sandip Bhattacharya", "abstract": "Contemporary conversational models of language like ChatGPT have progressed tremendously in recent years. The comprehensive review includes ChatGPT's history, uses, challenges, and possible future advancements. The paper review also comprises the role of ChatGPT in cyber onslaught and cyber fortification. ChatGPT has numerous benefits, but it also has certain drawbacks. Depending on the data it is trained on, one of its challenges may be bias. Lastly, because ChatGPT is still in its early stages of development, it occasionally struggles to comprehend complex or complex commands. This paper review tackle ChatGPT's and other generative AI models' futures in their concluding remarks. They forecast that ChatGPT will become continually integrated with other technologies, like augmented reality and virtual assistants. Furthermore, they predict that ChatGPT will become more and more important in domains like scientific research and others that need sophisticated natural language processing.", "venue": "https://doi.org/10.1109/i2ct61223.2024.10543384", "label": 0}, {"loc": [9.213371276855469, -0.8650103807449341], "openalex_id": "https://openalex.org/W4394007622", "title": "From News to Summaries: Building a Hungarian Corpus for Extractive and Abstractive Summarization", "authors": "Botond Barta, Dorina Lakatos, Attila Nagy, Mil\u00e1n Konor Nyist, Judit \u00c1cs", "abstract": "Training summarization models requires substantial amounts of training data. However for less resourceful languages like Hungarian, openly available models and datasets are notably scarce. To address this gap our paper introduces HunSum-2 an open-source Hungarian corpus suitable for training abstractive and extractive summarization models. The dataset is assembled from segments of the Common Crawl corpus undergoing thorough cleaning, preprocessing and deduplication. In addition to abstractive summarization we generate sentence-level labels for extractive summarization using sentence similarity. We train baseline models for both extractive and abstractive summarization using the collected dataset. To demonstrate the effectiveness of the trained models, we perform both quantitative and qualitative evaluation. Our dataset, models and code are publicly available, encouraging replication, further research, and real-world applications across various domains.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.889166355133057, -0.9552591443061829], "openalex_id": "https://openalex.org/W4399486752", "title": "Bi-directional GRU-Based Approach for Multi-Class Text Error Identification System", "authors": "Shaik Reeha, Boda Venkata Nikith, GR Reddy, Peeta Basa Pati", "abstract": "In today's highly competitive academic and research landscape, the impression authors leave on readers is paramount to their reputation and success. Spelling errors can significantly undermine an author's credibility and negatively impact their standing within the scholarly community. To mitigate this risk, it is imperative that authors thoroughly proofread their content before submitting it to prestigious conference journals or sharing it with fellow researchers. This paper introduces a novel solution to streamline the error-checking process and empower authors to focus on other critical aspects of their projects. We propose a sophisticated text classification model, leveraging the vast capabilities of the Recurrent Neural Network (RNN) architecture, which was trained on the extensive C4 dataset sourced from the TensorFlow Library. This model exhibits exceptional proficiency in rectifying spelling mistakes across a wide range of contexts, thereby bolstering the overall quality and clarity of written work. Notably, our text classification model trained with Bi-Directional GRU method showcases a remarkable accuracy of 67% in classifying errors into four distinct categories, providing insightful feedback to authors for further improvement.", "venue": "https://doi.org/10.1109/i2ct61223.2024.10543361", "label": 0}, {"loc": [5.17363977432251, -1.614837646484375], "openalex_id": "https://openalex.org/W4399487345", "title": "Fusion of Deep Learning with Advanced and Traditional Embeddings in Sentiment Analysis", "authors": "Lavanya B. N, Anitha Rathnam K. V, K. Kiran, P. Deepa Shenoy, Venugopal K. R", "abstract": "In todays changing landscape the abundance of unprocessed data plays a vital role. It is crucial to curate this dataset and conduct a thorough analysis of the sentiments it captures. The aim of this research study is to explore learning techniques such as Bi LSTM, CNN and GRU for sentiment analysis. It also involves an in-depth examination of datasets, including restaurant reviews from platforms like Yelp and tweets related to Covid 19. The study further investigates word embedding methods ranging from Word2Vec, Keras, Glove, N-gram to more complex approaches, like Bert, Roberta, CT-Bert, FastText and Elmo. The ultimate objective is to categorize data into positive, negative or neutral sentiments by evaluating their underlying polarity. Furthermore, the study delves into the aspect of detecting sarcasm in text and aims to evaluate the level of sarcasm in different instances.", "venue": "https://doi.org/10.1109/i2ct61223.2024.10543279", "label": 0}, {"loc": [2.844538688659668, 2.994148015975952], "openalex_id": "https://openalex.org/W4393968095", "title": "A Survey of Web Content Control for Generative AI", "authors": "Michael Dinzinger, Florian He\u00df, Michael Granitzer", "abstract": "The groundbreaking advancements around generative AI have recently caused a wave of concern culminating in a row of lawsuits, including high-profile actions against Stability AI and OpenAI. This situation of legal uncertainty has sparked a broad discussion on the rights of content creators and publishers to protect their intellectual property on the web. European as well as US law already provides rough guidelines, setting a direction for technical solutions to regulate web data use. In this course, researchers and practitioners have worked on numerous web standards and opt-out formats that empower publishers to keep their data out of the development of generative AI models. The emerging AI/ML opt-out protocols are valuable in regards to data sovereignty, but again, it creates an adverse situation for a site owners who are overwhelmed by the multitude of recent ad hoc standards to consider. In our work, we want to survey the different proposals, ideas and initiatives, and provide a comprehensive legal and technical background in the context of the current discussion on web publishers control.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.54926061630249, 2.2250993251800537], "openalex_id": "https://openalex.org/W4393967864", "title": "Emergent Abilities in Reduced-Scale Generative Language Models", "authors": "Sherin Muckatira, Vijeta Deshpande, Vladislav Lialin, Anna Rumshisky", "abstract": "Large language models can solve new tasks without task-specific fine-tuning. This ability, also known as in-context learning (ICL), is considered an emergent ability and is primarily seen in large language models with billions of parameters. This study investigates if such emergent properties are strictly tied to model size or can be demonstrated by smaller models trained on reduced-scale data. To explore this, we simplify pre-training data and pre-train 36 causal language models with parameters varying from 1 million to 165 million parameters. We show that models trained on this simplified pre-training data demonstrate enhanced zero-shot capabilities across various tasks in simplified language, achieving performance comparable to that of pre-trained models six times larger on unrestricted language. This suggests that downscaling the language allows zero-shot learning capabilities to emerge in models with limited size. Additionally, we find that these smaller models pre-trained on simplified data demonstrate a power law relationship between the evaluation loss and the three scaling factors: compute, dataset size, and model size.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.776697158813477, 4.067464351654053], "openalex_id": "https://openalex.org/W4393969316", "title": "Toward Inference-optimal Mixture-of-Expert Large Language Models", "authors": "Longfei Yun, Yonghao Zhuang, Yao Fu, Eric P. Xing, Hao Zhang", "abstract": "Mixture-of-Expert (MoE) based large language models (LLMs), such as the recent Mixtral and DeepSeek-MoE, have shown great promise in scaling model size without suffering from the quadratic growth of training cost of dense transformers. Like dense models, training MoEs requires answering the same question: given a training budget, what is the optimal allocation on the model size and number of tokens? We study the scaling law of MoE-based LLMs regarding the relations between the model performance, model size, dataset size, and the expert degree. Echoing previous research studying MoE in different contexts, we observe the diminishing return of increasing the number of experts, but this seems to suggest we should scale the number of experts until saturation, as the training cost would remain constant, which is problematic during inference time. We propose to amend the scaling law of MoE by introducing inference efficiency as another metric besides the validation loss. We find that MoEs with a few (4/8) experts are the most serving efficient solution under the same performance, but costs 2.5-3.5x more in training. On the other hand, training a (16/32) expert MoE much smaller (70-85%) than the loss-optimal solution, but with a larger training dataset is a promising setup under a training budget.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.0637600421905518, 5.31512451171875], "openalex_id": "https://openalex.org/W4393932362", "title": "Real-time phishing detection using deep learning methods by extensions", "authors": "Dam Minh Linh, Duy-Hung Ha, Han Minh Chau, Q. S. Vu, Thanh\u2010Nam Tran", "abstract": "Phishing is an attack method that relies on a user\u2019s insufficient vigilance and understanding of the internet. For example, an attacker creates an online transaction website and tricks users into logging into the fake website to steal their personal information, such as credit card numbers, email addresses, phone numbers, and physical addresses. This paper proposes implementing an extension to prevent phishing for internet users. In particular, this study develops a smart warning feature for the proposed extension using deep learning models. The proposed extension installed in the web browser protects users by checking for, warning about, and preventing untrusted connections. This study evaluated and compared the performance of machine learning models using a malicious uniform resource locator (URL) dataset containing 651,191 data samples. The results of the investigation confirm that the proposed extension using a convolutional neural network (CNN) achieved a high accuracy of 98.4%.", "venue": "International Journal of Power Electronics and Drive Systems/International Journal of Electrical and Computer Engineering", "label": 0}, {"loc": [7.365262031555176, 2.661489248275757], "openalex_id": "https://openalex.org/W4393942908", "title": "Deconstructing In-Context Learning: Understanding Prompts via Corruption", "authors": "Namrata Shivagunde, Vladislav Lialin, Sherin Muckatira, Anna Rumshisky", "abstract": "The ability of large language models (LLMs) to $``$learn in context$\"$ based on the provided prompt has led to an explosive growth in their use, culminating in the proliferation of AI assistants such as ChatGPT, Claude, and Bard. These AI assistants are known to be robust to minor prompt modifications, mostly due to alignment techniques that use human feedback. In contrast, the underlying pre-trained LLMs they use as a backbone are known to be brittle in this respect. Building high-quality backbone models remains a core challenge, and a common approach to assessing their quality is to conduct few-shot evaluation. Such evaluation is notorious for being highly sensitive to minor prompt modifications, as well as the choice of specific in-context examples. Prior work has examined how modifying different elements of the prompt can affect model performance. However, these earlier studies tended to concentrate on a limited number of specific prompt attributes and often produced contradictory results. Additionally, previous research either focused on models with fewer than 15 billion parameters or exclusively examined black-box models like GPT-3 or PaLM, making replication challenging. In the present study, we decompose the entire prompt into four components: task description, demonstration inputs, labels, and inline instructions provided for each demonstration. We investigate the effects of structural and semantic corruptions of these elements on model performance. We study models ranging from 1.5B to 70B in size, using ten datasets covering classification and generation tasks. We find that repeating text within the prompt boosts model performance, and bigger models ($\\geq$30B) are more sensitive to the semantics of the prompt. Finally, we observe that adding task and inline instructions to the demonstrations enhances model performance even when the instructions are semantically corrupted.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.449358940124512, -0.16852375864982605], "openalex_id": "https://openalex.org/W4393933388", "title": "Generative AI-Based Text Generation Methods Using Pre-Trained GPT-2 Model", "authors": "Rohit Pandey, Hetvi Waghela, Sneha Rakshit, Aparna Rangari, Anjali Singh, Rahul Kumar, Ratnadeep Ghosal, Jaydip Sen", "abstract": "This work delved into the realm of automatic text generation, exploring a variety of techniques ranging from traditional deterministic approaches to more modern stochastic methods. Through analysis of greedy search, beam search, top-k sampling, top-p sampling, contrastive searching, and locally typical searching, this work has provided valuable insights into the strengths, weaknesses, and potential applications of each method. Each text-generating method is evaluated using several standard metrics and a comparative study has been made on the performance of the approaches. Finally, some future directions of research in the field of automatic text generation are also identified.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.758344650268555, 5.355199337005615], "openalex_id": "https://openalex.org/W4393924542", "title": "A Review of Multi-Modal Large Language and Vision Models", "authors": "Kilian Carolan, Laura Fennelly, Alan F. Smeaton", "abstract": "Large Language Models (LLMs) have recently emerged as a focal point of research and application, driven by their unprecedented ability to understand and generate text with human-like quality. Even more recently, LLMs have been extended into multi-modal large language models (MM-LLMs) which extends their capabilities to deal with image, video and audio information, in addition to text. This opens up applications like text-to-video generation, image captioning, text-to-speech, and more and is achieved either by retro-fitting an LLM with multi-modal capabilities, or building a MM-LLM from scratch. This paper provides an extensive review of the current state of those LLMs with multi-modal capabilities as well as the very recent MM-LLMs. It covers the historical development of LLMs especially the advances enabled by transformer-based architectures like OpenAI's GPT series and Google's BERT, as well as the role of attention mechanisms in enhancing model performance. The paper includes coverage of the major and most important of the LLMs and MM-LLMs and also covers the techniques of model tuning, including fine-tuning and prompt engineering, which tailor pre-trained models to specific tasks or domains. Ethical considerations and challenges, such as data bias and model misuse, are also analysed to underscore the importance of responsible AI development and deployment. Finally, we discuss the implications of open-source versus proprietary models in AI research. Through this review, we provide insights into the transformative potential of MM-LLMs in various applications.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.850875854492188, 0.1968388855457306], "openalex_id": "https://openalex.org/W4393967376", "title": "CLAPNQ: Cohesive Long-form Answers from Passages in Natural Questions for RAG systems", "authors": "Sara Rosenthal, Avirup Sil, Radu Florian, Salim Roukos", "abstract": "Retrieval Augmented Generation (RAG) has become a popular application for large language models. It is preferable that successful RAG systems provide accurate answers that are supported by being grounded in a passage without any hallucinations. While considerable work is required for building a full RAG pipeline, being able to benchmark performance is also necessary. We present ClapNQ, a benchmark Long-form Question Answering dataset for the full RAG pipeline. ClapNQ includes long answers with grounded gold passages from Natural Questions (NQ) and a corpus to perform either retrieval, generation, or the full RAG pipeline. The ClapNQ answers are concise, 3x smaller than the full passage, and cohesive, meaning that the answer is composed fluently, often by integrating multiple pieces of the passage that are not contiguous. RAG models must adapt to these properties to be successful at ClapNQ. We present baseline experiments and analysis for ClapNQ that highlight areas where there is still significant room for improvement in grounded RAG. CLAPNQ is publicly available at https://github.com/primeqa/clapnq", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.390084266662598, -1.3095721006393433], "openalex_id": "https://openalex.org/W4393929855", "title": "M2SA: Multimodal and Multilingual Model for Sentiment Analysis of Tweets", "authors": "Gaurish Thakkar, Sherzod Hakimov, Marko Tadi\u0107", "abstract": "In recent years, multimodal natural language processing, aimed at learning from diverse data types, has garnered significant attention. However, there needs to be more clarity when it comes to analysing multimodal tasks in multi-lingual contexts. While prior studies on sentiment analysis of tweets have predominantly focused on the English language, this paper addresses this gap by transforming an existing textual Twitter sentiment dataset into a multimodal format through a straightforward curation process. Our work opens up new avenues for sentiment-related research within the research community. Additionally, we conduct baseline experiments utilising this augmented dataset and report the findings. Notably, our evaluations reveal that when comparing unimodal and multimodal configurations, using a sentiment-tuned large language model as a text encoder performs exceptionally well.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.9377851486206055, 0.6120424270629883], "openalex_id": "https://openalex.org/W4393904987", "title": "Comparing Bad Apples to Good Oranges: Aligning Large Language Models via Joint Preference Optimization", "authors": "Hritik Bansal, Ashima Suvarna, Gantavya Bhatt, Nanyun Peng, Kai-Wei Chang, Aditya Grover", "abstract": "A common technique for aligning large language models (LLMs) relies on acquiring human preferences by comparing multiple generations conditioned on a fixed context. This method, however, relies solely on pairwise comparisons, where the generations are evaluated within an identical context. While effective to such conditional preferences often fail to encompass the nuanced and multidimensional nature of human preferences. In this work, we revisit the traditional paradigm of preference acquisition and propose a new axis based on eliciting preferences jointly over the instruction-response pairs. Unlike prior preference optimizations, which are designed for conditional ranking protocols (e.g., DPO), we propose Joint Preference Optimization (JPO), a new preference optimization objective that upweights the joint probability of the chosen instruction-response pair over the rejected instruction-response pair. Interestingly, LLMs trained with joint instruction-response preference data using JPO outperform LLM trained with DPO by $5.2\\%$ and $3.3\\%$ win-rate for summarization and open-ended dialogue datasets, respectively. Our findings reveal that joint preferences over instruction and response pairs can significantly enhance the alignment of LLMs by tapping into a broader spectrum of human preference elicitation. The data and code is available at https://github.com/Hritikbansal/dove.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.238577842712402, 0.12318413704633713], "openalex_id": "https://openalex.org/W4393726285", "title": "Bailong: Bilingual Transfer Learning based on QLoRA and Zip-tie Embedding", "authors": "Lung\u2010Chuan Chen, Zong-Ru Li", "abstract": "Large language models (LLMs) have demonstrated exceptional performance in various NLP applications. However, the majority of existing open-source LLMs are pre-trained primarily on English data and little part of other languages. This deficiency in multilingual training data results in suboptimal performance when applied to languages with fewer available resources. Furthermore, enhancing the performance of LLMs on low-resource languages by full-parameter fine-tuning with additional data requires substantial computational resources, posing computational barriers for research organizations and individual researchers. Consequently, several techniques such as parameter-efficient tuning and advanced embedding initialization have been proposed to address these challenges. In this work, we combine them to facilitate cross-lingual transfer on English-dominated open-source LLM. To effectively enhance the model's proficiency in Traditional Chinese, we conduct secondary pre-training on Llama 2 7B with Traditional Chinese data by leveraging QLoRA and our proposed zip-tie embedding initialization. The resulting model called Bailong, which stands for Bilingual trAnsfer learnIng based on qLOra and zip-tie embeddiNG. We present Bailong-instruct 7B, a fine-tuned version of Bailong 7B optimized for multi-turn dialogue scenarios. Recognizing the inadequacy of benchmark datasets in Traditional Chinese, we further introduce Bailong-bench to assess the alignment of models with human preferences and the capability to follow instructions in both Traditional Chinese and English tasks. In our evaluation, Bailong-instruct 7B exhibits competitive performance on Bailong-bench and other benchmark datasets when compared to other open-source models of similar or even larger parameter sizes. Bailong-instruct 7B and Bailong-bench are publicly available with the aim of empowering the community to build upon our efforts.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.377556800842285, 1.8725364208221436], "openalex_id": "https://openalex.org/W4393403588", "title": "HGT: Leveraging Heterogeneous Graph-enhanced Large Language Models for Few-shot Complex Table Understanding", "authors": "Rihui Jin, Li Yu, Guilin Qi, Nan Hu, Yuan-Fang Li, Jiaoyan Chen, Jianan Wang, Yongrui Chen, Dehai Min", "abstract": "Table understanding (TU) has achieved promising advancements, but it faces the challenges of the scarcity of manually labeled tables and the presence of complex table structures.To address these challenges, we propose HGT, a framework with a heterogeneous graph (HG)-enhanced large language model (LLM) to tackle few-shot TU tasks.It leverages the LLM by aligning the table semantics with the LLM's parametric knowledge through soft prompts and instruction turning and deals with complex tables by a multi-task pre-training scheme involving three novel multi-granularity self-supervised HG pre-training objectives.We empirically demonstrate the effectiveness of HGT, showing that it outperforms the SOTA for few-shot complex TU on several benchmarks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.232475280761719, 0.32115548849105835], "openalex_id": "https://openalex.org/W4393399556", "title": "IPA Transcription of Bengali Texts", "authors": "Kanij Fatema, Fazle Dawood Haider, Nirzona Ferdousi Turpa, Tanveer Azmal, Sourav Ahmed, Navid Hasan, Md. Akhlaqur Rahman, Biplab Sarkar, Afrar Jahin, Md. Rezuwan Hassan, Md Foriduzzaman Zihad, Rubayet Sabbir Faruque, Asif Shahriyar Sushmit, Mashrur Imtiaz, Farig Sadeque, Syed Shahrier Rahman", "abstract": "The International Phonetic Alphabet (IPA) serves to systematize phonemes in language, enabling precise textual representation of pronunciation. In Bengali phonology and phonetics, ongoing scholarly deliberations persist concerning the IPA standard and core Bengali phonemes. This work examines prior research, identifies current and potential issues, and suggests a framework for a Bengali IPA standard, facilitating linguistic analysis and NLP resource creation and downstream technology development. In this work, we present a comprehensive study of Bengali IPA transcription and introduce a novel IPA transcription framework incorporating a novel dataset with DL-based benchmarks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.801706552505493, 1.5689877271652222], "openalex_id": "https://openalex.org/W4393381464", "title": "Using rhetorical strategies to design prompts: a human-in-the-loop approach to make AI useful", "authors": "Nupoor Ranade, Marly Saravia, Aditya Johri", "abstract": "Abstract The growing capabilities of artificial intelligence (AI) word processing models have demonstrated exceptional potential to impact language related tasks and functions. Their fast pace of adoption and probable effect has also given rise to controversy within certain fields. Models, such as GPT-3, are a particular concern for professionals engaged in writing, particularly as their engagement with these technologies is limited due to lack of ability to control their output. Most efforts to maximize and control output rely on a process known as prompt engineering, the construction and modification of the inputted prompt with expectation for certain outputted or desired text. Consequently, prompt engineering has emerged as an important consideration for research and practice. Previous conceptions of prompt engineering have largely focused on technical and logistic modifications to the back-end processing, remaining inaccessible and, still, limited for most users. In this paper, we look to the technical communication field and its methods of text generation\u2014the rhetorical situation\u2014to conceptualize prompt engineering in a more comprehensible way for its users by considering the context and rhetoric. We introduce a framework, consisting of a formula, to prompt engineering, which demands all components of the rhetorical situation be present in the inputted prompt. We present discussions on the future of AI writing models and their use in both professional and educational settings. Ultimately, this discussion and its findings aim to provide a means of integrating agency and writer-centric methods to AI writing tools to advance a more human-in-the-loop approach. As the use of generative AI and especially NLP-based technologies become common across societal functions, the use of prompt engineering will play a crucial role not just in adoption of the technology, but also its productive and responsible use.", "venue": "AI & Society", "label": 16}, {"loc": [2.705317258834839, 1.9106435775756836], "openalex_id": "https://openalex.org/W4393548539", "title": "We have no idea what we are walking into: AI and ethical considerations", "authors": "Katherine B. Forrest", "abstract": "Abstract We are at the beginning of the beginning of the beginning of the development of AI. The ethical issues we first saw and are still grappling with have been overtaken by others, and there are yet others on the horizon.", "venue": "Annals of the New York Academy of Sciences", "label": 0}, {"loc": [6.018926620483398, 5.807651996612549], "openalex_id": "https://openalex.org/W4393336276", "title": "MagicLens: Self-Supervised Image Retrieval with Open-Ended Instructions", "authors": "Kai Zhang, Yi Luan, Hexiang Hu, Kenton Lee, Siyuan Qiao, Wenhu Chen, Yu Su, Ming\u2010Wei Chang", "abstract": "Image retrieval, i.e., finding desired images given a reference image, inherently encompasses rich, multi-faceted search intents that are difficult to capture solely using image-based measures. Recent works leverage text instructions to allow users to more freely express their search intents. However, they primarily focus on image pairs that are visually similar and/or can be characterized by a small set of pre-defined relations. The core thesis of this paper is that text instructions can enable retrieving images with richer relations beyond visual similarity. To show this, we introduce MagicLens, a series of self-supervised image retrieval models that support open-ended instructions. MagicLens is built on a key novel insight: image pairs that naturally occur on the same web pages contain a wide range of implicit relations (e.g., inside view of), and we can bring those implicit relations explicit by synthesizing instructions via foundation models. Trained on 36.7M (query image, instruction, target image) triplets with rich semantic relations mined from the web, MagicLens achieves results comparable with or better than prior best on eight benchmarks of various image retrieval tasks, while maintaining high parameter efficiency with a significantly smaller model size. Additional human analyses on a 1.4M-image unseen corpus further demonstrate the diversity of search intents supported by MagicLens. Code and models are publicly available at https://open-vision-language.github.io/MagicLens/.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.24062442779541, -0.8816229701042175], "openalex_id": "https://openalex.org/W4393313159", "title": "Construction of Text Summarization Corpus in Economics Domain and Baseline Models", "authors": "Sawittree Jumpathong, Akkharawoot Takhom, Prachya Boonkwan, Vipas Sutantayawalee, Peerachet Porkaew, Sitthaa Phaholphinyo, Charun Phrombut, Khemarath Choke-mangmi, Saran Yamasathien, Nattachai Tretasayuth, Kasidis Kanwatchara, Atiwat Aiemleuk, Thepchai Supnithi", "abstract": "Sawittree Jumpathong, Akkharawoot Takhom, Prachya Boonkwan, Vipas Sutantayawalee, Peerachet Porkaew, Sitthaa Phaholphinyo, Charun Phrombut, Khemarath Choke-mangmi, Saran Yamasathien, Nattachai Tretasayuth, Kasidis Kanwatchara, Atiwat Aiemleuk, and Thepchai Supnithi. Journal of information and communication convergence engineering 2024;22:33-43. https://doi.org/10.56977/jicce.2024.22.1.33", "venue": "Journal of information and communication convergence engineering", "label": 0}, {"loc": [5.2163825035095215, -1.5856245756149292], "openalex_id": "https://openalex.org/W4393335958", "title": "KazSAnDRA: Kazakh Sentiment Analysis Dataset of Reviews and Attitudes", "authors": "Rustem Yeshpanov, H\u00fcseyin Atakan Varol", "abstract": "This paper presents KazSAnDRA, a dataset developed for Kazakh sentiment analysis that is the first and largest publicly available dataset of its kind. KazSAnDRA comprises an extensive collection of 180,064 reviews obtained from various sources and includes numerical ratings ranging from 1 to 5, providing a quantitative representation of customer attitudes. The study also pursued the automation of Kazakh sentiment classification through the development and evaluation of four machine learning models trained for both polarity classification and score classification. Experimental analysis included evaluation of the results considering both balanced and imbalanced scenarios. The most successful model attained an F1-score of 0.81 for polarity classification and 0.39 for score classification on the test sets. The dataset and fine-tuned models are open access and available for download under the Creative Commons Attribution 4.0 International License (CC BY 4.0) through our GitHub repository.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.6991729736328125, 0.06004274636507034], "openalex_id": "https://openalex.org/W4393302842", "title": "Strategies for Corpus Development for Low\u2010Resource Languages: Insights from Nepal", "authors": "Bal Krishna Bal, Balaram Prasain, Rupak Raj Ghimire, Praveen Acharya", "abstract": "Datasets or corpora are crucial ingredients for the development of any language technology projects. However, in the majority of situations, these resources appear to be a major issue or bottleneck, especially for low-resource languages. Typically, any low-resource language lacks technological support to encode the script or language computationally. Even for those with such support, the language resources are sparsely developed and lack benchmarking mechanisms, raising the question about the validity of any research and development using those resources. Apparently, it is high time that the low-resource languages develop specific short, medium, and long-term strategies to address these issues so that they could advance research and development of language technologies for their respective languages, at least not falling too much behind, if not at par, with the high-resource languages. This chapter explores the scenario of language computing with a particular focus on the speech and machine translation domains in the context of low-resource languages in Nepal and at the same time, provides a walk-through of experiences working on Nepal's languages and looking into how these could be leveraged or extended to align with the efforts put out by other low-resource languages in the region.", "venue": "https://doi.org/10.1002/9781394214624.ch15", "label": 0}, {"loc": [6.211723327636719, 0.22881996631622314], "openalex_id": "https://openalex.org/W4393302604", "title": "CoRePooL\u2014Corpus for Resource\u2010Poor Languages: Badaga Speech Corpus", "authors": "H B Barathi Ganesh, G. Jyothish Lal, Rahul Jairam, K. P. Soman, N. S. Kamal, B. Sharmila", "abstract": "This chapter presents a corpus named CoRePooL that stands for Corpus for Resource-Poor Languages. As voice-specific human-machine interaction applications are accelerated by deep learning algorithms, the lack of resources constrains the scalability in applying to resource-poor languages. In CoRePooL version 0.1.0, we released 420 min of monolingual supervised corpus and 968 minutes of multilingual unsupervised corpus for the Badaga language from the Dravidian language family. The annotation of supervised corpus helps in performing speech-to-text, text-to-speech, translation, gender, and speaker identification. The unsupervised corpus would help self-supervised algorithms which compute latent representations. We also provided the baseline for all the tasks by fine-tuning the foundation models on the released corpus. The code, models, and data are made publicly available at https://github.com/rbg-research/CoRePooL.", "venue": "https://doi.org/10.1002/9781394214624.ch10", "label": 0}, {"loc": [3.0165157318115234, -0.45103713870048523], "openalex_id": "https://openalex.org/W4393300262", "title": "BioMedLM: A 2.7 B Parameter Language Model Trained On Biomedical Text", "authors": "Elliot Bolton, Abhinav Venigalla, Michihiro Yasunaga, David Hall, Betty Xiong, Tong Lee, Roxana Daneshjou, Jonathan Frankle, Percy Liang, Michael Carbin, Christopher D. Manning", "abstract": "Models such as GPT-4 and Med-PaLM 2 have demonstrated impressive performance on a wide variety of biomedical NLP tasks. However, these models have hundreds of billions of parameters, are computationally expensive to run, require users to send their input data over the internet, and are trained on unknown data sources. Can smaller, more targeted models compete? To address this question, we build and release BioMedLM, a 2.7 billion parameter GPT-style autoregressive model trained exclusively on PubMed abstracts and full articles. When fine-tuned, BioMedLM can produce strong multiple-choice biomedical question-answering results competitive with much larger models, such as achieving a score of 57.3% on MedMCQA (dev) and 69.0% on the MMLU Medical Genetics exam. BioMedLM can also be fine-tuned to produce useful answers to patient questions on medical topics. This demonstrates that smaller models can potentially serve as transparent, privacy-preserving, economical and environmentally friendly foundations for particular NLP applications, such as in biomedicine. The model is available on the Hugging Face Hub: https://huggingface.co/stanford-crfm/BioMedLM.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.2293853759765625, 1.1914148330688477], "openalex_id": "https://openalex.org/W4393299965", "title": "Juru: Legal Brazilian Large Language Model from Reputable Sources", "authors": "Roseval Malaquias Junior, Ramon Pires, Roseli Aparecida Francelin Romero, Rodrigo Nogueira", "abstract": "The high compute cost associated with pretraining large language models limits their research. Two strategies have emerged to address this issue: domain specialization and pretraining with high-quality data. To explore these strategies, we specialized the Mistral-7B model with 1.9 billion unique tokens from reputable Brazilian legal sources and conducted few-shot evaluations on legal and general knowledge test suites. Our model, Juru, demonstrates the benefits of domain specialization by achieving improved performance on legal benchmarks, even with a reduced amount of pretraining data. However, this domain specialization through continued pretraining comes at the cost of increased forgetting in unrelated domains, as evidenced by performance degradation on general knowledge test suites in both Portuguese and English. This study contributes to the growing body of scientific evidence showing that pretraining data selection may enhance the performance of large language models, enabling the exploration of these models at a lower cost. Juru is publicly available at https://huggingface.co/roseval/Juru-7B.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.2615678310394287, 2.4945249557495117], "openalex_id": "https://openalex.org/W4393272665", "title": "The work of art in the age of artificial intelligibility", "authors": "John McLoughlin", "abstract": "Abstract The emergence of complex deep-learning models capable of producing novel images on a practically innumerable number of subjects and in an equally wide variety of artistic styles is beginning to highlight serious inadequacies in the ethical, aesthetic, epistemological and legal frameworks we have so far used to categorise art. To begin tackling these issues and identifying a role for AI in the production and protection of human artwork, it is necessary to take a multidisciplinary approach which considers current legal precedents, the practice of software engineering, historical attitudes towards technological innovation and a sustained technical analysis of the models themselves. This paper queries the location and nature of substantive artistic work in the developmental stages of an AI-generated image, offering critiques of existing assumptions and posing questions for future research. The emergence of convincing AI creative output, artistic or literary, has significant long-term implications for the humanities, including the need for re-appraisal of foundational ideas about authorship and creativity in general. The effects of artificial intelligence, whether generalised or task-specific, cannot be ignored or displaced now that easy-access, scalable image and text production is a reality.", "venue": "AI & Society", "label": 16}, {"loc": [6.275137424468994, 5.472933769226074], "openalex_id": "https://openalex.org/W4393259344", "title": "TOMGPT: Reliable Text-Only Training Approach for Cost-Effective Multi-modal Large Language Model", "authors": "Yunkai Chen, Qimeng Wang, Shiwei Wu, Yan Gao, Tong Xu, Yao Hu", "abstract": "Multi-modal large language models (MLLMs), such as GPT-4, exhibit great comprehension capabilities on human instruction, as well as zero-shot ability on new downstream multi-modal tasks. To integrate the different modalities within a unified embedding space, previous MLLMs attempted to conduct visual instruction tuning with massive and high-quality image-text pair data, which requires substantial costs in data collection and training resources. In this article, we propose TOMGPT (Text-Only training Multi-modal GPT), a cost-effective MLLM tuned solely on easily accessible text data with much fewer resources. Along with pre-trained visual-linguistic coupled modality space (e.g., CLIP and ALIGN model), a text-only training strategy is devised to further project the aligned multi-modal latent space to that of LLM, endowing the LLM with visual comprehension capabilities in an efficient manner. Instead of enormous image-text training data required by previous MLLMs, we find that TOMGPT can be well-tuned with fewer yet diverse GPT-generated free-form text data, as we establish the semantic connection between LLM and pre-trained vision-language model. A quantitative evaluation is conducted on both MME and LVLM, which are recently released and extensively utilized MLLM benchmarks. The experiments reveal that TOMGPT achieved reliable performance compared to numerous models trained on a large amount of image-text pair data. Case studies are also presented, demonstrating TOMGPT\u2019s broad understanding and dialogue capabilities across diverse image categories.", "venue": "ACM Transactions on Knowledge Discovery from Data", "label": 41}, {"loc": [2.981980323791504, -0.7183384895324707], "openalex_id": "https://openalex.org/W4393300168", "title": "A Dataset for Pharmacovigilance in German, French, and Japanese: Annotating Adverse Drug Reactions across Languages", "authors": "Lisa Raithel, Hui-Syuan Yeh, Shuntaro Yada, Cyril Grouin, Thomas Lavergne, Aur\u00e9lie N\u00e9v\u00e9ol, Patrick Paroubek, Philippe Thomas, Tomohiro Nishiyama, Sebastian M\u00f6ller, Eiji Aramaki, Y\u016bji Matsumoto, Roland Roller, Pierre Zweigenbaum", "abstract": "User-generated data sources have gained significance in uncovering Adverse Drug Reactions (ADRs), with an increasing number of discussions occurring in the digital world. However, the existing clinical corpora predominantly revolve around scientific articles in English. This work presents a multilingual corpus of texts concerning ADRs gathered from diverse sources, including patient fora, social media, and clinical reports in German, French, and Japanese. Our corpus contains annotations covering 12 entity types, four attribute types, and 13 relation types. It contributes to the development of real-world multilingual language models for healthcare. We provide statistics to highlight certain challenges associated with the corpus and conduct preliminary experiments resulting in strong baselines for extracting entities and relations between these entities, both within and across languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.876389265060425, 2.973433017730713], "openalex_id": "https://openalex.org/W4393229390", "title": "Data Authenticity, Consent, & Provenance for AI are all broken: what will it take to fix them?", "authors": "Shayne Longpre, Robert Mahari, Naana Obeng-Marnu, William Brannon, Tobin South, Jad Kabbara, Sandy Pentland", "abstract": "New AI capabilities are owed in large part to massive, widely sourced, and underdocumented training data collections. Dubious collection practices have spurred crises in data transparency, authenticity, consent, privacy, representation, bias, copyright infringement, and the overall development of ethical and trustworthy AI systems. In response, AI regulation is emphasizing the need for training data transparency to understand AI model limitations. Based on a large-scale analysis of the AI training data landscape and existing solutions, we identify the missing infrastructure to facilitate responsible AI development practices. We explain why existing tools for data authenticity, consent, and documentation alone are unable to solve the core problems facing the AI community, and outline how policymakers, developers, and data creators can facilitate responsible AI development, through universal data provenance standards.", "venue": "https://doi.org/10.21428/e4baedd9.a650f77d", "label": 0}, {"loc": [4.450596332550049, 2.5070652961730957], "openalex_id": "https://openalex.org/W4393247867", "title": "RuBia: A Russian Language Bias Detection Dataset", "authors": "\u0412\u0435\u0440\u043e\u043d\u0438\u043a\u0430 \u042e\u0440\u044c\u0435\u0432\u043d\u0430 \u0413\u0440\u0438\u0433\u043e\u0440\u044c\u0435\u0432\u0430, \u0410\u043d\u0430\u0441\u0442\u0430\u0441\u0438\u044f \u0418\u0432\u0430\u043d\u043e\u0432\u0430, Ilseyar Alimova, Ekaterina Artemova", "abstract": "Warning: this work contains upsetting or disturbing content. Large language models (LLMs) tend to learn the social and cultural biases present in the raw pre-training data. To test if an LLM's behavior is fair, functional datasets are employed, and due to their purpose, these datasets are highly language and culture-specific. In this paper, we address a gap in the scope of multilingual bias evaluation by presenting a bias detection dataset specifically designed for the Russian language, dubbed as RuBia. The RuBia dataset is divided into 4 domains: gender, nationality, socio-economic status, and diverse, each of the domains is further divided into multiple fine-grained subdomains. Every example in the dataset consists of two sentences with the first reinforcing a potentially harmful stereotype or trope and the second contradicting it. These sentence pairs were first written by volunteers and then validated by native-speaking crowdsourcing workers. Overall, there are nearly 2,000 unique sentence pairs spread over 19 subdomains in RuBia. To illustrate the dataset's purpose, we conduct a diagnostic evaluation of state-of-the-art or near-state-of-the-art LLMs and discuss the LLMs' predisposition to social biases.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.486466884613037, 1.446663737297058], "openalex_id": "https://openalex.org/W4393284214", "title": "Ontology Completion with Natural Language Inference and Concept Embeddings: An Analysis", "authors": "Na Li, Thomas Bailleux, Zied Bouraoui, Steven Schockaert", "abstract": "We consider the problem of finding plausible knowledge that is missing from a given ontology, as a generalisation of the well-studied taxonomy expansion task. One line of work treats this task as a Natural Language Inference (NLI) problem, thus relying on the knowledge captured by language models to identify the missing knowledge. Another line of work uses concept embeddings to identify what different concepts have in common, taking inspiration from cognitive models for category based induction. These two approaches are intuitively complementary, but their effectiveness has not yet been compared. In this paper, we introduce a benchmark for evaluating ontology completion methods and thoroughly analyse the strengths and weaknesses of both approaches. We find that both approaches are indeed complementary, with hybrid strategies achieving the best overall results. We also find that the task is highly challenging for Large Language Models, even after fine-tuning.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.8236093521118164, -0.2241046130657196], "openalex_id": "https://openalex.org/W4393236373", "title": "Advancing medical imaging with language models: featuring a spotlight on ChatGPT", "authors": "Mingzhe Hu, Joshua Qian, Shaoyan Pan, Yuheng Li, Richard L. J. Qiu, Xiaofeng Yang", "abstract": "Abstract This review paper aims to serve as a comprehensive guide and instructional resource for researchers seeking to effectively implement language models in medical imaging research. First, we presented the fundamental principles and evolution of language models, dedicating particular attention to large language models. We then reviewed the current literature on how language models are being used to improve medical imaging, emphasizing a range of applications such as image captioning, report generation, report classification, findings extraction, visual question response systems, interpretable diagnosis and so on. Notably, the capabilities of ChatGPT were spotlighted for researchers to explore its further applications. Furthermore, we covered the advantageous impacts of accurate and efficient language models in medical imaging analysis, such as the enhancement of clinical workflow efficiency, reduction of diagnostic errors, and assistance of clinicians in providing timely and accurate diagnoses. Overall, our goal is to have better integration of language models with medical imaging, thereby inspiring new ideas and innovations. It is our aspiration that this review can serve as a useful resource for researchers in this field, stimulating continued investigative and innovative pursuits of the application of language models in medical imaging.", "venue": "Physics in Medicine and Biology", "label": 0}, {"loc": [7.104887962341309, 0.07771840691566467], "openalex_id": "https://openalex.org/W4393218997", "title": "Can cross-domain term extraction benefit from cross-lingual transfer and nested term labeling?", "authors": "Hanh Thi Hong Tran, Matej Martinc, Andra\u017e Repar, Nikola Ljube\u0161i\u0107, Antoine Doucet, Senja Pollak", "abstract": "Abstract Automatic term extraction (ATE) is a natural language processing task that eases the effort of manually identifying terms from domain-specific corpora by providing a list of candidate terms. In this paper, we treat ATE as a sequence-labeling task and explore the efficacy of XLMR in evaluating cross-lingual and multilingual learning against monolingual learning in the cross-domain ATE context. Additionally, we introduce NOBI, a novel annotation mechanism enabling the labeling of single-word nested terms. Our experiments are conducted on the ACTER corpus, encompassing four domains and three languages (English, French, and Dutch), as well as the RSDO5 Slovenian corpus, encompassing four additional domains. Results indicate that cross-lingual and multilingual models outperform monolingual settings, showcasing improved F1-scores for all languages within the ACTER dataset. When incorporating an additional Slovenian corpus into the training set, the multilingual model exhibits superior performance compared to state-of-the-art approaches in specific scenarios. Moreover, the newly introduced NOBI labeling mechanism enhances the classifier\u2019s capacity to extract short nested terms significantly, leading to substantial improvements in Recall for the ACTER dataset and consequentially boosting the overall F1-score performance.", "venue": "Machine Learning", "label": 0}, {"loc": [7.5736083984375, 3.5558433532714844], "openalex_id": "https://openalex.org/W4393284539", "title": "InternLM2 Technical Report", "authors": "Zheng Cai, Maosong Cao, Haojiong Chen, Kai Chen, Keyu Chen, Xin Chen, Xun Chen, Zehui Chen, Zhi Chen, Pei Chu, Xiaoyi Dong, Haodong Duan, Qi Fan, Zhaoye Fei, Yang Gao, Jiaye Ge, Chenya Gu, Yuzhe Gu, Tao Gui, A. Q. Guo, Qipeng Guo, Conghui He, Yingfan Hu, Ting Huang, Tao Jiang, Penglong Jiao, Zhenjiang Jin, Zhikai Lei, Jiaxing Li, Jingwen Li, Linyang Li, Shuaibin Li, Wei Li, Yining Li, Hongwei Liu, Jiangning Liu, Jiawei Hong, Kaiwen Liu, Kuikun Liu, Xiaoran Liu, Chengqi Lv, Haijun Lv, Kai Lv, Li Ma, Runyuan Ma, Zerun Ma, Wenchang Ning, Linke Ouyang, Jiantao Qiu, Yuan Qu, Fukai Shang, Yunfan Shao, Demin Song, Zifan Song, Zhihao Sui, Peng Sun, Yu Sun, Huanze Tang, Bin Wang, Guoteng Wang, Jiaqi Wang, Jiayu Wang, Rui Wang, Yudong Wang, Ziyi Wang, Xingjian Wei, Qizhen Weng, Fan Wu, Yingtong Xiong, Chao Xu, Ruiliang Xu, Hang Yan, Yirong Yan, Xiaogui Yang, Haochen Ye, Huaiyuan Ying, Jia Yu, Jing Yu, Yuhang Zang, Chuyu Zhang, Li Zhang, Pan Zhang, Peng Zhang, Ruijie Zhang, Shuo Zhang, Song\u2010Yang Zhang, Wenjian Zhang, Wenwei Zhang, Xingcheng Zhang, Xinyue Zhang, Hui Zhao, Qian Zhao, Xiaomeng Zhao, Fengzhe Zhou, Zaida Zhou, Jingming Zhuo, Yicheng Zou, Xipeng Qiu, Yu Qiao, Dahua Lin", "abstract": "The evolution of Large Language Models (LLMs) like ChatGPT and GPT-4 has sparked discussions on the advent of Artificial General Intelligence (AGI). However, replicating such advancements in open-source models has been challenging. This paper introduces InternLM2, an open-source LLM that outperforms its predecessors in comprehensive evaluations across 6 dimensions and 30 benchmarks, long-context modeling, and open-ended subjective evaluations through innovative pre-training and optimization techniques. The pre-training process of InternLM2 is meticulously detailed, highlighting the preparation of diverse data types including text, code, and long-context data. InternLM2 efficiently captures long-term dependencies, initially trained on 4k tokens before advancing to 32k tokens in pre-training and fine-tuning stages, exhibiting remarkable performance on the 200k ``Needle-in-a-Haystack\" test. InternLM2 is further aligned using Supervised Fine-Tuning (SFT) and a novel Conditional Online Reinforcement Learning from Human Feedback (COOL RLHF) strategy that addresses conflicting human preferences and reward hacking. By releasing InternLM2 models in different training stages and model sizes, we provide the community with insights into the model's evolution.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.6584067344665527, 1.3957988023757935], "openalex_id": "https://openalex.org/W4393222128", "title": "Generative AI from Theory to Practice: A Case Study of Financial Advice", "authors": "Andrew W. Lo, J. Perran Ross", "abstract": "We identify some of the most pressing issues facing the adoption of large language models (LLMs) in practical settings and propose a research agenda to reach the next technological inflection point in generative AI. We focus on three challenges facing most LLM applications: domain-specific expertise and the ability to tailor that expertise to a user's unique situation, trustworthiness and adherence to the user's moral and ethical standards, and conformity to regulatory guidelines and oversight.", "venue": "https://doi.org/10.21428/e4baedd9.a1f6a281", "label": 0}, {"loc": [6.272581577301025, -0.696129322052002], "openalex_id": "https://openalex.org/W4393213422", "title": "NSINA: A News Corpus for Sinhala", "authors": "Hansi Hettiarachchi, Damith Premasiri, Lasitha Uyangodage, Tharindu Ranasinghe", "abstract": "The introduction of large language models (LLMs) has advanced natural language processing (NLP), but their effectiveness is largely dependent on pre-training resources. This is especially evident in low-resource languages, such as Sinhala, which face two primary challenges: the lack of substantial training data and limited benchmarking datasets. In response, this study introduces NSINA, a comprehensive news corpus of over 500,000 articles from popular Sinhala news websites, along with three NLP tasks: news media identification, news category prediction, and news headline generation. The release of NSINA aims to provide a solution to challenges in adapting LLMs to Sinhala, offering valuable resources and benchmarks for improving NLP in the Sinhala language. NSINA is the largest news corpus for Sinhala, available up to date.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.358578681945801, 2.314361810684204], "openalex_id": "https://openalex.org/W4393212671", "title": "Understanding Emergent Abilities of Language Models from the Loss Perspective", "authors": "Zhengxiao Du, Aohan Zeng, Yuxiao Dong, Jie Tang", "abstract": "Recent studies have put into question the belief that emergent abilities in language models are exclusive to large models. This skepticism arises from two observations: 1) smaller models can also exhibit high performance on emergent abilities and 2) there is doubt on the discontinuous metrics used to measure these abilities. In this paper, we propose to study emergent abilities in the lens of pre-training loss, instead of model size or training compute. We demonstrate that the Transformer models with the same pre-training loss, but different model and data sizes, generate the same performance on various downstream tasks, with a fixed data corpus, tokenization, and model architecture. We also discover that a model exhibits emergent abilities on certain tasks -- regardless of the continuity of metrics -- when its pre-training loss falls below a specific threshold. Before reaching this threshold, its performance remains at the level of random guessing. This inspires us to redefine emergent abilities as those that manifest in models with lower pre-training losses, highlighting that these abilities cannot be predicted by merely extrapolating the performance trends of models with higher pre-training losses.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.80436372756958, 2.527294158935547], "openalex_id": "https://openalex.org/W4393212768", "title": "Leveraging Zero-Shot Prompting for Efficient Language Model Distillation", "authors": "Lukas V\u00f6ge, Vincent Gurgul, Stefan Lessmann", "abstract": "This paper introduces a novel approach for efficiently distilling LLMs into smaller, application-specific models, significantly reducing operational costs and manual labor. Addressing the challenge of deploying computationally intensive LLMs in specific applications or edge devices, this technique utilizes LLMs' reasoning capabilities to generate labels and natural language rationales for unlabeled data. Our approach enhances both finetuning and distillation by employing a multi-task training framework where student models mimic these rationales alongside teacher predictions. Key contributions include the employment of zero-shot prompting to elicit teacher model rationales, reducing the necessity for handcrafted few-shot examples and lowering the overall token count required, which directly translates to cost savings given the pay-per-token billing model of major tech companies' LLM APIs. Additionally, the paper investigates the impact of explanation properties on distillation efficiency, demonstrating that minimal performance loss occurs even when rationale augmentation is not applied across the entire dataset, facilitating further reductions of tokens. This research marks a step toward the efficient training of task-specific models with minimal human intervention, offering substantial cost-savings while maintaining, or even enhancing, performance.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.857412099838257, 4.024354457855225], "openalex_id": "https://openalex.org/W4393212993", "title": "A Little Leak Will Sink a Great Ship: Survey of Transparency for Large Language Models from Start to Finish", "authors": "Masahiro Kaneko, Timothy Baldwin", "abstract": "Large Language Models (LLMs) are trained on massive web-crawled corpora. This poses risks of leakage, including personal information, copyrighted texts, and benchmark datasets. Such leakage leads to undermining human trust in AI due to potential unauthorized generation of content or overestimation of performance. We establish the following three criteria concerning the leakage issues: (1) leakage rate: the proportion of leaked data in training data, (2) output rate: the ease of generating leaked data, and (3) detection rate: the detection performance of leaked versus non-leaked data. Despite the leakage rate being the origin of data leakage issues, it is not understood how it affects the output rate and detection rate. In this paper, we conduct an experimental survey to elucidate the relationship between the leakage rate and both the output rate and detection rate for personal information, copyrighted texts, and benchmark data. Additionally, we propose a self-detection approach that uses few-shot learning in which LLMs detect whether instances are present or absent in their training data, in contrast to previous methods that do not employ explicit learning. To explore the ease of generating leaked information, we create a dataset of prompts designed to elicit personal information, copyrighted text, and benchmarks from LLMs. Our experiments reveal that LLMs produce leaked information in most cases despite less such data in their training set. This indicates even small amounts of leaked data can greatly affect outputs. Our self-detection method showed superior performance compared to existing detection methods.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.4201860427856445, 0.1375526785850525], "openalex_id": "https://openalex.org/W4393213901", "title": "Evaluating Shortest Edit Script Methods for Contextual Lemmatization", "authors": "Olia Toporkov, Rodrigo Agerri", "abstract": "Modern contextual lemmatizers often rely on automatically induced Shortest Edit Scripts (SES), namely, the number of edit operations to transform a word form into its lemma. In fact, different methods of computing SES have been proposed as an integral component in the architecture of several state-of-the-art contextual lemmatizers currently available. However, previous work has not investigated the direct impact of SES in the final lemmatization performance. In this paper we address this issue by focusing on lemmatization as a token classification task where the only input that the model receives is the word-label pairs in context, where the labels correspond to previously induced SES. Thus, by modifying in our lemmatization system only the SES labels that the model needs to learn, we may then objectively conclude which SES representation produces the best lemmatization results. We experiment with seven languages of different morphological complexity, namely, English, Spanish, Basque, Russian, Czech, Turkish and Polish, using multilingual and language-specific pre-trained masked language encoder-only models as a backbone to build our lemmatizers. Comprehensive experimental results, both in- and out-of-domain, indicate that computing the casing and edit operations separately is beneficial overall, but much more clearly for languages with high-inflected morphology. Notably, multilingual pre-trained language models consistently outperform their language-specific counterparts in every evaluation setting.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.529860019683838, -1.0369304418563843], "openalex_id": "https://openalex.org/W4393213597", "title": "Synthetic Data Generation and Joint Learning for Robust Code-Mixed Translation", "authors": "Kartik Kartik, Sanjana Soni, Anoop Kunchukuttan, Tanmoy Chakraborty, Md Shad Akhtar", "abstract": "The widespread online communication in a modern multilingual world has provided opportunities to blend more than one language (aka code-mixed language) in a single utterance. This has resulted a formidable challenge for the computational models due to the scarcity of annotated data and presence of noise. A potential solution to mitigate the data scarcity problem in low-resource setup is to leverage existing data in resource-rich language through translation. In this paper, we tackle the problem of code-mixed (Hinglish and Bengalish) to English machine translation. First, we synthetically develop HINMIX, a parallel corpus of Hinglish to English, with ~4.2M sentence pairs. Subsequently, we propose RCMT, a robust perturbation based joint-training model that learns to handle noise in the real-world code-mixed text by parameter sharing across clean and noisy words. Further, we show the adaptability of RCMT in a zero-shot setup for Bengalish to English translation. Our evaluation and comprehensive analyses qualitatively and quantitatively demonstrate the superiority of RCMT over state-of-the-art code-mixed and robust translation methods.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.653698921203613, 0.5461111664772034], "openalex_id": "https://openalex.org/W4393247683", "title": "RU22Fact: Optimizing Evidence for Multilingual Explainable Fact-Checking on Russia-Ukraine Conflict", "authors": "Yirong Zeng, Xiao Ding, Yi Zhao, Xiangyu Li, Jie Zhang, Chao Yao, Ting Liu, Bing Qin", "abstract": "Fact-checking is the task of verifying the factuality of a given claim by examining the available evidence. High-quality evidence plays a vital role in enhancing fact-checking systems and facilitating the generation of explanations that are understandable to humans. However, the provision of both sufficient and relevant evidence for explainable fact-checking systems poses a challenge. To tackle this challenge, we propose a method based on a Large Language Model to automatically retrieve and summarize evidence from the Web. Furthermore, we construct RU22Fact, a novel multilingual explainable fact-checking dataset on the Russia-Ukraine conflict in 2022 of 16K samples, each containing real-world claims, optimized evidence, and referenced explanation. To establish a baseline for our dataset, we also develop an end-to-end explainable fact-checking system to verify claims and generate explanations. Experimental results demonstrate the prospect of optimized evidence in increasing fact-checking performance and also indicate the possibility of further progress in the end-to-end claim verification and explanation generation tasks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.7143425941467285, 0.4966655969619751], "openalex_id": "https://openalex.org/W4393145549", "title": "Can ChatGPT learn Chinese or Swahili? Considering how large language models might act differently if trained in different languages.", "authors": "Neil Savage", "abstract": "Considering how large language models might act differently if trained in different languages.", "venue": "Communications of the ACM", "label": 0}, {"loc": [8.235876083374023, 0.4300491511821747], "openalex_id": "https://openalex.org/W4393147129", "title": "Benchmarking Large Language Models", "authors": "Jiawei Chen, Hongyu Lin, Xianpei Han, Le Sun", "abstract": "Retrieval-Augmented Generation (RAG) is a promising approach for mitigating the hallucination of large language models (LLMs). However, existing research lacks rigorous evaluation of the impact of retrieval-augmented generation on different large language models, which make it challenging to identify the potential bottlenecks in the capabilities of RAG for different LLMs. In this paper, we systematically investigate the impact of Retrieval-Augmented Generation on large language models. We analyze the performance of different large language models in 4 fundamental abilities required for RAG, including noise robustness, negative rejection, information integration, and counterfactual robustness. To this end, we establish Retrieval-Augmented Generation Benchmark (RGB), a new corpus for RAG evaluation in both English and Chinese. RGB divides the instances within the benchmark into 4 separate testbeds based on the aforementioned fundamental abilities required to resolve the case. Then we evaluate 6 representative LLMs on RGB to diagnose the challenges of current LLMs when applying RAG. Evaluation reveals that while LLMs exhibit a certain degree of noise robustness, they still struggle significantly in terms of negative rejection, information integration, and dealing with false information. The aforementioned assessment outcomes indicate that there is still a considerable journey ahead to effectively apply RAG to LLMs.", "venue": "Proceedings of the AAAI Conference on Artificial Intelligence", "label": 5}, {"loc": [3.765639305114746, -3.9402670860290527], "openalex_id": "https://openalex.org/W4393147192", "title": "Spanning the Spectrum of Hatred Detection: A Persian Multi-Label Hate Speech Dataset with Annotator Rationales", "authors": "Zahra Delbari, Nafise Sadat Moosavi, Mohammad Taher Pilehvar", "abstract": "With the alarming rise of hate speech in online communities, the demand for effective NLP models to identify instances of offensive language has reached a critical point. However, the development of such models heavily relies on the availability of annotated datasets, which are scarce, particularly for less-studied languages. To bridge this gap for the Persian language, we present a novel dataset specifically tailored to multi-label hate speech detection. Our dataset, called Phate, consists of an extensive collection of over seven thousand manually-annotated Persian tweets, offering a rich resource for training and evaluating hate speech detection models on this language. Notably, each annotation in our dataset specifies the targeted group of hate speech and includes a span of the tweet which elucidates the rationale behind the assigned label. The incorporation of these information expands the potential applications of our dataset, facilitating the detection of targeted online harm or allowing the benchmark to serve research on interpretability of hate speech detection models. The dataset, annotation guideline, and all associated codes are accessible at https://github.com/Zahra-D/Phate.", "venue": "Proceedings of the AAAI Conference on Artificial Intelligence", "label": 5}, {"loc": [2.947498321533203, 0.023186219856142998], "openalex_id": "https://openalex.org/W4393128858", "title": "Harnessing Artificial Intelligence in Bariatric Surgery: Comparative Analysis of ChatGPT-4, Bing, and Bard in Generating Clinician-Level Bariatric Surgery \u2026", "authors": "Yung Lee, Thomas H. Shin, L\u00e9a Tessier, Arshia P. Javidan, James J. Jung, Dennis Hong, Andrew T. Strong, Tyler McKechnie, Sarah Malone, David Jin, Matthew Kroh, Jerry T. Dang", "abstract": "LLM-based AI chat models can effectively generate appropriate responses to clinical questions related to bariatric surgery, though the performance of different models can vary greatly. Therefore, caution should be taken when interpreting clinical information provided by LLMs, and clinician oversight is necessary to ensure accuracy. Future investigation is warranted to explore how LLMs might enhance healthcare provision and clinical decision-making in bariatric surgery.", "venue": "Surgery for Obesity and Related Diseases", "label": 0}, {"loc": [4.4830474853515625, 0.4105789363384247], "openalex_id": "https://openalex.org/W4393165528", "title": "Exploring the competence of ChatGPT for customer and patient service management", "authors": "Abid Haleem, Mohd Javaid, Ravi Pratap Singh", "abstract": "The modern language generation model ChatGPT, created by Open Artificial Intelligence (AI), is recognised for its capacity to comprehend context and produce pertinent content. This model is built on the transformer architecture, which enables it to process massive volumes of data and produce text that is both cohesive and illuminating. Service is a crucial component everywhere as it provides the basis for establishing client rapport and offering aid and support. In healthcare, the application of ChatGPT for patient service support has been one of the most significant advances in recent years. ChatGPT can help overcome language obstacles and improve patient satisfaction by facilitating communication with healthcare personnel and understanding of care. It can assist in enhancing the entire patient experience by offering personalised information and support to patients and making it more straightforward for them to communicate with healthcare professionals. Its goal can be to expedite and streamline service by promptly and accurately responding to customers. Businesses of all sizes increasingly use ChatGPT since it allows them to provide 24/7 customer support without requiring human contact. This paper briefly discusses ChatGPT and the need for better services. Various perspectives on improving customer and patient services through ChatGPT are discussed. The article also discussed the major key enablers of ChatGPT for refining customer and patient assistance. Further, the paper identifies and discusses the critical application areas of ChatGPT for customer and patient service. With its ability to handle several requests simultaneously, respond quickly and accurately to client questions, and gain knowledge from every interaction, ChatGPT is revolutionising customer and patient service. Its accessibility and compatibility with various communication channels make it a desirable solution for businesses looking to improve support. As technology advances, ChatGPT is positioned to become an essential tool for businesses wishing to provide speedy and customised service. Although ChatGPT may give convincing solutions, the chance of providing accurate and updated information poses a problem for its usage in service jobs that need accurate and up-to-date information. In future, various services will become better and more efficient due to ChatGPT and AI.", "venue": "Intelligent Pharmacy", "label": 0}, {"loc": [7.734462738037109, 4.05584192276001], "openalex_id": "https://openalex.org/W4392781399", "title": "Branch-Train-MiX: Mixing Expert LLMs into a Mixture-of-Experts LLM", "authors": "Sainbayar Sukhbaatar, Olga Golovneva, Vasu Sharma, Hu Xu, Xi Lin, Baptiste Rozi\u00e8re, Jacob Kahn, Daniel Li, Wen-tau Yih, Jason Weston, Xian Li", "abstract": "We investigate efficient methods for training Large Language Models (LLMs) to possess capabilities in multiple specialized domains, such as coding, math reasoning and world knowledge. Our method, named Branch-Train-MiX (BTX), starts from a seed model, which is branched to train experts in embarrassingly parallel fashion with high throughput and reduced communication cost. After individual experts are asynchronously trained, BTX brings together their feedforward parameters as experts in Mixture-of-Expert (MoE) layers and averages the remaining parameters, followed by an MoE-finetuning stage to learn token-level routing. BTX generalizes two special cases, the Branch-Train-Merge method, which does not have the MoE finetuning stage to learn routing, and sparse upcycling, which omits the stage of training experts asynchronously. Compared to alternative approaches, BTX achieves the best accuracy-efficiency tradeoff.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.764543533325195, -0.6359862089157104], "openalex_id": "https://openalex.org/W4393117029", "title": "A New Massive Multilingual Dataset for High-Performance Language Technologies", "authors": "Ona De Gibert, Graeme Nail, Nikolay Arefyev, Marta Ba\u00f1\u00f3n, Jelmer van der Linde, Shaoxiong Ji, Jaume Zaragoza-Bernabeu, Mikko Aulamo, Gema Ram\u00edrez-S\u00e1nchez, Andrey Kutuzov, Sampo Pyysalo, Stephan Oepen, J\u00f6rg Tiedemann", "abstract": "We present the HPLT (High Performance Language Technologies) language resources, a new massive multilingual dataset including both monolingual and bilingual corpora extracted from CommonCrawl and previously unused web crawls from the Internet Archive. We describe our methods for data acquisition, management and processing of large corpora, which rely on open-source software tools and high-performance computing. Our monolingual collection focuses on low- to medium-resourced languages and covers 75 languages and a total of ~5.6 trillion word tokens de-duplicated on the document level. Our English-centric parallel corpus is derived from its monolingual counterpart and covers 18 language pairs and more than 96 million aligned sentence pairs with roughly 1.4 billion English tokens. The HPLT language resources are one of the largest open text corpora ever released, providing a great resource for language modeling and machine translation training. We publicly release the corpora, the software, and the tools used in this work.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.7097249031066895, 4.051789283752441], "openalex_id": "https://openalex.org/W4392781137", "title": "Harder Tasks Need More Experts: Dynamic Routing in MoE Models", "authors": "Quzhe Huang, Zhenwei An, Nan Zhuang, Mingxu Tao, Chen Zhang, Jin Yang, Kun Xu, Kun Xu, Liwei Chen, Songfang Huang, Yansong Feng", "abstract": "In this paper, we introduce a novel dynamic expert selection framework for Mixture of Experts (MoE) models, aiming to enhance computational efficiency and model performance by adjusting the number of activated experts based on input difficulty. Unlike traditional MoE approaches that rely on fixed Top-K routing, which activates a predetermined number of experts regardless of the input's complexity, our method dynamically selects experts based on the confidence level in expert selection for each input. This allows for a more efficient utilization of computational resources, activating more experts for complex tasks requiring advanced reasoning and fewer for simpler tasks. Through extensive evaluations, our dynamic routing method demonstrates substantial improvements over conventional Top-2 routing across various benchmarks, achieving an average improvement of 0.7% with less than 90% activated parameters. Further analysis shows our model dispatches more experts to tasks requiring complex reasoning skills, like BBH, confirming its ability to dynamically allocate computational resources in alignment with the input's complexity. Our findings also highlight a variation in the number of experts needed across different layers of the transformer model, offering insights into the potential for designing heterogeneous MoE frameworks. The code and models are available at https://github.com/ZhenweiAn/Dynamic_MoE.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.725515365600586, 1.7608264684677124], "openalex_id": "https://openalex.org/W4392781216", "title": "Enabling self-identification in intelligent agent: insights from computational psychoanalysis", "authors": "Lingyu Li, Chunbo Li", "abstract": "Building upon prior framework of computational Lacanian psychoanalysis with the theory of active inference, this paper aims to further explore the concept of self-identification and its potential applications. Beginning with two classic paradigms in psychology, mirror self-recognition and rubber hand illusion, we suggest that imaginary identification is characterized by an integrated body schema with minimal free energy. Next, we briefly survey three dimensions of symbolic identification (sociological, psychoanalytic, and linguistical) and corresponding active inference accounts. To provide intuition, we respectively employ a convolutional neural network (CNN) and a multi-layer perceptron (MLP) supervised by ChatGPT to showcase optimization of free energy during motor skill and language mastery underlying identification formation. We then introduce Lacan's Graph II of desire, unifying imaginary and symbolic identification, and propose an illustrative model called FreeAgent. In concluding remarks, we discuss some key issues in the potential of computational Lacanian psychoanalysis to advance mental health and artificial intelligence, including digital twin mind, large language models as avatars of the Lacanian Other, and the feasibility of human-level artificial general intelligence with self-awareness in the context of post-structuralism.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.187417030334473, 4.955315113067627], "openalex_id": "https://openalex.org/W4393064060", "title": "Towards Better Statistical Understanding of Watermarking LLMs", "authors": "Zhongze Cai, S Liu, Hanzhao Wang, Huaiyang Zhong, Xiaocheng Li", "abstract": "In this paper, we study the problem of watermarking large language models (LLMs). We consider the trade-off between model distortion and detection ability and formulate it as a constrained optimization problem based on the green-red algorithm of Kirchenbauer et al. (2023a). We show that the optimal solution to the optimization problem enjoys a nice analytical property which provides a better understanding and inspires the algorithm design for the watermarking process. We develop an online dual gradient ascent watermarking algorithm in light of this optimization formulation and prove its asymptotic Pareto optimality between model distortion and detection ability. Such a result guarantees an averaged increased green list probability and henceforth detection ability explicitly (in contrast to previous results). Moreover, we provide a systematic discussion on the choice of the model distortion metrics for the watermarking problem. We justify our choice of KL divergence and present issues with the existing criteria of ``distortion-free'' and perplexity. Finally, we empirically evaluate our algorithms on extensive datasets against benchmark algorithms.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.483218669891357, 2.4301950931549072], "openalex_id": "https://openalex.org/W4393063685", "title": "How Gender Interacts with Political Values: A Case Study on Czech BERT Models", "authors": "Adnan Ali, Jind\u0159ich Libovick\u00fd", "abstract": "Neural language models, which reach state-of-the-art results on most natural language processing tasks, are trained on large text corpora that inevitably contain value-burdened content and often capture undesirable biases, which the models reflect. This case study focuses on the political biases of pre-trained encoders in Czech and compares them with a representative value survey. Because Czech is a gendered language, we also measure how the grammatical gender coincides with responses to men and women in the survey. We introduce a novel method for measuring the model's perceived political values. We find that the models do not assign statement probability following value-driven reasoning, and there is no systematic difference between feminine and masculine sentences. We conclude that BERT-sized models do not manifest systematic alignment with political values and that the biases observed in the models are rather due to superficial imitation of training data patterns than systematic value beliefs encoded in the models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.810230255126953, 1.5564614534378052], "openalex_id": "https://openalex.org/W4396767052", "title": "A Comparative Analysis of Text Embedding Models for Bug Report Semantic Similarity", "authors": "Avinash Patil, Kihwan Han, Aryan Jadon", "abstract": "Bug reports are an essential aspect of software development, and it is crucial to identify and resolve them quickly to ensure the consistent functioning of software systems. Retrieving similar bug reports from an existing database can help reduce the time and effort required to resolve bugs. In this paper, we compared the effectiveness of semantic textual similarity methods for retrieving similar bug reports based on a similarity score. We explored several embedding models such as TF-IDF (Baseline), FastText, Gensim, BERT, and ADA. We used the Software Defects Data containing bug reports for various software projects to evaluate the performance of these models. Our experimental results showed that BERT generally outperformed the rest of the models regarding recall, followed by ADA, Gensim, FastText, and TFIDF. Our study provides insights into the effectiveness of different embedding methods for retrieving similar bug reports and highlights the impact of selecting the appropriate one for this task. Our code is available on GitHub.", "venue": "https://doi.org/10.1109/spin60856.2024.10512000", "label": 0}, {"loc": [3.767005443572998, -3.9381892681121826], "openalex_id": "https://openalex.org/W4393054218", "title": "Multi-class hate speech detection in the Norwegian language using FAST-RNN and multilingual fine-tuned transformers", "authors": "Ehtesham Hashmi, Sule Yildirim Yayilgan", "abstract": "Abstract The growth of social networks has provided a platform for individuals with prejudiced views, allowing them to spread hate speech and target others based on their gender, ethnicity, religion, or sexual orientation. While positive interactions within diverse communities can considerably enhance confidence, it is critical to recognize that negative comments can hurt people\u2019s reputations and well-being. This emergence emphasizes the need for more diligent monitoring and robust policies on these platforms to protect individuals from such discriminatory and harmful behavior. Hate speech is often characterized as an intentional act of aggression directed at a specific group, typically meant to harm or marginalize them based on certain aspects of their identity. Most of the research related to hate speech has been conducted in resource-aware languages like English, Spanish, and French. However, low-resource European languages, such as Irish, Norwegian, Portuguese, Polish, Slovak, and many South Asian, present challenges due to limited linguistic resources, making information extraction labor-intensive. In this study, we present deep neural networks with FastText word embeddings using regularization methods for multi-class hate speech detection in the Norwegian language, along with the implementation of multilingual transformer-based models with hyperparameter tuning and generative configuration. FastText outperformed other deep learning models when stacked with Bidirectional LSTM and GRU, resulting in the FAST-RNN model. In the concluding phase, we compare our results with the state-of-the-art and perform interpretability modeling using Local Interpretable Model-Agnostic Explanations to achieve a more comprehensive understanding of the model\u2019s decision-making mechanisms.", "venue": "Complex & Intelligent Systems", "label": 36}, {"loc": [4.92792272567749, 0.33573147654533386], "openalex_id": "https://openalex.org/W4393039272", "title": "Automatic Coding of Contingency in Child-Caregiver Conversations", "authors": "Abhishek Agrawal, Mitja Nikolaus, Beno\u00eet Favre, Abdellah Fourtassi", "abstract": "One of the most important communicative skills children have to learn is to engage in meaningful conversations with people around them. At the heart of this learning lies the mastery of contingency, i.e., the ability to contribute to an ongoing exchange in a relevant fashion (e.g., by staying on topic). Current research on this question relies on the manual annotation of a small sample of children, which limits our ability to draw general conclusions about development. Here, we propose to mitigate the limitations of manual labor by relying on automatic tools for contingency judgment in children's early natural interactions with caregivers. Drawing inspiration from the field of dialogue systems evaluation, we built and compared several automatic classifiers. We found that a Transformer-based pre-trained language model \u2014 when fine-tuned on a relatively small set of data we annotated manually (around 3,500 turns) \u2014 provided the best predictions. We used this model to automatically annotate, new and large-scale data, almost two orders of magnitude larger than our fine-tuning set. It was able to replicate existing results and generate new data-driven hypotheses. The broad impact of the work is to provide resources that can help the language development community study communicative development at scale, leading to more robust theories.", "venue": "https://doi.org/10.31234/osf.io/hwnms", "label": 0}, {"loc": [6.884668827056885, 0.34456855058670044], "openalex_id": "https://openalex.org/W4393063826", "title": "EthioLLM: Multilingual Large Language Models for Ethiopian Languages with Task Evaluation", "authors": "Atnafu Lambebo Tonja, Israel Abebe Azime, Tadesse Destaw Belay, Mesay Gemeda Yigezu, Moges Ahmed Mehamed, Abinew Ali Ayele, Ebrahim Chekol Jibril, Michael Melese Woldeyohannis, Olga Kolesnikova, Philipp Slusallek, Dietrich Klakow, Shengwu Xiong, Seid Muhie Yimam", "abstract": "Large language models (LLMs) have gained popularity recently due to their outstanding performance in various downstream Natural Language Processing (NLP) tasks. However, low-resource languages are still lagging behind current state-of-the-art (SOTA) developments in the field of NLP due to insufficient resources to train LLMs. Ethiopian languages exhibit remarkable linguistic diversity, encompassing a wide array of scripts, and are imbued with profound religious and cultural significance. This paper introduces EthioLLM -- multilingual large language models for five Ethiopian languages (Amharic, Ge'ez, Afan Oromo, Somali, and Tigrinya) and English, and Ethiobenchmark -- a new benchmark dataset for various downstream NLP tasks. We evaluate the performance of these models across five downstream NLP tasks. We open-source our multilingual language models, new benchmark datasets for various downstream tasks, and task-specific fine-tuned language models and discuss the performance of the models. Our dataset and models are available at the https://huggingface.co/EthioNLP repository.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.115104675292969, -0.6750962734222412], "openalex_id": "https://openalex.org/W4399314920", "title": "Feature-augmented model for multilingual discourse relation classification", "authors": "Eleni Metheniti, Chlo\u00e9 Braud, Philippe M\u00fcller", "abstract": "International audience", "venue": "HAL (Le Centre pour la Communication Scientifique Directe)", "label": 6}, {"loc": [7.0208420753479, -1.0370774269104004], "openalex_id": "https://openalex.org/W4393069008", "title": "Grammatical versus Spelling Error Correction: An Investigation into the Responsiveness of Transformer-Based Language Models Using BART and MarianMT", "authors": "Rohit Raju, Peeta Basa Pati, SA Gandheesh, Gayatri Sanjana Sannala, KS Suriya", "abstract": "Text continues to remain a relevant form of representation for information. Text documents are created either in digital native platforms or through the conversion of other media files such as images and speech. While the digital native text is invariably obtained through physical or virtual keyboards, technologies such as OCR and speech recognition are utilised to transform the images and speech signals into text content. All these variety of mechanisms of text generation also introduce errors into the captured text. This project aims at analysing different kinds of errors that occur in text documents. The work employs two of the advanced deep neural network-based language models, namely, BART and MarianMT, to rectify the anomalies present in the text. Transfer learning of these models with available dataset is performed to finetune their capacity for error correction. A comparative study is conducted to investigate the effectiveness of these models in handling each of the defined error categories. It is observed that while both models can bring down the erroneous sentences by 20+%, BART can handle spelling errors far better (24.6%) than grammatical errors (8.8%).", "venue": "Journal of Information & Knowledge Management", "label": 0}, {"loc": [7.733582496643066, 1.3307082653045654], "openalex_id": "https://openalex.org/W4393027756", "title": "Dated Data: Tracing Knowledge Cutoffs in Large Language Models", "authors": "Jeffrey Cheng, Marc Marone, Orion Weller, Dawn Lawrie, Daniel Khashabi, Benjamin Van Durme", "abstract": "Released Large Language Models (LLMs) are often paired with a claimed knowledge cutoff date, or the dates at which training data was gathered. Such information is crucial for applications where the LLM must provide up to date information. However, this statement only scratches the surface: do all resources in the training data share the same knowledge cutoff date? Does the model's demonstrated knowledge for these subsets closely align to their cutoff dates? In this work, we define the notion of an effective cutoff. This is distinct from the LLM designer reported cutoff and applies separately to sub-resources and topics. We propose a simple approach to estimate effective cutoffs on the resource-level temporal alignment of an LLM by probing across versions of the data. Using this analysis, we find that effective cutoffs often differ from reported cutoffs. To understand the root cause of this observation, we conduct a direct large-scale analysis on open pre-training datasets. Our analysis reveals two reasons for these inconsistencies: (1) temporal biases of CommonCrawl data due to non-trivial amounts of old data in new dumps and (2) complications in LLM deduplication schemes involving semantic duplicates and lexical near-duplicates. Overall, our results show that knowledge cutoffs are not as simple as they have seemed and that care must be taken both by LLM dataset curators as well as practitioners who seek to use information from these models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.182890892028809, -0.4708060026168823], "openalex_id": "https://openalex.org/W4393027528", "title": "CLASSLA-web: Comparable Web Corpora of South Slavic Languages Enriched with Linguistic and Genre Annotation", "authors": "Nikola Ljube\u0161i\u0107, Taja Kuzman", "abstract": "This paper presents a collection of highly comparable web corpora of Slovenian, Croatian, Bosnian, Montenegrin, Serbian, Macedonian, and Bulgarian, covering thereby the whole spectrum of official languages in the South Slavic language space. The collection of these corpora comprises a total of 13 billion tokens of texts from 26 million documents. The comparability of the corpora is ensured by a comparable crawling setup and the usage of identical crawling and post-processing technology. All the corpora were linguistically annotated with the state-of-the-art CLASSLA-Stanza linguistic processing pipeline, and enriched with document-level genre information via the Transformer-based multilingual X-GENRE classifier, which further enhances comparability at the level of linguistic annotation and metadata enrichment. The genre-focused analysis of the resulting corpora shows a rather consistent distribution of genres throughout the seven corpora, with variations in the most prominent genre categories being well-explained by the economic strength of each language community. A comparison of the distribution of genre categories across the corpora indicates that web corpora from less developed countries primarily consist of news articles. Conversely, web corpora from economically more developed countries exhibit a smaller proportion of news content, with a greater presence of promotional and opinionated texts.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.214534759521484, 1.2369216680526733], "openalex_id": "https://openalex.org/W4394995413", "title": "Outlier Detection in Serbian CommonCrawl Data", "authors": "Vladimir Kalu\u0161ev, Dubravko \u0106ulibrk", "abstract": "The surge in large language models (LLMs) has greatly advanced natural language processing. However, their development is often hampered by the limited availability and quality of training datasets, particularly for underrepresented languages. Our study aims to address this by enriching Serbian language resources, a crucial step in enhancing LLMs. A key aspect of our methodology is the use of 12 specialized metrics, which allow us to rigorously evaluate the dataset's integrity and reveal linguistic trends in the Serbian digital context. Additionally, we employ a modified Interquartile Range (IQR) method for more stringent outlier detection, aiding in the identification of linguistic anomalies in Serbian text.", "venue": "https://doi.org/10.1109/infoteh60418.2024.10495987", "label": 0}, {"loc": [7.006117343902588, 3.3273918628692627], "openalex_id": "https://openalex.org/W4393009733", "title": "Optimization of news dissemination push mode by intelligent edge computing technology for deep learning", "authors": "JiLe DeGe, Sina Sang", "abstract": "Abstract The Internet era is an era of information explosion. By 2022, the global Internet users have reached more than 4 billion, and the social media users have exceeded 3 billion. People face a lot of news content every day, and it is almost impossible to get interesting information by browsing all the news content. Under this background, personalized news recommendation technology has been widely used, but it still needs to be further optimized and improved. In order to better push the news content of interest to different readers, users' satisfaction with major news websites should be further improved. This study proposes a new recommendation algorithm based on deep learning and reinforcement learning. Firstly, the RL algorithm is introduced based on deep learning. Deep learning is excellent in processing large-scale data and complex pattern recognition, but it often faces the challenge of low sample efficiency when it comes to complex decision-making and sequential tasks. While reinforcement learning (RL) emphasizes learning optimization strategies through continuous trial and error through interactive learning with the environment. Compared with deep learning, RL is more suitable for scenes that need long-term decision-making and trial-and-error learning. By feeding back the reward signal of the action, the system can better adapt to the unknown environment and complex tasks, which makes up for the relative shortcomings of deep learning in these aspects. A scenario is applied to an action to solve the sequential decision problem in the news dissemination process. In order to enable the news recommendation system to consider the dynamic changes in users' interest in news content, the Deep Deterministic Policy Gradient algorithm is applied to the news recommendation scenario. Opposing learning complements and combines Deep Q-network with the strategic network. On the basis of fully summarizing and thinking, this paper puts forward the mode of intelligent news dissemination and push. The push process of news communication information based on edge computing technology is proposed. Finally, based on Area Under Curve a Q-Leaning Area Under Curve for RL models is proposed. This indicator can measure the strengths and weaknesses of RL models efficiently and facilitates comparing models and evaluating offline experiments. The results show that the DDPG algorithm improves the click-through rate by 2.586% compared with the conventional recommendation algorithm. It shows that the algorithm designed in this paper has more obvious advantages in accurate recommendation by users. This paper effectively improves the efficiency of news dissemination by optimizing the push mode of intelligent news dissemination. In addition, the paper also deeply studies the innovative application of intelligent edge technology in news communication, which brings new ideas and practices to promote the development of news communication methods. Optimizing the push mode of intelligent news dissemination not only improves the user experience, but also provides strong support for the application of intelligent edge technology in this field, which has important practical application prospects.", "venue": "Scientific Reports", "label": 24}, {"loc": [7.16823148727417, 0.53546142578125], "openalex_id": "https://openalex.org/W4393023353", "title": "Optimizing Language Augmentation for Multilingual Large Language Models: A Case Study on Korean", "authors": "Chang-Su Choi, Yongbin Jeong, Seoyoon Park, I. J. Won, HyeonSeok Lim, SangMin Kim, Yejee Kang, Chanhyuk Yoon, Jaewan Park, Yiseul Lee, Hyejin Lee, Younggyun Hahm, Hansaem Kim, KyungTae Lim", "abstract": "Large language models (LLMs) use pretraining to predict the subsequent word; however, their expansion requires significant computing resources. Numerous big tech companies and research institutes have developed multilingual LLMs (MLLMs) to meet current demands, overlooking less-resourced languages (LRLs). This study proposed three strategies to enhance the performance of LRLs based on the publicly available MLLMs. First, the MLLM vocabularies of LRLs were expanded to enhance expressiveness. Second, bilingual data were used for pretraining to align the high- and less-resourced languages. Third, a high-quality small-scale instruction dataset was constructed and instruction-tuning was performed to augment the LRL. The experiments employed the Llama2 model and Korean was used as the LRL, which was quantitatively evaluated against other developed LLMs across eight tasks. Furthermore, a qualitative assessment was performed based on human evaluation and GPT4. Experimental results showed that our proposed Bllossom model exhibited superior performance in qualitative analyses compared to previously proposed Korean monolingual models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.156826972961426, 3.756416082382202], "openalex_id": "https://openalex.org/W4392971869", "title": "DiPaCo: Distributed Path Composition", "authors": "Arthur Douillard, Qixuan Feng, Andrei A. Rusu, Adhiguna Kuncoro, Yani Donchev, Rachita Chhaparia, Ionel Gog, Marc\u2019Aurelio Ranzato, Jiajun Shen, Arthur Szlam", "abstract": "Progress in machine learning (ML) has been fueled by scaling neural network models. This scaling has been enabled by ever more heroic feats of engineering, necessary for accommodating ML approaches that require high bandwidth communication between devices working in parallel. In this work, we propose a co-designed modular architecture and training approach for ML models, dubbed DIstributed PAth COmposition (DiPaCo). During training, DiPaCo distributes computation by paths through a set of shared modules. Together with a Local-SGD inspired optimization (DiLoCo) that keeps modules in sync with drastically reduced communication, Our approach facilitates training across poorly connected and heterogeneous workers, with a design that ensures robustness to worker failures and preemptions. At inference time, only a single path needs to be executed for each input, without the need for any model compression. We consider this approach as a first prototype towards a new paradigm of large-scale learning, one that is less synchronous and more modular. Our experiments on the widely used C4 benchmark show that, for the same amount of training steps but less wall-clock time, DiPaCo exceeds the performance of a 1 billion-parameter dense transformer language model by choosing one of 256 possible paths, each with a size of 150 million parameters.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.383561611175537, 2.4227566719055176], "openalex_id": "https://openalex.org/W4392971972", "title": "A Multilingual Perspective on Probing Gender Bias", "authors": "Karolina Sta\u0144czak", "abstract": "Gender bias represents a form of systematic negative treatment that targets individuals based on their gender. This discrimination can range from subtle sexist remarks and gendered stereotypes to outright hate speech. Prior research has revealed that ignoring online abuse not only affects the individuals targeted but also has broader societal implications. These consequences extend to the discouragement of women's engagement and visibility within public spheres, thereby reinforcing gender inequality. This thesis investigates the nuances of how gender bias is expressed through language and within language technologies. Significantly, this thesis expands research on gender bias to multilingual contexts, emphasising the importance of a multilingual and multicultural perspective in understanding societal biases. In this thesis, I adopt an interdisciplinary approach, bridging natural language processing with other disciplines such as political science and history, to probe gender bias in natural language and language models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.9191389083862305, 0.25396445393562317], "openalex_id": "https://openalex.org/W4392932414", "title": "Is Translation All You Need? A Study on Solving Multilingual Tasks with Large Language Models", "authors": "Chaoqun Liu, Wenxuan Zhang, Yiran Zhao, Anh Tuan Luu, Lidong Bing", "abstract": "Large language models (LLMs) have demonstrated multilingual capabilities, yet they are mostly English-centric due to the imbalanced training corpora. While prior works have leveraged this bias to enhance multilingual performance through translation, they have been largely limited to natural language processing (NLP) tasks. In this work, we extend the evaluation to real-world user queries and non-English-centric LLMs, offering a broader examination of multilingual performance. Our key contribution lies in demonstrating that while translation into English can boost the performance of English-centric LLMs on NLP tasks, it is not universally optimal. For culture-related tasks that need deep language understanding, prompting in the native language proves more effective as it better captures the nuances of culture and language. Our experiments expose varied behaviors across LLMs and tasks in the multilingual context, underscoring the need for a more comprehensive approach to multilingual evaluation. Therefore, we call for greater efforts in developing and evaluating LLMs that go beyond English-centric paradigms.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.356802463531494, 2.4713292121887207], "openalex_id": "https://openalex.org/W4392934738", "title": "Shapley Values-Powered Framework for Fair Reward Split in Content Produced by GenAI", "authors": "Alex Glinsky, Alexey Sokolsky", "abstract": "It is evident that, currently, generative models are surpassed in quality by human professionals. However, with the advancements in Artificial Intelligence, this gap will narrow, leading to scenarios where individuals who have dedicated years of their lives to mastering a skill become obsolete due to their high costs, which are inherently linked to the time they require to complete a task -- a task that AI could accomplish in minutes or seconds. To avoid future social upheavals, we must, even now, contemplate how to fairly assess the contributions of such individuals in training generative models and how to compensate them for the reduction or complete loss of their incomes. In this work, we propose a method to structure collaboration between model developers and data providers. To achieve this, we employ Shapley Values to quantify the contribution of artist(s) in an image generated by the Stable Diffusion-v1.5 model and to equitably allocate the reward among them.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.543332099914551, -1.0352288484573364], "openalex_id": "https://openalex.org/W4392903148", "title": "Pre-Trained Acoustic-and-Textual Modeling for End-To-End Speech-To-Text Translation", "authors": "Weitai Zhang, Hanyi Zhang, Chenxuan Liu, Zhongyi Ye, Xinyuan Zhou, Chao Lin, Li-Rong Dai", "abstract": "End-to-end paradigm has aroused more and more interests and attention for improving speech-to-text translation (ST) recently. Existing end-to-end models mainly attributes and attempts to address the problem of modeling burden and data scarcity, while always fail to maintain both cross-modal and cross-lingual mapping well at the same time. In this work, we investigate methods for improving endto-end ST with pre-trained acoustic-and-textual models. Our acoustic encoder and decoder begins with processing the source speech sequence as usual. A textual encoder and an adaptor module then obtain source acoustic and textual information respectively, alleviating the representation inconsistency with attentive interactions in the textual decoder. Also, we utilize pre-trained models, and develop an adaptation fine-tuning method to preserve the pre-training knowledge. Experimental results on the IWSLT2023 offline ST task from English to German, Japanese and Chinese show that our method achieves state-of-the-art BLEU scores and surpasses the strong cascaded ST counterparts in unrestricted setting.", "venue": "https://doi.org/10.1109/icassp48485.2024.10446635", "label": 0}, {"loc": [6.326382637023926, 0.3156736195087433], "openalex_id": "https://openalex.org/W4392903597", "title": "Probability-Aware Word-Confusion-Network-To-Text Alignment Approach for Intent Classification", "authors": "Esa\u00fa Villatoro-Tello, Srikanth Madikeri, Bidisha Sharma, Driss Khalil, Shashi Kumar, Iuliia Nigmatulina, Petr Motl\u00ed\u010dek, Aravind Ganapathiraju", "abstract": "Spoken Language Understanding (SLU) technologies have greatly improved due to the effective pretraining of speech representations. A common requirement of industry-based solutions is the portability to deploy SLU models in voice-assistant devices. Thus, distilling knowledge from large text-based language models has become an attractive solution for achieving good performance and guaranteeing portability. In this paper, we introduce a novel architecture that uses a cross-modal attention mechanism to extract bin-level contextual embeddings from a word-confusion network (WNC) encoding such that these can be directly compared and aligned with traditional text-based contextual embeddings. This alignment is achieved using a recently proposed tokenwise constrastive loss function. We validate our architecture's effectiveness by fine-tuning our WCN-based pretrained model to do intent classification (IC) on the well-known SLURP dataset. Obtained accuracy on the IC task (81%), depicts a 9.4% relative improvement compared to a recent/equivalent E2E method.", "venue": "https://doi.org/10.1109/icassp48485.2024.10445934", "label": 0}, {"loc": [6.215493679046631, -1.0641382932662964], "openalex_id": "https://openalex.org/W4392913295", "title": "AI-Generated Text Detector for Arabic Language", "authors": "Hamed Alshammari, Ahmed El-Sayed, Khaled Elleithy", "abstract": "The effectiveness of existing AI detectors is notably hampered when processing Arabic texts. This study introduces a novel AI text classifier designed specifically for Arabic, tackling the distinct challenges inherent in processing this language. A particular focus is placed on accurately recognizing human-written texts (HWTs), an area where existing AI detectors have demonstrated significant limitations. To achieve this goal, this paper utilized and fine-tuned two Transformer-based models, AraELECTRA and XLM-R, by training them on two distinct datasets: a large dataset comprising 43,958 examples and a custom dataset with 3078 examples that contain HWT and AI-generated texts (AIGTs) from various sources, including ChatGPT 3.5, ChatGPT-4, and BARD. The proposed architecture is adaptable to any language, but this work evaluates these models\u2019 efficiency in recognizing HWTs versus AIGTs in Arabic as an example of Semitic languages. The performance of the proposed models has been compared against the two prominent existing AI detectors, GPTZero and OpenAI Text Classifier, particularly on the AIRABIC benchmark dataset. The results reveal that the proposed classifiers outperform both GPTZero and OpenAI Text Classifier with 81% accuracy compared to 63% and 50% for GPTZero and OpenAI Text Classifier, respectively. Furthermore, integrating a Dediacritization Layer prior to the classification model demonstrated a significant enhancement in the detection accuracy of both HWTs and AIGTs. This Dediacritization step markedly improved the classification accuracy, elevating it from 81% to as high as 99% and, in some instances, even achieving 100%.", "venue": "Big Data and Cognitive Computing", "label": 0}, {"loc": [4.776607036590576, 0.5854637622833252], "openalex_id": "https://openalex.org/W4393116578", "title": "ChatGPT Alternative Solutions: Large Language Models Survey", "authors": "Hanieh Alipour, Nick Pendar, Kohinoor Roy", "abstract": "In recent times, the grandeur of Large Language Models (LLMs) has not only shone in the realm of natural language processing but has also cast its brilliance across a vast array of applications. This remarkable display of LLM capabilities has ignited a surge in research contributions within this domain, spanning a diverse spectrum of topics. These contributions encompass advancements in neural network architecture, context length enhancements, model alignment, training datasets, benchmarking, efficiency improvements, and more. Recent years have witnessed a dynamic synergy between academia and industry, propelling the field of LLM research to new heights. A notable milestone in this journey is the introduction of ChatGPT, a powerful AI chatbot grounded in LLMs, which has garnered widespread societal attention. The evolving technology of LLMs has begun to reshape the landscape of the entire AI community, promising a revolutionary shift in the way we create and employ AI algorithms. Given this swift-paced technical evolution, our survey embarks on a journey to encapsulate the recent strides made in the world of LLMs. Through an exploration of the background, key discoveries, and prevailing methodologies, we offer an up-to-theminute review of the literature. By examining multiple LLM models, our paper not only presents a comprehensive overview but also charts a course that identifies existing challenges and points toward potential future research trajectories. This survey furnishes a well-rounded perspective on the current state of generative AI, shedding light on opportunities for further exploration, enhancement, and innovation.", "venue": "https://doi.org/10.5121/csit.2024.1405114", "label": 0}, {"loc": [7.606589317321777, -1.1618103981018066], "openalex_id": "https://openalex.org/W4398198183", "title": "NEURAL MACHINE TRANSLATION BETWEEN MYANMAR AND ENGLISH LANGUAGES", "authors": "Nang Zin Min Aye, Khin Mar Soe", "abstract": "Machine translation between Myanmar and English, and vice versa, presents significant challenges but stands as an important area of research for fostering connectivity and facilitating information access for Myanmar language speakers. Sustained research efforts and continual innovation are imperative for elevating the quality and accessibility of machine translation in these language pairs. Neural Machine Translation (NMT) models, especially those utilizing attention mechanisms and the transformer model, show promise in the field of machine translation. The integration of subword approaches in machine translation is crucial for managing the complexity and diversity of languages. It improves adaptability and enhances the overall performance of translation models. This is particularly important in scenarios involving morphological variations and limited resources for languages. In this study, our aim is to present the results of evaluating the translation performance of Transformer and Recurrent Neural Network (RNN) models optimized with subwording on the Myanmar-English WAT2019 corpus. Subsequently, we will conduct an evaluation and comparison of these models. Importantly, we highlight that the correct selection of the subword model emerges as the most significant factor influencing translation performance. An optimized Transformer model using subwording with a 32k Byte Pair Encoding (BPE) demonstrated a significant improvement in BLEU scores. Specifically, there was an 16.92 points improvement for the English-Myanmar direction and a 17.01 points improvement for the Myanmar-English direction when compared to a baseline RNN model that was also optimized with subwording using 32k BPE. We conducted an assessment of SentencePiece models utilizing both unigram and BPE approaches. The results indicated an improvement in BLEU scores 50.76 points for English-Myanamar direction and 48.91 for Myanmar-English direction, particularly with Transformer models optimized with 32k BPE subword models.", "venue": "https://doi.org/10.1109/icca62361.2024.10533045", "label": 0}, {"loc": [5.323169231414795, -1.4835271835327148], "openalex_id": "https://openalex.org/W4392888993", "title": "Arabic sarcasm detection: An enhanced fine-tuned language model approach", "authors": "Mohamed Galal, Ahmed H. Yousef, Hala H. Zayed, Walaa Medhat", "abstract": "Sarcasm is a complex linguistic phenomenon involving humor, criticism, or phrases that convey the opposite meaning, mask true feelings, and play pivotal roles in various aspects of communication. Therefore, identifying sarcasm is essential for sentiment analysis, social media monitoring, and customer service, as it enables a better understanding of public sentiment. Moreover, social media has become a primary platform for people to express their feelings and opinions and provide feedback to businesses and service providers. Misinterpreting sarcasm in customer feedback can lead to incorrect responses and actions. However, accurately detecting sarcasm is challenging because it depends on context, cultural factors, and inherent ambiguity. Despite the plenty of research and resources in Machine Learning (ML) for detecting sarcasm in English, including Deep Learning (DL) techniques, there is still a shortage of research in sarcasm detection in Arabic, particularly in DL methodologies and available sarcastic datasets. This paper constructed a new Arabic sarcastic corpus and fine-tuned three pre-trained Arabic transformer-based Language Models (LM) for Arabic sarcasm detection. We also proposed a hybrid DL approach for sarcasm detection that combines static and contextualized representations using pre-trained LM, such as Word2Vec word embeddings and Bidirectional Encoder Representations from Transformers (BERT) models pretrained on Arabic resources. The proposed enhanced hybrid deep learning approach outperforms state-of-the-art models by 8% on a shared benchmark dataset and achieves a 5% improvement in F1-score on another.", "venue": "Ain Shams Engineering Journal", "label": 0}, {"loc": [6.481692790985107, 5.107571601867676], "openalex_id": "https://openalex.org/W4392867139", "title": "Unlocking the conversion of Web Screenshots into HTML Code with the WebSight Dataset", "authors": "Hugo Lauren\u00e7on, L\u00e9o Tronchon, Victor Sanh", "abstract": "Using vision-language models (VLMs) in web development presents a promising strategy to increase efficiency and unblock no-code solutions: by providing a screenshot or a sketch of a UI, a VLM could generate the code to reproduce it, for instance in a language like HTML. Despite the advancements in VLMs for various tasks, the specific challenge of converting a screenshot into a corresponding HTML has been minimally explored. We posit that this is mainly due to the absence of a suitable, high-quality dataset. This work introduces WebSight, a synthetic dataset consisting of 2 million pairs of HTML codes and their corresponding screenshots. We fine-tune a foundational VLM on our dataset and show proficiency in converting webpage screenshots to functional HTML code. To accelerate the research in this area, we open-source WebSight.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.488083362579346, 0.8031926155090332], "openalex_id": "https://openalex.org/W4398173363", "title": "Natural Language Processing Tools for Romanian\u2013Going Beyond a Low-Resource Language", "authors": "Melania Ni\u021bu, Mihai Dasc\u0103lu", "abstract": "Advances in Natural Language Processing bring innovative instruments to the educational field to improve the quality of the didactic process by addressing challenges like language barriers and creating personalized learning experiences. Most research in the domain is dedicated to high-resource languages, such as English, while languages with limited coverage, like Romanian, are still underrepresented in the field. Operating on low-resource languages is essential to ensure equitable access to educational opportunities and to preserve linguistic diversity. Through continuous investments in developing Romanian educational instruments, we are rapidly going beyond a low-resource language. This paper presents recent educational instruments and frameworks dedicated to Romanian, leveraging state-of-the-art NLP techniques, such as building advanced Romanian language models and benchmarks encompassing tools for language learning, text comprehension, question answering, automatic essay scoring, and information retrieval. The methods and insights gained are transferable to other low-resource languages, emphasizing methodological adaptability, collaborative frameworks, and technology transfer to address similar challenges in diverse linguistic contexts. Two use cases are presented, focusing on assessing student performance in Moodle courses and extracting main ideas from students\u2019 feedback. These practical applications in Romanian academic settings serve as examples for enhancing educational practices in other less-resourced languages.", "venue": "Interaction design & architecture(s)/ID&A Interaction design & architecture(s)", "label": 38}, {"loc": [5.451105117797852, 0.8518805503845215], "openalex_id": "https://openalex.org/W4398180480", "title": "Lib2Life\u2013Digital Library Services Empowered with Advanced Natural Language Processing Techniques", "authors": "Melania Ni\u021bu, Mihai Dasc\u0103lu, Maria-Dorinela Dascalu, Laurentiu-Marian Neagu, Maria-Iuliana Dasc\u0103lu", "abstract": "Educational institutions are struggling to keep up with the accelerated technological advancements; hence, sustainable and supportive tools have become essential to reshape traditional models into intelligent learning systems. This paper introduces Lib2Life, a digital library that uses advanced Natural Language Processing techniques to facilitate the digital transformation of historical documents provided by Central University Libraries in Romania. The platform enables Central University Libraries in Romania to preserve the cultural heritage of historically valuable documents, facilitating open-source access to old, printed materials such as books, manuscripts, newspapers, or literary magazines no longer protected by copyright. Lib2Life offers comprehensive functionalities, allowing librarians to benefit from automated text processing and indexing workflows that facilitate digitization, ensuring a consistent representation of original documents. For readers, the platform presents a user-friendly interface with semantic search capabilities and a recommendation engine. The system employs an ontology to organize and manage documents in a unified and structured way, contributing to the evolution of intelligent education technologies. The innovative contributions of Lib2Lifeinclude identifying new solutions for cultural heritage preservation, promoting patrimony through modern methodologies, increasing access to documentary resources, enhancing library services, and fostering the transfer of knowledge and technology to society.", "venue": "Interaction design & architecture(s)/ID&A Interaction design & architecture(s)", "label": 38}, {"loc": [8.55674934387207, 2.723236560821533], "openalex_id": "https://openalex.org/W4392824982", "title": "Language models scale reliably with over-training and on downstream tasks", "authors": "Samir Yitzhak Gadre, Georgios Smyrnis, Vaishaal Shankar, Suchin Gururangan, Mitchell Wortsman, Rulin Shao, Jean Mercat, Alex Chengyu Fang, Jeffrey Li, Sedrick Scott Keh, Rui Xin, Marianna Nezhurina, Igor Vasiljevic, Jenia Jitsev, Alexandros G. Dimakis, Gabriel Ilharco, Shuran Song, Thomas Kollar, Yair Carmon, Achal Dave, Reinhard Heckel, Niklas Muennighoff, Ludwig Schmidt", "abstract": "Scaling laws are useful guides for derisking expensive training runs, as they predict performance of large models using cheaper, small-scale experiments. However, there remain gaps between current scaling studies and how language models are ultimately trained and evaluated. For instance, scaling is usually studied in the compute-optimal training regime (i.e., \"Chinchilla optimal\" regime). In contrast, models are often over-trained to reduce inference costs. Moreover, scaling laws mostly predict loss on next-token prediction, but models are usually compared on downstream task performance. To address both shortcomings, we create a testbed of 104 models with 0.011B to 6.9B parameters trained with various numbers of tokens on three data distributions. First, we fit scaling laws that extrapolate in both the amount of over-training and the number of model parameters. This enables us to predict the validation loss of a 1.4B parameter, 900B token run (i.e., 32$\\times$ over-trained) and a 6.9B parameter, 138B token run (i.e., a compute-optimal run)$\\unicode{x2014}$each from experiments that take 300$\\times$ less compute. Second, we relate the perplexity of a language model to its downstream task performance by proposing a power law. We use this law to predict top-1 error averaged over downstream tasks for the two aforementioned models, using experiments that take 20$\\times$ less compute. Our experiments are available at https://github.com/mlfoundations/scaling.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.388618469238281, 3.0398216247558594], "openalex_id": "https://openalex.org/W4392828142", "title": "A Decade's Battle on Dataset Bias: Are We There Yet?", "authors": "Zhuang Liu, Kaiming He", "abstract": "We revisit the \"dataset classification\" experiment suggested by Torralba & Efros (2011) a decade ago, in the new era with large-scale, diverse, and hopefully less biased datasets as well as more capable neural network architectures. Surprisingly, we observe that modern neural networks can achieve excellent accuracy in classifying which dataset an image is from: e.g., we report 84.7% accuracy on held-out validation data for the three-way classification problem consisting of the YFCC, CC, and DataComp datasets. Our further experiments show that such a dataset classifier could learn semantic features that are generalizable and transferable, which cannot be explained by memorization. We hope our discovery will inspire the community to rethink issues involving dataset bias.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.460328578948975, 0.9124454855918884], "openalex_id": "https://openalex.org/W4392828261", "title": "Do Language Models Care About Text Quality? Evaluating Web-Crawled Corpora Across 11 Languages", "authors": "Rik van Noord, Taja Kuzman, Peter Rupnik, Nikola Ljube\u0161i\u0107, Miquel Espl\u00e0-Gomis, Gema Ram\u00edrez-S\u00e1nchez, Antonio Toral", "abstract": "Large, curated, web-crawled corpora play a vital role in training language models (LMs). They form the lion's share of the training data in virtually all recent LMs, such as the well-known GPT, LLaMA and XLM-RoBERTa models. However, despite this importance, relatively little attention has been given to the quality of these corpora. In this paper, we compare four of the currently most relevant large, web-crawled corpora (CC100, MaCoCu, mC4 and OSCAR) across eleven lower-resourced European languages. Our approach is two-fold: first, we perform an intrinsic evaluation by performing a human evaluation of the quality of samples taken from different corpora; then, we assess the practical impact of the qualitative differences by training specific LMs on each of the corpora and evaluating their performance on downstream tasks. We find that there are clear differences in quality of the corpora, with MaCoCu and OSCAR obtaining the best results. However, during the extrinsic evaluation, we actually find that the CC100 corpus achieves the highest scores. We conclude that, in our experiments, the quality of the web-crawled corpora does not seem to play a significant role when training LMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.033636569976807, 1.5777419805526733], "openalex_id": "https://openalex.org/W4392822258", "title": "Big City Bias: Evaluating the Impact of Metropolitan Size on Computational Job Market Abilities of Language Models", "authors": "Charlie Campanella, Rob van der Goot, Estevam Hruschka, Thom Lake, Naoki Otani, T. M. Mitchell", "abstract": "Large language models (LLMs) have emerged as a useful technology for job matching, for both candidates and employers. Job matching is often based on a particular geographic location, such as a city or region. However, LLMs have known biases, commonly derived from their training data. In this work, we aim to quantify the metropolitan size bias encoded within large language models, evaluating zero-shot salary, employer presence, and commute duration predictions in 384 of the United States' metropolitan regions. Across all benchmarks, we observe negative correlations between the metropolitan size and the performance of the LLMS, indicating that smaller regions are indeed underrepresented. More concretely, the smallest 10 metropolitan regions show upwards of 300% worse benchmark performance than the largest 10.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.345677375793457, -0.5250768065452576], "openalex_id": "https://openalex.org/W4396882480", "title": "Sentence Embeddings for Massively Multilingual Speech and Text Processing", "authors": "Paul-Ambroise Duquenne", "abstract": "Representation learning of sentences has been widely studied in NLP. While many works have explored different pre-training objectives to create contextual representations from sentences, several others have focused on learning sentence embeddings for multiple languages with the aim of closely encoding paraphrases and translations in the sentence embedding space.In this thesis, we first study how to extend text sentence embedding spaces to the speech modality in order to build a multilingual speech/text sentence embedding space. Next, we explore how to use this multilingual and multimodal sentence embedding space for large-scale speech mining. This allows us to automatically create alignments between written and spoken sentences in different languages. For high similarity thresholds in the latent space, aligned sentences can be considered as translations. If the alignments involve written sentences on one side and spoken sentences on the other, then these are potential speech-to-text translations. If the alignments involve on both sides spoken sentences, then these are potential speech-to-speech translations. To validate the quality of the mined data, we train speech-to-text translation models and speech-to-speech translation models. We show that adding the automatically mined data significantly improves the quality of the learned translation models, demonstrating the quality of the alignments and the usefulness of the mined data.Then, we study how to decode these sentence embeddings into text or speech in different languages. We explore several methods for training decoders and analyze their robustness to modalities/languages not seen during training, to evaluate cross-lingual and cross-modal transfers. We demonstrate that we could perform zero-shot cross-modal translation in this framework, achieving translation results close to systems learned in a supervised manner with a cross-attention mechanism. The compatibility between speech/text representations from different languages enables these very good performances, despite an intermediate fixed-size representation.Finally, we develop a new state-of-the-art massively multilingual speech/text sentence embedding space, named SONAR, based on conclusions drawn from the first two projects. We study different objective functions to learn such a space and we analyze their impact on the organization of the space as well as on the capabilities to decode these representations. We show that such sentence embedding space outperform previous state-of-the-art methods for both cross-lingual and cross-modal similarity search as well as decoding capabilities. This new space covers 200 written languages and 37 spoken languages. It also offers text translation results close to the NLLB system on which it is based, and speech translation results competitive with the Whisper supervised system. We also present SONAR EXPRESSIVE, which introduces an additional representation encoding non-semantic speech properties, such as vocal style or expressivity of speech.", "venue": "HAL (Le Centre pour la Communication Scientifique Directe)", "label": 6}, {"loc": [7.5184149742126465, 3.1297152042388916], "openalex_id": "https://openalex.org/W4392806453", "title": "TEA+: A Novel Temporal Graph Random Walk Engine With Hybrid Storage Architecture", "authors": "Chengying Huan, Yongchao Liu, Heng Zhang, Shuaiwen Leon Song, Santosh Pandey, Shiyang Chen, Xiangfei Fang, Yue Jin, Baptiste Lepers, Yanjun Wu, Hang Liu", "abstract": "Many real-world networks are characterized by being temporal and dynamic, wherein the temporal information signifies the changes in connections, such as the addition or removal of links between nodes. Employing random walks on these temporal networks is a crucial technique for understanding the structural evolution of such graphs over time. However, existing state-of-the-art sampling methods are designed for traditional static graphs, and as such, they struggle to efficiently handle the dynamic aspects of temporal networks. This deficiency can be attributed to several challenges, including increased sampling complexity, extensive index space, limited programmability, and a lack of scalability. In this article, we introduce TEA+, a robust, fast, and scalable engine for conducting random walks on temporal graphs. Central to TEA+ is an innovative hybrid sampling method that amalgamates two Monte Carlo sampling techniques. This fusion significantly diminishes space complexity while maintaining a fast sampling speed. Additionally, TEA+ integrates a range of optimizations that significantly enhance sampling efficiency. This is further supported by an effective graph updating strategy, skilled in managing dynamic graph modifications and adeptly handling the insertion and deletion of both edges and vertices. For ease of implementation, we propose a temporal-centric programming model, designed to simplify the development of various random walk algorithms on temporal graphs. To ensure optimal performance across storage constraints, TEA+ features a degree-aware hybrid storage architecture, capable of adeptly scaling in different memory environments. Experimental results showcase the prowess of TEA+, as it attains up to three orders of magnitude speedups compared to current random walk engines on extensive temporal graphs.", "venue": "ACM Transactions on Architecture and Code Optimization", "label": 0}, {"loc": [6.599183559417725, 5.103403568267822], "openalex_id": "https://openalex.org/W4392737406", "title": "Learning with Noisy Foundation Models", "authors": "Hao Chen, Jindong Wang, Zihan Wang, Ran Tao, Hongxin Wei, Xing Xie, Masashi Sugiyama, Bhiksha Raj", "abstract": "Foundation models are usually pre-trained on large-scale datasets and then adapted to downstream tasks through tuning. However, the large-scale pre-training datasets, often inaccessible or too expensive to handle, can contain label noise that may adversely affect the generalization of the model and pose unexpected risks. This paper stands out as the first work to comprehensively understand and analyze the nature of noise in pre-training datasets and then effectively mitigate its impacts on downstream tasks. Specifically, through extensive experiments of fully-supervised and image-text contrastive pre-training on synthetic noisy ImageNet-1K, YFCC15M, and CC12M datasets, we demonstrate that, while slight noise in pre-training can benefit in-domain (ID) performance, where the training and testing data share a similar distribution, it always deteriorates out-of-domain (OOD) performance, where training and testing distributions are significantly different. These observations are agnostic to scales of pre-training datasets, pre-training noise types, model architectures, pre-training objectives, downstream tuning methods, and downstream applications. We empirically ascertain that the reason behind this is that the pre-training noise shapes the feature space differently. We then propose a tuning method (NMTune) to affine the feature space to mitigate the malignant effect of noise and improve generalization, which is applicable in both parameter-efficient and black-box tuning manners. We additionally conduct extensive experiments on popular vision and language models, including APIs, which are supervised and self-supervised pre-trained on realistic noisy data for evaluation. Our analysis and results demonstrate the importance of this novel and fundamental research direction, which we term as Noisy Model Learning.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.67394757270813, 1.8208867311477661], "openalex_id": "https://openalex.org/W4392701205", "title": "Tell me a story: a framework for critically investigating AI language models", "authors": "Luke Munn, Leah Henrickson", "abstract": "Large language models are rapidly being rolled out into high-stakes fields like healthcare, law, and education. However, understanding of their design considerations, operational logics, and implicit biases remains limited. How might these black boxes be understood and unpacked? In this article, we lay out an accessible but critical framework for inquiry, a pedagogical tool with four dimensions. Tell me your story investigates the design and values of the AI model. Tell me my story explores the model's affective warmth and its psychological impacts. Tell me our story probes the model's particular understanding of the world based on past statistics and pattern-matching. Tell me 'their' story compares the model's knowledge on dominant (e.g. Western) versus 'peripheral' (e.g. Indigenous) cultures, events, and issues. Each mode includes sample prompts and key issues to raise. The framework aims to enhance the public's critical thinking and technical literacy around generative AI models.", "venue": "Learning Media and Technology", "label": 0}, {"loc": [6.054977893829346, 5.3318047523498535], "openalex_id": "https://openalex.org/W4392735695", "title": "Amharic LLaMA and LLaVA: Multimodal LLMs for Low Resource Languages", "authors": "Michael Andersland", "abstract": "Large Language Models (LLMs) like GPT-4 and LLaMA have shown incredible proficiency at natural language processing tasks and have even begun to excel at tasks across other modalities such as vision and audio. Despite their success, LLMs often struggle to perform well on low-resource languages because there is so little training data available. This shortcoming is especially prevalent with open source models. In this work, we explore training LLaMA-2 to speak Amharic, a language which is spoken by over 50 million people world wide, but has orders of magnitude less data available than languages like English. We employ methods previously used for training LLMs on other languages with data scarcity, and use open source translation models to perform data augmentation and grow our dataset from millions of tokens to billions. We further enhance the capabilities of our model by connecting an image encoder and training on a translated visual instruction tuning dataset in the same manner as LLaVA, resulting in a multimodal Amharic LLM that can understand images along with text. We introduce an Amharic version of a popular benchmarking dataset to evaluate our work. Our models and dataset are open sourced and available on GitHub.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.6253662109375, 2.727714776992798], "openalex_id": "https://openalex.org/W4392736084", "title": "Unraveling the Mystery of Scaling Laws: Part I", "authors": "Angie Su, Zhi Tian, Xiaoyu Shen, Xunliang Cai", "abstract": "Scaling law principles indicate a power-law correlation between loss and variables such as model size, dataset size, and computational resources utilized during training. These principles play a vital role in optimizing various aspects of model pre-training, ultimately contributing to the success of large language models such as GPT-4, Llama and Gemini. However, the original scaling law paper by OpenAI did not disclose the complete details necessary to derive the precise scaling law formulas, and their conclusions are only based on models containing up to 1.5 billion parameters. Though some subsequent works attempt to unveil these details and scale to larger models, they often neglect the training dependency of important factors such as the learning rate, context length and batch size, leading to their failure to establish a reliable formula for predicting the test loss trajectory. In this technical report, we confirm that the scaling law formulations proposed in the original OpenAI paper remain valid when scaling the model size up to 33 billion, but the constant coefficients in these formulas vary significantly with the experiment setup. We meticulously identify influential factors and provide transparent, step-by-step instructions to estimate all constant terms in scaling-law formulas by training on models with only 1M~60M parameters. Using these estimated formulas, we showcase the capability to accurately predict various attributes for models with up to 33B parameters before their training, including (1) the minimum possible test loss; (2) the minimum required training steps and processed tokens to achieve a specific loss; (3) the critical batch size with an optimal time/computation trade-off at any loss value; and (4) the complete test loss trajectory with arbitrary batch size.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.787069797515869, -0.3543970286846161], "openalex_id": "https://openalex.org/W4392710181", "title": "AI unveiled personalities: Profiling optimistic and pessimistic attitudes in Hindi dataset using transformer\u2010based models", "authors": "Dipika Jain, Akshi Kumar", "abstract": "Abstract Both optimism and pessimism are intricately intertwined with an individual's inherent personality traits and people of all personality types can exhibit a wide range of attitudes and behaviours, including levels of optimism and pessimism. This paper undertakes a comprehensive analysis of optimistic and pessimistic tendencies present within Hindi textual data, employing transformer\u2010based models. The research represents a pioneering effort to define and establish an interaction between the personality and attitude chakras within the realm of human psychology. Introducing an innovative \u201cChakra\u201d system to illustrate complex interrelationships within human psychology, this work aligns the Myers\u2010Briggs Type Indicator (MBTI) personality traits with optimistic and pessimistic attitudes, enriching our understanding of emotional projection in text. The study employs meticulously fine\u2010tuned transformer models\u2014specifically mBERT, XLM\u2010RoBERTa, IndicBERT, mDeBERTa and a novel stacked mDeBERTa\u2014trained on the novel Hindi dataset \u2018\u092e\u0928\u094b\u092d\u093e\u0935\u2019 (pronounced as Manobhav). Remarkably, the proposed Stacked mDeBERTa model outperforms others, recording an accuracy of 0.7785 along with elevated precision, recall, and F1 score values. Notably, its ROC AUC score of 0.7226 underlines its robustness in distinguishing between positive and negative emotional attitudes. The comparative analysis highlights the superiority of the Stacked mDeBERTa model in effectively capturing emotional attitudes in Hindi text.", "venue": "Expert Systems", "label": 0}, {"loc": [4.88750696182251, 0.2930900454521179], "openalex_id": "https://openalex.org/W4392678147", "title": "Are Human Conversations Special? A Large Language Model Perspective", "authors": "Toshish Jawale, Chaitanya Animesh, Sekhar Vallath, Kartik Talamadupula, Larry Heck", "abstract": "This study analyzes changes in the attention mechanisms of large language models (LLMs) when used to understand natural conversations between humans (human-human). We analyze three use cases of LLMs: interactions over web content, code, and mathematical texts. By analyzing attention distance, dispersion, and interdependency across these domains, we highlight the unique challenges posed by conversational data. Notably, conversations require nuanced handling of long-term contextual relationships and exhibit higher complexity through their attention patterns. Our findings reveal that while language models exhibit domain-specific attention behaviors, there is a significant gap in their ability to specialize in human conversations. Through detailed attention entropy analysis and t-SNE visualizations, we demonstrate the need for models trained with a diverse array of high-quality conversational data to enhance understanding and generation of human-like dialogue. This research highlights the importance of domain specialization in language models and suggests pathways for future advancement in modeling human conversational nuances.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.999218463897705, -2.2087998390197754], "openalex_id": "https://openalex.org/W4392654282", "title": "The Dark Side of Language Models: Exploring the Potential of LLMs in Multimedia Disinformation Generation and Dissemination", "authors": "Dipto Barman, Ziyi Guo, Owen Conlan", "abstract": "Disinformation - the deliberate spread of false or misleading information poses a significant threat to our society by undermining trust, exacerbating polarization, and manipulating public opinion. With the rapid advancement of artificial intelligence and the growing prominence of large language models (LLMs) such as ChatGPT, new avenues for the dissemination of disinformation are emerging. This review paper explores the potential of LLMs to initiate the generation of multi-media disinformation, encompassing text, images, audio, and video. We begin by examining the capabilities of LLMs, highlighting their potential to create compelling, context-aware content that can be weaponized for malicious purposes. Subsequently, we examine the nature of disinformation and the various mechanisms through which it spreads in the digital landscape. Utilizing these advanced models, malicious actors can automate and scale up disinformation effectively. We describe a theoretical pipeline for creating and disseminating disinformation on social media. Existing interventions to combat disinformation are also reviewed. While these efforts have shown success, we argue that they need to be strengthened to effectively counter the escalating threat posed by LLMs. Digital platforms have, unfortunately, enabled malicious actors to extend the reach of disinformation. The advent of LLMs poses an additional concern as they can be harnessed to significantly amplify the velocity, variety, and volume of disinformation. Thus, this review proposes augmenting current interventions with AI tools like LLMs, capable of assessing information more swiftly and comprehensively than human fact-checkers. This paper illuminates the dark side of LLMs and highlights their potential to be exploited as disinformation dissemination tools.", "venue": "Machine Learning with Applications", "label": 0}, {"loc": [6.374515056610107, 5.454424858093262], "openalex_id": "https://openalex.org/W4392678661", "title": "DeepSeek-VL: Towards Real-World Vision-Language Understanding", "authors": "Haoyu Lu, Wen Liu, Bo Zhang, Bingxuan Wang, Kai Dong, Bo Liu, Jingxiang Sun, Tongzheng Ren, Zhuoshu Li, Hao Yang, Yaofeng Sun, Chengqi Deng, Hanwei Xu, Zhenda Xie, Chong Ruan", "abstract": "We present DeepSeek-VL, an open-source Vision-Language (VL) Model designed for real-world vision and language understanding applications. Our approach is structured around three key dimensions: We strive to ensure our data is diverse, scalable, and extensively covers real-world scenarios including web screenshots, PDFs, OCR, charts, and knowledge-based content, aiming for a comprehensive representation of practical contexts. Further, we create a use case taxonomy from real user scenarios and construct an instruction tuning dataset accordingly. The fine-tuning with this dataset substantially improves the model's user experience in practical applications. Considering efficiency and the demands of most real-world scenarios, DeepSeek-VL incorporates a hybrid vision encoder that efficiently processes high-resolution images (1024 x 1024), while maintaining a relatively low computational overhead. This design choice ensures the model's ability to capture critical semantic and detailed information across various visual tasks. We posit that a proficient Vision-Language Model should, foremost, possess strong language abilities. To ensure the preservation of LLM capabilities during pretraining, we investigate an effective VL pretraining strategy by integrating LLM training from the beginning and carefully managing the competitive dynamics observed between vision and language modalities. The DeepSeek-VL family (both 1.3B and 7B models) showcases superior user experiences as a vision-language chatbot in real-world applications, achieving state-of-the-art or competitive performance across a wide range of visual-language benchmarks at the same model size while maintaining robust performance on language-centric benchmarks. We have made both 1.3B and 7B models publicly accessible to foster innovations based on this foundation model.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.536180019378662, 5.245778560638428], "openalex_id": "https://openalex.org/W4392617597", "title": "Yi: Open Foundation Models by 01. AI", "authors": "AI, NULL AUTHOR_ID, Alexander S. Young, Bei Chen, Decheng Li, Chengen Huang, Ge Zhang, G.F. Zhang, Heng Li, Jiangcheng Zhu, Jianqun Chen, Jing Chang, Kaidong Yu, Peng Liu, Qiang Liu, Shawn Yue, S.-C. Yang, Shiming Yang, Tao Yu, Wen Xie, Wenhao Huang, Xiaohui Hu, Xiaoyi Ren, Xinyao Niu, Pengcheng Nie, Yuchi Xu, Yudong Liu, Yue Wang, Yuxuan Cai, Zhenyu Gu, Zhiyuan Liu, Zonghong Dai", "abstract": "We introduce the Yi model family, a series of language and multimodal models that demonstrate strong multi-dimensional capabilities. The Yi model family is based on 6B and 34B pretrained language models, then we extend them to chat models, 200K long context models, depth-upscaled models, and vision-language models. Our base models achieve strong performance on a wide range of benchmarks like MMLU, and our finetuned chat models deliver strong human preference rate on major evaluation platforms like AlpacaEval and Chatbot Arena. Building upon our scalable super-computing infrastructure and the classical transformer architecture, we attribute the performance of Yi models primarily to its data quality resulting from our data-engineering efforts. For pretraining, we construct 3.1 trillion tokens of English and Chinese corpora using a cascaded data deduplication and quality filtering pipeline. For finetuning, we polish a small scale (less than 10K) instruction dataset over multiple iterations such that every single instance has been verified directly by our machine learning engineers. For vision-language, we combine the chat language model with a vision transformer encoder and train the model to align visual representations to the semantic space of the language model. We further extend the context length to 200K through lightweight continual pretraining and demonstrate strong needle-in-a-haystack retrieval performance. We show that extending the depth of the pretrained checkpoint through continual pretraining further improves performance. We believe that given our current results, continuing to scale up model parameters using thoroughly optimized data will lead to even stronger frontier models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.333378314971924, 2.4787724018096924], "openalex_id": "https://openalex.org/W4392617449", "title": "Where does In-context Translation Happen in Large Language Models", "authors": "Suzanna Sia, David S. Mueller, Kevin Duh", "abstract": "Self-supervised large language models have demonstrated the ability to perform Machine Translation (MT) via in-context learning, but little is known about where the model performs the task with respect to prompt instructions and demonstration examples. In this work, we attempt to characterize the region where large language models transition from in-context learners to translation models. Through a series of layer-wise context-masking experiments on \\textsc{GPTNeo2.7B}, \\textsc{Bloom3B}, \\textsc{Llama7b} and \\textsc{Llama7b-chat}, we demonstrate evidence of a \"task recognition\" point where the translation task is encoded into the input representations and attention to context is no longer necessary. We further observe correspondence between the low performance when masking out entire layers, and the task recognition layers. Taking advantage of this redundancy results in 45\\% computational savings when prompting with 5 examples, and task recognition achieved at layer 14 / 32. Our layer-wise fine-tuning experiments indicate that the most effective layers for MT fine-tuning are the layers critical to task recognition.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.423896789550781, 3.6259782314300537], "openalex_id": "https://openalex.org/W4392575839", "title": "GaLore: Memory-Efficient LLM Training by Gradient Low-Rank Projection", "authors": "Jiawei Zhao, Zhenyu Zhang, Beidi Chen, Zhangyang Wang, Anima Anandkumar, Yuandong Tian", "abstract": "Training Large Language Models (LLMs) presents significant memory challenges, predominantly due to the growing size of weights and optimizer states. Common memory-reduction approaches, such as low-rank adaptation (LoRA), add a trainable low-rank matrix to the frozen pre-trained weight in each layer, reducing trainable parameters and optimizer states. However, such approaches typically underperform training with full-rank weights in both pre-training and fine-tuning stages since they limit the parameter search to a low-rank subspace and alter the training dynamics, and further, may require full-rank warm start. In this work, we propose Gradient Low-Rank Projection (GaLore), a training strategy that allows full-parameter learning but is more memory-efficient than common low-rank adaptation methods such as LoRA. Our approach reduces memory usage by up to 65.5% in optimizer states while maintaining both efficiency and performance for pre-training on LLaMA 1B and 7B architectures with C4 dataset with up to 19.7B tokens, and on fine-tuning RoBERTa on GLUE tasks. Our 8-bit GaLore further reduces optimizer memory by up to 82.5% and total training memory by 63.3%, compared to a BF16 baseline. Notably, we demonstrate, for the first time, the feasibility of pre-training a 7B model on consumer GPUs with 24GB memory (e.g., NVIDIA RTX 4090) without model parallel, checkpointing, or offloading strategies.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.33750057220459, 1.2521734237670898], "openalex_id": "https://openalex.org/W4392564671", "title": "Need a Programming Exercise Generated in Your Native Language? ChatGPT's Got Your Back: Automatic Generation of Non-English Programming Exercises Using \u2026", "authors": "M. I. Jordan, Kevin Ly, Adalbert Gerald Soosai Raj", "abstract": "Large language models (LLMs) like ChatGPT are changing computing education and may create additional barriers to those already faced by non-native English speakers (NNES) learning computing. We investigate an opportunity for a positive impact of LLMs on NNES through multilingual programming exercise generation. Following previous work with LLM exercise generation in English, we prompt OpenAI GPT-3.5 in 4 natural languages (English, Tamil, Spanish, and Vietnamese) to create introductory programming problems, sample solutions, and test cases. We evaluate these problems on their sensibility, readability, translation, sample solution accuracy, topicality, and cultural relevance. We find that problems generated in English, Spanish, and Vietnamese are largely sensible, easily understood, and accurate in their sample solutions. However, Tamil problems are mostly non-sensible and have a much lower passing test rate, indicating that the abilities of LLMs for problem generation are not generalizable across languages. Our analysis suggests that these problems could not be given verbatim to students, but with minimal effort, most errors can be fixed. We further discuss the benefits of these problems despite their flaws, and their opportunities to provide personalized and culturally relevant resources for students in their native languages.", "venue": "https://doi.org/10.1145/3626252.3630897", "label": 0}, {"loc": [7.1745500564575195, 2.0636110305786133], "openalex_id": "https://openalex.org/W4392576241", "title": "Rapidly Developing High-quality Instruction Data and Evaluation Benchmark for Large Language Models with Minimal Human Effort: A Case Study on Japanese", "authors": "Yikun Sun, Zhen Wan, Nobuhiro Ueda, Sakiko Yahata, Fei Cheng, Chenhui Chu, Sadao Kurohashi", "abstract": "The creation of instruction data and evaluation benchmarks for serving Large language models often involves enormous human annotation. This issue becomes particularly pronounced when rapidly developing such resources for a non-English language like Japanese. Instead of following the popular practice of directly translating existing English resources into Japanese (e.g., Japanese-Alpaca), we propose an efficient self-instruct method based on GPT-4. We first translate a small amount of English instructions into Japanese and post-edit them to obtain native-level quality. GPT-4 then utilizes them as demonstrations to automatically generate Japanese instruction data. We also construct an evaluation benchmark containing 80 questions across 8 categories, using GPT-4 to automatically assess the response quality of LLMs without human references. The empirical results suggest that the models fine-tuned on our GPT-4 self-instruct data significantly outperformed the Japanese-Alpaca across all three base pre-trained models. Our GPT-4 self-instruct data allowed the LLaMA 13B model to defeat GPT-3.5 (Davinci-003) with a 54.37\\% win-rate. The human evaluation exhibits the consistency between GPT-4's assessments and human preference. Our high-quality instruction data and evaluation benchmark have been released here.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.363428592681885, 0.11789978295564651], "openalex_id": "https://openalex.org/W4392539042", "title": "Fostering the Ecosystem of Open Neural Encoders for Portuguese with Albertina PT* Family", "authors": "Rodrigo Santos, Jo\u00e3o Domingos Rodrigues, Lu\u00eds Gomes, Jo\u00e3o Silva, Ant\u00f3nio Branco, Henrique Lopes Cardoso, Tom\u00e1s Freitas Os\u00f3rio, Bernardo Leite", "abstract": "To foster the neural encoding of Portuguese, this paper contributes foundation encoder models that represent an expansion of the still very scarce ecosystem of large language models specifically developed for this language that are fully open, in the sense that they are open source and openly distributed for free under an open license for any purpose, thus including research and commercial usages. Like most languages other than English, Portuguese is low-resourced in terms of these foundational language resources, there being the inaugural 900 million parameter Albertina and 335 million Bertimbau. Taking this couple of models as an inaugural set, we present the extension of the ecosystem of state-of-the-art open encoders for Portuguese with a larger, top performance-driven model with 1.5 billion parameters, and a smaller, efficiency-driven model with 100 million parameters. While achieving this primary goal, further results that are relevant for this ecosystem were obtained as well, namely new datasets for Portuguese based on the SuperGLUE benchmark, which we also distribute openly.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.078515529632568, 0.29387781023979187], "openalex_id": "https://openalex.org/W4392488514", "title": "NusaBERT: Teaching IndoBERT to be Multilingual and Multicultural", "authors": "Wilson Wongso, David Samuel Setiawan, Steven Limcorn, Ananto Joyoadikusumo", "abstract": "Indonesia's linguistic landscape is remarkably diverse, encompassing over 700 languages and dialects, making it one of the world's most linguistically rich nations. This diversity, coupled with the widespread practice of code-switching and the presence of low-resource regional languages, presents unique challenges for modern pre-trained language models. In response to these challenges, we developed NusaBERT, building upon IndoBERT by incorporating vocabulary expansion and leveraging a diverse multilingual corpus that includes regional languages and dialects. Through rigorous evaluation across a range of benchmarks, NusaBERT demonstrates state-of-the-art performance in tasks involving multiple languages of Indonesia, paving the way for future natural language understanding research for under-represented languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.9556498527526855, -1.2640626430511475], "openalex_id": "https://openalex.org/W4392452944", "title": "Understanding latent affective bias in large pre-trained neural language models", "authors": "Anoop Kadan, P Deepak, Sahely Bhadra, Manjary P. Gangan, V. L. Lajish", "abstract": "Groundbreaking inventions and highly significant performance improvements in deep learning based Natural Language Processing are witnessed through the development of transformer based large Pre-trained Language Models (PLMs). The wide availability of unlabeled data within human generated data deluge along with self-supervised learning strategy helps to accelerate the success of large PLMs in language generation, language understanding, etc. But at the same time, latent historical bias/unfairness in human minds towards a particular gender, race, etc., encoded unintentionally/intentionally into the corpora harms and questions the utility and efficacy of large PLMs in many real-world applications, particularly for the protected groups. In this paper, we present an extensive investigation towards understanding the existence of \u201cAffective Bias\u201d in large PLMs to unveil any biased association of emotions such as anger, fear, joy, etc., towards a particular gender, race or religion with respect to the downstream task of textual emotion detection. We conduct our exploration of affective bias from the very initial stage of corpus level affective bias analysis by searching for imbalanced distribution of affective words within a domain, in large scale corpora that are used to pre-train and fine-tune PLMs. Later, to quantify affective bias in model predictions, we perform an extensive set of class-based and intensity-based evaluations using various bias evaluation corpora. Our results show the existence of statistically significant affective bias in the PLM based emotion detection systems, indicating biased association of certain emotions towards a particular gender, race, and religion.", "venue": "Natural Language Processing Journal", "label": 9}, {"loc": [8.244550704956055, 1.7403713464736938], "openalex_id": "https://openalex.org/W4394874714", "title": "Efficiency Comparison of Dataset Generated by LLMs using Machine Learning Algorithms", "authors": "Premraj Pawade, Mohit Rameshchandra Kulkarni, Shreya Naik, Aditya Raut, Kishor S. Wagh", "abstract": "The constantly expanding field of Large Language Models (LLMs) offers exciting opportunities for various domains. These powerful models, such as GPT-3.5, Bard, and Bing, can produce massive amounts of text-based data, creating new avenues for generating synthetic datasets. The primary focus of this research is to explore the effectiveness of LLMs in creating high-quality, structured datasets for different ML applications. Specifically, this study concentrates on password strength prediction. It compares the performance of three prominent LLMs - Bard, ChatGPT, and BingAI - in generating datasets of text-based passwords with their corresponding strength levels. This research uses a diverse set of ML models, including traditional algorithms like XGBoost, Random Forest, etc., to evaluate the generated datasets. The evaluation process assesses their performance, generalization, and adaptability. This research contributes to the growing field of LLM-based data generation by demonstrating their effectiveness in creating valuable datasets for specific machine learning applications. The findings of this study pave the way for further exploration of LLMs' capabilities for diverse data types and tasks, potentially unlocking new avenues for advancements in various machine learning domains.", "venue": "https://doi.org/10.1109/esci59607.2024.10497340", "label": 0}, {"loc": [4.297298431396484, -2.4587466716766357], "openalex_id": "https://openalex.org/W4392490627", "title": "A Survey of AI-generated Text Forensic Systems: Detection, Attribution, and Characterization", "authors": "Tharindu Kumarage, Garima Agrawal, Paras Sheth, Raha Moraffah, Aman Chadha, Joshua Garland, Huan Liu", "abstract": "We have witnessed lately a rapid proliferation of advanced Large Language Models (LLMs) capable of generating high-quality text. While these LLMs have revolutionized text generation across various domains, they also pose significant risks to the information ecosystem, such as the potential for generating convincing propaganda, misinformation, and disinformation at scale. This paper offers a review of AI-generated text forensic systems, an emerging field addressing the challenges of LLM misuses. We present an overview of the existing efforts in AI-generated text forensics by introducing a detailed taxonomy, focusing on three primary pillars: detection, attribution, and characterization. These pillars enable a practical understanding of AI-generated text, from identifying AI-generated content (detection), determining the specific AI model involved (attribution), and grouping the underlying intents of the text (characterization). Furthermore, we explore available resources for AI-generated text forensics research and discuss the evolving challenges and future directions of forensic systems in an AI era.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.787857532501221, -0.6190879344940186], "openalex_id": "https://openalex.org/W4392426167", "title": "Benchmarking zero-shot stance detection with FlanT5-XXL: Insights from training data, prompting, and decoding strategies into its near-SoTA performance", "authors": "Rachith Aiyappa, Shruthi Senthilmani, Jisun An, Haewoon Kwak, Yong\u2010Yeol Ahn", "abstract": "We investigate the performance of LLM-based zero-shot stance detection on tweets. Using FlanT5-XXL, an instruction-tuned open-source LLM, with the SemEval 2016 Tasks 6A, 6B, and P-Stance datasets, we study the performance and its variations under different prompts and decoding strategies, as well as the potential biases of the model. We show that the zero-shot approach can match or outperform state-of-the-art benchmarks, including fine-tuned models. We provide various insights into its performance including the sensitivity to instructions and prompts, the decoding strategies, the perplexity of the prompts, and to negations and oppositions present in prompts. Finally, we ensure that the LLM has not been trained on test datasets, and identify a positivity bias which may partially explain the performance differences across decoding strategie", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.433119535446167, 3.199428081512451], "openalex_id": "https://openalex.org/W4404704696", "title": "Mitigating Bias LLM-Powered Employee Engagement Models: AI Ethics in Enterprise HR Systems", "authors": "Sudheer Devaraju", "abstract": "This article provides a comprehensive technique for incorporating Large Language Models (LLMs) into corporate employee engagement platforms, with an emphasis on technical design, implementation challenges, and longitudinal effect analysis. We examine sophisticated fine-tuning methods, such as bias mitigation strategies and privacy-preserving approaches, using proprietary HR datasets. The report emphasizes significant improvements in operational efficiency, with AI-powered HR solutions showing a 32% improvement in process optimization and 91.2% accuracy in employee feedback analysis across many languages. To address significant concerns about data privacy, scalability, and long-term efficacy, our system employs a multi-layered approach that incorporates federated learning implementations, differential privacy techniques, and robust security mechanisms. The implementation outcomes show notable benefits, including a 34% rise in employee satisfaction metrics and a 41% reduction in time-toinsight for HR analytics, while closely conforming to GDPR and CCPA laws.", "venue": "Turkish Journal of Computer and Mathematics Education (TURCOMAT).", "label": 0}, {"loc": [4.876494407653809, 0.4211592376232147], "openalex_id": "https://openalex.org/W4392427710", "title": "Open Assistant Toolkit--version 2", "authors": "Sophie Fischer, F. Rossetto, Carlos Gemmell, Andrew Ramsay, I C Mackie, Philip Zubel, Niklas Tecklenburg, Jeff Dalton", "abstract": "We present the second version of the Open Assistant Toolkit (OAT-v2), an open-source task-oriented conversational system for composing generative neural models. OAT-v2 is a scalable and flexible assistant platform supporting multiple domains and modalities of user interaction. It splits processing a user utterance into modular system components, including submodules such as action code generation, multimodal content retrieval, and knowledge-augmented response generation. Developed over multiple years of the Alexa TaskBot challenge, OAT-v2 is a proven system that enables scalable and robust experimentation in experimental and real-world deployment. OAT-v2 provides open models and software for research and commercial applications to enable the future of multimodal virtual assistants across diverse applications and types of rich interaction.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.144939422607422, 1.3578194379806519], "openalex_id": "https://openalex.org/W4392402185", "title": "Datasets for Large Language Models: A Comprehensive Survey", "authors": "Yang Liu, Jiahuan Cao, Chongyu Liu, Kai Ding, Lianwen Jin", "abstract": "Abstract This paper embarks on an exploration into the Large Language Model (LLM) datasets, which play a crucial role in the remarkable advancements of LLMs. The datasets serve as the foundational infrastructure analogous to a root system that sustains and nurtures the development of LLMs. Consequently, examination of these datasets emerges as a critical topic in research. In order to address the current lack of a comprehensive overview and thorough analysis of LLM datasets, and to gain insights into their current status and future trends, this survey consolidates and categorizes the fundamental aspects of LLM datasets from five perspectives: (1) Pre-training Corpora; (2) Instruction Fine-tuning Datasets; (3) Preference Datasets; (4) Evaluation Datasets; (5) Traditional Natural Language Processing (NLP) Datasets. The survey sheds light on the prevailing challenges and points out potential avenues for future investigation. Additionally, a comprehensive review of the existing available dataset resources is also provided, including statistics from 444 datasets, covering 8 language categories and spanning 32 domains. Information from 20 dimensions is incorporated into the dataset statistics. The total data size surveyed surpasses 774.5 TB for pre-training corpora and 700M instances for other datasets. We aim to present the entire landscape of LLM text datasets, serving as a comprehensive reference for researchers in this field and contributing to future studies. Related resources are available at: \\href{https://github.com/lmmlzn/Awesome-LLMs-Datasets}{https://github.com/lmmlzn/Awesome-LLMs-Datasets}.", "venue": "https://doi.org/10.21203/rs.3.rs-3996137/v1", "label": 0}, {"loc": [2.8155815601348877, 1.2258273363113403], "openalex_id": "https://openalex.org/W4392356510", "title": "Designing a course for pre-service science teachers using ChatGPT: what ChatGPT brings to the table", "authors": "Hasan Z\u00fcht\u00fc Okulu, Nilay Muslu", "abstract": "ChatGPT holds significant potential for enhancing learning through integration into education as an advanced chatbot. With the goal of harnessing this potential, our research focused on exploring the utilization of ChatGPT in designing a course plan for pre-service science teachers. We adopted a qualitative research approach and employed ChatGPT as an assistant to create a course plan for classroom assessment in science education. Our conversation to create this plan served as our data source for document analysis. We conducted interpretive analysis for qualitative data. The findings emphasized the benefits of ChatGPT in developing an implementable course plan, delivering adaptable information, and time saving. However, there were limitations to consider. These challenges encompassed issues such as communicating out of ChatGPT and the possibility of miscommunication. Despite these limitations, the research findings clearly demonstrate that ChatGPT is highly effective in developing a course plan. As researchers who have personally experienced the process of creating a course plan using ChatGPT, we believe that its potential needs to be maximally utilized. We suggest its application across different subjects and disciplines to thoroughly examine its strengths and weaknesses in depth.", "venue": "Interactive Learning Environments", "label": 0}, {"loc": [3.2214155197143555, 2.1674883365631104], "openalex_id": "https://openalex.org/W4393052393", "title": "Navigating the ethical landscape behind ChatGPT", "authors": "Lizhi Peng, Bo Zhao", "abstract": "In this commentary, we examine the key ethical concerns arising from the rapid penetration and proliferation of generative artificial intelligence (AI), with ChatGPT as a prominent case study. Our analysis is structured around four pivotal themes: the debates on plagiarism and authorship in AI-generated content; the underlying power dynamics that shape biases in AI development; the dynamic, complex relationships between humans and machines; and the growing concerns over unchecked progress and the absence of accountability in the rapidly intensifying AI \u201cArms Race.\u201d Recognizing the necessity for ethical alignment in AI, yet without a clear consensus of \u201chuman interests,\u201d gives room for further exacerbating global inequalities, we advocate for enhanced transparency and increased public involvement in AI development and deployment processes. This article underscores the importance of engaging a diverse range of voices, especially those from communities traditionally uninvolved or excluded from the dialogue on AI development. By doing so, we aim to foster a more inclusive and multidisciplinary approach to understanding and shaping the trajectory of AI technologies, ensuring that their benefits are equitably shared, and their risks carefully managed.", "venue": "Big Data & Society", "label": 0}, {"loc": [9.523477554321289, 1.745936632156372], "openalex_id": "https://openalex.org/W4392425549", "title": "Large Language Models (LLMs) on Tabular Data: Predic-tion, Generation, and Understanding-A Survey", "authors": "Xi Fang, Weijie Xu, Fiona Anting Tan, Jiani Zhang, Ziqing Hu, Yanjun Qi, Scott Nickleach, Diego A. Socolinsky, Srinivasan H. Sengamedu, Christos Faloutsos", "abstract": "Recent breakthroughs in large language modeling have facilitated rigorous exploration of their application in diverse tasks related to tabular data modeling, such as prediction, tabular data synthesis, question answering, and table understanding. Each task presents unique challenges and opportunities. However, there is currently a lack of comprehensive review that summarizes and compares the key techniques, metrics, datasets, models, and optimization approaches in this research domain. This survey aims to address this gap by consolidating recent progress in these areas, offering a thorough survey and taxonomy of the datasets, metrics, and methodologies utilized. It identifies strengths, limitations, unexplored territories, and gaps in the existing literature, while providing some insights for future research directions in this vital and rapidly evolving field. It also provides relevant code and datasets references. Through this comprehensive review, we hope to provide interested readers with pertinent references and insightful perspectives, empowering them with the necessary tools and knowledge to effectively navigate and address the prevailing challenges in the field.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.578315734863281, 1.720399022102356], "openalex_id": "https://openalex.org/W4392425549", "title": "Large Language Models on Tabular Data--A Survey", "authors": "Xi Fang, Weijie Xu, Fiona Anting Tan, Jiani Zhang, Ziqing Hu, Yanjun Qi, Scott Nickleach, Diego A. Socolinsky, Srinivasan H. Sengamedu, Christos Faloutsos", "abstract": "Recent breakthroughs in large language modeling have facilitated rigorous exploration of their application in diverse tasks related to tabular data modeling, such as prediction, tabular data synthesis, question answering, and table understanding. Each task presents unique challenges and opportunities. However, there is currently a lack of comprehensive review that summarizes and compares the key techniques, metrics, datasets, models, and optimization approaches in this research domain. This survey aims to address this gap by consolidating recent progress in these areas, offering a thorough survey and taxonomy of the datasets, metrics, and methodologies utilized. It identifies strengths, limitations, unexplored territories, and gaps in the existing literature, while providing some insights for future research directions in this vital and rapidly evolving field. It also provides relevant code and datasets references. Through this comprehensive review, we hope to provide interested readers with pertinent references and insightful perspectives, empowering them with the necessary tools and knowledge to effectively navigate and address the prevailing challenges in the field.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.8880486488342285, -0.160598024725914], "openalex_id": "https://openalex.org/W4392293423", "title": "A bilingual benchmark for evaluating large language models", "authors": "Mohamed Alkaoud", "abstract": "This work introduces a new benchmark for the bilingual evaluation of large language models (LLMs) in English and Arabic. While LLMs have transformed various fields, their evaluation in Arabic remains limited. This work addresses this gap by proposing a novel evaluation method for LLMs in both Arabic and English, allowing for a direct comparison between the performance of the two languages. We build a new evaluation dataset based on the General Aptitude Test (GAT), a standardized test widely used for university admissions in the Arab world, that we utilize to measure the linguistic capabilities of LLMs. We conduct several experiments to examine the linguistic capabilities of ChatGPT and quantify how much better it is at English than Arabic. We also examine the effect of changing task descriptions from Arabic to English and vice-versa. In addition to that, we find that fastText can surpass ChatGPT in finding Arabic word analogies. We conclude by showing that GPT-4 Arabic linguistic capabilities are much better than ChatGPT\u2019s Arabic capabilities and are close to ChatGPT\u2019s English capabilities.", "venue": "PeerJ Computer Science", "label": 4}, {"loc": [5.961742877960205, 4.1042633056640625], "openalex_id": "https://openalex.org/W4392363989", "title": "Generative AI for Unmanned Vehicle Swarms: Challenges, Applications and Opportunities", "authors": "Guangyuan Liu, Nguy\u1ec5n V\u0103n Huynh, Hongyang Du, Dinh Thai Hoang, Dusit Niyato, Kun Zhu, Jiawen Kang, Zehui Xiong, Abbas Jamalipour, Dong In Kim", "abstract": "With recent advances in artificial intelligence (AI) and robotics, unmanned vehicle swarms have received great attention from both academia and industry due to their potential to provide services that are difficult and dangerous to perform by humans. However, learning and coordinating movements and actions for a large number of unmanned vehicles in complex and dynamic environments introduce significant challenges to conventional AI methods. Generative AI (GAI), with its capabilities in complex data feature extraction, transformation, and enhancement, offers great potential in solving these challenges of unmanned vehicle swarms. For that, this paper aims to provide a comprehensive survey on applications, challenges, and opportunities of GAI in unmanned vehicle swarms. Specifically, we first present an overview of unmanned vehicles and unmanned vehicle swarms as well as their use cases and existing issues. Then, an in-depth background of various GAI techniques together with their capabilities in enhancing unmanned vehicle swarms are provided. After that, we present a comprehensive review on the applications and challenges of GAI in unmanned vehicle swarms with various insights and discussions. Finally, we highlight open issues of GAI in unmanned vehicle swarms and discuss potential research directions.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.673618316650391, 0.08872268348932266], "openalex_id": "https://openalex.org/W4394842475", "title": "Frozen or Fine-tuned? Analyzing Deep Learning Models and Training Strategies for Optimizing Big Five Personality Traits Prediction from Text", "authors": "Masoud Soleimani, Hamidreza Baradaran Kashani", "abstract": "In today's digital age, the comprehension and prediction of human personality traits have assumed paramount significance. This study embarks on the task of forecasting the Big Five personality traits through textual data, harnessing the capabilities of advanced natural language processing models. The focal dataset is the ChaLearn First Impressions V2, a treasure trove of human-generated text coupled with Big Five personality trait labels. A diverse array of models undergo scrutiny, ranging from basic deep learning models like Deep Pyramid Convolutional Neural Network (DPCNN) and Hierarchical Attention Network (HAN) to cutting-edge transformer-based architectures such as BERT and FLAN-T5. These models undergo meticulous evaluation across various training scenarios, spanning scenarios where all layers are fine-tuned, only the embedding layer is freezed, and the complete layer freezing, with exclusive attention to Transformer models. Notably, models such as DPCNN and HAN emerge as stars, boasting remarkable accuracy attributable to their prowess in hierarchical feature extraction. Conversely, Transformer models like ELECTRA shine when layers remain frozen, showcasing their exceptional contextual comprehension. Furthermore, the study employs word clouds to visually encapsulate the essence of each Big Five personality trait, unraveling intricate relationships between specific words and these traits. The findings underscore the intricate interplay among model architecture, training methodologies, and layer freezing, offering valuable insights into strategies that yield optimal performance in predicting personality traits. In an age dominated by digital communication, this research contributes significantly to our understanding and prediction of human personalities.", "venue": "https://doi.org/10.1109/qicar61538.2024.10496606", "label": 0}, {"loc": [4.765100479125977, 2.270214796066284], "openalex_id": "https://openalex.org/W4392952534", "title": "Extracting intersectional stereotypes from embeddings: Developing and validating the Flexible Intersectional Stereotype Extraction procedure", "authors": "Tessa Elizabeth Sadie Charlesworth, Kshitish Ghate, Aylin Caliskan, Mahzarin R. Banaji", "abstract": "Abstract Social group\u2013based identities intersect. The meaning of \u201cwoman\u201d is modulated by adding social class as in \u201crich woman\u201d or \u201cpoor woman.\u201d How does such intersectionality operate at-scale in everyday language? Which intersections dominate (are most frequent)? What qualities (positivity, competence, warmth) are ascribed to each intersection? In this study, we make it possible to address such questions by developing a stepwise procedure, Flexible Intersectional Stereotype Extraction (FISE), applied to word embeddings (GloVe; BERT) trained on billions of words of English Internet text, revealing insights into intersectional stereotypes. First, applying FISE to occupation stereotypes across intersections of gender, race, and class showed alignment with ground-truth data on occupation demographics, providing initial validation. Second, applying FISE to trait adjectives showed strong androcentrism (Men) and ethnocentrism (White) in dominating everyday English language (e.g. White + Men are associated with 59% of traits; Black + Women with 5%). Associated traits also revealed intersectional differences: advantaged intersectional groups, especially intersections involving Rich, had more common, positive, warm, competent, and dominant trait associates. Together, the empirical insights from FISE illustrate its utility for transparently and efficiently quantifying intersectional stereotypes in existing large text corpora, with potential to expand intersectionality research across unprecedented time and place. This project further sets up the infrastructure necessary to pursue new research on the emergent properties of intersectional identities.", "venue": "PNAS Nexus", "label": 0}, {"loc": [6.430003643035889, 5.403943061828613], "openalex_id": "https://openalex.org/W4392340604", "title": "Hierarchical Multimodal Pre-training for Visually Rich Webpage Understanding", "authors": "Hongshen Xu, Lu Chen, Zihan Zhao, Da Ma, Ruisheng Cao, Zichen Zhu, Kai Yu", "abstract": "The growing prevalence of visually rich documents, such as webpages and scanned/digital-born documents (images, PDFs, etc.), has led to increased interest in automatic document understanding and information extraction across academia and industry. Although various document modalities, including image, text, layout, and structure, facilitate human information retrieval, the interconnected nature of these modalities presents challenges for neural networks. In this paper, we introduce WebLM, a multimodal pre-training network designed to address the limitations of solely modeling text and structure modalities of HTML in webpages. Instead of processing document images as unified natural images, WebLM integrates the hierarchical structure of document images to enhance the understanding of markup-language-based documents. Additionally, we propose several pre-training tasks to model the interaction among text, structure, and image modalities effectively. Empirical results demonstrate that the pre-trained WebLM significantly surpasses previous state-of-the-art pre-trained models across several webpage understanding tasks. The pre-trained models and code are available at https://github.com/X-LANCE/weblm.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.333588123321533, 1.257051944732666], "openalex_id": "https://openalex.org/W4392293038", "title": "\u201cMore than Words\u201d: A Legal Approach to the Risks of Commercial Chatbots Powered by Generative Artificial Intelligence", "authors": "Sara Migliorini", "abstract": "Abstract The recent commercial release of a new generation of chatbot systems, particularly those leveraging Transformer-based large language models (LLMs) such as ChatGPT, has caught the world by surprise and sparked debate about their potential consequences for society. While concerns about the existential threat posed by these technologies are often discussed, it is crucial to shift our focus towards the more immediate risks associated with their deployment. Such risks are further compounded by the lack of proactive measures addressing users\u2019 literacy and the for-profit model via which these chatbots are distributed. Drawing on research in computer science and other fields, this paper looks at the immediate risks triggered by these products and reflects on the role of law within a broader policy directed at steering generative artificial intelligence technology towards the common good. It also reviews the relevant amendments proposed by the European Parliament to the European Commission\u2019s proposal for an AI Act.", "venue": "European Journal of Risk Regulation", "label": 0}, {"loc": [7.507065296173096, -0.6510420441627502], "openalex_id": "https://openalex.org/W4392271855", "title": "LLM-Datasets: An Open Framework for Pretraining Datasets of Large Language Models", "authors": "Pierre Colombo, Duarte Alves, Jos\u00e9 P. Pombal, Nuno Guerreiro, Pedro Martins, Joao Alves, M. Amin Farajian, Ben Peters, Ricardo Rei, Patrick Fernandes, Shyam Sunder Agrawal, J De, Andr\u00e9 F. T. Martins", "abstract": "While general-purpose large language models (LLMs) demonstrate proficiency on multiple tasks within the domain of translation, approaches based on open LLMs are competitive only when specializing on a single task. In this paper, we propose a recipe for tailoring LLMs to multiple tasks present in translation workflows. We perform continued pretraining on a multilingual mixture of monolingual and parallel data, creating TowerBase, followed by finetuning on instructions relevant for translation processes, creating TowerInstruct. Our final model surpasses open alternatives on several tasks relevant to translation workflows and is competitive with general-purpose closed LLMs. To facilitate future research, we release the Tower models, our specialization dataset, an evaluation framework for LLMs focusing on the translation ecosystem, and a collection of model generations, including ours, on our benchmark.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.281728744506836, 0.17330890893936157], "openalex_id": "https://openalex.org/W4392270676", "title": "Multi-Task Contrastive Learning for 8192-Token Bilingual Text Embeddings", "authors": "Isabelle Mohr, Markus Krimmel, Saba Sturua, Mohammad Kalim Akram, Andreas Koukounas, Michael G\u00fcnther, Georgios Mastrapas, Vinit Ravishankar, Joan Fontanals Mart\u00ednez, Feng Wang, Qi Liu, Yu Ziniu, Jie Fu, Saahil Ognawala, Susana Guzman, Bo Wang, Maximilian Werk, Nan Wang, Han Xiao", "abstract": "We introduce a novel suite of state-of-the-art bilingual text embedding models that are designed to support English and another target language. These models are capable of processing lengthy text inputs with up to 8192 tokens, making them highly versatile for a range of natural language processing tasks such as text retrieval, clustering, and semantic textual similarity (STS) calculations. By focusing on bilingual models and introducing a unique multi-task learning objective, we have significantly improved the model performance on STS tasks, which outperforms the capabilities of existing multilingual models in both target language understanding and cross-lingual evaluation tasks. Moreover, our bilingual models are more efficient, requiring fewer parameters and less memory due to their smaller vocabulary needs. Furthermore, we have expanded the Massive Text Embedding Benchmark (MTEB) to include benchmarks for German and Spanish embedding models. This integration aims to stimulate further research and advancement in text embedding technologies for these languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.27457857131958, 0.24143189191818237], "openalex_id": "https://openalex.org/W4392341317", "title": "Nemotron-4 15B Technical Report", "authors": "Jupinder Parmar, Shrimai Prabhumoye, Joseph Jennings, Mostofa Patwary, Sandeep Subramanian, Dan Su, Chen Zhu, Deepak Narayanan, Aastha Jhunjhunwala, Ayush Dattagupta, Vibhu Jawa, Jiwei Liu, Ameya Sunil Mahabaleshwarkar, Osvald Nitski, Annika Brundyn, James Maki, M. J. Martinez, Jiaxuan You, John Kamalu, Patrick LeGresley, Denys Fridman, Jared Casper, Ashwath Aithal, Oleksii Kuchaiev, Mohammad Shoeybi, Jonathan Cohen, Bryan Catanzaro", "abstract": "We introduce Nemotron-4 15B, a 15-billion-parameter large multilingual language model trained on 8 trillion text tokens. Nemotron-4 15B demonstrates strong performance when assessed on English, multilingual, and coding tasks: it outperforms all existing similarly-sized open models on 4 out of 7 downstream evaluation areas and achieves competitive performance to the leading open models in the remaining ones. Specifically, Nemotron-4 15B exhibits the best multilingual capabilities of all similarly-sized models, even outperforming models over four times larger and those explicitly specialized for multilingual tasks.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.8533782958984375, 0.4257608652114868], "openalex_id": "https://openalex.org/W4392224283", "title": "From Text to Transformation: A Comprehensive Review of Large Language Models' Versatility", "authors": "Pravneet Kaur, Gautam Siddharth Kashyap, Ankit Kumar, Md Tabrez Nafis, Sandeep Kumar, Vikrant Shokeen", "abstract": "This groundbreaking study explores the expanse of Large Language Models (LLMs), such as Generative Pre-Trained Transformer (GPT) and Bidirectional Encoder Representations from Transformers (BERT) across varied domains ranging from technology, finance, healthcare to education. Despite their established prowess in Natural Language Processing (NLP), these LLMs have not been systematically examined for their impact on domains such as fitness, and holistic well-being, urban planning, climate modelling as well as disaster management. This review paper, in addition to furnishing a comprehensive analysis of the vast expanse and extent of LLMs' utility in diverse domains, recognizes the research gaps and realms where the potential of LLMs is yet to be harnessed. This study uncovers innovative ways in which LLMs can leave a mark in the fields like fitness and wellbeing, urban planning, climate modelling and disaster response which could inspire future researches and applications in the said avenues.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.47445297241211, 2.266871929168701], "openalex_id": "https://openalex.org/W4392223203", "title": "A Survey on Data Selection for Language Models", "authors": "Alon Albalak, Yanai Elazar, Sang Michael Xie, Shayne Longpre, Nathan Lambert, Xinyi Wang, Niklas Muennighoff, Bairu Hou, Liangming Pan, Haewon Jeong, Colin Raffel, Shiyu Chang, Tatsunori Hashimoto, William Yang Wang", "abstract": "A major factor in the recent success of large language models is the use of enormous and ever-growing text datasets for unsupervised pre-training. However, naively training a model on all available data may not be optimal (or feasible), as the quality of available text data can vary. Filtering out data can also decrease the carbon footprint and financial costs of training models by reducing the amount of training required. Data selection methods aim to determine which candidate data points to include in the training dataset and how to appropriately sample from the selected data points. The promise of improved data selection methods has caused the volume of research in the area to rapidly expand. However, because deep learning is mostly driven by empirical evidence and experimentation on large-scale data is expensive, few organizations have the resources for extensive data selection research. Consequently, knowledge of effective data selection practices has become concentrated within a few organizations, many of which do not openly share their findings and methodologies. To narrow this gap in knowledge, we present a comprehensive review of existing literature on data selection methods and related research areas, providing a taxonomy of existing approaches. By describing the current landscape of research, this work aims to accelerate progress in data selection by establishing an entry point for new and established researchers. Additionally, throughout this review we draw attention to noticeable holes in the literature and conclude the paper by proposing promising avenues for future research.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.927159309387207, 3.5489084720611572], "openalex_id": "https://openalex.org/W4392189166", "title": "Accelerating Spiking Neural Networks with Parallelizable Leaky Integrate-and-Fire Neurons", "authors": "Sidi Yaya Arnaud Yarga, Sean U. N. Wood", "abstract": "Spiking Neural Networks (SNNs) express higher biological plausibility and excel at learning spatiotemporal features while consuming less energy than conventional Artificial Neural Networks (ANNs), particularly on neuromorphic hardware. The Leaky Integrate-and-Fire (LIF) neuron stands out as one of the most widely used spiking neurons in deep learning. However, its sequential information processing leads to slow training on lengthy sequences, presenting a critical challenge for real-world applications that rely on extensive datasets. This paper introduces the Parallelizable Leaky Integrate-and-Fire (ParaLIF) neuron, which accelerates SNNs by parallelizing their simulation over time, for both feedforward and recurrent architectures. When compared to LIF in neuromorphic speech, image and gesture classification tasks, ParaLIF demonstrates speeds up to 200 times faster and, on average, achieves greater accuracy with similar sparsity. Integrated into a state-of-the-art architecture, ParaLIF's accuracy matches the highest reported performance in the literature on the Spiking Heidelberg Digits (SHD) dataset. These findings highlight ParaLIF as a promising approach for the development of rapid, accurate and energy-efficient SNNs, particularly well-suited for handling massive datasets containing long sequences.", "venue": "https://doi.org/10.36227/techrxiv.170905886.62702188/v1", "label": 0}, {"loc": [3.2610130310058594, 2.346477746963501], "openalex_id": "https://openalex.org/W4392236953", "title": "Human-Centered Interaction in Virtual Worlds: A New Era of Generative Artificial Intelligence and Metaverse", "authors": "Yuying Wang, Le Wang, Keng Siau", "abstract": "The metaverse has emerged as an exciting new paradigm for human-computer interaction (HCI) and virtual collaboration. This paper presents a comprehensive review of the metaverse to address the gap in the existing literature where there is a lack of a survey that reviews the nature of the metaverse and its building blocks from a human-centric perspective. We first synthesize a definition of the metaverse from existing literature and delineate key affordances. We then introduce a detailed framework encompassing the metaverse's nature, infrastructure technologies, and input/output technologies that facilitate multi-sensory HCI, alongside applications across diverse domains. The components within this framework are explained in depth, offering insights into the metaverse's nature and the readiness level of current technologies. Based on this comprehensive analysis, we outline major open challenges and propose promising directions demanding further exploration and investigation. By clarifying the vision for the metaverse and characterizing the building blocks required to realize it, this review provides essential insights and serves as an invaluable resource for metaverse developers and researchers working to advance this transformative new medium.", "venue": "International Journal of Human-Computer Interaction", "label": 31}, {"loc": [8.753622055053711, -1.0621494054794312], "openalex_id": "https://openalex.org/W4392181850", "title": "MultiLS: A Multi-task Lexical Simplification Framework", "authors": "Kai North, Tharindu Ranasinghe, Matthew Shardlow, Marcos Zampieri", "abstract": "Lexical Simplification (LS) automatically replaces difficult to read words for easier alternatives while preserving a sentence's original meaning. LS is a precursor to Text Simplification with the aim of improving text accessibility to various target demographics, including children, second language learners, individuals with reading disabilities or low literacy. Several datasets exist for LS. These LS datasets specialize on one or two sub-tasks within the LS pipeline. However, as of this moment, no single LS dataset has been developed that covers all LS sub-tasks. We present MultiLS, the first LS framework that allows for the creation of a multi-task LS dataset. We also present MultiLS-PT, the first dataset to be created using the MultiLS framework. We demonstrate the potential of MultiLS-PT by carrying out all LS sub-tasks of (1). lexical complexity prediction (LCP), (2). substitute generation, and (3). substitute ranking for Portuguese. Model performances are reported, ranging from transformer-based models to more recent large language models (LLMs).", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.337900161743164, 5.407732009887695], "openalex_id": "https://openalex.org/W4392182296", "title": "Representing Online Handwriting for Recognition in Large Vision-Language Models", "authors": "Anastasiia Fadeeva, Philippe Schlattner, Andrii Maksai, Mark Collier, Efi Kokiopoulou, Jesse Berent, Claudiu Musat", "abstract": "The adoption of tablets with touchscreens and styluses is increasing, and a key feature is converting handwriting to text, enabling search, indexing, and AI assistance. Meanwhile, vision-language models (VLMs) are now the go-to solution for image understanding, thanks to both their state-of-the-art performance across a variety of tasks and the simplicity of a unified approach to training, fine-tuning, and inference. While VLMs obtain high performance on image-based tasks, they perform poorly on handwriting recognition when applied naively, i.e., by rendering handwriting as an image and performing optical character recognition (OCR). In this paper, we study online handwriting recognition with VLMs, going beyond naive OCR. We propose a novel tokenized representation of digital ink (online handwriting) that includes both a time-ordered sequence of strokes as text, and as image. We show that this representation yields results comparable to or better than state-of-the-art online handwriting recognizers. Wide applicability is shown through results with two different VLM families, on multiple public datasets. Our approach can be applied to off-the-shelf VLMs, does not require any changes in their architecture, and can be used in both fine-tuning and parameter-efficient tuning. We perform a detailed ablation study to identify the key elements of the proposed representation.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.735508441925049, 2.1722638607025146], "openalex_id": "https://openalex.org/W4392120739", "title": "On the Tip of the Tongue: Analyzing Conceptual Representation in Large Language Models with Reverse-Dictionary Probe", "authors": "Ningyu Xu, Qi Zhang, Menghan Zhang, Peng Qian, Xuanjing Huang", "abstract": "Probing and enhancing large language models' reasoning capacity remains a crucial open question. Here we re-purpose the reverse dictionary task as a case study to probe LLMs' capacity for conceptual inference. We use in-context learning to guide the models to generate the term for an object concept implied in a linguistic description. Models robustly achieve high accuracy in this task, and their representation space encodes information about object categories and fine-grained features. Further experiments suggest that the conceptual inference ability as probed by the reverse-dictionary task predicts model's general reasoning performance across multiple benchmarks, despite similar syntactic generalization behaviors across models. Explorative analyses suggest that prompting LLMs with description$\\Rightarrow$word examples may induce generalization beyond surface-level differences in task construals and facilitate models on broader commonsense reasoning problems.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.693218231201172, 2.6108720302581787], "openalex_id": "https://openalex.org/W4392120641", "title": "Take the Bull by the Horns: Hard Sample-Reweighted Continual Training Improves LLM Generalization", "authors": "Xuxi Chen, Zhendong Wang, Daouda Sow, Junjie Yang, Tianlong Chen, Yingbin Liang, Mingyuan Zhou, Zhangyang Wang", "abstract": "In the rapidly advancing arena of large language models (LLMs), a key challenge is to enhance their capabilities amid a looming shortage of high-quality training data. Our study starts from an empirical strategy for the light continual training of LLMs using their original pre-training data sets, with a specific focus on selective retention of samples that incur moderately high losses. These samples are deemed informative and beneficial for model refinement, contrasting with the highest-loss samples, which would be discarded due to their correlation with data noise and complexity. We then formalize this strategy into a principled framework of Instance-Reweighted Distributionally Robust Optimization (IR-DRO). IR-DRO is designed to dynamically prioritize the training focus on informative samples through an instance reweighting mechanism, streamlined by a closed-form solution for straightforward integration into established training protocols. Through rigorous experimentation with various models and datasets, our findings indicate that our sample-targeted methods significantly improve LLM performance across multiple benchmarks, in both continual pre-training and instruction tuning scenarios. Our codes are available at https://github.com/VITA-Group/HardFocusTraining.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.797760248184204, -3.966057538986206], "openalex_id": "https://openalex.org/W4392108773", "title": "Empowering hate speech detection: leveraging linguistic richness and deep learning", "authors": "I Gde Bagus Janardana Abasan, Erwin Budi Setiawan", "abstract": "Social media has become a vital part of most modern human personal life. Twitter is one of the social media that was formed from the development of communication technology. A lot of social media gives users the freedom to express themselves. This facility is misused by users, so hate speech is spread. Designing a system to detect hate speech intelligently is needed. This study uses the hybrid deep learning (HDL) and solo deep learning (SDL) approach with the convolutional neural networks (CNN) and bidirectional gated recurrent unit (Bi-GRU) algorithm. There are 4 models built, namely CNN, Bi-GRU, CNN+Bi-GRU, and Bi-GRU+CNN. Term frequency-inverse document frequency (TF-IDF) is used for feature extraction, which is to get linguistic features to be analyzed and studied. FastText is used to perform feature expansion to minimize mismatched vocabulary. Four scenarios are run. CNN with an accuracy of 87.63%, Bi-GRU produces an accuracy of 87.46%, CNN+Bi-GRU provides an accuracy of 87.47% and Bi-GRU+CNN provides an accuracy of 87.34%. The ability of this approach to understand the context is qualified. HDL outperforms SDL in terms of n-gram type, where HDL can understand sentences broken down by hybrid n-gram types, namely Unigram-Bigram-Trigram which is a complex n-gram hybrid.", "venue": "Bulletin of Electrical Engineering and Informatics", "label": 0}, {"loc": [7.599968433380127, -0.04390920698642731], "openalex_id": "https://openalex.org/W4392089900", "title": "MlingConf: A Comprehensive Study of Multilingual Confidence Estimation on Large Language Models", "authors": "Boyang Xue, Hongru Wang, Weichao Wang, Rui Wang, Sheng Wang, Zeming Liu, Kam\u2010Fai Wong", "abstract": "The tendency of Large Language Models (LLMs) to generate hallucinations raises concerns regarding their reliability. Therefore, confidence estimations indicating the extent of trustworthiness of the generations become essential. However, current LLM confidence estimations in languages other than English remain underexplored. This paper addresses this gap by introducing a comprehensive investigation of Multilingual Confidence estimation (MlingConf) on LLMs, focusing on both language-agnostic (LA) and language-specific (LS) tasks to explore the performance and language dominance effects of multilingual confidence estimations on different tasks. The benchmark comprises four meticulously checked and human-evaluated high-quality multilingual datasets for LA tasks and one for the LS task tailored to specific social, cultural, and geographical contexts of a language. Our experiments reveal that on LA tasks English exhibits notable linguistic dominance in confidence estimations than other languages, while on LS tasks, using question-related language to prompt LLMs demonstrates better linguistic dominance in multilingual confidence estimations. The phenomena inspire a simple yet effective native-tone prompting strategy by employing language-specific prompts for LS tasks, effectively improving LLMs' reliability and accuracy in LS scenarios.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.8865115642547607, -0.6358808875083923], "openalex_id": "https://openalex.org/W4392012207", "title": "DrBenchmark: A Large Language Understanding Evaluation Benchmark for French Biomedical Domain", "authors": "Yanis Labrak, Adrien Bazoge, Oumaima El Khettari, Micka\u00ebl Rouvier, Pac\u00f4me Constant Dit Beaufils, Natalia Grabar, B\u00e9atrice Daille, Solen Quiniou, Emmanuel Morin, Pierre\u2010antoine Gourraud, Richard Dufour", "abstract": "The biomedical domain has sparked a significant interest in the field of Natural Language Processing (NLP), which has seen substantial advancements with pre-trained language models (PLMs). However, comparing these models has proven challenging due to variations in evaluation protocols across different models. A fair solution is to aggregate diverse downstream tasks into a benchmark, allowing for the assessment of intrinsic PLMs qualities from various perspectives. Although still limited to few languages, this initiative has been undertaken in the biomedical field, notably English and Chinese. This limitation hampers the evaluation of the latest French biomedical models, as they are either assessed on a minimal number of tasks with non-standardized protocols or evaluated using general downstream tasks. To bridge this research gap and account for the unique sensitivities of French, we present the first-ever publicly available French biomedical language understanding benchmark called DrBenchmark. It encompasses 20 diversified tasks, including named-entity recognition, part-of-speech tagging, question-answering, semantic textual similarity, and classification. We evaluate 8 state-of-the-art pre-trained masked language models (MLMs) on general and biomedical-specific data, as well as English specific MLMs to assess their cross-lingual capabilities. Our experiments reveal that no single model excels across all tasks, while generalist models are sometimes still competitive.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.021503448486328, 0.8641711473464966], "openalex_id": "https://openalex.org/W4392083022", "title": "PIRB: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods", "authors": "S\u0142awomir Dadas, Micha\u0142 Pere\u0142kiewicz, Rafa\u0142 Po\u015bwiata", "abstract": "We present Polish Information Retrieval Benchmark (PIRB), a comprehensive evaluation framework encompassing 41 text information retrieval tasks for Polish. The benchmark incorporates existing datasets as well as 10 new, previously unpublished datasets covering diverse topics such as medicine, law, business, physics, and linguistics. We conduct an extensive evaluation of over 20 dense and sparse retrieval models, including the baseline models trained by us as well as other available Polish and multilingual methods. Finally, we introduce a three-step process for training highly effective language-specific retrievers, consisting of knowledge distillation, supervised fine-tuning, and building sparse-dense hybrid retrievers using a lightweight rescoring model. In order to validate our approach, we train new text encoders for Polish and compare their results with previously evaluated methods. Our dense models outperform the best solutions available to date, and the use of hybrid methods further improves their performance.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.448019027709961, 0.8720288276672363], "openalex_id": "https://openalex.org/W4392126143", "title": "LongWanjuan: Towards Systematic Measurement for Long Text Quality", "authors": "Kai Lv, Xiaoran Liu, Qipeng Guo, Hang Yan, Conghui He, Xipeng Qiu, Dahua Lin", "abstract": "The quality of training data are crucial for enhancing the long-text capabilities of foundation models. Despite existing efforts to refine data quality through heuristic rules and evaluations based on data diversity and difficulty, there's a lack of systematic approaches specifically tailored for assessing long texts. Addressing this gap, our work systematically measures the quality of long texts by evaluating three fundamental linguistic dimensions: coherence, cohesion, and complexity. Drawing inspiration from the aforementioned three dimensions, we introduce a suite of metrics designed to evaluate the quality of long texts, encompassing both statistical and pre-trained language model-based ones. Leveraging these metrics, we present LongWanjuan, a bilingual dataset specifically tailored to enhance the training of language models for long-text tasks with over 160B tokens. In LongWanjuan, we categorize long texts into holistic, aggregated, and chaotic types, enabling a detailed analysis of long-text quality. Furthermore, we devise a data mixture recipe that strategically balances different types of long texts within LongWanjuan, leading to significant improvements in model performance on long-text tasks. The code and dataset are available at https://github.com/OpenLMLab/LongWanjuan.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.896620273590088, 2.3071706295013428], "openalex_id": "https://openalex.org/W4392118233", "title": "Analysing The Impact of Sequence Composition on Language Model Pre-Training", "authors": "Yu Zhao, Yuanbin Qu, Konrad Staniszewski, Szymon Tworkowski, Wei Liu, Piotr Mi\u0142o\u015b, Yuxiang Wu, Pasquale Minervini", "abstract": "Most language model pre-training frameworks concatenate multiple documents into fixed-length sequences and use causal masking to compute the likelihood of each token given its context; this strategy is widely adopted due to its simplicity and efficiency. However, to this day, the influence of the pre-training sequence composition strategy on the generalisation properties of the model remains under-explored. In this work, we find that applying causal masking can lead to the inclusion of distracting information from previous documents during pre-training, which negatively impacts the performance of the models on language modelling and downstream tasks. In intra-document causal masking, the likelihood of each token is only conditioned on the previous tokens in the same document, eliminating potential distracting information from previous documents and significantly improving performance. Furthermore, we find that concatenating related documents can reduce some potential distractions during pre-training, and our proposed efficient retrieval-based sequence construction method, BM25Chunk, can improve in-context learning (+11.6\\%), knowledge memorisation (+9.8\\%), and context utilisation (+7.2\\%) abilities of language models without sacrificing efficiency.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.629574298858643, 1.6990153789520264], "openalex_id": "https://openalex.org/W4392016050", "title": "UNLOCKING THE POWER OF LANGUAGE: IMPROVING MULTILINGUAL CAPABILITIES IN GENERATIVE AI FOR GLOBAL ACCESSIBILITY", "authors": "Sanae Ejjebli", "abstract": "In the expansive realm of cross-cultural and intercultural research, the power of language lies in its direct and indirect influence on scholarly investigations and outcomes. This paper investigates the intricate relationship between language, culture, and interpretation. It unveils the transformative power of language in modifying communication styles, influencing study outcomes, and shaping societal perspectives. The challenges posed by diverse languages are addressed while concurrently providing valuable guidance to research enthusiasts to enhance research methods, minimize linguistic prejudice, and foster empathy within society. Through insightful instances and case studies, it vividly portrays the notable impact linguistic variations have on the communication and results of cross-cultural research while signaling potential zones for upcoming multicultural exploration regarding lingual philosophy.", "venue": "Studies in Pragmatics and Discourse Analysis", "label": 0}, {"loc": [3.1233460903167725, -0.7279808521270752], "openalex_id": "https://openalex.org/W4392020057", "title": "Few shot clinical entity recognition in three languages: Masked language models outperform LLM prompting", "authors": "Marco Naguib, Xavier Tannier, Aur\u00e9lie N\u00e9v\u00e9ol", "abstract": "Large language models (LLMs) have become the preferred solution for many natural language processing tasks. In low-resource environments such as specialized domains, their few-shot capabilities are expected to deliver high performance. Named Entity Recognition (NER) is a critical task in information extraction that is not covered in recent LLM benchmarks. There is a need for better understanding the performance of LLMs for NER in a variety of settings including languages other than English. This study aims to evaluate generative LLMs, employed through prompt engineering, for few-shot clinical NER. %from the perspective of F1 performance and environmental impact. We compare 13 auto-regressive models using prompting and 16 masked models using fine-tuning on 14 NER datasets covering English, French and Spanish. While prompt-based auto-regressive models achieve competitive F1 for general NER, they are outperformed within the clinical domain by lighter biLSTM-CRF taggers based on masked models. Additionally, masked models exhibit lower environmental impact compared to auto-regressive models. Findings are consistent across the three languages studied, which suggests that LLM prompting is not yet suited for NER production in the clinical domain.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.327169179916382, -0.457270085811615], "openalex_id": "https://openalex.org/W4391993820", "title": "BIAS ASSESSMENT IN LARGE LANGUAGE MODELS", "authors": "Bashar Hasan, Samer Saadi, Noora S. Rajjoub, Moustafa Hegazi, Mohammad Al-Kordi, Farah Fleti, Magdoleen H. Farah, Irbaz Bin Riaz, Imon Banerjee, Zhen Wang, M. Hassan Murad", "abstract": "Large language models (LLMs) may facilitate and expedite systematic reviews, although the approach to integrate LLMs in the review process is unclear. This study evaluates GPT-4 agreement with human reviewers in assessing the risk of bias using the Risk Of Bias In Non-randomised Studies of Interventions (ROBINS-I) tool and proposes a framework for integrating LLMs into systematic reviews. The case study demonstrated that raw per cent agreement was the highest for the ROBINS-I domain of \u2018Classification of Intervention\u2019. Kendall agreement coefficient was highest for the domains of \u2018Participant Selection\u2019, \u2018Missing Data\u2019 and \u2018Measurement of Outcomes\u2019, suggesting moderate agreement in these domains. Raw agreement about the overall risk of bias across domains was 61% (Kendall coefficient=0.35). The proposed framework for integrating LLMs into systematic reviews consists of four domains: rationale for LLM use, protocol (task definition, model selection, prompt engineering, data entry methods, human role and success metrics), execution (iterative revisions to the protocol) and reporting. We identify five basic task types relevant to systematic reviews: selection, extraction, judgement, analysis and narration. Considering the agreement level with a human reviewer in the case study, pairing artificial intelligence with an independent human reviewer remains required.", "venue": "BMJ evidence-based medicine", "label": 0}, {"loc": [6.936784744262695, 2.5425047874450684], "openalex_id": "https://openalex.org/W4392019772", "title": "RoCode: A Dataset for Measuring Code Intelligence from Problem Definitions in Romanian", "authors": "Adrian Cosma, Bogdan Iordache, Paolo Rosso", "abstract": "Recently, large language models (LLMs) have become increasingly powerful and have become capable of solving a plethora of tasks through proper instructions in natural language. However, the vast majority of testing suites assume that the instructions are written in English, the de facto prompting language. Code intelligence and problem solving still remain a difficult task, even for the most advanced LLMs. Currently, there are no datasets to measure the generalization power for code-generation models in a language other than English. In this work, we present RoCode, a competitive programming dataset, consisting of 2,642 problems written in Romanian, 11k solutions in C, C++ and Python and comprehensive testing suites for each problem. The purpose of RoCode is to provide a benchmark for evaluating the code intelligence of language models trained on Romanian / multilingual text as well as a fine-tuning set for pretrained Romanian models. Through our results and review of related works, we argue for the need to develop code models for languages other than English.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.340109825134277, -0.4126667380332947], "openalex_id": "https://openalex.org/W4391988199", "title": "Key ingredients for effective zero-shot cross-lingual knowledge transfer in generative tasks", "authors": "Nadezhda Chirkova, Vassilina Nikoulina", "abstract": "Zero-shot cross-lingual knowledge transfer enables a multilingual pretrained language model, finetuned on a task in one language, make predictions for this task in other languages. While being broadly studied for natural language understanding tasks, the described setting is understudied for generation. Previous works notice a frequent problem of generation in a wrong language and propose approaches to address it, usually using mT5 as a backbone model. In this work we compare various approaches proposed from the literature in unified settings, also including alternative backbone models, namely mBART and NLLB-200. We first underline the importance of tuning learning rate used for finetuning, which helps to substantially alleviate the problem of generation in the wrong language. Then, we show that with careful learning rate tuning, the simple full finetuning of the model acts as a very strong baseline and alternative approaches bring only marginal improvements. Finally, we find that mBART performs similarly to mT5 of the same size, and NLLB-200 can be competitive in some cases. Our final zero-shot models reach the performance of the approach based on data translation which is usually considered as an upper baseline for zero-shot cross-lingual transfer in generation.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.091136932373047, 2.6414825916290283], "openalex_id": "https://openalex.org/W4391988159", "title": "Copyleft for Alleviating AIGC Copyright Dilemma: What-if Analysis, Public Perception and Implications", "authors": "Xinwei Guo, Yujun Li, Yafeng Peng, Xuetao Wei", "abstract": "As AIGC has impacted our society profoundly in the past years, ethical issues have received tremendous attention. The most urgent one is the AIGC copyright dilemma, which can immensely stifle the development of AIGC and greatly cost the entire society. Given the complexity of AIGC copyright governance and the fact that no perfect solution currently exists, previous work advocated copyleft on AI governance but without substantive analysis. In this paper, we take a step further to explore the feasibility of copyleft to alleviate the AIGC copyright dilemma. We conduct a mixed-methods study from two aspects: qualitatively, we use a formal what-if analysis to clarify the dilemma and provide case studies to show the feasibility of copyleft; quantitatively, we perform a carefully designed survey to find out how the public feels about copylefting AIGC. The key findings include: a) people generally perceive the dilemma, b) they prefer to use authorized AIGC under loose restriction, and c) they are positive to copyleft in AIGC and willing to use it in the future.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.7937631607055664, 3.8576769828796387], "openalex_id": "https://openalex.org/W4391987598", "title": "SPML: A DSL for Defending Language Models Against Prompt Attacks", "authors": "Reshabh K Sharma, Vinayak Gupta, Dan Grossman", "abstract": "Large language models (LLMs) have profoundly transformed natural language applications, with a growing reliance on instruction-based definitions for designing chatbots. However, post-deployment the chatbot definitions are fixed and are vulnerable to attacks by malicious users, emphasizing the need to prevent unethical applications and financial losses. Existing studies explore user prompts' impact on LLM-based chatbots, yet practical methods to contain attacks on application-specific chatbots remain unexplored. This paper presents System Prompt Meta Language (SPML), a domain-specific language for refining prompts and monitoring the inputs to the LLM-based chatbots. SPML actively checks attack prompts, ensuring user inputs align with chatbot definitions to prevent malicious execution on the LLM backbone, optimizing costs. It also streamlines chatbot definition crafting with programming language capabilities, overcoming natural language design challenges. Additionally, we introduce a groundbreaking benchmark with 1.8k system prompts and 20k user inputs, offering the inaugural language and benchmark for chatbot definition evaluation. Experiments across datasets demonstrate SPML's proficiency in understanding attacker prompts, surpassing models like GPT-4, GPT-3.5, and LLAMA. Our data and codes are publicly available at: https://prompt-compiler.github.io/SPML/.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.546391487121582, 1.05055570602417], "openalex_id": "https://openalex.org/W4391959090", "title": "Applying Named Entity Recognition and Graph Networks to Extract Common Interests from Thematic Subfora on Reddit", "authors": "Jan Sawicki, Maria Ganzha, Marcin Paprzycki, Yutaka Watanobe", "abstract": "Reddit is the largest topically structured social network. Existing literature, reporting results of Reddit-related research, considers different phenomena, from social and political studies to recommender systems. The most common techniques used in these works, include natural language processing, e.g., named entity recognition, as well as graph networks representing online social networks. However, large-scale studies that take into account Reddit\u2019s unique structure are scarce. In this contribution, similarity between subreddits is explored. Specifically, subreddit posts (from 3189 subreddits, spanning the year 2022) are processed using NER to build graph networks which are further mined for relations between subreddits. The evaluation of obtained results follows the state-of-the-art approaches used for a similar problem, i.e., recommender system metrics, and applies recall and AUC. Overall, the use of Reddit crossposts discloses previously unknown relations between subreddits. Interestingly, the proposed approach may allow for researchers to better connect their study topics with particular subreddits and shows promise for subreddit similarity mining.", "venue": "Applied Sciences", "label": 8}, {"loc": [8.894548416137695, 0.28374090790748596], "openalex_id": "https://openalex.org/W4392011482", "title": "GenDec: A robust generative Question-decomposition method for Multi-hop reasoning", "authors": "Jian Wu, Linyi Yang, Yuliang Ji, Wenhao Huang, B\u00f6rje F. Karlsson, Manabu Okumura", "abstract": "Multi-hop QA (MHQA) involves step-by-step reasoning to answer complex questions and find multiple relevant supporting facts. However, Existing large language models'(LLMs) reasoning ability in multi-hop question answering remains exploration, which is inadequate in answering multi-hop questions. Moreover, it is unclear whether LLMs follow a desired reasoning chain to reach the right final answer. In this paper, we propose a \\textbf{gen}erative question \\textbf{dec}omposition method (GenDec) from the perspective of explainable QA by generating independent and complete sub-questions based on incorporating additional extracted evidence for enhancing LLMs' reasoning ability in RAG. To demonstrate the impact, generalization, and robustness of Gendec, we conduct two experiments, the first is combining GenDec with small QA systems on paragraph retrieval and QA tasks. We secondly examine the reasoning capabilities of various state-of-the-art LLMs including GPT-4 and GPT-3.5 combined with GenDec. We experiment on the HotpotQA, 2WikihopMultiHopQA, MuSiQue, and PokeMQA datasets.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.232547283172607, 0.6911461353302002], "openalex_id": "https://openalex.org/W4391940723", "title": "Ethical Issues in Large Language Models: A Systematic Literature Review", "authors": "Jing Su, Chufeng Jiang, Xin Jin, Yuxin Qiao, Tingsong Xiao, Hongda Ma, Rong Wei, Zhi Jing, Jiajun Xu, Junhong Lin", "abstract": "This systematic literature review comprehensively examines the application of Large Language Models (LLMs) in forecasting and anomaly detection, highlighting the current state of research, inherent challenges, and prospective future directions. LLMs have demonstrated significant potential in parsing and analyzing extensive datasets to identify patterns, predict future events, and detect anomalous behavior across various domains. However, this review identifies several critical challenges that impede their broader adoption and effectiveness, including the reliance on vast historical datasets, issues with generalizability across different contexts, the phenomenon of model hallucinations, limitations within the models' knowledge boundaries, and the substantial computational resources required. Through detailed analysis, this review discusses potential solutions and strategies to overcome these obstacles, such as integrating multimodal data, advancements in learning methodologies, and emphasizing model explainability and computational efficiency. Moreover, this review outlines critical trends that are likely to shape the evolution of LLMs in these fields, including the push toward real-time processing, the importance of sustainable modeling practices, and the value of interdisciplinary collaboration. Conclusively, this review underscores the transformative impact LLMs could have on forecasting and anomaly detection while emphasizing the need for continuous innovation, ethical considerations, and practical solutions to realize their full potential.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.72108268737793, 1.4524791240692139], "openalex_id": "https://openalex.org/W4391940723", "title": "Large Language Models for Forecasting and Anomaly Detection: A Systematic Literature Review", "authors": "Jing Su, Chufeng Jiang, Xin Jin, Yuxin Qiao, Tingsong Xiao, Hongda Ma, Rong Wei, Zhi Jing, Jiajun Xu, Junhong Lin", "abstract": "This systematic literature review comprehensively examines the application of Large Language Models (LLMs) in forecasting and anomaly detection, highlighting the current state of research, inherent challenges, and prospective future directions. LLMs have demonstrated significant potential in parsing and analyzing extensive datasets to identify patterns, predict future events, and detect anomalous behavior across various domains. However, this review identifies several critical challenges that impede their broader adoption and effectiveness, including the reliance on vast historical datasets, issues with generalizability across different contexts, the phenomenon of model hallucinations, limitations within the models' knowledge boundaries, and the substantial computational resources required. Through detailed analysis, this review discusses potential solutions and strategies to overcome these obstacles, such as integrating multimodal data, advancements in learning methodologies, and emphasizing model explainability and computational efficiency. Moreover, this review outlines critical trends that are likely to shape the evolution of LLMs in these fields, including the push toward real-time processing, the importance of sustainable modeling practices, and the value of interdisciplinary collaboration. Conclusively, this review underscores the transformative impact LLMs could have on forecasting and anomaly detection while emphasizing the need for continuous innovation, ethical considerations, and practical solutions to realize their full potential.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.538560390472412, 0.6602154970169067], "openalex_id": "https://openalex.org/W4391927278", "title": "A Trustworthy Automated Short-Answer Scoring System Using a New Dataset and Hybrid Transfer Learning Method", "authors": "Martinus Maslim, Hei\u2010Chia Wang, Cendra Devayana Putra, Yulius Denny Prabowo", "abstract": "To measure the quality of student learning, teachers must conduct evaluations. One of the most efficient modes of evaluation is the short answer question. However, there can be inconsistencies in teacher-performed manual evaluations due to an excessive number of students, time demands, fatigue, etc. Consequently, teachers require a trustworthy system capable of autonomously and accurately evaluating student answers. Using hybrid transfer learning and student answer dataset, we aim to create a reliable automated short answer scoring system called Hybrid Transfer Learning for Automated Short Answer Scoring (HTL-ASAS). HTL-ASAS combines multiple tokenizers from a pretrained model with the bidirectional encoder representations from transformers. Based on our evaluation of the training model, we determined that HTL-ASAS has a higher evaluation accuracy than models used in previous studies. The accuracy of HTL-ASAS for datasets containing responses to questions pertaining to introductory information technology courses reaches 99.6%. With an accuracy close to one hundred percent, the developed model can undoubtedly serve as the foundation for a trustworthy ASAS system.", "venue": "International Journal of Interactive Multimedia and Artificial Intelligence", "label": 47}, {"loc": [2.746244430541992, 2.398177146911621], "openalex_id": "https://openalex.org/W4391924277", "title": "Responsible AI in Practice", "authors": "Malak Sadek, Emma Kallina, Thomas Bohn\u00e9, C\u00e9line Mougenot, Rafael A. Calvo, Stephen Cave", "abstract": "Abstract Responsible AI (RAI) guidelines aim to ensure that AI systems respect democratic values. While a step in the right direction, they currently fail to impact practice. Our work discusses reasons for this lack of impact and clusters them into five areas: (1) the abstract nature of RAI guidelines, (2) the problem of selecting and reconciling values, (3) the difficulty of operationalising RAI success metrics, (4) the fragmentation of the AI pipeline, and (5) the lack of internal advocacy and accountability. Afterwards, we introduce a number of approaches to RAI from a range of disciplines, exploring their potential as solutions to the identified challenges. We anchor these solutions in practice through concrete examples, bridging the gap between the theoretical considerations of RAI and on-the-ground processes that currently shape how AI systems are built. Our work considers the socio-technical nature of RAI limitations and the resulting necessity of producing socio-technical solutions.", "venue": "AI & Society", "label": 16}, {"loc": [8.905158996582031, 2.1893603801727295], "openalex_id": "https://openalex.org/W4394713293", "title": "Inference-Based No-Learning Approach on Pre-Trained BERT Model Retrieval", "authors": "Huu-Long Pham, Ryota Mibayashi, Takehiro Yamamoto, Makoto P. Kato, Yusuke Yamamoto, Yoshiyuki Shoji, Hiroaki Ohshima", "abstract": "In recent years, the practice of leveraging pre-trained machine learning models for specific tasks has gained traction. Instead of training models from the ground up, it is now common to fine-tune existing pre-trained models. However, users have the responsibility to select a pre-trained model that is suitable with their task-a challenge given the number of pre-trained models available. Conventionally, the suitability of a pre-trained model for a task is ascertained through fine-tuning, which is costly in term of time and computational resources. This research introduces an efficient retrieval method for BERT pre-trained models in document classification tasks. Our contributions are as follows: (i) We defined the problem of pre-trained model retrieval; (ii) We developed a benchmark dataset by fine-tuning and evaluating twenty distinct pre-trained BERT models across seventeen document classification tasks; (iii) We propose a method to retrieve suitable pre-trained BERT models without actual fine-tuning.", "venue": "https://doi.org/10.1109/bigcomp60711.2024.00044", "label": 0}, {"loc": [4.3720598220825195, 2.3865668773651123], "openalex_id": "https://openalex.org/W4392192990", "title": "Dissecting Whiteness: consistencies and differences in the stereotypes of lower-and upper-class White US Americans", "authors": "Thekla Morgenroth, Christopher T. Begeny, Teri A. Kirby, Benjamin Paa\u00dfen, Yanzhe Zeng", "abstract": "Economic inequality is increasing in the United States, making categorization and stereotyping based on social class more likely. Yet, social class stereotypes have received relatively little attention. Focusing on spontaneously generated stereotypes of different White lower-class and upper-class groups in the United States, we find consistencies and differences across groups. Lower-class groups were stereotyped as poor, uneducated, dirty, and lacking ability, while upper-class groups were stereotyped as rich, arrogant, and lacking sociability. Stereotypes for all groups were largely negative but there were notable variations in stereotype valence, sociability, morality, ability, and assertiveness as well as perceived attitudes toward the groups within each social class, highlighting the importance of moving beyond a monolithic view of \"the rich\" and \"the poor.\"", "venue": "Self and Identity", "label": 0}, {"loc": [7.505857467651367, -0.7227748036384583], "openalex_id": "https://openalex.org/W4391900323", "title": "Self-Augmented In-Context Learning for Unsupervised Word Translation", "authors": "Yaoyiran Li, Anna Korhonen, Ivan Vuli\u0107", "abstract": "Recent work has shown that, while large language models (LLMs) demonstrate strong word translation or bilingual lexicon induction (BLI) capabilities in few-shot setups, they still cannot match the performance of 'traditional' mapping-based approaches in the unsupervised scenario where no seed translation pairs are available, especially for lower-resource languages. To address this challenge with LLMs, we propose self-augmented in-context learning (SAIL) for unsupervised BLI: starting from a zero-shot prompt, SAIL iteratively induces a set of high-confidence word translation pairs for in-context learning (ICL) from an LLM, which it then reapplies to the same LLM in the ICL fashion. Our method shows substantial gains over zero-shot prompting of LLMs on two established BLI benchmarks spanning a wide range of language pairs, also outperforming mapping-based baselines across the board. In addition to achieving state-of-the-art unsupervised BLI performance, we also conduct comprehensive analyses on SAIL and discuss its limitations.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.740699768066406, 2.482933521270752], "openalex_id": "https://openalex.org/W4391912901", "title": "How to Train Data-Efficient LLMs", "authors": "Noveen Sachdeva, Benjamin Coleman, Wang-Cheng Kang, Jianmo Ni, Lichan Hong, Ed H., James Caverlee, Julian McAuley, Derek Zhiyuan Cheng", "abstract": "The training of large language models (LLMs) is expensive. In this paper, we study data-efficient approaches for pre-training LLMs, i.e., techniques that aim to optimize the Pareto frontier of model quality and training resource/data consumption. We seek to understand the tradeoffs associated with data selection routines based on (i) expensive-to-compute data-quality estimates, and (ii) maximization of coverage and diversity-based measures in the feature space. Our first technique, Ask-LLM, leverages the zero-shot reasoning capabilities of instruction-tuned LLMs to directly assess the quality of a training example. To target coverage, we propose Density sampling, which models the data distribution to select a diverse sample. In our comparison of 19 samplers, involving hundreds of evaluation tasks and pre-training runs, we find that Ask-LLM and Density are the best methods in their respective categories. Coverage sampling can recover the performance of the full data, while models trained on Ask-LLM data consistently outperform full-data training -- even when we reject 90% of the original dataset, while converging up to 70% faster.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.582559108734131, 0.7496610283851624], "openalex_id": "https://openalex.org/W4391899422", "title": "QuRating: Selecting High-Quality Data for Training Language Models", "authors": "Alexander Wettig, Aatmik Gupta, Saumya Malik, Danqi Chen", "abstract": "Selecting high-quality pre-training data is important for creating capable language models, but existing methods rely on simple heuristics. We introduce QuRating, a method for selecting pre-training data that can capture human intuitions about data quality. In this paper, we investigate four qualities - writing style, required expertise, facts & trivia, and educational value - and find that LLMs are able to discern these qualities, especially when making pairwise judgments of texts. We train a QuRater model to learn scalar ratings from pairwise judgments, and use it to annotate a 260B training corpus with quality ratings for each of the four criteria. In our experiments, we select 30B tokens according to the different quality ratings and train 1.3B-parameter language models on the selected data. We find that it is important to balance quality and diversity. When we sample using quality ratings as logits over documents, our models obtain lower perplexity and stronger in-context learning performance than baselines. Our best model is based on educational value and performs similarly to a model trained with uniform sampling for 50% more steps. Beyond data selection, we use the quality ratings to construct a training curriculum which improves performance without changing the training dataset. We extensively analyze the quality ratings and discuss their characteristics, biases, and wider implications.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.063848495483398, 4.756138801574707], "openalex_id": "https://openalex.org/W4391901078", "title": "Jack of All Trades, Master of Some, a Multi-Purpose Transformer Agent", "authors": "Quentin Gallou\u00e9dec, Edward Beeching, Cl\u00e9ment Romac, Emmanuel Dellandr\u00e9a", "abstract": "The search for a general model that can operate seamlessly across multiple domains remains a key goal in machine learning research. The prevailing methodology in Reinforcement Learning (RL) typically limits models to a single task within a unimodal framework, a limitation that contrasts with the broader vision of a versatile, multi-domain model. In this paper, we present Jack of All Trades (JAT), a transformer-based model with a unique design optimized for handling sequential decision-making tasks and multi-modal data types. The JAT model demonstrates its robust capabilities and versatility by achieving strong performance on very different RL benchmarks, along with promising results on Computer Vision (CV) and Natural Language Processing (NLP) tasks, all using a single set of weights. The JAT model marks a significant step towards more general, cross-domain AI model design, and notably, it is the first model of its kind to be fully open-sourced at https://huggingface.co/jat-project/jat, including a pioneering general-purpose dataset.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.7207441329956055, 3.9026966094970703], "openalex_id": "https://openalex.org/W4391912894", "title": "PAL: Proxy-Guided Black-Box Attack on Large Language Models", "authors": "Chawin Sitawarin, Norman Mu, David Wagner, Alexandre Araujo", "abstract": "Large Language Models (LLMs) have surged in popularity in recent months, but they have demonstrated concerning capabilities to generate harmful content when manipulated. While techniques like safety fine-tuning aim to minimize harmful use, recent works have shown that LLMs remain vulnerable to attacks that elicit toxic responses. In this work, we introduce the Proxy-Guided Attack on LLMs (PAL), the first optimization-based attack on LLMs in a black-box query-only setting. In particular, it relies on a surrogate model to guide the optimization and a sophisticated loss designed for real-world LLM APIs. Our attack achieves 84% attack success rate (ASR) on GPT-3.5-Turbo and 48% on Llama-2-7B, compared to 4% for the current state of the art. We also propose GCG++, an improvement to the GCG attack that reaches 94% ASR on white-box Llama-2-7B, and the Random-Search Attack on LLMs (RAL), a strong but simple baseline for query-based attacks. We believe the techniques proposed in this work will enable more comprehensive safety testing of LLMs and, in the long term, the development of better security guardrails. The code can be found at https://github.com/chawins/pal.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.277097702026367, 2.3608665466308594], "openalex_id": "https://openalex.org/W4391901300", "title": "Data Engineering for Scaling Language Models to 128K Context", "authors": "Yao Fu, Rameswar Panda, Xinyao Niu, Xiang Yue, Hannaneh Hajishirzi, Yoon Kim, Hao Peng", "abstract": "We study the continual pretraining recipe for scaling language models' context lengths to 128K, with a focus on data engineering. We hypothesize that long context modeling, in particular \\textit{the ability to utilize information at arbitrary input locations}, is a capability that is mostly already acquired through large-scale pretraining, and that this capability can be readily extended to contexts substantially longer than seen during training~(e.g., 4K to 128K) through lightweight continual pretraining on appropriate data mixture. We investigate the \\textit{quantity} and \\textit{quality} of the data for continual pretraining: (1) for quantity, we show that 500 million to 5 billion tokens are enough to enable the model to retrieve information anywhere within the 128K context; (2) for quality, our results equally emphasize \\textit{domain balance} and \\textit{length upsampling}. Concretely, we find that naively upsampling longer data on certain domains like books, a common practice of existing work, gives suboptimal performance, and that a balanced domain mixture is important. We demonstrate that continual pretraining of the full model on 1B-5B tokens of such data is an effective and affordable strategy for scaling the context length of language models to 128K. Our recipe outperforms strong open-source long-context models and closes the gap to frontier models like GPT-4 128K.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.907762050628662, 0.8396528363227844], "openalex_id": "https://openalex.org/W4391885024", "title": "Natural Language Processing Advancements: Breaking Barriers in Human-Computer Interaction", "authors": "Jos\u00e9 Gabriel Carrasco Ram\u00edrez", "abstract": "Natural Language Processing (NLP) advancements have revolutionized human-computer interaction, breaking barriers and opening new frontiers in technology. NLP techniques enable machines to understand, interpret, and generate human language, facilitating seamless communication between humans and computers. This paper explores recent advancements in NLP technology, highlighting their impact on various domains and discussing challenges and future directions in the field", "venue": "Journal of Artificial Intelligence General science (JAIGS) ISSN 3006-4023", "label": 0}, {"loc": [4.161799430847168, 3.6224887371063232], "openalex_id": "https://openalex.org/W4391901086", "title": "DE-COP: Detecting Copyrighted Content in Language Models Training Data", "authors": "Andr\u00e9 V. Duarte, Xuandong Zhao, Arlindo L. Oliveira, Lei Li", "abstract": "How can we detect if copyrighted content was used in the training process of a language model, considering that the training data is typically undisclosed? We are motivated by the premise that a language model is likely to identify verbatim excerpts from its training text. We propose DE-COP, a method to determine whether a piece of copyrighted content was included in training. DE-COP's core approach is to probe an LLM with multiple-choice questions, whose options include both verbatim text and their paraphrases. We construct BookTection, a benchmark with excerpts from 165 books published prior and subsequent to a model's training cutoff, along with their paraphrases. Our experiments show that DE-COP surpasses the prior best method by 9.6% in detection performance (AUC) on models with logits available. Moreover, DE-COP also achieves an average accuracy of 72% for detecting suspect books on fully black-box models where prior methods give approximately 4% accuracy. The code and datasets are available at https://github.com/LeiLiLab/DE-COP.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.757230758666992, 2.5815916061401367], "openalex_id": "https://openalex.org/W4395030455", "title": "AI-Powered Code Review Assistant for Streamlining Pull Request Merging", "authors": "Chathurya Adapa, Sai Sindhuri Avulamanda, A. Anjana, A Carreno Victor", "abstract": "WatsonX, a comprehensive data and AI platform, adeptly addresses contemporary challenges by meticulously training, validating, tuning, and deploying data to drive impactful business outcomes. The intricate task of timely merging Pull Requests (PRs) poses a significant challenge for software development teams, directly influencing business operations. This paper introduces an innovative solution leveraging AI, particularly harnessing generative AI techniques with the Falcon40-B model through the platform. The AI bot facilitates an initial PR review, offering insightful feedback on code formatting, best practices, and minor issues and streamlines collaboration by automatically assigning and notifying PR reviewers. The overarching goal is the continuous evolution of this AI bot into an intelligent reviewer, capable of assessing code from a functional standpoint. The implementation of this solution holds the promise of significantly enhancing PR management and expediting the entire development workflow.", "venue": "https://doi.org/10.1109/icwite59797.2024.10503540", "label": 0}, {"loc": [3.7209086418151855, -4.004755973815918], "openalex_id": "https://openalex.org/W4391835485", "title": "A Dataset for the Detection of Dehumanizing Language", "authors": "P. Engelmann, Peter Brunsgaard Trolle, Christian Hardmeier", "abstract": "Dehumanization is a mental process that enables the exclusion and ill treatment of a group of people. In this paper, we present two data sets of dehumanizing text, a large, automatically collected corpus and a smaller, manually annotated data set. Both data sets include a combination of political discourse and dialogue from movie subtitles. Our methods give us a broad and varied amount of dehumanization data to work with, enabling further exploratory analysis and automatic classification of dehumanization patterns. Both data sets will be publicly released.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.627496719360352, 1.105583667755127], "openalex_id": "https://openalex.org/W4391897469", "title": "Massively Multi-Cultural Knowledge Acquisition & LM Benchmarking", "authors": "Yi Fung, Ruining Zhao, Jae Doo, Chenkai Sun, Heng Ji", "abstract": "Pretrained large language models have revolutionized many applications but still face challenges related to cultural bias and a lack of cultural commonsense knowledge crucial for guiding cross-culture communication and interactions. Recognizing the shortcomings of existing methods in capturing the diverse and rich cultures across the world, this paper introduces a novel approach for massively multicultural knowledge acquisition. Specifically, our method strategically navigates from densely informative Wikipedia documents on cultural topics to an extensive network of linked pages. Leveraging this valuable source of data collection, we construct the CultureAtlas dataset, which covers a wide range of sub-country level geographical regions and ethnolinguistic groups, with data cleaning and preprocessing to ensure textual assertion sentence self-containment, as well as fine-grained cultural profile information extraction. Our dataset not only facilitates the evaluation of language model performance in culturally diverse contexts but also serves as a foundational tool for the development of culturally sensitive and aware language models. Our work marks an important step towards deeper understanding and bridging the gaps of cultural disparities in AI, to promote a more inclusive and balanced representation of global cultures in the digital domain.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.646381378173828, 0.11635995656251907], "openalex_id": "https://openalex.org/W4391833712", "title": "Eliciting Big Five Personality Traits in Large Language Models: A Textual Analysis with Classifier-Driven Approach", "authors": "Airlie Hilliard, Cristian Mu\u00f1oz, Zekun Wu, Adriano Koshiyama", "abstract": "Large Language Models (LLMs) are increasingly being utilized by both candidates and employers in the recruitment context. However, with this comes numerous ethical concerns, particularly related to the lack of transparency in these \"black-box\" models. Although previous studies have sought to increase the transparency of these models by investigating the personality traits of LLMs, many of the previous studies have provided them with personality assessments to complete. On the other hand, this study seeks to obtain a better understanding of such models by examining their output variations based on different input prompts. Specifically, we use a novel elicitation approach using prompts derived from common interview questions, as well as prompts designed to elicit particular Big Five personality traits to examine whether the models were susceptible to trait-activation like humans are, to measure their personality based on the language used in their outputs. To do so, we repeatedly prompted multiple LMs with different parameter sizes, including Llama-2, Falcon, Mistral, Bloom, GPT, OPT, and XLNet (base and fine tuned versions) and examined their personality using classifiers trained on the myPersonality dataset. Our results reveal that, generally, all LLMs demonstrate high openness and low extraversion. However, whereas LMs with fewer parameters exhibit similar behaviour in personality traits, newer and LMs with more parameters exhibit a broader range of personality traits, with increased agreeableness, emotional stability, and openness. Furthermore, a greater number of parameters is positively associated with openness and conscientiousness. Moreover, fine-tuned models exhibit minor modulations in their personality traits, contingent on the dataset. Implications and directions for future research are discussed.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.494019985198975, 2.3893959522247314], "openalex_id": "https://openalex.org/W4391799227", "title": "AutoMathText: Autonomous Data Selection with Language Models for Mathematical Texts", "authors": "Yifan Zhang, Yifan Luo, Yuan Yang, Andrew Chi-Chih Yao", "abstract": "We present Autonomous Data Selection (AutoDS), a method that leverages base language models themselves as zero-shot \"generative classifiers\" to automatically curate high-quality mathematical texts. Unlike prior approaches that require human annotations or training a dedicated data filter, AutoDS relies solely on a model's logits to determine whether a given passage is mathematically informative and educational. By integrating AutoDS into a continual pretraining pipeline, we substantially boost downstream performance on challenging math benchmarks (MATH, GSM8K, and BBH) while using far fewer tokens than previous methods. Empirically, our approach achieves roughly a twofold improvement in pretraining token efficiency over strong baselines, underscoring the potential of self-directed data selection in enhancing mathematical reasoning. We release our curated AutoMathText dataset to facilitate future research in automated domain-specific data curation. The AutoMathText dataset is available at https://huggingface.co/datasets/math-ai/AutoMathText. The code is available at https://github.com/yifanzhang-pro/AutoMathText.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.547990798950195, 0.6877501606941223], "openalex_id": "https://openalex.org/W4391800563", "title": "History, Development, and Principles of Large Language Models-An Introductory Survey", "authors": "Zhibo Chu, Shiwen Ni, Zichong Wang, Xi Feng, Chengming Li, Xiping Hu, Ruifeng Xu, Min Yang, Wenbin Zhang", "abstract": "Language models serve as a cornerstone in natural language processing (NLP), utilizing mathematical methods to generalize language laws and knowledge for prediction and generation. Over extensive research spanning decades, language modeling has progressed from initial statistical language models (SLMs) to the contemporary landscape of large language models (LLMs). Notably, the swift evolution of LLMs has reached the ability to process, understand, and generate human-level text. Nevertheless, despite the significant advantages that LLMs offer in improving both work and personal lives, the limited understanding among general practitioners about the background and principles of these models hampers their full potential. Notably, most LLM reviews focus on specific aspects and utilize specialized language, posing a challenge for practitioners lacking relevant background knowledge. In light of this, this survey aims to present a comprehensible overview of LLMs to assist a broader audience. It strives to facilitate a comprehensive understanding by exploring the historical background of language models and tracing their evolution over time. The survey further investigates the factors influencing the development of LLMs, emphasizing key contributions. Additionally, it concentrates on elucidating the underlying principles of LLMs, equipping audiences with essential theoretical knowledge. The survey also highlights the limitations of existing work and points out promising future directions.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.505523681640625, 2.3685801029205322], "openalex_id": "https://openalex.org/W4391799227", "title": "Autonomous Data Selection with Language Models for Mathematical Texts", "authors": "Yifan Zhang, Yifan Luo, Yuan Yang, Andrew Chi-Chih Yao", "abstract": "We present Autonomous Data Selection (AutoDS), a method that leverages base language models themselves as zero-shot \"generative classifiers\" to automatically curate high-quality mathematical texts. Unlike prior approaches that require human annotations or training a dedicated data filter, AutoDS relies solely on a model's logits to determine whether a given passage is mathematically informative and educational. By integrating AutoDS into a continual pretraining pipeline, we substantially boost downstream performance on challenging math benchmarks (MATH, GSM8K, and BBH) while using far fewer tokens than previous methods. Empirically, our approach achieves roughly a twofold improvement in pretraining token efficiency over strong baselines, underscoring the potential of self-directed data selection in enhancing mathematical reasoning. We release our curated AutoMathText dataset to facilitate future research in automated domain-specific data curation. The AutoMathText dataset is available at https://huggingface.co/datasets/math-ai/AutoMathText. The code is available at https://github.com/yifanzhang-pro/AutoMathText.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.477175712585449, 1.881677269935608], "openalex_id": "https://openalex.org/W4391800934", "title": "Social Evolution of Published Text and The Emergence of Artificial Intelligence Through Large Language Models and The Problem of Toxicity and Bias", "authors": "Arifa Khan, P. Saravanan, S. K. Venkatesan", "abstract": "We provide a birds eye view of the rapid developments in AI and Deep Learning that has led to the path-breaking emergence of AI in Large Language Models. The aim of this study is to place all these developments in a pragmatic broader historical social perspective without any exaggerations while at the same time without any pessimism that created the AI winter in the 1970s to 1990s. We also at the same time point out toxicity, bias, memorization, sycophancy, logical inconsistencies, hallucinations that exist just as a warning to the overly optimistic. We note here that just as this emergence of AI seems to occur at a threshold point in the number of neural connections or weights, it has also been observed that human brain and especially the cortex region is nothing special or extraordinary but simply a case of scaled-up version of the primate brain and that even the human intelligence seems like an emergent phenomena of scale.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.0432448387145996, 5.334621906280518], "openalex_id": "https://openalex.org/W4391786141", "title": "A Permutaion Importance Based feature selection method and Deep Learning Model to Detect Phishing Websites", "authors": "Rania Zaimi, Mohamed Hafidi, Mahnane Lamia", "abstract": "Abstract Phishing attacks pose a significant and escalating threat to cybersecurity in recent times. This deceptive scam aims to trick naive users, luring them into visiting harmful websites and sharing sensitive information, including credentials, credit card numbers, and passwords. Consequently, cybercriminals exploit this data for their own gain. As the sophistication and maliciousness of phishing continue to evolve, researchers are earnestly developing multiple anti-phishing solutions in the literature. Among these solutions, those based on deep learning models have gained substantial attention in recent years. This study proposes an intelligent, deep-learning-based mechanism to detect phishing URLs. The proposed system is based on the permutation importance method (PIM) to select the most relevant URL features, and the Smote-Tomek link method to solve the problem of an unbalanced dataset. In addition, four DL models\u2014CNN, LSTM, and two hybrid models (CNN-LSTM and LSTM-CNN)\u2014are tested to identify the more suitable detection model for the phishing field. The experimental results demonstrate the successful functioning of the proposed phishing detection mechanism. It is observed that the proposed mechanism achieved an accuracy ranging from 93.36% to 96.43% without feature selection and data balance across two variants of datasets and different DL classifiers. It also achieved an accuracy ranging from 94.12% to 96.88% with feature selection and data balance. Finally, our phishing detection mechanism is implemented as web application to enhance its usability for web users.", "venue": "Research Square (Research Square)", "label": 25}, {"loc": [6.355294227600098, 2.6712517738342285], "openalex_id": "https://openalex.org/W4391766676", "title": "InternLM-Math: Open Math Large Language Models Toward Verifiable Reasoning", "authors": "Huaiyuan Ying, Shuo Zhang, Linyang Li, Zhejian Zhou, Yunfan Shao, Zhaoye Fei, Yichuan Ma, Jiawei Hong, Kuikun Liu, Ziyi Wang, Yudong Wang, Zijian Wu, Shuaibin Li, Fengzhe Zhou, Hongwei Liu, Songyang Zhang, Wenwei Zhang, Hang Yan, Xipeng Qiu, Jiayu Wang, Kai Chen, Dahua Lin", "abstract": "The math abilities of large language models can represent their abstract reasoning ability. In this paper, we introduce and open-source our math reasoning LLMs InternLM-Math which is continue pre-trained from InternLM2. We unify chain-of-thought reasoning, reward modeling, formal reasoning, data augmentation, and code interpreter in a unified seq2seq format and supervise our model to be a versatile math reasoner, verifier, prover, and augmenter. These abilities can be used to develop the next math LLMs or self-iteration. InternLM-Math obtains open-sourced state-of-the-art performance under the setting of in-context learning, supervised fine-tuning, and code-assisted reasoning in various informal and formal benchmarks including GSM8K, MATH, Hungary math exam, MathBench-ZH, and MiniF2F. Our pre-trained model achieves 30.3 on the MiniF2F test set without fine-tuning. We further explore how to use LEAN to solve math problems and study its performance under the setting of multi-task learning which shows the possibility of using LEAN as a unified platform for solving and proving in math. Our models, codes, and data are released at \\url{https://github.com/InternLM/InternLM-Math}.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.9345381259918213, -0.0631805881857872], "openalex_id": "https://openalex.org/W4391776598", "title": "Understanding New Machine Learning Architectures: Practical Generative Artificial Intelligence for Anesthesiologists", "authors": "Christopher W. Connor", "abstract": "Recent advances in neural networks have given rise to generative artificial intelligence, systems able to produce fluent responses to natural questions or attractive and even photorealistic images from text prompts. These systems were developed through new network architectures that permit massive computational resources to be applied efficiently to enormous data sets. First, this review examines autoencoder architecture and its derivatives the variational autoencoder and the U-Net in annotating and manipulating images and extracting salience. This architecture will be important for applications like automated x-ray interpretation or real-time highlighting of anatomy in ultrasound images. Second, this article examines the transformer architecture in the interpretation and generation of natural language, as it will be useful in producing automated summarization of medical records or performing initial patient screening. The author also applies the GPT-3.5 algorithm to example questions from the American Board of Anesthesiologists Basic Examination and find that, under surprisingly reasonable conditions, it correctly answers more than half the questions.", "venue": "Anesthesiology", "label": 0}, {"loc": [6.837117671966553, 1.4884698390960693], "openalex_id": "https://openalex.org/W4392187606", "title": "Limitations of Language Models in The Oil & Gas Upstream Operations", "authors": "Abdulrahman Alsultan, Fathilah Abdul Razak", "abstract": "Abstract Language models has powered a lot of applications and developments in the past years in different domains. One language model that became popular in the recent years is BERT which is used as a base for many different natural language processing tasks such as classification and question answering. We wanted to apply BERT on some upstream oil and gas related text and study its performance on this domain. We decided to choose classification as a task to experiment with it. We tried a mix of domain adaptation and transfer learning to understand the potential of language models in general and BERT in specific on our domain. Our results emphasized the need to create domain-specific datasets for the oil and gas world and subsequently build models using them.", "venue": "International Petroleum Technology Conference", "label": 0}, {"loc": [3.438832998275757, 2.253720283508301], "openalex_id": "https://openalex.org/W4391735551", "title": "Notes towards infrastructure governance for large language models", "authors": "Lara Dal Molin", "abstract": "This paper draws on information infrastructures (IIs) in science and technology studies (STS), as well as on feminist STS scholarship and contemporary critical accounts of digital technologies, to build an initial mapping of the infrastructural mechanisms and implications of large language models (LLMs). Through a comparison with discriminatory machine learning (ML) systems and a case study on gender bias, I present LLMs as contested artefacts with categorising and performative capabilities. This paper suggests that generative systems do not tangibly depart from traditional, discriminative counterparts in terms of their underlying probabilistic mechanisms, and therefore both technologies can be theorised as infrastructures of categorisation. However, LLMs additionally retain performative capabilities through their linguistic outputs. Here, I outline the intuition behind this phenomenon, which I refer to as \u201clanguage as infrastructure\u201d. While traditional, discriminative systems \u201cdisappear\u201d into larger IIs, the hype surrounding generative technologies presents an opportunity to scrutinise these artefacts, to alter their computational mechanisms and introduce governance measures]. I illustrate this thesis through Sharma\u2019s formulation of \u201cbroken machine\u201d, and suggest dataset curation and participatory design as governance mechanisms that can partly address downstream harms in LLMs (Barocas, et al., 2023).", "venue": "First Monday", "label": 0}, {"loc": [3.2952160835266113, 2.6749017238616943], "openalex_id": "https://openalex.org/W4391750320", "title": "AI-Replicas as Ethical Practice: Introducing an Alternative to Traditional Anonymization Techniques in Image-Based Research", "authors": "Tobias Kamelski, Francisco Olivos", "abstract": "This paper introduces the use of AI-replicas as an alternative to traditional anonymization methods in image-based qualitative research. It emphasizes the ethical and practical dilemmas posed by current anonymization methods, such as distortion or loss of emotional and contextual information in images, and proposes the use of AI-replicas to preserve the integrity and authenticity of visual data while ensuring participant anonymity. The paper outlines the technological foundations of generative artificial intelligence (AI) and the practical application of Stable Diffusion to generate AI-replicas for anonymization and fictionalization purposes. Furthermore, it discusses the potential biases present in generative AI to suggest ways to mitigate these biases through careful prompt engineering and participatory approaches. The introduced approach aims to enhance ethical practices in visual research by providing a method that ensures participant anonymity without compromising the data's qualitative richness and interpretative validity.", "venue": "https://doi.org/10.31219/osf.io/8frst", "label": 0}, {"loc": [5.80531120300293, 5.418641567230225], "openalex_id": "https://openalex.org/W4391710122", "title": "SPHINX-X: Scaling Data and Parameters for a Family of Multi-modal Large Language Models", "authors": "Peng Gao, Renrui Zhang, Chris Liu, Longtian Qiu, Siyuan Huang, Weifeng Lin, Shitian Zhao, Shijie Geng, Ziyi Lin, Jin Peng, Kaipeng Zhang, Wenqi Shao, Chao Xu, Conghui He, Junjun He, Hao Shao, Pan Lu, Hongsheng Li, Yu Qiao", "abstract": "We propose SPHINX-X, an extensive Multimodality Large Language Model (MLLM) series developed upon SPHINX. To improve the architecture and training efficiency, we modify the SPHINX framework by removing redundant visual encoders, bypassing fully-padded sub-images with skip tokens, and simplifying multi-stage training into a one-stage all-in-one paradigm. To fully unleash the potential of MLLMs, we assemble a comprehensive multi-domain and multimodal dataset covering publicly available resources in language, vision, and vision-language tasks. We further enrich this collection with our curated OCR intensive and Set-of-Mark datasets, extending the diversity and generality. By training over different base LLMs including TinyLlama1.1B, InternLM2-7B, LLaMA2-13B, and Mixtral8x7B, we obtain a spectrum of MLLMs that vary in parameter size and multilingual capabilities. Comprehensive benchmarking reveals a strong correlation between the multi-modal performance with the data and parameter scales. Code and models are released at https://github.com/Alpha-VLLM/LLaMA2-Accessory", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.256130218505859, 0.3288160264492035], "openalex_id": "https://openalex.org/W4391709827", "title": "Multilingual E5 Text Embeddings: A Technical Report", "authors": "Liang Wang, Nan Yang, Xiaolong Huang, Linjun Yang, Rangan Majumder, Furu Wei", "abstract": "This technical report presents the training methodology and evaluation results of the open-source multilingual E5 text embedding models, released in mid-2023. Three embedding models of different sizes (small / base / large) are provided, offering a balance between the inference efficiency and embedding quality. The training procedure adheres to the English E5 model recipe, involving contrastive pre-training on 1 billion multilingual text pairs, followed by fine-tuning on a combination of labeled datasets. Additionally, we introduce a new instruction-tuned embedding model, whose performance is on par with state-of-the-art, English-only models of similar sizes. Information regarding the model release can be found at https://github.com/microsoft/unilm/tree/master/e5.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.708384990692139, -0.36428865790367126], "openalex_id": "https://openalex.org/W4391689704", "title": "Dynamic Few-shot Learning for Computational Social Science", "authors": "Ranadheer Malla, Travis Coan, Vivek J. Srinivasan, Constantine Boussalis", "abstract": "Large Language Models (LLMs) have revolutionized Natural Language Processing (NLP) and have recently demonstrated remarkable potential in social science research through their capacity to efficiently perform a wide range of text-as-data tasks. However, current applied social science research which relies on these models tend to use closed-source models and zero-shot learning. This study contrasts the performance of state-of-the-art closed-source LLMs with their open-source counterparts across 10 multi-class classification tasks critical to social science research, such as emotion detection and political speech classification. In addition, we introduce dynamic few-shot learning, an enhancement to static few-shot learning, that leverages semantic similarity to dynamically choose contextually relevant examples from a pre-defined corpus for improved prompt engineering. Our research indicates that dynamic few-shot learning significantly boosts classification performance, especially for much smaller models. Our findings help expand the methodological toolbox for social scientists and suggest more transparent, ethical, and efficient ways of leveraging the power of LLMs in social science research.", "venue": "https://doi.org/10.31235/osf.io/nfhe8", "label": 0}, {"loc": [9.23725700378418, -0.8328105211257935], "openalex_id": "https://openalex.org/W4391631473", "title": "Evaluating the Factuality of Zero-shot Summarizers Across Varied Domains", "authors": "Sanjana Ramprasad, Kundan Krishna, Zachary C. Lipton, Byron Wallace", "abstract": "Recent work has shown that large language models (LLMs) are capable of generating summaries zero-shot (i.e., without explicit supervision) that, under human assessment, are often comparable or even preferred to manually composed reference summaries. However, this prior work has focussed almost exclusively on evaluating news article summarization. How do zero-shot summarizers perform in other (potentially more specialized) domains? In this work we evaluate zero-shot generated summaries across specialized domains including biomedical articles, and legal bills (in addition to standard news benchmarks for reference). We focus especially on the factuality of outputs. We acquire annotations from domain experts to identify inconsistencies in summaries and systematically categorize these errors. We analyze whether the prevalence of a given domain in the pretraining corpus affects extractiveness and faithfulness of generated summaries of articles in this domain. We release all collected annotations to facilitate additional research toward measuring and realizing factually accurate summarization, beyond news articles. The dataset can be downloaded from https://github.com/sanjanaramprasad/zero_shot_faceval_domains", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.073194742202759, 5.301092147827148], "openalex_id": "https://openalex.org/W4391877227", "title": "A Holistic Review on Detection of Malicious Browser Extensions and Links using Deep Learning", "authors": "Rama Abirami K, Tiago Zonta, Mithileysh Sathiyanarayanan", "abstract": "The growth of the Internet has aroused people's attention toward network security. A secure network environment is fundamental for the expeditious and impeccable development of the Internet. The majority of internet-based tasks can be completed with the help of a web browser. Although many web applications add browser extensions to improve their functionality, some of these extensions are malicious and can access sensitive data without the user's knowledge. Browser extensions with malicious intent present a growing security concern and have quickly become one of the most prevalent methods used to compromise Internet security. This is largely due to their widespread usage and the extensive privileges they possess. After being installed, these malicious extensions are executed and make an attempt to compromise the victim's browser. This makes them particularly elusive and challenging to combat. It is crucial to promptly develop an effective strategy to address the threats posed by these extensions. A comprehensive review of the research on browser extension vulnerabilities is presented in this paper. The role of malicious links in web browser extensions are examined for several attacks. Detection of malicious browser extension on various aspects are represented namely Intrusion malicious web browser extensions detection using Intrusion detection, Machine learning based detection methods and Deep learning based techniques to mitigate malicious web browser extensions are examined. This study investigates the critical function of malicious detection in protecting web browsers, looking at the changing threats and risk-reduction tactics. A robust cybersecurity frameworks can be created that not only respond to known threats but also anticipate and thwart the strategies of future cyber adversaries by realizing the significance of proactive detection. Thus this survey provides a detailed comparison of various solutions for malicious browser extension.", "venue": "https://doi.org/10.1109/icaic60265.2024.10433842", "label": 0}, {"loc": [6.726771831512451, 0.14507316052913666], "openalex_id": "https://openalex.org/W4391605477", "title": "Systematic Evaluation of Linear Transformations for Cross-lingual Semantic Spaces", "authors": "Adam Mi\u0161tera, Tom\u00e1\u0161 Brychc\u00edn, Jakub \u0160m\u00edd", "abstract": "Abstract In the field of natural language understanding (NLU), a fundamental element is the representation of word meanings, a process that is integral to a wide range of applications. These applications span numerous domains including machine translation, question answering systems, text summarization, information retrieval, and supporting the functionality of virtual assistants. The increasing demand for reasoning in environments that are multilingual and knowledge transfer in cross-lingual systems has led to the development of cross-lingual semantic spaces. These spaces provide a representation of words from different languages in a shared space. With the increased emphasis on cross-lingual representations, several methods have been developed. Related works usually differ in training strategies and evaluate only limited aspects of semantic spaces. This lack of meaningful comparison made us write this study as we think it is crucial for the following research. As a basis for our comparison, we project semantic spaces into a shared space using both linear transformations supervised by bilingual dictionaries and transformations with no supervision at all. To allow comparison from different points of view, our evaluation includes both intrinsic tasks, such as cross-lingual word similarities, cross-lingual word analogies, and word machine translation, and extrinsic tasks like sentiment analysis and topic classification. Additionally, we also explored hubness to investigate the internal relationships within the semantic space. Our experiments include six languages from three different language families: English, German, Italian, Spanish, Croatian and Czech. Finally, we show that different preprocessing steps can have a significant impact on the performance of cross-lingual semantic spaces.", "venue": "https://doi.org/10.21203/rs.3.rs-3913165/v1", "label": 0}, {"loc": [5.7672119140625, -1.162033200263977], "openalex_id": "https://openalex.org/W4391590915", "title": "Zero-shot Sentiment Analysis in Low-Resource Languages Using a Multilingual Sentiment Lexicon", "authors": "Fajri Koto, Tilman Beck, Zeerak Talat, Iryna Gurevych, Timothy Baldwin", "abstract": "Improving multilingual language models capabilities in low-resource languages is generally difficult due to the scarcity of large-scale data in those languages. In this paper, we relax the reliance on texts in low-resource languages by using multilingual lexicons in pretraining to enhance multilingual capabilities. Specifically, we focus on zero-shot sentiment analysis tasks across 34 languages, including 6 high/medium-resource languages, 25 low-resource languages, and 3 code-switching datasets. We demonstrate that pretraining using multilingual lexicons, without using any sentence-level sentiment data, achieves superior zero-shot performance compared to models fine-tuned on English sentiment datasets, and large language models like GPT--3.5, BLOOMZ, and XGLM. These findings are observable for unseen low-resource languages to code-mixed scenarios involving high-resource languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.3642096519470215, -1.498302936553955], "openalex_id": "https://openalex.org/W4391591653", "title": "ChatGPT vs Gemini vs LLaMA on Multilingual Sentiment Analysis", "authors": "Alessio Buscemi, Daniele Proverbio", "abstract": "Automated sentiment analysis using Large Language Model (LLM)-based models like ChatGPT, Gemini or LLaMA2 is becoming widespread, both in academic research and in industrial applications. However, assessment and validation of their performance in case of ambiguous or ironic text is still poor. In this study, we constructed nuanced and ambiguous scenarios, we translated them in 10 languages, and we predicted their associated sentiment using popular LLMs. The results are validated against post-hoc human responses. Ambiguous scenarios are often well-coped by ChatGPT and Gemini, but we recognise significant biases and inconsistent performance across models and evaluated human languages. This work provides a standardised methodology for automated sentiment analysis evaluation and makes a call for action to further improve the algorithms and their underlying data, to improve their performance, interpretability and applicability.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.567395210266113, 2.8870956897735596], "openalex_id": "https://openalex.org/W4391590951", "title": "Selecting Large Language Model to Fine-tune via Rectified Scaling Law", "authors": "Haowei Lin, Baizhou Huang, Haotian Ye, Qinyu Chen, Zihao Wang, Sujian Li, Jianzhu Ma, Xiaojun Wan, James Zou, Yitao Liang", "abstract": "The ever-growing ecosystem of LLMs has posed a challenge in selecting the most appropriate pre-trained model to fine-tune amidst a sea of options. Given constrained resources, fine-tuning all models and making selections afterward is unrealistic. In this work, we formulate this resource-constrained selection task into predicting fine-tuning performance and illustrate its natural connection with Scaling Law. Unlike pre-training, we find that the fine-tuning scaling curve includes not just the well-known \"power phase\" but also the previously unobserved \"pre-power phase\". We also explain why existing Scaling Law fails to capture this phase transition phenomenon both theoretically and empirically. To address this, we introduce the concept of \"pre-learned data size\" into our Rectified Scaling Law, which overcomes theoretical limitations and fits experimental results much better. By leveraging our law, we propose a novel LLM selection algorithm that selects the near-optimal model with hundreds of times less resource consumption, while other methods may provide negatively correlated selection. The project page is available at rectified-scaling-law.github.io.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.763032913208008, -3.9617326259613037], "openalex_id": "https://openalex.org/W4391590783", "title": "Probing Critical Learning Dynamics of PLMs for Hate Speech Detection", "authors": "Sarah Masud, Mohammad Aflah Khan, Vikram Goyal, Md Shad Akhtar, Tanmoy Chakraborty", "abstract": "Despite the widespread adoption, there is a lack of research into how various critical aspects of pretrained language models (PLMs) affect their performance in hate speech detection. Through five research questions, our findings and recommendations lay the groundwork for empirically investigating different aspects of PLMs' use in hate speech detection. We deep dive into comparing different pretrained models, evaluating their seed robustness, finetuning settings, and the impact of pretraining data collection time. Our analysis reveals early peaks for downstream tasks during pretraining, the limited benefit of employing a more recent pretraining corpus, and the significance of specific layers during finetuning. We further call into question the use of domain-specific models and highlight the need for dynamic datasets for benchmarking hate speech detection.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.6721415519714355, 3.933150291442871], "openalex_id": "https://openalex.org/W4391622923", "title": "OpenMoE: An Early Effort on Open Mixture-of-Experts Language Models", "authors": "Fuzhao Xue, Zian Zheng, Yao Fu, Jinjie Ni, Zangwei Zheng, Wangchunshu Zhou, Yang You", "abstract": "To help the open-source community have a better understanding of Mixture-of-Experts (MoE) based large language models (LLMs), we train and release OpenMoE, a series of fully open-sourced and reproducible decoder-only MoE LLMs, ranging from 650M to 34B parameters and trained on up to over 1T tokens. Our investigation confirms that MoE-based LLMs can offer a more favorable cost-effectiveness trade-off than dense LLMs, highlighting the potential effectiveness for future LLM development. One more important contribution of this study is an in-depth analysis of the routing mechanisms within our OpenMoE models, leading to three significant findings: Context-Independent Specialization, Early Routing Learning, and Drop-towards-the-End. We discovered that routing decisions in MoE models are predominantly based on token IDs, with minimal context relevance. The token-to-expert assignments are determined early in the pre-training phase and remain largely unchanged. This imperfect routing can result in performance degradation, particularly in sequential tasks like multi-turn conversations, where tokens appearing later in a sequence are more likely to be dropped. Finally, we rethink our design based on the above-mentioned observations and analysis. To facilitate future MoE LLM development, we propose potential strategies for mitigating the issues we found and further improving off-the-shelf MoE LLM designs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.370779037475586, 2.613677740097046], "openalex_id": "https://openalex.org/W4391631327", "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models", "authors": "Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Mingchuan Zhang, Y. K. Li, Yingjun Wu, Daya Guo", "abstract": "Mathematical reasoning poses a significant challenge for language models due to its complex and structured nature. In this paper, we introduce DeepSeekMath 7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B math-related tokens sourced from Common Crawl, together with natural language and code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the competition-level MATH benchmark without relying on external toolkits and voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. Self-consistency over 64 samples from DeepSeekMath 7B achieves 60.9% on MATH. The mathematical reasoning capability of DeepSeekMath is attributed to two key factors: First, we harness the significant potential of publicly available web data through a meticulously engineered data selection pipeline. Second, we introduce Group Relative Policy Optimization (GRPO), a variant of Proximal Policy Optimization (PPO), that enhances mathematical reasoning abilities while concurrently optimizing the memory usage of PPO.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.869802474975586, -1.233930230140686], "openalex_id": "https://openalex.org/W4391623428", "title": "Sociolinguistically Informed Interpretability: A Case Study on Hinglish Emotion Classification", "authors": "Kushal Tatariya, Heather Lent, Johannes Bjerva, Miryam de Lhoneux, Michael G. Hahn, Alexey Sorokin, Ritesh Kumar, Andreas Shcherbakov, Yulia Otmakhova, Jinrui Yang, Oleg Serikov, Priya Rani, Edoardo Maria Ponti, Saliha Murado\u011flu, Rena Gao, Ryan Cotterell, Ekaterina Vylomova", "abstract": "Emotion classification is a challenging task in NLP due to the inherent idiosyncratic and subjective nature of linguistic expression, especially with code-mixed data. Pre-trained language models (PLMs) have achieved high performance for many tasks and languages, but it remains to be seen whether these models learn and are robust to the differences in emotional expression across languages. Sociolinguistic studies have shown that Hinglish speakers switch to Hindi when expressing negative emotions and to English when expressing positive emotions. To understand if language models can learn these associations, we study the effect of language on emotion prediction across 3 PLMs on a Hinglish emotion classification dataset. Using LIME and token level language ID, we find that models do learn these associations between language choice and emotional expression. Moreover, having code-mixed data present in the pre-training can augment that learning when task-specific data is scarce. We also conclude from the misclassifications that the models may overgeneralise this heuristic to other infrequent examples where this sociolinguistic phenomenon does not apply.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.090186595916748, 0.5959444642066956], "openalex_id": "https://openalex.org/W4391559705", "title": "Finite State Automata on Multi-Word Units for Efficient Text-Mining", "authors": "A. Postiglione", "abstract": "Text mining is crucial for analyzing unstructured and semi-structured textual documents. This paper introduces a fast and precise text mining method based on a finite automaton to extract knowledge domains. Unlike simple words, multi-word units (such as credit card) are emphasized for their efficiency in identifying specific semantic areas due to their predominantly monosemic nature, their limited number and their distinctiveness. The method focuses on identifying multi-word units within terminological ontologies, where each multi-word unit is associated with a sub-domain of ontology knowledge. The algorithm, designed to handle the challenges posed by very long multi-word units composed of a variable number of simple words, integrates user-selected ontologies into a single finite automaton during a fast pre-processing step. At runtime, the automaton reads input text character by character, efficiently locating multi-word units even if they overlap. This approach is efficient for both short and long documents, requiring no prior training. Ontologies can be updated without additional computational costs. An early system prototype, tested on 100 short and medium-length documents, recognized the knowledge domains for the vast majority of texts (over 90%) analyzed. The authors suggest that this method could be a valuable semantic-based knowledge domain extraction technique in unstructured documents.", "venue": "Mathematics", "label": 46}, {"loc": [8.64712905883789, 0.7965434789657593], "openalex_id": "https://openalex.org/W4393078873", "title": "A Framework to Construct Financial Causality Knowledge Graph from Text", "authors": "Ziwei Xu, Hiroya Takamura, Ryutaro Ichise", "abstract": "Causality analysis holds a prominent role in finance, and the presentation of causality could offer valuable insights for risk mitigation, investment decisions, and portfolio optimization. Recent research has extensively investigated the identification of causality from text, yet there is still a significant deficiency in providing a comprehensive causality presentation from those textual discoveries. In this paper, we present an end-to-end framework to automatically construct Financial Causality Knowledge Graph (FinCaKG) from text, which allows us to visualize the captured causality from a holistic perspective. This framework involves three distinct tasks, including causality sentence detection, cause-effect span identification, and causal dependency representation. To examine the adaptability of FinCaKG framework, we generate, compare, and analyze the distinct FinCaKGs created from different corpus. The results show that this framework has the capability to not only capture the confidential causality but also represent them in a highly detailed manner in the resulting knowledge graph - FinCaKG. We perform a comparative study with ConceptNet, revealing the notable contributions of FinCaKGs in terms of domain coverage and the density of its causal knowledge.", "venue": "https://doi.org/10.1109/icsc59802.2024.00015", "label": 0}, {"loc": [5.337201118469238, 3.437795639038086], "openalex_id": "https://openalex.org/W4391556074", "title": "Mission Critical--Satellite Data is a Distinct Modality in Machine Learning", "authors": "Esther Rolf, Konstantin Klemmer, Caleb Robinson, Hannah Kerner", "abstract": "Satellite data has the potential to inspire a seismic shift for machine learning -- one in which we rethink existing practices designed for traditional data modalities. As machine learning for satellite data (SatML) gains traction for its real-world impact, our field is at a crossroads. We can either continue applying ill-suited approaches, or we can initiate a new research agenda that centers around the unique characteristics and challenges of satellite data. This position paper argues that satellite data constitutes a distinct modality for machine learning research and that we must recognize it as such to advance the quality and impact of SatML research across theory, methods, and deployment. We outline critical discussion questions and actionable suggestions to transform SatML from merely an intriguing application area to a dedicated research discipline that helps move the needle on big challenges for machine learning and society.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.408623695373535, -0.26226577162742615], "openalex_id": "https://openalex.org/W4391506269", "title": "CroissantLLM: A Truly Bilingual French-English Language Model", "authors": "Manuel Faysse, Patrick Fernandes, Nuno Guerreiro, Ant\u00f3nio Loison, Duarte Alves, Caio Corro, Nicolas Boizard, Janaina Alves, Ricardo Rei, Pedro Rapha\u00ebl Martins, Antoni Bigata Casademunt, Fran\u00e7ois Yvon, Andr\u00e9 F. T. Martins, Gautier Viaud, C\u00e9line Hudelot, Pierre Colombo", "abstract": "We introduce CroissantLLM, a 1.3B language model pretrained on a set of 3T English and French tokens, to bring to the research and industrial community a high-performance, fully open-sourced bilingual model that runs swiftly on consumer-grade local hardware. To that end, we pioneer the approach of training an intrinsically bilingual model with a 1:1 English-to-French pretraining data ratio, a custom tokenizer, and bilingual finetuning datasets. We release the training dataset, notably containing a French split with manually curated, high-quality, and varied data sources. To assess performance outside of English, we craft a novel benchmark, FrenchBench, consisting of an array of classification and generation tasks, covering various orthogonal aspects of model performance in the French Language. Additionally, rooted in transparency and to foster further Large Language Model research, we release codebases, and dozens of checkpoints across various model sizes, training data distributions, and training steps, as well as fine-tuned Chat models, and strong translation models. We evaluate our model through the FMTI framework, and validate 81 % of the transparency criteria, far beyond the scores of even most open initiatives. This work enriches the NLP landscape, breaking away from previous English-centric work in order to strengthen our understanding of multilinguality in language models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.019173622131348, 2.2519466876983643], "openalex_id": "https://openalex.org/W4391506345", "title": "Evaluating Large Language Models for Generalization and Robustness via Data Compression", "authors": "Yucheng Li, Yunhao Guo, Frank Gu\u00e9rin, Chenghua Lin", "abstract": "Existing methods for evaluating large language models face challenges such as data contamination, sensitivity to prompts, and the high cost of benchmark creation. To address this, we propose a lossless data compression based evaluation approach that tests how models' predictive abilities generalize after their training cutoff. Specifically, we collect comprehensive test data spanning 83 months from 2017 to 2023 and split the data into training and testing periods according to models' training data cutoff. We measure: 1) the compression performance on the testing period as a measure of generalization on unseen data; and 2) the performance gap between the training and testing period as a measure of robustness. Our experiments test 14 representative large language models with various sizes on sources including Wikipedia, news articles, code, arXiv papers, and multi-modal data. We find that the compression rate of many models reduces significantly after their cutoff date, but models such as Mistral and Llama-2 demonstrate a good balance between performance and robustness. Results also suggest that models struggle to generalize on news and code data, but work especially well on arXiv papers. We also find the context size and tokenization implementation have a big impact of on the overall compression performance.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.524618625640869, 2.3742926120758057], "openalex_id": "https://openalex.org/W4391575688", "title": "The OECD-UNSD Multinational Enterprise Information Platform", "authors": "Graham Pilgrim, Shirly Ang", "abstract": "The OECD and the United Nations Statistics Division (UNSD) have developed jointly the new Multinational Enterprise Information Platform (MEIP). MEIP is built on past OECD and UN efforts to compile statistics on the scale and scope of the international activities of Multinational Enterprises (MNEs). The new platform uses publicly available data to gather information on the world\u2019s 500 largest MNEs in a timely manner, facilitating a comprehensive view of their physical and digital presence. It also includes a monitoring tool for large events such as Mergers and Acquisitions (M&A). The platform also provides a valuable benchmark for National Statistical Offices (NSOs) and researchers, allowing them to compare the national presence of an individual MNE to the global presence. Information on MNEs and their global network can also be visualised in a user-friendly dashboard.", "venue": "OECD statistics working papers", "label": 0}, {"loc": [3.0267863273620605, 1.2673896551132202], "openalex_id": "https://openalex.org/W4391715216", "title": "Is AI changing learning and assessment as we know it? Evidence from a ChatGPT experiment and a conceptual framework", "authors": "Oluwaseun Kolade, Adebowale Owoseni, Abiodun Egbetokun", "abstract": "ChatGPT, a state-of-the-art chatbot built upon Open AI's generative pre-trained transformer, has generated a major public interest and caused quite a stir in the higher education sector, where reactions have ranged from excitement to consternation. This paper therefore examines the potential impact of ChatGPT on learning and assessment, using the example of academic essays, being a major form of assessment with widespread applications of ChatGPT. This provides an opportunity to unpack broader insights on the challenge of generative AI's to the relevance, quality and credibility of higher education learning in a rapidly changing 21st century knowledge economy. We conducted a quasi-experiment in which we deployed ChatGPT to generate academic essays in response to a typical assessment brief, and then subjected the essays to plagiarism checks and independent grading. The results indicate that ChatGPT is able to generate highly original, and high quality, contents from distinct individual accounts in response to the same assessment brief. However, it is unable to generate multiple original contents from the same account, and it struggled with referencing. The discussion highlights the need for higher education providers to rethink their approach to assessment, in response to disruption precipitated by artificial intelligence. Thus, following the discussion of empirical data, we propose a new conceptual framework for AI-assisted assessment for lifelong learning, in which the parameters of assessment extend beyond knowledge (know what) testing, to competence (know how) assessment and performance (show how) evaluation.", "venue": "Heliyon", "label": 0}, {"loc": [6.374785900115967, 0.5852956771850586], "openalex_id": "https://openalex.org/W4392155770", "title": "Evaluation of word embedding models used for diachronic semantic change analysis", "authors": "Yulia Maslennikova, V. V. Bochkarev", "abstract": "Abstract In the last decade, the quantitative analysis of diachronic changes in language and lexical semantic changes have become the subject of active research. A significant role was played by the development of new effective techniques of word embedding. This direction has been effectively demonstrated in a number of studies. Some of them have focused on the analysis of the optimal type of word2vec models, hyperparameters for training, and evaluation techniques. In this research, we used Corpus of Historical American English (COHA). The paper demonstrates the results of multiple training runs and the comparison of word2vec models with different variations of hyperparameters used for lexical semantic change detection. In addition to traditional word similarities and analogical reasoning tests, we used testing on an extended set of synonyms. We have evaluated word2vec models on the set of more than 100,000 English synsets that were randomly selected from the WordNet database. We have shown that changing the word2vec model parameters (such as a dimension of word embedding, a size of context window, a type of model, a word discard rate etc.) can significantly impact on the resulting word embedding vector space and the detected lexical semantic changes. Additionally, the results strongly depended on properties of the corpus, such as word frequency distribution.", "venue": "Journal of Physics Conference Series", "label": 34}, {"loc": [8.994535446166992, 0.24971038103103638], "openalex_id": "https://openalex.org/W4392245172", "title": "Towards a small language model powered chain\u2010of\u2010reasoning for open\u2010domain question answering", "authors": "Jihyeon Roh, Minho Kim, Kyoungman Bae", "abstract": "Abstract We focus on open\u2010domain question\u2010answering tasks that involve a chain\u2010of\u2010reasoning, which are primarily implemented using large language models. With an emphasis on cost\u2010effectiveness, we designed EffiChainQA, an architecture centered on the use of small language models. We employed a retrieval\u2010based language model to address the limitations of large language models, such as the hallucination issue and the lack of updated knowledge. To enhance reasoning capabilities, we introduced a question decomposer that leverages a generative language model and serves as a key component in the chain\u2010of\u2010reasoning process. To generate training data for our question decomposer, we leveraged ChatGPT, which is known for its data augmentation ability. Comprehensive experiments were conducted using the HotpotQA dataset. Our method outperformed several established approaches, including the Chain\u2010of\u2010Thoughts approach, which is based on large language models. Moreover, our results are on par with those of state\u2010of\u2010the\u2010art Retrieve\u2010then\u2010Read methods that utilize large language models.", "venue": "ETRI Journal", "label": 0}, {"loc": [5.028258323669434, -1.5475748777389526], "openalex_id": "https://openalex.org/W4391360411", "title": "Enhancing Product Design through AI-Driven Sentiment Analysis of Amazon Reviews Using BERT", "authors": "Mahammad Khalid Shaik Vadla, Mahima Agumbe Suresh, Vimal Viswanathan", "abstract": "Understanding customer emotions and preferences is paramount for success in the dynamic product design landscape. This paper presents a study to develop a prediction pipeline to detect the aspect and perform sentiment analysis on review data. The pre-trained Bidirectional Encoder Representation from Transformers (BERT) model and the Text-to-Text Transfer Transformer (T5) are deployed to predict customer emotions. These models were trained on synthetically generated and manually labeled datasets to detect the specific features from review data, then sentiment analysis was performed to classify the data into positive, negative, and neutral reviews concerning their aspects. This research focused on eco-friendly products to analyze the customer emotions in this category. The BERT and T5 models were finely tuned for the aspect detection job and achieved 92% and 91% accuracy, respectively. The best-performing model will be selected, calculating the evaluation metrics precision, recall, F1-score, and computational efficiency. In these calculations, the BERT model outperforms T5 and is chosen as a classifier for the prediction pipeline to predict the aspect. By detecting aspects and sentiments of input data using the pre-trained BERT model, our study demonstrates its capability to comprehend and analyze customer reviews effectively. These findings can empower product designers and research developers with data-driven insights to shape exceptional products that resonate with customer expectations.", "venue": "Algorithms", "label": 37}, {"loc": [2.0545544624328613, 5.3260626792907715], "openalex_id": "https://openalex.org/W4391300581", "title": "Detection of phishing addresses and pages with a data set balancing approach by generative adversarial network (GAN) and convolutional neural network (CNN) \u2026", "authors": "Somayyeh Jafari, Nasrin Aghaee\u2010Maybodi", "abstract": "Summary Phishing attacks have a remarkable ability to steal user information by using simple techniques. Phishing attacks steal valuable information, such as user names and passwords. The loss caused by phishing attacks is significant, and every year, millions of dollars are lost by internet users and companies through phishing attacks. Deep learning methods such as CNN neural network are one approach to detecting phishing attacks: deep learning methods, optimization for image processing, and efficient techniques used to process URL strings. A convolutional neural network is a widely used deep learning method in image processing. The limitation of CNN learning to images is one of the main challenges of this neural network for detecting phishing attacks. Imbalance in the training dataset and lack of intelligent feature selection are other challenges for CNN training in detecting phishing attacks. This manuscript presents a new approach called CGAN\u2010IWSO\u2010ResNet50 to detect phishing attacks. In the first step, the improved version of the conditional GAN is used to balance the URL samples. In the second stage, Hand\u2010crafted and TF\u2010IDF methods implement the feature extraction phase. In the feature selection stage, the WOA algorithm is used to improve the WSO algorithm's performance in feature selection. The selected features are implemented on the dataset, and legal and phishing samples are coded as RGB images. In the last step, RGB images are used to teach ResNet50 architecture. Tests in the PhishTank dataset showed that the proposed method's accuracy, sensitivity, and precision index are 99.65%, 99.12%, and 99.46%, respectively. The CGAN\u2010IWSO\u2010ResNet50 method is more accurate in detecting phishing attacks than the VGG19, AlexNet, RCNN, DNN + LSTM, and DNN + BiLSTM learning methods.", "venue": "Concurrency and Computation Practice and Experience", "label": 33}, {"loc": [2.940251350402832, -0.3017241656780243], "openalex_id": "https://openalex.org/W4391246451", "title": "Evaluating and Mitigating Limitations of Large Language Models in Clinical Decision Making", "authors": "Paul Hager, Friederike Jungmann, Kunal Bhagat, Inga Hubrecht, Manuel Knauer, Jakob Vielhauer, Robbie Holland, Rickmer Braren, Marcus R. Makowski, Georgios Kaisis, Daniel Rueckert", "abstract": "Abstract Clinical decision making is one of the most impactful parts of a physician\u2019s responsibilities and stands to benefit greatly from AI solutions and large language models (LLMs) in particular. However, while LLMs have achieved excellent performance on medical licensing exams, these tests fail to assess many skills that are necessary for deployment in a realistic clinical decision making environment, including gathering information, adhering to established guidelines, and integrating into clinical workflows. To understand how useful LLMs are in real-world settings, we must evaluate them in the wild, i.e. on real-world data under realistic conditions. Here we have created a curated dataset based on the MIMIC-IV database spanning 2400 real patient cases and four common abdominal pathologies as well as a framework to simulate a realistic clinical setting. We show that current state-of-the-art LLMs do not accurately diagnose patients across all pathologies (performing significantly worse than physicians on average), follow neither diagnostic nor treatment guidelines, and cannot interpret laboratory results, thus posing a serious risk to the health of patients. Furthermore, we move beyond diagnostic accuracy and demonstrate that they cannot be easily integrated into existing workflows because they often fail to follow instructions and are sensitive to both the quantity and order of information. Overall, our analysis reveals that LLMs are currently not ready for clinical deployment while providing a dataset and framework to guide future studies.", "venue": "bioRxiv (Cold Spring Harbor Laboratory)", "label": 12}, {"loc": [2.9168241024017334, 1.27742338180542], "openalex_id": "https://openalex.org/W4390960299", "title": "Challenge design roadmap", "authors": "Hugo Jair Escalante Balderas, Isabelle Guyon, Addison Howard, Walter Reade, S\u00e9bastien Treguer", "abstract": "Challenges can be seen as a type of game that motivates participants to solve serious tasks. As a result, competition organizers must develop effective game rules. However, these rules have multiple objectives beyond making the game enjoyable for participants. These objectives may include solving real-world problems, advancing scientific or technical areas, making scientific discoveries, and educating the public. In many ways, creating a challenge is similar to launching a product. It requires the same level of excitement and rigorous testing, and the goal is to attract ''customers'' in the form of participants. The process begins with a solid plan, such as a competition proposal that will eventually be submitted to an international conference and subjected to peer review. Although peer review does not guarantee quality, it does force organizers to consider the impact of their challenge, identify potential oversights, and generally improve its quality. This chapter provides guidelines for creating a strong plan for a challenge. The material draws on the preparation guidelines from organizations such as Kaggle 1, ChaLearn 2 and Tailor 3, as well as the NeurIPS proposal template, which some of the authors contributed to.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.447126388549805, 2.4950144290924072], "openalex_id": "https://openalex.org/W4391252408", "title": "Recognizing Textual Inference in Mongolian Bar Exam Questions", "authors": "Garmaabazar Khaltarkhuu, Biligsaikhan Batjargal, Akira Maeda", "abstract": "This paper examines how to apply deep learning techniques to Mongolian bar exam questions. Several approaches that utilize eight different fine-tuned transformer models were demonstrated for recognizing textual inference in Mongolian bar exam questions. Among eight different models, the fine-tuned bert-base-multilingual-cased obtained the best accuracy of 0.7619. The fine-tuned bert-base-multilingual-cased was capable of recognizing \u201ccontradiction\u201d, with a recall of 0.7857 and an F1 score of 0.7674; it recognized \u201centailment\u201d with a precision of 0.7750, a recall of 0.7381, and an F1 score of 0.7561. Moreover, the fine-tuned bert-large-mongolian-uncased showed balanced performance in recognizing textual inference in Mongolian bar exam questions, thus achieving a precision of 0.7561, a recall of 0.7381, and an F1 score of 0.7470 for recognizing \u201ccontradiction\u201d.", "venue": "Applied Sciences", "label": 8}, {"loc": [6.378132343292236, -0.9527145624160767], "openalex_id": "https://openalex.org/W4391230058", "title": "Beyond Lexical Boundaries: LLM-Generated Text Detection for Romanian Digital Libraries", "authors": "Melania Ni\u021bu, Mihai Dasc\u0103lu", "abstract": "Machine-generated content reshapes the landscape of digital information; hence, ensuring the authenticity of texts within digital libraries has become a paramount concern. This work introduces a corpus of approximately 60 k Romanian documents, including human-written samples as well as generated texts using six distinct Large Language Models (LLMs) and three different generation methods. Our robust experimental dataset covers five domains, namely books, news, legal, medical, and scientific publications. The exploratory text analysis revealed differences between human-authored and artificially generated texts, exposing the intricacies of lexical diversity and textual complexity. Since Romanian is a less-resourced language requiring dedicated detectors on which out-of-the-box solutions do not work, this paper introduces two techniques for discerning machine-generated texts. The first method leverages a Transformer-based model to categorize texts as human or machine-generated, while the second method extracts and examines linguistic features, such as identifying the top textual complexity indices via Kruskal\u2013Wallis mean rank and computes burstiness, which are further fed into a machine-learning model leveraging an extreme gradient-boosting decision tree. The methods show competitive performance, with the first technique\u2019s results outperforming the second one in two out of five domains, reaching an F1 score of 0.96. Our study also includes a text similarity analysis between human-authored and artificially generated texts, coupled with a SHAP analysis to understand which linguistic features contribute more to the classifier\u2019s decision.", "venue": "Future Internet", "label": 30}, {"loc": [7.805034637451172, 3.3552896976470947], "openalex_id": "https://openalex.org/W4391244998", "title": "MambaByte: Token-free Selective State Space Model", "authors": "Junxiong Wang, Tushaar Gangavarapu, Jing Nathan Yan, Alexander M. Rush", "abstract": "Token-free language models learn directly from raw bytes and remove the inductive bias of subword tokenization. Operating on bytes, however, results in significantly longer sequences. In this setting, standard autoregressive Transformers scale poorly as the effective memory required grows with sequence length. The recent development of the Mamba state space model (SSM) offers an appealing alternative approach with a fixed-sized memory state and efficient decoding. We propose MambaByte, a token-free adaptation of the Mamba SSM trained autoregressively on byte sequences. In terms of modeling, we show MambaByte to be competitive with, and even to outperform, state-of-the-art subword Transformers on language modeling tasks while maintaining the benefits of token-free language models, such as robustness to noise. In terms of efficiency, we develop an adaptation of speculative decoding with tokenized drafting and byte-level verification. This results in a $2.6\\times$ inference speedup to the standard MambaByte implementation, showing similar decoding efficiency as the subword Mamba. These findings establish the viability of SSMs in enabling token-free language modeling.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.240391731262207, 0.9825217723846436], "openalex_id": "https://openalex.org/W4391263226", "title": "SRBerta\u2014A Transformer Language Model for Serbian Cyrillic Legal Texts", "authors": "Milo\u0161 Bogdanovi\u0107, Jelena Koci\u0107, Leonid Stoimenov", "abstract": "Language is a unique ability of human beings. Although relatively simple for humans, the ability to understand human language is a highly complex task for machines. For a machine to learn a particular language, it must understand not only the words and rules used in a particular language, but also the context of sentences and the meaning that words take on in a particular context. In the experimental development we present in this paper, the goal was the development of the language model SRBerta\u2014a language model designed to understand the formal language of Serbian legal documents. SRBerta is the first of its kind since it has been trained using Cyrillic legal texts contained within a dataset created specifically for this purpose. The main goal of SRBerta network development was to understand the formal language of Serbian legislation. The training process was carried out using minimal resources (single NVIDIA Quadro RTX 5000 GPU) and performed in two phases\u2014base model training and fine-tuning. We will present the structure of the model, the structure of the training datasets, the training process, and the evaluation results. Further, we will explain the accuracy metric used in our case and demonstrate that SRBerta achieves a high level of accuracy for the task of masked language modeling in Serbian Cyrillic legal texts. Finally, SRBerta model and training datasets are publicly available for scientific and commercial purposes.", "venue": "Information", "label": 17}, {"loc": [7.441061973571777, 1.8562681674957275], "openalex_id": "https://openalex.org/W4391244976", "title": "Large Malaysian Language Model Based on Mistral for Enhanced Local Language Understanding", "authors": "Husein Zolkepli, Aisyah Razak, Kamarul Adha, Ariff Nazhan", "abstract": "In this paper, we present significant advancements in the pretraining of Mistral 7B, a large-scale language model, using a dataset of 32.6 GB, equivalent to 1.1 billion tokens. We explore the impact of extending the context length, releasing models with context lengths of 4096 and 32768 tokens, and further refining performance with a specialized 16384 context length instruction-tuned model, we called it Malaysian Mistral. Our experiments demonstrate the efficacy of continue pretraining and the influence of extended context lengths on Mistral 7B's language understanding capabilities. Additionally, we release a model specifically tuned with a 16384 context length instruction, showcasing its potential for capturing nuanced language intricacies. Furthermore, our research contributes to the benchmarking of Malaysian Mistral against prominent language models, including ChatGPT3.5 and Claude 2. We present compelling results indicating Malaysian Mistral's superior performance on Tatabahasa (Malay grammar) test set, particularly when fine-tuned with instructions. All models released at https://huggingface.co/collections/mesolitica/malaysian-mistral-7b-6528f2ec825f4bba46c1700c", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.315580368041992, 0.701958954334259], "openalex_id": "https://openalex.org/W4391272456", "title": "It's About Time: Incorporating Temporality in Retrieval Augmented Language Models", "authors": "Anoushka Gade, Jorjeta G. Jetcheva", "abstract": "The web serves as a global repository of knowledge, used by billions of people to search for information. Ensuring that users receive the most relevant and up-to-date information, especially in the presence of multiple versions of web content from different time points remains a critical challenge for information retrieval. This challenge has recently been compounded by the increased use of question answering tools trained on Wikipedia or web content and powered by large language models (LLMs) which have been found to make up information (or hallucinate), and in addition have been shown to struggle with the temporal dimensions of information. Even Retriever Augmented Language Models (RALMs) which incorporate a document database to reduce LLM hallucination are unable to handle temporal queries correctly. This leads to instances where RALMs respond to queries such as \"Who won the Wimbledon Championship?\", by retrieving document passages related to Wimbledon but without the ability to differentiate between them based on how recent they are. In this paper, we propose and evaluate, TempRALM, a temporally-aware Retriever Augmented Language Model (RALM) with few-shot learning extensions, which takes into account both semantically and temporally relevant documents relative to a given query, rather than relying on semantic similarity alone. We show that our approach results in up to 74% improvement in performance over the baseline RALM model, without requiring model pre-training, recalculating or replacing the RALM document index, or adding other computationally intensive elements.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.0692451000213623, 5.310505390167236], "openalex_id": "https://openalex.org/W4391171631", "title": "Evaluating Machine Learning and Deep Learning Approaches for Phishing URL Detection: A Systematic Review and Future Directions", "authors": "Bridget C. Ujah-Ogbuagu, Akande Noah Oluwatobi, Emeka Ogbuju", "abstract": "Abstract Website Uniform Resource Locator (URL) spoofing remains one of the ways of perpetrating phishing attacks in the twenty-first century. Hackers continue to employ URL spoofing to deceive na\u00efve and unsuspecting consumers into releasing important personal details in malicious websites. Blacklists and rule-based filters that were once effective at reducing the risks and sophistication of phishing are no longer effective as there are over 1.5 million new phishing websites created monthly. Therefore, research aimed at unveiling new techniques for detecting phishing websites has sparked a lot of interest in both academics and business with machine and deep learning techniques being at the forefront. Among the deep learning techniques that have been employed, Convolutional Neural Network (CNN) remains one of the most widely used with high performance in feature learning. However, CNN has a problem of memorizing contextual relationships in URL text, which makes it challenging to efficiently detect sophisticated malicious URLs in real-time applications. On the contrary, Long Short-Term Memory (LSTM) deep learning model has been successfully employed in complex real-time problems because of its ability to store inputs for a long period of time. This study experiments with the use of hybrid CNN and LSTM deep learning models for spoofing website URL detection in order to exploit the combined strengths of the two approaches for a more sophisticated spoofing URL detection. Two publicly available datasets (UCL spoofing Website and PhishTank Datasets) were used to evaluate the performance of the proposed hybrid model against other models in the literature. The hybrid CNN-LSTM model achieved accuracies of 98.9% and 96.8%, respectively, when evaluated using the UCL and PhishTank datasets. On the other hand, the standalone CNN and LSTM achieved accuracies of 90.4% and 94.6% on the UCL dataset, while their accuracies on the PhishTank dataset were 89.3% and 92.6%, respectively. The results show that the hybrid CNN-LSTM algorithm largely outperformed the standalone CNN and LSTM models, which demonstrates a much better performance. Therefore, the hybrid deep learning technique is recommended for detecting spoofing website URL thereby reducing losses attributed to such attacks.", "venue": "Journal of Electrical Systems and Information Technology", "label": 0}, {"loc": [5.718085289001465, -0.29986196756362915], "openalex_id": "https://openalex.org/W4391212045", "title": "Cheap Learning: Maximising Performance of Language Models for Social Data Science Using Minimal Data", "authors": "Leonardo Castro-Gonz\u00e1lez, Yi-Ling Chung, Hannak Rose Kirk, John E. Francis, Angus R. Williams, Pica Johansson, Jonathan Bright", "abstract": "The field of machine learning has recently made significant progress in reducing the requirements for labelled training data when building new models. These `cheaper' learning techniques hold significant potential for the social sciences, where development of large labelled training datasets is often a significant practical impediment to the use of machine learning for analytical tasks. In this article we review three `cheap' techniques that have developed in recent years: weak supervision, transfer learning and prompt engineering. For the latter, we also review the particular case of zero-shot prompting of large language models. For each technique we provide a guide of how it works and demonstrate its application across six different realistic social science applications (two different tasks paired with three different dataset makeups). We show good performance for all techniques, and in particular we demonstrate how prompting of large language models can achieve high accuracy at very low cost. Our results are accompanied by a code repository to make it easy for others to duplicate our work and use it in their own research. Overall, our article is intended to stimulate further uptake of these techniques in the social sciences.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.439518928527832, -0.927704393863678], "openalex_id": "https://openalex.org/W4391212073", "title": "Fine-tuning Large Language Models for Multigenerator, Multidomain, and Multilingual Machine-Generated Text Detection", "authors": "Feng Xiong, Thanet Markchom, Ziwei Zheng, Subin Jung, Varun Ojha, Huizhi Liang", "abstract": "SemEval-2024 Task 8 introduces the challenge of identifying machine-generated texts from diverse Large Language Models (LLMs) in various languages and domains. The task comprises three subtasks: binary classification in monolingual and multilingual (Subtask A), multi-class classification (Subtask B), and mixed text detection (Subtask C). This paper focuses on Subtask A & B. Each subtask is supported by three datasets for training, development, and testing. To tackle this task, two methods: 1) using traditional machine learning (ML) with natural language preprocessing (NLP) for feature extraction, and 2) fine-tuning LLMs for text classification. The results show that transformer models, particularly LoRA-RoBERTa, exceed traditional ML methods in effectiveness, with majority voting being particularly effective in multilingual contexts for identifying machine-generated texts.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.570725440979004, 2.3642873764038086], "openalex_id": "https://openalex.org/W4391212785", "title": "DsDm: Model-Aware Dataset Selection with Datamodels", "authors": "Logan Engstrom, \u00c1. Feldmann, Aleksander M\u0105dry", "abstract": "When selecting data for training large-scale models, standard practice is to filter for examples that match human notions of data quality. Such filtering yields qualitatively clean datapoints that intuitively should improve model behavior. However, in practice the opposite can often happen: we find that selecting according to similarity with \"high quality\" data sources may not increase (and can even hurt) performance compared to randomly selecting data. To develop better methods for selecting data, we start by framing dataset selection as an optimization problem that we can directly solve for: given target tasks, a learning algorithm, and candidate data, select the subset that maximizes model performance. This framework thus avoids handpicked notions of data quality, and instead models explicitly how the learning process uses train datapoints to predict on the target tasks. Our resulting method greatly improves language model (LM) performance on both pre-specified tasks and previously unseen tasks. Specifically, choosing target tasks representative of standard LM problems and evaluating on diverse held-out benchmarks, our selected datasets provide a 2x compute multiplier over baseline methods.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.3719072341918945, -1.0419631004333496], "openalex_id": "https://openalex.org/W4391158973", "title": "Spotting LLMs With Binoculars: Zero-Shot Detection of Machine-Generated Text", "authors": "Abhimanyu Hans, Avi Schwarzschild, Valeriia Cherepanova, Hamid Kazemi, Aniruddha Saha, Micah Goldblum, Jonas Geiping, Tom Goldstein", "abstract": "Detecting text generated by modern large language models is thought to be hard, as both LLMs and humans can exhibit a wide range of complex behaviors. However, we find that a score based on contrasting two closely related language models is highly accurate at separating human-generated and machine-generated text. Based on this mechanism, we propose a novel LLM detector that only requires simple calculations using a pair of pre-trained LLMs. The method, called Binoculars, achieves state-of-the-art accuracy without any training data. It is capable of spotting machine text from a range of modern LLMs without any model-specific modifications. We comprehensively evaluate Binoculars on a number of text sources and in varied situations. Over a wide range of document types, Binoculars detects over 90% of generated samples from ChatGPT (and other LLMs) at a false positive rate of 0.01%, despite not being trained on any ChatGPT data.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.816009759902954, 0.19179104268550873], "openalex_id": "https://openalex.org/W4391136507", "title": "What are Large Language Models Made of?", "authors": "Yupeng Chang, Xu Wang, Jindong Wang, Yuan Wu, Linyi Yang, Kaijie Zhu, Hao Chen, Xiaoyuan Yi, Cunxiang Wang, Yidong Wang, Wei Ye, Yue Zhang, Yi Chang, Philip S. Yu, Qiang Yang, Xing Xie", "abstract": "Large language models (LLMs) are gaining increasing popularity in both academia and industry, owing to their unprecedented performance in various applications. As LLMs continue to play a vital role in both research and daily use, their evaluation becomes increasingly critical, not only at the task level, but also at the society level for better understanding of their potential risks. Over the past years, significant efforts have been made to examine LLMs from various perspectives. This paper presents a comprehensive review of these evaluation methods for LLMs, focusing on three key dimensions: what to evaluate, where to evaluate, and how to evaluate. Firstly, we provide an overview from the perspective of evaluation tasks, encompassing general natural language processing tasks, reasoning, medical usage, ethics, education, natural and social sciences, agent applications, and other areas. Secondly, we answer the \u2018where\u2019 and \u2018how\u2019 questions by diving into the evaluation methods and benchmarks, which serve as crucial components in assessing the performance of LLMs. Then, we summarize the success and failure cases of LLMs in different tasks. Finally, we shed light on several future challenges that lie ahead in LLMs evaluation. Our aim is to offer invaluable insights to researchers in the realm of LLMs evaluation, thereby aiding the development of more proficient LLMs. Our key point is that evaluation should be treated as an essential discipline to better assist the development of LLMs. We consistently maintain the related open-source materials at: https://github.com/MLGroupJLU/LLM-eval-survey", "venue": "ACM Transactions on Intelligent Systems and Technology", "label": 0}, {"loc": [7.353363990783691, -0.08428115397691727], "openalex_id": "https://openalex.org/W4391124060", "title": "Cross-lingual Editing in Multilingual Language Models", "authors": "Himanshu Beniwal, Kowsik Nandagopan D, Mayank Singh", "abstract": "The training of large language models (LLMs) necessitates substantial data and computational resources, and updating outdated LLMs entails significant efforts and resources. While numerous model editing techniques (METs) have emerged to efficiently update model outputs without retraining, their effectiveness in multilingual LLMs, where knowledge is stored in diverse languages, remains an underexplored research area. This research paper introduces the cross-lingual model editing (\\textbf{XME}) paradigm, wherein a fact is edited in one language, and the subsequent update propagation is observed across other languages. To investigate the XME paradigm, we conducted experiments using BLOOM, mBERT, and XLM-RoBERTa using the two writing scripts: \\textit{Latin} (English, French, and Spanish) and \\textit{Indic} (Hindi, Gujarati, and Bengali). The results reveal notable performance limitations of state-of-the-art METs under the XME setting, mainly when the languages involved belong to two distinct script families. These findings highlight the need for further research and development of XME techniques to address these challenges. For more comprehensive information, the dataset used in this research and the associated code are publicly available at the following URL\\url{https://github.com/lingo-iitgn/XME}.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [1.9852153062820435, 5.317543029785156], "openalex_id": "https://openalex.org/W4391098143", "title": "Experimental Evaluation of Possible Feature Combinations for the Detection of Fraudulent Online Shops", "authors": "Audron\u0117 Janavi\u010di\u016bt\u0117, Agnius Liutkevi\u010dius, Gedas Dabu\u017einskas, Nerijus Morkevi\u010dius", "abstract": "Online shopping has become a common and popular form of shopping, so online attackers try to extract money from customers by creating online shops whose purpose is to compel the buyer to disclose credit card details or to pay money for goods that are never delivered. Existing buyer protection methods are based on the analysis of the content of the online shop, customer reviews, the URL (Uniform Resource Locator) of the website, the search in blacklists or whitelists, or the combination of the above-mentioned methods. This study aims to find the minimal set of publicly and easily obtainable features to create high-precision classification solutions that require little computing and memory resources. We evaluate various combinations of 18 features that belong to three possible categories, namely URL-based, content-based, and third-party services-based. For this purpose, the custom dataset is created, and several machine learning models are applied for the detection of fraudulent online shops based on these combinations of features. The results of this study show that even only four of the most significant features allow one to achieve 0.9342 classification accuracy, while 0.9605 accuracy is reached with seven features, and the best accuracy of 0.9693 is achieved using thirteen and fifteen features.", "venue": "Applied Sciences", "label": 8}, {"loc": [4.342935562133789, -0.7746450304985046], "openalex_id": "https://openalex.org/W4391098737", "title": "Innovative Use of Self-Attention-Based Ensemble Deep Learning for Suicide Risk Detection in Social Media Posts", "authors": "Hoan-Suk Choi, Jinhong Yang", "abstract": "Suicidal ideation constitutes a critical concern in mental health, adversely affecting individuals and society at large. The early detection of such ideation is vital for providing timely support to individuals and mitigating its societal impact. With social media serving as a platform for self-expression, it offers a rich source of data that can reveal early symptoms of mental health issues. This paper introduces an innovative ensemble learning method named LSTM-Attention-BiTCN, which fuses LSTM and BiTCN models with a self-attention mechanism to detect signs of suicidality in social media posts. Our LSTM-Attention-BiTCN model demonstrated superior performance in comparison to baseline models in the realm of classification and suicidal ideation detection, boasting an accuracy of 0.9405, a precision of 0.9385, a recall of 0.9424, and an F1-score of 0.9405. Our proposed model can aid healthcare professionals in recognizing suicidal tendencies among social media users accurately, thereby contributing to efforts to reduce suicide rates.", "venue": "Applied Sciences", "label": 8}, {"loc": [8.962533950805664, 0.15376941859722137], "openalex_id": "https://openalex.org/W4391047467", "title": "ChatQA: Building GPT-4 Level Conversational QA Models", "authors": "Zihan Liu, Ping Wei, Rajarshi Roy, Peng Xu, Mohammad Shoeybi, Bryan Catanzaro", "abstract": "In this work, we introduce ChatQA, a suite of models that outperform GPT-4 on retrieval-augmented generation (RAG) and conversational question answering (QA). To enhance generation, we propose a two-stage instruction tuning method that significantly boosts the performance of RAG. For effective retrieval, we introduce a dense retriever optimized for conversational QA, which yields results comparable to the alternative state-of-the-art query rewriting models, while substantially reducing deployment costs. We also present the ChatRAG Bench, which encompasses ten datasets covering comprehensive evaluations on RAG, table-related QA, arithmetic calculations, and scenarios involving unanswerable questions. Our ChatQA-1.0-70B (score: 54.14), built on Llama2, a weaker foundation model than GPT-4, can slightly outperform GPT-4-0613 (score: 53.90) and GPT-4-Turbo-2024-04-09 (score: 54.03) on the ChatRAG Bench, without relying on any synthetic data from OpenAI GPT models. Notably, the Llama3-ChatQA-1.5-70B model surpasses the accuracy of GPT-4-Turbo-2024-04-09, achieving a 4.4% improvement. To advance research in this field, we open-sourced the model weights, instruction tuning data, ChatRAG Bench, and retriever for the community: https://chatqa-project.github.io/.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.313931941986084, 5.670968055725098], "openalex_id": "https://openalex.org/W4391047153", "title": "Advancing Large Multi-modal Models with Explicit Chain-of-Reasoning and Visual Question Generation", "authors": "K. Uehara, Nabarun Goswami, Hanqin Wang, Toshiaki Baba, Kohtaro Tanaka, Tomohiro Hashimoto, Kai Wang, Rei Ito, Takagi Naoya, Ryo Umagami, Yingyi Wen, Tanachai Anakewat, Tatsuya Harada", "abstract": "The increasing demand for intelligent systems capable of interpreting and reasoning about visual content requires the development of large Vision-and-Language Models (VLMs) that are not only accurate but also have explicit reasoning capabilities. This paper presents a novel approach to develop a VLM with the ability to conduct explicit reasoning based on visual content and textual instructions. We introduce a system that can ask a question to acquire necessary knowledge, thereby enhancing the robustness and explicability of the reasoning process. To this end, we developed a novel dataset generated by a Large Language Model (LLM), designed to promote chain-of-thought reasoning combined with a question-asking mechanism. The dataset covers a range of tasks, from common ones like caption generation to specialized VQA tasks that require expert knowledge. Furthermore, using the dataset we created, we fine-tuned an existing VLM. This training enabled the models to generate questions and perform iterative reasoning during inference. The results demonstrated a stride toward a more robust, accurate, and interpretable VLM, capable of reasoning explicitly and seeking information proactively when confronted with ambiguous visual input.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.260838985443115, 2.541191816329956], "openalex_id": "https://openalex.org/W4391013564", "title": "Machines Do See Color: A Guideline to Classify Different Forms of Racist Discourse in Large Corpora", "authors": "Diana D\u00e1vila Gordillo, Joan Timoneda, Sebasti\u00e1n Vallejo Vera", "abstract": "Current methods to identify and classify racist language in text rely on small-n qualitative approaches or large-n approaches focusing exclusively on overt forms of racist discourse. This article provides a step-by-step generalizable guideline to identify and classify different forms of racist discourse in large corpora. In our approach, we start by conceptualizing racism and its different manifestations. We then contextualize these racist manifestations to the time and place of interest, which allows researchers to identify their discursive form. Finally, we apply XLM-RoBERTa (XLM-R), a cross-lingual model for supervised text classification with a cutting-edge contextual understanding of text. We show that XLM-R and XLM-R-Racismo, our pretrained model, outperform other state-of-the-art approaches in classifying racism in large corpora. We illustrate our approach using a corpus of tweets relating to the Ecuadorian ind\u00edgena community between 2018 and 2021.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.018087863922119, 2.171226978302002], "openalex_id": "https://openalex.org/W4391013315", "title": "Algorithmic amplification of biases on Google Search", "authors": "Hussam Habib, Ryan Stoldt, Andrew C. High, Brian Ekdale, Ashley Peterson, Katy Biddle, Javie Ssozi, Rishab Nithyanand", "abstract": "The evolution of information-seeking processes, driven by search engines like Google, has transformed the access to information people have. This paper investigates how individuals' preexisting attitudes influence the modern information-seeking process, specifically the results presented by Google Search. Through a comprehensive study involving surveys and information-seeking tasks focusing on the topic of abortion, the paper provides four crucial insights: 1) Individuals with opposing attitudes on abortion receive different search results. 2) Individuals express their beliefs in their choice of vocabulary used in formulating the search queries, shaping the outcome of the search. 3) Additionally, the user's search history contributes to divergent results among those with opposing attitudes. 4) Google Search engine reinforces preexisting beliefs in search results. Overall, this study provides insights into the interplay between human biases and algorithmic processes, highlighting the potential for information polarization in modern information-seeking processes.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.169336318969727, 3.8106093406677246], "openalex_id": "https://openalex.org/W4391013346", "title": "Asynchronous Local-SGD Training for Language Modeling", "authors": "\u0411\u043e \u041b\u044e, Rachita Chhaparia, Arthur Douillard, Satyen Kale, Andrei A. Rusu, Jiajun Shen, Arthur Szlam, Marc\u2019Aurelio Ranzato", "abstract": "Local stochastic gradient descent (Local-SGD), also referred to as federated averaging, is an approach to distributed optimization where each device performs more than one SGD update per communication. This work presents an empirical study of {\\it asynchronous} Local-SGD for training language models; that is, each worker updates the global parameters as soon as it has finished its SGD steps. We conduct a comprehensive investigation by examining how worker hardware heterogeneity, model size, number of workers, and optimizer could impact the learning performance. We find that with naive implementations, asynchronous Local-SGD takes more iterations to converge than its synchronous counterpart despite updating the (global) model parameters more frequently. We identify momentum acceleration on the global parameters when worker gradients are stale as a key challenge. We propose a novel method that utilizes a delayed Nesterov momentum update and adjusts the workers' local training steps based on their computation speed. This approach, evaluated with models up to 150M parameters on the C4 dataset, matches the performance of synchronous Local-SGD in terms of perplexity per update step, and significantly surpasses it in terms of wall clock time.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.11244535446167, -2.4730658531188965], "openalex_id": "https://openalex.org/W4390990617", "title": "Multiscale cascaded domain-based approach for Arabic fake reviews detection in e-commerce platforms", "authors": "Nour Qandos, Ghadir Hamad, Maitha Alharbi, Shatha Alturki, Waad Alharbi, Arwa Albelaihi", "abstract": "Fake reviews in e-commerce can lead to customer deception and financial losses. Despite the importance of fake reviews detection, studies for Arabic language are scarce due to the lack of comprehensive datasets. This study addresses this gap by introducing a full-gold standard dataset, the Arabic Fake Reviews Detection (AFRD), across hotels, restaurants, and product domains. To identify the most effective model for each domain in the context of fake review detection, this research employed Bi-LSTM, Bi-GRU, CNN+Bi-LSTM, and CNN+Bi-GRU models. These models were then used in a cascading approach called Multiscale Cascaded domain-based (MCDB), which transfers knowledge from one domain to enhance results in other domains. Experimental results demonstrated that the MCDB approach improved the results of the models by 2.09% to 7.8% in terms of accuracy. The introduced dataset can be used to build effective models for Arabic e-commerce platforms, in addition to further Natural Language Processing applications. This study demonstrates that leveraging domain-specific datasets in a cascading manner can significantly improve performance, holding substantial implications for future research in problems with limited-size datasets.", "venue": "Journal of King Saud University - Computer and Information Sciences", "label": 32}, {"loc": [6.292744159698486, -1.1330602169036865], "openalex_id": "https://openalex.org/W4391009523", "title": "On the importance of Data Scale in Pretraining Arabic Language Models", "authors": "Abbas Ghaddar, Philippe Langlais, Mehdi Rezagholizadeh, Boxing Chen", "abstract": "Pretraining monolingual language models have been proven to be vital for performance in Arabic Natural Language Processing (NLP) tasks. In this paper, we conduct a comprehensive study on the role of data in Arabic Pretrained Language Models (PLMs). More precisely, we reassess the performance of a suite of state-of-the-art Arabic PLMs by retraining them on massive-scale, high-quality Arabic corpora. We have significantly improved the performance of the leading Arabic encoder-only BERT-base and encoder-decoder T5-base models on the ALUE and ORCA leaderboards, thereby reporting state-of-the-art results in their respective model categories. In addition, our analysis strongly suggests that pretraining data by far is the primary contributor to performance, surpassing other factors. Our models and source code are publicly available at https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/JABER-PyTorch.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.9315900802612305, 5.169901371002197], "openalex_id": "https://openalex.org/W4390961147", "title": "Exploring the Reasoning Abilities of Multimodal Large Language Models (MLLMs): A Comprehensive Survey on Emerging Trends in Multimodal Reasoning", "authors": "Yiqi Wang, Wentao Chen, Xiaotian Han, Xudong Lin, Haiteng Zhao, Yongfei Liu, Bohan Zhai, Jianbo Yuan, Quanzeng You, Hongxia Yang", "abstract": "Strong Artificial Intelligence (Strong AI) or Artificial General Intelligence (AGI) with abstract reasoning ability is the goal of next-generation AI. Recent advancements in Large Language Models (LLMs), along with the emerging field of Multimodal Large Language Models (MLLMs), have demonstrated impressive capabilities across a wide range of multimodal tasks and applications. Particularly, various MLLMs, each with distinct model architectures, training data, and training stages, have been evaluated across a broad range of MLLM benchmarks. These studies have, to varying degrees, revealed different aspects of the current capabilities of MLLMs. However, the reasoning abilities of MLLMs have not been systematically investigated. In this survey, we comprehensively review the existing evaluation protocols of multimodal reasoning, categorize and illustrate the frontiers of MLLMs, introduce recent trends in applications of MLLMs on reasoning-intensive tasks, and finally discuss current practices and future directions. We believe our survey establishes a solid base and sheds light on this important topic, multimodal reasoning.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.743331432342529, 4.9746479988098145], "openalex_id": "https://openalex.org/W4390962670", "title": "A Survey of Resource-efficient LLM and Multimodal Foundation Models", "authors": "Mengwei Xu, Wangsong Yin, Dongqi Cai, Rongjie Yi, Daliang Xu, Qipeng Wang, Bingyang Wu, Yihao Zhao, Chen Yang, Shihe Wang, Qiyang Zhang, Zhen-Yan Lu, Li Zhang, Shangguang Wang, Yuanchun Li, Yunxin Liu, Xin Jin, Xuanzhe Liu", "abstract": "Large foundation models, including large language models (LLMs), vision transformers (ViTs), diffusion, and LLM-based multimodal models, are revolutionizing the entire machine learning lifecycle, from training to deployment. However, the substantial advancements in versatility and performance these models offer come at a significant cost in terms of hardware resources. To support the growth of these large models in a scalable and environmentally sustainable way, there has been a considerable focus on developing resource-efficient strategies. This survey delves into the critical importance of such research, examining both algorithmic and systemic aspects. It offers a comprehensive analysis and valuable insights gleaned from existing literature, encompassing a broad array of topics from cutting-edge model architectures and training/serving algorithms to practical system designs and implementations. The goal of this survey is to provide an overarching understanding of how current approaches are tackling the resource challenges posed by large foundation models and to potentially inspire future breakthroughs in this field.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.523763179779053, 5.2460246086120605], "openalex_id": "https://openalex.org/W4390963175", "title": "Scalable Pre-training of Large Autoregressive Image Models", "authors": "Alaaeldin El-Nouby, Michal Klein, Shuangfei Zhai, Miguel \u00c1ngel Bautista, Alexander Toshev, Vaishaal Shankar, Joshua M. Susskind, Armand Joulin", "abstract": "This paper introduces AIM, a collection of vision models pre-trained with an autoregressive objective. These models are inspired by their textual counterparts, i.e., Large Language Models (LLMs), and exhibit similar scaling properties. Specifically, we highlight two key findings: (1) the performance of the visual features scale with both the model capacity and the quantity of data, (2) the value of the objective function correlates with the performance of the model on downstream tasks. We illustrate the practical implication of these findings by pre-training a 7 billion parameter AIM on 2 billion images, that achieves 84.0% on ImageNet-1k with a frozen trunk. Interestingly, even at this scale, we observe no sign of saturation in performance, suggesting that AIM potentially represents a new frontier for training large-scale vision models. The pre-training of AIM is similar to the pre-training of LLMs, and does not require any image-specific strategy to stabilize the training at scale.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.1890692710876465, -1.6113901138305664], "openalex_id": "https://openalex.org/W4390962457", "title": "Milestones in Bengali Sentiment Analysis leveraging Transformer-models: Fundamentals, Challenges and Future Directions", "authors": "Saptarshi Sengupta, Shreya Ghosh, Prasenjit Mitra, Tarikul Islam Tamiti", "abstract": "Sentiment Analysis (SA) refers to the task of associating a view polarity (usually, positive, negative, or neutral; or even fine-grained such as slightly angry, sad, etc.) to a given text, essentially breaking it down to a supervised (since we have the view labels apriori) classification task. Although heavily studied in resource-rich languages such as English thus pushing the SOTA by leaps and bounds, owing to the arrival of the Transformer architecture, the same cannot be said for resource-poor languages such as Bengali (BN). For a language spoken by roughly 300 million people, the technology enabling them to run trials on their favored tongue is severely lacking. In this paper, we analyze the SOTA for SA in Bengali, particularly, Transformer-based models. We discuss available datasets, their drawbacks, the nuances associated with Bengali i.e. what makes this a challenging language to apply SA on, and finally provide insights for future direction to mitigate the limitations in the field.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.7066874504089355, 1.1824395656585693], "openalex_id": "https://openalex.org/W4390913218", "title": "Towards adaptive support for self\u2010regulated learning of causal relations: Evaluating four Dutch word vector models", "authors": "H\u00e9ctor J. Pijeira\u2010D\u00edaz, Sophia Braumann, Janneke van de Pol, Tamara van Gog, Anique B. H. de Bruin", "abstract": "Advances in computational language models increasingly enable adaptive support for self\u2010regulated learning (SRL) in digital learning environments (DLEs; eg, via automated feedback). However, the accuracy of those models is a common concern for educational stakeholders (eg, policymakers, researchers, teachers and learners themselves). We compared the accuracy of four Dutch language models (ie, spaCy medium, spaCy large, FastText and ConceptNet NumberBatch) in the context of secondary school students' learning of causal relations from expository texts, scaffolded by causal diagram completion. Since machine learning relies on human\u2010labelled data for the best results, we used a dataset with 10,193 students' causal diagram answers, compiled over a decade of research using a diagram completion intervention to enhance students' monitoring of their text comprehension. The language models were used in combination with four popular machine learning classifiers (ie, logistic regression, random forests, support vector machine and neural networks) to evaluate their performance on automatically scoring students' causal diagrams in terms of the correctness of events and their sequence (ie, the causal structure). Five performance metrics were studied, namely accuracy, precision, recall, F 1 and the area under the curve of the receiver operating characteristic (ROC\u2010AUC). The spaCy medium model combined with the neural network classifier achieved the best performance for the correctness of causal events in four of the five metrics, while the ConceptNet NumberBatch model worked best for the correctness of the causal sequence. These evaluation results provide a criterion for model adoption to adaptively support SRL of causal relations in DLEs. Practitioner notes What is already known about this topic Accurate monitoring is a prerequisite for effective self\u2010regulation. Students struggle to accurately monitor their comprehension of causal relations in texts. Completing causal diagrams improves students' monitoring accuracy, but there is room for further improvement. Automatic scoring could be used to provide adaptive support during diagramming. What this paper adds Comparison of four Dutch word vector models combined with four machine learning classifiers for the automatic scoring of students' causal diagrams. Five performance metrics to evaluate the above solutions. Evaluation of the word vector models for estimating the semantic similarity between student and model answers. Implications for practice and/or policy High\u2010quality word vector models could (em)power adaptive support during causal diagramming via automatic scoring. The evaluated solutions can be embedded in digital learning environments (DLEs). Criteria for model adoption to adaptively support SRL of causal relations in DLEs. The increased saliency of (in)correct answers via automatic scoring might help to improve students' monitoring accuracy.", "venue": "British Journal of Educational Technology", "label": 10}, {"loc": [7.506566047668457, 1.4073636531829834], "openalex_id": "https://openalex.org/W4390897546", "title": "AboutMe: Using Self-Descriptions in Webpages to Document the Effects of English Pretraining Data Filters", "authors": "Li Lucy, Suchin Gururangan, Luca Soldaini, Emma Strubell, David Bamman, Lauren Klein, Jesse Dodge", "abstract": "Large language models' (LLMs) abilities are drawn from their pretraining data, and model development begins with data curation. However, decisions around what data is retained or removed during this initial stage are under-scrutinized. In our work, we ground web text, which is a popular pretraining data source, to its social and geographic contexts. We create a new dataset of 10.3 million self-descriptions of website creators, and extract information about who they are and where they are from: their topical interests, social roles, and geographic affiliations. Then, we conduct the first study investigating how ten \"quality\" and English language identification (langID) filters affect webpages that vary along these social dimensions. Our experiments illuminate a range of implicit preferences in data curation: we show that some quality classifiers act like topical domain filters, and langID can overlook English content from some regions of the world. Overall, we hope that our work will encourage a new line of research on pretraining data curation practices and its social implications.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.718215227127075, -0.44452083110809326], "openalex_id": "https://openalex.org/W4390891018", "title": "Online Health Search Via Multidimensional Information Quality Assessment Based on Deep Language Models: Algorithm Development and Validation", "authors": "Boya Zhang, Nona Naderi, Rahul Mishra, Douglas Teodoro", "abstract": "Background Widespread misinformation in web resources can lead to serious implications for individuals seeking health advice. Despite that, information retrieval models are often focused only on the query-document relevance dimension to rank results. Objective We investigate a multidimensional information quality retrieval model based on deep learning to enhance the effectiveness of online health care information search results. Methods In this study, we simulated online health information search scenarios with a topic set of 32 different health-related inquiries and a corpus containing 1 billion web documents from the April 2019 snapshot of Common Crawl. Using state-of-the-art pretrained language models, we assessed the quality of the retrieved documents according to their usefulness, supportiveness, and credibility dimensions for a given search query on 6030 human-annotated, query-document pairs. We evaluated this approach using transfer learning and more specific domain adaptation techniques. Results In the transfer learning setting, the usefulness model provided the largest distinction between help- and harm-compatible documents, with a difference of +5.6%, leading to a majority of helpful documents in the top 10 retrieved. The supportiveness model achieved the best harm compatibility (+2.4%), while the combination of usefulness, supportiveness, and credibility models achieved the largest distinction between help- and harm-compatibility on helpful topics (+16.9%). In the domain adaptation setting, the linear combination of different models showed robust performance, with help-harm compatibility above +4.4% for all dimensions and going as high as +6.8%. Conclusions These results suggest that integrating automatic ranking models created for specific information quality dimensions can increase the effectiveness of health-related information retrieval. Thus, our approach could be used to enhance searches made by individuals seeking online health information.", "venue": "JMIR AI", "label": 0}, {"loc": [4.0861287117004395, 3.8076794147491455], "openalex_id": "https://openalex.org/W4390833824", "title": "TOFU: A Task of Fictitious Unlearning for LLMs", "authors": "Pratyush Maini, Zhili Feng, Avi Schwarzschild, Zachary C. Lipton, J. Zico Kolter", "abstract": "Large language models trained on massive corpora of data from the web can memorize and reproduce sensitive or private data raising both legal and ethical concerns. Unlearning, or tuning models to forget information present in their training data, provides us with a way to protect private data after training. Although several methods exist for such unlearning, it is unclear to what extent they result in models equivalent to those where the data to be forgotten was never learned in the first place. To address this challenge, we present TOFU, a Task of Fictitious Unlearning, as a benchmark aimed at helping deepen our understanding of unlearning. We offer a dataset of 200 diverse synthetic author profiles, each consisting of 20 question-answer pairs, and a subset of these profiles called the forget set that serves as the target for unlearning. We compile a suite of metrics that work together to provide a holistic picture of unlearning efficacy. Finally, we provide a set of baseline results from existing unlearning algorithms. Importantly, none of the baselines we consider show effective unlearning motivating continued efforts to develop approaches for unlearning that effectively tune models so that they truly behave as if they were never trained on the forget data at all.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.188525438308716, -0.2734997272491455], "openalex_id": "https://openalex.org/W4390833504", "title": "EpilepsyLLM: Domain-Specific Large Language Model Fine-tuned with Epilepsy Medical Knowledge", "authors": "Xuyang Zhao, Qibin Zhao, Toshihisa Tanaka", "abstract": "With large training datasets and massive amounts of computing sources, large language models (LLMs) achieve remarkable performance in comprehensive and generative ability. Based on those powerful LLMs, the model fine-tuned with domain-specific datasets posseses more specialized knowledge and thus is more practical like medical LLMs. However, the existing fine-tuned medical LLMs are limited to general medical knowledge with English language. For disease-specific problems, the model's response is inaccurate and sometimes even completely irrelevant, especially when using a language other than English. In this work, we focus on the particular disease of Epilepsy with Japanese language and introduce a customized LLM termed as EpilepsyLLM. Our model is trained from the pre-trained LLM by fine-tuning technique using datasets from the epilepsy domain. The datasets contain knowledge of basic information about disease, common treatment methods and drugs, and important notes in life and work. The experimental results demonstrate that EpilepsyLLM can provide more reliable and specialized medical knowledge responses.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.388826370239258, -1.481157898902893], "openalex_id": "https://openalex.org/W4390810491", "title": "Pre-trained Large Language Models for Financial Sentiment Analysis", "authors": "Wei Luo, Dihong Gong", "abstract": "Financial sentiment analysis refers to classifying financial text contents into sentiment categories (e.g. positive, negative, and neutral). In this paper, we focus on the classification of financial news title, which is a challenging task due to a lack of large amount of training samples. To overcome this difficulty, we propose to adapt the pretrained large language models (LLMs) [1, 2, 3] to solve this problem. The LLMs, which are trained from huge amount of text corpora,have an advantage in text understanding and can be effectively adapted to domain-specific task while requiring very few amount of training samples. In particular, we adapt the open-source Llama2-7B model (2023) with the supervised fine-tuning (SFT) technique [4]. Experimental evaluation shows that even with the 7B model (which is relatively small for LLMs), our approach significantly outperforms the previous state-of-the-art algorithms.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.153195381164551, -0.586128830909729], "openalex_id": "https://openalex.org/W4390784450", "title": "An Assessment on Comprehending Mental Health through Large Language Models", "authors": "Mihael Ar\u010dan, Paul-David Niland, Fionn Delahunty", "abstract": "Mental health challenges pose considerable global burdens on individuals and communities. Recent data indicates that more than 20% of adults may encounter at least one mental disorder in their lifetime. On the one hand, the advancements in large language models have facilitated diverse applications, yet a significant research gap persists in understanding and enhancing the potential of large language models within the domain of mental health. On the other hand, across various applications, an outstanding question involves the capacity of large language models to comprehend expressions of human mental health conditions in natural language. This study presents an initial evaluation of large language models in addressing this gap. Due to this, we compare the performance of Llama-2 and ChatGPT with classical Machine as well as Deep learning models. Our results on the DAIC-WOZ dataset show that transformer-based models, like BERT or XLNet, outperform the large language models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.836010456085205, 1.2833523750305176], "openalex_id": "https://openalex.org/W4390781417", "title": "AI's Secret Weapon in Education. ChatGPT\u2013The Future of Personalized Learning", "authors": "Anca Popescu", "abstract": "This article delves into the fascinating intersection of AI and education, with a focus on ChatGPT, a conversational AI developed by OpenAI. Noted for its human-like responses, ChatGPT is positioned as a game-changer in personalized learning. The paper aims to highlight the untapped potential of ChatGPT in educational settings and advocate for its broader adoption by educators to enhance student learning outcomes. To achieve this objective, systematic research was conducted using a variety of data sources, literature sources, research studies, and online data focused on the efficacy of ChatGPT in academic environments.", "venue": "Bulletin of the Transilvania University of Brasov Series V Economic Sciences", "label": 0}, {"loc": [5.957335472106934, 5.848855495452881], "openalex_id": "https://openalex.org/W4390768707", "title": "Cascaded transformer-based networks for wikipedia large-scale image-caption matching", "authors": "Nicola Messina, Davide Alessandro Coccomini, Andrea Esuli, Fabrizio Falchi", "abstract": "Abstract With the increasing importance of multimedia and multilingual data in online encyclopedias, novel methods are needed to fill domain gaps and automatically connect different modalities for increased accessibility. For example, Wikipedia is composed of millions of pages written in multiple languages. Images, when present, often lack textual context, thus remaining conceptually floating and harder to find and manage. In this work, we tackle the novel task of associating images from Wikipedia pages with the correct caption among a large pool of available ones written in multiple languages, as required by the image-caption matching Kaggle challenge organized by the Wikimedia Foundation. A system able to perform this task would improve the accessibility and completeness of the underlying multi-modal knowledge graph in online encyclopedias. We propose a cascade of two models powered by the recent Transformer networks able to efficiently and effectively infer a relevance score between the query image data and the captions. We verify through extensive experiments that the proposed cascaded approach effectively handles a large pool of images and captions while maintaining bounded the overall computational complexity at inference time. With respect to other approaches in the challenge leaderboard, we can achieve remarkable improvements over the previous proposals (+8% in nDCG@5 with respect to the sixth position) with constrained resources. The code is publicly available at https://tinyurl.com/wiki-imcap.", "venue": "Multimedia Tools and Applications", "label": 42}, {"loc": [6.662161350250244, 0.1745918095111847], "openalex_id": "https://openalex.org/W4390690148", "title": "German Text Embedding Clustering Benchmark", "authors": "Silvan Wehrli, Bert Arnrich, Christopher Irrgang", "abstract": "This work introduces a benchmark assessing the performance of clustering German text embeddings in different domains. This benchmark is driven by the increasing use of clustering neural text embeddings in tasks that require the grouping of texts (such as topic modeling) and the need for German resources in existing benchmarks. We provide an initial analysis for a range of pre-trained mono- and multilingual models evaluated on the outcome of different clustering algorithms. Results include strong performing mono- and multilingual models. Reducing the dimensions of embeddings can further improve clustering. Additionally, we conduct experiments with continued pre-training for German BERT models to estimate the benefits of this additional training. Our experiments suggest that significant performance improvements are possible for short text. All code and datasets are publicly available.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.1205437183380127, -0.31845054030418396], "openalex_id": "https://openalex.org/W4390663110", "title": "Reliability of large language models in managing odontogenic sinusitis clinical scenarios: a preliminary multidisciplinary evaluation", "authors": "Alberto Maria Saibene, Fabiana Allevi, Christian Calvo\u2010Henr\u00edquez, Antonino Maniaci, Miguel Mayo\u2010Y\u00e1\u00f1ez, Alberto Paderno, Luigi Angelo Vaira, Giovanni Felisati, John R. Craig", "abstract": "Abstract Purpose This study aimed to evaluate the utility of large language model (LLM) artificial intelligence tools, Chat Generative Pre-Trained Transformer (ChatGPT) versions 3.5 and 4, in managing complex otolaryngological clinical scenarios, specifically for the multidisciplinary management of odontogenic sinusitis (ODS). Methods A prospective, structured multidisciplinary specialist evaluation was conducted using five ad hoc designed ODS-related clinical scenarios. LLM responses to these scenarios were critically reviewed by a multidisciplinary panel of eight specialist evaluators (2 ODS experts, 2 rhinologists, 2 general otolaryngologists, and 2 maxillofacial surgeons). Based on the level of disagreement from panel members, a Total Disagreement Score (TDS) was calculated for each LLM response, and TDS comparisons were made between ChatGPT3.5 and ChatGPT4, as well as between different evaluators. Results While disagreement to some degree was demonstrated in 73/80 evaluator reviews of LLMs\u2019 responses, TDSs were significantly lower for ChatGPT4 compared to ChatGPT3.5. Highest TDSs were found in the case of complicated ODS with orbital abscess, presumably due to increased case complexity with dental, rhinologic, and orbital factors affecting diagnostic and therapeutic options. There were no statistically significant differences in TDSs between evaluators\u2019 specialties, though ODS experts and maxillofacial surgeons tended to assign higher TDSs. Conclusions LLMs like ChatGPT, especially newer versions, showed potential for complimenting evidence-based clinical decision-making, but substantial disagreement was still demonstrated between LLMs and clinical specialists across most case examples, suggesting they are not yet optimal in aiding clinical management decisions. Future studies will be important to analyze LLMs\u2019 performance as they evolve over time.", "venue": "European Archives of Oto-Rhino-Laryngology", "label": 0}, {"loc": [7.211068153381348, 2.4424142837524414], "openalex_id": "https://openalex.org/W4390690464", "title": "DeepSeek LLM: Scaling Open-Source Language Models with Longtermism", "authors": "DeepSeek-AI, NULL AUTHOR_ID, Xiao Guo Bi, Deli Chen, Guanting Chen, Shanhuang Chen, Damai Dai, Chengqi Deng, Honghui Ding, Kai Dong, Qiushi Du, Zhe Fu, Huazuo Gao, Kaige Gao, Wenjun Gao, Ruiqi Ge, Kang Guan, Daya Guo, Jianzhong Guo, Guangbo Hao, Zhewen Hao, Ying He, Wenjie Hu, Panpan Huang, Erhang Li, Guowei Li, Jiashi Li, Yao Li, Y. K. Li, Wenfeng Liang, Fangyun Lin, Andi Liu, Bo Liu, Wen Liu, Xiaodong Liu, Xin Liu, Yiyuan Liu, Haoyu Lu, Shanghao Lu, Fuli Luo, Shirong Ma, Xiaotao Nie, Pei Tian, Yishi Piao, Junjie Qiu, Hui Qu, Tongzheng Ren, Zehui Ren, Chong Ruan, Zhangli Sha, Zhihong Shao, Junxiao Song, X.-J Su, Jingxiang Sun, Yaofeng Sun, Minghui Tang, Bingxuan Wang, Peiyi Wang, Shiyu Wang, Yaohui Wang, Yongji Wang, Tong Wu, Yicheng Wu, Xin Xie, Zhenda Xie, Ziwei Xie, Yiliang Xiong, Hanwei Xu, Renyuan Xu, Yanhong Xu, Dejian Yang, Yuxiang You, Shuiping Yu, Xingkai Yu, Bo Zhang, Haowei Zhang, L. Zhang, Liyue Zhang, Mingchuan Zhang, Minghua Zhang, Wentao Zhang, Yichao Zhang, Chenggang Zhao, Yao Zhao, Shangyan Zhou, Shunfeng Zhou, Qihao Zhu, Yuheng Zou", "abstract": "The rapid development of open-source large language models (LLMs) has been truly remarkable. However, the scaling law described in previous literature presents varying conclusions, which casts a dark cloud over scaling LLMs. We delve into the study of scaling laws and present our distinctive findings that facilitate scaling of large scale models in two commonly used open-source configurations, 7B and 67B. Guided by the scaling laws, we introduce DeepSeek LLM, a project dedicated to advancing open-source language models with a long-term perspective. To support the pre-training phase, we have developed a dataset that currently consists of 2 trillion tokens and is continuously expanding. We further conduct supervised fine-tuning (SFT) and Direct Preference Optimization (DPO) on DeepSeek LLM Base models, resulting in the creation of DeepSeek Chat models. Our evaluation results demonstrate that DeepSeek LLM 67B surpasses LLaMA-2 70B on various benchmarks, particularly in the domains of code, mathematics, and reasoning. Furthermore, open-ended evaluations reveal that DeepSeek LLM 67B Chat exhibits superior performance compared to GPT-3.5.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.1645054817199707, -0.9154328107833862], "openalex_id": "https://openalex.org/W4390619108", "title": "KEPA-CRF: Knowledge expansion prototypical amortized conditional random field for few-shot event detection", "authors": "Rong Wu, Long Yu, Shengwei Tian, Jun Long, Tiejun Zhou, Bo Wang", "abstract": "This article has been retracted. A retraction notice can be found at https://doi.org/10.3233/JIFS-219433.", "venue": "Journal of Intelligent & Fuzzy Systems", "label": 0}, {"loc": [5.3367533683776855, 0.30206233263015747], "openalex_id": "https://openalex.org/W4390632530", "title": "Are LLMs Robust for Spoken Dialogues?", "authors": "Seyed Mahed Mousavi, Gabriel Roccabruna, Simone Alghisi, Massimo Rizzoli, Mirco Ravanelli, Giuseppe Riccardi", "abstract": "Large Pre-Trained Language Models have demonstrated state-of-the-art performance in different downstream tasks, including dialogue state tracking and end-to-end response generation. Nevertheless, most of the publicly available datasets and benchmarks on task-oriented dialogues focus on written conversations. Consequently, the robustness of the developed models to spoken interactions is unknown. In this work, we have evaluated the performance of LLMs for spoken task-oriented dialogues on the DSTC11 test sets. Due to the lack of proper spoken dialogue datasets, we have automatically transcribed a development set of spoken dialogues with a state-of-the-art ASR engine. We have characterized the ASR-error types and their distributions and simulated these errors in a large dataset of dialogues. We report the intrinsic (perplexity) and extrinsic (human evaluation) performance of fine-tuned GPT-2 and T5 models in two subtasks of response generation and dialogue state tracking, respectively. The results show that LLMs are not robust to spoken noise by default, however, fine-tuning/training such models on a proper dataset of spoken TODs can result in a more robust performance.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.3284807205200195, 5.180388450622559], "openalex_id": "https://openalex.org/W4390632625", "title": "Linguistic Profiling of Deepfakes: An Open Database for Next-Generation Deepfake Detection", "authors": "Yabin Wang, Zhiwu Huang, Zhiheng Ma, Xiaopeng Hong", "abstract": "The emergence of text-to-image generative models has revolutionized the field of deepfakes, enabling the creation of realistic and convincing visual content directly from textual descriptions. However, this advancement presents considerably greater challenges in detecting the authenticity of such content. Existing deepfake detection datasets and methods often fall short in effectively capturing the extensive range of emerging deepfakes and offering satisfactory explanatory information for detection. To address the significant issue, this paper introduces a deepfake database (DFLIP-3K) for the development of convincing and explainable deepfake detection. It encompasses about 300K diverse deepfake samples from approximately 3K generative models, which boasts the largest number of deepfake models in the literature. Moreover, it collects around 190K linguistic footprints of these deepfakes. The two distinguished features enable DFLIP-3K to develop a benchmark that promotes progress in linguistic profiling of deepfakes, which includes three sub-tasks namely deepfake detection, model identification, and prompt prediction. The deepfake model and prompt are two essential components of each deepfake, and thus dissecting them linguistically allows for an invaluable exploration of trustworthy and interpretable evidence in deepfake detection, which we believe is the key for the next-generation deepfake detection. Furthermore, DFLIP-3K is envisioned as an open database that fosters transparency and encourages collaborative efforts to further enhance its growth. Our extensive experiments on the developed benchmark verify that our DFLIP-3K database is capable of serving as a standardized resource for evaluating and comparing linguistic-based deepfake detection, identification, and prompt prediction techniques.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.068070888519287, 5.337311267852783], "openalex_id": "https://openalex.org/W4390740495", "title": "Intelligent Phishing Website Detection Model Powered by Deep Learning Techniques", "authors": "Godson Chetachi Uzoaru, Ndubuisi Henry Odikwa, Obioma Aloysius Agbugba", "abstract": "Phishing websites or URLs differ from software flaws as they exploit human vulnerabilities rather than technical weaknesses. Various methods exist to undermine the security of an internet user, but the most prevalent approach is phishing. This sort of assault aims to acquire or exploit a user's personal data, including passwords, credit card details, identity, and account information. Phishers gather user information by pretending to be authentic websites that are visually indistinguishable. Users' confidential data can be potentially retrieved, exposing them to the possibility of financial detriment or identity fraud. Consequently, there is a pressing requirement to develop a system that efficiently identifies phishing websites. This research presents three discrete deep learning methodologies for identifying phishing websites, which involve the use of long short-term memory (LSTM) and convolutional neural network (CNN) for comparison, and ultimately an LSTM-CNN-based methodology. The experimental results confirm the precision of the proposed methods, specifically 99.2%, 97.6%, and 96.8% for CNN, LSTM\u2013CNN, and LSTM, respectively. The CNN-based technology displayed a superior phishing detection mechanism.", "venue": "Asian Journal of Research in Computer Science", "label": 0}, {"loc": [6.378941059112549, 2.281632661819458], "openalex_id": "https://openalex.org/W4390573446", "title": "LLaMA Beyond English: An Empirical Study on Language Capability Transfer", "authors": "Jun Zhao, Zhihao Zhang, Qi Zhang, Tao Gui, Xuanjing Huang", "abstract": "In recent times, substantial advancements have been witnessed in large language models (LLMs), exemplified by ChatGPT, showcasing remarkable proficiency across a range of complex tasks. However, many mainstream LLMs (e.g. LLaMA) are pretrained on English-dominant corpus, which limits their performance in other non-English languages. In this paper, we focus on how to effectively transfer the capabilities of language generation and following instructions to a non-English language. To answer this question, we conduct an extensive empirical investigation based on LLaMA, accumulating over 1440 GPU hours. We analyze the impact of key factors such as vocabulary extension, further pretraining, and instruction tuning on transfer. To accurately assess the model's level of knowledge, we employ four widely used standardized testing benchmarks: C-Eval, MMLU, AGI-Eval, and GAOKAO-Bench. Furthermore, a comprehensive evaluation of the model's response quality is conducted, considering aspects such as accuracy, fluency, informativeness, logical coherence, and harmlessness, based on LLM-Eval, a benchmarks consisting instruction tasks from 17 diverse categories. Our evaluation results demonstrate that comparable performance to state-of-the-art transfer models can be achieved with less than 1% of the pretraining data, both in terms of knowledge alignment and response quality. Furthermore, the experimental outcomes across the thirteen low-resource languages also exhibit similar trends. We anticipate that the conclusions revealed by the experiments will aid the community in developing non-English LLMs.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.921083450317383, 0.08635148406028748], "openalex_id": "https://openalex.org/W4390542472", "title": "Answer Agnostic Question Generation in Bangla Language", "authors": "Abdur Rahman Fahad, Nazme Al Nahian, Md Ahanaf Islam, Rashedur M. Rahman", "abstract": "Abstract Question generation (QG) from a given context paragraph is a demanding task in natural language processing for its practical applications and prospects in various fields. Several studies have been conducted on QG in high-resource languages like English, however, very few have been done on resource-poor languages like Arabic and Bangla. In this work, we propose a finetuning method for QG that uses pre-trained transformer-based language models to generate questions from a given context paragraph in Bangla. Our approach is based on the idea that a transformer-based language model can be used to learn the relationships between words and phrases in a context paragraph which allows the models to generate questions that are both relevant and grammatically correct. We finetuned three different transformer models: (1) BanglaT5, (2) mT5-base, (3) BanglaGPT2, and demonstrated their capabilities using two different data formatting techniques: (1) AQL\u2014All Question Per Line, (2) OQL\u2014One Question Per Line, making it a total of six different variations of QG models. For each of these variants, six different decoding algorithms: (1) Greedy search, (2) Beam search, (3) Random Sampling, (4) Top K sampling, (5) Top- p Sampling, 6) a combination of Top K and Top-p Sampling were used to generate questions from the test dataset. For evaluation of the quality of questions generated using different models and decoding techniques, we also fine-tuned another transformer model BanglaBert on two custom datasets of our own and created two question classifier (QC) models that check the relevancy and Grammatical correctness of the questions generated by our QG models. The QC models showed test accuracy of 88.54% and 95.76% in the case of correctness and relevancy checks, respectively. Our results show that among all the variants of the QG, the mT5 OQL approach and beam decoding algorithm outperformed all the other ones in terms of relevancy (77%) and correctness (96%) with 36.60 Bleu_4, 48.98 METEOR, and 63.38 ROUGE-L scores.", "venue": "\u0098The \u009cInternational journal of networked and distributed computing", "label": 0}, {"loc": [9.48359489440918, 0.7245410680770874], "openalex_id": "https://openalex.org/W4390529318", "title": "Large Language Models aren't all that you need", "authors": "Kiran Voderhobli Holla, Chaithanya Kumar, Aryan Singh", "abstract": "This paper describes the architecture and systems built towards solving the SemEval 2023 Task 2: MultiCoNER II (Multilingual Complex Named Entity Recognition) [1]. We evaluate two approaches (a) a traditional Conditional Random Fields model and (b) a Large Language Model (LLM) fine-tuned with a customized head and compare the two approaches. The novel ideas explored are: 1) Decaying auxiliary loss (with residual) - where we train the model on an auxiliary task of Coarse-Grained NER and include this task as a part of the loss function 2) Triplet token blending - where we explore ways of blending the embeddings of neighboring tokens in the final NER layer prior to prediction 3) Task-optimal heads - where we explore a variety of custom heads and learning rates for the final layer of the LLM. We also explore multiple LLMs including GPT-3 and experiment with a variety of dropout and other hyperparameter settings before arriving at our final model which achieves micro & macro f1 of 0.85/0.84 (on dev) and 0.67/0.61 on the test data. We show that while pre-trained LLMs, by themselves, bring about a large improvement in scores as compared to traditional models, we also demonstrate that tangible improvements to the Macro-F1 score can be made by augmenting the LLM with additional feature/loss/model engineering techniques described above.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.642018795013428, 1.7682569026947021], "openalex_id": "https://openalex.org/W4390503183", "title": "Parsability revisited and reassessed", "authors": "Sergei Monakhov", "abstract": "Abstract This paper provides evidence that the inveterate way of assessing linguistic items\u2019 degrees of analysability by calculating their derivation to base frequency ratios may obfuscate the difference between two meaning processing models: one based on the principle of compositionality and another on the principle of parsability. I propose to capture the difference between these models by estimating the ratio of two transitional probabilities for complex words: P (affix | base) and P (base | affix). When transitional probabilities are comparably low, each of the elements entering into combination is equally free to vary. The combination itself is judged by speakers to be semantically transparent, and its derivational element tends to be more linguistically productive. In contrast, multi-morphemic words that are characterised by greater discrepancies between transitional probabilities are similar to collocations in the sense that they also consist of a node (conditionally independent element) and a collocate (conditionally dependent element). Such linguistic expressions are also considered to be semantically complex but appear less transparent because the collocate\u2019s meaning does not coincide with the meaning of the respective free element (even if it exists) and has to be parsed out from what is available.", "venue": "Journal of Linguistics", "label": 0}, {"loc": [2.4628424644470215, 1.622778058052063], "openalex_id": "https://openalex.org/W4403488999", "title": "Higher Education's Generative Artificial Intelligence Paradox: The Meaning of Chatbot Mania", "authors": "Tendai Blessing Chigora", "abstract": "The study is on The Paradox of Social-Emotional Learning (SEL) and Generative Artificial Intelligence (GAI) in Design and Technology (D&T): High School Context. This study is guided by the rational framework, anchored on the constructivism paradigm whose mother is the interpretivist paradigm of philosophy. Hinged on this philosophy, the study adopted a qualitative approach, following a systematic review of the available related literature, newsletters, podcasts and webinars on GAI in education and AI in instruction, best practices in D&T, case studies and interviews with the selected thirty-five D&T professionals drawn from; 5 D&T high school teachers, with an equal spread of 5year difference interval teaching D&T, 5 D&T examiners, 5 subject moderators, 5 D&T consultants and 5 Tertiary D&T teacher trainers and 10 from private high schools currently applying SEL in their curriculum. The commonality among the interviewees was their strong focus on CAD/CAM in design and manufacturing. The literature gathered using key research words and interview results was cross-reviewed and thematically analysed. It surfaced that the D&T professional community feel that GAI has immense potential to foster creativity in problem-solving, by allowing simulations of possible ideas in Virtual Reality (VR) environments and speeding up the design processes; research and ideation stages. Efficient leveraging of the capabilities of GAI; blended with the Social Emotional Learning (SEL) skills can influence students to explore and experiment with design concepts more efficiently, freeing up time for higher-order thinking and collaborative problem-solving. Having a clear and precise understanding of the impacts of GAI in D&T is significant because D&T competencies help students engage in real-world issues. Fusing GAI to D&T is a risky yet potential path because of the possible shortcomings and benefits it could have in actual- or authentic-problem solving within the classroom and society. The study recommends and provides insights for policymakers and researchers in the field of education with particular thrust on D&T to foster ethical considerations and data privacy, embrace and invest in Virtual Instruction (VI) tools specifically tailored for personalised learning, promote Professional Development (PD) and Capacity Building, embark on a rigorous curriculum review and alignment with GAI and SEL infusion in curriculum redesign.", "venue": "International journal of research and scientific innovation", "label": 0}, {"loc": [6.2618088722229, 4.8083696365356445], "openalex_id": "https://openalex.org/W4403447496", "title": "Survey of different Large Language Model Ar-chitectures: Trends, Benchmarks, and Challenges", "authors": "Minghao Shao, Abdul Basit, Ramesh Karri, Muhammad Shafique", "abstract": "Large Language Models (LLMs) represent a class of deep learning models adept\\nat understanding natural language and generating coherent responses to various\\nprompts or queries. These models far exceed the complexity of conventional\\nneural networks, often encompassing dozens of neural network layers and\\ncontaining billions to trillions of parameters. They are typically trained on\\nvast datasets, utilizing architectures based on transformer blocks. Present-day\\nLLMs are multi-functional, capable of performing a range of tasks from text\\ngeneration and language translation to question answering, as well as code\\ngeneration and analysis. An advanced subset of these models, known as\\nMultimodal Large Language Models (MLLMs), extends LLM capabilities to process\\nand interpret multiple data modalities, including images, audio, and video.\\nThis enhancement empowers MLLMs with capabilities like video editing, image\\ncomprehension, and captioning for visual content. This survey provides a\\ncomprehensive overview of the recent advancements in LLMs. We begin by tracing\\nthe evolution of LLMs and subsequently delve into the advent and nuances of\\nMLLMs. We analyze emerging state-of-the-art MLLMs, exploring their technical\\nfeatures, strengths, and limitations. Additionally, we present a comparative\\nanalysis of these models and discuss their challenges, potential limitations,\\nand prospects for future development.\\n", "venue": "IEEE Access", "label": 2}, {"loc": [2.5847373008728027, 2.7022504806518555], "openalex_id": "https://openalex.org/W4403309540", "title": "AI for GovTech", "authors": "Mohd Hilal Muhammad, Ahmad Afif Ahmarofi, Mohd Zhafri Mohd Zukhi, Muhammad Khairul Zharif Nor A\u2019zam, Muhammad Hanif Othman", "abstract": "The rapid advancement of digital technologies has underscored the critical role of Internet Exchange (IX) points in shaping the Digital Economy and facilitating AI-driven governance (GovTech). This study addresses the problem of how IX infrastructure impacts the efficiency and effectiveness of digital services and AI applications within governance structure. The aim of the study is to explore the synergy between IX points and the Digital Economy and to analyse their combined influence on AI-driven governance, particularly focusing on how these elements contribute to enhanced public administration and economic growth. Utilizing a mixed-methods approach, the study integrates quantitative analysis of IX infrastructure impact, comparative case studies, and empirical data from recent advancements in digital technologies. Data were collected through surveys of key stakeholders, performance metrics of digital services, and case studies from diverse geographical contexts. The analysis employed statistical methods to assess correlations between IX infrastructure quality and digital economy outcomes, as well as the effectiveness of AI integration in governance. The findings reveal that robust IX infrastructure significantly enhances digital transaction efficiency and supports the effective deployment of AI technologies in governance. High-quality IX points contribute to reduced latency and improved data throughput, which are essential for real-time analytics and decision-making processes in public administration. This synergy fosters a reinforcing loop where advancements in IX infrastructure bolster AI capabilities, driving both economic growth and enhanced governance. The implications of the study are twofold. Theoretically, it extends the Resource-Based View (RBV) and Technology Acceptance Model (TAM) by integrating them within the context of digital infrastructure and governance. Practically, it underscores the need for policymakers to invest in and equitably distribute IX infrastructure to support digital and AI advancements. The study highlights the importance of addressing regional disparities in IX infrastructure and suggests future research into the long-term effects of IX investments on technological and economic outcomes.", "venue": "International Journal of Research and Innovation in Social Science", "label": 0}, {"loc": [3.1313724517822266, 2.339308977127075], "openalex_id": "https://openalex.org/W4401596325", "title": "Exploring Emergent Phenomena in AI: A Pantheistic Approach to the Underlying Source of Information", "authors": "Zoran Poposki", "abstract": "Abstract This article critically examines the emergent phenomena of AI-generated and NFT art through the lens of Georg Luk\u00e1cs\u2019 theory of reification and its existential implications. Luk\u00e1cs argued that under capitalism, social relations and human experiences are transformed into objective, quantifiable commodities, leading to a fragmented and alienated consciousness. Applying this framework to AI and NFT art, these technologies can be said to represent extreme examples of the reification of art and creativity in the digital age. AI art generators reduce artistic production to abstract, computable properties divorced from lived experience, while NFTs transform digital art into speculative commodities, imposing the logic of private property and exchange value onto the previously open domain of online culture. The existential dimension of this reification is explored, raising questions about the nature of creativity, originality, and the value of art in an increasingly financialized and automated world. The article suggests that a Luk\u00e1csian critique must not only diagnose the reified character of these cultural forms but also identify their potential for resistance and transformation, pointing toward a re-humanized and emancipatory vision of art in the digital age. Contemporary theorists such as Tiziana Terranova, Nick Dyer-Witheford, and Benjamin Noys are invoked to further elucidate these issues.", "venue": "Open Philosophy", "label": 0}, {"loc": [9.378376007080078, 1.4095968008041382], "openalex_id": "https://openalex.org/W4397048908", "title": "SMW Cloud: A Corpus of Domain-Specific Knowledge Graphs from Semantic MediaWikis", "authors": "Daniil Dobriy, Martin Beno, Axel Polleres", "abstract": "Semantic wikis have become an increasingly popular means of collaboratively managing Knowledge Graphs. They are powered by platforms such as Semantic MediaWiki and Wikibase, both of which enable MediaWiki to store and publish structured data. While there are many semantic wikis currently in use, there has been little effort to collect and analyse their structured data, nor to make it available for the research community. This paper seeks to address this gap by systematically collecting structured data from an extensive corpus of Semantic-MediaWiki-powered portals and providing an in-depth analysis of the ontological diversity (and re-use) amongst these wikis using a variety of ontological metrics. Our paper aims to demonstrate that semantic wikis are a valuable and extensive part of Linked Open Data (LOD), and in fact may be considered an own active \"sub-cloud\" within the LOD ecosystem, which can provide useful insights into the evolution of small and medium-sized domain-specific Knowledge Graphs.", "venue": "Lecture notes in computer science", "label": 0}, {"loc": [3.4722836017608643, -0.10538766533136368], "openalex_id": "https://openalex.org/W4399442007", "title": "Fine-tuning large language models: from accuracy enhancement to bias mitigation", "authors": "Wei Zhang, Qinggong Wang, Xiangtai Kong, Jiacheng Xiong, Shengkun Ni, Duanhua Cao, Buying Niu, Mingan Chen, Yameng Li, Runze Zhang, Yitian Wang, Lehan Zhang, Xutong Li, Zhaoping Xiong, Qian Shi, Ziming Huang, Zunyun Fu, Mingyue Zheng", "abstract": "Extracting knowledge from complex chemical texts is essential for both experimental and computational chemists. Fine-tuned large language models (LLMs) can serve as flexible and effective extractors for automated data acquisition.", "venue": "Chemical Science", "label": 0}, {"loc": [5.245617389678955, -1.565580129623413], "openalex_id": "https://openalex.org/W4394610539", "title": "Enhancing Sentiment Analysis Accuracy in Borobudur Temple Visitor Reviews through Semi-Supervised Learning and SMOTE Upsampling", "authors": "Candra Agustina, Purwanto Purwanto, Farikhin Farikhin", "abstract": "The level of visitor satisfaction with tourist destinations can be known from reviews on social media.One method used is to carry out sentiment analysis on comments given by visitors on social media or related websites.This study was envisioned as a preliminary phase to bolster subsequent research concerning tourist destination recommendation systems around Borobudur Temple.We conducted a sentiment analysis using a semisupervised learning approach.Within this approach, the dataset was partitioned into labeled and unlabeled data.The labeled data served as a reference for the automatic labeling process, which utilized the Multinomial Na\u00ef ve Bayes algorithm.Specifically, the objective was to extract sentiments from visitors to Borobudur Temple.These extracted sentiments will later be employed as a variable in subsequent research.Dataset preprocessing steps encompassed data cleaning, sentence segmentation, tokenization, and stop word removal.We observed that the difference in labeling outcomes between datasets trained without Synthetic Minority Oversampling Technique (SMOTE) Upsampling and those trained with SMOTE Upsampling was a mere 0.18%.The labeled data not only plays a pivotal role in model training but is also instrumental in evaluating the accuracy of the Multinomial Na\u00ef ve Bayes algorithm.Crucially, after implementing the SMOTE Upsampling technique, our model exhibited a significant improvement, achieving an accuracy rate of 83.68%.This noteworthy enhancement represents a substantial increase from the initial accuracy rate of 60.59%.Our in-depth analysis underscores the superior performance achieved when the training data undergo the SMOTE Upsampling process, indicating the effectiveness of this approach in refining sentiment analysis outcomes for tourist reviews.", "venue": "Journal of Advances in Information Technology", "label": 0}, {"loc": [4.667966842651367, 0.2983911633491516], "openalex_id": "https://openalex.org/W4393169378", "title": "Design of Artificial Intelligence Companion Chatbot", "authors": "Xiaoying Chen, Jie Kang, Cong Hu", "abstract": "With the development of cities and the prevalence of networks, interpersonal relationships have become increasingly distant. When people crave communication, they hope to find someone to confide in. With the rapid advancement of deep learning and big data technologies, an enabling environment has been established for the development of intelligent chatbot systems. By effectively combining cutting-edge technologies with human-centered design principles, chatbots hold the potential to revolutionize our lives and alleviate feelings of loneliness. A multi-topic chat companion robot based on a state machine has been proposed, which can engage in fluent dialogue with humans and meet different functional requirements. It can chat with users about movies, music, and other related topics, and recommend movies and music that may interest them to alleviate their loneliness and provide companionship. The interaction platform of the companion robot is realized through the QQ communication platform, with two chat modes: Conversation mode and recommendation mode. First, the KdConv open-source corpus was selected, and Python was used to crawl information on movies and music from Douban and QQ Music to establish and pre-process the dataset. Then, the dialogue function was implemented using generative language models and retrieval systems, while the recommendation function was achieved using user profiling and collaborative filtering. Finally, a state machine algorithm was used to achieve real-time switching between the two chat modes of the companion robot. In conclusion, test participants gave high ratings for the accuracy of the companion robot's responses and the satisfaction with its content recommendations. Compared to traditional large-scale integrated models, this robot employs a state-machine framework to achieve diverse functions through seamless state transitions, thereby enhancing computational speed and precision. Additionally, the robot can recommend movies and music, providing companionship and alleviating loneliness for users, which is of great significance in modern society where interpersonal relationships are increasingly alienated.", "venue": "Journal of New Media", "label": 0}, {"loc": [6.847867012023926, 2.516993522644043], "openalex_id": "https://openalex.org/W4390489717", "title": "The Right Prompts for the Job: Repair Code-Review Defects with Large Language Model", "authors": "Zelin Zhao, Zhaogui Xu, Jialong Zhu, Di Peng, Yuan Yao, Xiaoxing Ma", "abstract": "Automatic program repair (APR) techniques have the potential to reduce manual efforts in uncovering and repairing program defects during the code review (CR) process. However, the limited accuracy and considerable time costs associated with existing APR approaches hinder their adoption in industrial practice. One key factor is the under-utilization of review comments, which provide valuable insights into defects and potential fixes. Recent advancements in Large Language Models (LLMs) have enhanced their ability to comprehend natural and programming languages, enabling them to generate patches based on review comments. This paper conducts a comprehensive investigation into the effective utilization of LLMs for repairing CR defects. In this study, various prompts are designed and compared across mainstream LLMs using two distinct datasets from human reviewers and automated checkers. Experimental results demonstrate a remarkable repair rate of 72.97% with the best prompt, highlighting a substantial improvement in the effectiveness and practicality of automatic repair techniques.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.099878311157227, 1.006689190864563], "openalex_id": "https://openalex.org/W4404579283", "title": "Adapter-based Approaches to Knowledge-enhanced Language Models--A Survey", "authors": "Alexander Fichtl, Juraj Vladika, Georg Groh", "abstract": "Knowledge-enhanced language models (KELMs) have emerged as promising tools to\\nbridge the gap between large-scale language models and domain-specific\\nknowledge. KELMs can achieve higher factual accuracy and mitigate\\nhallucinations by leveraging knowledge graphs (KGs). They are frequently\\ncombined with adapter modules to reduce the computational load and risk of\\ncatastrophic forgetting. In this paper, we conduct a systematic literature\\nreview (SLR) on adapter-based approaches to KELMs. We provide a structured\\noverview of existing methodologies in the field through quantitative and\\nqualitative analysis and explore the strengths and potential shortcomings of\\nindividual approaches. We show that general knowledge and domain-specific\\napproaches have been frequently explored along with various adapter\\narchitectures and downstream tasks. We particularly focused on the popular\\nbiomedical domain, where we provided an insightful performance comparison of\\nexisting KELMs. We outline the main trends and propose promising future\\ndirections.\\n", "venue": "https://doi.org/10.5220/0013058500003838", "label": 0}, {"loc": [7.023104667663574, 0.4228798449039459], "openalex_id": "https://openalex.org/W4392666583", "title": "UzRoberta: A pre-trained language model for Uzbek", "authors": "Rifkat Davronov, Fatima \u0410dilova", "abstract": "I would like to introduce the UzLUE standard, which stands for Uzbek Language Understanding Evaluation. A challenge for understanding the natural language of Uzbek (NLU) is UzLUE, which includes message classification. Build jobs from scratch using a diverse source corpus while respecting copyright to make it fully accessible for everyone. We use this UzLUE-RoBERTa, a pre-trained language model (PLM), to support replication of the base model in UzLUE and facilitate future research. We found that UzLUE-RoBERT [1]-base outperforms other baselines including multilingual PLM.", "venue": "AIP conference proceedings", "label": 0}, {"loc": [4.343433380126953, 0.8435485363006592], "openalex_id": "https://openalex.org/W4391790090", "title": "Use of Large Language Models to Aid Analysis of Textual Data", "authors": "Robert H. Tai, Lillian Bentley, Xin Xia, Jason M. Sitt, Sarah C. Fankhauser, Ana M. Chicas\u2010Mosier, Barnas G. Monteith", "abstract": "The increasing use of machine learning and Large Language Models (LLMs) opens up opportunities to use these artificially intelligent algorithms in novel ways. This article proposes a methodology using LLMs to support traditional deductive coding in qualitative research. We began our analysis with three different sample texts taken from existing interviews. Next, we created a codebook and inputted the sample text and codebook into an LLM. We asked the LLM to determine if the codes were present in a sample text provided and requested evidence to support the coding. The sample texts were inputted 160 times to record changes between iterations of the LLM response. Each iteration was analogous to a new coder deductively analyzing the text with the codebook information. In our results, we present the outputs for these recursive analyses, along with a comparison of the LLM coding to evaluations made by human coders using traditional coding methods. We argue that LLM analysis can aid qualitative researchers by deductively coding transcripts, providing a systematic and reliable platform for code identification, and offering a means of avoiding analysis misalignment. Implications of using LLM in research praxis are discussed, along with current limitations.", "venue": "International Journal of Qualitative Methods", "label": 0}, {"loc": [3.629732847213745, 4.545982360839844], "openalex_id": "https://openalex.org/W4393241117", "title": "SimLESS: A Secure Deduplication System over Similar Data in Cloud Media Sharing", "authors": "Mingyang Song, Zhongyun Hua, Yifeng Zheng, Tao Xiang, Xiaohua Jia", "abstract": "With the growing popularity of cloud computing, sharing media data through the cloud has become a common practice. Due to high information redundancy, media data take up a significant amount of storage space. Moreover, similar media data may have the same visual effect, resulting in unnecessary duplication. Thus, it can greatly improve the cloud storage efficiency by performing deduplication to the similar media data stored on the cloud. However, data privacy is a growing concern in cloud-based service. In this paper, we present SimLESS, a secure deduplication system for similar data in cloud media sharing. SimLESS allows the cloud to perform deduplication over the encrypted similar media data of different distributors while protecting the confidentiality and ownership of the data. When uploading a media file, SimLESS allows the distributor to set a distance threshold, and the cloud performs deduplication only when there is a file on the cloud whose distance from the file being uploaded is smaller than the threshold. Additionally, we provide fine-grained access control for distributors to ensure that only authorized media consumers can access the data. Furthermore, our system prevents any distributor from claiming ownership of a media file using only the tag of a similar file. We formally analyze the security of SimLESS and implement a system prototype to evaluate its performance. Our experimental results demonstrate that the computation and communication costs of SimLESS are practically affordable.", "venue": "IEEE Transactions on Information Forensics and Security", "label": 0}, {"loc": [8.998651504516602, 0.15239426493644714], "openalex_id": "https://openalex.org/W4403863482", "title": "QARR-FSQA: Question-Answer Replacement and Removal Pretraining Framework for Few-Shot Question Answering", "authors": "Siao Wah Tan, Chin Poo Lee, Kian Ming Lim, Tee Connie, Ali Alqahtani", "abstract": "In Natural Language Processing, creating training data for question answering (QA) systems \\ntypically requires significant effort and expertise. This challenge is amplified in few-shot scenarios where \\nonly a limited number of training samples are available. This paper proposes a novel pretraining framework \\nto enhance few-shot question answering (FSQA) capabilities. It begins with the selection of the Discrete \\nReasoning Over the Content of Paragraphs (DROP) dataset, designed for English reading comprehension \\ntasks involving various reasoning types. Data preprocessing converts question-answer pairs into a predefined \\ntemplate, consisting of a concatenated sequence of the question, a mask token with a prefix, and the context, \\nforming the input sequence, while the target sequence includes the question and answer. The QuestionAnswer Replacement and Removal (QARR) technique augments the dataset by integrating the answer into \\nthe question and selectively removing words. Various templates for question-answer pairs are introduced. \\nModels like BART, T5, and LED are then used to evaluate the framework\u2019s performance, undergoing further \\npretraining on the augmented dataset with their respective architectures and optimization objectives. The \\nstudy also investigates the impact of different templates on model performance in few-shot QA tasks. \\nEvaluated on three datasets in few-shot scenarios, the QARR-T5 method outperforms state-of-the-art FSQA \\ntechniques, achieving the highest F1 scores of 81.7% in 16-shot and 32-shot, 82.7% in 64-shot, and 84.5% \\nin 128-shot on the SQuAD dataset. This demonstrates the framework\u2019s effectiveness in improving models\u2019 \\ngeneralization and performance on new datasets with limited samples, advancing few-shot QA", "venue": "IEEE Access", "label": 2}, {"loc": [8.547780990600586, -0.0522797666490078], "openalex_id": "https://openalex.org/W4402683972", "title": "PROXYQA: An Alternative Framework for Evaluating Long-Form Text Generation with Large Language Models", "authors": "Haochen Tan, Zhijiang Guo, Zhan Shi, L\u00fc Xu, Zhili Liu, Yunlong Feng, Xiaoguang Li, Yasheng Wang, Lifeng Shang, Qun Liu, Linqi Song", "abstract": "Large Language Models (LLMs) have succeeded remarkably in understanding long-form contents. However, exploring their capability for generating long-form contents, such as reports and articles, has been relatively unexplored and inadequately assessed by existing benchmarks. The prevalent evaluation methods, which predominantly rely on crowdsourcing, are recognized for their labor-intensive nature and lack of efficiency, whereas automated metrics, such as the ROUGE score, demonstrate discordance with human judgment criteria. In this paper, we propose PROXYQA, an innovative framework dedicated to assessing long-text generation. PROXYQA comprises in-depth human-curated meta-questions spanning various domains, each accompanied by specific proxy-questions with pre-annotated answers. LLMs are tasked to generate extensive content in response to these meta-questions, by engaging an evaluator and incorporating the generated texts as contextual background, PROXYQA assesses the generated content's quality through the evaluator's accuracy in addressing the proxy-questions. We examine multiple LLMs, emphasizing PROXYQA's demanding nature as a high-quality assessment tool. Human evaluation demonstrates that the proxy-question method is notably self-consistent and aligns closely with human evaluative standards. The dataset and leaderboard is available at https://proxy-qa.com.", "venue": "http://doi.org/10.18653/v1/2024.acl-long.368", "label": 0}, {"loc": [6.735334873199463, 1.4341906309127808], "openalex_id": "https://openalex.org/W4399268587", "title": "Overview of Existing LLM Families", "authors": "Andrei Kucharavy", "abstract": "Abstract While the general public discovered Large Language Models (LLMs) with ChatGPT\u2014a generative autoregressive model, they are far from the only models in the LLM family. Various architectures and training regiments optimized for specific usages were designed throughout their development, which were then classified as different LLM families.", "venue": "https://doi.org/10.1007/978-3-031-54827-7_3", "label": 0}, {"loc": [7.765803337097168, 3.56947922706604], "openalex_id": "https://openalex.org/W4399449628", "title": "Measuring and Improving the Energy Efficiency of Large Language Models Inference", "authors": "Mauricio Fadel Argerich, Marta Pati\u00f1o-Mart\u0131\u0301nez", "abstract": "Recent improvements in the accuracy of machine learning (ML) models in the language domain have propelled their use in a multitude of products and services, touching millions of lives daily. These new levels of accuracy have been attained mainly through exponential growth in model size, creating a new category of models known as Large Language Models (LLMs) and leading to a substantial increase in computing and energy demands. While recent studies have focused on measuring and improving the energy consumption of LLMs during training, inference has received little attention. In this article, we present an approach to profile the energy consumption of LLMs during inference and leverage it to improve energy efficiency. For this, we deploy several state-of-the-art LLMs and observe how model size, number of layers, parallelized attention, and even vocabulary size affect their energy consumption. In addition, we leverage input batch size and different quantization levels to optimize their inference energy efficiency and latency.", "venue": "IEEE Access", "label": 2}, {"loc": [4.930702209472656, 0.38784265518188477], "openalex_id": "https://openalex.org/W4399809615", "title": "Studies in conversational AI: multilingual capabilities, world knowledge, and evaluation strategies", "authors": "Maxime De Bruyn", "abstract": "This thesis studies the evolving landscape of conversational AI.The main research objective is to improve the conversational abilities of conversational agents, with a focus on integrating real-time knowledge and expanding multilingual capabilities. Integration of External KnowledgeThe thesis investigates how to incorporate external knowledge into conversational agents without the need for retraining the entire model.This aspect is crucial as it deals with the dynamic nature of information and the need for AI agents to stay updated. Assessment of Inherent World KnowledgeAnother critical problem is how to evaluate the inherent world knowledge that users expect from conversational agents.This involves benchmarking the agents' common sense and broad understanding of the world, which is essential for natural and relevant interactions. Refinement of Agent ResponsesThe research also explores refining the agents' ability to select the most appropriate response from a set of potential replies. Multilingual CapabilitiesThe thesis recognizes the growing need for conversational agents to be proficient in languages other than English.It examines how additional datasets can be used to develop agents capable of operating effectively across a multitude of languages.Evaluation Metrics Finally, the thesis addresses the challenge of evaluating conversational agents.Unlike many machine learning applications where a gold standard or reference exists, conversational agents require metrics that acknowledge the multifaceted and subjective nature of conversations, where multiple valid continuations exist.i ii Overall, the thesis presents a comprehensive approach to enhancing knowledgegrounded conversations, emphasizing better access to external and world knowledge, enhancing non-English language capabilities, and developing more effective evaluation strategies.It synthesizes findings from various interdisciplinary studies and sets a path for future research in the field of conversational AI.", "venue": "https://doi.org/10.63028/10067/2023030151162165141", "label": 0}, {"loc": [4.1765336990356445, 1.0026720762252808], "openalex_id": "https://openalex.org/W4404056777", "title": "JusticeAI: A Large Language Models Inspired Collaborative & Cross-Domain Multimodal System for Automatic Judicial Rulings in Smart Courts", "authors": "Nagwan Abdel Samee, Maali Alabdulhafith, Syed Muhammad Ahmed Hassan Shah, Atif Rizwan", "abstract": "There has been a significant amount of attention in recent years toward the utilization of artificial intelligence (AI) in the realm of legal decision-making. This growing pattern reveals a higher interest among academics and legal professionals in utilizing AI technologies to enhance a number of legal system components. Artificial intelligence (AI) tools, such as machine learning and natural language processing, possess the capacity to analyze vast quantities of legal data, extract valuable insights, and facilitate decision-making processes. The primary aim of this study is to develop a sophisticated framework for judicial decision-making that incorporates methodologies from artificial intelligence and utilizes the dataset from the European Court of Human Rights (ECHR). The utilization of this methodology holds promise in improving the decision-making procedures of legal professionals and reducing the laborious task of manually analyzing legal documents. As a result, this can lead to the facilitation of more accurate predictions of court rulings. Our research introduces a hybrid ensemble model designed specifically for smart court rulings. This innovative approach harnesses the benefits of pre-trained embeddings and large language models to accurately predict court decisions. By utilizing the power of pre-existing embeddings and incorporating the capabilities of advanced language models, our proposed model demonstrates enhanced predictive accuracy and efficiency in the context of court rulings. We also focus on the models’ feasible interpretability and highlight their ability to determine key factors in legal decision-making. We attain a notably high accuracy score of around 83%. Our research illuminates how large language models (LLMs) and advanced deep learning techniques can be utilized to predict legal outcomes.", "venue": "IEEE Access", "label": 2}, {"loc": [9.39779281616211, 1.4043606519699097], "openalex_id": "https://openalex.org/W4404781217", "title": "CONTOR: Benchmarking strategies for completing ontologies with plausible missing rules", "authors": "Na Li, Thomas Bailleux, Zied Bouraoui, Steven Schockaert", "abstract": "International audience", "venue": "http://doi.org/10.18653/v1/2024.findings-emnlp.488", "label": 0}, {"loc": [8.91666316986084, 0.11688809841871262], "openalex_id": "https://openalex.org/W4391984060", "title": "Employing Siamese MaLSTM Model and ELMO Word Embedding for Quora Duplicate Questions Detection", "authors": "Abdulaziz Altamimi, Muhammad Umer, Danial Hanif, Shtwai Alsubai, Tai-hoon Kim, Imran Ashraf", "abstract": "Quora is an expanding online platform, that contains a growing collection of questions and answers generated by users. The content on this platform is managed by its users which involves creating, editing, and organization. Due to the vast number of users, it is not uncommon to find multiple questions with similar intents, leading to the problem of duplicate and identical questions. Detection of these duplicates could effectively lead to a more efficient search for high-quality answers, ultimately improving the user experience for both readers and writers on Quora. This study utilizes the dataset of Question Pairs for Quora obtained from Kaggle for identifying questions that are duplicates or identical. To vectorize the questions and for model training, six types of word embeddings are implemented including GoogleNewsVector, FastText crawl, FastText crawl sub-words, bidirectional encoder representations from transformers (BERT), robustly optimized BERT pretraining approach (RoBERTa), and embeddings from language models (ELMO) containing 100 dimensions. The Siamese Manhattan long short-term memory (MaLSTM) neural network model, where Ma is Manhattan distance, is applied with ELMO word embedding to predict duplicate questions in the dataset. Experimental results demonstrate that the proposed model attained an accuracy of 95.68% which surpasses the state-of-the-art models.", "venue": "IEEE Access", "label": 2}, {"loc": [5.552665710449219, -1.42104971408844], "openalex_id": "https://openalex.org/W4401943115", "title": "Tasneef: A Fast and Effective Hybrid Representation Approach for Arabic Text Classification", "authors": "Maroua Louail, Chafia Kara-Mohamed Hamdi-Cherif, Aboubekeur Hamdi\u2010Cherif", "abstract": "The Arabic language role in actual global affairs entails sophisticated natural language processing techniques, especially in text classification. This paper presents Tasneef as a novel hybrid approach to tackle computational challenges by reducing memory usage and runtime overhead for actual Arabic text classification (ATC). Tasneef integrates distance-based meta-features (DBMFs) representation with word embeddings. This integration is useful because using a single text representation technique can be limiting in capturing the essential range of features necessary for effective classification, especially in complex languages like Arabic. By addressing the intricacies arising from the high dimensionality and sparsity inherent in Term Frequency-Inverse Document Frequency (TF-IDF) representation, the utilization of DBMFs is shown to offer a promising solution. The DBMFs rely on document labels and statistical features to establish meaningful distance relationships between documents, thereby facilitating effective reduction. Furthermore, word embeddings encapsulate semantic attributes. Empirical assessments reveal a significant reduction of two orders of magnitude in both memory usage and runtime. This reduction translates to memory savings ranging from 158x to 361x and runtime reductions from 120x to 524x across three popular datasets; maintaining comparable MicroF1 and MacroF1 values, while notably reducing learning time. Moreover, Tasneef outperforms ten state-of-the-art deep learning models and seven dimension reduction methods in accuracy, with enhancements ranging from 0.3% to 39.6%; and F-Measure, with improvements from 4.6% to 26.8%, across four additional datasets. These findings highlight Tasneef as a promising solution for diverse ATC applications in real-world scenarios, offering concise and rapid classification with reduced computational learning costs.", "venue": "IEEE Access", "label": 2}, {"loc": [3.283452272415161, 2.541809320449829], "openalex_id": "https://openalex.org/W4403267394", "title": "Semiotics of Machinic Co-Enunciation", "authors": "Enzo D\u2019Armenio, Adrien Deli\u00e8ge, Maria Giulia Dondero", "abstract": "In this paper, we propose a semiotic study on generative artificial intelligences, considering the work of Midjourney and DALL\u00b7E, that are computational devices capable of producing original images on the basis of the training they have received on large databases of visual, verbal and multimodal documents. The aim is to frame their functioning from a semiotic point of view and to describe the operations that can be performed during image composition, through the options available on the two platforms. To achieve this objective, the paper will be developed into three main parts. In the first part, we will provide a general contextualisation of the relationship between semiotics and artificial intelligence, in the broad sense. Starting from Pierluigi Basso Fossali\u2019s (2017) assumptions about a semiotic perspective understood as the study of the social organization of meaning, AIs reconfigure the thresholds between the four dimensions he defined (perception, enunciation, communication and transmission). By considering the transmission and perceptual dimension, we will define the phases of the database construction and AI model training as pertaining to an archival (distributed) perception. In the second part, we will deal with the dimension of enunciation. On the one hand, we will describe the functioning of the diffusion models guided by human prompts. On the other hand, we will show the limits and potential of these AIs, through a discussion of the operable commands and the results obtained, in light of the experiments we have carried out over the past months (August 2023-May 2024). In particular, we will test how generative AIs produce images on the basis of prompts containing the styles of specific artists, how they fuse different styles together, and how they work on visual stereotypes. In the third part of the paper, we will focus on the relationship between verbal description and visual generation in order to provide, in accordance with the perspective of the intersemiotic translation, a new research object and a new methodology.", "venue": "Signata", "label": 0}, {"loc": [6.1234612464904785, -0.7189511060714722], "openalex_id": "https://openalex.org/W4402670292", "title": "A multi-level multi-label text classification dataset of 19th century Ottoman and Russian literary and critical texts", "authors": "Gokcen Gokceoglu, Devrim \u00c7avu\u015fo\u011flu, Emre Akba\u015f, \u00d6zen Nergis Dolcerocca", "abstract": "This paper introduces a multi-level, multi-label text classification dataset comprising over 3000 documents. The dataset features literary and critical texts from 19th-century Ottoman Turkish and Russian. It is the first study to apply large language models (LLMs) to this dataset, sourced from prominent literary periodicals of the era. The texts have been meticulously organized and labeled. This was done according to a taxonomic framework that takes into account both their structural and semantic attributes. Articles are categorized and tagged with bibliometric metadata by human experts. We present baseline classification results using a classical bag-of-words (BoW) naive Bayes model and three modern LLMs: multilingual BERT, Falcon, and Llama-v2. We found that in certain cases, Bag of Words (BoW) outperforms Large Language Models (LLMs), emphasizing the need for additional research, especially in low-resource language settings. This dataset is expected to be a valuable resource for researchers in natural language processing and machine learning, especially for historical and low-resource languages. The dataset is publicly available.", "venue": "http://doi.org/10.18653/v1/2024.findings-acl.393", "label": 0}, {"loc": [5.142406940460205, -1.62342369556427], "openalex_id": "https://openalex.org/W4401691580", "title": "Large Language Models and Sentiment Analysis in Financial Markets: A Review, Datasets and Case Study", "authors": "Chenghao Liu, Arunkumar Arulappan, Ranesh Kumar Naha, Aniket Mahanti, Joarder Kamruzzaman, In-Ho Ra", "abstract": "This paper comprehensively examines Large Language Models (LLMs) in sentiment analysis, specifically focusing on financial markets and exploring the correlation between news sentiment and Bitcoin prices. We systematically categorize various LLMs used in financial sentiment analysis, highlighting their unique applications and features. We also investigate the methodologies for effective data collection and categorization, underscoring the need for diverse and comprehensive datasets. Our research features a case study investigating the correlation between news sentiment and Bitcoin prices, utilizing advanced sentiment analysis and financial analysis methods to demonstrate the practical application of LLMs. The findings reveal a modest but discernible correlation between news sentiment and Bitcoin price fluctuations, with historical news patterns showing a more substantial impact on Bitcoin's longer-term price than immediate news events. This highlights LLMs' potential in market trend prediction and informed investment decision-making. \u00a9 2013 IEEE.", "venue": "IEEE Access", "label": 2}, {"loc": [6.240983963012695, 5.820043087005615], "openalex_id": "https://openalex.org/W4396597673", "title": "DPHANet: Discriminative Parallel and Hierarchical Attention Network for Natural Language Video Localization", "authors": "Ruihan Chen, Junpeng Tan, Zhijing Yang, Xiaojun Yang, Qingyun Dai, Yongqiang Cheng, Liang Lin", "abstract": "Natural Language Video Localization (NLVL) has recently attracted much attention because of its practical significance. However, the existing methods still face the following challenges: 1) When the models learn intra-modal semantic association, the temporal causal interaction information and contextual semantic discriminative information are ignored, resulting in the lack of intra-modal semantic context connection; 2) When learning fusion representations, existing cross-modal interaction modules lack hierarchical attention function to extract intermodal similarity information and intra-modal self-correlation information, resulting in insufficient cross-modal information interaction; 3) When the loss function is optimized, the existing models ignore the correlation of causal inference between the start and end boundaries, resulting in inaccurate start and end boundary calibrations. To conquer the above challenges, we proposed a novel NLVL model, called Discriminative Parallel and Hierarchical Attention Network (DPHANet). Specifically, we emphasized the importance of temporal causal interaction information and contextual semantic discriminative information and correspondingly proposed a Discriminative Parallel Attention Encoder (DPAE) module to infer and encode the above critical information. Besides, to overcome the shortcomings of the existing cross-modal interaction modules, we designed a Video-Query Hierarchical Attention (VQHA) module, which can perform cross-modal interaction and intra-modal self-correlation modeling in a hierarchical manner. Furthermore, a novel deviation loss function was proposed to capture the correlation of causal inference between the start and end boundaries and force the model to focus on the continuity and temporal causality in the video. Finally, extensive experiments on three benchmark datasets demonstrated the superiority of our proposed DPHANet model, which has achieved about 1.5% and 3.5% average performance improvement and about 2.5% and 7.5% maximum performance improvement on the Charades-STA and TACoS datasets respectively.", "venue": "IEEE Transactions on Multimedia", "label": 0}, {"loc": [5.540896415710449, 5.4384765625], "openalex_id": "https://openalex.org/W4404784425", "title": "SEACrowd: A Multilingual Multimodal Data Hub and Benchmark Suite for Southeast Asian Languages", "authors": "Holy Lovenia, Rahmad Mahendra, Salsabil Maulana Akbar, Lester James V. Miranda, Jennifer Santoso, Elyanah Aco, Akhdan Fadhilah, Jonibek Mansurov, Joseph Marvin Imperial, Onno P. Kampman, Joel Ruben Antony Moniz, Muhammad Ravi Shulthan Habibi, Frederikus Hudi, Jann Railey Montalan, Ryan Hadiwijaya, Joanito Agili Lopo, William Nixon, B\u00f6rje F. Karlsson, James Jaya, Ryandito Diandaru, Yuze Gao, Patrick Amadeus Irawan, B.-H Wang, Jan Christian Blaise Cruz, Chenxi Whitehouse, Ivan Halim Parmonangan, Maria Khelli, Wenyu Zhang, Lucky Susanto, Reynard Adha Ryanda, Sonny Lazuardi Hermawan, Dan John Velasco, Muhammad Dehan Al Kautsar, Willy Fitra Hendria, Yasmin Moslem, Noah R. Flynn, Muhammad Farid Adilazuarda, Haochen Li, Johanes Lee, R. Damanhuri, Shuo Sun, Muhammad Reza Qorib, Amirbek Djanibekov, Wei Qi Leong, V. Quyet, Niklas Muennighoff, Tanrada Pansuwan, Ilham Firdausi Putra, Yan Xu, Tai Ngee Chia, Ayu Purwarianti, Sebastian Ruder, William Chandra Tjhi, Peerat Limkonchotiwat, Alham Fikri Aji, Sedrick Scott Keh, Genta Indra Winata, Ruochen Zhang, Fajri Koto, Zheng Yong, Samuel Cahyawijaya", "abstract": "Southeast Asia (SEA) is a region characterized by rich linguistic diversity and cultural variety, with over 1,300 indigenous languages and a population of 671 million people. However, the performance of contemporary AI models for SEA languages is compromised by a significant lack of representation of texts, images, and auditory datasets from SEA. Evaluating models for SEA languages is challenging due to the scarcity of high-quality datasets, compounded by the predominance of English training data, which raises concerns regarding potential cultural misrepresentation. To address these challenges, we introduce SEACrowd, a collaborative initiative that consolidates a comprehensive resource hub to bridge the resource gap by providing standardized corpora and benchmarks in nearly 1,000 SEA languages across three modalities. We assess the performance of AI models on 36 indigenous languages across 13 tasks included in SEACrowd, offering valuable insights into the current AI landscape in SEA. Furthermore, we propose strategies to facilitate greater AI advancements, maximizing potential utility and resource equity for the future of AI in Southeast Asia. \u00a9 2024 Association for Computational Linguistics.", "venue": "https://doi.org/10.18653/v1/2024.emnlp-main.296", "label": 0}, {"loc": [4.709132194519043, 1.7848750352859497], "openalex_id": "https://openalex.org/W4407792530", "title": "Exploring the potential and limitations of large language models as virtual respondents for social science research", "authors": "Zs\u00f3fia Rakovics, M\u00e1rton Rakovics", "abstract": "Social and linguistic differences encoded in various textual content available on the internet represent certain features of modern societies. For any scientific research which is interested in social differences mediated by language, the advent of large language models (LLMs) has brought new opportunities. LLMs could be used to extract information about different groups of society and utilized as data providers by acting as virtual respondents generating answers as such. Using LLMs (GPT-variants, Llama2, and Mixtral), we generated virtual answers for politics and democracy related attitude questions of the European Social Survey (10th wave) and statistically compared the results of the simulated responses to the real ones. We explored different prompting techniques and the effect of different types and richness of contextual information provided to the models. Our results suggest that the tested LLMs generate highly realistic answers and are good at invoking the needed patterns from limited contextual information given to them if a couple of relevant examples are provided, but struggle in a zero-shot setting. A critical methodological analysis is inevitable when considering the potential use of data generated by LLMs for scientific research, the exploration of known biases and reflection on social reality not represented on the internet are essential.", "venue": "Intersections", "label": 0}, {"loc": [7.044475555419922, -1.0698386430740356], "openalex_id": "https://openalex.org/W4403295128", "title": "Developing a Thai Grammatical Error Correction Tool for Deaf Students", "authors": "Supachan Traitruengsakul, Ekapol Chuangsuwanich", "abstract": "Deaf students face challenges in written communication due to errors such as insertion, deletion, word order issues, misusage, and misspellings. Grammatical error correction (GEC) technology can help mitigate these issues. However, existing GEC models are primarily trained on online resources from second-language learners who are hearing. In contrast, sentences written by deaf students exhibit a variety of errors not typically found elsewhere. To address this, we aimed to create the Thai Deaf Corpus (TDC) from deaf students in grades 7–12 across four deaf schools. Our analysis of the TDC revealed that deaf students wrote short sentences, averaging six words each, used 4,585 unique words, and predominantly produced ungrammatical sentences. In addition, we introduce a two-stage system (Thai-GEC model) to automatically detect and correct incorrect words in ungrammatical sentences. In our experiments, we compared different detection and correction models on the dataset. As a result, off-the-shelf models perform poorly compared to models specifically created using our corpus, showing the usefulness of our dataset. The TDC is available at https://github.com/Supachan/ThaiDeafCorpus.git.", "venue": "IEEE Access", "label": 2}, {"loc": [5.2316575050354, -1.5799157619476318], "openalex_id": "https://openalex.org/W4404192572", "title": "Multilingual Sarcasm Detection for Enhancing Sentiment Analysis using Deep Learning Algorithms", "authors": "Abdelrahman Gamal Yacoub, Amal Elsayed Aboutabl, Salwa O. Slim", "abstract": "Recent years have seen a notable rise in online opinion-sharing, underscoring the demand for automated sentiment analysis tools. Addressing sarcasm in text is crucial, as it can significantly influence the effectiveness of sentiment analysis models. This research explores how sentiment analysis (SA) and sarcasm detection (SD) intersect, highlighting challenges in identifying how sarcasm influences sentiment polarity. Sarcasm, a type of irony, poses computational difficulties due to the lack of nonverbal cues in written texts. Users often express opinions in their preferred languages, underscoring the need for sentiment analysis tools that can adeptly handle sentiment and sarcasm across diverse languages. We propose the incorporation of sarcasm features into the architecture of sentiment analysis models, employing classifiers and embeddings, including BILSTM or LSTM alongside word embedding techniques such as Word2vec, FastText, Glove, and Bert. We conducted experiments using the ArSarcasm-v2 Dataset for Arabic, the IMDB Movie dataset and IsarcasmEval dataset for English, and the SentiMixArEn dataset for code-mixed language scenarios. The results demonstrated consistent accuracy enhancements ranging from 2% to over 10%, highlighting the positive impact of incorporating sarcasmrelated information. Additionally, the Bi-LSTM model with GloVe embeddings achieved higher accuracy across all scenarios compared to other methods.", "venue": "Journal of Communications Software and Systems", "label": 0}, {"loc": [2.5968194007873535, 1.5190132856369019], "openalex_id": "https://openalex.org/W4398769297", "title": "Treading water: new data on the impact of AI ethics information sessions in classics and ancient language pedagogy", "authors": "Edward A. S. Ross, Jackie Baines", "abstract": "Abstract Over 2023, many universities and policy organisations in the higher education (HE) sector are working to create guiding principles and guidelines for the use of generative artificial intelligence (AI) in HE Teaching and Learning (T&L). Despite these guidelines, students remain unsure if and how they should use AI. This article discusses the AI information sessions held over the Autumn 2023 term in the Department of Classics at the University of Reading, which aimed to provide students with the knowledge and tools to make informed judgements about using AI in their studies. These sessions discussed the benefits and drawbacks of generative AI, highlighting training data, content policy, environmental impact, and examples of potential uses. Staff and student participants were surveyed before and after these information sessions to gather their opinions surrounding AI use. Although at least 60% of participants had previously used generative AI, 80% of participants were apprehensive of or against using generative AI tools for learning purposes following the AI information sessions. By providing staff and students with the ethical considerations surrounding generative AI, they can make an informed judgement about using AI in their work without misplaced faith or excessive fear.", "venue": "\u0098The \u009cjournal of classics teaching", "label": 0}, {"loc": [2.040131092071533, 5.3358635902404785], "openalex_id": "https://openalex.org/W4404035212", "title": "Multi-modal Comparative Analysis on Execution of Phishing Detection using Artificial Intelligence", "authors": "Divya Jennifer Dsouza, Anisha P Rodrigues, Roshan Fernandes", "abstract": "Phishing is the process of deceiving or stealing private or confidential information through illicit means. This could lead to financial loss, loss of reputation, and identity theft. Hence, identifying and preventing the use of such phishing sites becomes crucial. In data science, the term outlier, also termed an anomaly refers to points or series of data that opt out of the normal behaviour of the system under study. Anomaly detection touches down on the concepts related to studying the authentic outlier in a data set. The paper aims to present the optimised techniques and multiple modes of executing the process for detecting phishing websites. The most relevant features are chosen for execution by applying feature extraction. The Mendley phishing websites dataset is used to detect phishing websites, along with the SPAM-HAM publicly available dataset, which is used for detecting SPAM/HAM classification for SMS data in this research study. The experiments are also carried out on a custom dataset to avoid any bias present in a publicly available dataset. The study is carried out in three modes, namely offline, batch, and incremental, using machine learning models. The performance evaluation metrics such as accuracy, f1 score, precision, recall, and time complexity of the machine learning models and accuracy and loss metrics of the deep learning models are compared between the different modes. The study is then summarised by detailing the pros and cons of each of the modes and models used for the study. The incremental mode of execution suits better for real-time processing, with an accuracy of 97.1% on the custom dataset using the adaptive random forest (ARF) classifier available in the Python’s River Framework. But if we make use of the deep learning approach with Keras sequential model, the accuracy obtained was 99.28%.", "venue": "IEEE Access", "label": 2}, {"loc": [4.065991401672363, 0.3976452648639679], "openalex_id": "https://openalex.org/W4399377978", "title": "Materials science in the era of large language models: a perspective", "authors": "Ge Lei, R. Docherty, Samuel J. Cooper", "abstract": "This perspective paper explores the potential of Large Language Models (LLMs) in materials science, highlighting their abilities to handle ambiguous tasks, automate processes, and extract knowledge at scale across various disciplines.", "venue": "Digital Discovery", "label": 0}, {"loc": [3.21317982673645, 1.291650414466858], "openalex_id": "https://openalex.org/W4400604947", "title": "It is Time to Develop an Auditing Framework to Promote Value Aware Chatbots", "authors": "Yanchen Wang, Lisa Singh", "abstract": "The launch of ChatGPT in November 2022 marked the beginning of a new era in\\nAI, the availability of generative AI tools for everyone to use. ChatGPT and\\nother similar chatbots boast a wide range of capabilities from answering\\nstudent homework questions to creating music and art. Given the large amounts\\nof human data chatbots are built on, it is inevitable that they will inherit\\nhuman errors and biases. These biases have the potential to inflict significant\\nharm or increase inequity on different subpopulations. Because chatbots do not\\nhave an inherent understanding of societal values, they may create new content\\nthat is contrary to established norms. Examples of concerning generated content\\nincludes child pornography, inaccurate facts, and discriminatory posts. In this\\nposition paper, we argue that the speed of advancement of this technology\\nrequires us, as computer and data scientists, to mobilize and develop a\\nvalues-based auditing framework containing a community established standard set\\nof measurements to monitor the health of different chatbots and LLMs. To\\nsupport our argument, we use a simple audit template to share the results of\\nbasic audits we conduct that are focused on measuring potential bias in search\\nengine style tasks, code generation, and story generation. We identify\\nresponses from GPT 3.5 and GPT 4 that are both consistent and not consistent\\nwith values derived from existing law. While the findings come as no surprise,\\nthey do underscore the urgency of developing a robust auditing framework for\\nopenly sharing results in a consistent way so that mitigation strategies can be\\ndeveloped by the academic community, government agencies, and companies when\\nour values are not being adhered to. We conclude this paper with\\nrecommendations for value-based strategies for improving the technologies.\\n", "venue": "https://doi.org/10.5220/0012806800003756", "label": 0}, {"loc": [5.002871990203857, 1.7630525827407837], "openalex_id": "https://openalex.org/W4402671004", "title": "Do Multilingual Large Language Models Mitigate Stereotype Bias?", "authors": "Shangrui Nie, Michael Fromm, Charles Welch, Rebekka G\u00f6rge, Akbar Karimi, Joan Plepi, Nazia Afsan Mowmita, Nicolas Flores-Herr, Mehdi Ali, Lucie Flek", "abstract": "65", "venue": "https://doi.org/10.18653/v1/2024.c3nlp-1.6", "label": 0}, {"loc": [9.562373161315918, 0.9867432117462158], "openalex_id": "https://openalex.org/W4396573430", "title": "Scoping: Towards Streamlined Entity Collections for Multi-Sourced Entity Resolution with Self-Supervised Agents", "authors": "Leonard Traeger, Andreas Behrend, George Karabatis", "abstract": "Linking multiple entities to a real-world object is a time-consuming and error-prone task. Entity Resolution (ER) includes techniques for vectorizing entities (signature), grouping similar entities into partitions (blocking), and matching entity pairs based on specified similarity thresholds (filtering). This paper introduces scoping as a new and integral phase in multi-sourced ER with potentially increased heterogeneity and more unlinkable entities. Scoping reduces the space of candidate entity pairs by ranking, detecting, and removing unlinkable entities through outlier algorithms and reusable self-supervised autoencoders, leaving intact the set of true linkages. Evaluations on multi-sourced schemas show that autoencoders perform best in schemas relevant to each other, where they reduce entity collections to 77% and still contain all linkages.", "venue": "https://doi.org/10.5220/0012607500003690", "label": 0}, {"loc": [4.618755340576172, -1.552141785621643], "openalex_id": "https://openalex.org/W4393240371", "title": "Sentiment analysis on the issue of COVID-19 vaccination using LSTM and fasttext", "authors": "Wien Nurul Dewani, Yufis Azhar, Nur Hayatin", "abstract": "The rapid spread of covid-19 worldwide made WHO decide covid-19 was a global pandemic. This spread impacts the Economy, Education, Social, and others. In response to this, the Indonesian government took action to procure covid-19 vaccination. This vaccination action is one way to reduce the spread of the covid-19 virus. The procurement of vaccinations in Indonesia has caused responses and opinions on various social media, one of which is Twitter. Many Indonesians give their opinion or opinion on Twitter about the procurement of vaccinations. Sentiment analysis is a way of analyzing how the views of the Indonesian people about vaccination. This study aims to create a classification model to determine the sentiment analysis of the Indonesian people regarding the procurement of vaccination using the Long Short-Term Memory (LSTM) method. Several studies are discussing sentiment analysis using the LSTM method, but in this research, fasttext embedding will be used, combined with selecting the proper optimization function and learning rate. The number of datasets resulting from crawling from Twitter is 1268. The best results from all LSTM model tests were obtained using fasttext embedding with SGD learning rate schedule, namely 85% accuracy and 85% f1-score. The addition of fasttext embedding can increase the model's effectiveness by 21%.", "venue": "AIP conference proceedings", "label": 0}, {"loc": [4.064674377441406, -2.406848907470703], "openalex_id": "https://openalex.org/W4394866674", "title": "Reliability Estimation of News Media Sources: Birds of a Feather Flock Together", "authors": "Sergio Burdisso, Dairazalia S\u00e1nchez-Cort\u00e9s, Esa\u00fa Villatoro-Tello, Petr Motl\u00ed\u010dek", "abstract": "Evaluating the reliability of news sources is a routine task for journalists\\nand organizations committed to acquiring and disseminating accurate\\ninformation. Recent research has shown that predicting sources' reliability\\nrepresents an important first-prior step in addressing additional challenges\\nsuch as fake news detection and fact-checking. In this paper, we introduce a\\nnovel approach for source reliability estimation that leverages reinforcement\\nlearning strategies for estimating the reliability degree of news sources.\\nContrary to previous research, our proposed approach models the problem as the\\nestimation of a reliability degree, and not a reliability label, based on how\\nall the news media sources interact with each other on the Web. We validated\\nthe effectiveness of our method on a news media reliability dataset that is an\\norder of magnitude larger than comparable existing datasets. Results show that\\nthe estimated reliability degrees strongly correlates with journalists-provided\\nscores (Spearman=0.80) and can effectively predict reliability labels\\n(macro-avg. F$_1$ score=81.05). We release our implementation and dataset,\\naiming to provide a valuable resource for the NLP community working on\\ninformation verification.\\n", "venue": "http://doi.org/10.18653/v1/2024.naacl-long.383", "label": 0}, {"loc": [6.421607971191406, 0.30807143449783325], "openalex_id": "https://openalex.org/W4403043214", "title": "Tackling the Problem of Multilingualism in Voice Assistants", "authors": "Soham Sabharwal, Rohan Sahni", "abstract": "Voice assistants like Alexa and Siri have become increasingly advanced due to improvements in AI and language processing models like GPT and Gemini. However, these systems often perform poorly with less commonly spoken languages, such as many Indian languages, creating a significant accessibility gap. This paper addresses the problem of multilingualism in voice assistants, with a focus on languages like Hindi, Punjabi, and Bengali. We examine the evolution of voice assistants and highlight the major technical challenges they face, including speech recognition, language processing, and response generation in low-resource languages. To overcome these barriers, we propose a novel framework that combines different AI models to enhance multilingual support. Our approach offers a potential solution to make voice assistants more inclusive and accessible for speakers of underrepresented languages. By broadening language support, this research has the potential to extend the benefits of AI to a much wider audience.", "venue": "International Journal of Electrical Electronics and Computers", "label": 0}, {"loc": [6.950084686279297, -0.07440811395645142], "openalex_id": "https://openalex.org/W4401042846", "title": "T\u00fcbingen-CL at SemEval-2024 Task 1: Ensemble Learning for Semantic Relatedness Estimation", "authors": "Leixin Zhang, \u00c7a\u011fr\u0131 \u00c7\u00f6ltekin", "abstract": "The paper introduces our system for SemEval-2024 Task 1, which aims to predict the relatedness of sentence pairs. Operating under the hypothesis that semantic relatedness is a broader concept that extends beyond mere similarity of sentences, our approach seeks to identify useful features for relatedness estimation. We employ an ensemble approach integrating various systems, including statistical textual features and outputs of deep learning models to predict relatedness scores. The findings suggest that semantic relatedness can be inferred from various sources and ensemble models outperform many individual systems in estimating semantic relatedness.", "venue": "http://doi.org/10.18653/v1/2024.semeval-1.147", "label": 0}, {"loc": [2.067511558532715, 5.315687656402588], "openalex_id": "https://openalex.org/W4390757348", "title": "Sentence Level Analysis Model for Phishing Detection Using KNN", "authors": "Joyce Gikandi, John Wachira Kamau, David Njuguna, Lindah Sawe", "abstract": "Phishing emails have experienced a rapid surge in cyber threats globally, especially following the emergence of the COVID-19 pandemic. This form of attack has led to substantial financial losses for numerous organizations. Although various models have been constructed to differentiate legitimate emails from phishing attempts, attackers continuously employ novel strategies to manipulate their targets into falling victim to their schemes. This form of attack has led to substantial financial losses for numerous organizations. While efforts are ongoing to create phishing detection models, their current level of accuracy and speed in identifying phishing emails is less than satisfactory. Additionally, there has been a concerning rise in the frequency of phished emails recently. Consequently, there is a pressing need for more efficient and high-performing phishing detection models to mitigate the adverse impact of such fraudulent messages. In the context of this research, a comprehensive analysis is conducted on both components of an email message\u2014namely, the email header and body. Sentence-level characteristics are extracted and leveraged in the construction of a new phishing detection model. This model utilizes K Nearest Neighbor (KNN) introducing the novel dimension of sentence-level analysis. Established datasets from Kaggle were employed to train and validate the model. The evaluation of this model\u2019s effectiveness relies on key performance metrics including accuracy of 0.97, precision, recall, and F1-measure.", "venue": "Journal of Cyber Security", "label": 0}, {"loc": [6.21793794631958, 5.732180118560791], "openalex_id": "https://openalex.org/W4396622490", "title": "Referring expression segmentation: from conventional to generalized", "authors": "Chang Liu", "abstract": "In recent years, many remarkable achievements have been made in the field of deep machine learning in various data modalities, such as image processing and natural language comprehension. Based on the good performance of deep neural networks in single modalities, multi-modal tasks, which integrate data from different modal domains, are becoming emerging research topics. Among the complex integrated tasks, one particularly challenging and important task is Referring Expression Segmentation (RES), which aims to generate a segmentation mask for a target object in a given image as described by a given natural language query expression, involving both computer vision and natural language processing. This thesis addresses the problem of RES from multiple angles to investigate the topic of this complex multi-modal task. \\nFirstly, we propose an efficient, instance-specific framework that optimizes the traditional CNN-RNN pipeline. Traditional RES methods usually either use an FCN-like network that directly generates the segmentation mask from the image or first extract all instances using a standalone network and then select the target from targets. We combine the strengths of both kinds of methods and propose a novel framework that can analyze the relationship among instances while maintaining the efficiency of the FCN-like network. \\nSecondly, we employ an attention-based network to model long-range dependencies in both image and language modalities. In CNN networks, the large receptive field is achieved by stacking multiple small-kernel convolutional layers, which is indirect and lacks efficiency when exchanging long-distance features. From this point, we utilize the Transformer-based network that can model long-range dependencies in a more efficient way. Next, based on this work, we find that the generic attention mechanism used in the classic Transformer is designed for processing single-modal data. We further enhance the mechanism of generic attention with feature-fusing capabilities, achieving denser feature fusion. \\nLastly, to accommodate multi-object and no-object expressions, we introduce a novel task called Generalized Referring Expression Segmentation (GRES). To facilitate research in this field, we also construct a large-scale dataset for GRES and design a baseline method, namely ReLA. The proposed method implicitly divides the image into regions and explicitly analyzes the relationship among them, achieving state-of-the-art performance on both RES and GRES datasets. \\nOur proposed approach advances the state-of-the-art in referring segmentation, and further generalizes the conventional RES to Generalized RES, providing new insights, methods and topics for further research in this field.", "venue": "https://doi.org/10.32657/10356/175477", "label": 0}, {"loc": [3.835465908050537, -3.8656506538391113], "openalex_id": "https://openalex.org/W4400944325", "title": "Multilingual Detection of Cyberbullying in Mixed Urdu, Roman Urdu, and English Social Media Conversations", "authors": "F. D. Razi, Naveed Ejaz", "abstract": "Automatic cyberbullying detection in social media is increasingly vital due to the integral role of social networks in people’s lives and the severe impact of cyberbullying. Cyberbullying involves intentional, repetitive, aggressive behaviour to harm others online. Among Urdu-speaking communities worldwide, it is common to use Urdu, Roman Urdu, and English in social media conversations. Existing research and detection methods overlook these linguistic dynamics and fail to address cyberbullying across these languages comprehensively. Additionally, there is no dataset in Urdu and Roman Urdu covering the repetition and intent to harm components of cyberbullying. This research addresses this gap by developing and annotating a comprehensive dataset capturing linguistic variations in cyberbullying instances across Urdu, Roman Urdu, and English, incorporating all aspects of cyberbullying. Besides proposing a dataset, a framework for detecting cyberbullying has been proposed. The framework classifies text messages as aggressive or non-aggressive and introduces novel quantitative measures for repetition and the level of intent to cause harm. The proposed framework classifies cyberbullying by applying thresholds to measures of aggression, repetition, and intent to harm, integrating all three aspects. Results show aggression detection using fine-tuned m-BERT and MuRIL, incorporating measures of repetition and intent to harm on the proposed dataset. Additionally, experiments are conducted to demonstrate the impact of repetition and intent to harm on cyberbullying classification. The best results on the dataset are achieved using fine-tuned MuRIL with a precision of 0.93, recall of 0.92, and an F-measure of 0.92 by incorporating quantitative measures of repetition and intent to harm.", "venue": "IEEE Access", "label": 2}, {"loc": [7.210423469543457, 0.7246674299240112], "openalex_id": "https://openalex.org/W4404347192", "title": "GlotCC: An Open Broad-Coverage CommonCrawl Corpus and Pipeline for Minority Languages", "authors": "Amir Hossein Kargaran, Fran\u00e7ois Yvon, Hinrich Sch\u00fctze", "abstract": "The need for large text corpora has increased with the advent of pretrained language models and, in particular, the discovery of scaling laws for these models. Most available corpora have sufficient data only for languages with large dominant communities. However, there is no corpus available that (i) covers a wide range of minority languages; (ii) is generated by an open-source reproducible pipeline; and (iii) is rigorously cleaned from noise, making it trustworthy to use. We present GlotCC, a clean, document-level, 2TB general domain corpus derived from CommonCrawl, covering more than 1000 languages. We make GlotCC and the system used to generate it - including the pipeline, language identification model, and filters - available to the research community. Corpus v. 1.0 https://huggingface.co/datasets/cis-lmu/GlotCC-v1, Pipeline v. 3.0 https://github.com/cisnlp/GlotCC.", "venue": "https://doi.org/10.52202/079017-0540", "label": 0}, {"loc": [6.0124831199646, -0.7134892344474792], "openalex_id": "https://openalex.org/W4400695254", "title": "Extending the Comparative Argumentative Machine: Multilingualism and Stance Detection", "authors": "Irina Nikishin\u0430, Alexander Bondarenko, Sebastian Zaczek, O. Haag, Matthias Hagen, Chris Biemann", "abstract": "Abstract The comparative argumentative machine CAM can retrieve arguments that answer comparative questions\u2014questions that ask which of several to-be-compared options should be favored in some scenario. In this paper, we describe how we equipped CAM with a better answer stance detection (i.e., a better detection of which option \u201cwins\u201d a comparison) and with system variants to support non-English requests. As for the improved answer stance detection, we develop RoBERTa-based approaches and experimentally show them to be more effective than previous feature-based and LLM-based stance detectors. As for the multilingualism, in a proof of concept, we compare two approaches to support Russian requests and answers: (1) translating the original English CAM data and (2) using an existing replica of CAM on native Russian data. Comparing the translation-based and the replica-based CAM variants in a user study shows that combining their answers seems to be the most promising. For individual questions, the retrieved arguments of the two variants are often different and of quite diverse relevance and quality. As a demonstrator, we deploy a first multilingual CAM version that combines translation-based and replica-based outputs for English and Russian and that can easily be extended to further languages.", "venue": "Lecture notes in computer science", "label": 0}, {"loc": [5.1730475425720215, -1.6511707305908203], "openalex_id": "https://openalex.org/W4399250485", "title": "Enhancing Sentiment Analysis on Social Media Data with Advanced Deep Learning Techniques.", "authors": "Huu-Hoa Nguyen", "abstract": "This paper introduces a comprehensive methodology for conducting sentiment analysis on social media using advanced deep learning techniques to address the unique challenges of this domain. As digital platforms play an increasingly pivotal role in shaping public discourse, the demand for real-time sentiment analysis has expanded across various sectors, including policymaking, brand monitoring, and personalized services. Our study details a robust framework that encompasses every phase of the deep learning process, from data collection and preprocessing to feature extraction and model optimization. We implement sophisticated data preprocessing techniques to improve data quality and adopt innovative feature extraction methods such as TF-IDF, Word2Vec, and GloVe. Our approach integrates several advanced deep learning configurations, including variants of BiLSTMs, and employs tools like Scikit-learn and Gensim for efficient hyperparameter tuning and model optimization. Through meticulous optimization with GridSearchCV, we enhance the robustness and generalizability of our models. We conduct extensive experimental analysis to evaluate these models against multiple configurations using standard metrics to identify the most effective techniques. Additionally, we benchmark our methods against prior studies, and our findings demonstrate that our proposed approaches outperform comparative techniques. These results provide valuable insights for implementing deep learning in sentiment analysis and contribute to setting benchmarks in the field, thus advancing both the theoretical and practical applications of sentiment analysis in real-world scenarios.", "venue": "International Journal of Advanced Computer Science and Applications", "label": 35}, {"loc": [6.190018653869629, 0.3643772602081299], "openalex_id": "https://openalex.org/W4390675039", "title": "EnhancedBERT: A Feature-rich Ensemble Model for Arabic Word Sense Disambiguation with Statistical Analysis and Optimized Data Collection", "authors": "Sanaa Kaddoura, Reem Nassar", "abstract": "Accurate assignment of meaning to a word based on its context, known as Word Sense Disambiguation (WSD), remains challenging across languages. Extensive research aims to develop automated methods for determining word senses in different contexts. However, the literature lacks the presence of datasets generated for the Arabic language WSD. This paper presents a dataset comprising a hundred polysemous Arabic words. Each word in the dataset encompasses 3\u20138 distinct senses, with ten example sentences per sense. Some statistical operations are conducted to gain insights into the dataset, enlightening its characteristics and properties. Subsequently, a novel WSD approach is proposed to utilize similarity measures and find the overlap between contextual information and dictionary definitions. The proposed method uses the power of BERT, a pre-trained language model, to enable effective Arabic word disambiguation. In training, new features are integrated to improve the model's ability to differentiate between various senses of words. The proposed BERT models are combined to compose an ensemble model architecture to improve the classification performances. The performance of the WSD system outperforms state-of-the-art systems, achieving an approximate F1-score of 96 %. Statistical analyses are performed to evaluate the overall performance of the WSD approach by providing additional information on model predictions. A case study was implemented to test the effectiveness of WSD in sentiment analysis, a downstream task.", "venue": "Journal of King Saud University - Computer and Information Sciences", "label": 32}, {"loc": [7.935910701751709, 3.4246909618377686], "openalex_id": "https://openalex.org/W4400703021", "title": "Hydra: Bidirectional State Space Models Through Generalized Matrix Mixers", "authors": "Sukjun Hwang, Aakash Lahoti, Ratish Puduppully, Tri Dao, Albert G. Gu", "abstract": "A wide array of sequence models are built on a framework modeled after Transformers, comprising alternating sequence mixer and channel mixer layers. This paper studies a unifying matrix mixer view of sequence mixers that can be conceptualized as a linear map on the input sequence. This framework encompasses a broad range of well-known sequence models, including the self-attention of Transformers as well as recent strong alternatives such as structured state space models (SSMs), and allows understanding downstream characteristics such as efficiency and expressivity through properties of their structured matrix class. We identify a key axis of matrix parameterizations termed sequence alignment, which increases the flexibility and performance of matrix mixers, providing insights into the strong performance of Transformers and recent SSMs such as Mamba. Furthermore, the matrix mixer framework offers a systematic approach to developing sequence mixers with desired properties, allowing us to develop several new sub-quadratic sequence models. In particular, we propose a natural bidirectional extension of the Mamba model (Hydra), parameterized as a quasiseparable matrix mixer, which demonstrates superior performance over other sequence models including Transformers on non-causal tasks. As a drop-in replacement for attention layers, Hydra outperforms BERT by 0.8 points on the GLUE benchmark and ViT by 2% Top-1 accuracy on ImageNet.", "venue": "https://doi.org/10.52202/079017-3520", "label": 0}, {"loc": [7.302692413330078, 1.8692622184753418], "openalex_id": "https://openalex.org/W4401421514", "title": "MindLLM: Lightweight large language model pre-training, evaluation and domain application", "authors": "Yizhe Yang, Huashan Sun, Jiawei Li, Runheng Liu, Yinghao Li, Yuhang Liu, Yang Gao, Heyan Huang", "abstract": "Large Language Models (LLMs) have demonstrated remarkable performance across various natural language tasks, marking significant strides towards general artificial intelligence. While general artificial intelligence is leveraged by developing increasingly large-scale models, there could be another branch to develop lightweight custom models that better serve certain domains, taking into account the high cost of training and deploying LLMs and the scarcity of resources. In this paper, we present MindLLM, a novel series of bilingual lightweight large language models, trained from scratch, alleviating such burdens by offering models with 1.3 billion and 3 billion parameters. A thorough account of experiences accrued during large model development is given, covering every step of the process, including data construction, model architecture, evaluation, and applications. Such insights are hopefully valuable for fellow academics and developers. MindLLM consistently matches or surpasses the performance of other open-source larger models on some public benchmarks. We also introduce an innovative instruction tuning framework tailored for smaller models to enhance their capabilities efficiently. Moreover, we explore the application of MindLLM in specific vertical domains such as law and finance, underscoring the agility and adaptability of our lightweight models.", "venue": "AI Open", "label": 0}, {"loc": [5.382054805755615, -1.3816877603530884], "openalex_id": "https://openalex.org/W4395470979", "title": "Detection of Sarcasm in Urdu Tweets using Deep Learning and Transformer based Hybrid Approaches", "authors": "Muhammad Ehtisham Hassan, Masroor Hussain, Iffat Maab, Usman Habib, Muhammad Attique Khan, Anum Masood", "abstract": "Sarcasm has a significant role in human communication especially on social media platforms where users express their sentiments through humor, satire, and criticism. The identification of sarcasm is crucial in comprehending the sentiment and the communication context on platforms like Twitter. This ambiguous nature of the expression of content presents the detection of sarcasm as a considerable challenge in natural language processing (NLP). The importance and challenges increase further, especially in languages like Urdu where resources for NLP are limited. The traditional rule-based approaches lack the desired performance because of the subtle and context-based nature of sarcasm. However, the recent advancements in NLP, particularly the transformer architecture-based large language models (LLMs) like BERT offer promising solutions. In this research, we have utilized a newly created Urdu sarcasm dataset comprising 12,910 tweets manually re-annotated into sarcastic and non-sarcastic classes. These tweets were derived from the public Urdu tweet dataset consisting of 19,995 tweets. We have established baseline results using deep learning classifiers comprising CNN, LSTM, GRU, BiLSTM, and CNN-LSTM. To comprehensively capture the contextual information, we propose a novel hybrid model architecture that integrates multilingual BERT (mBERT) embeddings with BiLSTM and multi-head attention (MHA) for Urdu sarcasm. The proposed mBERT-BiLSTM-MHA model demonstrates superior performance by achieving an accuracy of 79.51% and an F1 score of 80.04%, outperforming deep learning classifiers trained with fastText word embeddings.", "venue": "IEEE Access", "label": 2}, {"loc": [4.870468616485596, 0.2742297053337097], "openalex_id": "https://openalex.org/W4404787999", "title": "Speaker Intimacy Estimation in Chat-Talks Based on Verbal and Non-Verbal Information", "authors": "Yuya Chiba, Akinori Ito", "abstract": "Conversations based on mutual intimacy are critical for maintaining positive relationships. A detailed understanding of speaker relationships in dialogues enhances various applications, such as information recommendation systems. Such systems, when interacting with multiple users, can provide more tailored information by understanding the users’ relationships. Furthermore, dialogue systems, which are becoming increasingly prevalent in society, can foster long-term user engagement by recognizing and responding to the intimacy levels of users. This study explores a method for estimating the intimacy levels of speakers and dialogue partners in conversational exchanges. Our approach utilizes a multimodal corpus of natural conversations with 71 Japanese participants, complete with metadata indicating each speaker’s perceived intimacy level. We identified key features for estimating intimacy by analyzing the statistical parameters of these features. Our comprehensive analysis encompassed both verbal and non-verbal information, including prosody, gestures, and facial expressions. The proposed intimacy estimation model combines multimodal features using a multi-stream Bi-directional Long Short-Term Memory (BLSTM) network and grasps the contextual information of conversations with a Context BLSTM. Our model’s effectiveness is demonstrated through comparisons with several baseline models. Experimental results show that our proposed model significantly improves the overall performance compared with other models. Although the RoBERTa-based method (the best baseline model) achieved an F1 score of 0.571, our method had an F1 score of 0.594. In particular, an ablation study shows that combining verbal and non-verbal features is useful for intimacy estimation. The performance was further improved by extending the dialogue context, showing that the proposed model can estimate three levels of intimacy with an F1 score of 0.666 by observing eight utterance exchanges.", "venue": "IEEE Access", "label": 2}, {"loc": [3.748505115509033, -3.9226925373077393], "openalex_id": "https://openalex.org/W4398152371", "title": "Comparing Fine-Tuning, Zero and Few-Shot Strategies with Large Language Models in Hate Speech Detection in English", "authors": "Ronghao Pan, Jos\u00e9 Antonio Garc\u00ed\u00ada-D\u00ed\u00adaz, Rafael Valencia\u2010Garc\u00eda", "abstract": "Large Language Models (LLMs) are increasingly demonstrating their ability to understand natural language and solve complex tasks, especially through text generation.One of the relevant capabilities is contextual learning, which involves the ability to receive instructions in natural language or task demonstrations to generate expected outputs for test instances without the need for additional training or gradient updates.In recent years, the popularity of social networking has provided a medium through which some users can engage in offensive and harmful online behavior.In this study, we investigate the ability of different LLMs, ranging from zero-shot and few-shot learning to fine-tuning.Our experiments show that LLMs can identify sexist and hateful online texts using zero-shot and few-shot approaches through information retrieval.Furthermore, it is found that the encoder-decoder model called Zephyr achieves the best results with the fine-tuning approach, scoring 86.811% on the Explainable Detection of Online Sexism (EDOS) test-set and 57.453% on the Multilingual Detection of Hate Speech Against Immigrants and Women in Twitter (HatEval) test-set.Finally, it is confirmed that the evaluated models perform well in hate text detection, as they beat the best result in the HatEval task leaderboard.The error analysis shows that contextual learning had difficulty distinguishing between types of hate speech and figurative language.However, the fine-tuned approach tends to produce many false positives.", "venue": "Computer Modeling in Engineering & Sciences", "label": 0}, {"loc": [7.137073516845703, 0.3997410237789154], "openalex_id": "https://openalex.org/W4396594829", "title": "BharatBhasaNet-A unified framework to identify Indian code mix Languages", "authors": "Sayantan Dey, Shivam Thakur, Akhilesh Kandwal, Rohit Kumar, Sharmistha Dasgupta, Partha Pratim Roy", "abstract": "In the rapidly globalizing digital communication sphere, the imperative for advanced multilingual text recognition and identification is increasingly evident. Contrasting the previous works, which were predominantly constrained to 2-3 languages, this paper explores the rich linguistic diversity of India, addressing challenges in automated language processing for 12 languages. BharatBhasaNet, our comprehensive Language Identification (LID) framework, integrates an extensive dataset covering these 12 Indian languages in both native-script and romanized forms, derived from INDICCORP, Bhasha-Abhijnaanam, and Aksharantar datasets by AI4Bharat. The framework accommodates two models, Roberta-native and Roberta-Romanized, based on attention mechanism and transformer architecture. With its exceptional accuracy of 99.54% in native script and 60.90% in Romanized text, BharatBhasaNet significantly advances language identification, providing broader language coverage than existing LIDs. It excels in interpreting code-mixed sentences, unveiling crucial accuracy patterns related to sentence length, word span, and complexity in multilingual contexts. The framework underwent rigorous testing using a real-time dataset from the National Informatics Center (NIC), achieving an accuracy rate of 92.67%. Overcoming challenges like limited training data and distinguishing similar languages, BharatBhasaNet marks a significant leap in Romanized text identification within diverse linguistic landscapes.", "venue": "IEEE Access", "label": 2}, {"loc": [5.994633674621582, 2.0657427310943604], "openalex_id": "https://openalex.org/W4401123394", "title": "From Form (s) to Meaning: Probing the Semantic Depths of Language Models Using Multisense Consistency", "authors": "Xenia Ohmer, Elia Bruni, Dieuwke Hupkes", "abstract": "Abstract The staggering pace with which the capabilities of large language models (LLMs) are increasing, as measured by a range of commonly used natural language understanding (NLU) benchmarks, raises many questions regarding what \u201cunderstanding\u201d means for a language model and how it compares to human understanding. This is especially true since many LLMs are exclusively trained on text, casting doubt on whether their stellar benchmark performances are reflective of a true understanding of the problems represented by these benchmarks, or whether LLMs simply excel at uttering textual forms that correlate with what someone who understands the problem would say. In this philosophically inspired work, we aim to create some separation between form and meaning, with a series of tests that leverage the idea that world understanding should be consistent across presentational modes\u2014inspired by Fregean senses\u2014of the same meaning. Specifically, we focus on consistency across languages as well as paraphrases. Taking GPT-3.5 as our object of study, we evaluate multisense consistency across five different languages and various tasks. We start the evaluation in a controlled setting, asking the model for simple facts, and then proceed with an evaluation on four popular NLU benchmarks. We find that the model\u2019s multisense consistency is lacking and run several follow-up analyses to verify that this lack of consistency is due to a sense-dependent task understanding. We conclude that, in this aspect, the understanding of LLMs is still quite far from being consistent and human-like, and deliberate on how this impacts their utility in the context of learning about human language and understanding.", "venue": "Computational Linguistics", "label": 27}, {"loc": [3.7243454456329346, -3.894291877746582], "openalex_id": "https://openalex.org/W4401415327", "title": "Fine-Tuned Understanding: Enhancing Social Bot Detection with Transformer-based Classification", "authors": "Amine Sallah, El Arbi Abdellaoui Alaoui, Sa\u00efd Agoujil, Mudasir Ahmad Wani, Mohamed Hammad, Yassine Maleh, Ahmed A. Abd El\u2010Latif", "abstract": "In recent years, the proliferation of online communication platforms and social media has given rise to a new wave of challenges, including the rapid spread of malicious bots. These bots, often programmed to impersonate human users, can infiltrate online communities, disseminate misinformation, and engage in various activities detrimental to the integrity of digital discourse. It is becoming more and more difficult to discern a text produced by deep neural networks from that created by humans. Transformer-based Pre-trained Language Models (PLMs) have recently shown excellent results in challenges involving natural language understanding (NLU). The suggested method is to employ an approach to detect bots at the tweet level by utilizing content and fine-tuning PLMs, to reduce the current threat. Building on the recent developments of the BERT (Bidirectional Encoder Representations from Transformers) and GPT-3, the suggested model employs a text embedding approach. This method offers a high-quality representation that can enhance the efficacy of detection. In addition, a Feedforward Neural Network (FNN) was used on top of the PLMs for final classification. The model was experimentally evaluated using the Twitter bot dataset. The strategy was tested using test data that came from the same distribution as their training set. The methodology in this paper involves preprocessing Twitter data, generating contextual embeddings using PLMs, and designing a classification model that learns to differentiate between human users and bots. Experiments were carried out adopting advanced Language Models to construct an encoding of the tweet to create a potential input vector on top of BERT and their variants. By employing Transformer-based models, we achieve significant improvements in bot detection F1-score (93%) compared to traditional methods such as Word2Vec and Global Vectors for Word Representation (Glove). Accuracy improvements ranging from 3% to 24% compared to baselines were achieved. The capability of GPT-4, an advanced Large Language Model (LLM), in interpreting bot-generated content is examined in this research. Additionally, explainable artificial intelligence (XAI) was utilized alongside transformer-based models for detecting bots on social media, enhancing the transparency and reliability of these models.", "venue": "IEEE Access", "label": 2}, {"loc": [4.742068290710449, 0.7160492539405823], "openalex_id": "https://openalex.org/W4401210496", "title": "Large Language Models and Rule-Based Approaches in Domain-Specific Communication", "authors": "Dominik Halvon\u00edk, Jozef Kapusta", "abstract": "Currently, we are once again experiencing a frenzy related to artificial intelligence. Generative Pre-trained Transformers (GPT) models are highly effective at various natural language processing tasks. Different varieties of GPT models are widely used these days to improve productivity. Graphic departments generate art designs, developers engineer intricate software solutions, leveraging services predicated on the GPT framework, and many other industries are also following the lead and implementing these new sets of tools in their workflow. However, there are areas in natural language processing where a simple solution is often more suitable and effective than current Large Language Models. In this article, we decided to analyze and compare the practical use of one of the more popular GPT solutions, J-Large, and the simple rule-based model we implemented. We integrated these two models into the internal information system of a private company focused on communication with customers in the gaming industry. Both models were trained on the same dataset provided as a log of conversational interactions for the last two years in the given system. We observed that GPT models exhibited superior performance in terms of comprehensibility and adequacy. The rule-based models showed noticeable proficiency in handling domain-specific tasks, mainly when fed with datasets extracted from the historical communication between users and a specialized domain system, such as a customer care department. As a result, with a sufficiently tailored and specific dataset at their disposal, rule-based models can effectively outpace GPT models in performing domain-specific tasks.", "venue": "IEEE Access", "label": 2}, {"loc": [3.741737127304077, -3.984644651412964], "openalex_id": "https://openalex.org/W4391547671", "title": "Detection of Hate Speech and Offensive Language CodeMix Text in Dravidian Languages using Cost-Sensitive Learning Approach", "authors": "K Sreelakshmi, B. Premjith, Bharathi Raja Chakravarthi, K. P. Soman", "abstract": "Recently, the emergence of social media has opened the way for online harassment in the form of hate speech and offensive language. An automated approach is needed to detect hate and offensive content from social media, which is indispensable. This task is challenging in the case of social media posts or comments in low-resourced CodeMix languages. This paper investigates the efficacy of various multilingual transformer-based embedding models with machine learning classifiers for detecting hate speech and offensive language (HOS) content in social media posts in CodeMix Dravidian languages that belong to the low-resource language group. Experiments were conducted on six sets of openly available datasets in Kannada-English, Malayalam-English and Tamil-English languages. The objective is to identify a single pre-trained embedding model that commonly works well for HOS tasks in the above mentioned languages. For this, a comprehensive study of various multilingual transformer embedding models, such as BERT, DistilBERT, LaBSE, MuRIL, XLM, IndicBERT, and FNET for HOS detection was conducted. Our experiments revealed that MuRIL pre-trained embedding performed consistently well for all six datasets using Support Vector Machine (SVM) with Radial Basis Function (RBF) kernel. In a set of experiments conducted on six datasets, the highest accuracy results for each dataset are as follows: DravidianLangTech 2021 achieved 96% accuracy for Malayalam, 72% accuracy for Tamil, and 66% accuracy for Kannada. For HASOC 2021 Tamil, the accuracy reached 76%, and for HASOC 2021 Malayalam, it reached 68%. Additionally, HASOC 2020 demonstrated an accuracy of 92% for Malayalam. Moreover, we performed an in-depth error analysis and a comparative study, presenting a tabulated summary of our work compared to other top-performing studies. In addition, we employed a cost-sensitive learning approach to address the class imbalance problem in the dataset, in which minority classes get higher classification weights than the majority classes. The weights were initialized and fine-tuned to obtain the best balance between all the classes. The results showed that incorporating the cost-sensitive learning strategy avoided class bias in the trained model. In addition to the aforementioned points, a significant contribution of our research presented in this paper is introducing a novel annotated test set for Malayalam-English CodeMix. This new dataset serves as an extension to our existing data, known as the Hate Speech and Offensive Content Identification in English and Indo-Aryan Languages (HASOC) 2021 Malayalam-English dataset.", "venue": "IEEE Access", "label": 2}, {"loc": [2.8021600246429443, -0.279325008392334], "openalex_id": "https://openalex.org/W4393264365", "title": "Developing ChatGPT for Biology and Medicine: A Complete Review of Biomedical Question Answering", "authors": "Qing Li, Lei Li, Yu Li", "abstract": "ChatGPT explores a strategic blueprint of question answering (QA) to deliver medical diagnoses, treatment recommendations, and other healthcare support. This is achieved through the increasing incorporation of medical domain data via natural language processing (NLP) and multimodal paradigms. By transitioning the distribution of text, images, videos, and other modalities from the general domain to the medical domain, these techniques have accelerated the progress of medical domain question answering (MDQA). They bridge the gap between human natural language and sophisticated medical domain knowledge or expert-provided manual annotations, handling large-scale, diverse, unbalanced, or even unlabeled data analysis scenarios in medical contexts. Central to our focus is the utilization of language models and multimodal paradigms for medical question answering, aiming to guide the research community in selecting appropriate mechanisms for their specific medical research requirements. Specialized tasks such as unimodal-related question answering, reading comprehension, reasoning, diagnosis, relation extraction, probability modeling, and others, as well as multimodal-related tasks like vision question answering, image captioning, cross-modal retrieval, report summarization, and generation, are discussed in detail. Each section delves into the intricate specifics of the respective method under consideration. This paper highlights the structures and advancements of medical domain explorations against general domain methods, emphasizing their applications across different tasks and datasets. It also outlines current challenges and opportunities for future medical domain research, paving the way for continued innovation and application in this rapidly evolving field. This comprehensive review serves not only as an academic resource but also delineates the course for future probes and utilization in the field of medical question answering.", "venue": "Biophysics Reports", "label": 0}, {"loc": [2.754801034927368, 2.345769166946411], "openalex_id": "https://openalex.org/W4401514264", "title": "Data Analyst Competencies: A Theory-Driven Investigation of Industry Requirements in the Field of Data Analytics", "authors": "Cassandra Artman Collier, Anne Powell", "abstract": "As organizations\u2019 reliance on data increases, the prevalence of data analytics programs in universities likewise increases. However, despite this specialized education, scholars still report a gap between the knowledge and skills students graduate with and those required by industry upon beginning work as an entry-level data analyst. We draw on theories of data analysis and curriculum frameworks to create an integrated theoretical model to drive our work. We then conduct an extensive analysis to identify relevant languages and tools in data analysis today and collect data from hiring managers seeking data analysts through a survey-based research method. We report the major knowledge, skills, and dispositions desired in the industry today for entry-level data analysts, including specific software platforms and applications. Our findings highlight several leading tools and a better understanding of how well data analysts are expected to know each tool and when those tools are used throughout the knowledge discovery via the data analytics lifecycle. This produces important contributions, particularly to academics working to keep data analytics programs competitive and up-to-date in today\u2019s rapidly changing landscape.", "venue": "Journal of Information Systems Education", "label": 0}, {"loc": [3.2194595336914062, -0.6847062706947327], "openalex_id": "https://openalex.org/W4401387006", "title": "Zero and Few Short Learning Using Large Language Models for De-Identification of Medical Records", "authors": "Yarasam Yashwanth, Rajashree Shettar", "abstract": "The paper aims to evaluate and provide a comparative analysis of the performance and fine-tuning cost of various Large Language Models (LLMs) such as GPT-3.5, GPT-4, PaLM, Bard, and Llama in automating the de-identification of Protected Health Information (PHI) from medical records, ensuring patient and healthcare professional privacy. Zero-shot learning was utilized initially to assess the capabilities of these LLMs in de-identifying medical data. Subsequently, each model was fine-tuned with varying training set sizes to observe changes in performance. The study also investigates the impact of the specificity of prompts on the accuracy of de-identification tasks. Fine-tuning LLMs with specific examples significantly enhanced the accuracy of the de-identification process, surpassing the zero-shot learning accuracy of pre-trained counterparts. Notably, a fine-tuned GPT-3.5 model with a few-shot learning technique was able to exceed the performance of a zero-shot learning GPT-4 model, with 99% accuracy. Detailed prompts resulted in higher task accuracy across all models, yet fine-tuned models with brief instructions still outperformed pre-trained models given detailed prompts. Also, the fine-tuned models were more resilient to medical record format change than the zero-shot models. Code, calculations, and comparisons are available at https://github.com/YashwanthYS/De-Identification-of-medical-Records. The findings underscore the potential of LLMs, particularly when fine-tuned, to effectively automate the de-identification of PHI in medical records. The study highlights the importance of model training and prompt specificity in achieving high accuracy in de-identification tasks.", "venue": "IEEE Access", "label": 2}, {"loc": [6.319093227386475, -0.18667136132717133], "openalex_id": "https://openalex.org/W4404781834", "title": "From Discrete to Continuous Classes: A Situational Analysis of Multilingual Web Registers with LLM Annotations", "authors": "Erik Henriksson, Amanda Myntti, Saara Hellstr\u00f6m, Selcen Erten-Johansson, Anni Eskelinen, Liina Repo, Veronika Laippala", "abstract": "In corpus linguistics, registers\u2013language varieties suited to different contexts\u2013have traditionally been defined by their situations of use, yet recent studies reveal significant situational variation within registers. Previous quantitative studies, however, have been limited to English, leaving this variation in other languages largely unexplored. To address this gap, we apply a quantitative situational analysis to a large multilingual web register corpus, using large language models (LLMs) to annotate texts in English, Finnish, French, Swedish, and Turkish for 23 situational parameters. Using clustering techniques, we identify six situational text types, such as \u201cAdvice\u201d, \u201cOpinion\u201d and \u201cMarketing\u201d, each characterized by distinct situational features. We explore the relationship between these text types and traditional register categories, finding partial alignment, though no register maps perfectly onto a single cluster. These results support the quantitative approach to situational analysis and are consistent with earlier findings for English. Cross-linguistic comparisons show that language accounts for only a small part of situational variation within registers, suggesting registers are situationally similar across languages. This study demonstrates the utility of LLMs in multilingual register analysis and deepens our understanding of situational variation within registers.
", "venue": "http://doi.org/10.18653/v1/2024.nlp4dh-1.30", "label": 0}, {"loc": [4.117084503173828, -2.5144424438476562], "openalex_id": "https://openalex.org/W4393170771", "title": "Advancing Fake News Detection: Hybrid Deep Learning with FastText and Explainable AI", "authors": "Ehtesham Hashmi, Sule Yildirim Yayilgan, Muhammad Mudassar Yamin, Subhan Ali, Mohamed Abomhara", "abstract": "The widespread propagation of misinformation on social media platforms poses a significant concern, prompting substantial endeavors within the research community to develop robust detection solutions. Individuals often place unwavering trust in social networks, often without discerning the origins and authenticity of the information disseminated through these platforms. Hence, the identification of media-rich fake news necessitates an approach that adeptly leverages multimedia elements and effectively enhances detection accuracy. The ever-changing nature of cyberspace highlights the need for measures that may effectively resist the spread of media-rich fake news while protecting the integrity of information systems. This study introduces a robust approach for fake news detection, utilizing three publicly available datasets: WELFake, FakeNewsNet, and FakeNewsPrediction. We integrated FastText word embeddings with various Machine Learning and Deep Learning methods, further refining these algorithms with regularization and hyperparameter optimization to mitigate overfitting and promote model generalization. Notably, a hybrid model combining Convolutional Neural Networks and Long Short-Term Memory, enriched with FastText embeddings, surpassed other techniques in classification performance across all datasets, registering accuracy and F1-scores of 0.99, 0.97, and 0.99, respectively. Additionally, we utilized state-of-the-art transformer-based models such as BERT, XLNet, and RoBERTa, enhancing them through hyperparameter adjustments. These transformer models, surpassing traditional RNN-based frameworks, excel in managing syntactic nuances, thus aiding in semantic interpretation. In the concluding phase, explainable AI modeling was employed using Local Interpretable Model-Agnostic Explanations, and Latent Dirichlet Allocation to gain deeper insights into the model\u2019s decision-making process.", "venue": "IEEE Access", "label": 2}, {"loc": [2.3956241607666016, 1.671223521232605], "openalex_id": "https://openalex.org/W4412216722", "title": "A Digital Tool for Scaffolding Innovation Learning in Engineering Education with Local Industry Needs", "authors": "Kamila Kunrath, Serena Leka, Lasse S. Vestergaard, Mirko Presser, Devarajan Ramanujan", "abstract": "Providing students with the knowledge, skills, and competencies in innovation has become a central focus in engineering education. However, there is limited knowledge on which innovation skills must be supported to reduce the current knowledge gaps between universities and industry. Furthermore, there has been limited investigation into the role of digital solutions that can promote innovation education and the acquisition of industry-relevant innovation skills in engineering curricula. To this end, this paper explores the use of industry challenges (via case studies) as a resource that students and educators can use for targeted ideation and solutions development in teaching. We examined 78 innovation cases provided by local industries for two mandatory master-level courses on innovation at a Danish university. We identified dominant trends from the case descriptions that express areas of interest and demands from different industry sectors. Based on these findings, we have developed an interactive digital tool to support course instructors in increasing the accessibility and utility of industry-defined innovation cases in student-led ideation activities.", "venue": "Research Portal (King's College London)", "label": 0}, {"loc": [5.23100471496582, -1.5869616270065308], "openalex_id": "https://openalex.org/W4400770758", "title": "Revolutionizing Urdu Sentiment Analysis: Harnessing the Power of XLM-R and GPT-2", "authors": "Muhammad Rehan Ashraf, Muzammal Hussain, M. Arfan Jaffar, Waheed Yousuf Ramay, Muhammad Faheem", "abstract": "Sentiment analysis extracts valuable insights from textual sources using computation, textual or systematic analysis, and natural language processing. It identifies and measures the attitudes, beliefs, and emotional states individuals express through text data. Recent research on sentiment analysis has largely focused on the English language; therefore, low-resource languages are getting much less attention. Conducting sentiment analysis of low-resource languages is difficult because large datasets and related repositories are unavailable. This paper creates a new dataset for low-resource language (Urdu) to address this issue. The dataset, namely LUCSA-23, consists of more than 65,000 user reviews from various genres, including food, sports, showbiz, apps, and political reviews from developing countries, i.e., Pakistan. Urdu domain experts further annotate the created dataset. This paper proposes an Urdu sentiment analysis approach leveraging the transformer model, i.e., XLM-R and GPT-2. It preprocesses the Urdu text input, generates BERT embeddings, and passes them to the proposed classifier as input for sentiment classification. The proposed classifier is compared with machine/deep/embedded classifiers to evaluate its performance. The findings show that the proposed classifiers outperform existing state-of-the-art approaches with an accuracy of 95%.", "venue": "IEEE Access", "label": 2}, {"loc": [4.499542236328125, 2.5466973781585693], "openalex_id": "https://openalex.org/W4398249690", "title": "Quantifying Gender Bias in Arabic Pre-trained Language Models", "authors": "Wafa Alrajhi, Hend S. Al\u2010Khalifa, AbdulMalik S. Al\u2010Salman", "abstract": "The current renaissance in the development of Arabic Pre-trained Language models (APLMs) has yielded significant advancement across many fields. Nevertheless, no study has explored the dimensions of gender bias in these models. It is argued that the bias is influenced by the resources used during the models’ pre-training process. Thus, in this study, we conducted a comprehensive analysis to qualitatively assess the representation of different genders by tracing the bias signals from the training corpus. Through applying several Natural Language Processing (NLP) techniques, including Named Entity Recognition (NER), Part of Speech Tagging (POS), and Dependency Parsing (DP), the results indicated an imbalanced corpus in terms of gender nouns and reveal verbs’ patterns associated with each gender. The second phase of this study aimed to examine the impact of the results that emerged from the corpus analysis on the recent APLMs. Leveraging Bidirectional Encoder Representations (BERT)’s ability to predict the missing tokens in quantifying gender bias, we introduce the first template-based Arabic benchmark designed to measure gender bias across various disciplines. Utilizing this benchmark, along with the list of gender-specific nouns and personal names extracted from the corpus, we evaluated the gender skew in the context of scientific and liberal arts disciplines across six APLMs. These models included: AraBERT, CAMeLBERT-CA, CAMeLBERT-MSA, GigaBERT, MAR-BERT, and ARBERT. The outcomes revealed a higher bias skew toward personal names, indicating that the presence of gender associations in the training corpus reinforced gender bias in APLMs.", "venue": "IEEE Access", "label": 2}, {"loc": [8.67655086517334, 0.2906099557876587], "openalex_id": "https://openalex.org/W4404784153", "title": "Model Internals-based Answer Attribution for Trustworthy Retrieval-Augmented Generation", "authors": "Jirui Qi, Gabriele Sarti, Raquel Fern\u00e1ndez, Arianna Bisazza", "abstract": "Ensuring the verifiability of model answers is a fundamental challenge for retrieval-augmented generation (RAG) in the question answering (QA) domain. Recently, self-citation prompting was proposed to make large language models (LLMs) generate citations to supporting documents along with their answers. However, self-citing LLMs often struggle to match the required format, refer to non-existent sources, and fail to faithfully reflect LLMs' context usage throughout the generation. In this work, we present MIRAGE --Model Internals-based RAG Explanations -- a plug-and-play approach using model internals for faithful answer attribution in RAG applications. MIRAGE detects context-sensitive answer tokens and pairs them with retrieved documents contributing to their prediction via saliency methods. We evaluate our proposed approach on a multilingual extractive QA dataset, finding high agreement with human answer attribution. On open-ended QA, MIRAGE achieves citation quality and efficiency comparable to self-citation while also allowing for a finer-grained control of attribution parameters. Our qualitative evaluation highlights the faithfulness of MIRAGE's attributions and underscores the promising application of model internals for RAG answer attribution.", "venue": "https://doi.org/10.18653/v1/2024.emnlp-main.347", "label": 0}, {"loc": [2.6452717781066895, 1.7692055702209473], "openalex_id": "https://openalex.org/W4399780828", "title": "Empowering knowledge through AI: open scholarship proactively supporting well trained generative AI", "authors": "Elizabeth Hellen", "abstract": "Generative AI has taken the world by storm over the last few years, and the world of scholarly communications has not been immune to this. Most discussions in this area address how we can integrate these tools into our workflows, concerns about how researchers and students might misuse the technology or the unauthorised use of copyrighted work. This article argues for a novel viewpoint that librarians and publishers should be encouraging the use of their scholarly content in the training of AI algorithms. Inclusion of scholarly works would advance the reliability and accuracy of the information in training datasets and ensure that this content is included in new knowledge discovery platforms. The article also argues that inclusion can be achieved by improving linkage to content, and, by making sure that licences explicitly allow inclusion in AI training datasets, it advocates for a more collaborative approach to shaping the future of the information landscape in academia.", "venue": "Insights the UKSG journal", "label": 0}, {"loc": [3.563401460647583, 4.502610683441162], "openalex_id": "https://openalex.org/W4402768824", "title": "KDPII: A New Korean Dialogic Dataset for the Deidentification of Personally Identifiable Information", "authors": "LI Fei, Yejee Kang, Seoyoon Park, Yeonji Jang, J M Lee, Hansaem Kim", "abstract": "The rapid growth of social media in the era of big data and artificial intelligence has raised significant safety concerns related to the communication of sensitive personal information. In modern society, awareness of the importance of preserving privacy is growing, so there is a rising advocacy for adopting language modeling technology to mitigate the risk of personal information leakage and to deidentify sensitive information depending on the situation. Thus far, several theoretical analyses of privacy protection in Korea have been conducted. However, the technical development of language model training resources for Korean has been slower than those of widely spoken languages such as English and Chinese. To address this problem, we developed a comprehensive and organized framework for classifying Korean personally identifiable information (PII) by investigating pertinent examples, such as “Text Anonymization Benchmark” and “Network Intrusion Detection Dataset,” from within and outside Korea. Subsequently, we created a new Korean dataset for PII deidentification, KDPII, which consists of many conversational texts incorporating plentiful Korean PII. Based on this, we examined the Korean PII processing performances of many representative language models that are available on the market. Finally, we found that although the performance of language models in identifying PII varied by model size, model architecture, and training source, most of them were significantly better at recognizing universal PII than language-specific PII, which indicates a prospective direction of expanding training data for implementing Korean-specific PII deidentification in the future.", "venue": "IEEE Access", "label": 2}, {"loc": [9.581047058105469, 1.131701946258545], "openalex_id": "https://openalex.org/W4390671457", "title": "Intercity relationships between 293 Chinese cities quantified based on toponym co-occurrence", "authors": "Wang Tongjing, Yin Zhao, Ziyu Bao, Evert Meijers", "abstract": "This dataset presents relationships between 293 Chinese cities, derived using a toponym co-occurrence method. By employing this toponym co-occurrence analysis method, the strength of an intercity relationship is determined by the frequency at which both city names appear on the same webpage. The data was sourced from the Common Crawl web archive's 2019 April Corpus, which contains approximately 2.5 billion web pages. The primary aim of this dataset is to provide a fresh perspective on intercity relationships, thereby facilitating studies on city network analysis. The dataset not only encourages further research into comparing this innovative city relationship with other established networks but is also a showcase that presents a straightforward methodology that can be applied to other archives within Common Crawl. As such, it paves the way for longitudinal studies that probe the evolution of city networks.", "venue": "Cybergeo", "label": 0}, {"loc": [7.748905658721924, 1.228094458580017], "openalex_id": "https://openalex.org/W4402258777", "title": "Humkinar: Construction of a Large Scale Web Repository and Information System for Low Resource Urdu Language", "authors": "Muhammad Amir Mehmood, Bilal Tahir", "abstract": "Online content availability, commercial viability, and technological advancements for English and European languages direct mainstream search engines to prioritize the search results of these high-resource languages. This makes it challenging for low-resource language users to access the search results in regional languages which is essential to promote literacy, inclusion, and digital accessibility. In this article, we create Humkinar– a Urdu language search engine using open-source tools. Our search engine is designed with five key components: computing infrastructure, data collector, search manager, web analytics engine, and user interface. First, our in-house computing infrastructure offers 160 GB RAM, 80 cores, and 30 TB memory to support the operations of the search engine. Next, we customize an open-source web crawler with a specialized Urdu language-focused URL selection algorithm, webpage parser, and content selection mechanism to collect Urdu webpages with optimized computing and Internet resources. We also employ specialized content scrapers to collect targeted and high-priority Urdu content like news articles, Wikipedia, poetry, and books. Overall, our data collector module has successfully curated a repository containing 14 million crawled webpages and 2.2 million scraped Urdu documents. Also, we design post-processing tools for tasks such as topic classification, de-duplication, profanity assessment, text summarization, and the scoring of website quality specific to the Urdu language. In addition, acknowledging the limitations of applying conventional ranking signals to Urdu language, search manager utilizes our seven derived ranking signals for search results. These signals are tuned to emphasize the richness and quality of Urdu language websites and content in search results. Moreover, we incorporate a web analytics engine into our search engine to collect and analyze user actions and metadata to enhance the overall functionality and effectiveness of the search engine. Our web analytics engine has recorded 400K user interactions from 83 countries conducted through the interactive user interface. Finally, we conduct usability testing of search engine with native Urdu language speakers to assess the strengths and weaknesses of our search engine.", "venue": "IEEE Access", "label": 2}, {"loc": [8.887615203857422, 0.20786316692829132], "openalex_id": "https://openalex.org/W4396753486", "title": "Harnessing the Power of Metadata for Enhanced Question Retrieval in Community Question Answering", "authors": "S. Ghasemi, Azadeh Shakery", "abstract": "Community Question Answering (CQA) forums such as Yahoo! Answers and Stack Overflow have become popular. The main goal of a CQA is to provide the most suitable answer in the shortest possible time. Since there is a reach archive of answered questions, similar question retrieval has received much attention intending to answer questions immediately after asking. One of the main challenges in this task is the lexical gap between questions, which refers to the discrepancies between the terminologies used by users asking questions. In this paper, we use metadata and two transformer-based techniques to improve the translation-based language model as a traditional technique addressing the lexical gap in retrieval systems. To overcome the lexical gap problem, additional context and information about the questions can help. Metadata is a rich source of information that refers to supplementary data associated with each question. Subject, category, and answer are metadata used in this article. To leverage these metadata, two transformer-based methods are employed. First, to utilize category information, we build category-specific dictionaries to obtain more accurate translation probabilities. A BERT model predicts the categories of the questions. Second, to utilize answer information, we propose a question expansion technique. Expansion is done by a transformer-based model using a retrieval-augmented generation (RAG) model to generate answers and expand new questions with corresponding answers. Finally, candidate questions are ranked according to their similarity to the expanded new question. Our proposed method achieves 51.47 in terms of MAP, outperforming all state-of-the-art approaches in question retrieval.", "venue": "IEEE Access", "label": 2}, {"loc": [4.693976879119873, -0.04788566380739212], "openalex_id": "https://openalex.org/W4391480394", "title": "Personality trait detection via transfer learning", "authors": "Bashar Alshouha, Jesus Serrano\u2010Guerrero, Francisco Chiclana, Francisco P. Romero, Jos\u00e9 \u00c1. Olivas", "abstract": "Personality recognition plays a pivotal role when developing user-centric solutions such as recommender systems or decision support systems across various domains, including education, e-commerce, or human resources.Traditional machine learning techniques have been broadly employed for personality trait identification; nevertheless, the development of new technologies based on deep learning has led to new opportunities to improve their performance.This study focuses on the capabilities of pre-trained language models such as BERT, RoBERTa, ALBERT, ELECTRA, ERNIE, or XLNet, to deal with the task of personality recognition.These models are able to capture structural features from textual content and comprehend a multitude of language facets and complex features such as hierarchical relationships or long-term dependencies.This makes them suitable to classify multilabel personality traits from reviews while mitigating computational costs.The focus of this approach centers on developing an architecture based on different layers able to capture the semantic context and structural features from texts.Moreover, it is able to fine-tune the previous models using the MyPersonality dataset, which comprises 9,917 status updates contributed by 250 Facebook users.These status updates are categorized according to the well-known Big Five personality model, setting the stage for a comprehensive exploration of personality traits.To test the proposal, a set of experiments have been performed using different metrics such as the exact match ratio, hamming loss, zero-one-loss, precision, recall, F1-score, and weighted averages.The results reveal ERNIE is the topperforming model, achieving an exact match ratio of 72.32%, an accuracy rate of 87.17%, and 84.41% of F1-score.The findings demonstrate that the tested models substantially outperform other state-of-the-art studies, enhancing the accuracy by at least 3% and confirming them as powerful tools for personality recognition.These findings represent substantial advancements in personality recognition, making them appropriate for the development of user-centric applications.", "venue": "Computers, materials & continua/Computers, materials & continua (Print)", "label": 0}, {"loc": [9.245207786560059, -0.8973973393440247], "openalex_id": "https://openalex.org/W4392745486", "title": "End to End Urdu Abstractive Text Summarization with Dataset and Improvement in Evaluation Metric", "authors": "Hassan Raza, Waseem Shahzad", "abstract": "Urdu, being a common language in South Asia, has not received significant attention in terms of language processing compared to more advanced languages. In the field of Natural Language Processing (NLP), the task of text summarization holds great importance due to its ability to comprehend textual content and generate concise summaries. Text summarization can be either extractive or abstractive in nature. While considerable efforts have been made to advance extractive summarization techniques, the limitations associated with it have been extensively explored and explained in the paper. However, the domain of abstractive summarization for the Urdu language remains largely unexplored. The challenges and underlying factors that have impeded progress in this domain have also been addressed. This paper specifically focuses on abstractive summarization of the Urdu language using supervised learning. To accomplish this, a labeled dataset consisting of Urdu text and its abstractive summaries is required. A dataset of Urdu text and its corresponding abstractive summaries has been prepared for the purpose of supervised learning. Additionally, the paper presents the results of summary generation, measured in terms of a rough score. Transformer’s encoder-decoder network was employed to generate abstractive summaries in Urdu, yielding a ROUGE-1 score of 25.18 in Urdu text summarization. Moreover, a novel evaluation metric called the “disconnection rate” has been introduced as a context-aware evaluation metric to enhance the assessment of a summary, known as the Context Aware RoBERTa Score.", "venue": "IEEE Access", "label": 2}, {"loc": [6.654768466949463, 3.789337635040283], "openalex_id": "https://openalex.org/W4402350363", "title": "LLM-Based Edge Intelligence: A Comprehensive Survey on Architectures, Applications, Security and Trustworthiness", "authors": "Othmane Friha, Mohamed Amine Ferrag, Burak Kantarc\u0131, Burak \u00c7akmak, Arda Ozgun, Nacira Ghoualmi\u2010Zine", "abstract": "The integration of Large Language Models (LLMs) and Edge Intelligence (EI) introduces a groundbreaking paradigm for intelligent edge devices. With their capacity for human-like language processing and generation, LLMs empower edge computing with a powerful set of tools, paving the way for a new era of decentralized intelligence. Yet, a notable research gap exists in obtaining a thorough comprehension of LLM-based EI architectures, which should incorporate crucial elements such as security, optimization, and responsible development. This survey aims to bridge this gap by providing a comprehensive resource for both researchers and practitioners. We explore LLM-based EI architectures in-depth, carefully analyzing state-of-the-art paradigms and design decisions. To facilitate efficient and scalable edge deployments, we perform a comparative analysis of recent optimization and autonomy techniques specifically designed for resource-constrained edge environments. Additionally, we shed light on the extensive potential of LLM-based EI by demonstrating its varied practical applications across a wide range of domains. Acknowledging the utmost importance of security, our survey thoroughly investigates potential vulnerabilities inherent in LLM-based EI deployments. We explore corresponding defense mechanisms to protect the integrity and confidentiality of data processed at the edge. In conclusion, highlighting the essential aspect of trustworthiness, we outline best practices and guiding principles for the responsible development and deployment of these systems. By conducting a comprehensive review of these key components, our survey aims to support the ethical development and strategic implementation of LLM-driven EI, paving the way for its transformative impact on diverse applications.", "venue": "IEEE Open Journal of the Communications Society", "label": 0}, {"loc": [3.824394464492798, -3.862415075302124], "openalex_id": "https://openalex.org/W4394676691", "title": "RoBERTaNET: Enhanced RoBERTa Transformer Based Model for Cyberbullying Detection with GloVe Features", "authors": "Arwa A. Jamjoom, Hanen Karamti, Muhammad Umer, Shtwai Alsubai, Tai-hoon Kim, Imran Ashraf", "abstract": "Online platforms are fostering social interaction, but unfortunately, this has given rise to antisocial behaviors such as cyberbullying, trolling, and hate speech on a global scale. The detection of hate and aggression has become a vital aspect of combating cyberbullying and cyberharassment. Cyberbullying involves using aggressive and offensive language including rude, insulting, hateful, and teasing comments to harm individuals on social media platforms. Human moderation is both slow and expensive, making it impractical in the face of rapidly growing data. Automatic detection systems are essential to curb trolling effectively. This research deals with the challenge of automatically identifying cyberbullying in tweets from a publicly available cyberbullying dataset. This research work employs robustly optimized bidirectional encoder representations from the transformers approach (RoBERTa), utilizing global vectors for word representation (GloVe) word embedding features. The proposed approach is further compared with the state-of-the-art machine, deep, and transformer-based learning approaches with the FastText word embedding approach. Statistical results demonstrate that the proposed model outperforms others, achieving a 95% accuracy for detecting cyberbullying tweets. In addition, the model obtains 95%, 97%, and 96% for precision, recall, and F1 score, respectively. Results from k-fold cross-validation further affirm the supremacy of the proposed model with a mean accuracy of 95.07%.", "venue": "IEEE Access", "label": 2}, {"loc": [4.2981953620910645, 2.329939842224121], "openalex_id": "https://openalex.org/W4392543356", "title": "The science of implicit race bias: Evidence from the Implicit Association Test", "authors": "Kirsten N. Morehouse, Mahzarin R. Banaji", "abstract": "Abstract Beginning in the mid-1980s, scientific psychology underwent a revolution \u2013 the implicit revolution \u2013 that led to the development of methods to capture implicit bias: attitudes, stereotypes, and identities that operate without full conscious awareness or conscious control. This essay focuses on a single notable thread of discoveries from the Race Attitude Implicit Association Test (RA-IAT) by providing 1) the historical origins of the research, 2) signature and replicated empirical results for construct validation, 3) further validation from research in sociocognitive development, neuroscience, and computer science, 4) new validation from robust association between regional levels of race bias and socially significant outcomes, and 5) evidence for both short- and long-term attitude change. As such, the essay provides the first comprehensive repository of research on implicit race bias using the RA-IAT. Together, the evidence lays bare the hollowness of current-day actions to rectify disadvantage experienced by Black Americans at individual, institutional, and societal levels.", "venue": "Daedalus", "label": 0}, {"loc": [7.635208606719971, -1.0130271911621094], "openalex_id": "https://openalex.org/W4402237239", "title": "Optimizing Quality Estimation for Low-Resource Language Translations: Exploring the Role of Language Relatedness", "authors": "Archchana Sindhujan, Diptesh Kanojia, Constantin Or\u01cesan", "abstract": "Evaluation of machine translation (MT) is vital to determine the effectiveness of MT systems. This paper investigates quality estimation (QE) for machine translation (MT) for low-resource Indic languages. We analyse the influence of language relatedness within linguistic families and integrate various pre-trained encoders within the MonoTransQuest(MonoTQ) framework. This entails assessing models in single-language configurations before scaling up to multiple-language setups, focusing on languages within and across families, and using approaches grounded in transfer learning. Experimental outcomes and analyses indicate that language-relatedness significantly improves QE performance over baseline, sometimes even surpassing state-of-theart approaches. Across monolingual and multilingual configurations, we discuss strategic encoder usage as a simple measure to exploit the language interactions within these models improving baseline QE efficiency for quality estimation. This investigation underscores the potential of tailored pre-trained encoders to improve QE performance and discusses the limitations of QE approaches for low-resource scenarios. Keywords: multilingual \u00b7 pre-trained encoders \u00b7 efficiency.", "venue": "http://doi.org/10.26615/issn.2815-4711.2024_014", "label": 0}, {"loc": [7.9044108390808105, 0.8351010084152222], "openalex_id": "https://openalex.org/W4392174065", "title": "Language Models Fine-Tuning for Automatic Format Reconstruction of SEC Financial Filings", "authors": "Gianfranco Lombardo, Giuseppe Trimigno, Mattia Pellegrino, Stefano Cagnoni", "abstract": "The analysis of financial reports is a crucial task for investors and regulators, especially the mandatory annual reports (10-K) required by the SEC (Securities and Exchange Commission) that provide crucial information about a public company in the American stock market. Although SEC suggests a specific document format to standardize and simplify the analysis, in recent years, several companies have introduced their own format and organization of the contents, making human-based and automatic knowledge extraction inherently more difficult. In this research work, we investigate different Neural language models based on Transformer networks (Bidirectional recurrence-based, Autoregressive-based, and Autoencoders-based approaches) to automatically reconstruct an SEC-like format of the documents as a multi-class classification task with 18 classes at the sentence level. In particular, we propose a Bidirectional fine-tuning procedure to specialize pre-trained language models on this task. We propose and make the resulting novel transformer model, named SEC-former, publicly available to deal with this task. We evaluate SEC-former in three different scenarios: 1) in terms of topic detection performances; 2) in terms of document similarity (TF-IDF Bag-of-words and Doc2Vec) achieved with respect to original and trustable financial reports since this operation is leveraged for portfolio optimization tasks; and 3) testing the model in a real use-case scenario related to a public company that does not respect the SEC format but provides a human-supervised reference to reconstruct it.", "venue": "IEEE Access", "label": 2}, {"loc": [6.349483013153076, 3.0633623600006104], "openalex_id": "https://openalex.org/W4404783239", "title": "Empowering Multi-step Reasoning across Languages via Program-Aided Language Models", "authors": "Leonardo Ranaldi, Giulia Pucci, Barry Haddow, Alexandra Birch", "abstract": "In-context learning methods are commonly employed as inference strategies, where Large Language Models (LLMs) are elicited to solve a task by leveraging provided demonstrations without requiring parameter updates. Among these approaches are the reasoning methods, exemplified by Chain-of-Thought (CoT) and Program-Aided Language Models (PAL), which encourage LLMs to generate reasoning steps, leading to improved accuracy. Despite their success, the ability to deliver multi-step reasoning remains limited to a single language, making it challenging to generalize to other languages and hindering global development. In this work, we propose Cross-lingual Program-Aided Language Models (CrossPAL), a method for aligning reasoning programs across languages. Our method delivers programs as intermediate reasoning steps in different languages through a double-step cross-lingual prompting mechanism inspired by the Program-Aided approach. Moreover, we introduce Self-consistent Cross-PAL (SCross-PAL) to ensemble different reasoning paths across languages. Our experimental evaluations show that Cross-PAL outperforms existing methods, reducing the number of interactions and achieving state-of-the-art performance.", "venue": "http://doi.org/10.18653/v1/2024.emnlp-main.678", "label": 0}, {"loc": [5.569347858428955, -1.3153493404388428], "openalex_id": "https://openalex.org/W4391929647", "title": "A Comparative Analysis of Word Embeddings Techniques for Italian News Categorization", "authors": "Federica Rollo, Giovanni Bonisoli, Laura Po", "abstract": "Text categorization remains a formidable challenge in information retrieval, requiring effective strategies, especially when applied to low-resource languages such as Italian. This paper delves into the intricacies of categorizing Italian news articles, addressing the complexities arising from the language’s unique structure and writing style. The implemented methodology involves preprocessing the text, generating word embeddings, conducting feature engineering to extract meaningful representations, and training a classifier using the document vectors. The evaluation of the model’s performance is done on a partitioned dataset with a training set for model training and a test set for categorization, allowing assessment of its efficacy on unseen data. Within this paper, we assessed fifteen classifiers for the categorization of Italian news articles, scrutinizing eight models and three approaches for combining word embeddings to derive document vectors. We conducted a comparative analysis between established models such as Word2Vec and FastText and six novel Italian models pre-trained on native datasets. A significant highlight of our work is the introduction of an Italian GloVe model, previously absent for the Italian language. The datasets selected for testing the models’ performances are DICE, a dataset of 10,395 crime news articles extracted from an Italian newspaper, and RCV2-it, a collection of 28,405 Italian news stories released by the multinational media company Reuters Ltd. The tests conducted achieved as the best F-scores 84% and 93%. The results underscore the efficacy of the Support Vector Classification algorithm, while also revealing the inefficacy of Gaussian Naive Bayes, Bernoulli Naive Bayes, and Decision Tree models within the domain of text categorization. The comparison of the word embedding models revealed the better performance of Word2Vec and GloVe concerning FastText. The broader impact of this paper lies not only in advancing text categorization methodologies for Italian documents but also in enriching the linguistic landscape by releasing six novel Italian word embedding models.", "venue": "IEEE Access", "label": 2}, {"loc": [9.439085006713867, 1.1853370666503906], "openalex_id": "https://openalex.org/W4401691811", "title": "Cost-Effective Event Mining on the Web via Event Source Page Discovery and Data API Construction", "authors": "Yuan-Hao Lin, Chia\u2010Hui Chang, Hsiu\u2010Min Chuang, Xiang-Shun Lin, Ting Yeh, Min-Jhao Hong", "abstract": "Automatically extracting meetup event information from the Internet can significantly enhance the discovery of activities. Existing methods for meetup event mining rely on the open APIs provided by event-based social networks (EBSN) to capture Meetup event data in designated regions and topics or a comprehensive crawling of the web to filter meetup events. Both approaches have limitations. In this study, we propose a novel four-stage framework to extract meetup events from event organizers’ websites, including event source page discovery, automatic pagination recognition, boilerplate removal, and event detection. From potential event organizer websites obtained from Facebook events, we built 7,012 profile APIs and obtained 520,909 published links from July 13, 2023, to June 24, 2024. Through the boilerplate remover, we extracted 289,541 pieces of valuable information and identified 69,284 event messages by the event detection module. The event page ratio of 13.3% of these event organizers’ websites is much higher than the 1% event page ratio of all websites, revealing the cost-effectiveness of the proposed approach.", "venue": "IEEE Access", "label": 2}, {"loc": [3.7729873657226562, -3.955934762954712], "openalex_id": "https://openalex.org/W4402124178", "title": "Enhancing Multilingual Hate Speech Detection: From Language-Specific Insights to Cross-Linguistic Integration", "authors": "Ehtesham Hashmi, Sule Yildirim Yayilgan, Ibrahim A. Hameed, Muhammad Mudassar Yamin, Mohib Ullah, Mohamed Abomhara", "abstract": "The rise of social media has enabled individuals with biased perspectives to spread hate speech, directing it toward individuals based on characteristics such as race, gender, religion, or sexual orientation. Constructive interactions in varied communities can greatly enhance self-esteem, yet it is vital to consider that adverse comments may affect individuals’ social standing and emotional health. The crucial task of detecting and addressing this type of content is imperative for reducing its negative effects on communities and individuals alike. The rising occurrence highlights the urgency for enhanced methods and robust regulations on digital platforms to protect humans from such prejudicial and damaging conduct. Hate speech typically appears as a deliberate hostile action aimed at a particular group, often with the intent to demean or isolate them based on various facets of their identity. Research on hate speech predominantly targets resource-aware languages like English, German, and Chinese. Conversely, resource-limited languages, including European languages such as Italian, Spanish, and Portuguese, alongside Asian languages like Roman Urdu, Korean, and Indonesian, present obstacles. These challenges arise from a lack of linguistic resources, making the extraction of information a more strenuous task. This study is focused on the detection and improvement of multilingual hate speech detection across 13 different languages. To conduct a thorough analysis, we carried out a series of experiments that ranged from classical machine learning techniques and mainstream deep learning approaches to recent transformer-based methods. Through hyperparameter tuning, optimization techniques, and generative configurations, we achieved robust and generalized performance capable of effectively identifying hate speech across various dialects. Specifically, we achieved a notable enhancement in detection performance, with precision and recall metrics exceeding baseline models by up to 10% across several lesser-studied languages. Additionally, our work extends the capabilities of explainable AI within this context, offering deeper insights into model decisions, which is crucial for regulatory and ethical considerations in AI deployment. Our study presents substantial performance improvements across various datasets and languages through meticulous comparisons. For example, our model significantly outperformed existing benchmarks: it achieved F1-scores of 0.90 in German (GermEval-2018), up from the baseline score of 0.72, and 0.93 in German (GermEval-2021), a substantial increase from 0.58. Additionally, it scored 0.95 in Roman Urdu HS, surpassing the previous peak of 0.91. Furthermore, for mixed-language datasets such as Italian and English (AMI 2018), our accuracy rose dramatically from 0.59 to 0.96. These outcomes emphasize the robustness and versatility of our model, establishing a new standard for hate speech detection systems across diverse linguistic settings.", "venue": "IEEE Access", "label": 2}, {"loc": [8.976414680480957, 0.1721661537885666], "openalex_id": "https://openalex.org/W4400975273", "title": "Development of a Geographical Question-Answering System in the Kazakh Language", "authors": "Assel Mukanova, Alibek Barlybayev, Aizhan Nazyrova, Lyazzat Kussepova, Bakhyt Matkarimov, Gulnazym Abdikalyk", "abstract": "The study presents a detailed framework designed to develop a Question-Answering System (QA System) for the Kazakh language, highlighting its importance in the field of Low Resource Languages (LRL) Text Processing. This effort aims to fill the gap in resources for languages that lack substantial digital tools. Specifically, the project focuses on geographical questions about Kazakhstan, aiming to enhance accessibility and understanding of the nation’s geography. The challenges associated with LRL text processing are addressed through the creation of a question-answer corpus, training a Bidirectional Encoder Representations from Transformers (BERT)-based model, and evaluating the system using Bilingual Evaluation Understudy (BLEU) metrics. The endeavor begins with the careful compilation of a corpus containing 50,000 questions, which supports the subsequent development phases and ensures the creation of a robust QA System. In the second phase, a BERT model equipped with 91,821,056 parameters is trained, enhancing the model’s ability to understand the complex linguistic nuances of the Kazakh language. The final phase involves a rigorous evaluation using BLEU metrics, where the system achieves an impressive average score of 0.9576. This score indicates a high level of agreement between the system-generated answers and the reference answers, demonstrating the system’s effectiveness at interpreting and responding to queries about Kazakh geography. This study significantly contributes to the field by providing a systematic and nuanced approach to QA System development and underscores the model’s effectiveness through thorough evaluation and comparative analysis.", "venue": "IEEE Access", "label": 2}, {"loc": [2.8929288387298584, -0.5424981117248535], "openalex_id": "https://openalex.org/W4402830216", "title": "BioBridge: Unified Bio-Embedding with Bridging Modality in Code-Switched EMR.", "authors": "Jangyeong Jeon, Sangyeon Cho, DongJoon Lee, Changhee Lee, Junyeong Kim", "abstract": "Pediatric Emergency Department (PED) overcrowding presents a significant global challenge, prompting the need for efficient solutions. This paper introduces the BioBridge framework, a novel approach that applies Natural Language Processing (NLP) to Electronic Medical Records (EMRs) in written free-text form to enhance decision-making in PED. In non-English speaking countries, such as South Korea, EMR data is often written in a Code-Switching(CS) format that mixes the native language with English, with most code-switched English words having clinical significance. The BioBridge framework consists of two core modules: “bridging modality in context” and “unified bio-embedding.” The “bridging modality in context” module improves the contextual understanding of bilingual and code-switched EMRs. In the “unified bio-embedding” module, the knowledge of the model trained in the medical domain is injected into the encoder-based model to bridge the gap between the medical and general domains. Experimental results demonstrate that the proposed BioBridge significantly performance traditional machine learning and pre-trained encoder-based models on several metrics, including F1 score, area under the receiver operating characteristic curve (AUROC), area under the precision-recall Curve (AUPRC), and Brier score. Specifically, BioBridge-XLM achieved enhancements of 0.85% in F1 score, 0.75% in AUROC, and 0.76% in AUPRC, along with a notable 3.04% decrease in the Brier score, demonstrating marked improvements in accuracy, reliability, and prediction calibration over the baseline XLM model. The source code will be made publicly available at https://github.com/jjy961228/BioBridge.", "venue": "IEEE Access", "label": 2}, {"loc": [7.316333770751953, 2.4488346576690674], "openalex_id": "https://openalex.org/W4399269422", "title": "Adapting LLMs to Downstream Applications", "authors": "Andrei Kucharavy", "abstract": "Abstract By themselves, pretrained Large Language Models (LLMs) are interesting objects of study. However, they need to undergo a subsequent transfer learning phase to make them useful for downstream applications. While historically referred to as \u201cfine-tuning,\u201d the range of the tools available to LLMs users to better adapt base models to their applications is now significantly wider than the traditional fine-tuning. In order to provide the reader with an idea of the strengths and weaknesses of each method and allow them to pick one that would suit their needs best, an overview and classification of the most notable methods is provided, specifically the prompt optimization, pre-prompting and implicit prompting (system prompting), model coordination through actor agents, integration with auxiliary tools, parameter-efficient fine-tuning, further model pre-training, from-scratch retraining, and finally domain-specific distillation.", "venue": "https://doi.org/10.1007/978-3-031-54827-7_2", "label": 0}, {"loc": [5.908759593963623, 5.227729320526123], "openalex_id": "https://openalex.org/W4390437694", "title": "MobileVLM: A Fast, Reproducible and Strong Vision Language Assistant for Mobile Devices", "authors": "Xiangxiang Chu, Limeng Qiao, Xinyang Lin, Shuang Xu, Yang Yang, Yiming Hu, Fei Wei, Xinyu Zhang, Bo Zhang, Xiaolin Wei, Chunhua Shen", "abstract": "We present MobileVLM, a competent multimodal vision language model (MMVLM) targeted to run on mobile devices. It is an amalgamation of a myriad of architectural designs and techniques that are mobile-oriented, which comprises a set of language models at the scale of 1.4B and 2.7B parameters, trained from scratch, a multimodal vision model that is pre-trained in the CLIP fashion, cross-modality interaction via an efficient projector. We evaluate MobileVLM on several typical VLM benchmarks. Our models demonstrate on par performance compared with a few much larger models. More importantly, we measure the inference speed on both a Qualcomm Snapdragon 888 CPU and an NVIDIA Jeston Orin GPU, and we obtain state-of-the-art performance of 21.5 tokens and 65.3 tokens per second, respectively. Our code will be made available at: https://github.com/Meituan-AutoML/MobileVLM.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.385314464569092, 5.354447841644287], "openalex_id": "https://openalex.org/W4390437444", "title": "Visual Instruction Tuning towards General-Purpose Multimodal Model: A Survey", "authors": "Jiaxing Huang, Jingyi Zhang, Kai Jiang, Han Qiu, Shijian Lu", "abstract": "Traditional computer vision generally solves each single task independently by a dedicated model with the task instruction implicitly designed in the model architecture, arising two limitations: (1) it leads to task-specific models, which require multiple models for different tasks and restrict the potential synergies from diverse tasks; (2) it leads to a pre-defined and fixed model interface that has limited interactivity and adaptability in following user' task instructions. To address them, Visual Instruction Tuning (VIT) has been intensively studied recently, which finetunes a large vision model with language as task instructions, aiming to learn from a wide range of vision tasks described by language instructions a general-purpose multimodal model that can follow arbitrary instructions and thus solve arbitrary tasks specified by the user. This work aims to provide a systematic review of visual instruction tuning, covering (1) the background that presents computer vision task paradigms and the development of VIT; (2) the foundations of VIT that introduce commonly used network architectures, visual instruction tuning frameworks and objectives, and evaluation setups and tasks; (3) the commonly used datasets in visual instruction tuning and evaluation; (4) the review of existing VIT methods that categorizes them with a taxonomy according to both the studied vision task and the method design and highlights the major contributions, strengths, and shortcomings of them; (5) the comparison and discussion of VIT methods over various instruction-following benchmarks; (6) several challenges, open directions and possible future works in visual instruction tuning research.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.639660358428955, -1.4818967580795288], "openalex_id": "https://openalex.org/W4390437974", "title": "Perspectives of Global and Hong Kong's Media on China's Belt and Road Initiative", "authors": "Le Cong Khoo, Anwitaman Datta", "abstract": "This study delves into the media analysis of China's ambitious Belt and Road Initiative (BRI), which, in a polarized world, and furthermore, owing to the very polarizing nature of the initiative itself, has received both strong criticisms and conversely positive coverage in media from across the world. In that context, Hong Kong's dynamic media environment, with a particular focus on its drastically changing press freedom before and after the implementation of the National Security Law is of further interest. Leveraging data science techniques, this study employs Global Database of Events, Language, and Tone (GDELT) to comprehensively collect and analyse (English) news articles on the BRI. Through sentiment analysis, we uncover patterns in media coverage over different periods from several countries across the globe, and delve further to investigate the the media situation in the Hong Kong region. This work thus provides valuable insights into how the Belt and Road Initiative has been portrayed in the media and its evolving reception on the global stage, with a specific emphasis on the unique media landscape of Hong Kong. In an era characterised by increasing globalisation and inter-connectivity, but also competition for influence, animosity and trade-wars, understanding the perceptions and coverage of such significant international projects is crucial. This work stands as an interdisciplinary endeavour merging geopolitical science and data science to uncover the intricate dynamics of media coverage in general, and with an added emphasis on Hong Kong.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.49397087097168, 2.7745964527130127], "openalex_id": "https://openalex.org/W4390437677", "title": "Spike No More: Stabilizing the Pre-training of Large Language Models", "authors": "Sho Takase, Shun Kiyono, Sosuke Kobayashi, Jun Suzuki", "abstract": "Loss spikes often occur during pre-training of large language models. The spikes degrade the performance of large language models and sometimes ruin the pre-training. Since the pre-training needs a vast computational budget, we should avoid such spikes. Based on the assumption that the loss spike is caused by the sudden growth of the gradient norm, we explore factors to keep the gradient norm small through an analysis of the spectral norms of the Jacobian matrices for the sub-layers. Our findings suggest that stabilizing the pre-training process requires two conditions: small sub-layers and large shortcut. We conduct various experiments to empirically verify our theoretical analyses. Experimental results demonstrate that methods satisfying the conditions effectively prevent loss spikes during pre-training.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.474363803863525, 2.541095733642578], "openalex_id": "https://openalex.org/W4390438091", "title": "MathPile: A Billion-Token-Scale Pretraining Corpus for Math", "authors": "Zengzhi Wang, Rui Xia, Pengfei Liu", "abstract": "High-quality, large-scale corpora are the cornerstone of building foundation models. In this work, we introduce MathPile, a diverse and high-quality math-centric corpus comprising about 9.5 billion tokens. Throughout its creation, we adhered to the principle of \"less is more\", firmly believing in the supremacy of data quality over quantity, even in the pre-training phase. Our meticulous data collection and processing efforts included a complex suite of preprocessing, prefiltering, language identification, cleaning, filtering, and deduplication, ensuring the high quality of our corpus. Furthermore, we performed data contamination detection on downstream benchmark test sets to eliminate duplicates and conducted continual pre-training experiments, booting the performance on common mathematical reasoning benchmarks. We aim for our MathPile to boost language models' mathematical reasoning abilities and open-source its different versions and processing scripts to advance the field.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.869021415710449, 2.559295177459717], "openalex_id": "https://openalex.org/W4390589839", "title": "JOANNA ZYLINSKA: Diffused Seeing", "authors": "Joanna \u017byli\u0144ska, Jos\u00e9 Vertedor, Ignacio Gonz\u00e1lez L\u00f3pez", "abstract": "Conversamos con Joanna Zylinska, catedr\u00e1tica de Filosof\u00eda de los Medios y Pr\u00e1ctica Cr\u00edtica Digital en el King\u2019s College de Londres. Como destacada investigadora en estudios sobre medios digitales y tecnolog\u00eda, su trabajo examina las intersecciones entre cultura, tecnolog\u00eda, \u00e9tica y arte, y c\u00f3mo la tecnolog\u00eda moldea nuestra percepci\u00f3n del mundo y nuestra comprensi\u00f3n del arte contempor\u00e1neo. Su perspectiva cr\u00edtica y reflexiva sobre la relaci\u00f3n entre la inteligencia artificial (IA) y el arte la consagra como una voz referente en este \u00e1mbito. Durante la entrevista, exploramos la intersecci\u00f3n entre el arte y la IA, analizando su impacto en la creatividad y la \u00e9tica art\u00edstica. Zylinska comparti\u00f3 sus primeras cr\u00edticas al arte con IA, que consideraba meras exhibiciones t\u00e9cnicas carentes de profundidad art\u00edstica, y analiz\u00f3 la evoluci\u00f3n y madurez del campo art\u00edstico. De igual forma, reconoc\u00eda los progresos realizados en el arte con IA y subrayaba la importancia de cuestionar qui\u00e9n crea arte, para qui\u00e9n y con qu\u00e9 prop\u00f3sito. Tambi\u00e9n se abordaba la controversia en torno al t\u00e9rmino \u201carte con IA\u201d y se propon\u00edan alternativas como \u201ccreatividad computacional\u201d. El art\u00edculo ahondaba en c\u00f3mo la IA desaf\u00eda las nociones culturales y financieras preexistentes del arte Adem\u00e1s, la entrevista se cierra con una reflexi\u00f3n sobre las implicaciones \u00e9ticas asociadas, cuestionando la noci\u00f3n de \u201c\u00e9tica en la IA\u201d y abogando por un compromiso \u00e9tico m\u00e1s profundo en las pr\u00e1cticas art\u00edsticas que implican IA.", "venue": "UM\u00c1TICA Revista sobre Creaci\u00f3n y An\u00e1lisis de la Imagen", "label": 0}, {"loc": [7.032191753387451, 1.177438497543335], "openalex_id": "https://openalex.org/W4389977189", "title": "Transfer Learning in the Era of Large Language Models", "authors": "Katikapalli Subramanyam Kalyan", "abstract": "Large language models (LLMs) are a special class of pretrained language models (PLMs) obtained by scaling model size, pretraining corpus and computation. LLMs, because of their large size and pretraining on large volumes of text data, exhibit special abilities which allow them to achieve remarkable performances without any task-specific training in many of the natural language processing tasks. The era of LLMs started with OpenAI's GPT-3 model, and the popularity of LLMs has increased exponentially after the introduction of models like ChatGPT and GPT4. We refer to GPT-3 and its successor OpenAI models, including ChatGPT and GPT4, as GPT-3 family large language models (GLLMs). With the ever-rising popularity of GLLMs, especially in the research community, there is a strong need for a comprehensive survey which summarizes the recent research progress in multiple dimensions and can guide the research community with insightful future research directions. We start the survey paper with foundation concepts like transformers, transfer learning, self-supervised learning, pretrained language models and large language models. We then present a brief overview of GLLMs and discuss the performances of GLLMs in various downstream tasks, specific domains and multiple languages. We also discuss the data labelling and data augmentation abilities of GLLMs, the robustness of GLLMs, the effectiveness of GLLMs as evaluators, and finally, conclude with multiple insightful future research directions. To summarize, this comprehensive survey paper will serve as a good resource for both academic and industry people to stay updated with the latest research related to GLLMs.", "venue": "Natural Language Processing Journal", "label": 9}, {"loc": [8.269274711608887, 0.5945689678192139], "openalex_id": "https://openalex.org/W4389984066", "title": "Large Language Models: A Survey", "authors": "Yunfan Gao, Yun Xiong, Xinyu Gao, Kangxiang Jia, Jinliu Pan, Yuxi Bi, Yi Dai, Jiawei Sun, Haofen Wang", "abstract": "Large Language Models (LLMs) showcase impressive capabilities but encounter challenges like hallucination, outdated knowledge, and non-transparent, untraceable reasoning processes. Retrieval-Augmented Generation (RAG) has emerged as a promising solution by incorporating knowledge from external databases. This enhances the accuracy and credibility of the generation, particularly for knowledge-intensive tasks, and allows for continuous knowledge updates and integration of domain-specific information. RAG synergistically merges LLMs' intrinsic knowledge with the vast, dynamic repositories of external databases. This comprehensive review paper offers a detailed examination of the progression of RAG paradigms, encompassing the Naive RAG, the Advanced RAG, and the Modular RAG. It meticulously scrutinizes the tripartite foundation of RAG frameworks, which includes the retrieval, the generation and the augmentation techniques. The paper highlights the state-of-the-art technologies embedded in each of these critical components, providing a profound understanding of the advancements in RAG systems. Furthermore, this paper introduces up-to-date evaluation framework and benchmark. At the end, this article delineates the challenges currently faced and points out prospective avenues for research and development.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.801542282104492, -1.0715405941009521], "openalex_id": "https://openalex.org/W4389923865", "title": "Simplification of German Narrative Documents with Longformer mBART", "authors": "Thorben Schomacker, Tillmann D\u00f6nicke, Marina Tropmann-Frick", "abstract": "In this paper, we apply transformer-based Natural Language Generation (NLG) techniques to the problem of text simplification. Currently, there are only a few German datasets available for text simplification, even fewer with larger and aligned documents, and not a single one with narrative texts. In this paper, we explore to which degree modern NLG techniques can be applied to German narrative text simplifications. We use Longformer attention and a pre-trained mBART model. Our findings indicate that the existing approaches for German are not able to solve the task properly. We conclude on a few directions for future research to address this problem.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.267020225524902, 2.0812768936157227], "openalex_id": "https://openalex.org/W4389912457", "title": "Corpus Modeling and the Geometries of Text", "authors": "Dustin S. Stoltz, Marissa A. Combs, Marshall A. Taylor", "abstract": "Abstract This chapter explores the theoretical implications of spatial metaphors in the field of computational text analysis and inspects how the properties of topologies aid and inhibit our theories of textual meaning. Rather than mining for \u201cground truth,\u201d machine learning algorithms for text, especially word embedding models, provide a selectively simplistic map of the semantic space. The representation of that textual map depends not only on the choice of algorithm but also on the composition of the corpora used to train them. Along with reviewing the technical aspects of embedding text into space, this chapter surveys the consequences of training algorithms with internal and external objectives. The implications of different types of training corpora are enumerated, with particular attention to ethical considerations. More scholarship, institutional support, and technical infrastructure directed toward the careful building, documenting, and sharing of corpora as well as machine learning models trained on those corpora are recommended.", "venue": "Oxford University Press eBooks", "label": 0}, {"loc": [8.237831115722656, 1.0862703323364258], "openalex_id": "https://openalex.org/W4391093117", "title": "PSQS: Parallel Semantic Querying Service for Self-describing File Formats", "authors": "Chenxu Niu, Weidong Zhang, Suren Byna, Yong Chen", "abstract": "Finding relevant datasets can be a time-consuming and challenging task, especially for self-describing file formats. Current solutions use either exact or partial keyword matching approaches to extract and process metadata queries, but they fail to capture semantic relationships between the metadata content and query keywords. To address this challenge, we introduce PSQS, a novel parallel semantic search method for self-describing files. The method leverages parallel processing and kv2vec semantic similarity measures to retrieve semantically relevant data efficiently. Our evaluation against existing metadata search solutions shows that PSQS offers a new, efficient and effective semantic search functionality for various fields where large self-describing files are used, such as scientific data management, leading to more accurate and efficient data retrieval.", "venue": "https://doi.org/10.1109/bigdata59044.2023.10386205", "label": 0}, {"loc": [3.7520883083343506, -3.9792356491088867], "openalex_id": "https://openalex.org/W4391094753", "title": "Multilingual Hate Speech Detection: Comparison of Transfer Learning Methods to Classify German, Italian, and Spanish Posts", "authors": "Jan Fillies, Michael P. Hoffmann, Adrian Paschke", "abstract": "With the increase of digital communication, a surge in online hate speech can be witnessed. Recent studies have concentrated on automated supervised detection of hate speech. However, there remains limited understanding of an effective strategy for identifying multilingual hate speech in social media posts. This study introduces an innovate experimental design for multilingual hate speech detection. It compares different approaches to automatically detect multilingual hate speech through a series of experiments and creates a classification algorithm for hate speech in German, Italian and Spanish text-based social media content. The study creates monolingual, multilingual, and translated datasets specific to the language triplet. Subsequently, the research explores suitable models for multilingual hate speech detection, evaluating a total of seven transformer-based models along with corresponding SVM models on the constructed datasets. The findings indicate that all chosen transformer-based models outperform the baseline SVM models. The research highlights the superiority of a multilingual approach, utilizing XLM-RoBERTa as a classifier model, over monolingual, multilingual, and translation-based approaches. Furthermore, the study demonstrates that translation-based methods in connection to the model DistillBERT can serve as viable alternatives to the multilingual XLM-RoBERTa approach, particularly in scenarios where computational resources are restricted and processing speed is of importance.", "venue": "https://doi.org/10.1109/bigdata59044.2023.10386244", "label": 0}, {"loc": [6.929444313049316, 1.8687278032302856], "openalex_id": "https://openalex.org/W4396525257", "title": "Does Lack of Knowledge and Hardship of Information Access Signify Powerful AI? A Large Language Model Perspective", "authors": "Idrees A. Zahid, Shahad Sabbar Joudar", "abstract": "Large Language Models (LLMs) are evolving and expanding enormously. With the consistent improvement of LLMs, more complex and sophisticated tasks will be tackled. Handling various tasks and fulfilling different queries will be more precise. Emerging LLMs in the field of Artificial Intelligence (AI) impact online digital content. An association between digital corpus scarcity and the improvement of LLMs is drawn. The impact it will bring to the field of LLMs is discussed. More powerful LLMs are insights to be there. Specifically, increase in Reinforcement Learning from Human Feedback (RLHF) LLMs release. More precise RLHF LLMs will endure development and alternative releases.", "venue": "Applied Data Science and Analysis", "label": 0}, {"loc": [5.1551737785339355, 1.0730631351470947], "openalex_id": "https://openalex.org/W4389574762", "title": "Understanding Large Language Models", "authors": "Giulia Polverini, Bor Gregorcic", "abstract": "Abstract The paper aims to fulfil three main functions: (1) to serve as an introduction for the physics education community to the functioning of large language models (LLMs), (2) to present a series of illustrative examples demonstrating how prompt-engineering techniques can impact LLMs performance on conceptual physics tasks and (3) to discuss potential implications of the understanding of LLMs and prompt engineering for physics teaching and learning. We first summarise existing research on the performance of a popular LLM-based chatbot (ChatGPT) on physics tasks. We then give a basic account of how LLMs work, illustrate essential features of their functioning, and discuss their strengths and limitations. Equipped with this knowledge, we discuss some challenges with generating useful output with ChatGPT-4 in the context of introductory physics, paying special attention to conceptual questions and problems. We then provide a condensed overview of relevant literature on prompt engineering and demonstrate through illustrative examples how selected prompt-engineering techniques can be employed to improve ChatGPT-4 \u2019s output on conceptual introductory physics problems. Qualitatively studying these examples provides additional insights into ChatGPT\u2019s functioning and its utility in physics problem-solving. Finally, we consider how insights from the paper can inform the use of LLMs in the teaching and learning of physics.", "venue": "European Journal of Physics", "label": 0}, {"loc": [5.123810768127441, -1.6542307138442993], "openalex_id": "https://openalex.org/W4390780988", "title": "Tackling an Unbalanced Dataset for Classifying Indonesian E-Commerce Reviews Using Multi Word Embedding Model", "authors": "Rizky Adi, Bassamtiano Renaufalgi Irnawan, Jiyi Li", "abstract": "Product reviews are an integral part of ecommerce. By Q2 2020 alone, the average of e-commerce visitors in Indonesia almost reached 400 million. This vast amount of traffic resulting a massive amount of product review information. Product reviews in Indonesia are usually paired with a rating system from one to five based on user satisfaction with the product. Mostly, the number of reviews in each rating is unbalanced. Ratings 5, 4, and 1 are usually more frequent than ratings 2 and 3. This imbalanced data makes classification challenging to categorize the review rating, leading to inaccurate predictions for the rating with fewer data than the others. Previous research used various approaches for classifying review ratings, such as machine learning and fine-tuning a state-of-the-art deep learning model. In this research, we propose a new approach that leverages multiple-word embedding combined with CNN to deal with the highly unbalanced dataset problem. By leveraging multiple-word embedding techniques, the model can extract more features from the review text. We also proposed an extended pipeline to enhance the model performance. The proposed model and pipeline perform better than the fine-tuned Indonesian BERT-based and RoBERTa-based pre-trained models baseline with 0.93 accuracy and 0.82 F1-Macro scores.", "venue": "https://doi.org/10.1109/icic60109.2023.10381997", "label": 0}, {"loc": [6.017616271972656, 1.9436509609222412], "openalex_id": "https://openalex.org/W4390755943", "title": "IndoBerea: Evolving Semantic Search in Theological Context", "authors": "Feliks Victor Parningotan Samosir, Serius Mendrofa", "abstract": "This paper presents IndoBerea, a semantic search model pre-trained on an Indonesian Bible dataset and based on SentenceTransformers and IndoBERT. It aims to enhance theological research by providing contextually relevant verses in Bible study. The model's output is evaluated through similarity scores and compared with human relevance judgments, revealing a moderate correlation. The analysis highlights the model's varying performance across different queries and identifies potential improvement in expanding the model's context recognition ability. Currently, IndoBerea is designed to output a single verse per query. However, incorporating a broader context, like a group of verses or an entire chapter, could align model judgments more closely with human interpretations. Future work should focus on this aspect, necessitating a different dataset structure and model architecture. The results demonstrate the potential of semantic search models in theological contexts and suggest promising future research directions in Indonesian NLP community.", "venue": "https://doi.org/10.1109/icic60109.2023.10382053", "label": 0}, {"loc": [2.549870252609253, 1.4766168594360352], "openalex_id": "https://openalex.org/W4389437528", "title": "Generative Artificial Intelligence: Models, Benefits, Dangers and Detection of AI-Generated Text on Specialized Domains", "authors": "David Baidoo-Anu, Leticia Owusu Ansah", "abstract": "Since its maiden release into the public domain on November 30, 2022, ChatGPT garnered more than one million subscribers within a week. The generative AI tool \u23bcChatGPT took the world by surprise with it sophisticated capacity to carry out remarkably complex tasks. The extraordinary abilities of ChatGPT to perform complex tasks within the field of education has caused mixed feelings among educators, as this advancement in AI seems to revolutionize existing educational praxis. This is an exploratory study that synthesizes recent extant literature to offer some potential benefits and drawbacks of ChatGPT in promoting teaching and learning. Benefits of ChatGPT include but are not limited to promotion of personalized and interactive learning, generating prompts for formative assessment activities that provide ongoing feedback to inform teaching and learning etc. The paper also highlights some inherent limitations in the ChatGPT such as generating wrong information, biases in data training, which may augment existing biases, privacy issues etc. The study offers recommendations on how ChatGPT could be leveraged to maximize teaching and learning. Policy makers, researchers, educators and technology experts could work together and start conversations on how these evolving generative AI tools could be used safely and constructively to improve education and support students\u2019 learning.", "venue": "Journal of AI", "label": 0}, {"loc": [4.556575298309326, -1.4846471548080444], "openalex_id": "https://openalex.org/W4391768864", "title": "Bangla Emergency Post Classification on Social Media using Transformer Based BERT Models", "authors": "Alvi Ahmmed Nabil, Dola Das, Md. Shahidul Salim, Shamsul Arifeen, H. M. Abdul Fattah", "abstract": "Text classification is one of the most important tasks in Natural Language Processing. As text data is growing rapidly, it needs more computational power to classify the text in a big dataset. The task is difficult for characteristic-rich languages like Bangla. Having good-quality text data significantly affects the outcome of the model that has been used to classify them. Nowadays, social media can be an important source of information. But there is a huge number of data which are of no use. As the use of social media is increasing day by day, people are posting about events around their surroundings. So, it can be an important propaganda of the media. In this study, various text classification methods were used to classify the texts in Bangla from social media, which can be categorized as emergencies that may need immediate actions from the government, local authority, or law enforcement or even may need international attention. Therefore, 5839 social media posts were collected from Facebook and Twitter, which were written in Bangla along with some mixed English words. Then, after preprocessing, various Machine Learning models, Deep Neural Network models, and Transformer based models were applied to classify them. Among these models, transformer-based XLM-RoBERTa outperformed all the other models with an F1-score of 95.25.", "venue": "https://doi.org/10.1109/eict61409.2023.10427900", "label": 0}, {"loc": [2.9415149688720703, -0.8331094980239868], "openalex_id": "https://openalex.org/W4390970475", "title": "Named Entity Recognition in Italian Lung Cancer Clinical Reports using Transformers", "authors": "Domenico Paolo, Alessandro Bria, Carlo Greco, Marco Russano, Sara Ramella, Paolo Soda, Rosa Sicilia", "abstract": "The widespread adoption of electronic health records (EHRs) offers a valuable opportunity to support clinical research by containing crucial patient information, including diagnoses, symptoms, medications, lab tests, and more. Despite the success of deep learning for biomedical Named Entity Recognition (NER), the literature in this field still presents a gap regarding applications focused on lung cancer for the Italian language. Hence, this paper presents a transformer-based approach to extract named entities from Italian clinical notes related to Non-Small Cell Lung Cancer (NSCLC). We introduce a novel set of 25 clinical entities related to NSCLC building a corpus annotated for NER. We apply a state-of the-art model pre-trained on Italian biomedical texts to the manually annotated clinical reports of a cohort of 257 patients suffering from NSCLC, successfully dealing with class-imbalance problems and obtaining promising performance (average F1-score of 84.3%). We also compared our method with two other pre-trained state-of-the-art models showing that the domain specific knowledge offered by the proposed approach is necessary to achieve higher performance. These findings also showcase the feasibility of using transformers to extract biomedical information in the Italian language.", "venue": "https://doi.org/10.1109/bibm58861.2023.10385778", "label": 0}, {"loc": [4.301016330718994, -0.865447461605072], "openalex_id": "https://openalex.org/W4390992119", "title": "Transformers for Detection of Distressed Cardiac Patients with an ICD Based on Danish Text Messages", "authors": "Julie Dittmann Weimar Andersen, M.L. Jensen, Uffe Kock Wiil, S\u00f8ren J. Skovbakke, Ole Skov, Susanne S. Pedersen, Abdolrahman Peimankar, Ali Ebrahimi", "abstract": "Cardiac patients with implantable cardioverter defibrillator devices frequently exhibit signs of anxiety and depression (termed 'Distressed'). Early detection of these patients is vital for evaluation, intervention, and prevention against relapse. Considering the growing datasets relevant to distress, coupled with the evolution of machine learning methodologies, there exists a promising prospect to develop intelligent systems for the detection of distressed cardiac patients through written materials. In this context, data from two sources were collected: a questionnaire and text communication messages, acquired through the randomized ACQUIRE-ICD study of 168 participants. These textual messages were labelled as either Distressed or Non-Distressed based on questionnaire responses. Following preprocessing, the dataset facilitated the development of transformer-based classification models, including mBERT, XLM-RoBERTa, \u00c6L\u00c6CTRA, and RoBERTa, as well as a hard voting ensemble method to classify patients into Distressed and Non-Distressed categories. To address imbalances in class distribution and dataset scarcity, a data augmentation method was employed. Results indicated the superior performance of the proposed hard voting ensemble, recording weighted metrics of 80% precision, 67% recall, 73% F1-score, and 75% accuracy. Notably, this ensemble correctly identified 67% of Distressed samples, while the most efficient base transformer, mBERT, identified 63% of Distressed samples.", "venue": "https://doi.org/10.1109/bibm58861.2023.10385964", "label": 0}, {"loc": [5.950786113739014, 5.836175441741943], "openalex_id": "https://openalex.org/W4391583898", "title": "Asymmetric Polysemous Reasoning for Image-Text Matching", "authors": "Hongping Zhang, Ming Yang", "abstract": "Image-text matching has received growing interest since it bridges vision and language. The key challenge lies in how to learn correspondence between image and text. Upon observation, we find existing works suffer from two limitations. Firstly, existing works use similar networks to extract features for different modalities without consider the modal characteristics, hinder the semantic reasoning in modalities. Secondly, existing works map a sentence into single space. However, due to the semantic ambiguity, such methods obviously cannot effectively deal with polysemy. In this paper, we present a new Asymmetric Polysemous Reasoning Network (APRN) to tailor for image and text modality characteristics. For image, APRN employs Graph Convolutional Network to implicitly learn the relationships between regions, and perform global semantic reasoning to get image representation. For text, given its ambiguity, we map text into different subspaces to learn distinct various meanings. Then we propose Selection Attention (SA) that combines image information to integrate the text features of subspaces, by selectively attending on the significant and representative subspaces and meanwhile casting aside the interferences of non-meaningful subspaces. Experiments validate that our method surpasses many recent state-of-the-arts with a clear margin for the image-text matching on Flickr30K and MS-COCO datasets.", "venue": "https://doi.org/10.1109/icdmw60847.2023.00134", "label": 0}, {"loc": [2.8352842330932617, 2.3755922317504883], "openalex_id": "https://openalex.org/W4391987434", "title": "Law and the Political Economy of AI Production", "authors": "Petros Terzis", "abstract": "Abstract The governance of artificial intelligence (AI) is at a historical juncture. Legislative acts, global treaties, export controls, and technical standards are now dominating the discourse over what used to be a predominantly market-driven space. Amidst all this frenzy, this paper explains why none of these projects will achieve \u2018alignment\u2019 of AI with the prospect of a sustainable model of production authentically committed to the rights and freedoms of people and communities. By reflecting on the role of law in consolidating the visions and logics of few multinationals in the global value chains of AI, it warns against the peril of regulating AI without looking at the methods and logistics of its material production. Following a detailed overview of the various (techno-)legal ways through which law enables the flow of materials, capital, and power from Global South to Global North, and from small players to lead firms, the paper concludes with some preliminary thoughts on a transformative agenda for the transnational regulation of infocomputational production.", "venue": "International Journal of Law and Information Technology", "label": 0}, {"loc": [6.545385837554932, 5.2188029289245605], "openalex_id": "https://openalex.org/W4389260862", "title": "Knowledge Transfer from Vision Foundation Models for Efficient Training of Small Task-specific Models", "authors": "Raviteja Vemulapalli, Hadi Pouransari, Fartash Faghri, Sachin Mehta, Mehrdad Farajtabar, Mohammad Rastegari, Oncel Tuzel", "abstract": "Vision Foundation Models (VFMs) pretrained on massive datasets exhibit impressive performance on various downstream tasks, especially with limited labeled target data. However, due to their high inference compute cost, these models cannot be deployed for many real-world applications. Motivated by this, we ask the following important question, \"How can we leverage the knowledge from a large VFM to train a small task-specific model for a new target task with limited labeled training data?\", and propose a simple task-oriented knowledge transfer approach as a highly effective solution to this problem. Our experimental results on five target tasks show that the proposed approach outperforms task-agnostic VFM distillation, web-scale CLIP pretraining, supervised ImageNet pretraining, and self-supervised DINO pretraining by up to 11.6%, 22.1%, 13.7%, and 29.8%, respectively. Furthermore, the proposed approach also demonstrates up to 9x, 4x and 15x reduction in pretraining compute cost when compared to task-agnostic VFM distillation, ImageNet pretraining and DINO pretraining, respectively, while outperforming them. We also show that the dataset used for transferring knowledge has a significant effect on the final target task performance, and introduce a retrieval-augmented knowledge transfer strategy that uses web-scale image retrieval to curate effective transfer sets.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.610085487365723, 3.1365628242492676], "openalex_id": "https://openalex.org/W4396784301", "title": "CLIP and the City: Addressing the Artificial Encoding of Cities in Multimodal Foundation Deep Learning Models", "authors": "Dario Negueruela del Castillo, Iacopo Neri", "abstract": "In this project, we propose and explore a computational pipeline to examine urban cultural landscapes through the lens of artificial intelligence, and for questioning modes of embedding culture in machine learning models. By employing machine learning models that extract features and textual properties from images, we aim to uncover the connections between a city\u2019s history, architecture, and urban development. The city of Rome serves as a significant case study for this research. To achieve this objective, we feed 360\u00b0 panoramic images into large vision-language models (e.g. OpenCLIP), to question how mainstream culture is expressed in these models. In this machine-triggered urban experiment, we investigate overlaps between history and machinic interpretation and whether relevant temporal correlations can be captured through generic street images only. Finally, by spatially analysing the captured data, we identify clusters and discontinuities in the urban layout aiming at visually depicting the interplay of forces behind its development. As in a forensic exercise, the paper seeks to uncover the complex social and historical dynamics of urban environments, exploiting only contemporary images of their settings and a generic embedding of culture. It explores potential cultural biases embedded in machine learning models by comparing Rome \u2013 culturally relevant for the western world \u2013 with other cities around the world; leveraging innovative computational pipelines and globally covering datasets to provide a novel research line for urban studies.", "venue": "https://doi.org/10.60152/eun81fru", "label": 0}, {"loc": [8.88067626953125, -0.5129585862159729], "openalex_id": "https://openalex.org/W4389217990", "title": "Headline Generation for Indian Languages", "authors": "Lokesh Madasu, Gopichand Kanumolu, Nirmal Surange, Manish Shrivastava", "abstract": "The task of headline generation within the realm of Natural Language Processing (NLP) holds immense significance, as it strives to distill the true essence of textual content into concise and attention-grabbing summaries. While noteworthy progress has been made in headline generation for widely spoken languages like English, there persist numerous challenges when it comes to generating headlines in low-resource languages, such as the rich and diverse Indian languages. A prominent obstacle that specifically hinders headline generation in Indian languages is the scarcity of high-quality annotated data. To address this crucial gap, we proudly present Mukhyansh, an extensive multilingual dataset, tailored for Indian language headline generation. Comprising an impressive collection of over 3.39 million article-headline pairs, Mukhyansh spans across eight prominent Indian languages, namely Telugu, Tamil, Kannada, Malayalam, Hindi, Bengali, Marathi, and Gujarati. We present a comprehensive evaluation of several state-of-the-art baseline models. Additionally, through an empirical analysis of existing works, we demonstrate that Mukhyansh outperforms all other models, achieving an impressive average ROUGE-L score of 31.43 across all 8 languages.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.262153625488281, -0.928007185459137], "openalex_id": "https://openalex.org/W4388955340", "title": "An Extensive Survey on Investigation Methodologies for Text Summarization", "authors": "Aahana Saklecha, Pragya Uplavdiya, Prof. Mayuri Chawla", "abstract": "Natural language processing (NLP) is a fast-expanding field, and text summarization has recently gained a lot of research interest. The necessity for automatic summarizing approaches to effectively digest massive amounts of textual data has grown in importance, due to the plethora (excessive amount of something) of information available in the digital age [18]. By automatically producing succinct and educational summaries of extensive materials, NLP-based text summarizing systems have the potential to revolutionize the way humans consume and process information. This review paper offers a thorough examination of the text summarizing research approaches. The process of creating a concise and useful summary of a text document is called text summarization. Even for cutting-edge natural language processing (NLP) systems, it is a difficult task. It was carried out using a thorough analysis of the most recent text summarizing research. The evaluation revealed a variety of research approaches that have been employed in the creation and assessment of text summarizing systems. This study's key discovery is that there are numerous different investigative approaches that can be used for text summarizing. These methods can be roughly divided into two groups: \u2022 Extractive text summarization \u2022 Abstractive text summarization During the review we found that extractive summarization is a fairly simple method as it selects the key phrases from a text and extracts them to create a summary while abstractive summarization presents data in a clearer, more informative fashion by producing a summary. This review was important because it gives a thorough overview of the research approaches utilized for text summarizing, this article is significant. Researchers and programmers can utilize this data to create brand-new, improved text summarizing systems. [20]", "venue": "Indian Journal of Signal Processing", "label": 0}, {"loc": [7.211209774017334, 2.0931124687194824], "openalex_id": "https://openalex.org/W4388927764", "title": "A Comprehensive Survey on Long Context Language Modeling", "authors": "Yun-Peng Huang, Jingwei Xu, Zixu Jiang, Junyu Lai, Zenan Li, Yuan Yao, Taolue Chen, Lijuan Yang, Xin Zhou, Xiaoxing Ma", "abstract": "Transformer-based Large Language Models (LLMs) have been applied in diverse areas such as knowledge bases, human interfaces, and dynamic agents, and marking a stride towards achieving Artificial General Intelligence (AGI). However, current LLMs are predominantly pretrained on short text snippets, which compromises their effectiveness in processing the long-context prompts that are frequently encountered in practical scenarios. This article offers a comprehensive survey of the recent advancement in Transformer-based LLM architectures aimed at enhancing the long-context capabilities of LLMs throughout the entire model lifecycle, from pre-training through to inference. We first delineate and analyze the problems of handling long-context input and output with the current Transformer-based models. We then provide a taxonomy and the landscape of upgrades on Transformer architecture to solve these problems. Afterwards, we provide an investigation on wildly used evaluation necessities tailored for long-context LLMs, including datasets, metrics, and baseline models, as well as optimization toolkits such as libraries, frameworks, and compilers to boost the efficacy of LLMs across different stages in runtime. Finally, we discuss the challenges and potential avenues for future research. A curated repository of relevant literature, continuously updated, is available at https://github.com/Strivin0311/long-llms-learning.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.481340408325195, -1.4030091762542725], "openalex_id": "https://openalex.org/W4390992933", "title": "Multilingual Model Fine-tuning for Sentiment Analysis", "authors": "Mohamed Lotfy Elrefai, Mahmoud I. Khalil, Hazem M. Abbas", "abstract": "Multilingual language models have decreased the barrier between languages, as it will be helpful overcoming many problems, such as sentiment analysis because the importance of this task is to make good decisions and customize products. Obtaining information from one language can help other languages generalize and understand a task more effectively. In this paper, we propose a general method for sentiment analysis of data that includes data from many languages, which enables all applications to use sentiment analysis results in a language-blind or language-independent manner. We performed experiments on two language combinations (English and Arabic) for sentence-level sentiment classification and found that the model with the final setup after adding translations from one language to another and fine-tuning the multilingual language model for Twitter, was the best setup, achieving for two languages and 71.2% and 68.1% f1-score for English and Arabic, respectively.", "venue": "https://doi.org/10.1109/icicis58388.2023.10391141", "label": 0}, {"loc": [3.1062541007995605, -0.1094282940030098], "openalex_id": "https://openalex.org/W4388823522", "title": "Chatgpt & Google Bard AI: A Comparative Study From Student's Perspective.", "authors": "Kostis Giannakopoulos, Argyro Kavadella, Anas Aaqel Salim, Vassilis Stamatopoulos, Eleftherios G. Kaklamanos", "abstract": "Background The increasing application of generative artificial intelligence large language models (LLMs) in various fields, including dentistry, raises questions about their accuracy. Objective This study aims to comparatively evaluate the answers provided by 4 LLMs, namely Bard (Google LLC), ChatGPT-3.5 and ChatGPT-4 (OpenAI), and Bing Chat (Microsoft Corp), to clinically relevant questions from the field of dentistry. Methods The LLMs were queried with 20 open-type, clinical dentistry\u2013related questions from different disciplines, developed by the respective faculty of the School of Dentistry, European University Cyprus. The LLMs\u2019 answers were graded 0 (minimum) to 10 (maximum) points against strong, traditionally collected scientific evidence, such as guidelines and consensus statements, using a rubric, as if they were examination questions posed to students, by 2 experienced faculty members. The scores were statistically compared to identify the best-performing model using the Friedman and Wilcoxon tests. Moreover, the evaluators were asked to provide a qualitative evaluation of the comprehensiveness, scientific accuracy, clarity, and relevance of the LLMs\u2019 answers. Results Overall, no statistically significant difference was detected between the scores given by the 2 evaluators; therefore, an average score was computed for every LLM. Although ChatGPT-4 statistically outperformed ChatGPT-3.5 (P=.008), Bing Chat (P=.049), and Bard (P=.045), all models occasionally exhibited inaccuracies, generality, outdated content, and a lack of source references. The evaluators noted instances where the LLMs delivered irrelevant information, vague answers, or information that was not fully accurate. Conclusions This study demonstrates that although LLMs hold promising potential as an aid in the implementation of evidence-based dentistry, their current limitations can lead to potentially harmful health care decisions if not used judiciously. Therefore, these tools should not replace the dentist\u2019s critical thinking and in-depth understanding of the subject matter. Further research, clinical validation, and model improvements are necessary for these tools to be fully integrated into dental practice. Dental practitioners must be aware of the limitations of LLMs, as their imprudent use could potentially impact patient care. Regulatory measures should be established to oversee the use of these evolving technologies.", "venue": "Journal of Medical Internet Research", "label": 13}, {"loc": [3.933790922164917, -3.6389615535736084], "openalex_id": "https://openalex.org/W4391878106", "title": "Fine-tuning BERT-based Models for Negative Content Identification on Indonesian Tweets", "authors": "Ahmad Fathan Hidayatullah, Kassim Kalinaki, Muhammad Muzamil Aslam, Rufai Yusuf Zakari, Wasswa Shafik", "abstract": "Social media platforms like Twitter have become substantial sources of user-generated content, enabling people to easily express their emotions and opinions. However, this freedom has increased the spread of harmful content, such as abusive language, sexually explicit content, and hate speech. This poses challenges for content moderation and user safety. In order to guarantee a safer, more receptive, and more pleasurable online environment for users of all ages, it is essential to develop a system capable of recognizing abusive and sexually explicit material on Twitter. Despite the growing importance of content moderation, a research gap exists in Indonesian tweets, with limited comprehensive studies on negative content identification. This research addresses this gap by evaluating the effectiveness of Bidirectional Encoder Representations from Transformers (BERT) models in the Indonesian context, which were primarily developed for English and other languages. This research aims to identify abusive, adult, and neutral content in Indonesian tweets by examining and fine-tuning BERT-based models to maintain a healthy online environment for optimal tweet classification. Based on our experiments, the BERT-based models showed promising results in detecting negative tweets. Among the BERT-based models, IndoBERTweet achieved the best precision, recall, and macro F1 scores with 97.03, 96.88, and 96.94, respectively.", "venue": "https://doi.org/10.1109/icitda60835.2023.10427046", "label": 0}, {"loc": [8.120173454284668, 3.3305041790008545], "openalex_id": "https://openalex.org/W4388748321", "title": "Do Localization Methods Actually Localize Memorized Data in LLMs? A Tale of Two Benchmarks", "authors": "Ting-Yun Chang, Jesse Thomason, Robin Jia", "abstract": "The concept of localization in LLMs is often mentioned in prior work; however, methods for localization have never been systematically and directly evaluated. We propose two complementary benchmarks that evaluate the ability of localization methods to pinpoint LLM components responsible for memorized data. In our INJ benchmark, we actively inject a piece of new information into a small subset of LLM weights, enabling us to directly evaluate whether localization methods can identify these \"ground truth\" weights. In our DEL benchmark, we evaluate localization by measuring how much dropping out identified neurons deletes a memorized pretrained sequence. Despite their different perspectives, our two benchmarks yield consistent rankings of five localization methods. Methods adapted from network pruning perform well on both benchmarks, and all evaluated methods show promising localization ability. On the other hand, even successful methods identify neurons that are not specific to a single memorized sequence.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.943730592727661, -0.7302712798118591], "openalex_id": "https://openalex.org/W4391376425", "title": "Fine-Tuned Large Language Models for Symptom Recognition from Spanish Clinical Text", "authors": "Mai A. Shaaban, Abbas Akkasi, Adnan Khan, Majid Komeili, Mohammad Yaqub", "abstract": "Abstract The accurate recognition of symptoms in clinical reports is significantly important in the fields of healthcare and biomedical natural language processing. These entities serve as essential building blocks for clinical information extraction, enabling retrieval of critical medical insights from vast amounts of textual data. Furthermore, the ability to identify and categorize these entities is fundamental for developing advanced clinical decision support systems, aiding healthcare professionals in diagnosis and treatment planning. In this study, we participated in SympTEMIST \u2013 a shared task on detection of symptoms, signs and findings in Spanish medical documents. We combine a set of large language models finetuned with the data released by the task's organizers. This article is part of the Proceedings of the BioCreative VIII Challenge and Workshop: Curation and Evaluation in the era of Generative Models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.6481106281280518, -0.02012566663324833], "openalex_id": "https://openalex.org/W4388514763", "title": "Topology Only Pre-Training: Towards Generalised Multi-Domain Graph Models", "authors": "Alex Davies, Riku Green, Nirav Ajmeri, Telmo M. Silva Filho", "abstract": "The principal benefit of unsupervised representation learning is that a pre-trained model can be fine-tuned where data or labels are scarce. Existing approaches for graph representation learning are domain specific, maintaining consistent node and edge features across the pre-training and target datasets. This has precluded transfer to multiple domains. We present Topology Only Pre-Training (ToP), a graph pre-training method based on node and edge feature exclusion. We show positive transfer on evaluation datasets from multiple domains, including domains not present in pre-training data, running directly contrary to assumptions made in contemporary works. On 75% of experiments, ToP models perform significantly $p \\leq 0.01$ better than a supervised baseline. Performance is significantly positive on 85.7% of tasks when node and edge features are used in fine-tuning. We further show that out-of-domain topologies can produce more useful pre-training than in-domain. Under ToP we show better transfer from non-molecule pre-training, compared to molecule pre-training, on 79% of molecular benchmarks. Against the limited set of other generalist graph models ToP performs strongly, including against models with many orders of magnitude larger. These findings show that ToP opens broad areas of research in both transfer learning on scarcely populated graph domains and in graph foundation models.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.86167573928833, 2.7010984420776367], "openalex_id": "https://openalex.org/W4398745802", "title": "TEXAS INTELLECTUAL PROPERTY LAW JOURNAL", "authors": "Ivan Png", "abstract": "Replication package for Hou, Yun, Ivan PL Png, and Xi Xiong. \"The Federal Circuit Enriched Patent Owners Without Eliciting Better Inventions.\" Texas Intellectual Property Law Journal 31, no. 2 (2023): 295-326.", "venue": "Harvard Dataverse", "label": 0}, {"loc": [5.463810920715332, 3.2903354167938232], "openalex_id": "https://openalex.org/W4388052837", "title": "Informed Digital Systems: Knowledge Procurement, Gate/Keeping, and Experience", "authors": "Md. Saifuddin Faruk, Seb J. Savory", "abstract": "Digital coherent transceivers have developed to the stage that they can monitor the physical state of an optical network and thus are capable of generating data to build measurement-informed physical layer models. After reviewing the measurement capabilities of coherent transceivers, we discuss different modeling approaches including physics-based models, data-driven models as well as hybrid models that incorporate elements of both physics-based and data-driven models. Having reviewed both the measurement capabilities and the modeling methodologies, the salient features of building digital twins based on measurement informed models for optical fiber communication systems are discussed.", "venue": "Journal of Lightwave Technology", "label": 0}, {"loc": [6.280559062957764, 3.0850918292999268], "openalex_id": "https://openalex.org/W4387965772", "title": "Multimodal predictive model for analyzing news data", "authors": "Iman AbouHassan, Nikola Kasabov, Vinayak Jagtap, Parag Kulkarni", "abstract": "Abstract In a first study, this paper argues and demonstrates that spiking neural networks (SNN) can be successfully used for predictive and explainable modelling of multimodal streaming data. The paper proposes a new method, where both time series and on-line news are integrated as numerical streaming data in the same time domain and then used to train incrementally a SNN model. The connectivity and the spiking activity of the SNN are then analyzed through clustering and dynamic graph extraction to reveal on-line interaction between all input variables in regard to the predicted one. The paper answers the main research question of how to understand the dynamic interaction of time series and on-line news through their integrative modelling. It offers a new method to evaluate the efficiency of using on-line news on the predictive modelling of time series. Results on financial stock time series and online news are presented. In contrast to traditional machine learning techniques, the method reveals the dynamic interaction between stock variables and news and their dynamic impact on model accuracy when compared to models that do not use news information. Along with the used financial data, the method is applicable to a wide range of other multimodal time series and news data, such as economic, medical, environmental and social. The proposed method, being based on SNN, promotes the use of massively parallel and low energy neuromorphic hardware for multivariate on-line data modelling.", "venue": "Scientific Reports", "label": 24}, {"loc": [6.03106164932251, 5.080202579498291], "openalex_id": "https://openalex.org/W4387929317", "title": "DataComp Challenge", "authors": "Shuhei Yokoo, Peifei Zhu, Yuchi Ishikawa, M. Tanaka, Masayoshi Kondo, Hirokatsu Kataoka", "abstract": "Large web crawl datasets have already played an important role in learning multimodal features with high generalization capabilities. However, there are still very limited studies investigating the details or improvements of data design. Recently, a DataComp challenge has been designed to propose the best training data with the fixed models. This paper presents our solution to both filtering track and BYOD track of the DataComp challenge. Our solution adopts large multimodal models CLIP and BLIP-2 to filter and modify web crawl data, and utilize external datasets along with a bag of tricks to improve the data quality. Experiments show our solution significantly outperforms DataComp baselines (filtering track: 6.6% improvement, BYOD track: 48.5% improvement).", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.957054615020752, -3.5924017429351807], "openalex_id": "https://openalex.org/W4390337221", "title": "Adult Content Detection on Indonesian Tweets by Fine-tuning Transformer-based Models", "authors": "Ahmad Fathan Hidayatullah, Rosyzie Anna Awg Haji Mohd Apong, Daphne Teck Ching Lai, Atika Qazi", "abstract": "The prevalence of adult content on social media has harmful effects on the moral values of young individuals. Therefore, effectively filtering inappropriate content on social media like Twitter is essential. Researchers have utilized machine learning and natural language processing techniques to develop automated systems that can identify adult content. However, using Transformer to detect adult content in the Indonesian language has yet to be thoroughly explored. Identifying adult content in a text is relatively challenging due to its subjective and context-dependent nature. The same words can be used in explicit or non-explicit contexts depending on the context or intended meaning of the surrounding text. This study aims to explore the implementation of fine-tuned Transformer-based models for identifying adult and sexually explicit content in Indonesian Twitter texts. We fine-tuned five pre-trained Transformer-based models: IndoBERT, IndoBERTweet, mBERT, XLM-RoBERTa, and DistilmBERT. Based on our experiments, we can see that all the models showed effectiveness in accurately classifying adult and non-adult content. Among the Transformer-based models, XLM-RoBERTa and IndoBERTweet demonstrated effective adult content identification in Indonesian tweets compared to other pre-trained models. XLM-RoBERTa showed a slightly better performance, which can be attributed to its larger size and advanced training techniques.", "venue": "https://doi.org/10.1109/aciis59385.2023.10367283", "label": 0}, {"loc": [3.090052366256714, 0.33891215920448303], "openalex_id": "https://openalex.org/W4387692104", "title": "Assessing Bias in AI Chatbot Responses", "authors": "Jiyeong Kim, Zhuo Ran Cai, Michael L. Chen, Julia F. Simard, Eleni Linos", "abstract": "This cross-sectional study compares clinician and artificial intelligence (AI) chatbot responses to patient vignettes used to identify bias in medical decisions.", "venue": "JAMA Network Open", "label": 0}, {"loc": [2.888518810272217, -0.474193811416626], "openalex_id": "https://openalex.org/W4387687904", "title": "Qilin-Med: Towards Advanced Chinese Medical Large Language Model", "authors": "Qichen Ye, Junling Liu, Dading Chong, Peilin Zhou, Yining Hua, Andrew Liu", "abstract": "Integrating large language models (LLMs) into healthcare holds great potential but faces challenges. Pre-training LLMs from scratch for domains like medicine is resource-heavy and often unfeasible. On the other hand, sole reliance on Supervised Fine-tuning (SFT) can result in overconfident predictions and may not tap into domain-specific insights. In response, we present a multi-stage training method combining Domain-specific Continued Pre-training (DCPT), SFT, and Direct Preference Optimization (DPO). In addition, we publish a 3Gb Chinese Medicine (ChiMed) dataset, encompassing medical question answering, plain texts, knowledge graphs, and dialogues, segmented into three training stages. The medical LLM trained with our pipeline, Qilin-Med, shows substantial performance improvement. In the CPT and SFT phases, Qilin-Med achieved 38.4% and 40.0% accuracy on the CMExam test set, respectively. It outperformed the basemodel Baichuan-7B (accuracy: 33.5%), by 7.5%. In the DPO phase, it scored 16.66 in BLEU-1 and 27.44 in ROUGE-1 on the Huatuo-26M test set, bringing further improvement to the SFT phase (12.69 in BLEU-1 and 24.21 in ROUGE-1). Additionally, we have further enhanced the model's performance through the Retrieval Augmented Generation (RAG) approach. Experiments demonstrate that Qilin-Med-RAG achieves an accuracy rate of 42.8% on CMExam. These results highlight the contribution of our novel training approach in building LLMs for medical applications.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.1277875900268555, 2.1123971939086914], "openalex_id": "https://openalex.org/W4387580614", "title": "the landscape to quantify creative", "authors": "Ijaz Ul Haq, Manoli Pifarr\u00e9", "abstract": "The growing body of creativity research involves Artificial Intelligence (AI) and Machine learning (ML) approaches to automatically evaluating creative solutions. However, numerous challenges persist in evaluating the creativity dimensions and the methodologies employed for automatic evaluation. This paper contributes to this research gap with a scoping review that maps the Natural Language Processing (NLP) approaches to computations of different creativity dimensions. The review has two research objectives to cover the scope of automatic creativity evaluation: to identify different computational approaches and techniques in creativity evaluation and, to analyze the automatic evaluation of different creativity dimensions. As a first result, the scoping review provides a categorization of the automatic creativity research in the reviewed papers into three NLP approaches, namely: text similarity, text classification, and text mining. This categorization and further compilation of computational techniques used in these NLP approaches help ameliorate their application scenarios, research gaps, research limitations, and alternative solutions. As a second result, the thorough analysis of the automatic evaluation of different creativity dimensions differentiated the evaluation of 25 different creativity dimensions. Attending similarities in definitions and computations, we characterized seven core creativity dimensions, namely: novelty, value, flexibility, elaboration, fluency, feasibility, and others related to playful aspects of creativity. We hope this scoping review could provide valuable insights for researchers from psychology, education, AI, and others to make evidence-based decisions when developing automated creativity evaluation.", "venue": "Frontiers in Education", "label": 0}, {"loc": [3.0570409297943115, 0.29677775502204895], "openalex_id": "https://openalex.org/W4387232979", "title": "ChatGPT and GPT-4: utilities in the legal sector", "authors": "Dana Brin, Vera Sorin, Akhil Vaid, Ali Soroush, Benjamin S. Glicksberg, Alexander W. Charney, Girish N. Nadkarni, Eyal Klang", "abstract": "Abstract The United States Medical Licensing Examination (USMLE) has been a subject of performance study for artificial intelligence (AI) models. However, their performance on questions involving USMLE soft skills remains unexplored. This study aimed to evaluate ChatGPT and GPT-4 on USMLE questions involving communication skills, ethics, empathy, and professionalism. We used 80 USMLE-style questions involving soft skills, taken from the USMLE website and the AMBOSS question bank. A follow-up query was used to assess the models\u2019 consistency. The performance of the AI models was compared to that of previous AMBOSS users. GPT-4 outperformed ChatGPT, correctly answering 90% compared to ChatGPT\u2019s 62.5%. GPT-4 showed more confidence, not revising any responses, while ChatGPT modified its original answers 82.5% of the time. The performance of GPT-4 was higher than that of AMBOSS's past users. Both AI models, notably GPT-4, showed capacity for empathy, indicating AI's potential to meet the complex interpersonal, ethical, and professional demands intrinsic to the practice of medicine.", "venue": "Scientific Reports", "label": 24}, {"loc": [6.171591758728027, 5.540850639343262], "openalex_id": "https://openalex.org/W4388640489", "title": "Applications of AI in Computer Vision and NLP", "authors": "Suresh Babu Rajasekaran", "abstract": "Computer vision and language AI (or speech AI) are two rapidly growing fields that have the potential to revolutionize the way when interacted with technology. When it comes to artificial intelligence, there are two main branches: computer vision, which focuses on visual data, and language AI (or voice AI), which focuses on both written and spoken language. The purpose of this research is to offer a snapshot of where things stand in this area right now, highlighting recent research and key findings in both computer vision and language AI (or speech AI). The literature review will focus on the most recent advancements and trends in these fields, as well as identify gaps in the literature that need further research.", "venue": "Journal of Artificial Intelligence & Cloud Computing", "label": 0}, {"loc": [2.945537567138672, -0.7694290280342102], "openalex_id": "https://openalex.org/W4387185160", "title": "CERM: Context-aware Literature-based Discovery via Sentiment Analysis", "authors": "Julio Christian Young, Uchenna Akujuobi", "abstract": "Driven by the abundance of biomedical publications, we introduce a sentiment analysis task to understand food-health relationship. Prior attempts to incorporate health into recipe recommendation and analysis systems have primarily focused on ingredient nutritional components or utilized basic computational models trained on curated labeled data. Enhanced models that capture the inherent relationship between food ingredients and biomedical concepts can be more beneficial for food-related research, given the wealth of information in biomedical texts. Considering the costly data labeling process, these models should effectively utilize both labeled and unlabeled data. This paper introduces Entity Relationship Sentiment Analysis (ERSA), a new task that captures the sentiment of a text based on an entity pair. ERSA extends the widely studied Aspect Based Sentiment Analysis (ABSA) task. Specifically, our study concentrates on the ERSA task applied to biomedical texts, focusing on (entity-entity) pairs of biomedical and food concepts. ERSA poses a significant challenge compared to traditional sentiment analysis tasks, as sentence sentiment may not align with entity relationship sentiment. Additionally, we propose CERM, a semi-supervised architecture that combines different word embeddings to enhance the encoding of the ERSA task. Experimental results showcase the model\u2019s efficiency across diverse learning scenarios.", "venue": "Frontiers in artificial intelligence and applications", "label": 0}, {"loc": [4.788529872894287, 0.9714277386665344], "openalex_id": "https://openalex.org/W4387244787", "title": "On the Question of Authorship in Large Language Models", "authors": "Carlin Soos, Levon Haroutunian", "abstract": "The adoption of pre-trained large language models (LLMs), like ChatGPT, across an increasingly diverse range of tasks and domains poses significant challenges for authorial attribution and other basic knowledge organization practices. This paper examines the theoretical and practical issues introduced by LLMs and describes how their use erodes the supposedly firm boundaries separating specific works and creators. Building upon the author-as-node framework proposed by Soos and Leazer (2020), we compare works created with and without the use of LLMs; ultimately, we argue that the issues associated with these novel tools are indicative of preexisting limitations within standard entity-relationship models. As the growing popularity of generative AI raises concerns about plagiarism, academic integrity, and intellectual property, we encourage a reevaluation of reductive work/creator associations and advocate for the adoption of a more expansive approach to authorship.", "venue": "NASKO", "label": 0}, {"loc": [6.30445671081543, 4.719836235046387], "openalex_id": "https://openalex.org/W4386436118", "title": "Long-Term Ad Memorability: Understanding & Generating Memorable Ads", "authors": "S I Harini, Somesh Singh, Yaman Kumar, Aanisha Bhattacharyya, Veeky Baths, Changyou Chen, Rajiv Ratn Shah, Balaji Krishnamurthy", "abstract": "Despite the importance of long-term memory in marketing and brand building, until now, there has been no large-scale study on the memorability of ads. All previous memorability studies have been conducted on short-term recall on specific content types like action videos. On the other hand, long-term memorability is crucial for the advertising industry, and ads are almost always highly multimodal. Therefore, we release the first memorability dataset, LAMBDA, consisting of 1749 participants and 2205 ads covering 276 brands. Running statistical tests over different participant subpopulations and ad types, we find many interesting insights into what makes an ad memorable, e.g., fast-moving ads are more memorable than those with slower scenes; people who use ad-blockers remember a lower number of ads than those who don't. Next, we present a model, Henry, to predict the memorability of a content. Henry achieves state-of-the-art performance across all prominent literature memorability datasets. It shows strong generalization performance with better results in 0-shot on unseen datasets. Finally, with the intent of memorable ad generation, we present a scalable method to build a high-quality memorable ad generation model by leveraging automatically annotated data. Our approach, SEED (Self rEwarding mEmorability Modeling), starts with a language model trained on LAMBDA as seed data and progressively trains an LLM to generate more memorable ads. We show that the generated advertisements have 44% higher memorability scores than the original ads. We release this large-scale ad dataset, UltraLAMBDA, consisting of 5 million ads. Our code and the datasets, LAMBDA and UltraLAMBDA, are open-sourced at https://behavior-in-the-wild.github.io/memorability.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.754086971282959, -0.5245553851127625], "openalex_id": "https://openalex.org/W4386165096", "title": "LARGE LANGUAGE MODELS FOR TEXT CLASSIFICATION: FROM ZERO-SHOT LEARNING TO INSTRUCTION-TUNING", "authors": "Youngjin Chae, Thomas Davidson", "abstract": "Advances in large language models (LLMs) have transformed the field of natural language processing and have enormous potential for social scientific analysis. We explore the application of LLMs to supervised text classification. As a case study, we consider stance detection and examine variation in predictive accuracy across different architectures, training regimes, and task specifications. We compare ten models ranging in size from 86 million to 1.7 trillion parameters and four distinct training regimes: prompt-based zero-shot learning; few-shot learning; fine-tuning; and instruction-tuning. The largest models generally offer the best predictive performance, but fine-tuning smaller models is a competitive solution due to their relatively high accuracy and low cost. For complex prediction tasks, instruction-tuned open-weights models can perform well, rivaling state-of-the-art commercial models. We provide recommendations for the use of LLMs for text classification in sociological research and discuss the limitations and challenges related to the use of these technologies.", "venue": "https://doi.org/10.31235/osf.io/sthwk", "label": 0}, {"loc": [2.7224621772766113, 2.6464645862579346], "openalex_id": "https://openalex.org/W4386349145", "title": "The Ethics and Challenges of Legal Personhood for AI", "authors": "Louisa McDonald", "abstract": "Recent advances in artificial intelligence (AI) and machine learning have prompted discussion about whether conventional liability laws can be applicable to AI systems which manifest a high degree of autonomy. Users and developers of such AI systems may meet neither the epistemic (sufficient degree of awareness of what is happening) nor control (control over the actions performed) conditions of personal responsibility for the actions of the system at hand, and therefore, conventional liability schemes may seem to be inapplicable[1]. The recently adopted AI Liability Directive [2022] has sought to adapt EU law to the challenges to conventional liability schemes posed by AI systems by imposing a system of strict, rather than fault-based liability, for AI systems. The goal of this is to be able to more easily hold developers, producers, and users of AI technologies accountable, requiring them to explain how AI systems were built and trained. The Directive aims to make it easier for people and companies harmed by AI systems to sue those responsible for the AI systems for damages. However, the Directive seems to ignore the potential injustice that could result from producers and developers being held accountable for actions caused by AI systems which they are neither aware of nor have sufficient control over. In this essay, I will critically assess the Directive\u2019s system of fault-based liability for AI systems and argue that, whilst such a system may confer some instrumental advantages on behalf of those suing for damages caused by AI systems, it risks causing injustice on the part of developers and producers by making them liable for events they could neither control nor predict. This is likely to risk both producing unjust outcomes and hindering progress in AI development. Instead, following Visa Kurki\u2019s analysis of legal personhood as a cluster concept divided into passive and active incidents, I will argue that some AI systems ought to be granted a limited form of legal personhood, because they meet some of the relevant criteria for active legal personhood, such as the capacity to perform acts-in-the-law. The legal personhood I propose for AI systems is a kind of dependent legal personhood analogous to that granted to corporations. Such a form of legal personhood would not absolve developers and producers from liability for damages (where such liability is applicable), but at the same time, it would not risk unjustly holding producers and developers liable for actions of an AI system. [1] Mark Coeckelbergh, \"Artificial Intelligence, Responsibility Attribution, and a Relational Justification of Explainability.\" Science and Engineering Ethics, (2020): 2054", "venue": "St Andrews Law Journal", "label": 0}, {"loc": [4.120890140533447, 2.3030846118927], "openalex_id": "https://openalex.org/W4385739457", "title": "Gender Identity and Representation in the Context of Economic Development in India", "authors": "Ankita Chakrabarti, Bhaswati Das", "abstract": "ABSTRACTRecognition and citizenship issues play pivotal roles in understanding the complex interaction between different forms of inequalities. Citizenship should be treated as a practice intimately linked with individuals\u2019 identities and rights, their sense of belonging and their actual nature of participation in the different spheres of their life. Exclusion is not just deprivation from the more tangible economic and social processes but also denying people their voice and their right to be unique. In this context, the Transgender Persons (Protection of Rights) Act concerning India\u2019s transgender community lies within the crucial junctures of identity politics and the country\u2019s legal and social structures. This work critically analyses the TG Act and raises few questions on the nature of recognition given to transgender individuals. Does a transgender person get citizenship that guarantees representation and equality? To what extent does the new Act do justice to the transgender community in living as a member of the society and not just as a product of \u2018othering\u2019? The paper concludes that recognition is not just for citizenship and identity rights \u2013 it is the right to be different but equal.KEYWORDS: Identityintersectionalityjurisprudencetransgenderexclusion Disclosure statementNo potential conflict of interest was reported by the author(s).Additional informationFundingThe author received no grant for this research paper.Notes on contributorsAnkita ChakrabartiAnkita Chakrabarti I am a senior research fellow at the Centre for the Study of Regional Development, School of Social Sciences, Jawaharlal Nehru University, New Delhi. My thesis title is \u2018 Access to primary healthcare among the transfeminine population in West Bengal, India.\u2019 I did my graduation from Jawaharlal Nehru University, New Delhi. My broad research interests include, gender relation, family demography, public health and mixed methods research methodologies. I have published few articles in various reputed, national and international journals regarding women autonomy and status. I actively write in various national magazines on current relevant issues regarding the transgender population in India. I have received many international grants for attending conferences and workshops. I am Research Convenor at the Civilian Welfare Foundation, a non-profit organisation in Kolkata, working on alternative education, disability and gender. I am also a member of the State Planning Commission of Chhattisgarh, working on \u2018Rehabilitation of the third gender\u2019.Bhaswati DasBhaswati Das is Faculty in Population Studies at the Jawaharlal Nehru University, New Delhi India. Her area of interest includes study of population in the context of development. Her major area of thrust is wellbeing, migration, gender and reproductive health. She has so far 30 publications published in leading journals on population and development issues. Currently, she is the Associate Editor of the Journal of Health & Population: Perspectives and Issues (HPPI) of the National Institute of Health and Family Welfare, Ministry of Health and Family Welfare, GoI. So far, 25 students have been awarded Ph.D degrees under her supervision. Bhaswati\u2019s current research projects are as follows: Gendered Ageing in India. Gender Atlas of India, Choice of Destination by the Migrants from Bangladesh and Consequence of Male-selective Migration from Rural West Bengal.", "venue": "Journal of Gender Studies", "label": 0}, {"loc": [9.017794609069824, 1.420371651649475], "openalex_id": "https://openalex.org/W4385634976", "title": "The Open Web Index: Crawling and Indexing the Web for Public Use", "authors": "Michael Granitzer, Stefan Voigt, N Fathima, Martin Golasowski, Christian Guetl, Tobias Hecking, Gijs Hendriksen, Djoerd Hiemstra, Jan Martinovi\u010d, Jelena Mitrovi\u0107, Izidor Mlakar, Stavros Moiras, Alexander Nussbaumer, Per \u00d6ster, Martin Potthast, Marjana Sen\u010dar Srdi\u010d, Sharikadze Megi, Kate\u0159ina Slaninov\u00e1, Benno Stein, Arjen P. de Vries, V\u00edt Vondr\u00e1k, Andreas Wagner, Saber Zerhoudi", "abstract": "Abstract Web search is a crucial technology for the digital economy. Dominated by a few gatekeepers focused on commercial success, however, web publishers have to optimize their content for these gatekeepers, resulting in a closed ecosystem of search engines as well as the risk of publishers sacrificing quality. To encourage an open search ecosystem and offer users genuine choice among alternative search engines, we propose the development of an Open Web Index (OWI). We outline six core principles for developing and maintaining an open index, based on open data principles, legal compliance, and collaborative technology development. The combination of an open index with what we call declarative search engines will facilitate the development of vertical search engines and innovative web data products (including, e.g., large language models), enabling a fair and open information space. This framework underpins the EU\u2010funded project OpenWebSearch.EU, marking the first step towards realizing an Open Web Index.", "venue": "Journal of the Association for Information Science and Technology", "label": 0}, {"loc": [6.1976728439331055, 5.778915882110596], "openalex_id": "https://openalex.org/W4385436442", "title": "Build a Large Language Model (From Scratch)", "authors": "Yuan Hu, Jianlong Yuan, Congcong Wen, Xiaonan L\u00fc, Xiang Li", "abstract": "The emergence of large-scale large language models, with GPT-4 as a prominent example, has significantly propelled the rapid advancement of artificial general intelligence and sparked the revolution of Artificial Intelligence 2.0. In the realm of remote sensing (RS), there is a growing interest in developing large vision language models (VLMs) specifically tailored for data analysis in this domain. However, current research predominantly revolves around visual recognition tasks, lacking comprehensive, large-scale image-text datasets that are aligned and suitable for training large VLMs, which poses significant challenges to effectively training such models for RS applications. In computer vision, recent research has demonstrated that fine-tuning large vision language models on small-scale, high-quality datasets can yield impressive performance in visual and language understanding. These results are comparable to state-of-the-art VLMs trained from scratch on massive amounts of data, such as GPT-4. Inspired by this captivating idea, in this work, we build a high-quality Remote Sensing Image Captioning dataset (RSICap) that facilitates the development of large VLMs in the RS field. Unlike previous RS datasets that either employ model-generated captions or short descriptions, RSICap comprises 2,585 human-annotated captions with rich and high-quality information. This dataset offers detailed descriptions for each image, encompassing scene descriptions (e.g., residential area, airport, or farmland) as well as object information (e.g., color, shape, quantity, absolute position, etc). To facilitate the evaluation of VLMs in the field of RS, we also provide a benchmark evaluation dataset called RSIEval. This dataset consists of human-annotated captions and visual question-answer pairs, allowing for a comprehensive assessment of VLMs in the context of RS.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.3414459228515625, 5.847543716430664], "openalex_id": "https://openalex.org/W4384820618", "title": "Language Grounding in Vision", "authors": "Jiajun Deng, Zhengyuan Yang, Daqing Liu, Tianlang Chen, Wengang Zhou, Yanyong Zhang, Houqiang Li, Wanli Ouyang", "abstract": "In this work, we explore neat yet effective Transformer-based frameworks for visual grounding. The previous methods generally address the core problem of visual grounding, i.e., multi-modal fusion and reasoning, with manually-designed mechanisms. Such heuristic designs are not only complicated but also make models easily overfit specific data distributions. To avoid this, we first propose TransVG, which establishes multi-modal correspondences by Transformers and localizes referred regions by directly regressing box coordinates. We empirically show that complicated fusion modules can be replaced by a simple stack of Transformer encoder layers with higher performance. However, the core fusion Transformer in TransVG is stand-alone against uni-modal encoders, and thus should be trained from scratch on limited visual grounding data, which makes it hard to be optimized and leads to sub-optimal performance. To this end, we further introduce TransVG++ to make two-fold improvements. For one thing, we upgrade our framework to a purely Transformer-based one by leveraging Vision Transformer (ViT) for vision feature encoding. For another, we devise Language Conditioned Vision Transformer that removes external fusion modules and reuses the uni-modal ViT for vision-language fusion at the intermediate layers. We conduct extensive experiments on five prevalent datasets, and report a series of state-of-the-art records.", "venue": "IEEE Transactions on Pattern Analysis and Machine Intelligence", "label": 0}, {"loc": [3.2242891788482666, 1.332847237586975], "openalex_id": "https://openalex.org/W4384154918", "title": "generative artificial intelligence", "authors": "Shakked Noy, Whitney Zhang", "abstract": "We examined the productivity effects of a generative artificial intelligence (AI) technology, the assistive chatbot ChatGPT, in the context of midlevel professional writing tasks. In a preregistered online experiment, we assigned occupation-specific, incentivized writing tasks to 453 college-educated professionals and randomly exposed half of them to ChatGPT. Our results show that ChatGPT substantially raised productivity: The average time taken decreased by 40% and output quality rose by 18%. Inequality between workers decreased, and concern and excitement about AI temporarily rose. Workers exposed to ChatGPT during the experiment were 2 times as likely to report using it in their real job 2 weeks after the experiment and 1.6 times as likely 2 months after the experiment.", "venue": "Science", "label": 0}, {"loc": [4.0149149894714355, 0.9131749868392944], "openalex_id": "https://openalex.org/W4383737134", "title": "Ethical Challenges and Bias in NLP Models: A Python-Based Investigation", "authors": "Muhammad Usman Hadi, qasem al tashi, Rizwan Qureshi, Abbas Shah, Amgad Muneer, Muhammad Irfan, Anas Zafar, Muhammad Bilal Shaikh, Naveed Akhtar, Jia Wu, Seyedali Mirjalili", "abstract": "<p>Within the vast expanse of computerized language processing, a revolutionary entity known as Large Language Models (LLMs) has emerged, wielding immense power in its capacity to comprehend intricate linguistic patterns and conjure coherent and contextually fitting responses. Large language models (LLMs) are a type of artificial intelligence (AI) that have emerged as powerful tools for a wide range of tasks, including natural language processing (NLP), machine translation, and question-answering. This survey paper provides a comprehensive overview of LLMs, including their history, architecture, training methods, applications, and challenges. The paper begins by discussing the fundamental concepts of generative AI and the architecture of generative pre- trained transformers (GPT). It then provides an overview of the history of LLMs, their evolution over time, and the different training methods that have been used to train them. The paper then discusses the wide range of applications of LLMs, including medical, education, finance, and engineering. It also discusses how LLMs are shaping the future of AI and how they can be used to solve real-world problems. The paper then discusses the challenges associated with deploying LLMs in real-world scenarios, including ethical considerations, model biases, interpretability, and computational resource requirements. It also highlights techniques for enhancing the robustness and controllability of LLMs, and addressing bias, fairness, and generation quality issues. Finally, the paper concludes by highlighting the future of LLM research and the challenges that need to be addressed in order to make LLMs more reliable and useful. This survey paper is intended to provide researchers, practitioners, and enthusiasts with a comprehensive understanding of LLMs, their evolution, applications, and challenges. By consolidating the state-of-the-art knowledge in the field, this survey serves as a valuable resource for further advancements in the development and utilization of LLMs for a wide range of real-world applications. The GitHub repo for this project is available at https://github.com/anas-zafar/LLM-Survey</p>", "venue": "https://doi.org/10.36227/techrxiv.23589741.v1", "label": 0}, {"loc": [2.456007719039917, 1.4626415967941284], "openalex_id": "https://openalex.org/W4383312437", "title": "AI University Education", "authors": "Cecilia Ka Yuk Chan", "abstract": "Abstract This study aims to develop an AI education policy for higher education by examining the perceptions and implications of text generative AI technologies. Data was collected from 457 students and 180 teachers and staff across various disciplines in Hong Kong universities, using both quantitative and qualitative research methods. Based on the findings, the study proposes an AI Ecological Education Policy Framework to address the multifaceted implications of AI integration in university teaching and learning. This framework is organized into three dimensions: Pedagogical, Governance, and Operational. The Pedagogical dimension concentrates on using AI to improve teaching and learning outcomes, while the Governance dimension tackles issues related to privacy, security, and accountability. The Operational dimension addresses matters concerning infrastructure and training. The framework fosters a nuanced understanding of the implications of AI integration in academic settings, ensuring that stakeholders are aware of their responsibilities and can take appropriate actions accordingly.", "venue": "International Journal of Educational Technology in Higher Education", "label": 0}, {"loc": [7.596341133117676, -1.2114298343658447], "openalex_id": "https://openalex.org/W4383553149", "title": "A Survey of Neural Machine Translation based on Knowledge Distillation", "authors": "Feng Li, Jingxian Chen, Xuejun Zhang", "abstract": "Non-autoregressive neural machine translation (NAMT) has received increasing attention recently in virtue of its promising acceleration paradigm for fast decoding. However, these splendid speedup gains are at the cost of accuracy, in comparison to its autoregressive counterpart. To close this performance gap, many studies have been conducted for achieving a better quality and speed trade-off. In this paper, we survey the NAMT domain from two new perspectives, i.e., target dependency management and training strategies arrangement. Proposed approaches are elaborated at length, involving five model categories. We then collect extensive experimental data to present abundant graphs for quantitative evaluation and qualitative comparison according to the reported translation performance. Based on that, a comprehensive performance analysis is provided. Further inspection is conducted for two salient problems: target sentence length prediction and sequence-level knowledge distillation. Accumulative reinvestigation of translation quality and speedup demonstrates that non-autoregressive decoding may not run fast as it seems and still lacks authentic surpassing for accuracy. We finally prospect potential work from inner and outer facets and call for more practical and warrantable studies for the future.", "venue": "Electronics", "label": 19}, {"loc": [8.420234680175781, 2.364553213119507], "openalex_id": "https://openalex.org/W4382319574", "title": "Beyond Scale: The Diversity Coefficient as a Data Quality Metric for Variability in Natural Language Data", "authors": "A. Lee, Brando Miranda, Sanmi Koyejo", "abstract": "Current trends in pre-training Large Language Models (LLMs) primarily focus on the scaling of model and dataset size. While the quality of pre-training data is considered an important factor for training powerful LLMs, it remains a nebulous concept that has not been rigorously characterized. To this end, we propose a formalization of one key aspect of data quality -- measuring the variability of natural language data -- specifically via a measure we call the diversity coefficient. Our empirical analysis shows that the proposed diversity coefficient aligns with the intuitive properties of diversity and variability, e.g., it increases as the number of latent concepts increases. Then, we measure the diversity coefficient of publicly available pre-training datasets and demonstrate that their formal diversity is high compared to theoretical lower and upper bounds. Finally, we conduct a comprehensive set of controlled interventional experiments with GPT-2 and LLaMAv2 that demonstrate the diversity coefficient of pre-training data characterizes useful aspects of downstream model evaluation performance -- totaling 44 models of various sizes (51M to 7B parameters). We conclude that our formal notion of diversity is an important aspect of data quality that captures variability and causally leads to improved evaluation performance.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.030961513519287, 2.045443534851074], "openalex_id": "https://openalex.org/W4381612850", "title": "VERB: Visualizing and Interpreting Bias Mitigation Techniques Geometrically for Word Representations", "authors": "Archit Rathore, Sunipa Dev, Jeff M. Phillips, Vivek Srikumar, Yan Zheng, Chin\u2010Chia Michael Yeh, Junpeng Wang, Wei Zhang, Bei Wang", "abstract": "Word vector embeddings have been shown to contain and amplify biases in the data they are extracted from. Consequently, many techniques have been proposed to identify, mitigate, and attenuate these biases in word representations. In this article, we utilize interactive visualization to increase the interpretability and accessibility of a collection of state-of-the-art debiasing techniques. To aid this, we present the Visualization of Embedding Representations for deBiasing (VERB) system, an open-source web-based visualization tool that helps users gain a technical understanding and visual intuition of the inner workings of debiasing techniques, with a focus on their geometric properties. In particular, VERB offers easy-to-follow examples that explore the effects of these debiasing techniques on the geometry of high-dimensional word vectors. To help understand how various debiasing techniques change the underlying geometry, VERB decomposes each technique into interpretable sequences of primitive transformations and highlights their effect on the word vectors using dimensionality reduction and interactive visual exploration. VERB is designed to target natural language processing (NLP) practitioners who are designing decision-making systems on top of word embeddings and researchers working with the fairness and ethics of machine learning systems in NLP. It can also serve as a visual medium for education, which helps an NLP novice understand and mitigate biases in word embeddings.", "venue": "ACM Transactions on Interactive Intelligent Systems", "label": 0}, {"loc": [3.250880718231201, 2.5485596656799316], "openalex_id": "https://openalex.org/W4379522362", "title": "From Development to Dissemination: Social and Ethical Issues with Text-to-Image AI-Generated Art", "authors": "Sharon Chee Yin Ho", "abstract": "Text-to-image generative artificial intelligence (AI) have made global news headlines for not only having the ability to generate high-fidelity artworks, but also for causing increased discussion on the ethicality of its impact on living artists, the automation and commodification of art production, the frequent non-consensual collection and usage of sensitive and copyrighted images as training data, and the routinely exhibited cultural and social biases in their generated outputs.In addition, there are concerns that open-sourced text-to-image generative AI models, such as Stable Diffusion, and techniques like Textual Inversion, allow for technical restrictions on the content subject matter to be removed and for generated images to be subject specific, which could be utilized as a new medium for disinformation and sexual or targeted abuse.Because ethical discussions on AI-generated art using text-to-image generative AI models have only come to light in the last quarter of 2022, academic research on the social and ethical implications of this technology have yet to be thoroughly explored.Therefore, it is imperative for research to be done on these implications with regards to the technological development, evaluation, perception, creation, and moderation of AI-generated artworks while text-to-image generative AI systems are still in the preliminary stages of public dissemination and adoption.", "venue": "https://doi.org/10.21428/594757db.acad9d77", "label": 0}, {"loc": [2.463219404220581, 1.5130497217178345], "openalex_id": "https://openalex.org/W4379057711", "title": "GENERATIVE ARTIFICIAL INTELLIGENCE (AI) IN EDUCATION: A CROSS-NATIONAL SURVEY ON UNIVERSITY TEACHERS'PERCEPTIONS ON THE USE OF \u2026", "authors": "Ting Wang, Brady Lund, Agostino Marengo, Alessandro Pagano, Nishith Reddy Mannuru, Zo\u00eb Abbie Teel, Jenny Pange", "abstract": "International students face unique challenges in pursuing higher education in a foreign country. To address these challenges and enhance their academic experience, higher education institutions are increasingly exploring the use of artificial intelligence (AI) applications. This research essay aims to investigate the impact of AI on the education of international students. Instead of a traditional literature review, it employs a research approach to examine the potential applications of AI and discuss associated concerns. The research paper explores various AI applications, such as personalized learning experiences, adaptive testing, predictive analytics, and chatbots for learning and research. By analyzing the role of AI in education for international students, this research paper sheds light on how AI can improve learning efficiency and provide customized educational support. Additionally, it identifies significant risks and limitations, including privacy concerns, cultural differences, language proficiency, and ethical implications, which must be effectively addressed. The findings contribute to a better understanding of the potential impact of AI on international students\u2019 educational experiences and offer insights into the integration of AI into educational administration and learning processes.", "venue": "Applied Sciences", "label": 8}, {"loc": [7.088560581207275, -0.06727509945631027], "openalex_id": "https://openalex.org/W4378464930", "title": "mPLM-Sim: Better Cross-Lingual Similarity and Transfer in Multilingual Pretrained Language Models", "authors": "Peiqin Lin, Chengzhi Hu, Zheyu Zhang, Andr\u00e9 F. T. Martins, Hinrich Sch\u00fctze", "abstract": "Recent multilingual pretrained language models (mPLMs) have been shown to encode strong language-specific signals, which are not explicitly provided during pretraining. It remains an open question whether it is feasible to employ mPLMs to measure language similarity, and subsequently use the similarity results to select source languages for boosting cross-lingual transfer. To investigate this, we propose mPLMSim, a language similarity measure that induces the similarities across languages from mPLMs using multi-parallel corpora. Our study shows that mPLM-Sim exhibits moderately high correlations with linguistic similarity measures, such as lexicostatistics, genealogical language family, and geographical sprachbund. We also conduct a case study on languages with low correlation and observe that mPLM-Sim yields more accurate similarity results. Additionally, we find that similarity results vary across different mPLMs and different layers within an mPLM. We further investigate whether mPLMSim is effective for zero-shot cross-lingual transfer by conducting experiments on both low-level syntactic tasks and high-level semantic tasks. The experimental results demonstrate that mPLM-Sim is capable of selecting better source languages than linguistic measures, resulting in a 1%-2% improvement in zero-shot cross-lingual transfer performance.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.079753875732422, 1.0026719570159912], "openalex_id": "https://openalex.org/W4377041914", "title": "NATURAL LANGUAGE PROCESSING IN THE AGE OF ARTIFICIAL INTELLIGENCE: TECHNICAL ADVANCES, OPPORTUNITIES AND CHALLENGES", "authors": "Elisabeth Bauer, Martin Greisel, Ilia Kuznetsov, Markus Berndt, Ingo Kollar, Markus Dresel, Martin R. Fischer, Frank Fischer", "abstract": "Advancements in artificial intelligence are rapidly increasing. The new\u2010generation large language models, such as ChatGPT and GPT\u20104, bear the potential to transform educational approaches, such as peer\u2010feedback. To investigate peer\u2010feedback at the intersection of natural language processing (NLP) and educational research, this paper suggests a cross\u2010disciplinary framework that aims to facilitate the development of NLP\u2010based adaptive measures for supporting peer\u2010feedback processes in digital learning environments. To conceptualize this process, we introduce a peer\u2010feedback process model, which describes learners' activities and textual products. Further, we introduce a terminological and procedural scheme that facilitates systematically deriving measures to foster the peer\u2010feedback process and how NLP may enhance the adaptivity of such learning support. Building on prior research on education and NLP, we apply this scheme to all learner activities of the peer\u2010feedback process model to exemplify a range of NLP\u2010based adaptive support measures. We also discuss the current challenges and suggest directions for future cross\u2010disciplinary research on the effectiveness and other dimensions of NLP\u2010based adaptive support for peer\u2010feedback. Building on our suggested framework, future research and collaborations at the intersection of education and NLP can innovate peer\u2010feedback in digital learning environments. Practitioner notes What is already known about this topic There is considerable research in educational science on peer\u2010feedback processes. Natural language processing facilitates the analysis of students' textual data. There is a lack of systematic orientation regarding which NLP techniques can be applied to which data to effectively support the peer\u2010feedback process. What this paper adds A comprehensive overview model that describes the relevant activities and products in the peer\u2010feedback process. A terminological and procedural scheme for designing NLP\u2010based adaptive support measures. An application of this scheme to the peer\u2010feedback process results in exemplifying the use cases of how NLP may be employed to support each learner activity during peer\u2010feedback. Implications for practice and/or policy To boost the effectiveness of their peer\u2010feedback scenarios, instructors and instructional designers should identify relevant leverage points, corresponding support measures, adaptation targets and automation goals based on theory and empirical findings. Management and IT departments of higher education institutions should strive to provide digital tools based on modern NLP models and integrate them into the respective learning management systems; those tools should help in translating the automation goals requested by their instructors into prediction targets, take relevant data as input and allow for evaluating the predictions.", "venue": "British Journal of Educational Technology", "label": 10}, {"loc": [5.021472454071045, 0.33927956223487854], "openalex_id": "https://openalex.org/W4376652761", "title": "HOW CONVERSATIONAL SYSTEMS ARE BUILT USING LANGUAGE MODELS", "authors": "Luke Friedman, Sameer Ahuja, David T. Allen, Zhenning Tan, Hakim Sidahmed, Changbo Long, Jun Xie, Gabriel Schubiner, Ajay Patel, Harsh Lara, Brian Chu, Zexi Chen, Manoj Kumar Tiwari", "abstract": "A Conversational Recommender System (CRS) offers increased transparency and control to users by enabling them to engage with the system through a real-time multi-turn dialogue. Recently, Large Language Models (LLMs) have exhibited an unprecedented ability to converse naturally and incorporate world knowledge and common-sense reasoning into language understanding, unlocking the potential of this paradigm. However, effectively leveraging LLMs within a CRS introduces new technical challenges, including properly understanding and controlling a complex conversation and retrieving from external sources of information. These issues are exacerbated by a large, evolving item corpus and a lack of conversational data for training. In this paper, we provide a roadmap for building an end-to-end large-scale CRS using LLMs. In particular, we propose new implementations for user preference understanding, flexible dialogue management and explainable recommendations as part of an integrated architecture powered by LLMs. For improved personalization, we describe how an LLM can consume interpretable natural language user profiles and use them to modulate session-level context. To overcome conversational data limitations in the absence of an existing production CRS, we propose techniques for building a controllable LLM-based user simulator to generate synthetic conversations. As a proof of concept we introduce RecLLM, a large-scale CRS for YouTube videos built on LaMDA, and demonstrate its fluency and diverse functionality through some illustrative example conversations.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.619657516479492, 0.365638792514801], "openalex_id": "https://openalex.org/W4367694135", "title": "ChatGPT's applications in marketing: a topic modeling approach", "authors": "Mohammad Fraiwan, Natheer Khasawneh", "abstract": "ChatGPT is a type of artificial intelligence language model that uses deep learning algorithms to generate human-like responses to text-based prompts. The introduction of the latest ChatGPT version in November of 2022 has caused shockwaves in the industrial and academic communities for its powerful capabilities, plethora of possible applications, and the great possibility for abuse. At the time of writing this work, several other language models (e.g., Google Bard and Meta LLaMA) just came out in an attempt to get a foothold in the vast possible market. These models have the ability to revolutionize the way we interact with computers and have potential applications in many fields, including education, software engineering, healthcare, and marketing. In this paper, we will discuss the possible applications, drawbacks, and research directions using advanced language Chatbots (e.g., ChatGPT) in each of these fields. We first start with a brief introduction and the development timeline of artificial intelligence based language models, then we go through possible applications of such models, after that we discuss the limitations and drawbacks of the current technological state of the art, and finally we point out future possible research directions.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.462548017501831, 1.5587263107299805], "openalex_id": "https://openalex.org/W4385080380", "title": "Ethics of AI in the Teaching of English", "authors": "Zachary Kilhoffer, Zhixuan Zhou, F Wang, Fahad Tamton, Yun Huang, Pilyoung Kim, Tom Yeh, Yang Wang", "abstract": "Today's cybersecurity and AI technologies are often fraught with ethical challenges. One promising direction is to teach cybersecurity and AI ethics to today's youth. However, we know little about how these subjects are taught before college. Drawing from interviews of US high school teachers (n=16) and students (n=11), we find that cybersecurity and AI ethics are often taught in non-technical classes such as social studies and language arts. We also identify relevant topics, of which epistemic norms, privacy, and digital citizenship appeared most often. While teachers leverage traditional and novel teaching strategies including discussions (treating current events as case studies), gamified activities, and content creation, many challenges remain. For example, teachers hesitate to discuss current events out of concern for appearing partisan and angering parents; cyber hygiene instruction appears very ineffective at educating youth and promoting safer online behavior; and generational differences make it difficult for teachers to connect with students. Based on the study results, we offer practical suggestions for educators, school administrators, and cybersecurity practitioners to improve youth education on cybersecurity and AI ethics.", "venue": "https://doi.org/10.1109/sp46215.2023.10179333", "label": 0}, {"loc": [3.163933753967285, 2.2377851009368896], "openalex_id": "https://openalex.org/W4366594193", "title": "Building and AI Chatbot using LLM", "authors": "Yuqian Sun, Ying Xu, Chenhang Cheng, Yihua Li, Chang Hee Lee, Ali Asadipour", "abstract": "People always envision the future of earth through science fiction (Sci-fi), so can we create a unique experience of \"visiting the future earth\" through the lens of artificial intelligence (AI)? We introduce Wander 2.0, an AI chatbot that co-creates sci-fi stories through knowledge-based story generation on daily communication platforms like WeChat and Discord. Using location information from Google Maps, Wander generates narrative travelogues about specific locations (e.g. Paris) through a large-scale language model (LLM). Additionally, using the large-scale text-to-image model (LTGM) Stable Diffusion, Wander transfers future scenes that match both the text description and location photo, facilitating future imagination. The project also includes a real-time visualization of the human-AI collaborations on a future map. Through journeys with visitors from all over the world, Wander demonstrates how AI can serve as a subjective interface linking fiction and reality. Our research shows that multi-modal AI systems have the potential to extend the artistic experience and creative world-building through adaptive and unique content generation for different people. Wander 2.0 is available at http://wander001.com/", "venue": "https://doi.org/10.1145/3544549.3583931", "label": 0}, {"loc": [2.7248735427856445, -0.48603519797325134], "openalex_id": "https://openalex.org/W4366158343", "title": "Online health search via multi-dimensional information quality assessment based on deep language models", "authors": "Boya Zhang, Nona Naderi, Rahul Mishra, Douglas Teodoro", "abstract": "Abstract Background Widespread misinformation in Web resources can lead to serious implications for individuals seeking health advice. Despite that, information retrieval models are often focused only on the query-document relevance dimension to rank results. Objective We investigate a multi-dimensional information quality retrieval model based on deep learning to enhance the effectiveness of online healthcare information search results. Methods In this study, we simulated online health information search scenarios with a topic set of 32 different health-related inquiries and a corpus containing one billion Web documents from the April 2019 snapshot of Common Crawl. Using state-of-the-art pre-trained language models, we assessed the quality of the retrieved documents according to their usefulness, supportiveness, and credibility dimensions for a given search query on 6,030 human-annotated query-document pairs. We evaluated this approach using transfer learning and more specific domain adaptation techniques. Results In the transfer learning setting, the usefulness model provided the largest distinction between help- and harm-compatible documents with a difference of +5.6%, leading to a majority of helpful documents in the top-10 retrieved. The supportiveness model achieved the best harm compatibility (+2.4%), while the combination of usefulness, supportiveness, and credibility models achieved the largest distinction between help- and harm-compatibility on helpful topics (+16.9%). In the domain adaptation setting, the linear combination of different models showed robust performance with help-harm compatibility above +4.4% for all dimensions and going as high as +6.8%. Conclusions These results suggest that integrating automatic ranking models created for specific information quality dimensions can increase the effectiveness of health-related information retrieval. Thus, our approach could be used to enhance searches made by individuals seeking online health information.", "venue": "bioRxiv (Cold Spring Harbor Laboratory)", "label": 12}, {"loc": [2.892991304397583, -0.7099547982215881], "openalex_id": "https://openalex.org/W4365211688", "title": "COMPREHENSIVE STUDY OF CLINICAL ENTITY EXTRACTION AND CLASSIFICATION USING LARGE LANGUAGE MODELS", "authors": "Yuqing Wang, Yun Zhao, Linda Petzold", "abstract": "Large language models (LLMs) have made significant progress in various domains, including healthcare. However, the specialized nature of clinical language understanding tasks presents unique challenges and limitations that warrant further investigation. In this study, we conduct a comprehensive evaluation of state-of-the-art LLMs, namely GPT-3.5, GPT-4, and Bard, within the realm of clinical language understanding tasks. These tasks span a diverse range, including named entity recognition, relation extraction, natural language inference, semantic textual similarity, document classification, and question-answering. We also introduce a novel prompting strategy, self-questioning prompting (SQP), tailored to enhance LLMs' performance by eliciting informative questions and answers pertinent to the clinical scenarios at hand. Our evaluation underscores the significance of task-specific learning strategies and prompting techniques for improving LLMs' effectiveness in healthcare-related tasks. Additionally, our in-depth error analysis on the challenging relation extraction task offers valuable insights into error distribution and potential avenues for improvement using SQP. Our study sheds light on the practical implications of employing LLMs in the specialized domain of healthcare, serving as a foundation for future research and the development of potential applications in healthcare settings.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.3793864250183105, 2.336230754852295], "openalex_id": "https://openalex.org/W4362655716", "title": "Mentions of prejudice in news media\u2013an international comparison", "authors": "David Rozado", "abstract": "Previous research has identified a post-2010 sharp increase of terms used to denounce prejudice (i.e. racism, sexism, homophobia, Islamophobia, anti-Semitism, etc.) in U.S. and U.K. news media content. Here, we extend previous analysis to an international sample of news media organizations. Thus, we quantify the prevalence of prejudice-denouncing terms and social justice associated terminology (diversity, inclusion, equality, etc.) in over 98 million news and opinion articles across 124 popular news media outlets from 36 countries representing 6 different world regions: English-speaking West, continental Europe, Latin America, sub-Saharan Africa, Persian Gulf region and Asia. We find that the post-2010 increasing prominence in news media of the studied terminology is not circumscribed to the U.S. and the U.K. but rather appears to be a mostly global phenomenon starting in the first half of the 2010s decade in pioneering countries yet largely prevalent around the globe post-2015. However, different world regions' news media emphasize distinct types of prejudice with varying degrees of intensity. We find no evidence of U.S. news media having been first in the world in increasing the frequency of prejudice coverage in their content. The large degree of temporal synchronicity with which the studied set of terms increased in news media across a vast majority of countries raises important questions about the root causes driving this phenomenon.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [8.014509201049805, -0.7433953285217285], "openalex_id": "https://openalex.org/W4324134461", "title": "ChatGPT and Language Translation: A Small Case Study Evaluating English\u2013Mandarin Translation", "authors": "Qingyu Lu, Baopu Qiu, Liang Ding, Liping Xie, Dacheng Tao", "abstract": "Generative large language models (LLMs), e.g., ChatGPT, have demonstrated remarkable proficiency across several NLP tasks such as machine translation, question answering, text summarization, and natural language understanding. Recent research has shown that utilizing ChatGPT for assessing the quality of machine translation (MT) achieves state-of-the-art performance at the system level but performs poorly at the segment level. To further improve the performance of LLMs on MT quality assessment, we conducted an investigation into several prompting methods. Our results indicate that by combining Chain-of-Thoughts and Error Analysis, a new prompting method called Error Analysis Prompting, LLMs like ChatGPT can \\textit{generate human-like MT evaluations at both the system and segment level}. Additionally, we discovered some limitations of ChatGPT as an MT evaluator, such as unstable scoring and biases when provided with multiple translations in a single query. Our findings aim to provide a preliminary experience for appropriately evaluating translation quality on ChatGPT while offering a variety of tricks in designing prompts for in-context learning. We anticipate that this report will shed new light on advancing the field of translation evaluation with LLMs by enhancing both the accuracy and reliability of metrics. The project can be found at https://github.com/Coldmist-Lu/ErrorAnalysis_Prompt.", "venue": "Preprints.org", "label": 3}, {"loc": [6.114078998565674, 1.8362631797790527], "openalex_id": "https://openalex.org/W4321485427", "title": "Knowledge-augmented Methods for Natural Language Processing", "authors": "Chenguang Zhu, Yichong Xu, Xiang Ren, Bill Yuchen Lin, Meng Jiang, Wenhao Yu", "abstract": "Knowledge in NLP has been a rising trend especially after the advent of large-scale pre-trained models. Knowledge is critical to equip statistics-based models with common sense, logic and other external information. In this tutorial, we will introduce recent state-of-the-art works in applying knowledge in language understanding, language generation and commonsense reasoning.", "venue": "https://doi.org/10.1145/3539597.3572720", "label": 0}, {"loc": [3.3649823665618896, 2.1512844562530518], "openalex_id": "https://openalex.org/W4317107222", "title": "Divergent thinking in groups", "authors": "Tsutomu Harada", "abstract": "This study examined the effects of risk-taking and exploitation/exploration trade-off on divergent thinking in individuals, dyads, and triads. We adopted a simple Q-learning model to estimate risk attitudes, exploitation, and exploration parameters. The results showed that risk-taking, exploitation, and exploration did not affect divergent thinking in dyads. Instead, loss aversion was negatively related to divergent thinking. In contrast, risk attitudes and the inverse temperature as a ratio between exploitation and exploration were significant but with contrasting effects in individuals and triads. For individuals, risk-taking, exploitation and loss aversion played a critical role in divergent thinking. For triads, risk aversion and exploration were significantly related to divergent thinking. However, the results also indicated that balancing risk with exploitation/exploration and loss aversion is critical in enhancing divergent thinking in individuals and triads when learning coherence emerges. These results could be interpreted consistently with related literature such as the odd-vs. even-numbered group dynamics, knowledge diversity in group creativity, and representational change theory in insight problem-solving.", "venue": "Frontiers in Psychology", "label": 0}, {"loc": [5.676581859588623, 4.885761737823486], "openalex_id": "https://openalex.org/W4319336138", "title": "Data Selection for Generalization in Unimodal & Multimodal Models", "authors": "Arthur Josi, Mahdi Alehdaghi, Rafael M. O. Cruz, \u00c9ric Granger", "abstract": "The re-identification (ReID) of individuals over a complex network of cameras is a challenging task, especially under real-world surveillance conditions. Several deep learning models have been proposed for visible-infrared (V-I) person ReID to recognize individuals from images captured using RGB and IR cameras. However, performance may decline considerably if RGB and IR images captured at test time are corrupted (e.g., noise, blur, and weather conditions). Although various data augmentation (DA) methods have been explored to improve the generalization capacity, these are not adapted for V-I person ReID. In this paper, a specialized DA strategy is proposed to address this multimodal setting. Given both the V and I modalities, this strategy allows to diminish the impact of corruption on the accuracy of deep person ReID models. Corruption may be modality-specific, and an additional modality often provides complementary information. Our multimodal DA strategy is designed specifically to encourage modality collaboration and reinforce generalization capability. For instance, punctual masking of modalities forces the model to select the informative modality. Local DA is also explored for advanced selection of features within and among modalities. The impact of training baseline fusion models for V-I person ReID using the proposed multimodal DA strategy is assessed on corrupted versions of the SYSUMM01, RegDB, and ThermalWORLD datasets in terms of complexity and efficiency. Results indicate that using our strategy provides V-I ReID models the ability to exploit both shared and individual modality knowledge so they can out-perform models trained with no or unimodal DA. GitHub code: https://github.com/art2611/ML-MDA.", "venue": "https://doi.org/10.1109/wacvw58289.2023.00008", "label": 0}, {"loc": [7.7073869705200195, -0.9166584610939026], "openalex_id": "https://openalex.org/W4391017630", "title": "Context-aware Transliteration of Romanized South Asian Languages", "authors": "Christo Kirov, Cibu Johny, Anna Katanova, Alexander Gutkin, Brian Roark", "abstract": "Abstract While most transliteration research is focused on single tokens such as named entities\u2014for example, transliteration of from the Gujarati script to the Latin script \u201cAhmedabad\u201d footnoteThe most populous city in the Indian state of Gujarat. the informal romanization prevalent in South Asia and elsewhere often requires transliteration of full sentences. The lack of large parallel text collections of full sentence (as opposed to single word) transliterations necessitates incorporation of contextual information into transliteration via non-parallel resources, such as via mono-script text collections. In this article, we present a number of methods for improving transliteration in context for such a use scenario. Some of these methods in fact improve performance without making use of sentential context, allowing for better quantification of the degree to which contextual information in particular is responsible for system improvements. Our final systems, which ultimately rely upon ensembles including large pretrained language models fine-tuned on simulated parallel data, yield substantial improvements over the best previously reported results for full sentence transliteration from Latin to native script on all 12 languages in the Dakshina dataset (Roark et al. 2020), with an overall 3.3% absolute (18.6% relative) mean word-error rate reduction.", "venue": "Computational Linguistics", "label": 27}, {"loc": [7.2163004875183105, -0.18259716033935547], "openalex_id": "https://openalex.org/W4317879253", "title": "Modelling cross-lingual transfer for semantic parsing", "authors": "Tom Sherborne, Mirella Lapata", "abstract": "Abstract Localizing a semantic parser to support new languages requires effective cross-lingual generalization. Recent work has found success with machine-translation or zero-shot methods, although these approaches can struggle to model how native speakers ask questions. We consider how to effectively leverage minimal annotated examples in new languages for few-shot cross-lingual semantic parsing. We introduce a first-order meta-learning algorithm to train a semantic parser with maximal sample efficiency during cross-lingual transfer. Our algorithm uses high-resource languages to train the parser and simultaneously optimizes for cross-lingual generalization to lower-resource languages. Results across six languages on ATIS demonstrate that our combination of generalization steps yields accurate semantic parsers sampling \u226410% of source training data in each new language. Our approach also trains a competitive model on Spider using English with generalization to Chinese similarly sampling \u226410% of training data.1", "venue": "Transactions of the Association for Computational Linguistics", "label": 26}, {"loc": [1.9773188829421997, 5.394409656524658], "openalex_id": "https://openalex.org/W4323065767", "title": "PHISHING DETECTION SYSTEM THROUGH MACHINE LEARNING BASED ON URL", "authors": "Abdul Karim, Mobeen Shahroz, Khabib Mustofa, Samir Brahim Belhaouari, S Ramana Kumar Joga", "abstract": "<p dir=\"ltr\">Currently, numerous types of cybercrime are organized through the internet. Hence, this study mainly focuses on phishing attacks. Although phishing was first used in 1996, it has become the most severe and dangerous cybercrime on the internet. Phishing utilizes email distortion as its underlying mechanism for tricky correspondences, followed by mock sites, to obtain the required data from people in question. Different studies have presented their work on the precaution, identification, and knowledge of phishing attacks; however, there is currently no complete and proper solution for frustrating them. Therefore, machine learning plays a vital role in defending against cybercrimes involving phishing attacks. The proposed study is based on the phishing URL-based dataset extracted from the famous dataset repository, which consists of phishing and legitimate URL attributes collected from 11000+ website datasets in vector form. After preprocessing, many machine learning algorithms have been applied and designed to prevent phishing URLs and provide protection to the user. This study uses machine learning models such as decision tree (DT), linear regression (LR), random forest (RF), naive Bayes (NB), gradient boosting classifier (GBM), K-neighbors classifier (KNN), support vector classifier (SVC), and proposed hybrid LSD model, which is a combination of logistic regression, support vector machine, and decision tree (LR+SVC+DT) with soft and hard voting, to defend against phishing attacks with high accuracy and efficiency. The canopy feature selection technique with cross fold valoidation and Grid Search Hyperparameter Optimization techniques are used with proposed LSD model. Furthermore, to evaluate the proposed approach, different evaluation parameters were adopted, such as the precision, accuracy, recall, F1-score, and specificity, to illustrate the effects and efficiency of the models. The results of the comparative analyses demonstrate that the proposed approach outperforms the other models and achieves the best results.</p><h2>Other Information</h2><p dir=\"ltr\">Published in: IEEE Access<br>License: <a href=\"https://creativecommons.org/licenses/by-nc-nd/4.0/\" target=\"_blank\">https://creativecommons.org/licenses/by-nc-nd/4.0/</a><br>See article on publisher's website: <a href=\"https://dx.doi.org/10.1109/access.2023.3252366\" target=\"_blank\">https://dx.doi.org/10.1109/access.2023.3252366</a></p>", "venue": "IEEE Access", "label": 2}, {"loc": [7.226991653442383, 0.25756123661994934], "openalex_id": "https://openalex.org/W4389523992", "title": "SEA-LION: Southeast Asian Languages in One Network", "authors": "William Tjhi, David M. Ong, Peerat Limkonchotiwat", "abstract": "SEA-LION (Southeast Asian Languages In One Network) (Singapore, 2023) is a family of multilingual LLMs that is specifically pre-trained and instruct-tuned for the Southeast Asian (SEA) region, incorporating a custom SEABPETokenizer which is specially tailored for SEA languages.The first part of this talk will cover our design philosophy and pre-training methodology for SEA-LION.The second part of this talk will cover PyThaiNLP's (Phatthiyaphaibun et al., 2023) work on Wangchan-LION, an instruct-tuned version of SEA-LION for the Thai community.", "venue": "https://doi.org/10.18653/v1/2023.nlposs-1.26", "label": 0}, {"loc": [2.7853775024414062, 1.4641038179397583], "openalex_id": "https://openalex.org/W4391284907", "title": "Studying Peer Effects in Divergent Thinking: Theory and Method", "authors": "Chi Hang Wong, Ion Juvina, Paul \u0218tefan Popescu", "abstract": "In designing technology that supports user learning, an important first step is to understand how interactions among humans shape mutual learning.Much qualitative research in the realm of peer-assisted learning (PAL) has advanced the theoretical understanding of how social and cognitive factors might influence the success of PAL in educational settings, although fewer studies have examined the effects and mechanisms of PAL experimentally.We review relevant theories on PAL and discuss how the mechanisms of learning, trust, and group heterogeneity can affect peer effects in a divergent thinking context.Thereafter, we propose an experimental study to identify PAL effects and measures of trust and group diversity to be correlated with divergent task learning and performance.Finally, we delineate some potential practical implications of such a study for the Human-Computer Interaction area.", "venue": "http://doi.org/10.37789/rochi.2023.1.1.23", "label": 0}, {"loc": [7.628627300262451, -1.220017910003662], "openalex_id": "https://openalex.org/W4389270804", "title": "Optimization of Unsupervised Neural Machine Translation Based on Syntactic Knowledge Improvement", "authors": "Aiping Zhou", "abstract": "Unsupervised Neural Machine Translation is a crucial machine translation method that can translate in the absence of a parallel corpus and opens up new avenues for intercultural dialogue. Existing unsupervised neural machine translation models still struggle to deal with intricate grammatical relationships and linguistic structures, which leads to less-than-ideal translation quality. This study combines the Transformer structure and syntactic knowledge to create a new unsupervised neural machine translation model, which enhances the performance of the existing model. The study creates a neural machine translation model based on the Transformer structure first, and then introduces sentence syntactic structure and various syntactic fusion techniques, also known as the Transformer combines grammatical knowledge. The results show that the Transformer combines grammatical knowledge paired with Bi-Long Short-Term Memory proposed in this research has better performance. The accuracy and F1 value of the combined model in the training dataset are as high as 0.97. In addition, the time of the model in real sentence translation is controlled within 2s, and the translation accuracy is above 0.9. In conclusion, the unsupervised neural machine translation model proposed in this study has better performance, and its application to actual translation can achieve better translation results.", "venue": "International Journal of Advanced Computer Science and Applications", "label": 35}, {"loc": [7.965721130371094, 0.8692687153816223], "openalex_id": "https://openalex.org/W4388860698", "title": "Which Word Embeddings for Modeling Web Search Queries? Application to the Study of Search Strategies", "authors": "Claire Ibarboure, Ludovic Tanguy, Franck Amadieu", "abstract": "International audience", "venue": "https://doi.org/10.5220/0012177600003598", "label": 0}, {"loc": [9.514631271362305, 0.7324715852737427], "openalex_id": "https://openalex.org/W4392555172", "title": "On the data scarcity problem of neural-based named entity recognition", "authors": "Ran Zhou", "abstract": "The data scarcity problem in neural-based Named Entity Recognition (NER) refers to the challenge of limited annotated data available for training NER models. \\nCollecting and annotating large amounts of labeled data for various languages and domains can be time-consuming, expensive, and sometimes even impractical. \\nThis lack of labeled data can hinder the performance of neural-based NER models, as they require a substantial amount of annotated examples to learn effectively. \\nWith limited training data, neural-based NER models may struggle to generalize well and accurately identify unseen named entities in out-of-domain text or from a different language. They may be prone to overfitting, where the model becomes too specific to the training data and fails to generalize to new data, leading to reduced overall performance. \\n \\nAddressing the data scarcity problem in neural-based NER involves exploring alternative approaches to mitigate the impact of limited labeled data. \\nSome strategies include data augmentation techniques, such as word or entity replacement, synthetic data generation, or leveraging external resources like knowledge bases or dictionaries. \\nMany works focus on the popular data-scarce scenario of cross-lingual NER, where there is training data in the source language but few or no annotations in the target language. \\nFor example, consistency training encourages the model's predictions to be consistent across different representations of the same input, and can be used to improve the robustness and generalization of NER models across different languages. \\nMoreover, self-training has been applied to enhance the NER model's knowledge of the target language's linguistic characteristics and entity patterns by taking advantage of the abundant unlabeled text in the target language. \\n \\nIn this thesis, we present our research to address the data scarcity problem of neural-based NER. Our contributions are as follows. \\nFirstly, we propose a novel data augmentation framework for low-resource NER, which effectively improves entity diversity and alleviates the token-label misalignment problem, and is proven effective under monolingual, cross-lingual, and multilingual experimental settings. \\nSecondly, we present a consistency training method for cross-lingual NER, which propagates reliable supervision signals from the source language to the target language, aligns the representation space between languages, and alleviates overfitting on the source language. Evaluated on various cross-lingual transfer pairs, our method demonstrates superior performance over various baseline methods. \\nFinally, we introduce an improved self-training method for cross-lingual NER, where contrastive learning is utilized to facilitate classification and prototype learning is used for iteratively denoising pseudo-labeled target language data. The proposed self-training method presents significant improvements over existing self-training methods and achieves state-of-the-art performance. \\n \\nIn conclusion, we have shown that by proposing effective data augmentation methods, consistency training frameworks and improved self-training schema, the data scarcity problem in neural-based named entity recognition can be largely alleviated.", "venue": "https://doi.org/10.32657/10356/173481", "label": 0}, {"loc": [5.997330665588379, 0.4143922030925751], "openalex_id": "https://openalex.org/W4390970222", "title": "An Ambiguous Technique for Nonvisual Text Entry", "authors": "Dylan Gaines", "abstract": "Text entry is a common daily task for many people, but it can be a challenge for people with visual impairments when using virtual touchscreen keyboards that lack physical key boundaries. In this thesis, we investigate using a small number of gestures to select from groups of characters to remove most or all dependence on touch locations. We leverage a predictive language model to select the most likely characters from the selected groups once a user completes each word.\\nUsing a preliminary interface with six groups of characters based on a Qwerty keyboard, we find that users are able to enter text with no visual feedback at 19.1 words per minute (WPM) with a 2.1% character error rate (CER) after five hours of practice. We explore ways to optimize the ambiguous groups to reduce the number of disambiguation errors. We develop a novel interface named FlexType with four character groups instead of six in order to remove all remaining location dependence and enable one-handed input. We compare optimized groups with and without constraining the group assignments to alphabetical order in a user study. We find that users enter text with no visual feedback at 12.0 WPM with a 2.0% CER using the constrained groups after four hours of practice. There was no significant difference from the unconstrained groups.\\nWe improve FlexType based on user feedback and tune the recognition algorithm parameters based on the study data. We conduct an interview study with 12 blind users to assess the challenges they encounter while entering text and solicit feedback on FlexType, and we further incorporate this feedback into the interface. We evaluate the improved interface in a longitudinal study with 12 blind participants. On average, participants entered text at 8.2 words per minute using FlexType, 7.5 words per minute using a Qwerty keyboard with VoiceOver, and at 26.9 words per minute using Braille Screen Input.", "venue": "https://doi.org/10.37099/mtu.dc.etdr/1667", "label": 0}, {"loc": [9.254175186157227, 1.434519648551941], "openalex_id": "https://openalex.org/W4382644866", "title": "Evaluating embedded semantics for accessibility description of web crawl data", "authors": "Rosa Navarrete, Diana Martinez- Mosquera, Lorena Recalde, M. L.", "abstract": "The Web is ever expanding, even more by the need for content consumption derived from the pandemic. This fact highlights the need for equity in access to Web content by all people, regardless of their disabilities. To this end, it is essential to focus on web accessibility issues. The World Wide Web Consortium (W3C), the leading organization responsible for ensuring the growth of the social value of the Web, establishes standards, protocols, and recommendations to improve the reach extent of web content for people. For instance, Web Content Accessibility Guidelines (WCAG) promote the achievement of web accessibility. Furthermore, other W3C recommendations foster embedded semantic into the web content to help browsers build a machine-readable data structure aiming to produce an enriched description in search results supporting people to find the right content for their queries and, consequently, improving user experience. Searching for specific web content is especially striving for people with disabilities because they could be forced to explore many search results before finding some content that matches their accessibility requirements. If embedded semantic communicate the accessibility properties of the content, the search will be more productive for everyone but even more for people with special needs. For embedded semantic, two components are required, a vocabulary and an encoding format. Schema.org vocabulary has experienced high growth and encompasses plenty of descriptors for each type of web information, including the set of descriptors for accessibility conditions information. Regarding the format, JSON-LD is the latest W3C recommendation for encoding due to its ability to make JSON data interoperate at Web-scale. It provides a quickly transforming for Linked Data format and is simple enough to be read and written by people. This research conducts a quantitative analysis of the embedded semantic into the web content by processing a dataset obtained from millions of web crawl data for 2021. The data arrive from distinct provenance and purposes at a global scale. In this web content, each annotation is made through script JSON-LD of embedded semantic with Schema's vocabulary. The analysis defines how the accessibility descriptors are used in conjunction with other classes and properties to describe the web information on personal blogs, organizations, events, educational content, universities, persons, commerce, sports, medicine, entertainment, and more. The results provide a perspective of the awareness for accessibility in the different purposes of the Web.The processing was performed on collected zip files that contain over three hundred million records. This analysis was conducted using massive data analysis techniques such as key-value modeling with Python for processing and a NoSQL database such as MongoDB for storage. A new dataset with normalized data was generated with information about domains, types of web content, and properties associated with the accessibility descriptor. The collection and storage layers were implemented on a computing platform with 30GB of RAM, 10 CPUs, and 2TB of storage.This research delivers two main contributions. Firstly, the analysis of the interest in the Web for using accessibility descriptors in embedded semantic. The quantitative results enable us to appreciate the concern about equity and inclusion made visible through accessibility issues in different entities, according to the web domains. Moreover, these results reveal how the W3C recommendation of embedded semantic is being adopted to create a more organized and better-documented Web. Second, processing the raw dataset result in a new normalized dataset in JSON format with information about domains, web content types, and properties associated with the accessibility descriptor. This new dataset will be available for further analysis of the embedded semantic.", "venue": "AHFE international", "label": 0}, {"loc": [7.150506019592285, 0.2214856892824173], "openalex_id": "https://openalex.org/W4311550865", "title": "Pretraining Data and Tokenizer for Indic LLM", "authors": "Sumanth Doddapaneni, Rahul Aralikatte, G. Ramesh, Shreya Goyal, Mitesh M. Khapra, Anoop Kunchukuttan, Pratyush Kumar", "abstract": "Building Natural Language Understanding (NLU) capabilities for Indic languages, which have a collective speaker base of more than one billion speakers is absolutely crucial. In this work, we aim to improve the NLU capabilities of Indic languages by making contributions along 3 important axes (i) monolingual corpora (ii) NLU testsets (iii) multilingual LLMs focusing on Indic languages. Specifically, we curate the largest monolingual corpora, IndicCorp, with 20.9B tokens covering 24 languages from 4 language families - a 2.3x increase over prior work, while supporting 12 additional languages. Next, we create a human-supervised benchmark, IndicXTREME, consisting of nine diverse NLU tasks covering 20 languages. Across languages and tasks, IndicXTREME contains a total of 105 evaluation sets, of which 52 are new contributions to the literature. To the best of our knowledge, this is the first effort towards creating a standard benchmark for Indic languages that aims to test the multilingual zero-shot capabilities of pretrained language models. Finally, we train IndicBERT v2, a state-of-the-art model supporting all the languages. Averaged across languages and tasks, the model achieves an absolute improvement of 2 points over a strong baseline. The data and models are available at https://github.com/AI4Bharat/IndicBERT.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [3.657593011856079, 0.08582665771245956], "openalex_id": "https://openalex.org/W4312198153", "title": "Deep Generative Models for Prediction and Design of Enzymes", "authors": "Joseph L. Watson, David Juergens, Nathaniel R. Bennett, Brian L. Trippe, Jason Yim, Helen E. Eisenach, Woody Ahern, Andrew J. Borst, Robert J. Ragotte, Lukas F. Milles, Basile I. M. Wicky, Nikita Hanikel, Samuel J. Pellock, Alexis Courbet, William Sheffler, Jue Wang, Preetham Venkatesh, Isaac Sappington, Susana V\u00e1zquez Torres, Anna Lauko, Valentin De Bortoli, \u00c9mile Mathieu, Regina Barzilay, Tommi Jaakkola, Frank DiMaio, Minkyung Baek, David Baker", "abstract": "Abstract There has been considerable recent progress in designing new proteins using deep learning methods 1\u20139. Despite this progress, a general deep learning framework for protein design that enables solution of a wide range of design challenges, including de novo binder design and design of higher order symmetric architectures, has yet to be described. Diffusion models 10,11 have had considerable success in image and language generative modeling but limited success when applied to protein modeling, likely due to the complexity of protein backbone geometry and sequence-structure relationships. Here we show that by fine tuning the RoseTTAFold structure prediction network on protein structure denoising tasks, we obtain a generative model of protein backbones that achieves outstanding performance on unconditional and topology-constrained protein monomer design, protein binder design, symmetric oligomer design, enzyme active site scaffolding, and symmetric motif scaffolding for therapeutic and metal-binding protein design. We demonstrate the power and generality of the method, called RoseTTAFold Diffusion (RF diffusion), by experimentally characterizing the structures and functions of hundreds of new designs. In a manner analogous to networks which produce images from user-specified inputs, RF diffusion enables the design of diverse, complex, functional proteins from simple molecular specifications.", "venue": "bioRxiv (Cold Spring Harbor Laboratory)", "label": 12}, {"loc": [8.734413146972656, -1.0399342775344849], "openalex_id": "https://openalex.org/W4295135473", "title": "Computational Approaches to Lexical Complexity Prediction and Simplification", "authors": "Kai North, Marcos Zampieri, Matthew Shardlow", "abstract": "The occurrence of unknown words in texts significantly hinders reading comprehension. To improve accessibility for specific target populations, computational modeling has been applied to identify complex words in texts and substitute them for simpler alternatives. In this article, we present an overview of computational approaches to lexical complexity prediction focusing on the work carried out on English data. We survey relevant approaches to this problem which include traditional machine learning classifiers (e.g., SVMs, logistic regression) and deep neural networks as well as a variety of features, such as those inspired by literature in psycholinguistics as well as word frequency, word length, and many others. Furthermore, we introduce readers to past competitions and available datasets created on this topic. Finally, we include brief sections on applications of lexical complexity prediction, such as readability and text simplification, together with related studies on languages other than English.", "venue": "ACM Computing Surveys", "label": 7}, {"loc": [2.902160167694092, -0.10188417881727219], "openalex_id": "https://openalex.org/W4292664221", "title": "Intelligent Pharmacy", "authors": "Xiaohua Li, Benren Tan, Jinkun Zheng, Xiaomei Xu, Jian Xiao, Y. Liu", "abstract": "With the wide application of artificial intelligence and big data technology in the medical field, the problems of high cost and low efficiency of traditional pharmacy management were becoming more and more obvious. Therefore, this paper proposed to use data mining technology to design and develop the dispensing process and equipment of intelligent pharmacy. Firstly, it summarized the existing data mining technology and association rule methods and expounded its application value in the related fields. Secondly, the data standard and integration platform of dispensing in intelligent pharmacy were established. Web service technology was used to design the interactive interface and call it to the intelligent device of pharmacy. Finally, an intelligent pharmacy management system based on association rule mining was constructed through the data mining of intelligent pharmacy equipment, in order to improve the intelligence and informatization of modern pharmacy management. For the emergency dispensing process of intelligent equipment failure, data mining was used to optimize the intelligent pharmacy equipment and dispensing process and change the pharmacy management from traditional prescription to patient drug treatment, so as to improve the dispensing efficiency of intelligent pharmacy equipment. Through the systematic test and analysis, the results showed that through the real-time risk prevention and control, the formula accuracy and operation speed of the intelligent dispensing machine were improved and the dispensing time was shortened. Through intelligent drug delivery, the unreasonable drug use of patients was reduced, the safety and effectiveness of clinical drug use were ensured, and the contradiction between doctors and patients was reduced. This study can not only optimize the medical experience of patients and provide patients with more high-quality and humanized pharmaceutical technical services but also provide some support for the intelligent management of modern hospitals.", "venue": "Computational Intelligence and Neuroscience", "label": 0}, {"loc": [6.715444087982178, 1.648911714553833], "openalex_id": "https://openalex.org/W3199467273", "title": "Transformers in Natural Language Processing", "authors": "Julian von der Mosel, Alexander Trautsch, Steffen Herbold", "abstract": "Transformers are the current state-of-the-art of natural language processing in many domains and are using traction within software engineering research as well. Such models are pre-trained on large amounts of data, usually from the general domain. However, we only have a limited understanding regarding the validity of transformers within the software engineering domain, i.e., how good such models are at understanding words and sentences within a software engineering context and how this improves the state-of-the-art. Within this article, we shed light on this complex, but crucial issue. We compare BERT transformer models trained with software engineering data with transformers based on general domain data in multiple dimensions: their vocabulary, their ability to understand which words are missing, and their performance in classification tasks. Our results show that for tasks that require understanding of the software engineering context, pre-training with software engineering data is valuable, while general domain models are sufficient for general language understanding, also within the software engineering domain.", "venue": "IEEE Transactions on Software Engineering", "label": 0}, {"loc": [8.600092887878418, -0.27104073762893677], "openalex_id": "https://openalex.org/W4221148722", "title": "Faithfulness of Natural Language Generation", "authors": "Wei Li, Wenhao Wu, Moye Chen, Jiachen Liu, Xinyan Xiao, Hua Wu", "abstract": "Natural Language Generation (NLG) has made great progress in recent years due to the development of deep learning techniques such as pre-trained language models. This advancement has resulted in more fluent, coherent and even properties controllable (e.g. stylistic, sentiment, length etc.) generation, naturally leading to development in downstream tasks such as abstractive summarization, dialogue generation, machine translation, and data-to-text generation. However, the faithfulness problem that the generated text usually contains unfaithful or non-factual information has become the biggest challenge, which makes the performance of text generation unsatisfactory for practical applications in many real-world scenarios. Many studies on analysis, evaluation, and optimization methods for faithfulness problems have been proposed for various tasks, but have not been organized, compared and discussed in a combined manner. In this survey, we provide a systematic overview of the research progress on the faithfulness problem of NLG, including problem analysis, evaluation metrics and optimization methods. We organize the evaluation and optimization methods for different tasks into a unified taxonomy to facilitate comparison and learning across tasks. Several research trends are discussed further.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.170450210571289, 0.37656426429748535], "openalex_id": "https://openalex.org/W4221159609", "title": "Parameter Efficiency, Few-Shot, Zero-Shot, Prompting", "authors": "Andy T. Liu, Wei Xiao, Henghui Zhu, Dejiao Zhang, Shang-Wen Li, Andrew Arnold", "abstract": "Recently, prompt-based learning for pre-trained language models has succeeded in few-shot Named Entity Recognition (NER) by exploiting prompts as task guidance to increase label efficiency. However, previous prompt-based methods for few-shot NER have limitations such as a higher computational complexity, poor zero-shot ability, requiring manual prompt engineering, or lack of prompt robustness. In this work, we address these shortcomings by proposing a new prompt-based learning NER method with Question Answering (QA), called QaNER. Our approach includes 1) a refined strategy for converting NER problems into the QA formulation; 2) NER prompt generation for QA models; 3) prompt-based tuning with QA models on a few annotated NER examples; 4) zero-shot NER by prompting the QA model. Comparing the proposed approach with previous methods, QaNER is faster at inference, insensitive to the prompt quality, and robust to hyper-parameters, as well as demonstrating significantly better low-resource performance and zero-shot capability.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.22097396850586, 1.420276165008545], "openalex_id": "https://openalex.org/W2949405564", "title": "The Web Is Missing an Essential Part of Infrastructure: An Open Web Index A proposal for building an index of the Web that separates the infrastructure part of the \u2026", "authors": "Dirk Lewandowski", "abstract": "A proposal for building an index of the Web that separates the infrastructure part of the search engine - the index - from the services part that will form the basis for myriad search engines and other services utilizing Web data on top of a public infrastructure open to everyone.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.442409038543701, 2.462714195251465], "openalex_id": "https://openalex.org/W2951864292", "title": "Gender bias in language and artificial intelligence tools", "authors": "Tony Sun, Andrew Gaut, Shirlyn Tang, Yuxin Huang, Mai ElSherief, Jieyu Zhao, Diba Mirza, Elizabeth Belding, Kai-Wei Chang, William Yang Wang", "abstract": "As Natural Language Processing (NLP) and Machine Learning (ML) tools rise in popularity, it becomes increasingly vital to recognize the role they play in shaping societal biases and stereotypes. Although NLP models have shown success in modeling various applications, they propagate and may even amplify gender bias found in text corpora. While the study of bias in artificial intelligence is not new, methods to mitigate gender bias in NLP are relatively nascent. In this paper, we review contemporary studies on recognizing and mitigating gender bias in NLP. We discuss gender bias based on four forms of representation bias and analyze methods recognizing gender bias. Furthermore, we discuss the advantages and drawbacks of existing gender debiasing methods. Finally, we discuss future studies for recognizing and mitigating gender bias in NLP.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [7.490232467651367, -1.1240119934082031], "openalex_id": "https://openalex.org/W2958953787", "title": "Translation in the Wild", "authors": "Naveen Arivazhagan, Ankur Bapna, Orhan F\u0131rat, Dmitry Lepikhin, Melvin Johnson, Maxim Krikun, Mia Xu Chen, Yuan Cao, George Foster, Colin Cherry, Wolfgang Macherey, Zhifeng Chen, Yonghui Wu", "abstract": "We introduce our efforts towards building a universal neural machine translation (NMT) system capable of translating between any language pair. We set a milestone towards this goal by building a single massively multilingual NMT model handling 103 languages trained on over 25 billion examples. Our system demonstrates effective transfer learning ability, significantly improving translation quality of low-resource languages, while keeping high-resource language translation quality on-par with competitive bilingual baselines. We provide in-depth analysis of various aspects of model building that are crucial to achieving quality and practicality in universal NMT. While we prototype a high-quality universal translation system, our extensive empirical analysis exposes issues that need to be further addressed, and we suggest directions for future research.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [9.522202491760254, 0.744349479675293], "openalex_id": "https://openalex.org/W2857028992", "title": "A survey on recent advances in named entity recognition", "authors": "Vikas Yadav, Steven Bethard", "abstract": "Named Entity Recognition (NER) is a key component in NLP systems for question answering, information retrieval, relation extraction, etc. NER systems have been studied and developed widely for decades, but accurate systems using deep neural networks (NN) have only been introduced in the last few years. We present a comprehensive survey of deep neural network architectures for NER, and contrast them with previous approaches to NER based on feature engineering and other supervised or semi-supervised learning algorithms. Our results highlight the improvements achieved by neural networks, and show how incorporating some of the lessons learned from past work on feature-based NER systems can yield further improvements.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [6.2728047370910645, 5.787881374359131], "openalex_id": "https://openalex.org/W3035688398", "title": "Scaling Language-Free Visual Representation Learning", "authors": "Zhe Gan, Yen-Chun Chen, Linjie Li, Chen Zhu, Yu Cheng, Jingjing Liu", "abstract": "We present VILLA, the first known effort on large-scale adversarial training for vision-and-language (V+L) representation learning. VILLA consists of two training stages: (i) task-agnostic adversarial pre-training; followed by (ii) task-specific adversarial finetuning. Instead of adding adversarial perturbations on image pixels and textual tokens, we propose to perform adversarial training in the embedding space of each modality. To enable large-scale training, we adopt the \"free\" adversarial training strategy, and combine it with KL-divergence-based regularization to promote higher invariance in the embedding space. We apply VILLA to current best-performing V+L models, and achieve new state of the art on a wide range of tasks, including Visual Question Answering, Visual Commonsense Reasoning, Image-Text Retrieval, Referring Expression Comprehension, Visual Entailment, and NLVR2.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [5.761961936950684, -1.0942233800888062], "openalex_id": "https://openalex.org/W4287200699", "title": "Multilingual Language Models: Analysis and Algorithms", "authors": "Francesco Barbieri, Luis Espinosa-Anke, Jos\u00e9 Camacho-Collados", "abstract": "Language models are ubiquitous in current NLP, and their multilingual capacity has recently attracted considerable attention. However, current analyses have almost exclusively focused on (multilingual variants of) standard benchmarks, and have relied on clean pre-training and task-specific corpora as multilingual signals. In this paper, we introduce XLM-T, a model to train and evaluate multilingual language models in Twitter. In this paper we provide: (1) a new strong multilingual baseline consisting of an XLM-R (Conneau et al. 2020) model pre-trained on millions of tweets in over thirty languages, alongside starter code to subsequently fine-tune on a target task; and (2) a set of unified sentiment analysis Twitter datasets in eight different languages and a XLM-T model fine-tuned on them.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.011539936065674, -2.1613969802856445], "openalex_id": "https://openalex.org/W4212869899", "title": "Addressing the Challenge of Online Health Misinformation: Detection, Retrieval, and Explainability", "authors": "Hye Kyung Kim, Edson C. Tandoc", "abstract": "The COVID-19 pandemic poses an unprecedented threat to global human wellbeing, and the proliferation of online misinformation during this critical period amplifies the challenge. This study examines consequences of exposure to online misinformation about COVID-19 preventions. Using a three-wave panel survey involving 1,023 residents in Singapore, the study found that exposure to online misinformation prompts engagement in self-reported misinformed behaviors such as eating more garlic and regularly rinsing nose with saline, while discouraging evidence-based prevention behaviors such as social distancing. This study further identifies information overload and misperception on prevention as important mechanisms that link exposure to online misinformation and these outcomes. The effects of misinformation exposure differ by individuals\u2019 eheath literacy level, suggesting the need for a health literacy education to minimize the counterproductive effects of misinformation online. This study contributes to theory-building in misinformation by addressing potential pathways of and disparity in its possible effects on behavior.", "venue": "Frontiers in Psychology", "label": 0}, {"loc": [3.980067014694214, -1.3123987913131714], "openalex_id": "https://openalex.org/W4210576753", "title": "Pandemic Surveillance Using Social Media Data, Natural Language Processing, and Machine Learning", "authors": "Abul Hasan, Mark Levene, David Weston, Renate Fromson, Nicolas Koslover, Tamara Levene", "abstract": "Background The COVID-19 pandemic has created a pressing need for integrating information from disparate sources in order to assist decision makers. Social media is important in this respect; however, to make sense of the textual information it provides and be able to automate the processing of large amounts of data, natural language processing methods are needed. Social media posts are often noisy, yet they may provide valuable insights regarding the severity and prevalence of the disease in the population. Here, we adopt a triage and diagnosis approach to analyzing social media posts using machine learning techniques for the purpose of disease detection and surveillance. We thus obtain useful prevalence and incidence statistics to identify disease symptoms and their severities, motivated by public health concerns. Objective This study aims to develop an end-to-end natural language processing pipeline for triage and diagnosis of COVID-19 from patient-authored social media posts in order to provide researchers and public health practitioners with additional information on the symptoms, severity, and prevalence of the disease rather than to provide an actionable decision at the individual level. Methods The text processing pipeline first extracted COVID-19 symptoms and related concepts, such as severity, duration, negations, and body parts, from patients\u2019 posts using conditional random fields. An unsupervised rule-based algorithm was then applied to establish relations between concepts in the next step of the pipeline. The extracted concepts and relations were subsequently used to construct 2 different vector representations of each post. These vectors were separately applied to build support vector machine learning models to triage patients into 3 categories and diagnose them for COVID-19. Results We reported macro- and microaveraged F1 scores in the range of 71%-96% and 61%-87%, respectively, for the triage and diagnosis of COVID-19 when the models were trained on human-labeled data. Our experimental results indicated that similar performance can be achieved when the models are trained using predicted labels from concept extraction and rule-based classifiers, thus yielding end-to-end machine learning. In addition, we highlighted important features uncovered by our diagnostic machine learning models and compared them with the most frequent symptoms revealed in another COVID-19 data set. In particular, we found that the most important features are not always the most frequent ones. Conclusions Our preliminary results show that it is possible to automatically triage and diagnose patients for COVID-19 from social media natural language narratives, using a machine learning pipeline in order to provide information on the severity and prevalence of the disease for use within health surveillance systems.", "venue": "Journal of Medical Internet Research", "label": 13}, {"loc": [6.131646156311035, 2.5538578033447266], "openalex_id": "https://openalex.org/W4221143046", "title": "Improving Arithmetical Reasoning of Language Models", "authors": "Jason Lee, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Ed H., Quoc V. Le, Denny Zhou", "abstract": "We explore how generating a chain of thought -- a series of intermediate reasoning steps -- significantly improves the ability of large language models to perform complex reasoning. In particular, we show how such reasoning abilities emerge naturally in sufficiently large language models via a simple method called chain of thought prompting, where a few chain of thought demonstrations are provided as exemplars in prompting. Experiments on three large language models show that chain of thought prompting improves performance on a range of arithmetic, commonsense, and symbolic reasoning tasks. The empirical gains can be striking. For instance, prompting a 540B-parameter language model with just eight chain of thought exemplars achieves state of the art accuracy on the GSM8K benchmark of math word problems, surpassing even finetuned GPT-3 with a verifier.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [4.936556816101074, -1.4175584316253662], "openalex_id": "https://openalex.org/W4361861240", "title": "Text Emotion Recognition Using Fast Text Word Embedding in Bi-Directional Gated Recurrent Unit", "authors": "Devi C. Akalya, Renuka D. Karthika, T. Harisudhan, V. K. Jeevanantham, J. Jhanani, Varshini S. Kavi", "abstract": "Emotions are states of readiness in the mind that result from evaluations of one's own thinking or events. Although almost all of the important events in our lives are marked by emotions, the nature, causes, and effects of emotions are some of the least understood parts of the human experience. Emotion recognition is playing a promising role in the domains of human-computer interaction and artificial intelligence. A human's emotions can be detected using a variety of methods, including facial gestures, blood pressure, body movements, heart rate, and textual data. From an application standpoint, the ability to identify human emotions in text is becoming more and more crucial in computational linguistics. In this work, we present a classification methodology based on deep neural networks. The Bi-directional Gated Recurrent Unit (Bi-GRU) employed here demonstrates its effectiveness on the Multimodal Emotion Lines Dataset (MELD) when compared to Convolutional Neural Networks (CNN) and Long Short-Term Memory (LSTM). For word encoding, a comparison of three pre-trained word embeddings namely Glove, Word2Vec, and fastText is made. The findings from the MELD corpus support the conclusion that fastText is the best word embedding for the proposed Bi-GRU model. The experiment utilized the \"glove.6B.300d\" vector space. It consists of two million word representations in 300 dimensions trained on Common Crawl with sub-word information (600 billion tokens). The accuracy scores of GloVe, Word2Vec, and fastText (300 dimensions each) are tabulated and studied in order to highlight the improved results with fastText on the MELD dataset tested. It is observed that the Bidirectional Gated Recurrent Unit (Bi-GRU) with fastText word embedding outperforms GloVe and Word2Vec with an accuracy of 79.7%.", "venue": "i-manager s Journal on Information Technology", "label": 0}, {"loc": [5.989729881286621, 0.6484003663063049], "openalex_id": "https://openalex.org/W4312934826", "title": "Natural Language Processing Journal", "authors": "Xieling Chen, Haoran Xie, Xiaohui Tao", "abstract": "The field of Natural Language Processing (NLP) has evolved with, and as well as influenced, recent advances in Artificial Intelligence (AI) and computing technologies, opening up new applications and novel interactions with humans. Modern NLP involves machines' interaction with human languages for the study of patterns and obtaining meaningful insights. NLP is increasingly receiving attention across academia and industry and demonstrates extraordinary opportunities and across AI applications (e.g., question answering, information retrieval, sentiment analysis, and recommender systems) and helps to deal with new tasks such as machine translation and reading comprehension, with real world performance improving all the time. This editorial first provides an overview of the field of NLP in terms of research grants, publication venues, and research topics. We then introduce the mission of Natural Language Processing Journal, a new NLP-focused Elsevier journal intended as a forum for researchers and practitioners to publish theoretical, practical, and methodological achievements related to trustworthy AI development and applications for analyzing, processing, and modeling human languages.", "venue": "Natural Language Processing Journal", "label": 9}, {"loc": [3.309551239013672, 1.857715129852295], "openalex_id": "https://openalex.org/W4226338178", "title": "Artificial intelligence in the news: how AI retools, rationalizes, and reshapes journalism and the public arena", "authors": "Mathias\u2010Felipe de\u2010Lima\u2010Santos, Wilson Ceron", "abstract": "In recent years, news media has been greatly disrupted by the potential of technologically driven approaches in the creation, production, and distribution of news products and services. Artificial intelligence (AI) has emerged from the realm of science fiction and has become a very real tool that can aid society in addressing many issues, including the challenges faced by the news industry. The ubiquity of computing has become apparent and has demonstrated the different approaches that can be achieved using AI. We analyzed the news industry\u2019s AI adoption based on the seven subfields of AI: (i) machine learning; (ii) computer vision (CV); (iii) speech recognition; (iv) natural language processing (NLP); (v) planning, scheduling, and optimization; (vi) expert systems; and (vii) robotics. Our findings suggest that three subfields are being developed more in the news media: machine learning, computer vision, and planning, scheduling, and optimization. Other areas have not been fully deployed in the journalistic field. Most AI news projects rely on funds from tech companies such as Google. This limits AI\u2019s potential to a small number of players in the news industry. We made conclusions by providing examples of how these subfields are being developed in journalism and presented an agenda for future research.", "venue": "Journalism and Media", "label": 0}, {"loc": [9.224180221557617, -0.881125271320343], "openalex_id": "https://openalex.org/W4211252716", "title": "Text Summarization of Medical Records", "authors": "S. Deepika, Lakshmi Krishna N, S. Shridevi", "abstract": "The method of reducing information from an original text document while maintaining the vital information is known as text summarizing. The amount of text data available has increased dramatically in recent years from a variety of sources. A large volume of text is an excellent source of information and knowledge of the source is essential for efficiently summarizing information that must be useful. Summarization facilitates the acquisition of vital and required information in a short period of time. Text summarization is required in a variety of domains, including news article summaries, email summaries and information summaries in the medical profession to track a patient's medical history for future treatment and so on. In summarization, there are two methods: extractive summarization and abstractive summarization. In this work, extractive summarization is used on the COVID-19 dataset. Different models and their results have been discussed.", "venue": "2021 Innovations in Power and Advanced Computing Technologies (i-PACT)", "label": 0}, {"loc": [5.766738414764404, 1.173925757408142], "openalex_id": "https://openalex.org/W3209125028", "title": "Neural Speech Tracking in EEG: Integrating Acoustics and Linguistics for Hearing Aid Users", "authors": "Marlies Gillis, Jonas Vanthornhout, Jonathan Z. Simon, Tom Francart, Christian Brodbeck, Marlies Gillis, Jonas Vanthornhout, Jonathan Z. Simon, Tom Francart, Christian Brodbeck", "abstract": "When listening to speech, our brain responses time lock to acoustic events in the stimulus. Recent studies have also reported that cortical responses track linguistic representations of speech. However, tracking of these representations is often described without controlling for acoustic properties. Therefore, the response to these linguistic representations might reflect unaccounted acoustic processing rather than language processing. Here, we evaluated the potential of several recently proposed linguistic representations as neural markers of speech comprehension. To do so, we investigated EEG responses to audiobook speech of 29 participants (22 females). We examined whether these representations contribute unique information over and beyond acoustic neural tracking and each other. Indeed, not all of these linguistic representations were significantly tracked after controlling for acoustic properties. However, phoneme surprisal, cohort entropy, word surprisal, and word frequency were all significantly tracked over and beyond acoustic properties. We also tested the generality of the associated responses by training on one story and testing on another. In general, the linguistic representations are tracked similarly across different stories spoken by different readers. These results suggests that these representations characterize the processing of the linguistic content of speech. SIGNIFICANCE STATEMENT For clinical applications, it would be desirable to develop a neural marker of speech comprehension derived from neural responses to continuous speech. Such a measure would allow for behavior-free evaluation of speech understanding; this would open doors toward better quantification of speech understanding in populations from whom obtaining behavioral measures may be difficult, such as young children or people with cognitive impairments, to allow better targeted interventions and better fitting of hearing devices.", "venue": "Journal of Neuroscience", "label": 0}, {"loc": [3.875359296798706, -1.358939528465271], "openalex_id": "https://openalex.org/W3197468465", "title": "Exploring Approaches for Measuring Risk in the News", "authors": "Yong-kyun Kim, Jean Luc Poncelet, Geum-Young Min, Jaekyung Lee, Yunjung Yang", "abstract": "This study aimed to investigate the different kinds of risks associated with the novel coronavirus infection in the Republic of Korea and how those risks have been changed by the countermeasures taken by the Korean authorities and citizens. To this end, the authors explored the official database of the Korea Centers for Disease Control and Prevention (KCDC) in order to extract risk-related data from January 2020 to April 2021, and then identified the disaster risks and countermeasures from the government press briefings and news media in the same period. Consequently, this study identified three important approaches to enhance the infectious disease response management. First, the government has to respond immediately, even when they lack information and knowledge about the new type of risk. Second, a multi-sectoral response must be prepared to cope with systemic risks. Third, the government should prioritize transparency, inclusive risk governance, and innovative technologies during the initial response stage against risks with high uncertainty and novelty. Aside from these approaches, the types of risks were divided into four categories based on the response measures: anticipated risk against which countermeasures can be planned in advance, lingering risk against which adaptive response should be taken promptly, amplified risk, and emerging risk; the last two risks require the established plan to be modified drastically in order to secure higher-level engagement and additional resources. Finally, the authors proposed a risk management flow that can be applied to an in-depth analysis of the intersection between risk and response.", "venue": "Progress in Disaster Science", "label": 0}, {"loc": [2.876758575439453, -0.1851080060005188], "openalex_id": "https://openalex.org/W3195874325", "title": "Dataset Growth", "authors": "Nahum Kiryati, Yuval E. Landau", "abstract": "Medical image analysis research requires medical image datasets. Nevertheless, due to various impediments, researchers have been described as \u201cdata starved\u201d. We hypothesize that implicit evolving community standards require researchers to use ever-growing datasets. In Phase I of this research, we scanned the MICCAI (Medical Image Computing and Computer-Assisted Intervention) conference proceedings from 2011 to 2018. We identified 907 papers involving human MRI, CT or fMRI datasets and extracted their sizes. The median dataset size had grown by 3\u201310 times from 2011 to 2018, depending on imaging modality. Statistical analysis revealed exponential growth of the geometric mean dataset size with an annual growth of 21% for MRI, 24% for CT and 31% for fMRI. Thereupon, we had issued a forecast for dataset sizes in MICCAI 2019 well before the conference. In Phase II of this research, we examined the MICCAI 2019 proceedings and analyzed 308 relevant papers. The MICCAI 2019 statistics compare well with the forecast. The revised annual growth rates of the geometric mean dataset size are 27% for MRI, 30% for CT and 32% for fMRI. We predict the respective dataset sizes in the MICCAI 2020 conference (that we have not yet analyzed) and the future MICCAI 2021 conference.", "venue": "Journal of Imaging", "label": 0}, {"loc": [4.099648475646973, 2.349146842956543], "openalex_id": "https://openalex.org/W3195109027", "title": "State of Title IX: A Knowledge Base for Title IX Documentation", "authors": "Kirsten Hextrum, Simran Kaur Sethi", "abstract": "At Title IX\u2019s 50th anniversary we address the contradictions embedded in liberal state reform. This anniversary provides a juncture to consider the limitations of seeking gender liberation through the state. While US law is often credited with revolutionizing athletic access for girls and women, we trace how the state stymied greater transformation efforts. Using poststructuralist and Black feminist state critiques, we show how Title IX utilized an assimilation approach to equity by inviting state domination into women\u2019s sports. This invitation expanded state power across four domains\u2014 definitional, protective, surveillance, and economic\u2014which retained rather than disrupted heteropatriarchal, White, capitalist, dominance. We conclude with suggestions to reignite a movement for women's liberation that reimagines gender-equitable sports beyond the state's control for the forthcoming 50 years.", "venue": "International Review for the Sociology of Sport", "label": 0}, {"loc": [2.781137704849243, 2.19110369682312], "openalex_id": "https://openalex.org/W3190968605", "title": "Using Different Modes of Correction to Improve Fairness", "authors": "Gali Musa, Mosab Alrashed, N M Muhammad", "abstract": "In the era of big data lean optimisation, a technologically advanced tool for Gas Turbine (GT) performance analysis is necessary. This need is due to big data taken every 5 min of about 4-years, totalling about 6 million data points (1.5\u00d74 data points) for a single unit of GE 7FA engine of 211MW capacity. The use of actual engine data to quantify degradation using a time-base is limited and challenging. This restraint is due to the influence of varying ambient conditions, compressor inlet conditions with different power settings and loads. This study presents a software development kit (SDK) for real-time processing and evaluation of engine data using different control modes with a statistical method of filtering the data using the inlet guide vane (IGV) opening. The study also examines the analysis of GT health monitoring using time-based machine-generated data. The output of this investigation includes obtaining an average degradation trend line for the power output and heat rate increase as a function of time that serves as the basis for the investigation of the economic analysis. Implementing the SDK has reduced the labour cost, enhanced the accuracy of the analysis, and improved the efficiency of service delivery. The validation of the result with the available literature indicated a fair deviation of 1.4%. The utilisation of data for fuel flow recorded for the standard and extended correction was 24.3% and 16.3% at a range of 9.06\u20139.43 kg/s and 8.95\u20139.31 kg/s, respectively. The average percentage of fouling degradation level amounted to a 7.2% reduction of power output and a 1.6% increase in heat rate for one year of operation.", "venue": "Energy Reports", "label": 0}, {"loc": [2.486734628677368, 2.4618730545043945], "openalex_id": "https://openalex.org/W3208378559", "title": "Journal of Business Academy", "authors": "Publisher Riccardo Patriarca", "abstract": "Academy of Strategic Management Journal (ASMJ) is a scholarly business and management publication that offers an open access platform to discuss the latest discoveries and innovations in this field. Affiliated to Allied Business Academies, the journal strictly adheres to double blind peer review policy to maintain the publication quality.\r\n\r\nWith an acceptance rate of 30%, the journal considers theoretical and empirical works in Management, Strategic Management, Health Care Management, Strategy and Leadership as research articles, reviews, case studies and commentaries for publication. For more details on the types of manuscripts published and the categories of research accepted are displayed in Journal Matrix section of this website.", "venue": "ADVANCED SCIENCES INDEX", "label": 0}, {"loc": [8.156587600708008, 1.0541383028030396], "openalex_id": "https://openalex.org/W3180230246", "title": "Information Retrieval with Dense and Sparse Representations", "authors": "Jimmy Lin, Xueguang Ma, Sheng-Chieh Lin, Jheng-Hong Yang, Ronak Pradeep, Rodrigo Nogueira", "abstract": "Pyserini is a Python toolkit for reproducible information retrieval research with sparse and dense representations. It aims to provide effective, reproducible, and easy-to-use first-stage retrieval in a multi-stage ranking architecture. Our toolkit is self-contained as a standard Python package and comes with queries, relevance judgments, pre-built indexes, and evaluation scripts for many commonly used IR test collections. We aim to support, out of the box, the entire research lifecycle of efforts aimed at improving ranking with modern neural approaches. In particular, Pyserini supports sparse retrieval (e.g., BM25 scoring using bag-of-words representations), dense retrieval (e.g., nearest-neighbor search on transformer-encoded representations), as well as hybrid retrieval that integrates both approaches. This paper provides an overview of toolkit features and presents empirical results that illustrate its effectiveness on two popular ranking tasks. Around this toolkit, our group has built a culture of reproducibility through shared norms and tools that enable rigorous automated testing.", "venue": "https://doi.org/10.1145/3404835.3463238", "label": 0}, {"loc": [9.22851848602295, -0.8935525417327881], "openalex_id": "https://openalex.org/W3152567075", "title": "Deep Learning Based Multi-document Summarization", "authors": "Congbo Ma", "abstract": "Multi-document summarization is one of the most important tasks in the field of Natural Language Processing (NLP) and it gains increasing attention in recent years. It aims to generate one summary across several topic-related documents. Compared with extractive summarization, abstractive summarization is more similar to human-written ones. Proposing effective and efficient abstractive multi-document summarization models is significant to the NLP community. Existing deep learning based multi-document summarization models rely on the exceptional ability of neural networks to extract distinct features. However, they have missed out important linguistic knowledge such as dependencies between words since linguistics information in texts is full of meaningful knowledge with respect to the input documents. Besides, how models automatically evaluate the quality of the summary is crucial to design a high-performance summarization model since the evaluation indicator objectively measures the effectiveness of a method. In this proposal, we bring forward two research questions and corresponding solutions for the abstractive multi-document summarization task.", "venue": "https://doi.org/10.1145/3404835.3463268", "label": 0}, {"loc": [7.556396961212158, -1.0477961301803589], "openalex_id": "https://openalex.org/W3122317902", "title": "Efficient Large Language Models with Conditional Computation", "authors": "Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan F\u0131rat, Yanping Huang, Maxim Krikun, Noam Shazeer, Zhifeng Chen", "abstract": "Neural network scaling has been critical for improving the model quality in many real-world machine learning applications with vast amounts of training data and compute. Although this trend of scaling is affirmed to be a sure-fire approach for better model quality, there are challenges on the path such as the computation cost,ease of programming, and efficient implementation on parallel devices. In this paper we demonstrate conditional computation as a remedy to the above mentioned impediments, and demonstrate its efficacy and utility. We make extensive use of GShard, a module composed of a set of lightweight annotation APIs and an extension to the XLA compiler to enable large scale models with up to trillions of parameters. GShard and conditional computation enable us to scale up multilingual neural machine translation Transformer model with Sparsely-Gated Mixture-of-Experts. We demonstrate that such a giant model with 600 billion parameters can efficiently be trained on 2048 TPU v3 cores in 4 days to achieve far superior quality for translation from 100 languages to English compared to the prior art.", "venue": "International Conference on Learning Representations", "label": 0}, {"loc": [3.719198703765869, -4.017682075500488], "openalex_id": "https://openalex.org/W3180212707", "title": "Viewpoints Detection in Political Speeches", "authors": "et. al. Priya", "abstract": "Hate speech is becoming a very imperative problem in social as well political context in the present era. It ultimately reflects an intolerance to difference (on the basis of ethnicity, caste and creed, religious views, race, political views, etc.). As a matter of fact, a data generator (user), who uses hate speech wants to emphasize their viewpoints and identity among others and a consequence of such activities somehow leads to hate deeds and conduct. Social media platforms with their wide approach have now become very powerful and influential to affect the psychology of people. The Internet, especially social media, acts as a \u201cturbo accelerator\u201d of hate speech in any context. It is a communication channel that plays a significant role both in opposing hate speech and amplifying it at the same time as well.According to standards for the Facebook community, \u201cHate Speech\u201d is classified as the text or speech that hurts emotions and attacks someone on the basis of their ethnicity, caste, nation of origin, religion, disability or some type of disease. Twitter, also provides a policy which applies to promoted tweets and prohibits the promotion of sensitive content. This work proposes the mining of web content for available political speeches and then classifying them as hate speech or benign speech. This paper also presents background on hate speech and its detection approaches.", "venue": "Turkish Journal of Computer and Mathematics Education (TURCOMAT)", "label": 0}, {"loc": [2.418530225753784, 1.5673094987869263], "openalex_id": "https://openalex.org/W3155263273", "title": "ARTIFICIAL INTELLIGENCE (AI) AND ETHICS OF EDUCATIONAL RESEARCH", "authors": "W. Holmes, Ka\u015bka Porayska\u2010Pomsta, Ken Holstein, Emma Sutherland, Toby T. Baker, Simon Buckingham Shum, Olga C. Santos, Ma. Mercedes T. Rodrigo, Mutlu Cukurova, Ig Ibert Bittencourt, Kenneth R. Koedinger", "abstract": "Abstract While Artificial Intelligence in Education (AIED) research has at its core the desire to support student learning, experience from other AI domains suggest that such ethical intentions are not by themselves sufficient. There is also the need to consider explicitly issues such as fairness, accountability, transparency, bias, autonomy, agency, and inclusion. At a more general level, there is also a need to differentiate between doing ethical things and doing things ethically, to understand and to make pedagogical choices that are ethical, and to account for the ever-present possibility of unintended consequences. However, addressing these and related questions is far from trivial. As a first step towards addressing this critical gap, we invited 60 of the AIED community\u2019s leading researchers to respond to a survey of questions about ethics and the application of AI in educational contexts. In this paper, we first introduce issues around the ethics of AI in education. Next, we summarise the contributions of the 17 respondents, and discuss the complex issues that they raised. Specific outcomes include the recognition that most AIED researchers are not trained to tackle the emerging ethical questions. A well-designed framework for engaging with ethics of AIED that combined a multidisciplinary approach and a set of robust guidelines seems vital in this context.", "venue": "International Journal of Artificial Intelligence in Education", "label": 0}, {"loc": [3.7700753211975098, -3.9574058055877686], "openalex_id": "https://openalex.org/W3155025376", "title": "Cross-Lingual Transfer Learning for Low-Resource Hate Speech Detection", "authors": "Irina Bigoulaeva, Viktor Hangya, Alexander Fraser", "abstract": "We address the task of automatic hate speech detection for low-resource languages. Rather than collecting and annotating new hate speech data, we show how to use cross-lingual transfer learning to leverage already existing data from higher-resource languages. Using bilingual word embeddings based classifiers we achieve good performance on the target language by training only on the source dataset. Using our transferred system we bootstrap on unlabeled target language data, improving the performance of standard cross-lingual transfer approaches. We use English as a high resource language and German as the target language for which only a small amount of annotated corpora are available. Our results indicate that cross-lingual transfer learning together with our approach to leverage additional unlabeled data is an effective way of achieving good performance on low-resource target languages without the need for any target-language annotations.", "venue": "https://www.aclweb.org/anthology/2021.ltedi-1.3/", "label": 0}, {"loc": [4.297099590301514, -2.294478178024292], "openalex_id": "https://openalex.org/W3160064206", "title": "A Novel Approach in Financial Fraud Detection: Addressing Data Imbalance With Prompt Engineering, Leveraging Large Language Model Embeddings for Fraud \u2026", "authors": "Hui Xia, Hui Ma", "abstract": "Abstract In finance and economic area, financial fraud detection plays an important role for both corporate management and capital market system. Feature extraction is one of the most important procedure in fraudulent firm detection. Current feature extraction approaches pay large amounts of attention on their financial attributes, which have explicitly limited the representation of \u2018normal\u2019 pictures of firms, and furthermore, reduced the financial fraud detection performance. Hence it is necessary to search for a better set of functions as features to represent firms more accurately. In this work, notice that the imitation behaviors among firms often happen in business management, while this structure patterns have not been utilized in f inancial fraud detection so far, we extract features under the constraint of both financial characters and structure patterns of firms. We also design three measurements to quantify the structure patterns. Experimental results have shown a great performance of the proposed approach.", "venue": "Journal of Physics Conference Series", "label": 34}, {"loc": [3.5318517684936523, -0.028796598315238953], "openalex_id": "https://openalex.org/W3127989310", "title": "Machine Learning and the Analysis of Culture", "authors": "Philipp Vitense, Elisa Kasbohm, Anne Klassen, Peter Gierschner, Phillip Trefz, Michael Weber, Wolfram Miekisch, Jochen K. Schubert, Petra M\u00f6bius, Petra Reinhold, Volkmar Liebscher, Heike K\u00f6hler", "abstract": "Analysis of volatile organic compounds (VOCs) is a novel approach to accelerate bacterial culture diagnostics of Mycobacterium avium subsp. paratuberculosis (MAP). In the present study, cultures of fecal and tissue samples from MAP-infected and non-suspect dairy cattle and goats were explored to elucidate the effects of sample matrix and of animal species on VOC emissions during bacterial cultivation and to identify early markers for bacterial growth. The samples were processed following standard laboratory procedures, culture tubes were incubated for different time periods. Headspace volume of the tubes was sampled by needle trap-micro-extraction, and analyzed by gas chromatography-mass spectrometry. Analysis of MAP-specific VOC emissions considered potential characteristic VOC patterns. To address variation of the patterns, a flexible and robust machine learning workflow was set up, based on random forest classifiers, and comprising three steps: variable selection, parameter optimization, and classification. Only a few substances originated either from a certain matrix or could be assigned to one animal species. These additional emissions were not considered informative by the variable selection procedure. Classification accuracy of MAP-positive and negative cultures of bovine feces was 0.98 and of caprine feces 0.88, respectively. Six compounds indicating MAP presence were selected in all four settings (cattle vs. goat, feces vs. tissue): 2-Methyl-1-propanol, 2-methyl-1-butanol, 3-methyl-1-butanol, heptanal, isoprene, and 2-heptanone. Classification accuracies for MAP growth-scores ranged from 0.82 for goat tissue to 0.89 for cattle feces. Misclassification occurred predominantly between related scores. Seventeen compounds indicating MAP growth were selected in all four settings, including the 6 compounds indicating MAP presence. The concentration levels of 2,3,5-trimethylfuran, 2-pentylfuran, 1-propanol, and 1-hexanol were indicative for MAP cultures before visible growth was apparent. Thus, very accurate classification of the VOC samples was achieved and the potential of VOC analysis to detect bacterial growth before colonies become visible was confirmed. These results indicate that diagnosis of paratuberculosis can be optimized by monitoring VOC emissions of bacterial cultures. Further validation studies are needed to increase the robustness of indicative VOC patterns for early MAP growth as a pre-requisite for the development of VOC-based diagnostic analysis systems.", "venue": "Frontiers in Veterinary Science", "label": 0}, {"loc": [3.9991202354431152, -2.2352709770202637], "openalex_id": "https://openalex.org/W3126000262", "title": "Detecting Linguistic Diversity on Social Media", "authors": "Marco Bastos, Shawn Walker, Michael Simeone", "abstract": "This article introduces a model for detecting low-quality information we refer to as the Index of Measured-diversity, Partisan-certainty, Ephemerality, and Domain (IMPED). The model purports that low-quality information is characterized by ephemerality, as opposed to quality content that is designed for permanence. The IMPED model leverages linguistic and temporal patterns in the content of social media messages and linked webpages to estimate a parametric survival model and the likelihood the content will be removed from the internet. We review the limitations of current approaches for the detection of problematic content, including misinformation and false news, which are largely based on fact checking and machine learning, and detail the requirements for a successful implementation of the IMPED model. The article concludes with a review of examples taken from the 2018 election cycle and the performance of the model in identifying low-quality information as a proxy for problematic content.", "venue": "American Behavioral Scientist", "label": 0}, {"loc": [8.123865127563477, 1.8409167528152466], "openalex_id": "https://openalex.org/W3153451655", "title": "Large-Scale Language Models", "authors": "Kang Min Yoo, Dongju Park, Jaewook Kang, Sang-Woo Lee, Woomyoung Park", "abstract": "Large-scale language models such as GPT-3 are excellent few-shot learners, allowing them to be controlled via natural text prompts. Recent studies report that prompt-based direct classification eliminates the need for fine-tuning but lacks data and inference scalability. This paper proposes a novel data augmentation technique that leverages large-scale language models to generate realistic text samples from a mixture of real samples. We also propose utilizing soft-labels predicted by the language models, effectively distilling knowledge from the large-scale language models and creating textual perturbations simultaneously. We perform data augmentation experiments on diverse classification tasks and show that our method hugely outperforms existing text augmentation methods. We also conduct experiments on our newly proposed benchmark to show that the augmentation effect is not only attributed to memorization. Further ablation studies and a qualitative analysis provide more insights into our approach.", "venue": "https://doi.org/10.18653/v1/2021.findings-emnlp.192", "label": 0}, {"loc": [6.106778144836426, 2.0484163761138916], "openalex_id": "https://openalex.org/W3093568530", "title": "Variational Autoencoding and Segmentation", "authors": "Chunchuan Lyu, Shay B. Cohen, Ivan Titov", "abstract": "Meaning Representations (AMR) are a broad-coverage semantic formalism which represents sentence meaning as a directed acyclic graph. To train most AMR parsers, one needs to segment the graph into subgraphs and align each such subgraph to a word in a sentence; this is normally done at preprocessing, relying on hand-crafted rules. In contrast, we treat both alignment and segmentation as latent variables in our model and induce them as part of end-to-end training. As marginalizing over the structured latent variables is infeasible, we use the variational autoencoding framework. To ensure end-to-end differentiable optimization, we introduce a differentiable relaxation of the segmentation and alignment problems. We observe that inducing segmentation yields substantial gains over using a \u2018greedy\u2019 segmentation heuristic. The performance of our method also approaches that of a model that relies on the segmentation rules of Lyu and Titov (2018), which were hand-crafted to handle individual AMR constructions.", "venue": "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", "label": 0}, {"loc": [7.265435218811035, -0.21324998140335083], "openalex_id": "https://openalex.org/W3214578205", "title": "Improving Parameter-Efficient Cross-Lingual Transfer for Low-Resource Languages", "authors": "Alan Ansell, Edoardo Maria Ponti, Jonas Pfeiffer, Sebastian Ruder, Goran Glava\u009a\u0161, Ivan Vuli\u0107, Anna Korhonen", "abstract": "Adapter modules have emerged as a general parameter-efficient means to specialize a pretrained encoder to new domains. Massively multilingual transformers (MMTs) have particularly benefited from additional training of language-specific adapters. However, this approach is not viable for the vast majority of languages, due to limitations in their corpus size or compute budgets. In this work, we propose MAD-G (Multilingual ADapter Generation), which contextually generates language adapters from language representations based on typological features. In contrast to prior work, our time- and space-efficient MAD-G approach enables (1) sharing of linguistic knowledge across languages and (2) zero-shot inference by generating language adapters for unseen languages. We thoroughly evaluate MAD-G in zero-shot cross-lingual transfer on part-of-speech tagging, dependency parsing, and named entity recognition. While offering (1) improved fine-tuning efficiency (by a factor of around 50 in our experiments), (2) a smaller parameter budget, and (3) increased language coverage, MAD-G remains competitive with more expensive methods for language-specific adapter training across the board. Moreover, it offers substantial benefits for low-resource languages, particularly on the NER task in low-resource African languages. Finally, we demonstrate that MAD-G's transfer performance can be further improved via: (i) multi-source training, i.e., by generating and combining adapters of multiple languages with available task-specific training data; and (ii) by further fine-tuning generated MAD-G adapters for languages with monolingual data.", "venue": "https://doi.org/10.18653/v1/2021.findings-emnlp.410", "label": 0}, {"loc": [2.75764536857605, 2.2605597972869873], "openalex_id": "https://openalex.org/W3149731118", "title": "Engineering AI Systems: Architecture and DevOps Essentials", "authors": "Jan Bosch, Helena Holmstr\u00f6m Olsson, Ivica Crnkovi\u0107", "abstract": "Artificial intelligence (AI) and machine learning (ML) are increasingly broadly adopted in industry. However, based on well over a dozen case studies, we have learned that deploying industry-strength, production quality ML models in systems proves to be challenging. Companies experience challenges related to data quality, design methods and processes, performance of models as well as deployment and compliance. We learned that a new, structured engineering approach is required to construct and evolve systems that contain ML/DL components. In this chapter, the authors provide a conceptualization of the typical evolution patterns that companies experience when employing ML as well as an overview of the key problems experienced by the companies that they have studied. The main contribution of the chapter is a research agenda for AI engineering that provides an overview of the key engineering challenges surrounding ML solutions and an overview of open items that need to be addressed by the research community at large.", "venue": "Advances in systems analysis, software engineering, and high performance computing book series", "label": 39}, {"loc": [2.457768201828003, 1.443772315979004], "openalex_id": "https://openalex.org/W3048626054", "title": "AI as a Teachers Assistant", "authors": "Jihyun Kim, Kelly Merrill, Kun Xu, Deanna D. Sellnow", "abstract": "An increase in demand for online education has led to the creation of a new technology, machine teachers, or artificial intelligence (AI) teaching assistants. In fact, AI teaching assistants have already been implemented in a small number of courses in the United States. However, little is known about how students will perceive AI teaching assistants. Thus, the present study investigated students' perceptions about AI teaching assistants in higher education by use of an online survey. Primary findings indicate that perceived usefulness of an AI teaching assistant and perceived ease of communication with an AI teaching assistant are key to understanding an eventual adoption of AI teaching assistant-based education. These findings provide support for AI teaching assistant adoption. Based on the present study's findings, more research is needed to better understand the nuances associated with the learning experience one may have from an AI teaching assistant.", "venue": "International Journal of Human-Computer Interaction", "label": 31}, {"loc": [5.073089599609375, -1.6752136945724487], "openalex_id": "https://openalex.org/W3039503982", "title": "Evaluating Word Embedding Methods for Sentiment Analysis", "authors": "Aytu\u011f Onan", "abstract": "Summary Sentiment analysis is one of the major tasks of natural language processing, in which attitudes, thoughts, opinions, or judgments toward a particular subject has been extracted. Web is an unstructured and rich source of information containing many text documents with opinions and reviews. The recognition of sentiment can be helpful for individual decision makers, business organizations, and governments. In this article, we present a deep learning\u2010based approach to sentiment analysis on product reviews obtained from Twitter. The presented architecture combines TF\u2010IDF weighted Glove word embedding with CNN\u2010LSTM architecture. The CNN\u2010LSTM architecture consists of five layers, that is, weighted embedding layer, convolution layer (where, 1\u2010g, 2\u2010g, and 3\u2010g convolutions have been employed), max\u2010pooling layer, followed by LSTM, and dense layer. In the empirical analysis, the predictive performance of different word embedding schemes (ie, word2vec, fastText, GloVe, LDA2vec, and DOC2vec) with several weighting functions (ie, inverse document frequency, TF\u2010IDF, and smoothed inverse document frequency function) have been evaluated in conjunction with conventional deep neural network architectures. The empirical results indicate that the proposed deep learning architecture outperforms the conventional deep learning methods.", "venue": "Concurrency and Computation Practice and Experience", "label": 33}, {"loc": [4.150399684906006, 2.222888708114624], "openalex_id": "https://openalex.org/W3023472716", "title": "Digital Threats", "authors": "Michael L. Miller, Cristian Vaccari", "abstract": "We introduce a special issue that collects eight articles, comprising research from twenty-three countries and four continents on the sources, impact on citizens, and possible remedies to various digital threats to democracy, ranging from disinformation to hate speech to state interference with online freedoms. We set these contributions against the backdrop of a profound change in how scholars think about the implications of digital media for democracy. From the utopianism that prevailed from the 1990s until the early 2010s, the post-2016 reckoning has led to a change in the kinds of questions scholars ask, with the focus gradually shifting to investigations of the threats, rather than the benefits, of the Internet. The eight contributions presented in this special issue employ a variety of disciplinary approaches and methods, often comparing different countries, to address some of the most pressing questions on how the Internet can hinder the feasibility and well-functioning of democracy around the world. We conclude by setting out three challenges for future research on digital media and politics: a growing but still partial understanding of the extent and impact of the main digital threats to democracy; the risk that the dominant approaches become overly pessimistic, or founded on weak normative grounds; and the risk that research overemphasizes direct and short-term implications of digital threats on individuals and specific groups at the expense of indirect and medium-term effects on collective norms and expectations of behavior.", "venue": "The International Journal of Press/Politics", "label": 0}, {"loc": [5.497536659240723, 2.0196280479431152], "openalex_id": "https://openalex.org/W3013607465", "title": "The Influence of Audiovisual Semantics on Attention", "authors": "Daria Kvasova", "abstract": "In our everyday life we must effectively orient attention to relevant\\n\\t\\t\\t\\t objects and events in multisensory environments. The impact of\\n\\t\\t\\t\\t cross-modal links for attention orienting to spatial and temporal cues\\n\\t\\t\\t\\t has been widely described. However, real-life scenarios provide a\\n\\t\\t\\t\\t rich web of semantic information through the different sensory\\n\\t\\t\\t\\t modalities. Despite some previous studies have revealed an impact\\n\\t\\t\\t\\t of crossmodal sematic correspondences, the results are mixed with\\n\\t\\t\\t\\t regard to the conditions in which audiovisual semantic congruence\\n\\t\\t\\t\\t can influence attention orienting. Furthermore, the vast majority of\\n\\t\\t\\t\\t the research on crossmodal semantics used simple, stereotyped\\n\\t\\t\\t\\t displays that are far from achieving ecological validity.\\n\\t\\t\\t\\t The present thesis attempts to close this gap by addressing the role of\\n\\t\\t\\t\\t identity-based crossmodal relationships on attention orienting in\\n\\t\\t\\t\\t scenarios closer to real-world conditions. To this end, the\\n\\t\\t\\t\\t experiments presented here attempt to extrapolate and generalize\\n\\t\\t\\t\\t previous findings in more realistic environments by using naturalistic\\n\\t\\t\\t\\t and dynamic stimuli, and address the theoretical questions of task\\n\\t\\t\\t\\t relevance and perceptual load. The outcome of the three empirical\\n\\t\\t\\t\\t studies in this thesis lead to several conclusions. First, that the effect\\n\\t\\t\\t\\t of audio-visual semantic congruence on attention is not strictly\\n\\t\\t\\t\\t automatic. Instead, they suggest that some top-down processing is\\n\\t\\t\\t\\t necessary for audio-visual semantic congruence to trigger spatial\\n\\t\\t\\t\\t orienting. The second conclusion to emerge is that crossmodal\\n\\t\\t\\t\\t semantic congruence can guide attention under goal-directed\\n\\t\\t\\t\\t conditions in visual search, and also under free observation in \\n\\t\\t\\t\\t complex and dynamic scenes. Third, that perceptual load is a limiting\\n\\t\\t\\t\\t factor for these interactions. These findings extend previous\\n\\t\\t\\t\\t knowledge on object-based crossmodal interactions with simple\\n\\t\\t\\t\\t stimuli and clarify how audio-visual semantically congruent\\n\\t\\t\\t\\t relationships play out in realistic scenarios.", "venue": "TDX (Tesis Doctorals en Xarxa)", "label": 0}, {"loc": [3.7072832584381104, 3.911466360092163], "openalex_id": "https://openalex.org/W3099729825", "title": "Imitation Attacks: Extracting and Exploiting Model Capabilities", "authors": "Eric Wallace, Mitchell Stern, Dawn Song", "abstract": "Adversaries may look to steal or attack black-box NLP systems, either for financial gain or to exploit model errors. One setting of particular interest is machine translation (MT), where models have high commercial value and errors can be costly. We investigate possible exploitations of black-box MT systems and explore a preliminary defense against such threats. We first show that MT systems can be stolen by querying them with monolingual sentences and training models to imitate their outputs. Using simulated experiments, we demonstrate that MT model stealing is possible even when imitation models have different input data or architectures than their target models. Applying these ideas, we train imitation models that reach within 0.6 BLEU of three production MT systems on both high-resource and low-resource language pairs. We then leverage the similarity of our imitation models to transfer adversarial examples to the production systems. We use gradient-based attacks that expose inputs which lead to semantically-incorrect translations, dropped content, and vulgar model outputs. To mitigate these vulnerabilities, we propose a defense that modifies translation outputs in order to misdirect the optimization of imitation models. This defense degrades the adversary\u2019s BLEU score and attack success rate at some cost in the defender\u2019s BLEU and inference speed.", "venue": "https://doi.org/10.18653/v1/2020.emnlp-main.446", "label": 0}, {"loc": [7.518259525299072, -1.0329556465148926], "openalex_id": "https://openalex.org/W3017454464", "title": "Multilingual Models in Neural Machine Translation", "authors": "Biao Zhang, Philip Williams, Ivan Titov, Rico Sennrich", "abstract": "Massively multilingual models for neural machine translation (NMT) are theoretically attractive, but often underperform bilingual models and deliver poor zero-shot translations. In this paper, we explore ways to improve them. We argue that multilingual NMT requires stronger modeling capacity to support language pairs with varying typological characteristics, and overcome this bottleneck via language-specific components and deepening NMT architectures. We identify the off-target translation issue (i.e. translating into a wrong target language) as the major source of the inferior zero-shot performance, and propose random online backtranslation to enforce the translation of unseen training language pairs. Experiments on OPUS-100 (a novel multilingual dataset with 100 languages) show that our approach substantially narrows the performance gap with bilingual models in both one-to-many and many-to-many settings, and improves zero-shot performance by ~10 BLEU, approaching conventional pivot-based methods.", "venue": "https://doi.org/10.18653/v1/2020.acl-main.148", "label": 0}, {"loc": [3.2204833030700684, -0.7279372811317444], "openalex_id": "https://openalex.org/W3000355930", "title": "A Feasibility Study & Implementation", "authors": "Lesley Andrade, Kathy Moran, Susan J. Snelling, Darshaka Malaviarachchi, Joanne Beyers, Kelsie Near, Janis Randall Simpson", "abstract": " Introduction Primary care providers have a role to play in supporting the development of healthy eating habits, particularly in a child\u2019s early years. This study examined the feasibility of implementing the NutriSTEP\u00ae screen\u2014a 17-item nutrition risk screening tool validated for use with both toddler and preschooler populations\u2014integrated with an electronic medical record (EMR) in primary care practices in Ontario, Canada, to inform primary care decision-making and public health surveillance. Methods Five primary care practices implemented the NutriSTEP screen as a standardized form into their EMRs. To understand practitioners\u2019 experiences with delivery and assess factors associated with successful implementation, we conducted semi-structured qualitative interviews with primary care providers who were most knowledgeable about NutriSTEP implementation at their site. We assessed the quality of the extracted patient EMR data by determining the number of fully completed NutriSTEP screens and documented growth measurements of children. Results Primary care practices implemented the NutriSTEP screen as part of a variety of routine clinical contacts; specific data collection processes varied by site. Valid NutriSTEP screen data were captured in the EMRs of 80% of primary care practices. Approximately 90% of records had valid NutriSTEP screen completions and 70% of records had both valid NutriSTEP screen completions and valid growth measurements. Conclusion Integration of NutriSTEP as a standardized EMR form is feasible in primary care practices, although implementation varied in our study. The application of EMR-integrated NutriSTEP screening as part of a comprehensive childhood healthy weights surveillance system warrants further exploration. ", "venue": "Health Promotion and Chronic Disease Prevention in Canada", "label": 0}, {"loc": [4.313401699066162, 1.8571186065673828], "openalex_id": "https://openalex.org/W3160905327", "title": "A Beginner's Guide to Large Language Models", "authors": "Paul B. Courtright, Mary Ellis Gibson", "abstract": "Protestant Evangelism in India:A Beginner's Guide Paul Courtright (bio) and Mary Ellis Gibson (bio) This essay indicates the main forces in British missionary efforts in India in the long nineteenth century. It thus replicates problems in the larger historiography of Christianity in India. The Church History Association of India has since 1974 dramatically reoriented historical scholarship\u2014away from the mission history of India and in favour of a sociocultural history of Christian people in India (from the Syriac Christians of South India to the Dalit Christians of contemporary India). Our approach is something of a reversion to prior models because we wish to trace basic connections between metropolitan and colonial histories. We urge readers who are interested in a fuller picture to consult Webster's Historiography. For the East India Company (EIC), the long nineteenth century ran from 1756\u2014when British military and civil authorities achieved de facto sovereignty over the Mughal-affiliated region of Bengal\u2014to the uprising of 1857. By 1800, British religious ideology supposed that India would inevitably be drawn toward the twin lights of modernity and the Gospel. Many in Britain believed that their gift to India would be the redemption of Indian souls and [End Page 186] a better world grounded\u2014contradictorily\u2014in Utilitarian reform, capitalism, and evangelical Christianity. Although there was substantial Protestant and long-standing Catholic missionary evangelism in India, the 1813 renewal of the EIC charter gave explicit permission for Protestant missionaries to establish churches for Hindus and others. The EIC, however, remained ambivalent about missionaries' polemical interpretation of Indian religions. Evangelicals in Britain during this period were feeling a \"warming,\" as John Wesley had put it, giving rise to renewed evangelical zeal. Missionary organizations and publications emerged within and outside established denominations. They included the Baptist Missionary Society (1792); the London Missionary Society (the interdenominational but largely Congregational missionary arm of British Christianity, founded in 1795); and the Church Missionary Society (the competing arm of the Anglican church, officially founded in 1799). The Baptists had, notably, sent William Carey (1761\u20131834) and Joshua Marshman (1768\u20131837) to India, ultimately to Serampore, a Danish enclave a few miles from Calcutta. They and their helpers established the Baptist Mission Press, a thriving business that translated the Bible into Indian and Asian languages along with much other work. By 1814, translators, working alongside Indian and British type founders and compositors, had printed the New Testament (and often portions of the Hebrew scriptures) in all major Indian languages. Carey, owing to his linguistic facility, became, in 1800\u2014despite the EIC's reluctance\u2014professor of Sanskrit, Bengali, and Marathi at Fort William College (Calcutta), which had been founded to train young company officials. Carey's efforts were supported by Claudius Buchanan, an Anglican chaplain of the EIC and vice-provost of the college. In their alliance, as in the financial support of the Baptist Mission Press, denominational differences often gave way to a common mission. The Evangelical wing of the Anglican Church, under the influence of Charles Simeon (1759\u20131836), of Cambridge, sent many young men into the EIC chaplaincy. In these roles, they engaged in evangelism both to company employees and to local communities. The best known among these chaplains was Henry Martyn (1781\u20131812, later accorded a day in the Anglican calendar of lesser festivals), who became the subject after his death of hagiographical biography and numerous poems, including one by Thomas Babington Macaulay, and was moreover the model for Charlotte Bront\u00eb's St. John Rivers in Jane Eyre. It is unclear whether Martyn's efforts resulted in any conversions\u2014but the memoir of his life had a legacy in the writings of the popular author of children's books, Mary Martha Butt Sherwood. His legacy remains today in the Henry Martyn Institute (hmiindia.org), which promotes inter-religious dialogue and mutual understanding. Although Anglicans, Baptists, various Dissenters, and members of the Scottish Church collaborated in India, differences in theology and social class [End Page 187] never disappeared and were especially marked in British and Parliamentary debates. In the years before the 1813 charter renewal, for instance, the Edinburgh Review mocked Carey and his colleagues as a \"nest of consecrated cobblers...", "venue": "Victorian review", "label": 0}, {"loc": [3.153912305831909, 2.221557140350342], "openalex_id": "https://openalex.org/W2989835304", "title": "Law, Technology and Humans", "authors": "Kieran Tranter", "abstract": "Law, Technology and Humans aims for something different from the mainstream of technology law scholarship. Rather than repeating analysis born from the dominant narrative, it boldly presents itself as a portal to the multiverse of stories and methods through which to understand, dream, critique, build and live well in the technological present as it, with every planetary rotation, moves towards the technological future.", "venue": "Law Technology and Humans", "label": 0}, {"loc": [6.130038261413574, 5.642125606536865], "openalex_id": "https://openalex.org/W2986763679", "title": "Multimodal Learning for Visual Question Answering using World Knowledge", "authors": "Avikalp Srivastava, Hsin-Wen Liu, Sumio Fujita", "abstract": "Question categorization and expert retrieval methods have been crucial for information organization and accessibility in community question & answering (CQA) platforms. Research in this area, however, has dealt with only the text modality. With the increasingly multimodal nature of web content, we focus on extending these methods for CQA questions accompanied by images. Specifically, we leverage the success of representation learning for text and images in the visual question answering (VQA) domain and adapt the underlying concept and architecture for automated category classification and expert retrieval on image-based questions posted on Yahoo! Chiebukuro, the Japanese counterpart of Yahoo! Answers. To the best of our knowledge, this is the first work to tackle the multimodality challenge in CQA, and to adapt VQA models for tasks on a more ecologically valid source of visual questions. Our analysis of the differences between visual QA and community QA data drives our proposal of novel augmentations of an attention method tailored for CQA and use of auxiliary tasks for learning better grounding features. Our final model markedly outperforms the text-only and VQA model baselines for both tasks of classification and expert retrieval on real-world multimodal CQA data.", "venue": "https://doi.org/10.1145/3357384.3358000", "label": 0}, {"loc": [4.030935764312744, 4.346617221832275], "openalex_id": "https://openalex.org/W2980904829", "title": "Cache Me If You Can: The Case For Retrieval Augmentation in Federated Learning", "authors": "Giovane C. M. Moura, John Heidemann, Ricardo de O. Schmidt, Wes Hardaker", "abstract": "DNS depends on extensive caching for good performance, and every DNS zone owner must set Time-to-Live (TTL) values to control their DNS caching. Today there is relatively little guidance backed by research about how to set TTLs, and operators must balance conflicting demands of caching against agility of configuration. Exactly how TTL value choices affect operational networks is quite challenging to understand due to interactions across the distributed DNS service, where resolvers receive TTLs in different ways (answers and hints), TTLs are specified in multiple places (zones and their parent's glue), and while DNS resolution must be security-aware. This paper provides the first careful evaluation of how these multiple, interacting factors affect the effective cache lifetimes of DNS records, and provides recommendations for how to configure DNS TTLs based on our findings. We provide recommendations in TTL choice for different situations, and for where they must be configured. We show that longer TTLs have significant promise in reducing latency, reducing it from 183 ms to 28.7 ms for one country-code TLD.", "venue": "https://doi.org/10.1145/3355369.3355568", "label": 0}, {"loc": [2.620281934738159, 2.774780750274658], "openalex_id": "https://openalex.org/W2978570193", "title": "The Science of Data Filtering: Data Curation cannot be Compute Agnostic", "authors": "Valerie A. Steen, Chris S. Elphick, Morgan W. Tingley", "abstract": "Abstract Aim Citizen science data are increasingly used for modelling species distributions because they offer broad spatiotemporal coverage of local observations. However, such data are often collected without experimental design or set survey methods, raising the risk that bias and noise will compromise modelled predictions. We tested the ability of species distribution models (SDMs) built from these low\u2010structure citizen science data to match the quality of SDMs from systematically collected data and tested whether stringent data filtering improved predictions. Location Northeastern USA. Methods We evaluated models built from a rapidly growing dataset of avian occurrences reported by birders\u2014eBird\u2014against models built from four independent, systematically collected datasets. We developed SDMs for 96 species using both data sources and compared their predictive abilities. We also tested whether culling eBird data by applying stringent data filters on survey effort or observer expertise improved predictions. Results We found that SDMs built from low\u2010structure citizen science data matched or exceeded performance of SDMs from systematically collected datasets for 12%\u201331% of species (= 22%), depending on the dataset. At least one culling option produced equivalent or better performance for 40%\u201370% of species (= 49%). Data culling by restricting survey effort improved predictions more than restricting by observer expertise. The optimal effort restriction differed by dataset, and for three of the datasets was further informed by species traits. Main conclusions Species distribution models developed using low\u2010structure citizen science data sometimes performed as well as those from systematic data. Culling generally improved models, but results were heterogeneous, prohibiting clear recommendations for how to cull. Our results indicate that the growing availability of citizen science data holds potential for creating high\u2010quality spatial predictions, but that time should be invested in determining how best to cull datasets and that one\u2010size\u2010fits\u2010all solutions beyond basic outlier filtering may be hard to find.", "venue": "Diversity and Distributions", "label": 0}, {"loc": [3.2010841369628906, -0.12164506316184998], "openalex_id": "https://openalex.org/W2921763762", "title": "Beyond the Black Box: Optimization Within Latent Spaces", "authors": "David Watson, Jenny Krutzinna, Ian N Bruce, C.E.M. Griffiths, Iain B. McInnes, Michael R. Barnes, Luciano Floridi", "abstract": "To maximise the clinical benefits of machine learning algorithms, we need to rethink our approach to explanation, argue David Watson and colleagues.", "venue": "BMJ", "label": 0}, {"loc": [7.205721855163574, 0.652904212474823], "openalex_id": "https://openalex.org/W2953958347", "title": "\" Transfer Learning in Natural Language Processing: Overcoming Low-Resource Challenges", "authors": "Sebastian Ruder, Matthew E. Peters, Swabha Swayamdipta, Thomas Wolf", "abstract": "The classic supervised machine learning paradigm is based on learning in isolation, a single predictive model for a task using a single dataset. This approach requires a large number of training examples and performs best for well-defined and narrow tasks. Transfer learning refers to a set of methods that extend this approach by leveraging data from additional domains or tasks to train a model with better generalization properties. Over the last two years, the field of Natural Language Processing (NLP) has witnessed the emergence of several transfer learning methods and architectures which significantly improved upon the state-of-the-art on a wide range of NLP tasks. These improvements together with the wide availability and ease of integration of these methods are reminiscent of the factors that led to the success of pretrained word embeddings and ImageNet pretraining in computer vision, and indicate that these methods will likely become a common tool in the NLP landscape as well as an important research direction. We will present an overview of modern transfer learning methods in NLP, how models are pre-trained, what information the representations they learn capture, and review examples and case studies on how these models can be integrated and adapted in downstream NLP tasks.", "venue": "https://doi.org/10.18653/v1/n19-5004", "label": 0}, {"loc": [8.778044700622559, 0.5518919825553894], "openalex_id": "https://openalex.org/W2950336186", "title": "Reasoning for fact verification using language models", "authors": "Jie Zhou, Xu Han, Cheng Yang, Zhiyuan Liu, Lifeng Wang, Changcheng Li, Maosong Sun", "abstract": "Fact verification (FV) is a challenging task which requires to retrieve relevant evidence from plain text and use the evidence to verify given claims. Many claims require to simultaneously integrate and reason over several pieces of evidence for verification. However, previous work employs simple models to extract information from evidence without letting evidence communicate with each other, e.g., merely concatenate the evidence for processing. Therefore, these methods are unable to grasp sufficient relational and logical information among the evidence. To alleviate this issue, we propose a graph-based evidence aggregating and reasoning (GEAR) framework which enables information to transfer on a fully-connected evidence graph and then utilizes different aggregators to collect multi-evidence information. We further employ BERT, an effective pre-trained language representation model, to improve the performance. Experimental results on a large-scale benchmark dataset FEVER have demonstrated that GEAR could leverage multi-evidence information for FV and thus achieves the promising result with a test FEVER score of 67.10%. Our code is available at https://github.com/thunlp/GEAR.", "venue": "https://doi.org/10.18653/v1/p19-1085", "label": 0}, {"loc": [6.901772975921631, 0.06853978335857391], "openalex_id": "https://openalex.org/W2971095815", "title": "Adversarial Learning for Cross-Lingual Word Embeddings", "authors": "Haozhou Wang, James Henderson, Paola Merlo", "abstract": "Distributed representations of words which map each word to a continuous vector have proven useful in capturing important linguistic information not only in a single language but also across different languages. Current unsupervised adversarial approaches show that it is possible to build a mapping matrix that aligns two sets of monolingual word embeddings without high quality parallel data, such as a dictionary or a sentence-aligned corpus. However, without an additional step of refinement, the preliminary mapping learnt by these methods is unsatisfactory, leading to poor performance for typologically distant languages. In this paper, we propose a weakly-supervised adversarial training method to overcome this limitation, based on the intuition that mapping across languages is better done at the concept level than at the word level. We propose a concept-based adversarial training method which improves the performance of previous unsupervised adversarial methods for most languages, and especially for typologically distant language pairs.", "venue": "https://doi.org/10.18653/v1/d19-1450", "label": 0}, {"loc": [3.536503553390503, -0.025552185252308846], "openalex_id": "https://openalex.org/W2903010511", "title": "14 Deep Field of Learning Artificial Impacts Intelligence in the", "authors": "Venkat Venkatasubramanian", "abstract": "The current excitement about artificial intelligence (AI), particularly machine learning (ML), is palpable and contagious. The expectation that AI is poised to \"revolutionize,\" perhaps even take over, humanity has elicited prophetic visions and concerns from some luminaries.1-4 There is also a great deal of interest in the commercial potential of AI, which is attracting significant sums of venture capital and state-sponsored investment globally, particularly in China.5 McKinsey, for instance, predicts the potential commercial impact of AI in several domains, envisioning markets worth trillions of dollars.6 All this is driven by the sudden, explosive, and surprising advances AI has made in the last 10 years or so. AlphaGo, autonomous cars, Alexa, Watson, and other such systems, in game playing, robotics, computer vision, speech recognition, and natural language processing are indeed stunning advances. But, as with earlier AI breakthroughs, such as expert systems in the 1980s and neural networks in the 1990s, there is also considerable hype and a tendency to overestimate the promise of these advances, as market research firm Gartner and others have noted about emerging technology.7 It is quite understandable that many chemical engineers are excited about the potential applications of AI, and ML in particular,8 for use in such applications as catalyst design.9-11 It might seem that this prospect offers a novel approach to challenging, long-standing problems in chemical engineering using AI. However, the use of AI in chemical engineering is not new\u2014it is, in fact, a 35-year-old ongoing program with some remarkable successes along the way. This article is aimed broadly at chemical engineers who are interested in the prospects for AI in our domain, as well as at researchers new to this area. The objectives of this article are threefold. First, to review the progress we have made so far, highlighting past efforts that contain valuable lessons for the future. Second, drawing on these lessons, to identify promising current and future opportunities for AI in chemical engineering. To avoid getting caught up in the current excitement and to assess the prospects more carefully, it is important to take such a longer and broader view, as a \"reality check.\" Third, since AI is going to play an increasingly dominant role in chemical engineering research and education, it is important to recount and record, however incomplete, certain early milestones for historical purposes. It is apparent that chemical engineering is at an important crossroads. Our discipline is undergoing an unprecedented transition\u2014one that presents significant challenges and opportunities in modeling and automated decision-making. This has been driven by the convergence of cheap and powerful computing and communications platforms, tremendous progress in molecular engineering, the ever-increasing automation of globally integrated operations, tightening environmental constraints, and business demands for speedier delivery of goods and services to market. One important outcome from this convergence is the generation, use, and management of massive amounts of diverse data, information, and knowledge, and this is where AI, particularly ML, would play an important role. Some of these are application-focused, such as game playing and vision. Others are methodological, such as expert systems and ML\u2014the two branches that are most directly and immediately applicable to our domain, and hence the focus of this article. These are the ones that have been investigated the most in the last 35 years by AI researchers in chemical engineering. While the current \"buzz\" is mostly around ML, the expert system framework holds important symbolic knowledge representation concepts and inference techniques that could prove useful in the years ahead as we strive to develop more comprehensive solutions that go beyond the purely data-centric emphasis of ML. Many tasks in these different branches of AI share certain common features. They all require pattern recognition, reasoning, and decision-making under complex conditions. And they often deal with ill-defined problems, noisy data, model uncertainties, combinatorially large search spaces, nonlinearities, and the need for speedy solutions. But such features are also found in many problems in process systems engineering (PSE)\u2014in synthesis, design, control, scheduling, optimization, and risk management. So, some of us thought, in the early-1980s, that we should examine such problems from an AI perspective.15-17 Just as it is today, the excitement about AI at that time was centered on expert systems. It was palpable and contagious, with high expectations for AI's near-term potential.18-20 Hundreds of millions of dollars were invested in AI start-ups as well as within large companies. AI spurred the development of special purpose hardware, called Lisp machines (e.g., Symbolics Lisp machines). Promising proof-of-concept systems were demonstrated in many domains, including chemical engineering (see below). In this phase, it was expected that AI would have a significant impact in chemical engineering in the near future. However, unlike optimization and model predictive control, AI did not quite live up to its early promise. So, what happened? Why was not AI as impactful? Before addressing this question, it is necessary to examine the different phases of AI, as I classify them, in chemical engineering. While major efforts to developing AI methods for chemical engineering problems started in the early 1980s, it is remarkable that some researchers (for instance, Gary Powers, Dale Rudd, and Jeff Siirola) were investigating AI in PSE in the late 1960s and early 1970s.21 In particular, the Adaptive Initial DEsign Synthesizer system, developed by Siirola and Rudd22 for process synthesis, represents a significant development. This was arguably the first system that employed AI methods such as means-and-ends analysis, symbolic manipulation, and linked data structures in chemical engineering. Phase I, the Expert Systems Era (from the early 1980s through the mid-1990s), saw the first broad effort to exploit AI in chemical engineering. Expert systems, also called knowledge-based systems, rule-based systems, or production systems, are computer programs that mimic the problem-solving of humans with expertise in a given domain.23, 24 Expert problem-solving typically involves large amounts of specialized knowledge, called domain knowledge, often in the form of rules of thumb, called heuristics, typically learned and refined over years of problem-solving experience. The amount of knowledge manipulated is often vast, and the expert system rapidly narrows down the search by recognizing patterns and by using the appropriate heuristics. The architecture of these systems was inspired by the stimulus\u2013response model of cognition from psychology and pattern-matching-and-search model of symbolic computation, which originated in Emil Post's work in symbolic logic. Building on this work, Simon and Newell in the late 1960s and 1970s devised the production system framework, an important conceptual, representational, and architectural breakthrough, for developing expert systems.25-27 The crucial insight here was that one needs to, and one can, separate domain knowledge from its order of execution, that is, from search or inference, thereby achieving the necessary computational flexibility to address ill-structured problems. In contrast, conventional programs consist of a set of statements whose order of execution is predetermined. Therefore, if the execution order is not known or cannot be anticipated a priori, as in the case of medical diagnosis, for example, this approach will not work. Expert systems programming alleviated this problem by making a clear distinction between the knowledge base and the search or inference strategy. This not only allowed for flexible execution, it also facilitated the incremental addition of knowledge, without distorting the overall program structure. This rule-based knowledge representation and architecture are intuitive, and relatively easy to understand and generate explanations about the system's decisions. This new approach facilitated the development of a number of impressive expert systems, starting with MYCIN, an expert system for diagnosing infectious diseases28 developed at Stanford University during 1972\u201382. This led to other successful systems such as PROSPECTOR (for mineral prospecting29), R1 (configuring Vax computers30), and so on, in this era. These systems inspired the first expert system application in chemical engineering, CONPHYDE, developed in 1983 by Ba\u00f1ares-Alc\u00e1ntara, Westerberg, and Rychner at Carnegie Mellon16 for predicting thermophysical properties of complex fluid mixtures. CONPHYDE was implemented using Knowledge Acquisition System that was used for PROSPECTOR. This was quickly followed by DECADE, in 1985, again from the same CMU researchers,17 for catalyst design. There was other such remarkable early work in process synthesis, design, modeling, and diagnosis as well. In synthesis and in design, for instance, important conceptual advances were made by Stephanopoulos and his students, starting with Design-Kit,31 and in modeling, MODELL.LA, a language for developing process models.32 In process fault diagnosis, Davis33 and Kramer,34, 35 and their groups, made important contributions in the same period. My group developed causal model-based diagnostic expert systems,36 a departure from the heuristics-based approach, which was the dominant theme of the time. We also demonstrated the potential of learning expert systems, an unusual idea at that time as automated learning in expert systems was not in vogue.37 The need for causal models in AI, a topic that has emerged as very important now,38 was also recognized in those early years.39 This period also saw expert system work commencing in Europe,40 particularly for conceptual design support. An important large-scale program in this era was the Abnormal Situation Management (ASM) consortium, funded at $17 million by the National Institute of Standards and Technology's Advanced Technology Program and by the leading oil companies, under the leadership of Honeywell.41 Three different academic groups, led by Davis (Ohio State), Vicente (University of Toronto), and myself at Purdue, were also involved in the consortium. This program is the forerunner to the current Clean Energy Smart Manufacturing Innovation Institute that was funded in 2016.42 The first course on AI in PSE was developed and taught at Columbia University in 1986, and it was subsequently offered at Purdue University for many years. The earlier offerings had an expert systems emphasis, but as ML advanced, in later years, the course evolved to include topics such as clustering, neural networks, statistical classifiers, graph-based models, and genetic algorithms. In 1986, Stephanopoulos published an article43 titled, \"Artificial Intelligence in Process Engineering\", in which he discussed the potential of AI in process engineering and outlined a research program to realize it. Coincidentally, in the same issue, I had a article with the same title, which described the Columbia course.44 In my article, I discussed topics from the course, and it mirrored what Stephanopoulos had outlined as the research program. (Curiously, we did not know each other at that time and had written our articles independently, yet with the same title, at the same time, with almost the same content, and had submitted to the same journal for the same issue!) The first AIChE session on AI was organized by Gary Powers (CMU) at the annual meeting held in Chicago in 1985. The first national meeting on AI in process engineering was held in 1987 at Columbia University, co-organized by Venkatasubramanian, Stephanopoulos, and Davis, sponsored by the National Science Foundation, American Association for Artificial Intelligence, and Air Products. The first international conference, Intelligent Systems in Process Engineering (ISPE'95), sponsored by the Computer Aids for Chemical Engineering (CACHE) Corporation, was co-organized by Stephanopoulos, Davis, and Venkatasubramanian, held at Snowmass, CO, in July 1995. The CACHE Corporation had also organized an Expert Systems Task Force in 1985, under the leadership of Stephanopoulos, to develop tools for the instruction of AI in chemical engineering.45 The task force published a series of monographs on AI in process engineering during 1989\u20131993. Despite impressive successes, the expert system approach did not quite take-off as it suffered from serious drawbacks. It took a lot of effort, time, and money to develop a credible expert system for industrial applications. Furthermore, it was also difficult and expensive to maintain and update the knowledge base as new information came in or the target application changed, such as in the retrofitting of a chemical plant. This approach did not scale well for practical applications (more on this in sections Lack of impact of AI during Phases I and II and Are things different now for AI to have impact?). As the excitement about expert systems waned in the 1990s due to these practical difficulties, interest in another AI technique was picking up greatly. This was the beginning of Phase II, the Neural Networks Era, roughly from 1990 onward. This was a crucial shift from the top-down design paradigm of expert systems to the bottom-up paradigm of neural nets that acquired knowledge automatically from large amounts of data, thus easing the maintenance and development of models. It all started with the reinvention of the backpropagation algorithm by Rumelhart, Hinton, and Williams in 1986 for training feedforward neural networks to learn hidden patterns in input\u2013output data. It had been proposed earlier, in 1974, by Paul Werbos as part of his Ph.D. thesis at Harvard. It is essentially an algorithm for implementing gradient descent search, using the chain rule in calculus, to propagate errors back through the network to adjust the strength (i.e., weights) of connections between nodes iteratively, to make the network learn the patterns. While the idea of neural networks had been around since 1943 from the work of McCulloch and Pitts, and was further developed by Rosenblatt, Minsky, and Papert in the 1960s, these earlier models were limited in scope as they could not handle problems with nonlinearity. The key breakthrough this time was the ability to solve nonlinear function approximation and nonlinear classification problems in an automated manner using the backpropagation learning algorithm. The typical structure of a feedforward neural network from this era is shown in Figure 1, with its input, hidden, and output layers of neurons, and their associated signals, weights and biases. The figure also shows examples of nonlinear function approximation and nonlinear classification problems such networks were able to solve provided enough data were available.46 (a) Architecture of a feedforward neural network. (b) Examples of nonlinear function approximation and classification problems. Adapted from: https://medium.com/@curiousily/tensorflow-for-hackers-part-iv-neural-network-from-scratch-1a4f504dfa8 https://neustan.wordpress.com/2015/09/05/neural-networks-vs-svm-where-when-and-above-all-why/ http://mccormickml.com/2015/08/26/rbfn-tutorial-part-ii-function-approximation/ This novel automated nonlinear modeling ability spurred a tremendous amount of work in a variety of domains including chemical engineering.47 Researchers made substantial progress on addressing challenging problems in modeling,48, 49 fault diagnosis,50-55 control,56, 57 and product design.58 In particular, the recognition of the connection between the autoencoder architecture and the nonlinear principal component analysis by Kramer,48 and the recognition of the nature of the basis function approximation of neural networks through the WaveNet architecture by Bakshi and Stephanopoulos49 are outstanding contributions. There were hundreds of articles in our domain during this phase and only some of the earliest and key articles are highlighted here. While this phase was largely driven by neural networks, researchers also made progress on expert systems (such as the ASM consortium) and genetic algorithms at that time. For instance, we proposed59 directed evolution of engineering polymers in silico using genetic algorithms. This led in subsequent years60 to the multiscale model-based informatics framework called Discovery Informatics61 for materials design. The discovery informatics framework led to the successful development of materials design systems using directed evolution in several industrial applications, such as gasoline additives,62 formulated rubbers,63 and catalyst design.64 During this period, researchers were also beginning to realize the challenges and opportunities in multiscale modeling using informatics techniques.65, 66 Other important advances not using neural networks included research into frameworks and architectures for building AI systems, such as blackboard architectures, integrated problem-solving-and-learning systems, and cognitive architectures. Architectures such as Prodigy and Soar are examples of this work.67 Similarly, there was progress in process synthesis and in design,68 domain-specific representations and languages,32, 69 domain-specific compilers,70 ontologies,71, 72 modeling environments,32, 73 molecular structure search engines,74 automatic reaction network generators,64 and chemical entities extraction systems.74 These references by no means constitute a comprehensive list. All this work, and others along similar lines, performed some two decades ago is still relevant and useful today in the modern era of data science. Building such systems using modern tools presents major opportunities. Despite the surprising success of neural networks in many practical applications, some especially challenging problems in vision, natural language processing, and speech understanding remained beyond the capabilities of the neural nets of this era. Researchers suspected that one would need neural nets with many more hidden layers, not just one, but training these turned to be So, the was more or for about a or so a breakthrough for training neural thus the current phase which we in Phases of AI in Chemical In of all this effort over two AI was not as in chemical engineering as we had In it is clear this was the First, the problems we were are challenging even Second, we were the powerful and programming to address such challenging problems. Third, we were limited by data. that was was very There were of challenges in Phases I and and While we made progress on the conceptual such as knowledge representation and inference for problems in synthesis, design, diagnosis, and we could not the challenges and involved in practical applications. In there was no as it turned there was no in the that the in process engineering, in that period, could be more by optimization and by as algorithms and over the years, these well on problems for which we could and solve models. the problems for which such models are difficult to (e.g., diagnosis, analysis, and materials or almost to generate (e.g., speech which computational and data, of which were not during this period. This of practical success led to two one at the of the Expert Systems era and the other at the of the Neural Networks for AI research in computer and in the application This progress even In it typically to take about years for a to and have from discovery to For instance, for the such as to about market it took about years from the time computer of chemical was first proposed in the similar in optimization as for programming and nonlinear programming and for In during Phase I and II, AI as a was only about years It was early to This analysis that one could impact around While predicting and impact is an this given the current of AI. As it for those of us who started on AI in the early-1980s, we were early as as impact is but it was challenging and to these problems. Many of the such as developing AI methods and causal model-based AI systems, are still as I The progress of AI over the last or so has been very and the are largely have been and are also as have started to and from systems, such as Alexa, and more and for a variety of are beginning to and to work It is and to make the In 1985, arguably the most powerful computer was the computational was and it of The million machine million in was and a to it. So, what would it would the In fact, the is more powerful the The at just of it is a just on the There have been advances in the of algorithms and in programming such as and are the we had to program in Lisp for to what now be in a with a of We have also great progress in The other development is the of tremendous amounts of data, in many domains, which made the stunning advances in ML (more on this below). All this is for this to without for the last years, its expected making these stunning advances As a the is here. The I is also here of the that could be using optimization and have largely been for further for further one go up the and that means going challenging decision-making problems that require solutions. So, now we have a back some years from would that there were early milestones in AI. One is Gary in in the in and the is the surprising by in The AI advances that made these are now poised to have an impact that beyond game In my view, we Phase around the era of Science or This new phase was made by important or neural nets and statistical ML. These are the that are the AI success in game playing, natural language processing, robotics, and vision. neural nets of the 1990s, which typically had only one hidden of neurons, neural nets have hidden layers, as shown in Figure an architecture has the potential to features for complex pattern However, such networks were to using the backpropagation or gradient descent algorithm. The breakthrough came in by using a training with considerable in processing in the form of processing In a called in the training of the neural made such extraction is a in the domain of processing, for features from a noisy the of the network architecture and the such as the and number of a during from a very large data this is a crucial appropriate that to a successful by the neural network from architectural was the neural feedforward neural network has no of and the only it is the current it has been This is not appropriate for problems which have information, such as time series data, where what typically on what has For instance, to the in a one needs to know which came it. networks address such problems by as their not just the current example, but also what they have the output on what has the network as if it has This was further by another architectural called the The typical of a an an output and a The over time and the the of information into and of the networks are well for making on time series data, since there be of between important in a time While the key advances here are in the architecture and training of large-scale neural networks, the important be of as a for learning a of to a such as an It is a learning in which an the by its on the it in to its with the is the one to a a where one the with a if it the and it if it this is many one is essentially the patterns to the it the This learning is essentially programming in modern ML For this approach to work well for complex problems, such as the game of one needs millions of millions of to learn the game from of worth of expertise and during a period of just a As stunning as this is, one that the game playing domain has the that it almost training data over training with a great deal of This is typically not the case in and engineering, where one is even in this era. But this might be the of the data is a computer as in some materials applications. For the of it is important to that learning from the other dominant learning and In the system the between and output given a set of input\u2013output the other in only a set of is given with no (i.e., no The system is to the in the data on its hence One could that learning for", "venue": "AIChE Journal", "label": 0}, {"loc": [9.327871322631836, 0.9823316931724548], "openalex_id": "https://openalex.org/W2902616665", "title": "Tooling for big data extraction", "authors": "Pablo Gamallo, Marcos Garc\u00eda, C\u00e9sar Pi\u00f1eiro, Rodrigo Mart\u00ednez-Casta\u00f1o, Juan C. Pichel", "abstract": "This paper presents LinguaKit, a multilingual suite of tools for analysis, extraction, annotation and linguistic correction, as well as its integration into a Big Data infrastructure. LinguaKit allows the user to perform different tasks such as PoS-tagging, syntactic parsing, coreference resolution (among others), including applications for relation extraction, sentiment analysis, summarization, extraction of multiword expressions, or entity linking to DBpedia. Most modules work in four languages: Portuguese, Spanish, English, and Galician. The system is programmed in Perl and is freely available under a GPLv3 license.", "venue": "https://doi.org/10.1109/snams.2018.8554689", "label": 0}, {"loc": [5.807713031768799, 2.0673699378967285], "openalex_id": "https://openalex.org/W2892892878", "title": "A Short Commentary on Trinh & Le (2018)", "authors": "Walid S. Saba", "abstract": "This is a short Commentary on Trinh & Le (2018) (\"A Simple Method for Commonsense Reasoning\") that outlines three serious flaws in the cited paper and discusses why data-driven approaches cannot be considered as serious models for the commonsense reasoning needed in natural language understanding in general, and in reference resolution, in particular.", "venue": "arXiv (Cornell University)", "label": 1}, {"loc": [2.7986984252929688, 2.114630699157715], "openalex_id": "https://openalex.org/W2890623700", "title": "Towards a Unified Terminology: A study on Siemens Energy Technical Information Systems", "authors": "Jo\u00e3o Cust\u00f3dio Fernandes Cardoso, Matheus Lacerda Viana, Raphael Matias, Marco T\u00falio Furtado, Ana Paula Souza Caetano, H\u00e9lder Consolaro, Vin\u00edcius Louren\u00e7o Garcia de Brito", "abstract": "Angiosperms display an enormous diversity of forms, functions and strategies when it comes to reproduction. This multiplicity has been translated into several terminological concepts and contexts, which have facilitated further research. On the other hand, the use of terms that address the reproduction of flowering plants has been shown to be inconsistent in the literature, complicating communication among specialists. Key terms, such as \u201creproductive system\u201d, \u201cmating system\u201d and \u201csexual system\u201d, among others, have been frequently cited as synonyms, and even used in different circumstances. This review proposes to establish a consistent nomenclatural classification in the field of angiosperms reproductive biology in order to facilitate communication among researchers. Specific terms related to angiosperm reproduction are conceptualized and distributed into five general systems: four related to sexual reproduction (sexual, floral, incompatibility and mating systems); and one related to asexual reproduction (apomictic systems). Our proposal is not to establish a natural classification, but rather to provide a general overview of the main concepts that were grouped here in an artificial and functional manner. Our aim is to advance the field of reproductive biology of angiosperms with consistent and well-defined applications of relevant terminologies.", "venue": "Acta Botanica Brasilica", "label": 0}, {"loc": [6.085650444030762, 0.6133093237876892], "openalex_id": "https://openalex.org/W2930920952", "title": "Natural Language Processing for Slang", "authors": "Dian Sa\u2019adillah Maylawati, Wildan Budiawan Zulfikar, Cepy Slamet, Muhammad Ali Ramdhani, Yana Aditia Gerhana", "abstract": "Stemming is one of the important processes in text mining. The result of stemming process gives an impact on the next text mining process. Therefore, the technique of processing text data such as text mining, natural language processing, and information retrieval for every language have different treatment, especially in the stemming process. Lately, stemming algorithm for Indonesian text has been growing rapidly. However, there is no stemming algorithms that specifically accommodate Indonesian text with slang, whereas text data from social media contains many natural languages and slangs. Therefore, the aim of this research is to improve Indonesian stemming algorithm that suitable for Indonesian text data with slang from social media. We analyzed and elaborated Porter stemmer algorithm to create a new stemmer algorithm that appropriate with Indonesian slang characteristics. This study proposed two factors as the state of the art, (1) a utilization of both Indonesian dictionary and Indonesian Slang dictionary to anticipate re-stemming an infinitive, and (2) accommodating natural languages and slangs to strengthen stemming process on word particles. We examined the algorithm with two scenarios, the first scenario used 379 of words and 20 text data for the second. Furthermore, we compared the algorithm with Porter, Nazief&Adriani, and Lucene stemmer algorithm. The result showed that the proposed algorithm could do stemming process with Indonesian slang well, with the percentage of accuracy about 88.65%. The accuracy level was increase, although the memory usage of the algorithm was no better than the others algorithm, but the duration of process was not different significantly.", "venue": "2018 6th International Conference on Cyber and IT Service Management (CITSM)", "label": 0}, {"loc": [3.3517982959747314, 3.835407018661499], "openalex_id": "https://openalex.org/W2963910109", "title": "Detect Secrets using Machine Learning", "authors": "Baibhab Chatterjee, Debayan Das, Shovan Maity, Shreyas Sen", "abstract": "Traditional authentication in radio-frequency (RF) systems enable secure data communication within a network through techniques such as digital signatures and hash-based message authentication codes (HMAC), which suffer from key-recovery attacks. State-of-the-art Internet of Things networks such as Nest also use open authentication (OAuth 2.0) protocols that are vulnerable to cross-site-recovery forgery (CSRF), which shows that these techniques may not prevent an adversary from copying or modeling the secret IDs or encryption keys using invasive, side channel, learning or software attacks. Physical unclonable functions (PUFs), on the other hand, can exploit manufacturing process variations to uniquely identify silicon chips which makes a PUF-based system extremely robust and secure at low cost, as it is practically impossible to replicate the same silicon characteristics across dies. Taking inspiration from human communication, which utilizes inherent variations in the voice signatures to identify a certain speaker, we present RF-PUF: a deep neural network-based framework that allows real-time authentication of wireless nodes, using the effects of inherent process variation on RF properties of the wireless transmitters (Tx), detected through in-situ machine learning at the receiver (Rx) end. The proposed method utilizes the already-existing asymmetric RF communication framework and does not require any additional circuitry for PUF generation or feature extraction. The burden of device identification is completely shifted to the gateway Rx, similar to the operation of a human listener's brain. Simulation results involving the process variations in a standard 65-nm technology node, and features such as local oscillator offset and I-Q imbalance detected with a neural network having 50 neurons in the hidden layer indicate that the framework can distinguish up to 4800 Tx(s) with an accuracy of 99.9% [\u224899% for 10000 Tx(s)] under varying channel conditions, and without the need for traditional preambles. The proposed scheme can be used as a stand-alone security feature, or as a part of traditional multifactor authentication.", "venue": "IEEE Internet of Things Journal", "label": 0}, {"loc": [3.247565507888794, 1.7123322486877441], "openalex_id": "https://openalex.org/W2804226648", "title": "Expanding the paradigm: Generative artificial intelligence and US privacy norms", "authors": "Ramya Chari, Chia\u2010Chia Chang, Steven L. Sauter, Elizabeth L. Petrun Sayers, Jennifer Cerully, Paul A. Schulte, Anita Schill, Lori Uscher\u2010Pines", "abstract": "Well-being is a positive and unifying concept that captures multiple factors that contribute to workers' health and quality of life. This work lays the foundation for larger well-being measurement efforts and will provide tools for NIOSH partners to help workers flourish.", "venue": "Journal of Occupational and Environmental Medicine", "label": 0}, {"loc": [3.2717201709747314, 1.8012129068374634], "openalex_id": "https://openalex.org/W2790208997", "title": "TOWARDS EQUITABLE", "authors": "Susan E. Collins, Seema L. Clifasefi, Joey Stanton, The Leap Advisory Board, Kee J.E. Straits, Eleanor Gil\u2010Kashiwabara, Patricia Rodr\u00edguez Espinosa, Andel Nicasio, Michele P. Andrasik, Starlyn M. Hawes, Kimberly A. Miller, Lonnie A Nelson, Victoria E. Orfaly, Bonnie Duran, Nina Wallerstein", "abstract": "Community-based participatory research (CBPR) answers the call for more patient-centered, community-driven research approaches to address growing health disparities. CBPR is a collaborative research approach that equitably involves community members, researchers, and other stakeholders in the research process and recognizes the unique strengths that each bring. The aim of CBPR is to combine knowledge and action to create positive and lasting social change. With its origins in psychology, sociology, and critical pedagogy, CBPR has become a common research approach in the fields of public health, medicine, and nursing. Although it is well aligned with psychology's ethical principles and research aims, it has not been widely implemented in psychology research. The present article introduces CBPR to a general psychology audience while considering the unique aims of and challenges in conducting psychology research. In this article, we define CBPR principles, differentiate it from a more traditional psychology research approach, retrace its historical roots, provide concrete steps for its implementation, discuss its potential benefits, and explore practical and ethical challenges for its integration into psychology research. Finally, we provide a case study of CBPR in psychology to illustrate its key constructs and implementation. In sum, CBPR is a relevant, important, and promising research framework that may guide the implementation of more effective, culturally appropriate, socially just, and sustainable community-based psychology research. (PsycINFO Database Record (c) 2018 APA, all rights reserved).", "venue": "American Psychologist", "label": 0}, {"loc": [2.353559732437134, 2.140937328338623], "openalex_id": "https://openalex.org/W2784112715", "title": "Methodological and practical challenges of doing transnational studies", "authors": "Nigel M. Healey", "abstract": "Purpose The purpose of this paper is to investigate the challenges of managing transnational education (TNE) partnerships from the perspective of the home university managers. Design/methodology/approach The study adopts a qualitative, \u201cinsider researcher\u201d methodology\u2019. It uses a sample set of eight mangers who operate from the home university and 13 \u201cin-country\u201d managers who are seconded to head up the overseas TNE partnerships. The samples are all drawn from UK universities to standardise for other variables (e.g. legislative framework). Findings It finds that the managers based at the home campus report a generally negative attitude, emphasising the riskiness and the lack of scalability, sustainably and profitability, as well as the general resistance to TNE from staff on the home campus. The in-country managers, in contrast, experience the same lack of empathy from their peers at home, but this group tends to more closely associate themselves with their local colleagues and to be drawn into building relationships with local stakeholders. Research limitations/implications The limitation of this research is that it is based on a sample of managers from the same country. Practical implications In practical terms, the findings suggest that universities need to do more to increase awareness and commitment to their TNE partnerships amongst staff at the home campus, while providing better professional development and more frequent rotations for their in-country managers. Originality/value This paper extends the very limited literature on the management of TNE partnerships.", "venue": "International Journal of Educational Management", "label": 0}, {"loc": [3.2524607181549072, -0.8215735554695129], "openalex_id": "https://openalex.org/W2783405072", "title": "Development and Evaluation of a German Language Model for the Financial Domain", "authors": "Xavier Kurz, Susana Perez\u2010Gutthann", "abstract": "The European Medicines Agency (EMA) has the responsibility for the scientific evaluation, supervision, and safety monitoring of medicines in the European Union (EU) to ensure that their benefits outweigh their risks. While the roots of medicines' safety monitoring lie in the development of mechanisms for spontaneous reporting of suspected adverse reactions by health-care professionals and patients, the importance of using the full spectrum of evidence including observational studies has long been acknowledged.1-3 The risk management system introduced in the EU in 2006 highlighted the need to build capacity and to facilitate the conduct of multicenter independent postauthorization studies to investigate important risks or missing information in European populations.4 In March 2006, the EMA contacted more than 90 academic centers in Europe identified through the International Society for Pharmacoepidemiology (ISPE) and national drug regulatory authorities to request information on their expertise and activities in pharmacoepidemiology and pharmacovigilance. Over the following 12 months, possible models for collaboration on independent observational studies were discussed with representatives of academic and other research centers, pharmaceutical industry, other existing clinical networks, EMA scientific committees, and the European Commission.5 The European Network of Centres for Pharmacoepidemiology and Pharmacovigilance (ENCePP; www.encepp.eu) was launched on June 28, 2007 with 79 participants who agreed to develop an active research network based on principles of transparency, scientific independence, and common quality standards. The European Network of Centres for Pharmacoepidemiology and Pharmacovigilance was presented in a symposium at the 24th International Conference on Pharmacoepidemiology and Therapeutic Risk Management in August 2008.6 Ten years on, we review ENCePP's main achievements, discuss its impact on the benefit-risk evaluation of medicinal products in Europe, and outline future perspectives. Although collaborations for multicenter studies have long existed, the pharmacoepidemiology landscape in Europe has been heterogeneous and based on researchers using stand-alone data sources with limited sample sizes and applying differing quality standards. This heterogeneity was compounded by differences between health-care systems, uncertainty about available databases, and uncertainty on existing collaborations with sufficient expertise and capacity to conduct multicenter observational studies. It was often considered easier for industry to conduct postauthorization studies requested by EU regulators in the United States, despite differences in characteristics of study populations, clinical practice, and prescription patterns. There was a need to foster a network of researchers able to perform large observational studies in Europe and for a pool of experts providing clear guidance on best practices in pharmacoepidemiology. At the same time, rules and principles for quality standards and transparency of research were also needed to ensure that these studies would be performed according to the best possible level of scientific quality.5 New EU pharmacovigilance legislation entered into force in July 2012 provided a legal framework for postauthorization safety studies (PASS). This enabled regulators to impose PASS on pharmaceutical companies as a condition of the marketing authorization and established a review process for PASS study protocols and results by the EMA's Pharmacovigilance Risk Assessment Committee (PRAC).7 In this context, ENCePP has assumed a triple role: (a) to increase capacity for pharmacoepidemiology research in Europe, (b) to define common methodological standards, and (c) to propose governance principles for the conduct of collaborative studies. Initially, ENCePP foresaw nomination of coordinating centers taking responsibility for research within defined therapeutic areas. However, besides the organizational challenges posed by the differing interests and expertise of centers and the diverse nature of research questions to be addressed, most centers were also concerned that in a competitive environment, a structure with predefined subnetworks overseen by a coordinating center would be too rigid and give too much prominence to such center. They preferred a flexible approach whereby centers would be characterized in a public, transparent, and searchable electronic inventory and could enter into ad hoc collaborations for specific projects underpinned by common transparency and research standards. As of July 31, 2017, ENCePP included 168 centers from 18 European countries, 126 (75%) of them being not for profit organizations (eg, universities, hospitals, foundations, or charities) and 42 (25%) for-profit organizations (ie, contract research organizations). The largest numbers of centers are located in the United Kingdom (35 centers), Italy (24), France and Germany (18 each), Spain (17), and the Netherlands (10).8 Centers applying to join the network undergo a check by the ENCePP Secretariat to determine their focus on pharmacoepidemiology or pharmacovigilance. This is based on a description of their activities and a list of publications, but level of expertise or quality of research is not assessed. At an early stage, ENCePP discussed implementation of a self-accreditation system but considered that it would not guarantee the quality of the studies performed by the centers. Through the centers, ENCePP provides access to a large pool of experts in pharmacoepidemiology and pharmacovigilance across Europe and to other relevant specialists such as clinical pharmacologists, statisticians, specialist clinicians, and members of health technology assessment (HTA) bodies, pediatric networks, and pharmacogenomic groups. Since 2010, this expertise provides a strong support to the operation of the new pharmacovigilance legislation by complementing regulatory guidance with methodological recommendations. Although one of the aims of creating ENCePP was to increase the capacity for large pharmacoepidemiology studies in Europe, the flexible approach adopted by ENCePP for collaborations and the multiple sources of public and private funding do not allow to confirm to date that multicenter studies were initiated with the support of ENCePP. However, feedback received from members suggests that the new culture of collaboration, the common scientific standards, and the common governance principles introduced by the ENCePP have greatly facilitated the establishment of research consortia, for example, in the context of the EMA-funded studies (Table 1) and the European Commission's Seventh Framework Program for drug safety studies (Table 2).3 Consortia were also created in the context of public-private partnerships established by the Innovative Medicines Initiative.9 In addition, ENCePP members provided occasionally to EMA data that could support drug safety reviews. This information covered topics such as combined hormonal contraceptive and the risk of venous thromboembolism, strontium ranelate in the treatment of osteoporosis, bromocriptine-containing medicines indicated in the suppression of lactation postpartum, ambroxol- and bromhexine-containing medicines and allergic reactions, codeine-containing medicines and the risk of morphine toxicity, or hydroxyzine-containing medicines and pro-arrhythmogenic potential. http://bmjopen.bmj.com/content/5/9/e008531.full https://www.ncbi.nlm.nih.gov/pubmed/27504911 The long-term success of the network will depend on its capacity to keep current members engaged and involve new members to take-up future methodological challenges, taking into account that new data sources such as social media and big data will likely play an increasing role in the benefit-risk evaluation of medicinal products. In this regard, a concept paper on methodological aspects associated with use of different models for data extraction and analysis from electronic health records, their validation, and their regulatory applications is being developed. An objective of ENCePP is to identify data from clinical or administrative electronic databases available in Europe, coordinate these data in a comprehensive and public inventory, and facilitate their access to researchers. Database holders and professionals with expertise in use of specific data sources are invited to provide a description of their core data (eg, coding systems and dictionaries used, type of events, and medicinal products covered), demographic information, information on data linkage and data access, and a list of relevant publications derived from the data.8 Since 2017, disease registries are also registered in the context of the EMA Patient Registry Initiative.10 As of July 31, 2017, the inventory included 83 data sources (Figure 1). The inventory provides key information on a large number of databases and helps investigators identify relevant data sources available to answer specific research questions. It represents a core source of information on data available for the benefit-risk evaluation of medicines.11 In November 2010, the ENCePP e-Register of Studies was launched to increase the transparency of observational postauthorization studies and maximize the availability and accessibility of postauthorization evidence on medicines.1 The idea of the registration of observational studies in pharmacoepidemiology or other areas of epidemiology was controversial in 2010. Some authors did not favor it,12-14 while others considered that the ability to upload the study protocols, study interim and final reports, and other relevant documents increase transparency, facilitate collaborations, allow feedback by peer-reviewers, and may ultimately lead to better science. 15, 16 The ENCePP e-Register was adopted as the EU electronic Register of Postauthorization Studies (EU PAS Register\u00ae) following the new EU pharmacovigilance legislation, which made it mandatory for marketing authorization holders to register PASS imposed as a legal obligation by regulators\u2014the so-called Risk Management Plan (RMP) category 1 and category 2 studies, and subsequently the recommendation made in the EU Good pharmacovigilance practices to register other PASS included in the RMP (RMP category 3 studies).7 It therefore became an essential tool for the implementation of the legislation. As of July 31, 2017, 1,145 studies had been registered (Figure 2); 368 of them (30.4%) had been finalized, and more than half of them (n = 583, 50.9%) were studies requested by a regulatory authority, of which 95 (16.3%) have been imposed as a legal obligation and 316 (54.2%) are included in an EU RMP (Figure 3). Risk assessment and effectiveness evaluation have been the main purpose of 49.7% and 26.7% of studies, respectively (Figure 4). Both objectives are mentioned for 13.5% of the studies. Registration of studies in the EU PAS Register has changed the landscape of pharmacoepidemiology and pharmacovigilance by giving public access to evaluations carried out on specific drugs and safety concerns and providing visibility on investigators, data availability, methods, and funding sources. The register has become a must-go-to source to learn about studies addressing specific research questions and learn about their design as a tool to plan new studies. With the exception of imposed PASS, study registration is voluntary. It has been shown that up to July 2015 49% of the PASS reviewed by PRAC in the context of regulatory procedures had been entered in the EU PAS Register and only 43% of these entries had a protocol available.17 This limitation may affect the usefulness of the registration to judge the quality of the studies on the basis of a detailed description of the design and analytical approach.18 On June 29, 2011, a workshop with medical journal editors sought their views toward upload of study results before their acceptance and appearance in print. Although the editors accepted in principle that study results of public health relevance could be shared without delay, this confirmation did not reassure many investigators reluctant to upload the study protocol and report in the EU PAS Register prior to the publication in a scientific journal online or in print.19 A way forward could be that medical journal editors would require the EU PAS Register number for all manuscripts reporting results of postauthorization studies (even if study results have not yet been uploaded) as a means to decrease publication bias, similarly to the existing requirement for clinical trials. In early discussions, ENCePP agreed that, rather than establishing an accreditation system for centers, research quality would be best supported by providing recommendations on the practical implementation of pharmacoepidemiological principles based on published guidance and illustrative examples. The first ENCePP Guide on Methodological Standards in Pharmacoepidemiology was published in May 2011 and has been updated annually by structured review to maintain its dynamic nature. The sixth revision in July 2017 has 31 authors and 424 electronic references.20 The guide offers a concise, dynamic, and publicly available Web resource for methodological English language guidance in pharmacoepidemiology. An electronic version was introduced for the third revision in 2013, and the number of times each revision has been viewed has since steadily increased to about 50,000 for revision 5 (2015-2016), while the entire document has been downloaded about 10,000 times during the same period (Figure 5). The guide is used for training in many institutions including research centers and industry and cited as a reference source of methodological best practice in several regulatory documents such as the EU good pharmacovigilance practice.7 In parallel, ENCePP developed a Checklist for Study Protocols to stimulate researchers' consideration of important principles when designing and writing a pharmacoepidemiological study protocol, to facilitate protocol review by other parties, and to promote transparency regarding methodologies and design used in studies. To assist regulatory authorities in identifying whether such principles have been applied in PASS protocols, pharmaceutical companies have to append the checklist to protocols submitted to regulators.7, 21 In line with its aim to promote transparency and scientific independence, the ENCePP developed a Code of Conduct laying out best practice in the relationship between investigators and study funders, irrespective of whether the study funder is a public body, industry, or a regulatory authority.22 At the core of scientific independence is the provision that no person with a financial, commercial, or personal interest in a particular outcome of the study shall take part in any study activity that could influence the results or their interpretation in any particular direction. To ensure transparent research, the code requires registration of the study in a public registry (for instance, the EU PAS Register) and agreement to make public relevant information including the Checklist for Study Protocols, study data specified in the guidance for sharing of ENCePP Study Data,23 and the content of the research contract or a declaration on the use of own resources. To confirm a commitment to comply with the provisions of the code, the lead investigator may apply for an ENCePP Seal. This requires the provision of a signed checklist and signed declaration of compliance with the Code of Conduct, the signed Checklist for Study Protocols, and a signed Declaration of Interests to the ENCePP Secretariat prior to study start. The study has also to be registered in the EU PAS Register, and the full protocol must be uploaded prior to data collection or extraction. Once the ENCePP Secretariat has confirmed the a priori eligibility for the seal, it adds the ENCePP Seal logo to the registration record and the investigators can use this logo on materials and publications. The lead investigator may, however, ask to postpone the publication of the protocol until the study is finalized. As of July 31, 2017, 45 studies had an ENCePP Seal. The protocol and final study report have been published for all of the 15 finalized studies, while the study protocol has been published for 7 of the 22 ongoing studies (31.8%) and for 2 of 8 planned studies (25.0%). The ENCePP Code of Conduct has been a landmark document defining the relationships between study sponsors and investigators willing to conduct studies with full scientific independence. It became a key reference for the conduct of studies and underpinned the development of guidance by other groups,24 but it has shown some limitations: the key principle of scientific independence is not explicitly defined and, even though many provisions of the code are written as obligations, their application is a matter of commitment without verification that they have been implemented. The ENCePP Seal, which was developed to formalize this commitment, has a low uptake, and the publication of the protocol was often postponed until the study end. Furthermore, the wording of the code may be interpreted as suggesting that some of its provisions do not apply if the seal is not requested. A working group is currently evaluating the need to improve the ENCePP Code of Conduct and the ENCePP Seal concept in light of the experience and to better define and implement the principle of scientific independence. To answer specific questions or respond to consultations, different ad hoc working groups have been created over time (information on www.encepp.eu). A concept paper addressed the legal definition of \"noninterventional trials,\" and a collaboration with representatives of HTA bodies looked at specific HTA-related methodological aspects of studies. A special interest group (SIG) on Drug Safety in Pregnancy was created to inform future activities of ENCePP in medicines used in pregnancy and lactation, to liaise with other relevant groups in this field, and to develop an overview of data sources for drug safety in pregnancy research. A SIG on Measuring the Impact of Pharmacovigilance Activities was created to provide recommendations to the PRAC on key methodologies for measuring health outcomes of pharmacovigilance measures in the context of the overall evaluation of the impact of pharmacovigilance systems.25 The ENCePP has been created in a heterogeneous landscape of academic centers, research organizations and database owners, and a changing regulatory environment in pharmacovigilance and pharmacoepidemiology. There was therefore a risk that differing priorities and constraints would lead to divergent routes after an initial period of collaboration. We believe that several factors explain that ENCePP achieved important outcomes from its onset and remained a coherent, dynamic, and active network over 10 years: (1) a recognized need for collaboration to address limitations in the pharmacoepidemiological landscape and keep abreast of methodological, regulatory, and organizational developments (eg, increased use of existing data sources, new legislation on PASS, new funding opportunities); (2) a firm commitment to common guiding principles of transparency, scientific independence, and quality standards; (3) an acknowledgment of the diversity in the centers' domain of expertise and capacity to collaborate; (4) a strong governance based on an elected Steering group and several working groups, with the support from EMA; (5) last but not least, the ability to meet face to face in plenary meetings on a periodic basis and to actively contribute to the development of good practice and regulatory guidance. All these factors have been instrumental to foster and accelerate partnership between research centers and improve the implementation of collaborative studies. Table 3 lists priorities proposed for the next years of ENCePP based on the further development of existing activities described in this article. The integration of the \"pharmacovigilance\" component into the network's activities has not been fully realized so far. This may be because a well-structured network of national and/or regional pharmacovigilance centers has existed for a long time in Europe with a coordination of activities at national and European levels and that a 5 year research project on methods in pharmacovigilance (the IMI PROTECT project) was started in September 2009 with the objective to review and develop if necessary methods for signal detection,26 and its results were integrated into the ENCePP Guide on methodological standards. An ENCePP SIG has been initiated to review the application of pharmacoepidemiological methods to measure the public health impact of pharmacovigilance activities (information on www.encepp.eu), and ENCePP will work together with the PRAC, the International Society of Pharmacovigilance, and other networks to identify other areas where a collaboration will strengthen the benefit-risk evaluation of medicines. A Joint Task Force of ISPE and the International Society for Pharmacoeconomics and Outcome Research recently published recommendations to enhance decision-makers' confidence in evidence derived from real-world studies.27, 28 The principles of transparency in the process for database studies, transparency in study execution, and good procedural practices they promote are very close to those recommended in the ENCePP Code of Conduct, Checklist for Study Protocols, and Guide for Methodological Standards. Close collaboration among ISPE, IPOR, and ENCePP, for example, through cross-reference to each other's recommendations, common publications, and collaboration in the annual updating of the ENCePP Guide on Methodological Standards, would provide a unique opportunity to promote common principles and standards on a global scale. An important challenge remains: studies still take a long time to be finalized, often because of administrative aspects, slow access to available health data often due to data protection concerns, heterogeneous systems, or lack of resources. The European Network of Centres for Pharmacoepidemiology and Pharmacovigilance will need to address the challenges by using innovative tools and designs and new data sources to conduct faster studies through collaborations. In 10 years, the ENCePP has made a major contribution to the benefit-risk evaluation of medicinal products in Europe and beyond by providing methodological recommendations complementing regulatory guidance on postauthorization safety studies. The development of the EU PAS Register also changed the landscape of pharmacoepidemiology in Europe by increasing transparency of observational research, giving access to study protocols and results and supporting the implementation of the pharmacovigilance legislation. The ENCePP Code of Conduct aims to promote transparency and scientific independence in research, but its implementation depends on researchers' commitment and it is being reviewed in light of the constraints imposed in transparency and restriction of study funders' involvement in the study. Perhaps most importantly, ENCePP has created a strong European community supporting methodological standards, transparency, and scientific independence in pharmacoepidemiological research. The views expressed in this article are the personal views of the authors and may not be understood or quoted as being made on behalf of or reflecting the positions of their employer organization. The authors gratefully acknowledge Kevin Blake, Henry Fitt, Thomas Goedecke, Alexandra Pacurariu, and Dagmar Vogl for their assistance in the preparation of this article. X.K. is an employee of the EMA, the chair of the ENCePP Steering Group, and the co-chair of ENCePP WG1 on Research Standards. S.P.G. is the vice president and global head of epidemiology at RTI-HS, a former co-chair of the ENCePP Steering Group, and a former co-chair of ENCePP WG1 on Research Standards.", "venue": "Pharmacoepidemiology and Drug Safety", "label": 0}, {"loc": [8.904695510864258, 0.3543586730957031], "openalex_id": "https://openalex.org/W2890961898", "title": "Fusion of Knowledge: Enhancing AI Reasoning Through Language Models and Knowledge Graphs", "authors": "Haitian Sun, Bhuwan Dhingra, Manzil Zaheer, Kathryn Mazaitis, Ruslan Salakhutdinov, William W. Cohen", "abstract": "Open Domain Question Answering (QA) is evolving from complex pipelined systems to end-to-end deep neural networks. Specialized neural models have been developed for extracting answers from either text alone or Knowledge Bases (KBs) alone. In this paper we look at a more practical setting, namely QA over the combination of a KB and entity-linked text, which is appropriate when an incomplete KB is available with a large text corpus. Building on recent advances in graph representation learning we propose a novel model, GRAFT-Net, for extracting answers from a question-specific subgraph containing text and KB entities and relations. We construct a suite of benchmark tasks for this problem, varying the difficulty of questions, the amount of training data, and KB completeness. We show that GRAFT-Net is competitive with the state-of-the-art when tested using either KBs or text alone, and vastly outperforms existing methods in the combined setting.", "venue": "https://doi.org/10.18653/v1/d18-1455", "label": 0}, {"loc": [2.5258991718292236, 2.5558693408966064], "openalex_id": "https://openalex.org/W3106361542", "title": "3,000+ Trabajos", "authors": "Katherin Paola Calder\u00f3n Infante, Tatiana Milena Polo Leal, Luz Elena Silva Castro", "abstract": "El presente trabajo busca dar a conocer a los Contadores Publicos y Revisores Fiscales; las guias, principios y procedimientos esenciales para dar orientaciones a la hora de realizar un encargo de aseguramiento bajo el estandar de la ISAE 3000 (Trabajo para atestiguar distinto de auditoria y de informacion financiera historica).\r\nEste estandar de aseguramiento, se enfoca que el contador publico o revisor fiscal pueda entregar informes sobre otro tipo de informacion relevante diferente a lo financiero, con una grado de seguridad razonable o limitada y una administracion de riesgos bien estructurada, sobre encargos de: Desempeno de la entidad, indicadores de eficiencia y eficacia, control interno y sistemas de informacion, gobierno corporativo, informe de responsabilidad social corporativa, cumplimiento y regulaciones, practicas de recursos humano, capacidad o factibilidad.\r\nPor tanto, este trabajo da a conocer a los contadores publicos, revisores fiscales y auditores, las diferencias que se presentan al momento de emitir un informe bajo esta norma de aseguramiento, su importancia, alcance y el conocimiento que se tiene en la actualidad acerca del tema.", "venue": "https://repository.ucc.edu.co/bitstream/20.500.12494/7677/1/2018_diagnostico_cambios_generados.pdf", "label": 0}, {"loc": [3.5566117763519287, -0.14601269364356995], "openalex_id": "https://openalex.org/W2766761250", "title": "Methods and Applications for Probing Deep Neural Networks", "authors": "Alexandru Korotcov, Valery Tkachenko, Daniel P. Russo, Sean Ekins", "abstract": "Machine learning methods have been applied to many data sets in pharmaceutical research for several decades. The relative ease and availability of fingerprint type molecular descriptors paired with Bayesian methods resulted in the widespread use of this approach for a diverse array of end points relevant to drug discovery. Deep learning is the latest machine learning algorithm attracting attention for many of pharmaceutical applications from docking to virtual screening. Deep learning is based on an artificial neural network with multiple hidden layers and has found considerable traction for many artificial intelligence applications. We have previously suggested the need for a comparison of different machine learning methods with deep learning across an array of varying data sets that is applicable to pharmaceutical research. End points relevant to pharmaceutical research include absorption, distribution, metabolism, excretion, and toxicity (ADME/Tox) properties, as well as activity against pathogens and drug discovery data sets. In this study, we have used data sets for solubility, probe-likeness, hERG, KCNQ1, bubonic plague, Chagas, tuberculosis, and malaria to compare different machine learning methods using FCFP6 fingerprints. These data sets represent whole cell screens, individual proteins, physicochemical properties as well as a data set with a complex end point. Our aim was to assess whether deep learning offered any improvement in testing when assessed using an array of metrics including AUC, F1 score, Cohen's kappa, Matthews correlation coefficient and others. Based on ranked normalized scores for the metrics or data sets Deep Neural Networks (DNN) ranked higher than SVM, which in turn was ranked higher than all the other machine learning methods. Visualizing these properties for training and test sets using radar type plots indicates when models are inferior or perhaps over trained. These results also suggest the need for assessing deep learning further using multiple metrics with much larger scale comparisons, prospective testing as well as assessment of different fingerprints and DNN architectures beyond those used.", "venue": "Molecular Pharmaceutics", "label": 0}, {"loc": [5.856032371520996, 0.3399299085140228], "openalex_id": "https://openalex.org/W2577366047", "title": "Towards Better Language Models: Algorithms, Architectures, and Applications", "authors": "Jan Chorowski, Navdeep Jaitly", "abstract": "The recently proposed Sequence-to-Sequence (seq2seq) framework advocates replacing complex data processing pipelines, such as an entire automatic speech recognition system, with a single neural network trained in an end-to-end fashion.In this contribution, we analyse an attention-based seq2seq speech recognition system that directly transcribes recordings into characters.We observe two shortcomings: overconfidence in its predictions and a tendency to produce incomplete transcriptions when language models are used.We propose practical solutions to both problems achieving competitive speaker independent word error rates on the Wall Street Journal dataset: without separate language models we reach 10.6% WER, while together with a trigram language model, we reach 6.7% WER.", "venue": "https://doi.org/10.21437/interspeech.2017-343", "label": 0}, {"loc": [4.027438640594482, -2.161590337753296], "openalex_id": "https://openalex.org/W2740334779", "title": "Measuring Controversy in online discussions", "authors": "Kaspar Beelen, Evangelos Kanoulas, Bob van de Velde", "abstract": "This paper sets out to detect controversial news reports using online discussions as a source of information. We define controversy as a public discussion that divides society and demonstrate that a content and stylometric analysis of these debates yields useful signals for extracting disputed news items. Moreover, we argue that a debate-based approach could produce more generic models, since the discussion architectures we exploit to measure controversy occur on many different platforms.", "venue": "https://doi.org/10.1145/3077136.3080723", "label": 0}, {"loc": [5.489260673522949, 1.5629340410232544], "openalex_id": "https://openalex.org/W4299129019", "title": "MAIN DIRECTIONS OF COMPUTATIONAL LINGUISTICS", "authors": "K. Kalugyan, E. Lozina", "abstract": "<p>Computational linguistics as the young scientific direction, the reasons and history of her appearance, the main directions of computational linguistics are considered</p>", "venue": "Zenodo (CERN European Organization for Nuclear Research)", "label": 0}, {"loc": [6.756436824798584, 2.9102227687835693], "openalex_id": "https://openalex.org/W2560647685", "title": "Overcoming Catastrophic Forgetting: Geometric Techniques in Incremental Machine Learning", "authors": "James Kirkpatrick, Razvan Pascanu, Neil C. Rabinowitz, Joel Veness, Guillaume Desjardins, Andrei A. Rusu, Kieran Milan, John Quan, Tiago Ramalho, Agnieszka Grabska\u2010Barwi\u0144ska, Demis Hassabis, Claudia Clopath, Dharshan Kumaran, Raia Hadsell", "abstract": "Significance Deep neural networks are currently the most successful machine-learning technique for solving a variety of tasks, including language translation, image classification, and image generation. One weakness of such models is that, unlike humans, they are unable to learn multiple tasks sequentially. In this work we propose a practical solution to train such models sequentially by protecting the weights important for previous tasks. This approach, inspired by synaptic consolidation in neuroscience, enables state of the art results on multiple reinforcement learning problems experienced sequentially.", "venue": "Proceedings of the National Academy of Sciences", "label": 0}, {"loc": [6.381086349487305, 5.859971523284912], "openalex_id": "https://openalex.org/W2757160886", "title": "Spatial Language Understanding: Deep Learning, Reasoning, and Evaluation", "authors": "Parisa Kordjamshidi, Taher Rahgooy, Umar Manzoor", "abstract": "This work is on a previously formalized semantic evaluation task of spatial role labeling (SpRL) that aims at extraction of formal spatial meaning from text. Here, we report the results of initial efforts towards exploiting visual information in the form of images to help spatial language understanding. We discuss the way of designing new models in the framework of declarative learning-based programming (DeLBP). The DeLBP framework facilitates combining modalities and representing various data in a unified graph. The learning and inference models exploit the structure of the unified graph as well as the global first order domain constraints beyond the data to predict the semantics which forms a structured meaning representation of the spatial context. Continuous representations are used to relate the various elements of the graph originating from different modalities. We improved over the state-of-the-art results on SpRL.", "venue": "https://doi.org/10.18653/v1/w17-4306", "label": 0}, {"loc": [6.0869903564453125, 0.5658958554267883], "openalex_id": "https://openalex.org/W2756061655", "title": "Arabic Language Processing", "authors": "Dima Suleiman, Arafat Awajan, Wael Etaiwi", "abstract": "Hidden Markov Model is an empirical tool that can be used in many applications related to natural language processing. In this paper a comparative study was conducted between different applications in natural Arabic language processing that uses Hidden Markov Model such as morphological analysis, part of speech tagging, text classification, and name entity recognition. Comparative results showed that HMM can be used in different layers of natural language processing, but mainly in pre-processing phase such as: part of speech tagging, morphological analysis and syntactic structure; however in high level applications text classification their use is limited to certain number of researches.", "venue": "Procedia Computer Science", "label": 0}, {"loc": [3.0736629962921143, -0.8072123527526855], "openalex_id": "https://openalex.org/W2786839164", "title": "\" Deep Learning for de-identification of clinical documents", "authors": "Shweta Yadav, Asif Ekbal, Sriparna Saha, Pushpak Bhattacharyya", "abstract": "Rapid growth in Electronic Medical Records (EMR) has emerged to an expansion of data in the clinical domain. The majority of the available health care information is sealed in the form of narrative documents which form the rich source of clinical information. Text mining of such clinical records has gained huge attention in various medical applications like treatment and decision making. However, medical records enclose patient Private Health Information (PHI) which can reveal the identities of the patients. In order to retain the privacy of patients, it is mandatory to remove all the PHI information prior to making it publicly available. The aim is to de-identify or encrypt the PHI from the patient medical records. In this paper, we propose an algorithm based on deep learning architecture to solve this problem. We perform de-identification of seven PHI terms from the clinical records. Experiments on benchmark datasets show that our proposed approach achieves encouraging performance, which is better than the baseline model developed with Conditional Random Field.", "venue": "https://aclanthology.org/W16-4206/", "label": 0}, {"loc": [7.38363790512085, 0.7426097989082336], "openalex_id": "https://openalex.org/W591148856", "title": "Word Embeddings in NLP", "authors": "Lili Mou, Ran Jia, Yan Xu, Ge Li, Lu Zhang, Zhi Jin", "abstract": "Distilling knowledge from a well-trained cumbersome network to a small one has recently become a new research topic, as lightweight neural networks with high performance are particularly in need in various resource-restricted systems. This paper addresses the problem of distilling word embeddings for NLP tasks. We propose an encoding approach to distill task-specific knowledge from a set of high-dimensional embeddings, so that we can reduce model complexity by a large margin as well as retain high accuracy, achieving a good compromise between efficiency and performance. Experiments reveal the phenomenon that distilling knowledge from cumbersome embeddings is better than directly training neural networks with small embeddings.", "venue": "https://doi.org/10.1145/2983323.2983888", "label": 0}, {"loc": [8.799785614013672, 2.4689905643463135], "openalex_id": "https://openalex.org/W2182361439", "title": "Optimization and Evaluation in Machine Learning Challenges", "authors": "Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Tobias Springenberg, Manuel Blum, Frank Hutter", "abstract": "The success of machine learning in a broad range of applications has led to an ever-growing demand for machine learning systems that can be used off the shelf by non-experts. To be effective in practice, such systems need to automatically choose a good algorithm and feature preprocessing steps for a new dataset at hand, and also set their respective hyperparameters. Recent work has started to tackle this automated machine learning (AutoML) problem with the help of efficient Bayesian optimization methods. Building on this, we introduce a robust new AutoML system based on scikit-learn (using 15 classifiers, 14 feature preprocessing methods, and 4 data preprocessing methods, giving rise to a structured hypothesis space with 110 hyperparameters). This system, which we dub AUTO-SKLEARN, improves on existing AutoML methods by automatically taking into account past performance on similar datasets, and by constructing ensembles from the models evaluated during the optimization. Our system won the first phase of the ongoing ChaLearn AutoML challenge, and our comprehensive analysis on over 100 diverse datasets shows that it substantially outperforms the previous state of the art in AutoML. We also demonstrate the performance gains due to each of our contributions and derive insights into the effectiveness of the individual components of AUTO-SKLEARN.", "venue": "Neural Information Processing Systems", "label": 0}, {"loc": [4.077030181884766, 2.1530568599700928], "openalex_id": "https://openalex.org/W2618324598", "title": "TO WHOM DOES THE WORLD BELONG?", "authors": "Dyron Daughrity", "abstract": "In this exciting new volume, an anchor to the Understanding World Christianity series, Dyron B. Daughrity helps readers map out the major changes that have taken place in recent years in the world's largest religion. By comparing trends, analyzing global Christian movements, and tracing the impact of Pentecostalism, interreligious dialogue, global missions, birth rates, and migratory trends, Daughrity sketches a picture of a changing religion and gives the tools needed to understand it. From discussions of sexuality and afterlife to contemporary Christian music and secularization, this book provides a global perspective on what is happening within Christianity today.", "venue": "Pepperdine Digital Commons (Pepperdine University)", "label": 0}, {"loc": [3.865097761154175, -3.8270821571350098], "openalex_id": "https://openalex.org/W2157481474", "title": "Computational criminology: at-scale quantitative analysis of the evolution of cybercrime forums", "authors": "Matthew Williams, Pete Burnap", "abstract": "This paper presents the first criminological analysis of an online social reaction to a crime event of national significance, in particular the detection and propagation of cyberhate on social media following a terrorist attack. We take the Woolwich, London terrorist attack in 2013 as our event of interest and draw on Cohen\u2019s process of warning, impact, inventory and reaction to delineate a sequence of incidents that come to constitute a series of deviant responses following the attack. This paper adds to contemporary debates in criminology and the study of hate crime in three ways: (1) it provides the first analysis of the escalation, duration, diffusion and de-escalation of cyberhate in social media following a terrorist event; (2) it applies Cohen\u2019s work on action, reaction and amplification and the role of the traditional media to the online context and (3) it introduces and provides a case study in \u2018computational criminology\u2019.", "venue": "The British Journal of Criminology", "label": 0}, {"loc": [2.7595860958099365, 2.2887158393859863], "openalex_id": "https://openalex.org/W2192776326", "title": "Automated Data Analytics: Combining Human Creativity and AI Power Using ChatGPT", "authors": "Amin Abbaszadegan, David Grau", "abstract": "abstract: This article assesses the combined influence of information integration and automated data analytics on project performance. To this end, retrospective data on 78 completed projects, with a total installed value of $8 billion, was collected. The data collection effort characterized, for each project, the level of internal and external information integration. Information integration was assessed as the seamlessly interoperable sharing of data produced from a work function with other functions/stakeholders so that no manual data transfer was required. Also, the level of automated data analytics, understood as the full automation of the data analysis function after input data are entered, was also characterized on a project basis. Then, non-parametric statistical techniques were used to assess the impact of such functions on cost and schedule performance. The statistical analysis was also stratified by project type, e.g. greenfield and brownfield, additions, and modifications or shutdowns. Overall, projects with a sophisticated degree of information integration and automated data analytics can control their projects with more reliable information and in a proactive manner so that informed decisions can be timely made on behalf of the project and the organization.", "venue": "Procedia Engineering", "label": 0}, {"loc": [5.509827136993408, 0.6194930076599121], "openalex_id": "https://openalex.org/W2574859650", "title": "AUTOMATIC GRADING OF SHORT ANSWERS", "authors": "Shumin Jing", "abstract": "Developing an effective and impartial grading system for short answers is a challenging problem in educational measurement and assessment, due to the diversity of answers and the subjectivity of graders. In this paper, we design an automatic grading approach for short answers, based on the non-negative semi-supervised document clustering method. After assigning several answer keys, our approach is able to group the large amount of short answers into multiple sets, and output the score for each answer automatically. In this manner, the effort of teachers can be greatly reduced. Moreover, our approach allows the interaction with teachers, and therefore the system performance could be further enhanced. Experimental results on two datasets demonstrate the effectiveness of our approach.", "venue": "Educational Data Mining", "label": 0}, {"loc": [5.374428749084473, 0.7852990627288818], "openalex_id": "https://openalex.org/W2016658851", "title": "Suggested keywords", "authors": "Dina Vrki\u0107", "abstract": "The majority of the scientific papers include keywords besides the obligatory title and abstract. The use of keywords is not just for the description of content, but it is a viral part of the scientific paper which is later used for information retrieval function. The aim of this paper is to present comparison of author suggested keywords in scientific papers of Faculty of Electrical Engineering and Computing (FER), IEEE terms and user tags in academic social network Mendeley. This paper will examine the scientific papers represented in IEEE Journals and Magazines indexed in the IEEE/IET Electronic Library reviewing the overlap between author keywords and IEEE terms from a controlled vocabulary (IEEE thesaurus) and user tags in Mendeley. The results showed no significant use of controlled vocabulary and social tags, which leads to the conclusion that the authors are not aware of the importance of controlled keywords.", "venue": "https://doi.org/10.1109/mipro.2014.6859662", "label": 0}, {"loc": [9.545947074890137, 0.9022426009178162], "openalex_id": "https://openalex.org/W2134033474", "title": "Extracting Entity Mentions", "authors": "Qi Li, Heng Ji", "abstract": "We present an incremental joint framework to simultaneously extract entity mentions and relations using structured perceptron with efficient beam-search. A segment-based decoder based on the idea of semi-Markov chain is adopted to the new framework as opposed to traditional token-based tagging. In addition, by virtue of the inexact search, we developed a number of new and effective global features as soft constraints to capture the interdependency among entity mentions and relations. Experiments on Automatic Content Extraction (ACE) 1 corpora demonstrate that our joint model significantly outperforms a strong pipelined baseline, which attains better performance than the best-reported end-to-end system.", "venue": "https://doi.org/10.3115/v1/p14-1038", "label": 0}, {"loc": [3.3470730781555176, -0.47027167677879333], "openalex_id": "https://openalex.org/W2116868464", "title": "A CURATEd CATalog: Rethinking the Extraction of Pretraining Corpora for Mid-Resourced Languages", "authors": "Danielle Welter, Jacqueline MacArthur, Joannella Morales, Tony Burdett, Peggy Hall, Heather Junkins, Alan Klemm, Paul Flicek, Teri A. Manolio, Lucia A. Hindorff, Helen Parkinson", "abstract": "The National Human Genome Research Institute (NHGRI) Catalog of Published Genome-Wide Association Studies (GWAS) Catalog provides a publicly available manually curated collection of published GWAS assaying at least 100,000 single-nucleotide polymorphisms (SNPs) and all SNP-trait associations with P <1 \u00d7 10(-5). The Catalog includes 1751 curated publications of 11 912 SNPs. In addition to the SNP-trait association data, the Catalog also publishes a quarterly diagram of all SNP-trait associations mapped to the SNPs' chromosomal locations. The Catalog can be accessed via a tabular web interface, via a dynamic visualization on the human karyotype, as a downloadable tab-delimited file and as an OWL knowledge base. This article presents a number of recent improvements to the Catalog, including novel ways for users to interact with the Catalog and changes to the curation infrastructure.", "venue": "Nucleic Acids Research", "label": 0}, {"loc": [6.216540813446045, 5.786715030670166], "openalex_id": "https://openalex.org/W2111078031", "title": "Grounding Language in Images and Videos", "authors": "Michaela Regneri, Marcus Rohrbach, Dominikus Wetzel, Stefan Thater, Bernt Schiele, Manfred Pinkal", "abstract": "Recent work has shown that the integration of visual information into text-based models can substantially improve model predictions, but so far only visual information extracted from static images has been used. In this paper, we consider the problem of grounding sentences describing actions in visual information extracted from videos. We present a general purpose corpus that aligns high quality videos with multiple natural language descriptions of the actions portrayed in the videos, together with an annotation of how similar the action descriptions are to each other. Experimental results demonstrate that a text-based model of similarity between actions improves substantially when combined with visual information from videos depicting the described actions.", "venue": "Transactions of the Association for Computational Linguistics", "label": 26}, {"loc": [7.79884672164917, -0.8187404274940491], "openalex_id": "https://openalex.org/W2557175093", "title": "Creating Parallel Corpora for Ukrainian: a German-Ukrainian Parallel Corpus (ParaRook|| DE-UK)", "authors": "Olena Siruk, Ivan Derzhanski", "abstract": "The paper relates about our ongoing work on the creation of a corpus of Bulgarian and Ukrainian parallel texts. We discuss some differences in the approaches and the interpretation of some concepts, as well as various problems associated with the construction of our corpus, in particular the occasional \u2018nonparallelism\u2019 of original and translated texts. We give examples of the a pplication of the parallel corpus for the study of lexical semantics and note the outstanding role of the corpus in the lexicographic description of Ukrainian and Bulgarian translation equivalents. We draw attention to the importance of creating parallel corpora as objects of national as well as global cultural heritage.", "venue": "Digital Presentation and Preservation of Cultural and Scientific Heritage", "label": 0}, {"loc": [4.975308418273926, 0.9357532262802124], "openalex_id": "https://openalex.org/W2041971736", "title": "ORCID", "authors": "Haak Laurel, Martin Fenner, Laura Paglione, Ed Pentz, Howard Ratner", "abstract": "ABSTRACT The Open Researcher & Contributor ID (ORCID) registry presents a unique opportunity to solve the problem of author name ambiguity. At its core the value of the ORCID registry is that it crosses disciplines, organizations, and countries, linking ORCID with both existing identifier schemes as well as publications and other research activities. By supporting linkages across multiple datasets \u2013 clinical trials, publications, patents, datasets \u2013 such a registry becomes a switchboard for researchers and publishers alike in managing the dissemination of research findings. We describe use cases for embedding ORCID identifiers in manuscript submission workflows, prior work searches, manuscript citations, and repository deposition. We make recommendations for storing and displaying ORCID identifiers in publication metadata to include ORCID identifiers, with CrossRef integration as a specific example. Finally, we provide an overview of ORCID membership and integration tools and resources.", "venue": "Learned Publishing", "label": 0}, {"loc": [2.9098501205444336, 0.0015512054087594151], "openalex_id": "https://openalex.org/W2332509662", "title": "Code Blue: The Threat of Synthetic Data Use to Generative Medical AI", "authors": "Kelley F. Huseman", "abstract": "In this research project, the response times to chest compressions, first defibrillation, and first dose of epinephrine in cardiac arrest were measured over a 3-month period through retrospective chart reviews. All nursing staff then participated in random, unannounced mock code blue drills using a high-fidelity patient simulator. After 3 months of code blue drills, the variables were again measured in patient code blue situations and compared with the response times before training. At the conclusion of this study, the response times for start of chest compressions and epinephrine administration improved significantly; the response time to defibrillation did not improve significantly. The response times were measured for an additional 3-month period to assess if the improvement was sustained.", "venue": "Journal for Nurses in Staff Development", "label": 0}, {"loc": [2.685500144958496, 2.760753870010376], "openalex_id": "https://openalex.org/W3124946451", "title": "Open Source Software Policy in Industry Equilibrium", "authors": "Vineet Kumar, Brett R. Gordon, Kannan Srinivasan", "abstract": "Commercial open source software (COSS) products\u2014privately developed software based on publicly available source code\u2014represent a rapidly growing, multibillion-dollar market. A unique aspect of competition in the COSS market is that many open source licenses require firms to make certain enhancements public, creating an incentive for firms to free ride on the contributions of others. This practice raises a number of puzzling issues. First, why should a firm further develop a product if competitors can freely appropriate these contributions? Second, how does a market based on free riding produce high-quality products? Third, from a public policy perspective, does the mandatory sharing of enhancements raise or lower consumer surplus and industry profits? We develop a two-sided model of competition between COSS firms to address these issues. Our model consists of (1) two firms competing in a vertically differentiated market, in which product quality is a mix of public and private components, and (2) a market for developers that firms hire after observing signals of their contributions to open source. We demonstrate that free-riding behavior is supported in equilibrium, that a mandatory sharing setting can result in high-quality products, and that free riding can actually increase profits and consumer surplus.", "venue": "Marketing Science", "label": 0}, {"loc": [5.616556167602539, 1.5865678787231445], "openalex_id": "https://openalex.org/W2053308952", "title": "Deriving the Meaning of Out-of-Vocabulary Words", "authors": "Ali Derakhshan, Ali Shahrzad", "abstract": "A solid body of research findings substantiates that most vocabulary, in first, second or foreign language, is acquired incidentally which is defined as learning vocabulary as a by-product of any activity not explicitly geared to vocabulary learning. Therefore, the present study mainly focused on the impact of instruction and intervention in deriving word meaning on incidental vocabulary learning in EFL context; secondly, it aimed to find out whether the contextualized words that appear with more clues learned better and consequently kept longer; finally, it sought to explore whether instruction could lead to increase in incidental vocabulary learning in the text. To these ends, 50 freshmen Iranian college students from Teacher Training University of Azerbaijan participated in this study. These students enrolled for the reading class in two separate semesters. The results of the TOEFL and Vocabulary Level Test (VLT) revealed that the participants enjoyed approximately the same level of proficiency. There were two post-tests which were taken at certain time intervals. The results of this study showed that the instruction in deriving word meaning had positive effect on students’ incidental vocabulary learning. Also it was concluded that students should meet the words in contextualized forms more frequently in order to keep and retain them in the long run.", "venue": "World Journal of English Language", "label": 0}, {"loc": [3.453577995300293, -0.41367828845977783], "openalex_id": "https://openalex.org/W1985115576", "title": "Diffused Seeing: The Epistemological Challenge", "authors": "Jason T. Huse, Heidi Phillips, Cameron Brennan", "abstract": "Abstract Diffuse gliomas such as astrocytomas and oligodendrogliomas are the most common form of intrinsic brain tumor in adults. Even within a single pathologic class, these tumors are both histologically and molecularly diverse, although not randomly so. Recent large\u2010scale genomic analyses have revealed patterns of molecular changes within tumor subclasses that harbor distinct underlying biology, clinical prognosis, and pathogenetic routes. Stereotypical mutations in isocitrate dehydrogenase genes (IDH) have been identified in a significant proportion of high\u2010grade gliomas and the large majority of lower\u2010grade astrocytomas and oligodendrogliomas. While the role of IDH mutation in oncogenesis is unclear, it appears to carry a positive prognosis and is also highly associated with other prognostic markers such as MGMT methylation, loss of 1p and 19q chromosome arms, and a newly recognized CpG island methylator phenotype (G\u2010CIMP). This constellation of positive prognostic molecular features is enriched in the transcriptionally defined Proneural glioma subclass and appears to reflect a route of pathogenesis distinct from that taken by other high\u2010grade diffuse gliomas. Another newly discovered and frequent alteration in glioma, deletion or mutation of the NF1 gene, is strongly correlated with the Mesenchymal transcriptomal signature associated with highly aggressive gliomas. Thus, while the unprecedented level of newly available molecular profiling data may seem at first to needlessly balkanize and complicate glioma subclassification, these analyses are in fact providing a more unified picture of key pathogenetic routes and potential avenues for therapeutic intervention. \u00a9 2011 Wiley\u2010Liss, Inc.", "venue": "Glia", "label": 0}, {"loc": [7.751916408538818, -0.8789874911308289], "openalex_id": "https://openalex.org/W2587822382", "title": "Towards Machine Translation Based on Monolingual Texts", "authors": "Peter Waiganjo Wagacha, Gilles-Maurice de Schryver, Guy De Pauwy", "abstract": "Even though the Bantu language of Swahili is spoken by more than fifty million people in East and Central Africa, it is surprisingly resource-scarce from a language technological point of view, an unfortunate situation that holds for most, if not all languages on the continent. The increasing amount of digitally available, vernacular data has prompted researchers to investigate the applicability of corpus-based approaches to African language technology. In this vein, the SAWA corpus project attempts to collect and deploy a parallel corpus English Swahili, not only for the straightforward purpose of developing a machine translation system, but also to investigate the possibility of projection of annotation into a resource-scarce, African language. Compiling a balanced and expansive parallel corpus English Swahili is a rather daunting task. While monolingual Swahili data is abundantly available on the Internet, sourcing parallel texts is cumbersome. Even countries that have both English and Swahili as their official languages, such as Tanzania, Kenya and Uganda, do not tend to translate and/or publish all government documents bilingually. One therefore opportunistically collects whatever can be found in the public domain. At this point in the data collection phase, that means that the 2.2 million word parallel corpus is biased towards religious material, such as bible and quran translations. Nevertheless, the more interesting, secular part of the SAWA corpus (\u00b1 420k words) is steadily increasing, thanks to the inclusion of bilingual investment reports, manually translated movie subtitles, political documents and material kindly donated by local translators to the SAWA project. Each text in the SAWA corpus is automatically part-ofspeech tagged and lemmatized, using the TreeTagger for the English part (Schmid, 1994) and the systems described in De Pauw et al. (2006) and De Pauw and de Schryver (2008) for Swahili. These extra annotation layers allow us to perform more accurate automatic word alignment on the basis of factored data. Table 1: Precision, Recall and F-score for the wordalignment task using GIZA++. Prec. Recall F(\u03b2 = 1)", "venue": "http://cl.haifa.ac.il/MT/abstracts/depauwetal.pdf", "label": 0}, {"loc": [6.779250144958496, 0.27365240454673767], "openalex_id": "https://openalex.org/W2003666043", "title": "WaterPark: A Robustness Assessment of Language Model Watermarking", "authors": "Seong\u2010Joon Koh, Han Gil Cho, Bo Hyun Kim, Bo Youl Choi", "abstract": "In January 2008, an outbreak of acute gastroenteritis at a waterpark was reported to the Bundang-gu Public Health Center in Seongnam, Korea. To determine the etiological agent and mode of transmission, a retrospective cohort study was done using structured questionnaires and stool samples from patients who had current gastrointestinal symptoms and three food handlers were tested. A total of 67 (31.0%) students and teachers developed acute gastroenteritis. No food items were associated with an increased risk of the illness. Norovirus was detected in 3 stool specimens collected from 6 patients who had severe diarrhea using semi-nested RT-PCR. All the specimens contained the genogroup I strains of the norovirus. Norovirus was also detected in the groundwater samples from the waterpark. In the nucleotide sequencing analysis, all the genogroup I noroviruses from the patients and groundwater samples were identified as the norovirus genotype I-4 strain. They were indistinguishable by DNA sequencing with a 97% homology. We conclude the outbreak of acute gastroenteritis caused by the norovirus was closely related to the contaminated groundwater.", "venue": "Journal of Korean Medical Science", "label": 0}, {"loc": [2.7060329914093018, 2.3949365615844727], "openalex_id": "https://openalex.org/W2059804181", "title": "Knowledge Sources", "authors": "Aija Leiponen, Constance E. Helfat", "abstract": "Abstract Given the inherent risk of innovative activity, firms can improve the odds of success by pursuing multiple parallel objectives. Because innovation draws on many sources of ideas, firms also may improve their odds of successful innovation by accessing a large number of knowledge sources. In this study, we conduct one of the first firm\u2010level statistical analyses of the impact on innovation of breadth in both innovation objectives and knowledge sources. The empirical results suggest that broader horizons with respect to innovation objectives and knowledge sources are associated with successful innovation. We do not find diminishing returns to breadth in innovation objectives, which suggests that firms may tend to search too narrowly. We interpret these results in light of well\u2010known cognitive biases toward searching in relatively familiar domains. Copyright \u00a9 2009 John Wiley & Sons, Ltd.", "venue": "Strategic Management Journal", "label": 0}, {"loc": [3.5736660957336426, -0.27048224210739136], "openalex_id": "https://openalex.org/W2155653061", "title": "Protein-Protein Interaction Prediction", "authors": "Mark D. McDowall, Michelle S Scott, Geoffrey J. Barton", "abstract": "The PIPs database (http://www.compbio.dundee.ac.uk/www-pips) is a resource for studying protein-protein interactions in human. It contains predictions of >37,000 high probability interactions of which >34,000 are not reported in the interaction databases HPRD, BIND, DIP or OPHID. The interactions in PIPs were calculated by a Bayesian method that combines information from expression, orthology, domain co-occurrence, post-translational modifications and sub-cellular location. The predictions also take account of the topology of the predicted interaction network. The web interface to PIPs ranks predictions according to their likelihood of interaction broken down by the contribution from each information source and with easy access to the evidence that supports each prediction. Where data exists in OPHID, HPRD, DIP or BIND for a protein pair this is also reported in the output tables returned by a search. A network browser is included to allow convenient browsing of the interaction network for any protein in the database. The PIPs database provides a new resource on protein-protein interactions in human that is straightforward to browse, or can be exploited completely, for interaction network modelling.", "venue": "Nucleic Acids Research", "label": 0}, {"loc": [3.0224456787109375, 2.5406415462493896], "openalex_id": "https://openalex.org/W2330592999", "title": "BIS Papers", "authors": "Yong\u2010Hoon Kim, Sung Kyu Park, Sang\u2010Geun Park, M. K. Han, Jeong In Han", "abstract": "Ink-jet printed 6,13-bis(triisopropylsilylethynyl) (TIPS) pentacene organic thin-film transistors (OTFTs) were fabricated for particle-based electronic papers. As source and drain electrode, a transparent conductive oxide, indium-tin-oxide (ITO) has been employed instead of using Au electrode. Using the ITO electrode, field-effect mobility of 0.06 cm2/Vs has achieved by improving the contact between the ITO electrode and the organic semiconductor layer. For contact treatment, a combination of O2 plasma treatment and 4-chlorophenyl dichlorophosphate treatment have been used. With the OTFTs employing ITO source/drain electrode, a particle-based electronic paper array was fabricated.", "venue": "ECS Transactions", "label": 0}, {"loc": [6.485570430755615, 1.0212244987487793], "openalex_id": "https://openalex.org/W1983774632", "title": "Sketch Grammar for Japanese", "authors": "Irena Srdanovi\u0107, Toma\u017e Erjavec, Adam Kilgarriff", "abstract": "Of all the major world languages, Japanese is lagging behind in terms of publicly accessible and searchable corpora. In this paper we describe the development of JpWaC (Japanese Web as Corpus), a large corpus of 400 million words of Japanese web text, and its encoding for the Sketch Engine. The Sketch Engine is a web-based corpus query tool that supports fast concordancing, grammatical processing, 'word sketching' (one-page summaries of a word's grammatical and collocational behaviour), a distributional thesaurus, and robot use. We describe the steps taken to gather and process the corpus and to establish its validity, in terms of the kinds of language it contains. We then describe the development of a shallow grammar for Japanese to enable word sketching. We believe that the Japanese web corpus as loaded into the Sketch Engine will be a useful resource for a wide number of Japanese researchers, learners, and NLP developers.", "venue": "Journal of Natural Language Processing", "label": 0}, {"loc": [6.619754314422607, 1.3089731931686401], "openalex_id": "https://openalex.org/W2129998937", "title": "Progress Report: Towards European LLMs", "authors": "Jean-Charles Rochet, Jean Tirole", "abstract": "International audience", "venue": "The RAND Journal of Economics", "label": 0}, {"loc": [9.327556610107422, 1.4477061033248901], "openalex_id": "https://openalex.org/W2007495022", "title": "SEMANTIC MATCHING FOR DATA CATALOGS", "authors": "Eleni Stroulia, Yiqiao Wang", "abstract": "The web-services stack of standards is designed to support the reuse and interoperation of software components on the web. A critical step in the process of developing applications based on web services is service discovery, i.e. the identification of existing web services that can potentially be used in the context of a new web application. Discovery through catalog-style browsing (such as supported currently by web-service registries) is clearly insufficient. To support programmatic service discovery, we have developed a suite of methods that assess the similarity between two WSDL (Web Service Description Language) specifications based on the structure of their data types and operations and the semantics of their natural language descriptions and identifiers. Given only a textual description of the desired service, a semantic information-retrieval method can be used to identify and order the most relevant WSDL specifications based on the similarity of the element descriptions of the available specifications with the query. If a (potentially partial) specification of the desired service behavior is also available, this set of likely candidates can be further refined by a semantic structure-matching step, assessing the structural similarity of the desired vs the retrieved services and the semantic similarity of their identifiers. In this paper, we describe and experimentally evaluate our suite of service-similarity assessment methods.", "venue": "International Journal of Cooperative Information Systems", "label": 0}, {"loc": [3.3254921436309814, 2.454228162765503], "openalex_id": "https://openalex.org/W4230146126", "title": "Display options", "authors": "Alan Stanbridge", "abstract": "Abstract The modern conception of \u201cart\u201d has its roots in the development of eighteenth\u2010century aesthetics, yet continues to have an apparently unshakeable hold on the broad field of cultural policy. Across a wide range of programs and disciplines, a lingering faith in the autonomy of art, in the isolated genius of the artist and in the superiority of traditional high art forms over those of popular culture is still, I would argue, clearly evident, and the discourse of aesthetic autonomy has served to inform the development of arts and cultural policies, and especially the policies of the contemporary art museum. In sharp contrast to this traditionalist discourse, the \u201cpostmodern museum\u201d embraces context and interactivity as an integral part of its cultural mandate, which revolves around notions of populism, communication, interactivity and a non\u2010hierarchical \u201cdemocratization\u201d of the previously authoritative modernist museum. In this article, drawing on a specific case study from the contemporary art museum \u2013 the controversy surrounding the purchase of Barnett Newman\u2019s Voice of Fire by Canada\u2019s National Gallery \u2013 and addressing the often heated debate around the populist agenda of the \u201cpostmodern museum\u201d, the author explores the tensions between discourses of art and context in the contemporary gallery and museum, suggesting the need for a radical reassessment of established exhibition practices and modes of display. KEYWORDS: artaesthetic autonomydiscoursecontextpopulismart museumsthe \u201cpostmodern museum\u201d Notes 1. Guilbaut\u2019s argument here has much in common with the work of Lawrence Levine (1988 Levine, L. W. 1988. Highbrow/Lowbrow: The Emergence of Cultural Hierarchy in America, Cambridge, MA: Harvard University Press. [Google Scholar]) and Paul DiMaggio (1982 Dimaggio, P. J. 1982. \u201c\u2018Cultural entrepreneurship in nineteenth\u2010century Boston\u2019\u201d. In Nonprofit Enterprise in the Arts: Studies in Mission and Constraint, Edited by: Dimaggio, P. J. 41\u201361. Oxford: Oxford University Press. [Google Scholar], p.43), which explores the \u201csacralization\u201d of American high culture at the end of the nineteenth century. 2. All the paintings in the exhibit were commissioned by the art critic Alan Solomon on behalf of the USIA (see O\u2019Brian 1996b O\u2019Brian, J. 1996b. \u201c\u2018Who\u2019s afraid of Barnett Newman?\u2019\u201d. In Voices of Fire: Art, Rage, Power and the State, Edited by: Barber, B., Guilbaut, S. and O\u2019Brian, J. 121\u2013136. Toronto: University of Toronto Press. [Google Scholar]). 3. CBC Television News, 8 May 1967 (see the CBC Archives, available online at: http://archives.cbc.ca/IDCC\u20101\u201069\u2010100\u2010552/life_society/expo_67). 4. For further information on Expo \u201867, see the relevant pages on the National Library of Canada and National Archives of Canada website: www.collectionscanada.ca/05/0533/053302_e.html. 5. John O\u2019Brian (1996b O\u2019Brian, J. 1996b. \u201c\u2018Who\u2019s afraid of Barnett Newman?\u2019\u201d. In Voices of Fire: Art, Rage, Power and the State, Edited by: Barber, B., Guilbaut, S. and O\u2019Brian, J. 121\u2013136. Toronto: University of Toronto Press. [Google Scholar], p. 128) observes that Newman had \u201clofty expectations about the power of visual art to communicate moral values, and \u2026 chose an admonitory Old Testament title for the painting\u201d. Nicole Dubreuil\u2010Blondin (1996 Dubreuil\u2010Blondin, N. 1996. \u201c\u2018Tightrope metaphysics\u2019\u201d. In Voices of Fire: Art, Rage, Power and the State, Edited by: Barber, B., Guilbaut, S. and O\u2019Brian, J. 153\u2013164. Toronto: University of Toronto Press. [Google Scholar], p. 159) notes of the title that it \u201csuggests the voice of Jehovah from the burning bush\u201d. 6. The relevance of this point to the discussion in the previous section of Voice of Fire, and its role within the USIA\u2010supported American Pavilion as part of the Expo \u201867 world\u2019s fair in Montreal, is worthwhile highlighting at this stage. 7. For further information on the Burrell Collection, see: www.glasgowmuseums.com/venue/index.cfm?venueid=1. 8. The recent exhibition at the Montreal Museum of Fine Arts/Mus\u00e9e des beaux\u2010arts de Montr\u00e9al, \u201cGlobal Village: The 60s\u201d, offered a stimulating \u2013 and occasionally frustrating \u2013 example of the type of contextualist approach I have in mind. The exhibition embraced art, design, technology and politics in an intriguing manner, although the popular music of the period \u2013 surely a defining feature of the culture of the 1960s \u2013 was rather poorly represented.", "venue": "International Journal of Cultural Policy", "label": 0}, {"loc": [2.300297737121582, 1.712296485900879], "openalex_id": "https://openalex.org/W2134305194", "title": "Teachers First", "authors": "Daniela Sime, Mark Priestley", "abstract": "Abstract This article explores student teachers' views of the use of information and communication technologies (ICT) in schools. There is limited research literature regarding the perceptions that such students develop in relation to the use of ICT in teaching while observing practice in schools. The paper offers an interpretive analysis of the opinions that a cohort of undergraduate student teachers at a Scottish University expressed in an online forum, following a period of school placement. As part of their initial teacher education (ITE), the students were asked to post messages on the forum in relation to the factors that they perceived as promoting or hindering the use of ICT in schools. Perceptions that students held were found to be complex and varied. Students associated the use of ICT with changes in the nature of classroom relations, as well as a reshaping of learning and teaching. While they welcomed the introduction of ICT as a tool for modernising teaching, students identified a variety of factors that hinder this process. The paper finishes by identifying some of the implications for those working with student teachers in encouraging their development of reflective practice with ICT and enhancing their positive attitudes in relation to the use of ICT in schools.", "venue": "Journal of Computer Assisted Learning", "label": 0}, {"loc": [2.334688901901245, 1.7229996919631958], "openalex_id": "https://openalex.org/W2056484152", "title": "Bachelor's Thesis", "authors": "Wil Meeus, Linda Van Looy, Arno Libotton", "abstract": "The theory\u2010oriented approach to the final thesis in higher education is still dominant at the Master as well as Bachelor level. We believe, however, that at the Bachelor level a practice\u2010oriented approach would be more appropriate. Our research as presented below explores the possibilities of a practice\u2010oriented approach to the Bachelor's thesis in teacher education. Semi\u2010structured interviews form the basis for a grounded theory regarding the various current final thesis models and the conditions in which these have to be realized. The conclusion is that the portfolio as final thesis proves to be a satisfactory practice\u2010oriented alternative to the highly criticized traditional final thesis. L'approche privil\u00e9gi\u00e9e pour la r\u00e9alisation du m\u00e9moire de fin d'\u00e9tudes dans les formations sup\u00e9rieures du Master et du Bachelor est encore toujours une approche th\u00e9orique. Pourtant nous croyons que pour le niveau Bachelor, une m\u00e9thode orient\u00e9e vers la pratique serait plus appropri\u00e9e. Notre \u00e9tude explore les possibilit\u00e9s d'une m\u00e9thode orient\u00e9e vers la pratique pour les m\u00e9moires de fin d'\u00e9tude des Bachelors dans la formation initiale des professeurs. Des interviews semi\u2010structur\u00e9es constituent la base pour l'\u00e9laboration d'une th\u00e9orie sur les diff\u00e9rents mod\u00e8les des m\u00e9moires de fin d'\u00e9tude et les conditions dans lesquelles ils doivent se r\u00e9aliser. La conclusion est que le portfolio comme m\u00e9moire de fin d'\u00e9tude s'av\u00e8re une alternative orient\u00e9e sur la pratique satisfaisante pour le m\u00e9moire final traditionnel beaucoup critiqu\u00e9. El m\u00e9todo del 'enfoque t\u00e9orico' de la tesina en la ense\u00f1anza superior del Master y del Bachelor todav\u00eda es el m\u00e1s utilizado. Sin embargo, pensamos que en el nivel del Bachelor, un m\u00e9todo enfocado hacia la pr\u00e1ctica ser\u00eda m\u00e1s apropiado. Nuestro estudio explora las posibilidades de un m\u00e9todo enfocado hacia la pr\u00e1ctica en los Bachelors de formaci\u00f3n inicial de profesores. Entrevistas semi\u2010estructuradas constituyen la base de una teor\u00eda sobre los diferentes modelos de tesinas las condiciones en las que deben realizarse. La concluci\u00f3n es que el portfolio es una alternativa pr\u00e1ctica satisfactoria por la tesina tradicional muy criticada. Noch immer herrscht f\u00fcr Abschlussarbeiten im Rahmen der h\u00f6heren Bildung auf dem Master und Bachelor Niveau der Theorie\u2010orientierte Ansatz vor. Wir haben jedoch Grund zu der Annahme, dass f\u00fcr den Bachelor ein Praxis\u2010orientierter Ansatz passender w\u00e4re. Unsere Forschung untersucht die M\u00f6glichkeiten eines Praxis\u2010orientierten Ansatzes f\u00fcr die Bachelor\u2010Thesis in der Lehrerausbildung. Leitfadeninterviews bilden die Grundlage f\u00fcr eine begr\u00fcndete Theorie in Bezug auf die verschiedenen Modelle einer Abschlussarbeit und unter welchen Bedingungen diese realisiert werden m\u00fcssen. Das Fazit ist, dass Portfolios als Abschlussarbeit bewiesenerma\u00dfen eine durchaus zufriedenstellende Praxis\u2010orientierte Alternative f\u00fcr die viel kritisierte traditionelle Abschlussarbeit darstellen.", "venue": "European Journal of Teacher Education", "label": 0}, {"loc": [6.970314979553223, 3.5151658058166504], "openalex_id": "https://openalex.org/W1553037668", "title": "Emergence of Abstractions: Concept Encoding and Decoding Mechanism for In-Context Learning in Transformers", "authors": "Philip Levis, Samuel Madden, Joseph Polastre, Robert Szewczyk, Alec Woo, Eric Brewer, David Culler", "abstract": "The constraints of sensor networks, an emerging area of network research, require new approaches in system design. We study the evolution of abstractions and techniques in TinyOS, a popular sensor network operating system. Examining CVS repositories of several research institutions that use TinyOS, we trace three areas of development: single-hop networking, multi-hop networking, and network services. We note common techniques and draw conclusions on the emerging abstractions as well as the novel constraints that have shaped them. 1.", "venue": "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.64.8890", "label": 0}, {"loc": [5.3599467277526855, 3.094351053237915], "openalex_id": "https://openalex.org/W1969200114", "title": "The future", "authors": "Stefan Wolf, D. D. Awschalom, R. A. Buhrman, J.M. Daughton, S. von Moln\u00e1r, M. L. Roukes, Almadena Chtchelkanova, Daryl Treger", "abstract": "This review describes a new paradigm of electronics based on the spin degree of freedom of the electron. Either adding the spin degree of freedom to conventional charge-based electronic devices or using the spin alone has the potential advantages of nonvolatility, increased data processing speed, decreased electric power consumption, and increased integration densities compared with conventional semiconductor devices. To successfully incorporate spins into existing semiconductor technology, one has to resolve technical issues such as efficient injection, transport, control and manipulation, and detection of spin polarization as well as spin-polarized currents. Recent advances in new materials engineering hold the promise of realizing spintronic devices in the near future. We review the current state of the spin-based devices, efforts in new materials fabrication, issues in spin transport, and optical spin manipulation.", "venue": "Science", "label": 0}, {"loc": [4.072853088378906, 2.166137218475342], "openalex_id": "https://openalex.org/W1968576534", "title": "Political Disaffection and the Decline of the Centre: Quantitative Text Analysis Approaches", "authors": "Francesco Cavatorta", "abstract": "Until the early 1990s, Italy displayed a stable party system, where newcomers found it particularly difficult to challenge the overwhelming influence of the traditional parties: Christian Democrats (DC), Socialist Party (PSI), and Communist Party (PCI). New political formations managed to emerge, but they were largely unable to sustain their electoral success over a long period of time and failed to establish themselves as credible alternatives. The appearance of the Northern League (NL) in the late 1980s was also treated as temporary disaffection of sectors of the electorate from traditional politics. However, this proved not to be the case and the NL went on to become a very central player in the political system. \r\nThis article examines the conditions for the emergence of the Northern League and its long lasting impact on Italian politics. The Northern League is partly responsible for major changes that occurred in Italy over the last decade and while its electoral fortunes have somewhat declined in recent years, the issues it brought to prominence are today very much central in political debates. The article argues that the NL, far from being a single-issue party, has a clear vision of what Italy in the new millennium should look like. Moreover, the article argues that this vision is similar to the one held by a number of right-wing parties in Western Europe such as Haider\u2019s Freedom Party. Accordingly, the NL has abandoned its pro-independence position and has entered again into a political and electoral alliance with the centre-right Berlusconi-led coalition. This coalition is the favourite to win the 2001 national election and the NL is likely to hold once again a number of key ministerial posts. The presence of the Northern League in the new government will certainly accelerate the pace of decentralising reforms. The initial goal of federalism will probably be achieved after the NL went through a stage of profound radicalisation within which it flirted will independence. The failure of the Padanian project conceived in ethnic terms has brought the NL back to its roots and it is likely that Italy will soon be a federal state.", "venue": "Contemporary Politics", "label": 0}, {"loc": [3.5191311836242676, -0.3933047950267792], "openalex_id": "https://openalex.org/W1986978728", "title": "RIP: Better Models by Survival of the Fittest Prompts", "authors": "Ying Lin, Anne Devin, Yolanda Rodr\u00edguez, Z.-g. Liu", "abstract": "Although the molecular mechanisms of TNF signaling have been largely elucidated, the principle that regulates the balance of life and death is still unknown. We report here that the death domain kinase RIP, a key component of the TNF signaling complex, was cleaved by Caspase-8 in TNF-induced apoptosis. The cleavage site was mapped to the aspartic acid at position 324 of RIP. We demonstrated that the cleavage of RIP resulted in the blockage of TNF-induced NF-kappaB activation. RIPc, one of the cleavage products, enhanced interaction between TRADD and FADD/MORT1 and increased cells' sensitivity to TNF. Most importantly, the Caspase-8 resistant RIP mutants protected cells against TNF-induced apopotosis. These results suggest that cleavage of RIP is an important process in TNF-induced apoptosis. Further more, RIP cleavage was also detected in other death receptor-mediated apoptosis. Therefore, our study provides a potential mechanism to convert cells from life to death in death receptor-mediated apoptosis.", "venue": "Genes & Development", "label": 0}, {"loc": [3.20267915725708, 3.806886672973633], "openalex_id": "https://openalex.org/W629439928", "title": "Banned Books: Analysis of Censorship on Amazon. com", "authors": "Nicholas J. Karolides, Margaret Bald, Dawn B. Sova, Ken Wachsberger", "abstract": "This is a one volume abridgement containing twenty-five critiques from each of the four volumes in the Banned books series", "venue": "Internet Archive (Internet Archive)", "label": 0}, {"loc": [6.1380085945129395, 0.576617419719696], "openalex_id": "https://openalex.org/W1574901103", "title": "Natural Language Processing", "authors": "Christopher D. Manning, Hinrich Sch\u00fctze", "abstract": "Statistical approaches to processing natural language text have become dominant in recent years. This foundational text is the first comprehensive introduction to statistical natural language processing (NLP) to appear. The book contains all the theory and algorithms needed for building NLP tools. It provides broad but rigorous coverage of mathematical and linguistic foundations, as well as detailed discussion of statistical methods, allowing students and researchers to construct their own implementations. The book covers collocation finding, word sense disambiguation, probabilistic parsing, information retrieval, and other applications.", "venue": "http://bvbr.bib-bvb.de:8991/F?func=service&doc_library=BVB01&local_base=BVB01&doc_number=010591577&sequence=000001&line_number=0001&func_code=DB_RECORDS&service_type=MEDIA", "label": 0}, {"loc": [5.462584018707275, 2.9639604091644287], "openalex_id": "https://openalex.org/W1986731219", "title": "Spectra: A Comprehensive Study of Ternary, Quantized, and FP16 Language Models", "authors": "S. L. Dudarev, Gianluigi A. Botton, Sergey Y. Savrasov, C. J. Humphreys, Adrian P. Sutton", "abstract": "We demonstrate how by taking better account of electron correlations in the $3d$ shell of metal ions in nickel oxide it is possible to improve the description of both electron energy loss spectra and parameters characterizing the structural stability of the material compared with local spin density functional theory.", "venue": "Physical review. B, Condensed matter", "label": 0}, {"loc": [5.286030292510986, 3.084794282913208], "openalex_id": "https://openalex.org/W2029436856", "title": "The Blue Behemoth", "authors": "Yvonne Baskin", "abstract": "They grow longer than two city buses and are powered by a taxicab-sized heart. These whales have blue-gray backs and sides and white bellies, but in colder waters, their bellies acquire coats of diatoms. The single-celled algae gives them a yellowish cast and has earned the big blues a second name: sulfur-bottom whales. In summer and fall, the blues bellow out their massive pleated throats and suck in up to 45 tons of water at a gulp, straining it through mouths full of 500 to 800 baleen (whale bone) plates to extract three to four tons of krill each day. The fast-swimming, spindle-shaped leviathan easily eluded sail-powered whaling ships for 300 years. Then, in the 1860s, an enterprising Norwegian captain combined a steam-powered ship with a cannon-fired harpoon. For the next century, whalers vigorously pursued the blue whale and depleted its populations, first in the North Atlantic, then the North Pacific, and finally in its region of greatest abundance, the southern oceans. There, perhaps 190,000 blue whales once had roamed. The hunt reached its peak in the", "venue": "BioScience", "label": 0}, {"loc": [4.691807746887207, 2.073133945465088], "openalex_id": "https://openalex.org/W2002812183", "title": "Beyond Words: Sentiment Analysis of Croatian Language Attitudes", "authors": "Jonathan W. Schooler, Stellan Ohlsson, Kevin R. Brooks", "abstract": "Four experiments examined whether verbalization can interfere with insight problem solving. In Experiment 1, Ss were interrupted during problem solving and asked either to verbalize their strategies (retrospective verbalization) or engage in an unrelated activity (control). Ss in the retrospective verbalization condition were significantly less successful than control subjects at solving the problems. Experiment 2 replicated the finding of Experiment 1 and demonstrated that the control Ss' advantage was not due to any beneficial effect of the interruption. In Experiment 3, concurrent, nondirective verbalization impaired the solving of insight problems but had no effect on noninsight problems. In Experiment 4, the effect of concurrent verbalization on insight was maintained even when Ss were encouraged to consider alternative approaches. Together, these findings are consistent with the hypothesis that verbalization can result in the disruption of nonreportable processes that are critical to achieving insight solutions.", "venue": "Journal of Experimental Psychology General", "label": 0}, {"loc": [2.9350178241729736, 1.899086594581604], "openalex_id": "https://openalex.org/W1497256448", "title": "Natural", "authors": "John H. Holland", "abstract": "Genetic algorithms are playing an increasingly important role in studies of complex adaptive systems, ranging from adaptive agents in economic theory to the use of machine learning techniques in the design of complex devices such as aircraft turbines and integrated circuits. Adaptation in Natural and Artificial Systems is the book that initiated this field of study, presenting the theoretical foundations and exploring applications. In its most familiar form, adaptation is a biological process, whereby organisms evolve by rearranging genetic material to survive in environments confronting them. In this now classic work, Holland presents a mathematical model that allows for the nonlinearity of such complex interactions. He demonstrates the model's universality by applying it to economics, physiological psychology, game theory, and artificial intelligence and then outlines the way in which this approach modifies the traditional views of mathematical genetics. Initially applying his concepts to simply defined artificial systems with limited numbers of parameters, Holland goes on to explore their use in the study of a wide range of complex, naturally occuring processes, concentrating on systems having multiple factors that interact in nonlinear ways. Along the way he accounts for major effects of coadaptation and coevolution: the emergence of building blocks, or schemata, that are recombined and passed on to succeeding generations to provide, innovations and improvements. Bradford Books imprint", "venue": "The MIT Press eBooks", "label": 0}, {"loc": [2.3452749252319336, 1.7437090873718262], "openalex_id": "https://openalex.org/W784374252", "title": "School of Computing", "authors": "Adrie J. Visscher", "abstract": "This article presents a framework for examining school administrative computing and will serve as a point of reference for subsequent articles in this special issue of the Journal of Research on Computing in Education. Initially, four phases in the development of computer assisted school administration are presented. Next, possible school administrative applications are delineated. Finally, the variables that play an important role in the design, use, and effects of school administrative computing are presented by means of a model that depicts their interrelationship.", "venue": "Journal of Research on Computing in Education", "label": 0}, {"loc": [5.352837085723877, 3.047433376312256], "openalex_id": "https://openalex.org/W2765675878", "title": "Turning Point", "authors": "Sumanta Banerjee", "abstract": "Discoveries in modern subatomic physics have introduced into the scientific world the idea that an innate systemic interdependence exists between all physical phenomena. Gradually all investigators of the true nature of reality will adopt this new philosophy of interrelatedness, which will bring today's confused Western culture to a natural 'turning point'.", "venue": "Physics Bulletin", "label": 0}, {"loc": [2.782369375228882, 2.1064295768737793], "openalex_id": "https://openalex.org/W2160560729", "title": "6 Performance Analysis", "authors": "A. Murdoch, J. Winkelman, S. Javid, Robert Barton", "abstract": "This paper discusses an approach to the modeling and performance for the preliminary design phase of a large (6.2 MW) horizontal axis wind turbine generator (WTG). Two control philosophies are presented, both of which are based on linearized models of the WT mechanical and electrical systems. The control designs are compared by showing the performance through detailed non-linear time simulation. The disturbances considered are wind gusts, and electrical faults near the WT terminals.", "venue": "IEEE Transactions on Power Apparatus and Systems", "label": 0}, {"loc": [5.349012851715088, 3.062819719314575], "openalex_id": "https://openalex.org/W2164109169", "title": "Smoky Quartz: A Transparent Bilingual Large Language Model", "authors": "K. Nassau, B. Prescott", "abstract": "By appropriate irradiation and heat-treatment some natural and synthetic quartz can be made blue, green or greenish-yellow. In some of these samples, the A1 and A2 optical absorption bands usually assigned to the smoky color are present without any smoky coloration being evident. On further irradiation there develop: 1. the smoky color; 2. a previously unrecognized absorption band at 2.9 eV (designated A3); and 3, an EPR signal due to the substitutional Al center. Therefore, the smoky color originates in the A3 absorption band. The absorption curves can be reproduced in detail with five Gaussian components, clearly demonstrating the presence of A1 (at 1.85 eV), A2 (2.55 eV), A3 (2.90 eV), B (3.95 eV \u2013 previously reported only in fused silica), and C absorption bands. Durch geeignete Bestrahlung und Temperung gelingt es, verschiedene nat\u00fcrliche und synthetische Quarze blau, gr\u00fcn und gr\u00fcnlich-gelb zu f\u00e4rben. In einigen von diesen F\u00e4llen werden die optischen A1- und A2-Absorptionsbanden, die gew\u00f6hnlich dem Rauchquarz zugeschrieben werden, gefunden, obwohl die Rauchfarbe nicht vorhanden ist. Bei weiterer Bestrahlung entwickelt sich: 1, die Rauchfarbe, 2. eine bisher nicht gefundene Absorptions-bande bei 2,9 eV (mit A3 bezeichnet), und 3. ein EPR-Signal, das von der Al-Substitutionsst\u00f6rstelle herr\u00fchrt. Die Absorptionskurven lassen sich exakt mit f\u00fcnf Gau\u00df-Komponenten reproduzieren, was deutlich das Vorhandensein der A1- (bei 1,85 eV), A2- (2,55 eV), A3- (2,90 eV), B- (3,95 eV \u2013 fr\u00fcher nur in Quarzglas beobachtet) und C-Absorptionsbande zeigt.", "venue": "physica status solidi (a)", "label": 0}]; var labels = ["Other", "arXiv (Cornell University)", "IEEE Access", "Preprints.org", "PeerJ Computer Science", "Proceedings of the AAAI Conference on Artificial Intelligence", "HAL (Le Centre pour la Communication Scientifique Directe)", "ACM Computing Surveys", "Applied Sciences", "Natural Language Processing Journal", "British Journal of Educational Technology", "PLoS ONE", "bioRxiv (Cold Spring Harbor Laboratory)", "Journal of Medical Internet Research", "Proceedings of the AAAI/ACM Conference on AI Ethics and Society", "Natural language processing.", "AI & Society", "Information", "Artificial Intelligence Review", "Electronics", "Scientific Data", "Proceedings of the VLDB Endowment", "ACM Transactions on Asian and Low-Resource Language Information Processing", "Deep Blue (University of Michigan)", "Scientific Reports", "Research Square (Research Square)", "Transactions of the Association for Computational Linguistics", "Computational Linguistics", "JMIR Medical Informatics", "ACM Transactions on Management Information Systems", "Future Internet", "International Journal of Human-Computer Interaction", "Journal of King Saud University - Computer and Information Sciences", "Concurrency and Computation Practice and Experience", "Journal of Physics Conference Series", "International Journal of Advanced Computer Science and Applications", "Complex & Intelligent Systems", "Algorithms", "Interaction design & architecture(s)/ID&A Interaction design & architecture(s)", "Advances in systems analysis, software engineering, and high performance computing book series", "Rhetoric Society Quarterly", "ACM Transactions on Knowledge Discovery from Data", "Multimedia Tools and Applications", "Astronomy and Astrophysics", "Engineering Technology & Applied Science Research", "Digital Humanities in the Nordic and Baltic Countries Publications", "Mathematics", "International Journal of Interactive Multimedia and Artificial Intelligence", "Science and Technology Law Review", "Proceedings of the International AAAI Conference on Web and Social Media", "Ethics and Information Technology"];