Spaces:

NASK-PIB
/

LLaVA-PLLuM

Running

File size: 35,865 Bytes

<!DOCTYPE html>
<html data-theme="light">

<head>
  <meta charset="utf-8" />
  <meta name="description" content="Building an Open Polish Vision-Language Model." />
  <meta name="keywords" content="VLM, Polish, AI, Multimodal, LLM, PLLuM, LLaVA-PLLuM" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>LLaVA-PLLuM: a Polish Vision-Language Model</title>
  <link rel="icon"
    href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 36 36'><text x='0' y='32' font-size='32'>🇵🇱</text></svg>" />

  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet" />
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@1.0.4/css/bulma.min.css" />
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma-carousel@4.0.3/dist/css/bulma-carousel.min.css" />
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma-slider@2.0.4/dist/css/bulma-slider.min.css" />
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css" />
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css" />
  <link rel="stylesheet" href="./static/css/index.css" />
  <link rel="stylesheet" href="./static/css/custom.css" />
  <link rel="icon" href="./static/images/favicon.svg" />

  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/bulma-carousel@4.0.3/dist/js/bulma-carousel.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/bulma-slider@2.0.4/dist/js/bulma-slider.min.js"></script>
  <script src="https://unpkg.com/lucide@latest"></script>
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
  <script src="./static/js/index.js"></script>
</head>

<body>
  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h1 class="title is-1 publication-title">🇵🇱 LLaVA-PLLuM: Building an Open Polish
              Vision-Language Model</h1>
            <h2 class="subtitle is-4 publication-subtitle mt-4">
              Bridging the gap in multilingual AI with culturally-aware image understanding
            </h2>
            <div class="is-size-5 publication-authors">
              <div class="publication-authors">
                <strong>Grzegorz Statkiewicz, Alicja Dobrzeniecka, Aleksandra Krasnodębska, Sebastian Cygert, Wojciech
                  Kusa</strong>
              </div>
              <div class="author-institute">
                <span class="author-block">NASK National Research Institute</span>
              </div>
              <span class="author-block" style="margin-left: 15px">
                <a href="mailto:firstname.lastname@nask.pl">firstname.lastname@nask.pl</a>
              </span>
            </div>

            <div class="column has-text-centered">
              <div class="publication-links">
                <!-- PDF Link. -->
                <span class="link-block">
                  <a href="https://arxiv.org/abs/2602.14073" 
                  class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="ai ai-arxiv"></i>
                    </span>
                    <span>arXiv</span>
                  </a>
                </span>
                <!-- Model Link. -->
                <span class="link-block">
                  <a href="https://huggingface.co/collections/NASK-PIB/llava-pllum"
                    target="_blank"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fas fa-share-square"></i>
                    </span>
                    <span>Models</span>
                  </a>
                </span>
                <!-- Dataset Link. -->
                <span class="link-block">
                  <a href="https://huggingface.co/datasets/NASK-PIB/MMBench_V11_PL" 
                    target="_blank"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fas fa-database"></i>
                    </span>
                    <span>Dataset</span>
                  </a>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>

  <section class="section">
    <div class="container is-max-desktop">
      <div class="columns">
        <!-- Table of Contents -->
        <div class="column is-3 is-hidden-touch">
          <aside class="menu sticky-menu">
            <p class="menu-label">Contents</p>
            <ul class="menu-list">
              <li><a href="#introduction">Introduction</a></li>
              <li><a href="#methodology">Methodology</a></li>
              <li><a href="#evaluation">Evaluation & Results</a></li>
              <li><a href="#qualitative">Qualitative Results</a></li>
              <li><a href="#summary">Summary</a></li>
              <li><a href="#references">References</a></li>
              <li><a href="#bibtex">BibTeX</a></li>
            </ul>
          </aside>
        </div>

        <!-- Main Content -->
        <div class="column is-9">
          <!-- Introduction. -->
          <div class="columns is-centered" id="introduction">
            <div class="column is-full-width">
            <div class="notification is-info is-light">
              <p><strong>🔥Update Feb 2026:</strong> Added two new models—NASK-PIB/LLaVA-PLLuM-12b-nc-instruct and NASK-PIB/LLaVA-Bielik-11b-v2.6-instruct—and released our Polish translation of the MMBench dataset [<a href="https://huggingface.co/collections/NASK-PIB/llava-pllum">HuggingFace</a>]</p>
            </div>
              <h2 class="title is-3">Introduction</h2>
              <div class="content has-text-justified">
                <p>
                  Recent advances in multimodal large language models (MLLMs) have shown impressive capabilities in
                  combining text and visual understanding. However, most state-of-the-art solutions are trained
                  primarily on English data, which limits their applicability in other languages and cultural
                  contexts. Our goal is to bridge this gap by creating a Polish multimodal model that not only
                  understands text and images but also reflects Polish linguistic and cultural nuances.
                </p>
                <p>
                  In this blog post, we describe the methodology used to deliver a proof-of-concept for a Polish Large
                  Language Model capable of handling both text and visual data. Our approach builds on the LLaVA-NeXT
                  framework <a href="#ref-3">[3]</a>, which aligns a pretrained visual encoder with a large language
                  model (LLM) via a lightweight MLP (Multi-Layer Perceptron) projector. We use the following
                  components:
                </p>
                <ul>
                  <li>
                    <strong>Language Model:</strong> PLLuM-12B (Polish Large Language Model)
                    <a href="#ref-1">[1]</a> - a Polish-native, instruction-tuned LLM.
                  </li>
                  <li>
                    <strong>Vision Encoder:</strong> SigLIP2 So400m/14, 384px <a href="#ref-4">[4]</a> - chosen for
                    strong multilingual image-text alignment and improved localization.
                  </li>
                </ul>
                <p>
                  We also train a model based on the Bielik-11B-v2.6 language model <a href="#ref-2">[20]</a>, as an alternative backbone, to explore the impact of different LLMs on Polish multimodal performance.
                  We train our models using automatic translation combined with manual filtering, resulting in
                  approximately 550 thousand samples for pretraining and 2 million samples for instruction fine-tuning.
                  The models
                  accurately describe images, incorporate Polish cultural context, and handle basic visual tasks such as
                  OCR and object counting.
                </p>
                <p>
                  Evaluation on open-source benchmarks and qualitative analysis show notable improvements in Polish
                  language understanding as well as recognizing some of Polish cultural elements, while maintaining
                  general image understanding and reasoning capabilities compared to existing open-source models.
                </p>
                <p>
                  This proof-of-concept marks an initial step toward robust multimodal models for Polish. To accelerate
                  progress and foster collaboration, we are releasing our model weights on Hugging Face.
                </p>
              </div>
            </div>
          </div>
          <!--/ Introduction. -->

          <hr class="section-divider" />

          <!-- Methodology -->
          <div class="columns is-centered" id="methodology">
            <div class="column is-full-width">
              <h2 class="title is-3">Methodology</h2>

              <h3 class="title is-4">Model Architecture</h3>
              <div class="content has-text-justified">
                <p>
                  We build on the LLaVA-NeXT architecture <a href="#ref-3">[3]</a> which aligns a pretrained visual
                  encoder with a large language model (LLM) via a lightweight two-layer MLP projector. This design
                  preserves the LLM’s strong language prior while enabling efficient multimodal grounding. Compared to
                  the original LLaVA, LLaVA-NeXT supports higher input resolutions and dynamic tiling, features that
                  have been observed to improve fine-grained perception and OCR performance.
                </p>
                <p>
                  As the language backbone, we use three different Polish-native, instruction-tuned LLMs: <strong>PLLuM-12B-nc-instruct-250715</strong>
                  <a href="#ref-1">[1]</a>, <strong>PLLuM-12B-nc-instruct</strong>, and <strong>Bielik-11b-v2.6</strong> <a href="#ref-2">[20]</a>. 
                  For the vision tower, we replace
                  the CLIP-like encoder commonly used in LLaVA variants with
                  <strong>SigLIP2 So400m/14, 384px</strong> <a href="#ref-4">[4]</a>, selected for its strong
                  multilingual image-text alignment.
                </p>
              </div>

              <h3 class="title is-4">Training Procedure</h3>
              <div class="content has-text-justified">
                <p>We train the model in two stages following the LLaVA-NeXT procedure:</p>
                <ul>
                  <li>
                    <strong>Stage 1 (Pre-training):</strong> Freeze the LLM backbone and vision encoder, optimize only
                    the MLP projector on pretraining dataset to align the connector.
                  </li>
                  <li>
                    <strong>Stage 2 (Instruction Tuning):</strong> Joint training of vision tower and projector,
                    with LLM adaptation using LoRA on instruction dataset.
                  </li>
                </ul>
                <div class="table-container">
                  <table class="table is-bordered is-striped is-hoverable is-fullwidth">
                    <thead>
                      <tr>
                        <th>Parameter</th>
                        <th>Stage 1</th>
                        <th>Stage 2</th>
                      </tr>
                    </thead>
                    <tbody>
                      <tr>
                        <td><strong>Training Samples</strong></td>
                        <td>558K</td>
                        <td>2M</td>
                      </tr>
                      <tr>
                        <td><strong>Vision Encoder (Trainable)</strong></td>
                        <td>N/A</td>
                        <td>400M</td>
                      </tr>
                      <tr>
                        <td><strong>Projector (Trainable)</strong></td>
                        <td>30M</td>
                        <td>30M</td>
                      </tr>
                      <tr>
                        <td><strong>Language Model (Trainable)</strong></td>
                        <td>N/A</td>
                        <td>12B</td>
                      </tr>
                      <tr>
                        <td><strong>Context Size (Tokens)</strong></td>
                        <td>8,192</td>
                        <td>8,192</td>
                      </tr>
                      <tr>
                        <td><strong>Batch Size</strong></td>
                        <td>256</td>
                        <td>128</td>
                      </tr>
                      <tr>
                        <td><strong>Learning Rate (Vision Encoder)</strong></td>
                        <td>N/A</td>
                        <td>2x10⁻⁶</td>
                      </tr>
                      <tr>
                        <td><strong>Learning Rate (Projector)</strong></td>
                        <td>1x10⁻³</td>
                        <td>2x10⁻⁵</td>
                      </tr>
                      <tr>
                        <td><strong>Learning Rate (Language Model)</strong></td>
                        <td>N/A</td>
                        <td>2x10⁻⁵</td>
                      </tr>
                      <tr>
                        <td><strong>LoRA Rank (Language Model)</strong></td>
                        <td>N/A</td>
                        <td>128</td>
                      </tr>
                      <tr>
                        <td><strong>LoRA Alpha (Language Model)</strong></td>
                        <td>N/A</td>
                        <td>256</td>
                      </tr>
                      <tr>
                        <td><strong>LoRA Dropout (Language Model)</strong></td>
                        <td>N/A</td>
                        <td>0.05</td>
                      </tr>
                      <tr>
                        <td><strong>Epochs</strong></td>
                        <td>1</td>
                        <td>1</td>
                      </tr>
                    </tbody>
                  </table>
                </div>
              </div>

              <h3 class="title is-4">Training Data</h3>
              <div class="content has-text-justified">
                <p>As the pretraining dataset, we use the LLaVA-LCS-558K <a href="#ref-16">[16]</a> following the LLaVA
                  paper <a href="#ref-19">[19]</a>. This dataset is a subset of the LAION/CC/SBU collection, filtered
                  for balanced concept coverage. It consists of 558k image-caption pairs augmented with BLIP synthetic
                  captions, which we translate to Polish to align the visual features with our language model.</p>
                <p>Our instruction dataset spans four skill categories:</p>
                <ul>
                  <li><strong>General:</strong> We translate English datasets: ALLaVA <a href="#ref-5">[5]</a>,
                    LLaVA-Instruct <a href="#ref-6">[6]</a>, Q-Instruct <a href="#ref-7">[7]</a>, LVIS-Instruct4V <a
                      href="#ref-8">[8]</a>, and A-OKVQA <a href="#ref-9">[9]</a>.</li>
                  <li>
                    <strong>OCR:</strong> Synthetic document-style images. We generate a Polish version (SynthDoG-PL)
                    and use the English version (SynthDoG-EN) following the SynthDoG procedure <a
                      href="#ref-10">[10]</a>.
                  </li>
                  <li>
                    <strong>Knowledge:</strong> Based on the WIT dataset <a href="#ref-12">[12]</a>. We select samples
                    with human-written Polish and English captions.
                  </li>
                  <li><strong>Counting:</strong> We translate TallyQA <a href="#ref-13">[13]</a></li>
                </ul>
                <p>
                  For translation, we use the Tower+ 72B model <a href="#ref-14">[14]</a> and the COMET reference-free
                  metric <a href="#ref-15">[15]</a> for filtering.
                  The resulting datasets are instantiated mostly in Polish (85%), with a smaller sample in English
                  (15%).
                </p>
                <div class="column is-full">
                  <div class="box is-shadowless">
                    <canvas id="trainingDataChart"></canvas>
                    <p class="has-text-centered mt-5 has-text-grey">Fine-tuning data distribution</p>
                  </div>
                </div>
              </div>
            </div>
          </div>
          <!--/ Methodology -->

          <hr class="section-divider" />

          <!-- Evaluation -->
          <div class="columns is-centered" id="evaluation">
            <div class="column is-full-width">
              <h2 class="title is-3">Evaluation & Results</h2>
              <div class="content has-text-justified">
              <p>
              We conduct a two-fold evaluation to assess the performance of our Polish vision-language model: (1)
              quantitative benchmarking using MMBench v1.1, and (2) a model-as-a-judge study on image captioning quality
              in Polish.
              </p>
              </div>
              <h3 class="title is-4">MMBench v1.1</h3>
              <div class="content has-text-justified">
                <p>
                  Due to the absence of established multimodal evaluation benchmarks in Polish, we adopt existing
                  English benchmarks for quantitative assessment.
                  As a primary benchmark, we selected MMBench v1.1 <a href="#ref-17">[17]</a>, which evaluates multiple
                  dimensions of visual understanding, including object recognition, OCR, commonsense reasoning, and
                  fine-grained perception.
                  Because the official MMBench test split has not been released, we choose to evaluate on the
                  development set.
                </p>
                <p>
                  To enable Polish evaluation, we translated all MMBench v1.1 questions into Polish using Tower+ 72B <a
                    href="#ref-14">[14]</a>, followed by manual expert correction to ensure linguistic accuracy and
                  eliminate translation artifacts. The resulting MMBench-PL dataset is therefore human-validated and
                  suitable for assessing Polish multimodal reasoning.
                </p>
                <p>
                  The usage of development split makes comparisons strictly fair only against the LLaVA family of
                  models, whose training data and fine-tuning procedures are publicly documented. For other open-source
                  VLMs (e.g., Pixtral, Qwen2.5-VL, PaliGemma), the extent of exposure to MMBench during fine-tuning is
                  unknown.
                  Only PaliGemma partially discloses pre-training information, but not fine-tuning, and therefore direct
                  leaderboard-style comparison should be interpreted with caution.
                </p>
                <div class="table-container">
                  <table class="table is-bordered is-striped is-hoverable is-fullwidth">
                    <thead>
                      <tr>
                        <th>Model</th>
                        <th>MMBench (Polish)</th>
                        <th>MMBench (English)</th>
                      </tr>
                    </thead>
                    <tbody>
                      <tr>
                        <td>LLaVA-1.6-Mistral-7B</td>
                        <td>68.18%</td>
                        <td>76.54%</td>
                      </tr>
                      <tr>
                        <td>LLaVA-1.6-Vicuna-13B</td>
                        <td>69.80%</td>
                        <td>74.39%</td>
                      </tr>
                      <tr>
                        <td>LLaVA-PLLuM-12b-nc-250715 (Ours)</td>
                        <td>76.73%</td>
                        <td>75.23%</td>
                      </tr>
                      <tr>
                        <td>LLaVA-Bielik-11b-v2.6 (Ours)</td>
                        <td>78.24%</td>
                        <td>77.75%</td>
                      </tr>
                      <tr class="is-selected">
                        <td><strong>LLaVA-PLLuM-12b-nc (Ours)</strong></td>
                        <td><strong>79.35%</strong> <span class="tag is-success">+9.55%</span></td>
                        <td><strong>78.43%</strong></td>
                      </tr>

                      <tr class="has-background-light">
                        <td colspan="3" class="has-text-centered">
                          <em>Additional Open-Source Models (different architectures)</em>
                        </td>
                      </tr>

                      <tr>
                        <td>Qwen2.5-VL-7B</td>
                        <td>75.56%</td>
                        <td>80.62%</td>
                      </tr>
                      <tr>
                        <td>PaliGemma2-10B</td>
                        <td>78.39%</td>
                        <td>80.46%</td>
                      </tr>
                      <tr>
                        <td>Pixtral-12B</td>
                        <td>82.06%</td>
                        <td>84.31%</td>
                      </tr>
                    </tbody>
                  </table>
                </div>
                <p>
                    <strong>Key Finding:</strong> Our best model achieves <strong>79.35%</strong> on the Polish MMBench 
                    v1.1 benchmark, representing a <strong>+9.55% improvement</strong> over LLaVA-1.6-Vicuna-13B (69.80%)
                     while maintaining strong English performance at 78.43%. This demonstrates improved 
                     recognition of Polish context and linguistic understanding. When compared to other open-source models, 
                     LLaVA-PLLuM shows notably better Polish language understanding, outperforming Qwen2.5-VL-7B (75.56%) 
                     and PaliGemma2-10B (78.39%) on the Polish benchmark.
              </p>
              </div>

              <h3 class="title is-4">Open-ended Generation Evaluation</h3>
              <div class="content has-text-justified">
                <p>
                  To evaluate abilities that go beyond multiple-choice recognition and involve open-ended text
                  generation, we conducted a second study based on image captioning. For this purpose, we used the
                  Polish portion of the XM3600 dataset [<a href="#ref-18">18</a>].
                  The task in XM3600 requires models to produce accurate, relevant, and grammatically correct
                  descriptions of images, making it a suitable testbed for generative multimodal performance.
                  We benchmarked our models against three competitive open-source vision-language models of different
                  architectures: Qwen2.5-VL-7B-Instruct, Pixtral-12B, and PaliGemma-3B, complementing the MMBench
                  evaluation. Because no Polish human-annotated standard for caption quality currently exists, we adopted a threefold evaluation strategy:
                  (1) Open-source LLM and VLM judges, (2) Closed-source VLM judge, and (3) Human evaluation.
                  Please refer to the <a href="https://arxiv.org/pdf/2602.14073" target="_blank">full paper</a> for complete details of the evaluation methodology and results.
                </p>
                <div class="table-container">
                  <table class="table is-bordered is-striped is-hoverable is-fullwidth">
                    <thead>
                      <tr>
                        <th>Model</th>
                        <th>LLaVA-PLLuM-12B-nc-250715</th>
                        <th>LLaVA-PLLuM-12B-nc</th>
                        <th>LLaVA-Bielik-11B-v2.6</th>
                      </tr>
                    </thead>
                    <tbody>
                      <tr>
                        <td>LLaVA-1.6-Mistral-7B</td>
                        <td>84.91%</td>
                        <td><strong>85.81%</strong></td>
                        <td>82.35%</td>
                      </tr>
                      <tr>
                        <td>LLaVA-1.6-Vicuna-13B</td>
                        <td>63.64%</td>
                        <td><strong>66.71%</strong></td>
                        <td>60.32%</td>
                      </tr>
                      <tr>
                        <td>PaliGemma2-10B</td>
                        <td>77.47%</td>
                        <td><strong>77.53%</strong></td>
                        <td>74.10%</td>
                      </tr>
                      <tr>
                        <td>Pixtral-12B</td>
                        <td>43.38%</td>
                        <td>48.33%</td>
                        <td>40.31%</td>
                      </tr>
                      <tr>
                        <td>Qwen2.5-VL-7B</td>
                        <td>42.69%</td>
                        <td>43.15%</td>
                        <td>34.76%</td>
                      </tr>
                    </tbody>
                    <tfoot>
                      <tr class="has-background-light">
                        <td colspan="4" class="has-text-centered">
                          <em>Preference rate (%) of our models over baseline judged by LLM (Llama-3.3-70B-Instruct) on XM3600 dataset for linguistic correctness of descriptions.</em>
                        </td>
                      </tr>
                    </tfoot>
                  </table>
                </div>
              </div>
            </div>
          </div>
          <!--/ Evaluation -->

          <hr class="section-divider" />

          <!-- Qualitative Results -->
          <div class="columns is-centered" id="qualitative">
            <div class="column is-full-width">
              <h2 class="title is-3">Qualitative Results</h2>
              <div class="content has-text-justified">
                <p>
                  To examine the models’ ability to grasp and understand Polish cultural context, we collected and
                  annotated a small dataset of pictures.
                </p>

                <div id="qualitative-results-container"></div>
              </div>
            </div>
          </div>
          <!--/ Qualitative Results -->

          <hr class="section-divider" />

          <!-- Summary -->
          <div class="columns is-centered" id="summary">
            <div class="column is-full-width">
              <h2 class="title is-3">Summary & Next Steps</h2>
              <div class="content has-text-justified">
                <p>
                  We have presented our pipeline for creating: a Polish vision-language model.
                  Crucially, this system was developed with minimal data curation, relying primarily on synthetic and
                  machine-translated datasets, without human correction or manual
                  annotation. Starting from the open-source LLaVA model family and equipping it with the PLLuM language
                  model, we managed to improve the VLM's ability to understand the Polish language as well as aspects of
                  Polish cultural context. We show gains of 9.5 percentage points over LLaVA-based baselines on a
                  manually corrected Polish-language version of MMBench dataset, underscoring the effectiveness of our data-efficient
                  approach.
                </p>
                <p>
                  This is only the first step toward creating a more capable family of Polish vision-language models.
                  We expect that further scaling of data and leveraging more recent vision-language architectures will
                  lead to additional improvements. We also intend to enhance the evaluation protocols by incorporating
                  human assessments and expanding the benchmark datasets to better capture Polish-specific challenges.
                </p>
              </div>
            </div>
          </div>
          <!--/ Summary -->

          <hr class="section-divider" />

          <!-- References -->
          <div class="columns is-centered" id="references">
            <div class="column is-full-width">
              <h2 class="title is-3">References</h2>
              <div class="content">
                <ol>
                  <li id="ref-1">
                    PLLuM: A Family of Polish Large Language Models -
                    <a href="https://arxiv.org/abs/2511.03823" target="_blank">
                      arXiv:2511.03823
                    </a>
                  </li>
                  <li id="ref-2">
                    PLLuM Model -
                    <a href="https://huggingface.co/CYFRAGOVPL/pllum-12b-nc-instruct-250715" target="_blank">
                      Hugging Face
                    </a>
                  </li>
                  <li id="ref-3">
                    LLaVA-NeXT -
                    <a href="https://llava-vl.github.io/blog/2024-01-30-llava-next/" target="_blank">
                      Blog Post
                    </a>
                  </li>
                  <li id="ref-4">
                    SigLIP2 -
                    <a href="https://arxiv.org/abs/2502.14786" target="_blank">
                      arXiv:2502.14786
                    </a>
                  </li>
                  <li id="ref-5">
                    ALLaVA -
                    <a href="https://arxiv.org/abs/2402.11684" target="_blank">
                      arXiv:2402.11684
                    </a>
                  </li>
                  <li id="ref-6">
                    Visual Instruction Tuning (LLaVA) -
                    <a href="https://arxiv.org/abs/2304.08485" target="_blank">
                      arXiv:2304.08485
                    </a>
                  </li>
                  <li id="ref-7">
                    Q-Instruct -
                    <a href="https://arxiv.org/abs/2311.06783" target="_blank">
                      arXiv:2311.06783
                    </a>
                  </li>
                  <li id="ref-8">
                    LVIS-Instruct4V -
                    <a href="https://arxiv.org/abs/2311.07574" target="_blank">
                      arXiv:2311.07574
                    </a>
                  </li>
                  <li id="ref-9">
                    A-OKVQA -
                    <a href="https://arxiv.org/abs/2206.01718" target="_blank">
                      arXiv:2206.01718
                    </a>
                  </li>
                  <li id="ref-10">
                    SynthDoG -
                    <a href="https://arxiv.org/abs/2111.15664" target="_blank">
                      arXiv:2111.15664
                    </a>
                  </li>
                  <li id="ref-11">
                    MS COCO -
                    <a href="https://arxiv.org/abs/1405.0312" target="_blank">
                      arXiv:1405.0312
                    </a>
                  </li>
                  <li id="ref-12">
                    WIT Dataset -
                    <a href="https://doi.org/10.1145/3404835.3463257" target="_blank">
                      ACM Digital Library
                    </a>
                  </li>
                  <li id="ref-13">
                    TallyQA -
                    <a href="https://arxiv.org/abs/1810.12440" target="_blank">
                      arXiv:1810.12440
                    </a>
                  </li>
                  <li id="ref-14">
                    Tower+ Translation Model -
                    <a href="https://huggingface.co/Unbabel/Tower-Plus-72B" target="_blank">
                      Hugging Face
                    </a>
                  </li>
                  <li id="ref-15">
                    COMET Metric -
                    <a href="https://unbabel.github.io/COMET/html/index.html" target="_blank">
                      Documentation
                    </a>
                  </li>
                  <li id="ref-16">
                    LLaVA-Pretrain Dataset -
                    <a href="https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain" target="_blank">
                      Hugging Face
                    </a>
                  </li>
                  <li id="ref-17">
                    MMBench -
                    <a href="https://huggingface.co/spaces/opencompass/open_vlm_leaderboard" target="_blank">
                      OpenCompass Leaderboard
                    </a>
                  </li>
                  <li id="ref-18">
                    Crossmodal-3600: A Massively Multilingual Multimodal Evaluation Dataset -
                    <a href="https://aclanthology.org/2022.emnlp-main.45/" target="_blank">
                      EMNLP 2022
                    </a>
                  </li>
                  <li id="ref-19">
                    Improved Baselines with Visual Instruction Tuning (LLaVA-1.5) -
                    <a href="https://arxiv.org/abs/2310.03744" target="_blank">
                      arXiv:2310.03744
                    </a>
                  <li id="ref-20">
                    Bielik 11B v2 Technical Report -
                    <a href="https://arxiv.org/abs/2505.02410" target="_blank">
                      arXiv:2505.02410
                    </a>
                  </li>
                </ol>
              </div>
            </div>
          </div>

          <!-- BibTeX -->
          <div class="columns is-centered" id="bibtex">
            <div class="column is-full-width">
              <h2 class="title is-3">BibTeX</h2>
              <pre><code>
@inproceedings{statkiewicz2026annotation,
  title     = {Annotation-Efficient Vision-Language Model Adaptation to the Polish Language Using the LLaVA Framework},
  author    = {Statkiewicz, Grzegorz and
               Dobrzeniecka, Alicja and
               Seweryn, Karolina and
               Krasnodębska, Aleksandra and
               Piosek, Karolina and
               Bogusz, Katarzyna and
               Cygert, Sebastian and
               Kusa, Wojciech},
  booktitle = {Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics}
}

                </code></pre>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>

  <footer class="footer">
    <div class="container">
      <div class="content has-text-centered">
        <p>
          This website is adapted from <a href="https://github.com/nerfies/nerfies.github.io" target="_blank">Nerfies</a>, licensed
          under a
          <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative Commons Attribution-ShareAlike
            4.0 International License</a>.
        </p>
      </div>
    </div>
  </footer>

  <button id="scrollToTopBtn" class="button is-rounded is-dark" title="Go to top">
    <span class="icon">
      <i class="fas fa-arrow-up"></i>
    </span>
  </button>
</body>

</html>