dl3239491 committed on
Commit 30c14cd · verified · 1 Parent(s): ce477b1

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.

Files changed (50)
  1. .DS_Store +0 -0
  2. .gitattributes +3 -0
  3. .pytest_cache/.gitignore +2 -0
  4. .pytest_cache/CACHEDIR.TAG +4 -0
  5. .pytest_cache/README.md +8 -0
  6. .pytest_cache/v/cache/nodeids +3 -0
  7. .ruff_cache/.gitignore +2 -0
  8. .ruff_cache/0.14.10/10241894308290549172 +0 -0
  9. .ruff_cache/0.14.10/1073426088278906643 +0 -0
  10. .ruff_cache/0.14.10/13957033273656742151 +0 -0
  11. .ruff_cache/0.14.10/1442719585850318975 +0 -0
  12. .ruff_cache/0.14.10/14754177912317367819 +0 -0
  13. .ruff_cache/0.14.10/14978186029505022734 +0 -0
  14. .ruff_cache/0.14.10/15569745458013874055 +0 -0
  15. .ruff_cache/0.14.10/17608220473508725558 +0 -0
  16. .ruff_cache/0.14.10/18191902847846296179 +0 -0
  17. .ruff_cache/0.14.10/2046185769257499142 +0 -0
  18. .ruff_cache/0.14.10/3165187837348788939 +0 -0
  19. .ruff_cache/0.14.10/4171122735627067383 +0 -0
  20. .ruff_cache/0.14.10/8273464926453838394 +0 -0
  21. .ruff_cache/0.14.10/9088412491868955099 +0 -0
  22. .ruff_cache/0.14.10/9103521535542433765 +0 -0
  23. .ruff_cache/0.14.10/9189204400079810969 +0 -0
  24. .ruff_cache/0.14.10/9226417474992298237 +0 -0
  25. .ruff_cache/0.14.10/9918913907578606062 +0 -0
  26. .ruff_cache/CACHEDIR.TAG +1 -0
  27. ACKNOWLEDGEMENTS +34 -0
  28. CODE_OF_CONDUCT.md +71 -0
  29. CONTRIBUTING.md +10 -0
  30. LICENSE +46 -0
  31. README.md +407 -0
  32. docs/Gemfile +23 -0
  33. docs/_config.yml +61 -0
  34. docs/getting_started.md +80 -0
  35. docs/index.md +53 -0
  36. docs/inference.md +134 -0
  37. docs/training.md +129 -0
  38. evaluation/evaluate.py +936 -0
  39. evaluation/evaluate.py.bak +910 -0
  40. evaluation/evaluation_data/end_to_end_evaluation/2wiki.zip +3 -0
  41. evaluation/evaluation_data/end_to_end_evaluation/hotpotqa.zip +3 -0
  42. evaluation/evaluation_data/end_to_end_evaluation/musique.zip +3 -0
  43. evaluation/evaluation_data/end_to_end_evaluation/nq.zip +3 -0
  44. evaluation/evaluation_data/instruction_tuning_evaluation/2wiki.zip +3 -0
  45. evaluation/evaluation_data/instruction_tuning_evaluation/hotpotqa.zip +3 -0
  46. evaluation/evaluation_data/instruction_tuning_evaluation/musique.zip +3 -0
  47. evaluation/evaluation_data/instruction_tuning_evaluation/nq.zip +3 -0
  48. example/end_to_end_data.jsonl +3 -0
  49. example/instruction_tuning_data.jsonl +0 -0
  50. example/pretrain_data.jsonl +0 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+example/end_to_end_data.jsonl filter=lfs diff=lfs merge=lfs -text
+figs/intro.png filter=lfs diff=lfs merge=lfs -text
+figs/sample_main.png filter=lfs diff=lfs merge=lfs -text
.pytest_cache/.gitignore ADDED
@@ -0,0 +1,2 @@
# Created by pytest automatically.
*
.pytest_cache/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
Signature: 8a477f597d28d172789f06886806bc55
# This file is a cache directory tag created by pytest.
# For information about cache directory tags, see:
#     https://bford.info/cachedir/spec.html
.pytest_cache/README.md ADDED
@@ -0,0 +1,8 @@
# pytest cache directory #

This directory contains data from the pytest's cache plugin,
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.

**Do not** commit this to version control.

See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
.pytest_cache/v/cache/nodeids ADDED
@@ -0,0 +1,3 @@
[
  "tests/test_placeholder.py::test_placeholder"
]
.ruff_cache/.gitignore ADDED
@@ -0,0 +1,2 @@
# Automatically created by ruff.
*
.ruff_cache/0.14.10/10241894308290549172 ADDED
Binary file (136 Bytes).

.ruff_cache/0.14.10/1073426088278906643 ADDED
Binary file (136 Bytes).

.ruff_cache/0.14.10/13957033273656742151 ADDED
Binary file (1.05 kB).

.ruff_cache/0.14.10/1442719585850318975 ADDED
Binary file (171 Bytes).

.ruff_cache/0.14.10/14754177912317367819 ADDED
Binary file (132 Bytes).

.ruff_cache/0.14.10/14978186029505022734 ADDED
Binary file (129 Bytes).

.ruff_cache/0.14.10/15569745458013874055 ADDED
Binary file (136 Bytes).

.ruff_cache/0.14.10/17608220473508725558 ADDED
Binary file (132 Bytes).

.ruff_cache/0.14.10/18191902847846296179 ADDED
Binary file (129 Bytes).

.ruff_cache/0.14.10/2046185769257499142 ADDED
Binary file (129 Bytes).

.ruff_cache/0.14.10/3165187837348788939 ADDED
Binary file (171 Bytes).

.ruff_cache/0.14.10/4171122735627067383 ADDED
Binary file (129 Bytes).

.ruff_cache/0.14.10/8273464926453838394 ADDED
Binary file (1.05 kB).

.ruff_cache/0.14.10/9088412491868955099 ADDED
Binary file (1.05 kB).

.ruff_cache/0.14.10/9103521535542433765 ADDED
Binary file (132 Bytes).

.ruff_cache/0.14.10/9189204400079810969 ADDED
Binary file (136 Bytes).

.ruff_cache/0.14.10/9226417474992298237 ADDED
Binary file (132 Bytes).

.ruff_cache/0.14.10/9918913907578606062 ADDED
Binary file (1.05 kB).
.ruff_cache/CACHEDIR.TAG ADDED
@@ -0,0 +1 @@
Signature: 8a477f597d28d172789f06886806bc55
ACKNOWLEDGEMENTS ADDED
@@ -0,0 +1,34 @@
Acknowledgements
Portions of this CLaRa Software may utilize the following copyrighted
material, the use of which is hereby acknowledged.

_____________________

Naver Labs Europe (PISCO-mistral)
Licensed under the Creative Commons Attribution-NonCommercial 4.0 International License (CC BY-NC 4.0).

Copyright © Naver Labs Europe

You are free to:
- Share — copy and redistribute the material in any medium or format
- Adapt — remix, transform, and build upon the material

Under the following terms:
- Attribution — You must give appropriate credit, provide a link to the license,
  and indicate if changes were made.
- NonCommercial — You may not use the material for commercial purposes.

Full license text available at: https://creativecommons.org/licenses/by-nc/4.0/

OpenRLHF authors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,71 @@
# Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment
include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or
  advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
  address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

## Scope

This Code of Conduct applies within all project spaces, and it also applies when
an individual is representing the project or its community in public spaces.
Examples of representing a project or community include using an official
project e-mail address, posting via an official social media account, or acting
as an appointed representative at an online or offline event. Representation of
a project may be further defined and clarified by project maintainers.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the open source team at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com). All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4,
available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html)
CONTRIBUTING.md ADDED
@@ -0,0 +1,10 @@
# Contribution Guide

Thanks for your interest in contributing. This project was released to accompany a research paper for purposes of reproducibility, and beyond its publication there are limited plans for future development of the repository.

While we welcome new pull requests and issues, please note that our response may be limited. Forks and out-of-tree improvements are strongly encouraged.

## Before you get started

By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the [LICENSE](LICENSE).
LICENSE ADDED
@@ -0,0 +1,46 @@
Copyright (C) 2025 Apple Inc. All Rights Reserved.

IMPORTANT: This Apple software is supplied to you by Apple
Inc. ("Apple") in consideration of your agreement to the following
terms, and your use, installation, modification or redistribution of
this Apple software constitutes acceptance of these terms. If you do
not agree with these terms, please do not use, install, modify or
redistribute this Apple software.

In consideration of your agreement to abide by the following terms, and
subject to these terms, Apple grants you a personal, non-exclusive
license, under Apple's copyrights in this original Apple software (the
"Apple Software"), to use, reproduce, modify and redistribute the Apple
Software, with or without modifications, in source and/or binary forms;
provided that if you redistribute the Apple Software in its entirety and
without modifications, you must retain this notice and the following
text and disclaimers in all such redistributions of the Apple Software.
Neither the name, trademarks, service marks or logos of Apple Inc. may
be used to endorse or promote products derived from the Apple Software
without specific prior written permission from Apple. Except as
expressly stated in this notice, no other rights or licenses, express or
implied, are granted by Apple herein, including but not limited to any
patent rights that may be infringed by your derivative works or by other
works in which the Apple Software may be incorporated.

The Apple Software is provided by Apple on an "AS IS" basis. APPLE
MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.

IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

---

This CLaRa Software may utilize third party materials. Please refer to the
ACKNOWLEDGEMENTS file included with this software for attribution and
license information related to third party code that may be contained in or
used with this CLaRa Software.
README.md ADDED
@@ -0,0 +1,407 @@
## CLaRa: Bridging Retrieval and Generation with Continuous Latent Reasoning

<div align="center">
    <img src="figs/clara_logo.jpg" width="400"/>
</div>

<div align="center">
    <a href="https://arxiv.org/abs/2511.18659"><img src="https://img.shields.io/badge/arXiv-2511.18659-b31b1b.svg" alt="arXiv"></a>
    <a href="LICENSE"><img src="https://img.shields.io/badge/License-Apple-blue" alt="License"></a>
    <a href="https://huggingface.co/apple/CLaRa-7B-Base"><img src="https://img.shields.io/badge/Hugging%20Face-CLaRa_Base-FFEB3B" alt="deploy"></a>
    <a href="https://huggingface.co/apple/CLaRa-7B-Instruct"><img src="https://img.shields.io/badge/Hugging%20Face-CLaRa_Instruct-FFEB3B" alt="deploy"></a>
    <a href="https://huggingface.co/apple/CLaRa-7B-E2E"><img src="https://img.shields.io/badge/Hugging%20Face-CLaRa_End_to_end-FFEB3B" alt="deploy"></a>
    <a href="https://huggingface.co/datasets/apple/CLaRa_multi_stage"><img src="https://img.shields.io/badge/Hugging%20Face-CLaRa_Data-FFEB3B" alt="data"></a>
</div>

This is the official open-source release of CLaRa, a state-of-the-art, end-to-end Retrieval-Augmented Generation model.

### Updates

- Dec 11, 2025. All data used in the paper are available on [Huggingface](https://huggingface.co/datasets/apple/CLaRa_multi_stage).
- Dec 10, 2025. We are working on an MLX version of the model, to be announced soon.
- Dec 3, 2025. Evaluation data are available in `./evaluation/evaluation_data`.
- Nov 25, 2025. Models are available on Huggingface.

### Motivation

Retrieval-Augmented Generation (RAG) enhances large language models with external knowledge but suffers from **long contexts** and **disjoint retrieval-generation optimization**. Existing soft compression frameworks face two key limitations: (i) reconstruction-based objectives bias compressors toward surface patterns rather than semantic preservation; (ii) retrievers and compressors are trained separately, requiring double encoding despite compressed vectors being inherently retrievable.

In this work, we investigate:

- **How can we improve semantic preservation in compressed representations through better pretraining objectives?**
- **How can we unify retrieval and generation optimization to avoid redundant encoding and disjoint objectives?**

<div align="center">
<img src="figs/intro.png" width="100%"/>
</div>

We design a three-stage training approach and introduce document compression techniques to improve RAG efficiency. The key findings are listed below.

### Findings

- **Efficient Compression**: CLaRa achieves significant compression rates (32x-64x) while preserving the information essential for accurate answer generation.

- **Three-Stage Training**: A carefully designed three-stage training approach (compression pretraining + compression instruction tuning + end-to-end fine-tuning) enables effective learning of both retrieval and generation.

For more findings, please refer to our original paper!

---

### Three-Stage Training

CLaRa uses a carefully designed three-stage training approach:

**Stage 1: Compression Pretraining**
- Train the compressor with the SCP framework using QA pairs and paraphrases
- Retain key semantics through QA-based and paraphrase-guided supervision
- Support compression rates of 1x-256x

**Stage 2: Compression Instruction Tuning**
- Fine-tune the compressor on instruction-following tasks for downstream QA
- Use text-based QA output to ensure compressed representations retain sufficient semantics

**Stage 3: End-to-End Fine-tuning (CLaRa)**
- Jointly train the reranker and generator via a single language modeling loss
- Unify retrieval and generation in a shared continuous space using a differentiable top-k estimator (see the sketch after this list)
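
CLaRa's exact differentiable top-k estimator is specified in the paper; purely as a generic illustration of how a hard top-k selection can still pass gradients to a reranker, a straight-through soft top-k can be sketched as below. Everything here (`soft_topk_mask`, the temperature `tau`, the tensor shapes) is an illustrative assumption, not the repository's implementation.

```python
import torch

def soft_topk_mask(scores: torch.Tensor, k: int, tau: float = 1.0) -> torch.Tensor:
    """Generic straight-through soft top-k (illustrative only).
    Forward pass: a hard 0/1 mask over the k highest-scoring documents.
    Backward pass: gradients flow through a softmax surrogate."""
    soft = torch.softmax(scores / tau, dim=-1)              # differentiable surrogate
    hard = torch.zeros_like(scores)
    hard.scatter_(-1, scores.topk(k, dim=-1).indices, 1.0)  # hard selection
    return hard + soft - soft.detach()                      # straight-through trick

scores = torch.randn(1, 20, requires_grad=True)  # reranker scores for 20 candidate docs
mask = soft_topk_mask(scores, k=5)
mask.sum().backward()                            # gradients reach the reranker scores
```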

In this repository, we release our implementation of **CLaRa**, built upon [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF).

### Getting Started

```
├── scripts/                             # Training and evaluation scripts
│   ├── train_pretraining.sh             # Stage 1: Compression pretraining
│   ├── train_instruction_tuning.sh      # Stage 2: Compression instruction tuning
│   ├── train_stage_end_to_end.sh        # Stage 3: End-to-end training
│   └── evaluation_end_to_end.sh         # Evaluation scripts
├── openrlhf/                            # Core training framework
│   ├── models/                          # Model implementations
│   │   └── modeling_clara.py            # CLaRa model definition
│   ├── datasets/                        # Dataset handling
│   │   └── sft_dataset.py               # Training dataset
│   ├── trainer/                         # Training utilities
│   │   └── sft_trainer.py               # SFT trainer
│   └── cli/                             # Command line interface
│       └── train_sft.py                 # Main training script
├── evaluation/                          # Evaluation framework
├── example/                             # Example training data
│   ├── pretrain_data.jsonl
│   ├── instruction_tuning_data.jsonl
│   └── end_to_end_data.jsonl
└── README.md                            # This file
```

- Video instruction for installation (from @Fahd Mirza): https://youtu.be/al2VoAKn8GU?si=Q8bq7QNMaTvcArwa
- Video digest (from @Richard Aragon): https://www.youtube.com/watch?v=yRM92mmKNH4

#### 1. Prepare code and environment

Clone the repository and set up the environment:

```bash
# Create conda environment
env=clara
conda create -n $env python=3.10 -y
conda activate $env

# Install dependencies
pip install -r requirements.txt

# Set up environment variables
export PYTHONPATH=/path/to/clara:$PYTHONPATH
```

Key dependencies include:
- PyTorch >= 2.0
- Transformers >= 4.20
- DeepSpeed >= 0.18
- Flash Attention 2
- Accelerate

#### 2. Data preparation

Prepare training data in JSONL format. Example record for the pretraining stage:

```json
{
    "data_type": "qa",
    "question": ["Question 1", ...],
    "answers": ["Answer 1"],
    "docs": ["Document 1"]
}
```

For end-to-end training:

```json
{
    "question": "Single question text",
    "docs": ["Document 1", "Document 2", ...],
    "gold_answer": "Reference answer"
}
```
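
Since each training file is JSONL (one JSON object per line), records are easy to produce programmatically. The sketch below is illustrative only: the field values are invented and `write_jsonl` is a hypothetical helper, but the keys follow the two formats above.

```python
import json

def write_jsonl(records, path):
    # Hypothetical helper: one JSON object per line, as JSONL loaders expect.
    with open(path, "w", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

# Invented example values; only the keys are prescribed by the formats above.
pretrain_record = {
    "data_type": "qa",
    "question": ["Who wrote Hamlet?"],
    "answers": ["William Shakespeare"],
    "docs": ["Hamlet is a tragedy written by William Shakespeare."],
}
end_to_end_record = {
    "question": "Who wrote Hamlet?",
    "docs": ["Hamlet is a tragedy written by William Shakespeare.",
             "Macbeth is another Shakespeare tragedy."],
    "gold_answer": "William Shakespeare",
}

write_jsonl([pretrain_record], "my_pretrain_data.jsonl")
write_jsonl([end_to_end_record], "my_end_to_end_data.jsonl")
```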

#### 3. Start training

**Stage 1: Salient Compressor Pretraining (SCP)**

Pre-train the document compressor:

```bash
bash scripts/train_pretraining.sh
```

Key parameters:
- `--compress_rate`: Compression rate (default: 32)
- `--doc_max_length`: Maximum document length (default: 256)
- `--stage stage1`: Training stage
- `--mse_loss`: Use MSE loss to align compressed and original representations
- `--qa_loss`: Use QA loss for semantic preservation

**Stage 2: Compression Instruction Tuning**

Fine-tune the compressor on instruction-following tasks:

```bash
bash scripts/train_instruction_tuning.sh
```

Key parameters:
- `--pretrain_checkpoint`: Path to the stage 1 checkpoint
- `--stage stage1_2`: Training stage
- `--generation_top_k`: Number of top documents used for generation (default: 5)
- `--mse_loss`: Use MSE loss for compression training
- `--do_eval_gen`: Enable generation evaluation

**Stage 3: End-to-End Training**

Fine-tune the model end-to-end with retrieval:

```bash
bash scripts/train_stage_end_to_end.sh
```

Key parameters:
- `--pretrain_checkpoint`: Path to the stage 2 checkpoint
- `--stage stage2`: Training stage
- `--generation_top_k`: Number of top documents used for generation
- `--do_eval_gen`: Enable generation evaluation

#### 4. Distributed Training

The training scripts support distributed training across multiple nodes and GPUs:

- `--max_len`: Maximum sequence length (default: 2048 for stage 1/2, 1024 for stage 3)
- `--train_batch_size`: Training batch size
- `--micro_train_batch_size`: Micro batch size for gradient accumulation (see the sketch below)
- `--learning_rate`: Learning rate (default: 1e-4 for stage 1/2, 5e-6 for stage 3)
- `--max_epochs`: Maximum training epochs
- `--zero_stage`: ZeRO optimization stage (default: 2)
- `--bf16`: Use bfloat16 precision
- `--flash_attn`: Use Flash Attention 2
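
Global and micro batch sizes are typically tied together through gradient accumulation. The arithmetic below shows the usual relationship under the common DeepSpeed/OpenRLHF convention; it is a sketch with invented values, not numbers checked against this repository's scripts.

```python
# Illustrative arithmetic only; the flag values are invented.
train_batch_size = 128       # --train_batch_size: global batch per optimizer step
micro_train_batch_size = 4   # --micro_train_batch_size: per GPU per forward pass
world_size = 8               # total number of GPUs
grad_accum_steps = train_batch_size // (micro_train_batch_size * world_size)
print(grad_accum_steps)      # 4 forward passes accumulated per optimizer step
```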

### Inference

The CLaRa models can be loaded and used for inference. We provide three models corresponding to the three training stages:

<details>
<summary>Stage 1: Compression Pretraining model (click to expand)</summary>

```python
from transformers import AutoModel

model_path = "path/to/stage1/model"
model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True
).to('cuda')

# Example documents
documents = [
    [
        "Document 1 content...",
        "Document 2 content...",
        "Document 3 content..."
    ]
]

questions = ["" for _ in range(len(documents))]

# Generate paraphrase from compressed representations
output = model.generate_from_paraphrase(
    questions=questions,
    documents=documents,
    max_new_tokens=64
)

print('Generated paraphrase:', output[0])
```

</details>

<details>
<summary>Stage 2: Compression Instruction Tuning model (click to expand)</summary>

```python
from transformers import AutoModel

model_path = "path/to/stage2/model"
model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True
).to('cuda')

# Example documents and question
documents = [
    [
        "Document 1 content...",
        "Document 2 content...",
        "Document 3 content..."
    ]
]

questions = ["Your question here"]

# Generate answer from compressed representations
output = model.generate_from_text(
    questions=questions,
    documents=documents,
    max_new_tokens=64
)

print('Generated answer:', output[0])
```

</details>

<details>
<summary>Stage 3: End-to-End (CLaRa) model (click to expand)</summary>

```python
from transformers import AutoModel

model_path = "path/to/stage3/model"
model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True
).to('cuda')

# Example documents and question
# Note: Stage 3 supports retrieval with multiple candidate documents
documents = [
    ["Document 1 content..." for _ in range(20)]  # 20 candidate documents
]

questions = ["Your question here"]

# Generate answer with retrieval and reranking
# The top-k is decided by generation_top_k in config.json
output, topk_indices = model.generate_from_questions(
    questions=questions,
    documents=documents,
    max_new_tokens=64
)

print('Generated answer:', output[0])
print('Top-k selected document indices:', topk_indices)
```

</details>

### Evaluation

The evaluation framework is based on standard RAG benchmarks. Run the evaluation scripts:

**End-to-end evaluation:**
```bash
bash scripts/evaluation_end_to_end.sh
```

**Instruction tuning evaluation:**
```bash
bash scripts/evaluation_instruction_tuning.sh
```

Supported datasets:
- **HotpotQA**: Multi-hop question answering
- **MuSiQue**: Multi-hop question answering with diverse reasoning
- **2WikiMultiHopQA**: Multi-hop question answering over Wikipedia
- **Natural Questions**: Open-domain question answering
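
Accuracy-style metrics (EM, F1, accuracy) in `evaluation/evaluate.py` compare answers only after text normalization. A condensed sketch of that normalization, mirroring `EvaluationMetrics.normalize_answer` in the evaluation script included in this release:

```python
import re
import string

def normalize_answer(text: str) -> str:
    # Lowercase, drop punctuation, drop articles, collapse whitespace
    # (mirrors EvaluationMetrics.normalize_answer in evaluation/evaluate.py).
    text = "".join(ch for ch in text.lower() if ch not in set(string.punctuation))
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())

# Exact match holds when the normalized strings are identical.
assert normalize_answer("The Eiffel Tower!") == normalize_answer("eiffel tower")
```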

### Results

#### Compression Performance

We evaluate our document compressor on four QA datasets (NQ, HotpotQA, MuSiQue, 2WikiMultiHopQA) under two settings: **Normal** (retrieving top-5 documents) and **Oracle** (gold document included). CLaRa consistently outperforms all baselines across different compression ratios.

<div align="center">

**Main Results (Mistral-7B, Normal Setting)**

| Model | CR | NQ | HotpotQA | MuSiQue | 2Wiki | Avg |
|:---|:---:|:---:|:---:|:---:|:---:|:---:|
| AutoCompressor | - | 17.24 | 14.61 | 3.81 | 19.89 | 13.89 |
| XRAG | 128 | 32.35 | 25.16 | 3.64 | 28.79 | 22.48 |
| COCOM | 16 | 24.12 | 21.48 | 3.52 | 24.48 | 18.40 |
| PCC | 16 | 31.38 | 22.29 | 3.43 | 19.47 | 19.14 |
| LLMLingua-2 | 4 | 47.53 | 37.05 | 9.02 | 44.35 | 34.49 |
| PISCO | 16 | 54.39 | 41.94 | 10.09 | 44.88 | 37.83 |
| Mistral-7B w/ retrieval | - | 54.58 | 42.94 | 8.94 | 44.24 | 37.67 |
| **CLaRa (CR=4)** | **4** | **57.05** | **45.09** | **10.34** | **46.94** | **39.86** |
| **CLaRa (CR=16)** | **16** | **55.56** | **43.72** | **10.55** | **46.00** | **38.96** |
| **CLaRa (CR=32)** | **32** | **54.64** | **43.52** | **10.55** | **46.58** | **38.82** |

**Oracle Setting Results (Mistral-7B)**

| Model | CR | NQ | HotpotQA | MuSiQue | 2Wiki | Avg |
|:---|:---:|:---:|:---:|:---:|:---:|:---:|
| PISCO | 16 | 73.44 | 66.53 | 33.80 | 60.45 | 58.55 |
| Mistral-7B w/ retrieval | - | 71.64 | 70.77 | 45.72 | 68.83 | 64.24 |
| **CLaRa (CR=4)** | **4** | **76.50** | **73.81** | **46.26** | **70.48** | **66.76** |
| **CLaRa (CR=16)** | **16** | **75.48** | **70.79** | **43.15** | **66.16** | **63.90** |
| **CLaRa (CR=32)** | **32** | **73.77** | **69.51** | **38.31** | **64.54** | **61.53** |

</div>

**Key Findings:**
- ✅ CLaRa outperforms PISCO by **+1.13%** (Normal) and **+5.35%** (Oracle) on average
- ✅ CLaRa outperforms LLMLingua-2 by **+5.37%** (Normal) on average
- ✅ CLaRa matches or exceeds the text-based baseline, with a **+2.36%** average gain on Mistral-7B

#### Retrieval Performance

<div align="center">
<img src="figs/main_recall.png" width="80%"/>
</div>

For detailed experimental results and analysis, please refer to our paper.

### Acknowledgments

We sincerely appreciate the following works, on which CLaRa builds:

- Our implementation is built upon the [OpenRLHF framework](https://github.com/OpenRLHF/OpenRLHF).
- Our document compression techniques are inspired by [PISCO-mistral](https://huggingface.co/naver/pisco-mistral).

### Citation

```bibtex
@misc{he2025clarabridgingretrievalgeneration,
      title={CLaRa: Bridging Retrieval and Generation with Continuous Latent Reasoning},
      author={Jie He and Richard He Bai and Sinead Williamson and Jeff Z. Pan and Navdeep Jaitly and Yizhe Zhang},
      year={2025},
      eprint={2511.18659},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2511.18659},
}
```
docs/Gemfile ADDED
@@ -0,0 +1,23 @@
source "https://rubygems.org"

gem "jekyll", "~> 4.3"
gem "jekyll-feed", "~> 0.12"
gem "jekyll-seo-tag", "~> 2.8"
gem "jekyll-sitemap", "~> 1.4"
gem "jekyll-theme-cayman", "~> 0.2"

# Windows and JRuby do not include zoneinfo files, so bundle the tzinfo-data gem
# and associated library.
platforms :mingw, :x64_mingw, :mswin, :jruby do
  gem "tzinfo", ">= 1", "< 3"
  gem "tzinfo-data"
end

# Performance-booster for watching directories on Windows
gem "wdm", "~> 0.1.1", :platforms => [:mingw, :x64_mingw, :mswin]

# Lock `http_parser.rb` gem to `v0.6.x` on JRuby builds since newer versions of the gem
# do not have a Java counterpart.
gem "http_parser.rb", "~> 0.6.0", :platforms => [:jruby]
docs/_config.yml ADDED
@@ -0,0 +1,61 @@
# Jekyll configuration for CLaRa Documentation

# Site settings
title: CLaRa Documentation
description: Complete documentation for CLaRa - Unified Retrieval-Augmented Generation with Compression
baseurl: "/CLaRa" # the subpath of your site for GitHub Pages
url: "https://aiml-oss.github.io" # the base hostname & protocol for GitHub Pages

# Theme
theme: jekyll-theme-cayman
# Alternative themes:
# theme: jekyll-theme-minimal
# theme: jekyll-theme-slate
# theme: jekyll-theme-architect

# GitHub repository info
repository: probe2/CLaRa

# Build settings
markdown: kramdown
kramdown:
  input: GFM
  syntax_highlighter: rouge

# Plugins
plugins:
  - jekyll-feed
  - jekyll-seo-tag
  - jekyll-sitemap

# Navigation
header_pages:
  - index.md
  - getting_started.md
  - training.md
  - inference.md

# Exclude from processing
exclude:
  - Gemfile
  - Gemfile.lock
  - node_modules
  - vendor/bundle/
  - vendor/cache/
  - vendor/gems/
  - vendor/ruby/
  - "*.sh"
  - "*.log"

# Collections and navigation
# collections:
#   docs:
#     output: true
#     permalink: /:collection/:path/

# Defaults
defaults:
  - scope:
      path: ""
    values:
      layout: "default"
docs/getting_started.md ADDED
@@ -0,0 +1,80 @@
---
layout: default
title: Getting Started
permalink: /getting_started/
---

# Getting Started with CLaRa

This guide will help you get started with CLaRa, from installation to running your first training.

## Installation

### Prerequisites

- Python 3.10+
- CUDA-compatible GPU (recommended)
- PyTorch 2.0+
- CUDA 11.8 or 12.x

### Step 1: Create Conda Environment

```bash
env=clara
conda create -n $env python=3.10 -y
conda activate $env
```

### Step 2: Install Dependencies

```bash
pip install -r requirements.txt
```

Key dependencies include:
- `torch>=2.0`
- `transformers>=4.20`
- `deepspeed>=0.18`
- `flash-attn>=2.8.0`
- `accelerate>=1.10.1`
- `peft>=0.17.1`

### Step 3: Set Environment Variables

```bash
export PYTHONPATH=/path/to/clara:$PYTHONPATH
```
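
Before moving on, it can help to confirm that the core dependencies import cleanly and that a GPU is visible. A quick, illustrative sanity check using only the packages listed above:

```python
# Verify the environment before launching any training (illustrative check).
import torch
import transformers

print("PyTorch:", torch.__version__)
print("Transformers:", transformers.__version__)
print("CUDA available:", torch.cuda.is_available())
```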

## Quick Start

### 1. Prepare Your Data

CLaRa uses JSONL format for training data. See the [Training Guide](./training.md) for data format details.

### 2. Train Stage 1: Compression Pretraining

```bash
bash scripts/train_pretraining.sh
```

### 3. Train Stage 2: Instruction Tuning

```bash
bash scripts/train_instruction_tuning.sh
```

### 4. Train Stage 3: End-to-End Training

```bash
bash scripts/train_stage_end_to_end.sh
```

### 5. Run Inference

See the [Inference Guide](./inference.md) for examples of using all three model stages.

## Next Steps

- [Training Guide](./training.md) - Detailed training instructions and data formats
- [Inference Guide](./inference.md) - Inference examples for all model stages
docs/index.md ADDED
@@ -0,0 +1,53 @@
---
layout: default
title: CLaRa Documentation
---

# CLaRa Documentation

Welcome to the CLaRa documentation! This site provides comprehensive guides and references for using CLaRa.

## What is CLaRa?

**CLaRa** (Continuous Latent Reasoning) is a unified framework for retrieval-augmented generation that performs embedding-based compression and joint optimization in a shared continuous space.

[![Paper](https://img.shields.io/badge/Paper-Arxiv%20Link-green)](https://arxiv.org/abs/2511.18659) [![License](https://img.shields.io/badge/License-Apple-blue)](../LICENSE) [![deploy](https://img.shields.io/badge/Hugging%20Face-CLaRa_Base-FFEB3B)](https://huggingface.co/apple/CLaRa-7B-Base) [![deploy](https://img.shields.io/badge/Hugging%20Face-CLaRa_Instruct-FFEB3B)](https://huggingface.co/apple/CLaRa-7B-Instruct) [![deploy](https://img.shields.io/badge/Hugging%20Face-CLaRa_End_to_end-FFEB3B)](https://huggingface.co/apple/CLaRa-7B-E2E)

## Documentation

- **[Getting Started](./getting_started.md)** - Installation and quick start guide
- **[Training Guide](./training.md)** - Detailed instructions for all three training stages, including data formats
- **[Inference Guide](./inference.md)** - How to use CLaRa models for inference

## Quick Links

- **GitHub Repository**: [github.com/apple/ml-CLaRa](https://github.com/apple/ml-CLaRa)
- **Main README**: [../README.md](../README.md)
- **Model Checkpoints**: [CLaRa-7B-Base](https://huggingface.co/apple/CLaRa-7B-Base), [CLaRa-7B-Instruct](https://huggingface.co/apple/CLaRa-7B-Instruct), [CLaRa-7B-E2E](https://huggingface.co/apple/CLaRa-7B-E2E)

## Overview

CLaRa uses a three-stage training approach:

1. **Stage 1: Compression Pretraining** - Learn effective document compression
2. **Stage 2: Compression Instruction Tuning** - Adapt for downstream QA tasks
3. **Stage 3: End-to-End Fine-tuning (CLaRa)** - Joint retrieval and generation optimization

For more details, see the [Training Guide](./training.md).

## Citation

If you use CLaRa in your research, please cite:

```bibtex
@misc{he2025clarabridgingretrievalgeneration,
      title={CLaRa: Bridging Retrieval and Generation with Continuous Latent Reasoning},
      author={Jie He and Richard He Bai and Sinead Williamson and Jeff Z. Pan and Navdeep Jaitly and Yizhe Zhang},
      year={2025},
      eprint={2511.18659},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2511.18659},
}
```
docs/inference.md ADDED
@@ -0,0 +1,134 @@
---
layout: default
title: Inference Guide
permalink: /inference/
---

# Inference Guide

This guide shows how to use CLaRa models for inference at different stages.

## Loading Models

CLaRa models can be loaded using the standard `AutoModel` interface:

```python
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "path/to/model",
    trust_remote_code=True
).to('cuda')
```

## Stage 1: Compression Pretraining Model

Generate paraphrases from compressed document representations.

```python
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "path/to/stage1/model",
    trust_remote_code=True
).to('cuda')

# Example documents
documents = [
    [
        "Document 1 content...",
        "Document 2 content...",
        "Document 3 content..."
    ]
]

questions = ["" for _ in range(len(documents))]

# Generate paraphrase from compressed representations
output = model.generate_from_paraphrase(
    questions=questions,
    documents=documents,
    max_new_tokens=64
)

print('Generated paraphrase:', output[0])
```

## Stage 2: Compression Instruction Tuning Model

Generate answers from compressed representations for QA tasks.

```python
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "path/to/stage2/model",
    trust_remote_code=True
).to('cuda')

# Example documents and question
documents = [
    [
        "Document 1 content...",
        "Document 2 content...",
        "Document 3 content..."
    ]
]

questions = ["Your question here"]

# Generate answer from compressed representations
output = model.generate_from_text(
    questions=questions,
    documents=documents,
    max_new_tokens=64
)

print('Generated answer:', output[0])
```

## Stage 3: End-to-End (CLaRa) Model

Generate answers with retrieval and reranking using joint optimization.

```python
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "path/to/stage3/model",
    trust_remote_code=True
).to('cuda')

# Example documents and question
# Note: Stage 3 supports retrieval with multiple candidate documents
documents = [
    ["Document 1 content..." for _ in range(20)]  # 20 candidate documents
]

questions = ["Your question here"]

# Generate answer with retrieval and reranking
# The top-k is decided by generation_top_k in config.json
output, topk_indices = model.generate_from_questions(
    questions=questions,
    documents=documents,
    max_new_tokens=64
)

print('Generated answer:', output[0])
print('Top-k selected document indices:', topk_indices)
```
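
The returned `topk_indices` can be mapped back onto the candidate list to inspect which passages the reranker selected. A small sketch, assuming `topk_indices` holds one list of integer indices per question:

```python
# Recover the selected passages for the first question (illustrative).
selected = [documents[0][i] for i in topk_indices[0]]
for rank, doc in enumerate(selected, start=1):
    print(f"Rank {rank}: {doc[:80]}")
```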

## Key Parameters

- `max_new_tokens`: Maximum number of tokens to generate (default: 128)
- `generation_top_k`: Number of top documents to select (configured in model config)

## Model Methods

- `generate_from_paraphrase()` - Stage 1: Generate paraphrases
- `generate_from_text()` - Stage 2: Generate answers from compressed docs
- `generate_from_questions()` - Stage 3: Generate with retrieval and reranking
docs/training.md ADDED
@@ -0,0 +1,129 @@
---
layout: default
title: Training Guide
permalink: /training/
---

# Training Guide

This guide covers the three-stage training process in CLaRa.

## Overview

CLaRa uses a three-stage training approach:

1. **Stage 1**: Compression Pretraining
2. **Stage 2**: Compression Instruction Tuning
3. **Stage 3**: End-to-End Fine-tuning (CLaRa)

## Stage 1: Compression Pretraining

Train the compressor to learn effective document compression.

### Key Parameters

- `--stage stage1`: Training stage identifier
- `--compress_rate`: Compression rate (default: 32)
- `--doc_max_length`: Maximum document length (default: 256)
- `--mse_loss`: Use MSE loss for compression alignment
- `--qa_loss`: Use QA loss for semantic preservation

### Example Command

```bash
bash scripts/train_pretraining.sh
```

### Data Format

**Stage 1 Pretraining Data:**
```json
{
    "data_type": "qa",
    "question": ["Question 1", "Question 2", ...],
    "answers": ["Answer 1", "Answer 2", ...],
    "docs": ["Document 1", "Document 2", ...]
}
```

## Stage 2: Compression Instruction Tuning

Fine-tune the compressor on instruction-following tasks.

### Key Parameters

- `--stage stage1_2`: Training stage identifier
- `--pretrain_checkpoint`: Path to the Stage 1 checkpoint
- `--generation_top_k`: Number of top documents to select (default: 5)
- `--mse_loss`: Continue using MSE loss
- `--do_eval_gen`: Enable generation evaluation

### Example Command

```bash
bash scripts/train_instruction_tuning.sh
```

### Data Format

**Stage 2 Instruction Tuning Data:**
```json
{
    "question": "Single question text",
    "docs": ["Document 1", "Document 2", ...],
    "gold_answer": "Reference answer",
    "answer": "Generated answer"
}
```

## Stage 3: End-to-End Training

Jointly train the reranker and generator with retrieval.

### Key Parameters

- `--stage stage2`: Training stage identifier
- `--pretrain_checkpoint`: Path to the Stage 2 checkpoint
- `--generation_top_k`: Number of top documents used for generation
- `--do_eval_gen`: Enable generation evaluation

### Example Command

```bash
bash scripts/train_stage_end_to_end.sh
```

### Data Format

**Stage 3 End-to-End Data:**
```json
{
    "question": "Single question text",
    "docs": ["Document 1", "Document 2", ...],
    "gold_answer": "Reference answer"
}
```

## Distributed Training

All training stages support distributed training across multiple nodes and GPUs.

### Key Parameters

- `--max_len`: Maximum sequence length (2048 for stage 1/2, 1024 for stage 3)
- `--train_batch_size`: Training batch size
- `--micro_train_batch_size`: Micro batch size for gradient accumulation
- `--learning_rate`: Learning rate (1e-4 for stage 1/2, 5e-6 for stage 3)
- `--max_epochs`: Maximum training epochs
- `--zero_stage`: ZeRO optimization stage (default: 2)
- `--bf16`: Use bfloat16 precision
- `--flash_attn`: Use Flash Attention 2

## Monitoring Training

Training progress is logged via:
- Console output
- Wandb (if configured)
- Checkpoint files

Checkpoints are saved at the path specified by `--save_path`.
evaluation/evaluate.py ADDED
@@ -0,0 +1,936 @@
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2025 Apple Inc. All Rights Reserved.
#

import os
import json
import argparse
import gc
from datetime import timedelta
from collections import defaultdict, Counter
from typing import List, Dict, Any, Optional, Tuple

import torch
import numpy as np
from accelerate import Accelerator, InitProcessGroupKwargs
from transformers import AutoModel
from datasets import load_dataset
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

try:
    import spacy
    SPACY_AVAILABLE = True
except Exception as e:
    SPACY_AVAILABLE = False
    print(f"Warning: spacy not available ({e}). Entity extraction will be disabled.")

try:
    import evaluate as eval_lib
    EVAL_LIB_AVAILABLE = True
except Exception as e:
    EVAL_LIB_AVAILABLE = False
    eval_lib = None
    print(f"Warning: evaluate library not available ({e}). BERTScore and ROUGE metrics will be disabled.")

import re
import string

from openrlhf.models.modeling_clara import CLaRa

# Environment setup
os.environ["NCCL_TIMEOUT"] = "5400"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Global constants
TARGET_ENTITY_CATEGORIES = {"PERSON", "GPE", "DATE", "CARDINAL", "ORG"}


class EvaluationMetrics:
    """Handles all evaluation metrics and scoring functions."""

    def __init__(self):
        if EVAL_LIB_AVAILABLE:
            self.bertscore = eval_lib.load("bertscore")
            self.rouge = eval_lib.load("rouge")
        else:
            self.bertscore = None
            self.rouge = None
        if SPACY_AVAILABLE:
            self.nlp = spacy.load("en_core_web_sm")
        else:
            self.nlp = None

    @staticmethod
    def normalize_answer(text: str) -> str:
        """Normalize text for comparison."""
        def remove_articles(text):
            return re.sub(r"\b(a|an|the)\b", " ", text)

        def white_space_fix(text):
            return " ".join(text.split())

        def remove_punc(text):
            exclude = set(string.punctuation)
            return "".join(ch for ch in text if ch not in exclude)

        return white_space_fix(remove_articles(remove_punc(text.lower())))

    @staticmethod
    def bool_mapping(text: str) -> str:
        """Map boolean values to yes/no."""
        mapping = {"True": "yes", "False": "no"}
        return mapping.get(text, text)

    def exact_match_score(self, prediction: str, ground_truth: str) -> bool:
        """Calculate exact match score."""
        pred_norm = self.normalize_answer(self.bool_mapping(prediction))
        gt_norm = self.normalize_answer(self.bool_mapping(ground_truth))
        return pred_norm == gt_norm

    def cover_exact_match_score(self, prediction: str, ground_truth: str) -> bool:
        """Calculate coverage exact match score."""
        pred_tokens = self.normalize_answer(self.bool_mapping(prediction)).split()
        gt_tokens = self.normalize_answer(self.bool_mapping(ground_truth)).split()
        return all(token in pred_tokens for token in gt_tokens)

    def f1_score(self, prediction: str, ground_truth: str) -> float:
        """Calculate F1 score."""
        pred_norm = self.normalize_answer(self.bool_mapping(prediction))
        gt_norm = self.normalize_answer(self.bool_mapping(ground_truth))

        # Handle yes/no/noanswer cases
        if pred_norm in ["yes", "no", "noanswer"] and pred_norm != gt_norm:
            return 0.0
        if gt_norm in ["yes", "no", "noanswer"] and pred_norm != gt_norm:
            return 0.0

        pred_tokens = pred_norm.split()
        gt_tokens = gt_norm.split()

        common = Counter(pred_tokens) & Counter(gt_tokens)
        num_same = sum(common.values())

        if num_same == 0:
            return 0.0

        precision = num_same / len(pred_tokens)
        recall = num_same / len(gt_tokens)

        return (2 * precision * recall) / (precision + recall)
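
    # Worked example (illustrative): prediction "Barack Obama" vs. ground
    # truth "Obama" shares one normalized token, so precision = 1/2,
    # recall = 1/1, and F1 = 2 * 0.5 * 1.0 / (0.5 + 1.0) = 2/3.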

    def extract_entities(self, text: str) -> set:
        """Extract entities from text."""
        if self.nlp is None:
            return set()  # Return empty set if spacy unavailable
        doc = self.nlp(text)
        return set(ent.text.lower().strip() for ent in doc.ents)

    def extract_entities_by_category(self, text: str) -> Dict[str, set]:
        """Extract entities by category."""
        if self.nlp is None:
            return defaultdict(set)  # Return empty dict if spacy unavailable
        doc = self.nlp(text)
        entities_by_category = defaultdict(set)

        for ent in doc.ents:
            if ent.label_ in TARGET_ENTITY_CATEGORIES:
                entities_by_category[ent.label_].add(ent.text.lower().strip())

        return entities_by_category

    def entity_preserve_metric(self, prediction: str, reference: str) -> float:
        """Calculate entity preservation rate."""
        ref_entities = self.extract_entities(reference)
        pred_entities = self.extract_entities(prediction)

        if not ref_entities:
            return 1.0

        preserved = ref_entities.intersection(pred_entities)
        return len(preserved) / len(ref_entities)

    def entity_preserve_metric_by_category(self, prediction_tokens: List[List[str]],
                                           reference_docs: List[str]) -> Dict[str, float]:
        """Calculate entity preservation by category."""
        # Merge prediction tokens
        all_prediction_tokens = []
        for tokens in prediction_tokens:
            all_prediction_tokens.extend(tokens)
        prediction_text = " ".join(all_prediction_tokens)

        # Merge reference documents
        reference_text = " ".join(reference_docs)

        # Extract entities
        pred_entities = self.extract_entities_by_category(prediction_text)
        ref_entities = self.extract_entities_by_category(reference_text)

        # Calculate preservation rates
        preservation_rates = {}

        for category in TARGET_ENTITY_CATEGORIES:
            ref_ents = ref_entities.get(category, set())
            pred_ents = pred_entities.get(category, set())

            if not ref_ents:
                preservation_rates[category] = 1.0
            else:
                preserved = ref_ents.intersection(pred_ents)
                preservation_rates[category] = len(preserved) / len(ref_ents)

        # Calculate overall preservation
        all_ref_entities = set()
        all_pred_entities = set()

        for entities_set in ref_entities.values():
            all_ref_entities.update(entities_set)
        for entities_set in pred_entities.values():
            all_pred_entities.update(entities_set)

        if not all_ref_entities:
            preservation_rates["overall"] = 1.0
        else:
            preserved_overall = all_ref_entities.intersection(all_pred_entities)
            preservation_rates["overall"] = len(preserved_overall) / len(all_ref_entities)

        return preservation_rates


class ResultCalculator:
    """Handles result calculation and visualization."""

    def __init__(self):
        self.metrics = EvaluationMetrics()

    def calculate_basic_metrics(self, result_list: List[Dict]) -> Dict[str, float]:
        """Calculate basic metrics (F1, accuracy, exact match)."""
        f1_total = 0
        acc_total = 0
        em_total = 0
        avg_output_length = 0

        answer_key = "golden_answers" if "golden_answers" in result_list[0] else "answer"

        for result in result_list:
            prediction = result['CLaRa_normal_output']
            ground_truth = result[answer_key][0] if answer_key == "golden_answers" else result[answer_key]

            acc_total += self.metrics.cover_exact_match_score(prediction, ground_truth)
            f1_total += self.metrics.f1_score(prediction, ground_truth)
            em_total += self.metrics.exact_match_score(prediction, ground_truth)
            avg_output_length += len(prediction.split())

        n = len(result_list)
        return {
            "f1": f1_total / n,
            "acc": acc_total / n,
            "em": em_total / n,
            "avg_output_length": avg_output_length / n
        }

    def calculate_stage2_metrics(self, result_list: List[Dict], k_values: List[int] = [1, 3, 5]) -> Dict[str, float]:
        """Calculate stage2 metrics with recall and precision."""
        basic_metrics = self.calculate_basic_metrics(result_list)

        recall = {k: 0 for k in k_values}
        precision = {k: 0 for k in k_values}

        for result in result_list:
            scores = result['topk_idx']
            pos_index = set(result['pos_index'])

            for k in k_values:
                top_k = set(scores[:k])
                hit = len(top_k & pos_index)

                recall[k] += hit / len(pos_index) if len(pos_index) > 0 else 0
                precision[k] += hit / k

        n = len(result_list)
        recall_metrics = {f"recall@{k}": v / n for k, v in recall.items()}
        precision_metrics = {f"precision@{k}": v / n for k, v in precision.items()}

        return {**basic_metrics, **recall_metrics, **precision_metrics}
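
    # Example (illustrative): for one sample with topk_idx = [2, 0, 5] and
    # pos_index = {0, 1}, recall@3 = |{0}| / |{0, 1}| = 0.5 and
    # precision@3 = 1/3; the returned metrics average these over all samples.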
256
+ def calculate_paraphrase_metrics(self, result_list: List[Dict]) -> Dict[str, float]:
257
+ """Calculate paraphrase metrics."""
258
+ seen_metrics = {'bert-score': 0, 'rouge-1': 0, 'rouge-L': 0, 'entity_preserve': 0}
259
+ unseen_metrics = {'bert-score': 0, 'rouge-1': 0, 'rouge-L': 0, 'entity_preserve': 0}
260
+
261
+ # Process seen data (first 2000)
262
+ for result in result_list[:2000]:
263
+ prediction = result['CLaRa_normal_output']
264
+ ground_truth = result['doc']
265
+
266
+ if EVAL_LIB_AVAILABLE and self.metrics.bertscore is not None:
267
+ bs = self.metrics.bertscore.compute(predictions=[prediction], references=[ground_truth], lang="en")
268
+ seen_metrics['bert-score'] += bs['f1'][0]
269
+
270
+ if EVAL_LIB_AVAILABLE and self.metrics.rouge is not None:
271
+ rouge_scores = self.metrics.rouge.compute(predictions=[prediction], references=[ground_truth])
272
+ seen_metrics['rouge-1'] += rouge_scores['rouge1']
273
+ seen_metrics['rouge-L'] += rouge_scores['rougeL']
274
+
275
+ seen_metrics['entity_preserve'] += self.metrics.entity_preserve_metric(prediction, ground_truth)
276
+
277
+ # Process unseen data (after 2000)
278
+ for result in result_list[2000:]:
279
+ prediction = result['CLaRa_normal_output']
280
+ ground_truth = result['doc']
281
+
282
+ if EVAL_LIB_AVAILABLE and self.metrics.bertscore is not None:
283
+ bs = self.metrics.bertscore.compute(predictions=[prediction], references=[ground_truth], lang="en")
284
+ unseen_metrics['bert-score'] += bs['f1'][0]
285
+
286
+ if EVAL_LIB_AVAILABLE and self.metrics.rouge is not None:
287
+ rouge_scores = self.metrics.rouge.compute(predictions=[prediction], references=[ground_truth])
288
+ unseen_metrics['rouge-1'] += rouge_scores['rouge1']
289
+ unseen_metrics['rouge-L'] += rouge_scores['rougeL']
290
+
291
+ unseen_metrics['entity_preserve'] += self.metrics.entity_preserve_metric(prediction, ground_truth)
292
+
293
+ # Normalize
294
+ n_seen = min(len(result_list), 2000)
295
+ n_unseen = max(len(result_list) - 2000, 0)
296
+
297
+ final_metrics = {}
298
+ if n_seen > 0:
299
+ for key, value in seen_metrics.items():
300
+ final_metrics[f'seen_{key}'] = float(value / n_seen)
301
+
302
+ if n_unseen > 0:
303
+ for key, value in unseen_metrics.items():
304
+ final_metrics[f'unseen_{key}'] = float(value / n_unseen)
305
+
306
+ return final_metrics
307
+
308
+ def visualize_mse(self, result_list: List[Dict], save_path: str) -> Dict[str, Any]:
309
+ """Create t-SNE visualization for MSE analysis."""
310
+ # Set scientific style
311
+ plt.rcParams.update({
312
+ 'font.family': 'serif',
313
+ 'font.size': 12,
314
+ 'axes.labelsize': 14,
315
+ 'axes.titlesize': 16,
316
+ 'figure.titlesize': 18,
317
+ 'axes.linewidth': 1.2,
318
+ 'grid.alpha': 0.3,
319
+ })
320
+
321
+ # Collect representations
322
+ mem_reps = []
323
+ non_mem_reps = []
324
+
325
+ for result in result_list:
326
+ mem_rep = result['CLaRa_compressed_output']
327
+ non_mem_rep = result['CLaRa_normal_output']
328
+
329
+ if isinstance(mem_rep, torch.Tensor):
330
+ mem_rep = mem_rep.float().cpu().numpy()
331
+ if isinstance(non_mem_rep, torch.Tensor):
332
+ non_mem_rep = non_mem_rep.float().cpu().numpy()
333
+
334
+ mem_reps.append(mem_rep)
335
+ non_mem_reps.append(non_mem_rep)
336
+
337
+ mem_reps = np.array(mem_reps)
338
+ non_mem_reps = np.array(non_mem_reps)
339
+
340
+ print(f"Memory representations shape: {mem_reps.shape}")
341
+ print(f"Document representations shape: {non_mem_reps.shape}")
342
+
343
+ # Combine data for t-SNE
344
+ all_data = np.vstack([mem_reps, non_mem_reps])
345
+ original_dim = all_data.shape[1]
346
+
347
+ # PCA preprocessing if needed
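+ # Reducing to 50 dimensions with PCA first is standard practice to keep
+ # t-SNE fast and stable on high-dimensional embeddings.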
348
+ if all_data.shape[1] > 50:
349
+ print(f"Applying PCA preprocessing from {all_data.shape[1]} to 50 dimensions...")
350
+ pca = PCA(n_components=50)
351
+ all_data = pca.fit_transform(all_data)
352
+ print(f"PCA explained variance ratio: {pca.explained_variance_ratio_[:5].sum():.3f}")
353
+
354
+ # Apply t-SNE
355
+ print("Applying t-SNE...")
356
+ perplexity = min(30, max(5, len(all_data) // 3))
357
+ tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity,
358
+ max_iter=1000, learning_rate=200, verbose=1)
359
+ tsne_results = tsne.fit_transform(all_data)
360
+
361
+ # Separate results
362
+ mem_tsne = tsne_results[:len(mem_reps)]
363
+ doc_tsne = tsne_results[len(mem_reps):]
364
+
365
+ # Create visualization
366
+ fig, ax = plt.subplots(1, 1, figsize=(10, 8))
367
+
368
+ # Add jitter to separate overlapping points
369
+ np.random.seed(42)
370
+ jitter_strength = 1.0
371
+
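+ # The jitter (and the small +/-0.5 offsets below) is purely cosmetic, to
+ # separate overlapping scatter points; the distance statistics further down
+ # use the original, un-jittered representations.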
372
+ mem_jitter = mem_tsne.copy()
373
+ doc_jitter = doc_tsne.copy()
374
+
375
+ mem_jitter[:, 0] += np.random.normal(0.5, jitter_strength, len(mem_tsne))
376
+ mem_jitter[:, 1] += np.random.normal(0.5, jitter_strength, len(mem_tsne))
377
+
378
+ doc_jitter[:, 0] += np.random.normal(-0.5, jitter_strength, len(doc_tsne))
379
+ doc_jitter[:, 1] += np.random.normal(-0.5, jitter_strength, len(doc_tsne))
380
+
381
+ # Plot scatter points
382
+ ax.scatter(doc_jitter[:, 0], doc_jitter[:, 1], c='#0066CC', alpha=0.7, s=25,
383
+ marker='o', edgecolors='white', linewidth=0.5,
384
+ label='Document Representations', zorder=2)
385
+
386
+ ax.scatter(mem_jitter[:, 0], mem_jitter[:, 1], c='#FF3333', alpha=0.7, s=25,
387
+ marker='o', edgecolors='white', linewidth=0.5,
388
+ label='Memory Tokens Representations', zorder=3)
389
+
390
+ # Configure plot
391
+ ax.set_xlabel('')
392
+ ax.set_ylabel('')
393
+ ax.set_title('')
394
+
395
+ legend = ax.legend(frameon=True, fancybox=True, shadow=True,
396
+ loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=2, fontsize=14)
397
+ legend.get_frame().set_facecolor('white')
398
+ legend.get_frame().set_alpha(0.9)
399
+
400
+ ax.grid(True, alpha=0.3, linestyle='-', linewidth=0.5)
401
+ ax.set_axisbelow(True)
402
+
403
+ plt.tight_layout()
404
+
405
+ # Save visualization
406
+ os.makedirs(save_path, exist_ok=True)
407
+ plt.savefig(os.path.join(save_path, 'tsne_visualization_scientific.png'),
408
+ dpi=300, bbox_inches='tight', facecolor='white')
409
+ plt.show()
410
+
411
+ # Calculate statistics
412
+ distances = np.array([
413
+ np.linalg.norm(mem_reps[i] - non_mem_reps[i])
414
+ for i in range(len(mem_reps))
415
+ ])
416
+
417
+ statistics = {
418
+ 'mean_distance': float(np.mean(distances)),
419
+ 'std_distance': float(np.std(distances)),
420
+ 'median_distance': float(np.median(distances)),
421
+ 'min_distance': float(np.min(distances)),
422
+ 'max_distance': float(np.max(distances))
423
+ }
424
+
425
+ print("\n" + "="*60)
426
+ print("VISUALIZATION ANALYSIS REPORT")
427
+ print("="*60)
428
+ print(f"Dataset Statistics:")
429
+ print(f" • Total samples: {len(mem_reps)}")
430
+ print(f" • Original dimension: {original_dim}")
431
+ print(f" • t-SNE perplexity: {perplexity}")
432
+ print(f"\nDistance Analysis:")
433
+ for key, value in statistics.items():
434
+ print(f" • {key.replace('_', ' ').title()}: {value:.4f}")
435
+ print("="*60)
436
+
437
+ return {
438
+ 'mem_tsne': mem_tsne,
439
+ 'doc_tsne': doc_tsne,
440
+ 'original_distances': distances,
441
+ 'statistics': statistics
442
+ }
443
+
444
+
445
+ class DataLoader:
446
+ """Handles data loading for different datasets and stages."""
447
+
448
+ @staticmethod
449
+ def load_stage1_data(dataset: str, gold_retrieval: bool) -> List[Dict]:
450
+ """Load stage1 evaluation data."""
451
+ retrieval_type = "with_pos" if gold_retrieval else "no_pos"
452
+ file_path = f"/mnt/conductor_data/data/compression_rag_data/generator_training_val_data/stage1_eval/{dataset}/eval_processed_{retrieval_type}.jsonl"
453
+
454
+ data = []
455
+ with open(file_path, 'r') as f:
456
+ for line in f:
457
+ data.append(json.loads(line))
458
+
459
+ processed_data = []
460
+ for index, item in enumerate(data):
461
+ docs = item['docs'][:5] # Take top 5 documents
462
+ processed_item = {
463
+ 'original_data': item,
464
+ 'documents': docs,
465
+ 'question': item['question'],
466
+ 'global_index': index
467
+ }
468
+ processed_data.append(processed_item)
469
+
470
+ return processed_data
471
+
472
+ @staticmethod
473
+ def load_stage2_data(dataset: str, gold_retrieval: bool) -> List[Dict]:
474
+ """Load stage2 evaluation data."""
475
+ retrieval_type = "with_pos" if gold_retrieval else "no_pos"
476
+ file_path = f"/mnt/conductor_data/data/compression_rag_data/generator_training_val_data/stage2_eval/{dataset}/eval_processed_{retrieval_type}.jsonl"
477
+
478
+ processed_data = []
479
+ with open(file_path, 'r') as f:
480
+ for index, line in enumerate(f):
481
+ item = json.loads(line)
482
+ processed_item = {
483
+ 'original_data': item,
484
+ 'documents': item['docs'],
485
+ 'question': item['question'],
486
+ 'global_index': index,
487
+ 'pos_index': item['pos_index']
488
+ }
489
+ processed_data.append(processed_item)
490
+
491
+ return processed_data
492
+
493
+ @staticmethod
494
+ def load_paraphrase_data(file_path: str) -> List[Dict]:
495
+ """Load paraphrase data."""
496
+ data = []
497
+ with open(file_path, 'r') as f:
498
+ for line in f:
499
+ data.append(json.loads(line))
500
+
501
+ processed_data = []
502
+ for index, item in enumerate(data):
503
+ processed_item = {
504
+ 'original_data': item,
505
+ 'documents': [item['doc']],
506
+ 'question': "",
507
+ 'global_index': index
508
+ }
509
+ processed_data.append(processed_item)
510
+
511
+ return processed_data
512
+
513
+
514
+ class AcceleratedCLaRaInference:
515
+ """Main inference engine using Accelerate for distributed processing."""
516
+
517
+ def __init__(self, model_path: str, training_stage: str = None,
518
+ generation_top_k: int = None, args = None):
519
+ self.args = args
520
+
521
+ # Initialize Accelerator
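+ # The 5400-second (90 min) process-group timeout matches the NCCL_TIMEOUT
+ # set at module import, presumably so that long evaluation shards do not
+ # trip the collective-communication watchdog.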
522
+ process_group_kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=5400))
523
+ self.accelerator = Accelerator(kwargs_handlers=[process_group_kwargs])
524
+
525
+ if self.accelerator.is_main_process:
526
+ print(f"Using {self.accelerator.num_processes} GPUs for distributed inference")
527
+ print(f"Current process: {self.accelerator.process_index}")
528
+ print("Loading CLaRa model...")
529
+
530
+ # Load model
531
+ self.model = CLaRa.from_pretrained(
532
+ model_path,
533
+ training_stage=training_stage,
534
+ generation_top_k=generation_top_k,
535
+ pure_inference=True
536
+ )
537
+
538
+ # Prepare model with Accelerator
539
+ self.model = self.accelerator.prepare(self.model)
540
+ self.model.eval()
541
+
542
+ if self.accelerator.is_main_process:
543
+ print("Model preparation completed")
544
+
545
+ def _get_model(self):
546
+ """Get the actual model (handles distributed vs single GPU)."""
547
+ return self.model.module if hasattr(self.model, 'module') else self.model
548
+
549
+ def process_batch(self, batch_questions: List[str], batch_documents: List[List[str]] = None,
550
+ stage2_mips: bool = False, training_stage: str = None,
551
+ batch_answers: List[str] = None, time_count: bool = False) -> Tuple:
552
+ """Process a batch of questions and documents."""
553
+ model = self._get_model()
554
+
555
+ with torch.no_grad():
556
+ try:
557
+ if training_stage == 'stage2':
558
+ return self._process_stage2(model, batch_questions, batch_documents,
559
+ stage2_mips, time_count)
560
+ elif training_stage in ['stage1', 'stage1_2']:
561
+ return self._process_stage1(model, batch_questions, batch_documents)
562
+ elif training_stage == 'stage2_reasoning':
563
+ return self._process_reasoning(model, batch_questions, batch_answers)
564
+ elif training_stage == 'stage1_paraphrase':
565
+ return self._process_paraphrase(model, batch_questions, batch_documents)
566
+ elif training_stage == 'stage1_mse_visulize':
567
+ return self._process_mse_visualize(model, batch_documents)
568
+ else:
569
+ raise ValueError(f"Unknown training stage: {training_stage}")
570
+
571
+ except torch.cuda.OutOfMemoryError as e:
572
+ self.accelerator.print(f"CUDA OOM error: {e}")
573
+ torch.cuda.empty_cache()
574
+ gc.collect()
575
+ return self._create_empty_results(batch_questions, training_stage)
576
+
577
+ def _process_stage2(self, model, batch_questions, batch_documents, stage2_mips, time_count):
578
+ """Process stage2 inference."""
579
+ if time_count:
580
+ if stage2_mips:
581
+ results = model.generate_from_questions(
582
+ questions=batch_questions,
583
+ max_new_tokens=64,
584
+ stage2_mips=stage2_mips,
585
+ time_count=True
586
+ )
587
+ else:
588
+ results = model.generate_from_questions(
589
+ questions=batch_questions,
590
+ max_new_tokens=64,
591
+ stage2_mips=stage2_mips,
592
+ documents=batch_documents,
593
+ time_count=True
594
+ )
595
+ return results
596
+ else:
597
+ if stage2_mips:
598
+ batch_out_normal, topk_idx = model.generate_from_questions(
599
+ questions=batch_questions,
600
+ max_new_tokens=64,
601
+ stage2_mips=stage2_mips
602
+ )
603
+ else:
604
+ batch_out_normal, topk_idx = model.generate_from_questions(
605
+ questions=batch_questions,
606
+ max_new_tokens=64,
607
+ stage2_mips=stage2_mips,
608
+ documents=batch_documents
609
+ )
610
+ return batch_out_normal, batch_out_normal, topk_idx
611
+
612
+ def _process_stage1(self, model, batch_questions, batch_documents):
613
+ """Process stage1 inference."""
614
+ batch_out_compressed = []
615
+
616
+ for docs, question in zip(batch_documents, batch_questions):
617
+ embeddings, _ = model.compress_documents(documents=docs)
618
+ out_compressed = model.generate_from_compressed_documents_and_questions(
619
+ questions=[question],
620
+ compressed_documents=embeddings
621
+ )
622
+ batch_out_compressed.extend(out_compressed)
623
+
624
+ del embeddings
625
+ torch.cuda.empty_cache()
626
+
627
+ return batch_out_compressed, batch_out_compressed, None
628
+
629
+ def _process_reasoning(self, model, batch_questions, batch_answers):
630
+ """Process reasoning inference."""
631
+ batch_out_normal = []
632
+ batch_out_reasoning_list = []
633
+
634
+ for question, answer in zip(batch_questions, batch_answers):
635
+ temp_out, temp_out_reasoning = model.generate_from_reasoning(
636
+ questions=[question],
637
+ max_new_tokens=1024,
638
+ answers=[answer],
639
+ save_dir=self.args.model_path
640
+ )
641
+ batch_out_normal.append(temp_out[0])
642
+ batch_out_reasoning_list.extend(temp_out_reasoning)
643
+
644
+ return batch_out_normal, batch_out_normal, None, batch_out_reasoning_list
645
+
646
+ def _process_paraphrase(self, model, batch_questions, batch_documents):
647
+ """Process paraphrase inference."""
648
+ batch_out_compressed = []
649
+
650
+ for docs, question in zip(batch_documents, batch_questions):
651
+ out_compressed = model.generate_from_paraphrase(
652
+ questions=["" for _ in range(len(docs))],
653
+ documents=[docs]
654
+ )
655
+ batch_out_compressed.extend(out_compressed)
656
+ torch.cuda.empty_cache()
657
+
658
+ return batch_out_compressed, batch_out_compressed, None
659
+
660
+ def _process_mse_visualize(self, model, batch_documents):
661
+ """Process MSE visualization."""
662
+ batch_out_normal = []
663
+ batch_out_compressed = []
664
+
665
+ for docs in batch_documents:
666
+ mem_rep, non_mem_rep = model.compress_documents_mse_visulize(documents=docs)
667
+ batch_out_compressed.append(mem_rep[0])
668
+ batch_out_normal.append(non_mem_rep[0])
669
+
670
+ return batch_out_normal, batch_out_compressed
671
+
672
+ def _create_empty_results(self, batch_questions, training_stage):
673
+ """Create empty results for error cases."""
674
+ empty_results = [""] * len(batch_questions)
675
+ if training_stage == 'stage2_reasoning':
676
+ return empty_results, empty_results, None, empty_results
677
+ elif training_stage == 'stage1_mse_visulize':
678
+ return empty_results, empty_results
679
+ else:
680
+ return empty_results, empty_results, None
681
+
682
+
683
+ def convert_embeddings_to_list(data):
684
+ """Convert tensor embeddings to lists for JSON serialization."""
685
+ if isinstance(data, dict):
686
+ return {k: convert_embeddings_to_list(v) for k, v in data.items()}
687
+ elif isinstance(data, list):
688
+ return [convert_embeddings_to_list(item) for item in data]
689
+ elif isinstance(data, torch.Tensor):
690
+ return data.cpu().to(torch.float32).numpy().tolist()
691
+ elif isinstance(data, np.ndarray):
692
+ return data.tolist()
693
+ else:
694
+ return data
695
+
696
+
697
+ def main():
698
+ parser = argparse.ArgumentParser(description="CLaRa Model Inference")
699
+ parser.add_argument('--model_path', type=str, required=True, help='Path to model checkpoint')
700
+ parser.add_argument('--batch_size', type=int, default=4, help='Batch size per GPU')
701
+ parser.add_argument('--stage', type=str, default='stage1',
702
+ choices=['stage1', 'stage1_2', 'stage2', 'stage2_reasoning',
703
+ 'stage1_paraphrase', 'stage1_mse_visulize'],
704
+ help='Training stage')
705
+ parser.add_argument('--stage2_mips', action='store_true', help='Use MIPS for stage2')
706
+ parser.add_argument('--dataset', type=str, default='musique',
707
+ help='Comma-separated list of datasets')
708
+ parser.add_argument('--gold_retrieval', action='store_true',
709
+ help='Use gold retrieval context')
710
+ parser.add_argument('--generation_top_k', type=int, default=5, help='Top-k for generation')
711
+ parser.add_argument('--paraphrase_path', type=str, help='Path to paraphrase data')
712
+ parser.add_argument('--mse_visulize_path', type=str, help='Path to save MSE visualization')
713
+ parser.add_argument('--efficient_count', action='store_true', help='Count efficiency metrics')
714
+
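+ # Example invocation (paths are hypothetical); the script is typically
+ # launched through Accelerate's runner for multi-GPU use:
+ # accelerate launch evaluation/evaluate.py --model_path /path/to/ckpt \
+ # --stage stage2 --dataset musique,hotpotqa --batch_size 4 --generation_top_k 5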
715
+ args = parser.parse_args()
716
+
717
+ # Process datasets
718
+ all_results_metrics = {}
719
+ datasets_list = args.dataset.split(',')
720
+
721
+ for dataset in datasets_list:
722
+ print(f"Processing dataset: {dataset}")
723
+
724
+ # Load data based on stage
725
+ if args.stage in ['stage1', 'stage1_2']:
726
+ processed_data = DataLoader.load_stage1_data(dataset, args.gold_retrieval)
727
+ elif args.stage == 'stage2':
728
+ processed_data = DataLoader.load_stage2_data(dataset, args.gold_retrieval)
729
+ elif args.stage in ['stage1_paraphrase', 'stage1_mse_visulize']:
730
+ if not args.paraphrase_path:
731
+ raise ValueError(f"--paraphrase_path required for stage {args.stage}")
732
+ processed_data = DataLoader.load_paraphrase_data(args.paraphrase_path)
733
+ else:
734
+ raise ValueError(f"Unsupported stage: {args.stage}")
735
+
736
+ print(f"Loaded {len(processed_data)} samples for {dataset}")
737
+
738
+ # Initialize inference engine
739
+ # Use model_path directly if absolute, otherwise use SageMaker path
740
+ if os.path.isabs(args.model_path):
741
+ model_path = args.model_path
742
+ else:
743
+ model_path = os.path.join('/mnt/task_wrapper/user_output/artifacts/data/train_checkpoint', args.model_path)
744
+ args.model_path = model_path
745
+
746
+ inference_engine = AcceleratedCLaRaInference(
747
+ model_path=model_path,
748
+ training_stage=args.stage,
749
+ generation_top_k=args.generation_top_k,
750
+ args=args
751
+ )
752
+
753
+ # Wait for all processes to be ready
754
+ inference_engine.accelerator.wait_for_everyone()
755
+
756
+ # Store results
757
+ all_results = []
758
+ time_count_dic = {"compress_time": 0, "query_time": 0, "generate_time": 0, "total_time": 0, "count": 0}
759
+
760
+ # Process data in batches using accelerator
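+ # split_between_processes shards the list across ranks inside a context
+ # manager; with apply_padding=False the shards may differ in length by one,
+ # and the per-rank results are recombined by gather_for_metrics afterwards.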
761
+ with inference_engine.accelerator.split_between_processes(processed_data, apply_padding=False) as local_data:
762
+ print(f"Process {inference_engine.accelerator.process_index}: processing {len(local_data)} samples")
763
+
764
+ batch_size = args.batch_size
765
+ num_batches = (len(local_data) + batch_size - 1) // batch_size
766
+
767
+ for batch_idx in tqdm(range(num_batches),
768
+ desc=f"GPU {inference_engine.accelerator.process_index}",
769
+ disable=not inference_engine.accelerator.is_local_main_process):
770
+
771
+ # Get current batch
772
+ start_idx = batch_idx * batch_size
773
+ end_idx = min(start_idx + batch_size, len(local_data))
774
+ batch = local_data[start_idx:end_idx]
775
+
776
+ # Prepare batch data
777
+ batch_questions = [item['question'] for item in batch]
778
+ batch_documents = [item['documents'] for item in batch] if 'documents' in batch[0] else None
779
+ batch_answers = [item.get('answer') for item in batch] if args.stage == 'stage2_reasoning' else None
780
+
781
+ # Process batch
782
+ if args.efficient_count and args.stage == 'stage2':
783
+ results = inference_engine.process_batch(
784
+ batch_questions=batch_questions,
785
+ batch_documents=batch_documents,
786
+ stage2_mips=args.stage2_mips,
787
+ training_stage=args.stage,
788
+ time_count=True
789
+ )
790
+ batch_out_normal, batch_out_compressed, batch_topk_idx, compress_time, query_time, generate_time, total_time = results
791
+
792
+ time_count_dic["compress_time"] += compress_time
793
+ time_count_dic["query_time"] += query_time
794
+ time_count_dic["generate_time"] += generate_time
795
+ time_count_dic["total_time"] += total_time
796
+ time_count_dic["count"] += 1
797
+ else:
798
+ results = inference_engine.process_batch(
799
+ batch_questions=batch_questions,
800
+ batch_documents=batch_documents,
801
+ stage2_mips=args.stage2_mips,
802
+ training_stage=args.stage,
803
+ batch_answers=batch_answers
804
+ )
805
+
806
+ if args.stage == 'stage2_reasoning':
807
+ batch_out_normal, batch_out_compressed, batch_topk_idx, batch_out_reasoning = results
808
+ elif args.stage == 'stage1_mse_visulize':
809
+ batch_out_normal, batch_out_compressed = results
810
+ batch_topk_idx = None
811
+ else:
812
+ batch_out_normal, batch_out_compressed, batch_topk_idx = results
813
+
814
+ # Prepare results
815
+ batch_results = []
816
+ for i, (item, normal_out, compressed_out) in enumerate(zip(batch, batch_out_normal, batch_out_compressed)):
817
+ result_item = item['original_data'].copy()
818
+ result_item['CLaRa_normal_output'] = normal_out
819
+ result_item['CLaRa_compressed_output'] = compressed_out
820
+ result_item['global_index'] = item['global_index']
821
+
822
+ if args.stage == 'stage2' and batch_topk_idx is not None:
823
+ result_item['topk_idx'] = batch_topk_idx[i].tolist()
824
+ elif args.stage == 'stage2_reasoning':
825
+ result_item['reasoning_output'] = batch_out_reasoning[i]
826
+
827
+ batch_results.append(result_item)
828
+
829
+ all_results.extend(batch_results)
830
+
831
+
832
+ # Clean up memory
833
+ torch.cuda.empty_cache()
834
+ if batch_idx % 10 == 0:
835
+ gc.collect()
836
+
837
+ # Save efficiency metrics if requested
838
+ if args.efficient_count and inference_engine.accelerator.is_main_process:
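+ # The counter increments once per batch, so these are average per-batch
+ # timings in milliseconds rather than per-sample timings.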
839
+ eff_dic = {
840
+ "compress_time_ms": round((time_count_dic['compress_time'] / time_count_dic['count']) * 1000, 2),
841
+ "query_time_ms": round((time_count_dic['query_time'] / time_count_dic['count']) * 1000, 2),
842
+ "generate_time_ms": round((time_count_dic['generate_time'] / time_count_dic['count']) * 1000, 2),
843
+ "total_time_ms": round((time_count_dic['total_time'] / time_count_dic['count']) * 1000, 2),
844
+ "sample_count": time_count_dic['count']
845
+ }
846
+ eff_output_path = os.path.join(model_path, f"efficiency_{dataset}_{args.stage}_{args.gold_retrieval}_{args.generation_top_k}.json")
847
+ with open(eff_output_path, 'w') as f:
848
+ json.dump(eff_dic, f, indent=2)
849
+
850
+ # Wait for all processes to complete
851
+ inference_engine.accelerator.wait_for_everyone()
852
+
853
+ # Gather results from all processes
854
+ if inference_engine.accelerator.is_main_process:
855
+ print("Collecting results from all processes...")
856
+
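+ # gather_for_metrics is a collective call, so every rank must reach it;
+ # that is why it sits outside the is_main_process guard above.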
857
+ all_results_gathered = inference_engine.accelerator.gather_for_metrics(all_results)
858
+
859
+ # Process and save results (main process only)
860
+ if inference_engine.accelerator.is_main_process:
861
+ print("Processing and saving results...")
862
+
863
+ # Flatten results
864
+ final_results = []
865
+ if isinstance(all_results_gathered, list):
866
+ for result_batch in all_results_gathered:
867
+ if isinstance(result_batch, list):
868
+ final_results.extend(result_batch)
869
+ else:
870
+ final_results.append(result_batch)
871
+
872
+ print(f"Collected {len(final_results)} results")
873
+
874
+ # Sort by global index to maintain order
875
+ final_results.sort(key=lambda x: x.get('global_index', 0))
876
+
877
+ # Verify data integrity
878
+ processed_indices = set(item.get('global_index', -1) for item in final_results)
879
+ expected_indices = set(range(len(processed_data)))
880
+ missing_indices = expected_indices - processed_indices
881
+
882
+ if missing_indices:
883
+ print(f"Warning: Missing indices: {sorted(list(missing_indices))}")
884
+ else:
885
+ print("✓ Data integrity verification passed")
886
+
887
+ # Remove global index for clean output
888
+ for item in final_results:
889
+ item.pop('global_index', None)
890
+
891
+ # Save results
892
+ output_path = os.path.join(model_path, f"{dataset}_{args.stage}_{args.gold_retrieval}_{args.generation_top_k}.jsonl")
893
+ with open(output_path, 'w') as f:
894
+ if args.stage == 'stage1_mse_visulize':
895
+ converted_results = convert_embeddings_to_list(final_results)
896
+ for item in converted_results:
897
+ f.write(json.dumps(item) + '\n')
898
+ else:
899
+ for item in final_results:
900
+ f.write(json.dumps(item) + '\n')
901
+
902
+ print(f"Results saved to: {output_path}")
903
+
904
+ # Calculate metrics
905
+ calculator = ResultCalculator()
906
+
907
+ if args.stage == 'stage2':
908
+ metrics = calculator.calculate_stage2_metrics(final_results)
909
+ elif args.stage == 'stage1_paraphrase':
910
+ metrics = calculator.calculate_paraphrase_metrics(final_results)
911
+ elif args.stage == 'stage1_mse_visulize':
912
+ if args.mse_visulize_path:
913
+ metrics = calculator.visualize_mse(final_results, args.mse_visulize_path)['statistics']  # keep JSON-serializable stats only
914
+ else:
915
+ metrics = {"visualization": "completed"}
916
+ else:
917
+ metrics = calculator.calculate_basic_metrics(final_results)
918
+
919
+ print(f"Metrics for {dataset}: {metrics}")
920
+ all_results_metrics[dataset] = metrics
921
+
922
+ # Clean up
923
+ del inference_engine
924
+ torch.cuda.empty_cache()
925
+ gc.collect()
926
+
927
+ # Save final metrics
928
+ if len(all_results_metrics) > 0:
929
+ metrics_path = os.path.join(model_path, f"results_metrics_{args.stage}_{args.gold_retrieval}_{args.generation_top_k}.json")
930
+ with open(metrics_path, 'w') as f:
931
+ json.dump(all_results_metrics, f, indent=2)
932
+ print(f"Final metrics saved to: {metrics_path}")
933
+
934
+
935
+ if __name__ == '__main__':
936
+ main()
evaluation/evaluate.py.bak ADDED
@@ -0,0 +1,910 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+
6
+ import os
7
+ import json
8
+ import argparse
9
+ import gc
10
+ from datetime import timedelta
11
+ from collections import defaultdict, Counter
12
+ from typing import List, Dict, Any, Optional, Tuple
13
+
14
+ import torch
15
+ import numpy as np
16
+ from accelerate import Accelerator, InitProcessGroupKwargs
17
+ from transformers import AutoModel
18
+ from datasets import load_dataset
19
+ from tqdm import tqdm
20
+ import matplotlib.pyplot as plt
21
+ from sklearn.manifold import TSNE
22
+ from sklearn.decomposition import PCA
23
+ import spacy
24
+ import evaluate
25
+ import re
26
+ import string
27
+
28
+ from openrlhf.models.modeling_clara import CLaRa
29
+
30
+ # Environment setup
31
+ os.environ["NCCL_TIMEOUT"] = "5400"
32
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
33
+
34
+ # Global constants
35
+ TARGET_ENTITY_CATEGORIES = {"PERSON", "GPE", "DATE", "CARDINAL", "ORG"}
36
+
37
+
38
+ class EvaluationMetrics:
39
+ """Handles all evaluation metrics and scoring functions."""
40
+
41
+ def __init__(self):
42
+ self.bertscore = evaluate.load("bertscore")
43
+ self.rouge = evaluate.load("rouge")
44
+ self.nlp = spacy.load("en_core_web_sm")
45
+
46
+ @staticmethod
47
+ def normalize_answer(text: str) -> str:
48
+ """Normalize text for comparison."""
49
+ def remove_articles(text):
50
+ return re.sub(r"\b(a|an|the)\b", " ", text)
51
+
52
+ def white_space_fix(text):
53
+ return " ".join(text.split())
54
+
55
+ def remove_punc(text):
56
+ exclude = set(string.punctuation)
57
+ return "".join(ch for ch in text if ch not in exclude)
58
+
59
+ return white_space_fix(remove_articles(remove_punc(text.lower())))
60
+
61
+ @staticmethod
62
+ def bool_mapping(text: str) -> str:
63
+ """Map boolean values to yes/no."""
64
+ mapping = {"True": "yes", "False": "no"}
65
+ return mapping.get(text, text)
66
+
67
+ def exact_match_score(self, prediction: str, ground_truth: str) -> bool:
68
+ """Calculate exact match score."""
69
+ pred_norm = self.normalize_answer(self.bool_mapping(prediction))
70
+ gt_norm = self.normalize_answer(self.bool_mapping(ground_truth))
71
+ return pred_norm == gt_norm
72
+
73
+ def cover_exact_match_score(self, prediction: str, ground_truth: str) -> bool:
74
+ """Calculate coverage exact match score."""
75
+ pred_tokens = self.normalize_answer(self.bool_mapping(prediction)).split()
76
+ gt_tokens = self.normalize_answer(self.bool_mapping(ground_truth)).split()
77
+ return all(token in pred_tokens for token in gt_tokens)
78
+
79
+ def f1_score(self, prediction: str, ground_truth: str) -> float:
80
+ """Calculate F1 score."""
81
+ pred_norm = self.normalize_answer(self.bool_mapping(prediction))
82
+ gt_norm = self.normalize_answer(self.bool_mapping(ground_truth))
83
+
84
+ # Handle yes/no/noanswer cases
85
+ if pred_norm in ["yes", "no", "noanswer"] and pred_norm != gt_norm:
86
+ return 0.0
87
+ if gt_norm in ["yes", "no", "noanswer"] and pred_norm != gt_norm:
88
+ return 0.0
89
+
90
+ pred_tokens = pred_norm.split()
91
+ gt_tokens = gt_norm.split()
92
+
93
+ common = Counter(pred_tokens) & Counter(gt_tokens)
94
+ num_same = sum(common.values())
95
+
96
+ if num_same == 0:
97
+ return 0.0
98
+
99
+ precision = num_same / len(pred_tokens)
100
+ recall = num_same / len(gt_tokens)
101
+
102
+ return (2 * precision * recall) / (precision + recall)
103
+
104
+ def extract_entities(self, text: str) -> set:
105
+ """Extract entities from text."""
106
+ doc = self.nlp(text)
107
+ return set(ent.text.lower().strip() for ent in doc.ents)
108
+
109
+ def extract_entities_by_category(self, text: str) -> Dict[str, set]:
110
+ """Extract entities by category."""
111
+ doc = self.nlp(text)
112
+ entities_by_category = defaultdict(set)
113
+
114
+ for ent in doc.ents:
115
+ if ent.label_ in TARGET_ENTITY_CATEGORIES:
116
+ entities_by_category[ent.label_].add(ent.text.lower().strip())
117
+
118
+ return entities_by_category
119
+
120
+ def entity_preserve_metric(self, prediction: str, reference: str) -> float:
121
+ """Calculate entity preservation rate."""
122
+ ref_entities = self.extract_entities(reference)
123
+ pred_entities = self.extract_entities(prediction)
124
+
125
+ if not ref_entities:
126
+ return 1.0
127
+
128
+ preserved = ref_entities.intersection(pred_entities)
129
+ return len(preserved) / len(ref_entities)
130
+
131
+ def entity_preserve_metric_by_category(self, prediction_tokens: List[List[str]],
132
+ reference_docs: List[str]) -> Dict[str, float]:
133
+ """Calculate entity preservation by category."""
134
+ # Merge prediction tokens
135
+ all_prediction_tokens = []
136
+ for tokens in prediction_tokens:
137
+ all_prediction_tokens.extend(tokens)
138
+ prediction_text = " ".join(all_prediction_tokens)
139
+
140
+ # Merge reference documents
141
+ reference_text = " ".join(reference_docs)
142
+
143
+ # Extract entities
144
+ pred_entities = self.extract_entities_by_category(prediction_text)
145
+ ref_entities = self.extract_entities_by_category(reference_text)
146
+
147
+ # Calculate preservation rates
148
+ preservation_rates = {}
149
+
150
+ for category in TARGET_ENTITY_CATEGORIES:
151
+ ref_ents = ref_entities.get(category, set())
152
+ pred_ents = pred_entities.get(category, set())
153
+
154
+ if not ref_ents:
155
+ preservation_rates[category] = 1.0
156
+ else:
157
+ preserved = ref_ents.intersection(pred_ents)
158
+ preservation_rates[category] = len(preserved) / len(ref_ents)
159
+
160
+ # Calculate overall preservation
161
+ all_ref_entities = set()
162
+ all_pred_entities = set()
163
+
164
+ for entities_set in ref_entities.values():
165
+ all_ref_entities.update(entities_set)
166
+ for entities_set in pred_entities.values():
167
+ all_pred_entities.update(entities_set)
168
+
169
+ if not all_ref_entities:
170
+ preservation_rates["overall"] = 1.0
171
+ else:
172
+ preserved_overall = all_ref_entities.intersection(all_pred_entities)
173
+ preservation_rates["overall"] = len(preserved_overall) / len(all_ref_entities)
174
+
175
+ return preservation_rates
176
+
177
+
178
+ class ResultCalculator:
179
+ """Handles result calculation and visualization."""
180
+
181
+ def __init__(self):
182
+ self.metrics = EvaluationMetrics()
183
+
184
+ def calculate_basic_metrics(self, result_list: List[Dict]) -> Dict[str, float]:
185
+ """Calculate basic metrics (F1, accuracy, exact match)."""
186
+ f1_total = 0
187
+ acc_total = 0
188
+ em_total = 0
189
+ avg_output_length = 0
190
+
191
+ answer_key = "golden_answers" if "golden_answers" in result_list[0] else "answer"
192
+
193
+ for result in result_list:
194
+ prediction = result['CLaRa_normal_output']
195
+ ground_truth = result[answer_key][0] if answer_key == "golden_answers" else result[answer_key]
196
+
197
+ acc_total += self.metrics.cover_exact_match_score(prediction, ground_truth)
198
+ f1_total += self.metrics.f1_score(prediction, ground_truth)
199
+ em_total += self.metrics.exact_match_score(prediction, ground_truth)
200
+ avg_output_length += len(prediction.split())
201
+
202
+ n = len(result_list)
203
+ return {
204
+ "f1": f1_total / n,
205
+ "acc": acc_total / n,
206
+ "em": em_total / n,
207
+ "avg_output_length": avg_output_length / n
208
+ }
209
+
210
+ def calculate_stage2_metrics(self, result_list: List[Dict], k_values: List[int] = [1, 3, 5]) -> Dict[str, float]:
211
+ """Calculate stage2 metrics with recall and precision."""
212
+ basic_metrics = self.calculate_basic_metrics(result_list)
213
+
214
+ recall = {k: 0 for k in k_values}
215
+ precision = {k: 0 for k in k_values}
216
+
217
+ for result in result_list:
218
+ scores = result['topk_idx']
219
+ pos_index = set(result['pos_index'])
220
+
221
+ for k in k_values:
222
+ top_k = set(scores[:k])
223
+ hit = len(top_k & pos_index)
224
+
225
+ recall[k] += hit / len(pos_index) if len(pos_index) > 0 else 0
226
+ precision[k] += hit / k
227
+
228
+ n = len(result_list)
229
+ recall_metrics = {f"recall@{k}": v / n for k, v in recall.items()}
230
+ precision_metrics = {f"precision@{k}": v / n for k, v in precision.items()}
231
+
232
+ return {**basic_metrics, **recall_metrics, **precision_metrics}
233
+
234
+ def calculate_paraphrase_metrics(self, result_list: List[Dict]) -> Dict[str, float]:
235
+ """Calculate paraphrase metrics."""
236
+ seen_metrics = {'bert-score': 0, 'rouge-1': 0, 'rouge-L': 0, 'entity_preserve': 0}
237
+ unseen_metrics = {'bert-score': 0, 'rouge-1': 0, 'rouge-L': 0, 'entity_preserve': 0}
238
+
239
+ # Process seen data (first 2000)
240
+ for result in result_list[:2000]:
241
+ prediction = result['CLaRa_normal_output']
242
+ ground_truth = result['doc']
243
+
244
+ bs = self.metrics.bertscore.compute(predictions=[prediction], references=[ground_truth], lang="en")
245
+ seen_metrics['bert-score'] += bs['f1'][0]
246
+
247
+ rouge_scores = self.metrics.rouge.compute(predictions=[prediction], references=[ground_truth])
248
+ seen_metrics['rouge-1'] += rouge_scores['rouge1']
249
+ seen_metrics['rouge-L'] += rouge_scores['rougeL']
250
+
251
+ seen_metrics['entity_preserve'] += self.metrics.entity_preserve_metric(prediction, ground_truth)
252
+
253
+ # Process unseen data (after 2000)
254
+ for result in result_list[2000:]:
255
+ prediction = result['CLaRa_normal_output']
256
+ ground_truth = result['doc']
257
+
258
+ bs = self.metrics.bertscore.compute(predictions=[prediction], references=[ground_truth], lang="en")
259
+ unseen_metrics['bert-score'] += bs['f1'][0]
260
+
261
+ rouge_scores = self.metrics.rouge.compute(predictions=[prediction], references=[ground_truth])
262
+ unseen_metrics['rouge-1'] += rouge_scores['rouge1']
263
+ unseen_metrics['rouge-L'] += rouge_scores['rougeL']
264
+
265
+ unseen_metrics['entity_preserve'] += self.metrics.entity_preserve_metric(prediction, ground_truth)
266
+
267
+ # Normalize
268
+ n_seen = min(len(result_list[:2000]), 2000)
269
+ n_unseen = max(len(result_list) - 2000, 0)
270
+
271
+ final_metrics = {}
272
+ if n_seen > 0:
273
+ for key, value in seen_metrics.items():
274
+ final_metrics[f'seen_{key}'] = float(value / n_seen)
275
+
276
+ if n_unseen > 0:
277
+ for key, value in unseen_metrics.items():
278
+ final_metrics[f'unseen_{key}'] = float(value / n_unseen)
279
+
280
+ return final_metrics
281
+
282
+ def visualize_mse(self, result_list: List[Dict], save_path: str) -> Dict[str, Any]:
283
+ """Create t-SNE visualization for MSE analysis."""
284
+ # Set scientific style
285
+ plt.rcParams.update({
286
+ 'font.family': 'serif',
287
+ 'font.size': 12,
288
+ 'axes.labelsize': 14,
289
+ 'axes.titlesize': 16,
290
+ 'figure.titlesize': 18,
291
+ 'axes.linewidth': 1.2,
292
+ 'grid.alpha': 0.3,
293
+ })
294
+
295
+ # Collect representations
296
+ mem_reps = []
297
+ non_mem_reps = []
298
+
299
+ for result in result_list:
300
+ mem_rep = result['CLaRa_compressed_output']
301
+ non_mem_rep = result['CLaRa_normal_output']
302
+
303
+ if isinstance(mem_rep, torch.Tensor):
304
+ mem_rep = mem_rep.float().cpu().numpy()
305
+ if isinstance(non_mem_rep, torch.Tensor):
306
+ non_mem_rep = non_mem_rep.float().cpu().numpy()
307
+
308
+ mem_reps.append(mem_rep)
309
+ non_mem_reps.append(non_mem_rep)
310
+
311
+ mem_reps = np.array(mem_reps)
312
+ non_mem_reps = np.array(non_mem_reps)
313
+
314
+ print(f"Memory representations shape: {mem_reps.shape}")
315
+ print(f"Document representations shape: {non_mem_reps.shape}")
316
+
317
+ # Combine data for t-SNE
318
+ all_data = np.vstack([mem_reps, non_mem_reps])
319
+ original_dim = all_data.shape[1]
320
+
321
+ # PCA preprocessing if needed
322
+ if all_data.shape[1] > 50:
323
+ print(f"Applying PCA preprocessing from {all_data.shape[1]} to 50 dimensions...")
324
+ pca = PCA(n_components=50)
325
+ all_data = pca.fit_transform(all_data)
326
+ print(f"PCA explained variance ratio: {pca.explained_variance_ratio_[:5].sum():.3f}")
327
+
328
+ # Apply t-SNE
329
+ print("Applying t-SNE...")
330
+ perplexity = min(30, max(5, len(all_data) // 3))
331
+ tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity,
332
+ max_iter=1000, learning_rate=200, verbose=1)
333
+ tsne_results = tsne.fit_transform(all_data)
334
+
335
+ # Separate results
336
+ mem_tsne = tsne_results[:len(mem_reps)]
337
+ doc_tsne = tsne_results[len(mem_reps):]
338
+
339
+ # Create visualization
340
+ fig, ax = plt.subplots(1, 1, figsize=(10, 8))
341
+
342
+ # Add jitter to separate overlapping points
343
+ np.random.seed(42)
344
+ jitter_strength = 1.0
345
+
346
+ mem_jitter = mem_tsne.copy()
347
+ doc_jitter = doc_tsne.copy()
348
+
349
+ mem_jitter[:, 0] += np.random.normal(0.5, jitter_strength, len(mem_tsne))
350
+ mem_jitter[:, 1] += np.random.normal(0.5, jitter_strength, len(mem_tsne))
351
+
352
+ doc_jitter[:, 0] += np.random.normal(-0.5, jitter_strength, len(doc_tsne))
353
+ doc_jitter[:, 1] += np.random.normal(-0.5, jitter_strength, len(doc_tsne))
354
+
355
+ # Plot scatter points
356
+ ax.scatter(doc_jitter[:, 0], doc_jitter[:, 1], c='#0066CC', alpha=0.7, s=25,
357
+ marker='o', edgecolors='white', linewidth=0.5,
358
+ label='Document Representations', zorder=2)
359
+
360
+ ax.scatter(mem_jitter[:, 0], mem_jitter[:, 1], c='#FF3333', alpha=0.7, s=25,
361
+ marker='o', edgecolors='white', linewidth=0.5,
362
+ label='Memory Tokens Representations', zorder=3)
363
+
364
+ # Configure plot
365
+ ax.set_xlabel('')
366
+ ax.set_ylabel('')
367
+ ax.set_title('')
368
+
369
+ legend = ax.legend(frameon=True, fancybox=True, shadow=True,
370
+ loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=2, fontsize=14)
371
+ legend.get_frame().set_facecolor('white')
372
+ legend.get_frame().set_alpha(0.9)
373
+
374
+ ax.grid(True, alpha=0.3, linestyle='-', linewidth=0.5)
375
+ ax.set_axisbelow(True)
376
+
377
+ plt.tight_layout()
378
+
379
+ # Save visualization
380
+ os.makedirs(save_path, exist_ok=True)
381
+ plt.savefig(os.path.join(save_path, 'tsne_visualization_scientific.png'),
382
+ dpi=300, bbox_inches='tight', facecolor='white')
383
+ plt.show()
384
+
385
+ # Calculate statistics
386
+ distances = np.array([
387
+ np.linalg.norm(mem_reps[i] - non_mem_reps[i])
388
+ for i in range(len(mem_reps))
389
+ ])
390
+
391
+ statistics = {
392
+ 'mean_distance': float(np.mean(distances)),
393
+ 'std_distance': float(np.std(distances)),
394
+ 'median_distance': float(np.median(distances)),
395
+ 'min_distance': float(np.min(distances)),
396
+ 'max_distance': float(np.max(distances))
397
+ }
398
+
399
+ print("\n" + "="*60)
400
+ print("VISUALIZATION ANALYSIS REPORT")
401
+ print("="*60)
402
+ print(f"Dataset Statistics:")
403
+ print(f" • Total samples: {len(mem_reps)}")
404
+ print(f" • Original dimension: {original_dim}")
405
+ print(f" • t-SNE perplexity: {perplexity}")
406
+ print(f"\nDistance Analysis:")
407
+ for key, value in statistics.items():
408
+ print(f" • {key.replace('_', ' ').title()}: {value:.4f}")
409
+ print("="*60)
410
+
411
+ return {
412
+ 'mem_tsne': mem_tsne,
413
+ 'doc_tsne': doc_tsne,
414
+ 'original_distances': distances,
415
+ 'statistics': statistics
416
+ }
417
+
418
+
419
+ class DataLoader:
420
+ """Handles data loading for different datasets and stages."""
421
+
422
+ @staticmethod
423
+ def load_stage1_data(dataset: str, gold_retrieval: bool) -> List[Dict]:
424
+ """Load stage1 evaluation data."""
425
+ retrieval_type = "with_pos" if gold_retrieval else "no_pos"
426
+ file_path = f"/mnt/conductor_data/data/compression_rag_data/generator_training_val_data/stage1_eval/{dataset}/eval_processed_{retrieval_type}.jsonl"
427
+
428
+ data = []
429
+ with open(file_path, 'r') as f:
430
+ for line in f:
431
+ data.append(json.loads(line))
432
+
433
+ processed_data = []
434
+ for index, item in enumerate(data):
435
+ docs = item['docs'][:5] # Take top 5 documents
436
+ processed_item = {
437
+ 'original_data': item,
438
+ 'documents': docs,
439
+ 'question': item['question'],
440
+ 'global_index': index
441
+ }
442
+ processed_data.append(processed_item)
443
+
444
+ return processed_data
445
+
446
+ @staticmethod
447
+ def load_stage2_data(dataset: str, gold_retrieval: bool) -> List[Dict]:
448
+ """Load stage2 evaluation data."""
449
+ retrieval_type = "with_pos" if gold_retrieval else "no_pos"
450
+ file_path = f"/mnt/conductor_data/data/compression_rag_data/generator_training_val_data/stage2_eval/{dataset}/eval_processed_{retrieval_type}.jsonl"
451
+
452
+ processed_data = []
453
+ with open(file_path, 'r') as f:
454
+ for index, line in enumerate(f):
455
+ item = json.loads(line)
456
+ processed_item = {
457
+ 'original_data': item,
458
+ 'documents': item['docs'],
459
+ 'question': item['question'],
460
+ 'global_index': index,
461
+ 'pos_index': item['pos_index']
462
+ }
463
+ processed_data.append(processed_item)
464
+
465
+ return processed_data
466
+
467
+ @staticmethod
468
+ def load_paraphrase_data(file_path: str) -> List[Dict]:
469
+ """Load paraphrase data."""
470
+ data = []
471
+ with open(file_path, 'r') as f:
472
+ for line in f:
473
+ data.append(json.loads(line))
474
+
475
+ processed_data = []
476
+ for index, item in enumerate(data):
477
+ processed_item = {
478
+ 'original_data': item,
479
+ 'documents': [item['doc']],
480
+ 'question': "",
481
+ 'global_index': index
482
+ }
483
+ processed_data.append(processed_item)
484
+
485
+ return processed_data
486
+
487
+
488
+ class AcceleratedCLaRaInference:
489
+ """Main inference engine using Accelerate for distributed processing."""
490
+
491
+ def __init__(self, model_path: str, training_stage: str = None,
492
+ generation_top_k: int = None, args = None):
493
+ self.args = args
494
+
495
+ # Initialize Accelerator
496
+ process_group_kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=5400))
497
+ self.accelerator = Accelerator(kwargs_handlers=[process_group_kwargs])
498
+
499
+ if self.accelerator.is_main_process:
500
+ print(f"Using {self.accelerator.num_processes} GPUs for distributed inference")
501
+ print(f"Current process: {self.accelerator.process_index}")
502
+ print("Loading CLaRa model...")
503
+
504
+ # Load model
505
+ self.model = CLaRa.from_pretrained(
506
+ model_path,
507
+ training_stage=training_stage,
508
+ generation_top_k=generation_top_k,
509
+ pure_inference=True
510
+ )
511
+
512
+ # Prepare model with Accelerator
513
+ self.model = self.accelerator.prepare(self.model)
514
+ self.model.eval()
515
+
516
+ if self.accelerator.is_main_process:
517
+ print("Model preparation completed")
518
+
519
+ def _get_model(self):
520
+ """Get the actual model (handles distributed vs single GPU)."""
521
+ return self.model.module if hasattr(self.model, 'module') else self.model
522
+
523
+ def process_batch(self, batch_questions: List[str], batch_documents: List[List[str]] = None,
524
+ stage2_mips: bool = False, training_stage: str = None,
525
+ batch_answers: List[str] = None, time_count: bool = False) -> Tuple:
526
+ """Process a batch of questions and documents."""
527
+ model = self._get_model()
528
+
529
+ with torch.no_grad():
530
+ try:
531
+ if training_stage == 'stage2':
532
+ return self._process_stage2(model, batch_questions, batch_documents,
533
+ stage2_mips, time_count)
534
+ elif training_stage in ['stage1', 'stage1_2']:
535
+ return self._process_stage1(model, batch_questions, batch_documents)
536
+ elif training_stage == 'stage2_reasoning':
537
+ return self._process_reasoning(model, batch_questions, batch_answers)
538
+ elif training_stage == 'stage1_paraphrase':
539
+ return self._process_paraphrase(model, batch_questions, batch_documents)
540
+ elif training_stage == 'stage1_mse_visulize':
541
+ return self._process_mse_visualize(model, batch_documents)
542
+ else:
543
+ raise ValueError(f"Unknown training stage: {training_stage}")
544
+
545
+ except torch.cuda.OutOfMemoryError as e:
546
+ self.accelerator.print(f"CUDA OOM error: {e}")
547
+ torch.cuda.empty_cache()
548
+ gc.collect()
549
+ return self._create_empty_results(batch_questions, training_stage)
550
+
551
+ def _process_stage2(self, model, batch_questions, batch_documents, stage2_mips, time_count):
552
+ """Process stage2 inference."""
553
+ if time_count:
554
+ if stage2_mips:
555
+ results = model.generate_from_questions(
556
+ questions=batch_questions,
557
+ max_new_tokens=64,
558
+ stage2_mips=stage2_mips,
559
+ time_count=True
560
+ )
561
+ else:
562
+ results = model.generate_from_questions(
563
+ questions=batch_questions,
564
+ max_new_tokens=64,
565
+ stage2_mips=stage2_mips,
566
+ documents=batch_documents,
567
+ time_count=True
568
+ )
569
+ return results
570
+ else:
571
+ if stage2_mips:
572
+ batch_out_normal, topk_idx = model.generate_from_questions(
573
+ questions=batch_questions,
574
+ max_new_tokens=64,
575
+ stage2_mips=stage2_mips
576
+ )
577
+ else:
578
+ batch_out_normal, topk_idx = model.generate_from_questions(
579
+ questions=batch_questions,
580
+ max_new_tokens=64,
581
+ stage2_mips=stage2_mips,
582
+ documents=batch_documents
583
+ )
584
+ return batch_out_normal, batch_out_normal, topk_idx
585
+
586
+ def _process_stage1(self, model, batch_questions, batch_documents):
587
+ """Process stage1 inference."""
588
+ batch_out_compressed = []
589
+
590
+ for docs, question in zip(batch_documents, batch_questions):
591
+ embeddings, _ = model.compress_documents(documents=docs)
592
+ out_compressed = model.generate_from_compressed_documents_and_questions(
593
+ questions=[question],
594
+ compressed_documents=embeddings
595
+ )
596
+ batch_out_compressed.extend(out_compressed)
597
+
598
+ del embeddings
599
+ torch.cuda.empty_cache()
600
+
601
+ return batch_out_compressed, batch_out_compressed, None
602
+
603
+ def _process_reasoning(self, model, batch_questions, batch_answers):
604
+ """Process reasoning inference."""
605
+ batch_out_normal = []
606
+ batch_out_reasoning_list = []
607
+
608
+ for question, answer in zip(batch_questions, batch_answers):
609
+ temp_out, temp_out_reasoning = model.generate_from_reasoning(
610
+ questions=[question],
611
+ max_new_tokens=1024,
612
+ answers=[answer],
613
+ save_dir=self.args.model_path
614
+ )
615
+ batch_out_normal.append(temp_out[0])
616
+ batch_out_reasoning_list.extend(temp_out_reasoning)
617
+
618
+ return batch_out_normal, batch_out_normal, None, batch_out_reasoning_list
619
+
620
+ def _process_paraphrase(self, model, batch_questions, batch_documents):
621
+ """Process paraphrase inference."""
622
+ batch_out_compressed = []
623
+
624
+ for docs, question in zip(batch_documents, batch_questions):
625
+ out_compressed = model.generate_from_paraphrase(
626
+ questions=["" for _ in range(len(docs))],
627
+ documents=[docs]
628
+ )
629
+ batch_out_compressed.extend(out_compressed)
630
+ torch.cuda.empty_cache()
631
+
632
+ return batch_out_compressed, batch_out_compressed, None
633
+
634
+ def _process_mse_visualize(self, model, batch_documents):
635
+ """Process MSE visualization."""
636
+ batch_out_normal = []
637
+ batch_out_compressed = []
638
+
639
+ for docs in batch_documents:
640
+ mem_rep, non_mem_rep = model.compress_documents_mse_visulize(documents=docs)
641
+ batch_out_compressed.append(mem_rep[0])
642
+ batch_out_normal.append(non_mem_rep[0])
643
+
644
+ return batch_out_normal, batch_out_compressed
645
+
646
+ def _create_empty_results(self, batch_questions, training_stage):
647
+ """Create empty results for error cases."""
648
+ empty_results = [""] * len(batch_questions)
649
+ if training_stage == 'stage2_reasoning':
650
+ return empty_results, empty_results, None, empty_results
651
+ elif training_stage == 'stage1_mse_visulize':
652
+ return empty_results, empty_results
653
+ else:
654
+ return empty_results, empty_results, None
655
+
656
+
657
+ def convert_embeddings_to_list(data):
658
+ """Convert tensor embeddings to lists for JSON serialization."""
659
+ if isinstance(data, dict):
660
+ return {k: convert_embeddings_to_list(v) for k, v in data.items()}
661
+ elif isinstance(data, list):
662
+ return [convert_embeddings_to_list(item) for item in data]
663
+ elif isinstance(data, torch.Tensor):
664
+ return data.cpu().to(torch.float32).numpy().tolist()
665
+ elif isinstance(data, np.ndarray):
666
+ return data.tolist()
667
+ else:
668
+ return data
669
+
670
+
671
+ def main():
672
+ parser = argparse.ArgumentParser(description="CLaRa Model Inference")
673
+ parser.add_argument('--model_path', type=str, required=True, help='Path to model checkpoint')
674
+ parser.add_argument('--batch_size', type=int, default=4, help='Batch size per GPU')
675
+ parser.add_argument('--stage', type=str, default='stage1',
676
+ choices=['stage1', 'stage1_2', 'stage2', 'stage2_reasoning',
677
+ 'stage1_paraphrase', 'stage1_mse_visulize'],
678
+ help='Training stage')
679
+ parser.add_argument('--stage2_mips', action='store_true', help='Use MIPS for stage2')
680
+ parser.add_argument('--dataset', type=str, default='musique',
681
+ help='Comma-separated list of datasets')
682
+ parser.add_argument('--gold_retrieval', action='store_true',
683
+ help='Use gold retrieval context')
684
+ parser.add_argument('--generation_top_k', type=int, default=5, help='Top-k for generation')
685
+ parser.add_argument('--paraphrase_path', type=str, help='Path to paraphrase data')
686
+ parser.add_argument('--mse_visulize_path', type=str, help='Path to save MSE visualization')
687
+ parser.add_argument('--efficient_count', action='store_true', help='Count efficiency metrics')
688
+
689
+ args = parser.parse_args()
690
+
691
+ # Process datasets
692
+ all_results_metrics = {}
693
+ datasets_list = args.dataset.split(',')
694
+
695
+ for dataset in datasets_list:
696
+ print(f"Processing dataset: {dataset}")
697
+
698
+ # Load data based on stage
699
+ if args.stage in ['stage1', 'stage1_2']:
700
+ processed_data = DataLoader.load_stage1_data(dataset, args.gold_retrieval)
701
+ elif args.stage == 'stage2':
702
+ processed_data = DataLoader.load_stage2_data(dataset, args.gold_retrieval)
703
+ elif args.stage in ['stage1_paraphrase', 'stage1_mse_visulize']:
704
+ if not args.paraphrase_path:
705
+ raise ValueError(f"--paraphrase_path required for stage {args.stage}")
706
+ processed_data = DataLoader.load_paraphrase_data(args.paraphrase_path)
707
+ else:
708
+ raise ValueError(f"Unsupported stage: {args.stage}")
709
+
710
+ print(f"Loaded {len(processed_data)} samples for {dataset}")
711
+
712
+ # Initialize inference engine
713
+ # Use model_path directly if absolute, otherwise use SageMaker path
714
+ if os.path.isabs(args.model_path):
715
+ model_path = args.model_path
716
+ else:
717
+ model_path = os.path.join('/mnt/task_wrapper/user_output/artifacts/data/train_checkpoint', args.model_path)
718
+ args.model_path = model_path
719
+
720
+ inference_engine = AcceleratedCLaRaInference(
721
+ model_path=model_path,
722
+ training_stage=args.stage,
723
+ generation_top_k=args.generation_top_k,
724
+ args=args
725
+ )
726
+
727
+ # Wait for all processes to be ready
728
+ inference_engine.accelerator.wait_for_everyone()
729
+
730
+ # Store results
731
+ all_results = []
732
+ time_count_dic = {"compress_time": 0, "query_time": 0, "generate_time": 0, "total_time": 0, "count": 0}
733
+
734
+ # Process data in batches using accelerator
735
+ with inference_engine.accelerator.split_between_processes(processed_data, apply_padding=False) as local_data:
736
+ print(f"Process {inference_engine.accelerator.process_index}: processing {len(local_data)} samples")
737
+
738
+ batch_size = args.batch_size
739
+ num_batches = (len(local_data) + batch_size - 1) // batch_size
740
+
741
+ for batch_idx in tqdm(range(num_batches),
742
+ desc=f"GPU {inference_engine.accelerator.process_index}",
743
+ disable=not inference_engine.accelerator.is_local_main_process):
744
+
745
+ # Get current batch
746
+ start_idx = batch_idx * batch_size
747
+ end_idx = min(start_idx + batch_size, len(local_data))
748
+ batch = local_data[start_idx:end_idx]
749
+
750
+ # Prepare batch data
751
+ batch_questions = [item['question'] for item in batch]
752
+ batch_documents = [item['documents'] for item in batch] if 'documents' in batch[0] else None
753
+ batch_answers = [item.get('answer') for item in batch] if args.stage == 'stage2_reasoning' else None
+
+                 # Process batch
+                 if args.efficient_count and args.stage == 'stage2':
+                     results = inference_engine.process_batch(
+                         batch_questions=batch_questions,
+                         batch_documents=batch_documents,
+                         stage2_mips=args.stage2_mips,
+                         training_stage=args.stage,
+                         time_count=True
+                     )
+                     batch_out_normal, batch_out_compressed, batch_topk_idx, compress_time, query_time, generate_time, total_time = results
+
+                     time_count_dic["compress_time"] += compress_time
+                     time_count_dic["query_time"] += query_time
+                     time_count_dic["generate_time"] += generate_time
+                     time_count_dic["total_time"] += total_time
+                     time_count_dic["count"] += 1
+                 else:
+                     results = inference_engine.process_batch(
+                         batch_questions=batch_questions,
+                         batch_documents=batch_documents,
+                         stage2_mips=args.stage2_mips,
+                         training_stage=args.stage,
+                         batch_answers=batch_answers
+                     )
+
+                     if args.stage == 'stage2_reasoning':
+                         batch_out_normal, batch_out_compressed, batch_topk_idx, batch_out_reasoning = results
+                     elif args.stage == 'stage1_mse_visulize':
+                         batch_out_normal, batch_out_compressed = results
+                         batch_topk_idx = None
+                     else:
+                         batch_out_normal, batch_out_compressed, batch_topk_idx = results
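+                 # process_batch's return arity varies by stage: three outputs by
+                 # default, four with reasoning, two (no top-k indices) for
+                 # stage1_mse_visulize, and seven when timings are requested above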
+
+                 # Prepare results
+                 batch_results = []
+                 for i, (item, normal_out, compressed_out) in enumerate(zip(batch, batch_out_normal, batch_out_compressed)):
+                     result_item = item['original_data'].copy()
+                     result_item['CLaRa_normal_output'] = normal_out
+                     result_item['CLaRa_compressed_output'] = compressed_out
+                     result_item['global_index'] = item['global_index']
+
+                     if args.stage == 'stage2' and batch_topk_idx is not None:
+                         result_item['topk_idx'] = batch_topk_idx[i].tolist()
+                     elif args.stage == 'stage2_reasoning':
+                         result_item['reasoning_output'] = batch_out_reasoning[i]
+
+                     batch_results.append(result_item)
+
+                 all_results.extend(batch_results)
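+                 # Each accumulated record later becomes one JSONL line, e.g.
+                 # (values invented): {...original fields..., "CLaRa_normal_output": "...",
+                 #  "CLaRa_compressed_output": "...", "topk_idx": [3, 0, 7]}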
+
+                 # Clean up memory: release cached CUDA blocks every batch and run
+                 # the Python garbage collector every 10 batches
+                 torch.cuda.empty_cache()
+                 if batch_idx % 10 == 0:
+                     gc.collect()
+
+         # Save efficiency metrics if requested (the count guard avoids a
+         # division by zero when no timed batches were run)
+         if args.efficient_count and time_count_dic["count"] > 0 and inference_engine.accelerator.is_main_process:
+             eff_dic = {
+                 "compress_time_ms": round((time_count_dic['compress_time'] / time_count_dic['count']) * 1000, 2),
+                 "query_time_ms": round((time_count_dic['query_time'] / time_count_dic['count']) * 1000, 2),
+                 "generate_time_ms": round((time_count_dic['generate_time'] / time_count_dic['count']) * 1000, 2),
+                 "total_time_ms": round((time_count_dic['total_time'] / time_count_dic['count']) * 1000, 2),
+                 "sample_count": time_count_dic['count']
+             }
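+             # Averages are per processed batch ("count" increments once per batch),
+             # converted to milliseconds: e.g. 12.3456 s over 100 batches gives
+             # round((12.3456 / 100) * 1000, 2) = 123.46 ms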
+             eff_output_path = os.path.join(model_path, f"efficiency_{dataset}_{args.stage}_{args.gold_retrieval}_{args.generation_top_k}.json")
+             with open(eff_output_path, 'w') as f:
+                 json.dump(eff_dic, f, indent=2)
+
+         # Wait for all processes to complete
+         inference_engine.accelerator.wait_for_everyone()
+
+         # Gather results from all processes
+         if inference_engine.accelerator.is_main_process:
+             print("Collecting results from all processes...")
+
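+         # gather_for_metrics is a collective operation, so every rank must reach
+         # this call; only the main process consumes the gathered list below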
+         all_results_gathered = inference_engine.accelerator.gather_for_metrics(all_results)
+
+         # Process and save results (main process only)
+         if inference_engine.accelerator.is_main_process:
+             print("Processing and saving results...")
+
+             # Flatten results
+             final_results = []
+             if isinstance(all_results_gathered, list):
+                 for result_batch in all_results_gathered:
+                     if isinstance(result_batch, list):
+                         final_results.extend(result_batch)
+                     else:
+                         final_results.append(result_batch)
+
+             print(f"Collected {len(final_results)} results")
+
+             # Sort by global index to restore the original sample order after the gather
+             final_results.sort(key=lambda x: x.get('global_index', 0))
+
+             # Verify data integrity
+             processed_indices = set(item.get('global_index', -1) for item in final_results)
+             expected_indices = set(range(len(processed_data)))
+             missing_indices = expected_indices - processed_indices
+
+             if missing_indices:
+                 print(f"Warning: Missing indices: {sorted(missing_indices)}")
+             else:
+                 print("✓ Data integrity verification passed")
+
+             # Remove global index for clean output
+             for item in final_results:
+                 item.pop('global_index', None)
+
+             # Save results
+             output_path = os.path.join(model_path, f"{dataset}_{args.stage}_{args.gold_retrieval}_{args.generation_top_k}.jsonl")
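+             # Pattern: <dataset>_<stage>_<gold_retrieval>_<generation_top_k>.jsonl,
+             # e.g. 'nq_stage2_False_5.jsonl' (placeholder values)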
+             with open(output_path, 'w') as f:
+                 if args.stage == 'stage1_mse_visulize':
+                     # Embedding tensors are not JSON-serializable; convert them to lists first
+                     converted_results = convert_embeddings_to_list(final_results)
+                     for item in converted_results:
+                         f.write(json.dumps(item) + '\n')
+                 else:
+                     for item in final_results:
+                         f.write(json.dumps(item) + '\n')
+
+             print(f"Results saved to: {output_path}")
+
+             # Calculate metrics
+             calculator = ResultCalculator()
+
+             if args.stage == 'stage2':
+                 metrics = calculator.calculate_stage2_metrics(final_results)
+             elif args.stage == 'stage1_paraphrase':
+                 metrics = calculator.calculate_paraphrase_metrics(final_results)
+             elif args.stage == 'stage1_mse_visulize':
+                 if args.mse_visulize_path:
+                     metrics = calculator.visualize_mse(final_results, args.mse_visulize_path)
+                 else:
+                     metrics = {"visualization": "completed"}
+             else:
+                 metrics = calculator.calculate_basic_metrics(final_results)
+
+             print(f"Metrics for {dataset}: {metrics}")
+             all_results_metrics[dataset] = metrics
+
+         # Clean up
+         del inference_engine
+         torch.cuda.empty_cache()
+         gc.collect()
+
+     # Save final metrics
+     if all_results_metrics:
+         metrics_path = os.path.join(model_path, f"results_metrics_{args.stage}_{args.gold_retrieval}_{args.generation_top_k}.json")
+         with open(metrics_path, 'w') as f:
+             json.dump(all_results_metrics, f, indent=2)
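+         # The file maps each dataset name to its metrics dict, e.g.
+         # (shape illustrative): {"nq": {...}, "hotpotqa": {...}}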
+         print(f"Final metrics saved to: {metrics_path}")
+
+
+ if __name__ == '__main__':
+     main()
evaluation/evaluation_data/end_to_end_evaluation/2wiki.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:39cf44bcfa24938c40617ef5bba90235642bf02f537297f4055b3f6bc756846c
+ size 93670063
evaluation/evaluation_data/end_to_end_evaluation/hotpotqa.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f46d7cfc23199f6cdff5e3ce1872ff150e6d940eb83b343cf37431cd740fa4db
+ size 61751762
evaluation/evaluation_data/end_to_end_evaluation/musique.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:85a55afc5c6067d00eef1888e13b598039a515f787791b23fbb495c35827e264
+ size 18789210
evaluation/evaluation_data/end_to_end_evaluation/nq.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7d26d5c29694cd81cccfcac4fd29c16ae7f245b4c554623cbe3c6ec8c3a0ad41
+ size 60057585
evaluation/evaluation_data/instruction_tuning_evaluation/2wiki.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:690a91abab47ebb8e32f335b2f1e31f40a1b2e78452988c7bffd0c30cc5f5463
+ size 93670063
evaluation/evaluation_data/instruction_tuning_evaluation/hotpotqa.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:808713253d9a0821c43800b16e56fd76a9b046b42088afb217630c735196b4e4
+ size 61751762
evaluation/evaluation_data/instruction_tuning_evaluation/musique.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:171fb546901128d338de9a343080e6e41ca73c3f8246c7c0d270330f06cef0d9
+ size 18789210
evaluation/evaluation_data/instruction_tuning_evaluation/nq.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:225bc8fd6f5b5b3156f42952602fc296a7a390bf873de24ceb0328ceb61eabfd
+ size 60057585
example/end_to_end_data.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6bf9c2e07e6c833288c7041a93dfa7fd3cf41e34b22800d97aafbd93c78d3597
+ size 13128781
example/instruction_tuning_data.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
example/pretrain_data.jsonl ADDED
The diff for this file is too large to render. See raw diff