Commit 067cdc9
Parent(s): first commit

Files changed:
- .gitignore +17 -0
- .python-version +1 -0
- README.md +22 -0
- config.py +8 -0
- docs/markdowns/agentmem.md +429 -0
- docs/markdowns/deepanalyze.md +0 -0
- docs/markdowns/deepseek_ocr.md +519 -0
- docs/markdowns/sam3.md +0 -0
- docs/markdowns/sam3d.md +0 -0
- knowledge_base/chroma.py +57 -0
- knowledge_base/embeddings.py +22 -0
- knowledge_base/prepare_documents.py +31 -0
- main.py +6 -0
- notebook.ipynb +37 -0
- pyproject.toml +7 -0
- requirements.txt +11 -0
- uv.lock +8 -0
.gitignore
ADDED
@@ -0,0 +1,17 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Virtual environments
.venv

# Environment variables
.env

# Unwanted files
*.log
*.pdf
.python-version
ADDED
@@ -0,0 +1 @@
3.10
README.md
ADDED
@@ -0,0 +1,22 @@
rag_agent/
├── app.py                   # Main Gradio application entry point
├── config.py                # Configuration hub (models, chunk sizes, providers)
├── util.py                  # PDF-to-markdown conversion
├── document_chunker.py      # Chunking strategy
├── core/                    # Core RAG component orchestration
│   ├── chat_interface.py
│   ├── document_manager.py
│   └── rag_system.py
├── knowledge_base/          # Storage management
│   ├── chroma.py            # Parent chunk storage (JSON)
│   └── vector_db_manager.py # Qdrant vector database setup
├── agent_logic/             # LangGraph agent workflow
│   ├── edges.py             # Conditional routing logic
│   ├── graph.py             # Graph construction and compilation
│   ├── graph_state.py       # State definitions
│   ├── nodes.py             # Processing nodes (summarize, rewrite, agent)
│   ├── prompts.py           # System prompts
│   ├── schemas.py           # Pydantic data models
│   └── tools.py             # Retrieval tools
└── ui/                      # User interface
    └── gradio_app.py        # Gradio interface components
config.py
ADDED
@@ -0,0 +1,8 @@
import os

configs = {
    "DATA_PATH": "./docs/markdowns",
    "PERSIST_PATH": "./knowledge_base/chroma_data",
    "EMBEDDING_MODEL_NAME": "sentence-transformers/all-mpnet-base-v2",
    "COLLECTION_NAME": "langchain_mpnet_collection"
}
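These keys are consumed by the knowledge-base modules added in this commit (e.g., `knowledge_base/chroma.py`, whose body is not reproduced here). As a rough, hypothetical sketch of how such a config might be wired up with the `chromadb` and `sentence-transformers` packages, assuming a helper named `get_collection` that does not appear in the diff:

```python
# Hypothetical usage sketch -- not the actual contents of knowledge_base/chroma.py.
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

from config import configs

def get_collection():
    """Open (or create) the persisted collection described by config.py."""
    client = chromadb.PersistentClient(path=configs["PERSIST_PATH"])
    embed_fn = SentenceTransformerEmbeddingFunction(
        model_name=configs["EMBEDDING_MODEL_NAME"]
    )
    return client.get_or_create_collection(
        name=configs["COLLECTION_NAME"],
        embedding_function=embed_fn,
    )
```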
docs/markdowns/agentmem.md
ADDED
@@ -0,0 +1,429 @@
# **General Agentic Memory Via Deep Research**

**B.Y. Yan**¹, **Chaofan Li**¹, **Hongjin Qian**¹,³, **Shuqi Lu**¹, **Zheng Liu**¹,⁴,∗

1. Beijing Academy of Artificial Intelligence  2. Renmin University of China
3. Peking University  4. Hong Kong Polytechnic University

```
{chienqhj,zhengliu1026}@gmail.com
```

> ∗ Project lead

## **Abstract**

Memory is critical for AI agents, yet the widely adopted static memory, which aims to create readily available memory in advance, is inevitably subject to severe information loss. To address this limitation, we propose a novel framework called **general agentic memory (GAM)**. GAM follows the principle of "**just-in-time (JIT) compilation**": it focuses on creating optimized contexts for its client at runtime while keeping only simple but useful memory during the offline stage. To this end, GAM employs a dual design with the following components. 1) **Memorizer**, which highlights key historical information using a lightweight memory while maintaining complete historical information within a universal page-store. 2) **Researcher**, which retrieves and integrates useful information from the page-store for each online request, guided by the pre-constructed memory. This design allows GAM to effectively leverage the agentic capabilities and test-time scalability of frontier large language models (LLMs), while also facilitating end-to-end performance optimization through reinforcement learning. In our experimental study, we demonstrate that GAM achieves substantial improvements in various memory-grounded task-completion scenarios against existing memory systems.
## **1 Introduction**

> "_Intelligence is not the ability to store information, but to know where to find it._" — Albert Einstein

AI agents become increasingly popular thanks to the rapid advancement of large language models (LLMs) [1]. Today, prototypes of AI agents are being deployed across many crucial domains, such as information seeking, software engineering, and scientific research, showcasing huge potential for improving the productivity of human society [2, 3, 4]. This widespread application, however, creates an urgent need to manage complex and rapidly expanding contexts, as AI agents must continuously integrate vast amounts of information generated by both their internal reasoning and external feedback [5]. To address this challenge, there has been growing interest in developing specialized memory systems that provide agents with essential contextual information to support downstream tasks [6]. Most existing memory systems follow the principle of **Ahead-of-Time (AOT) Compilation**. Under this paradigm, substantial computation is performed during the offline stage to compress raw contexts into lightweight memory, while incoming requests are served primarily based on this pre-constructed memory [7, 8, 9, 10]. Although widely adopted, this AOT-style approach suffers from critical limitations.

_**⋆ Memorization** is a form of data compression; thus, it is inevitably subject to **information loss**._ The pre-computed memory, being a compressed representation of raw data, inevitably suffers from information loss, making it difficult to satisfy the fine-grained information needs requested by client agents. In addition, such memory systems generally assume a _static structure_, preventing them from flexibly adapting to ad-hoc or unforeseen requests that demand nuanced interpretation and integration of information. Finally, existing approaches often rely heavily on _domain expertise and handcrafted heuristics_ to determine how memory is constructed and organized, which further constrains the generalization of AOT-style memory systems across domains and tasks.

**[Figure omitted.]** Panel summary — Memorization: memorizing (abstracting each session into a memo, e.g., {Session ID: --, Session memo: --}); paging (decorating each session with its contextual information, e.g., (context info, session content)); pages are stored in the page-store. Deep-Research: planning (information needs: what to search; searching plan: how to search); searching (taking a search action, e.g., tool use or direct browsing); reflection (whether search results are accurate and complete, and if not, what to search next); integration. The memory is used as the working context for both the Memorizer and the Researcher.

Figure 1: Overview of GAM. The memorizer generates a lightweight memory for the agent history and keeps the complete history in the page-store during the offline stage. The researcher performs deep research to retrieve and integrate useful information for each request during online serving.

_**⋆ Search** is made the core of memory, while memorization is conducted to enable effective search._ We argue that lossless memory can only be realized via searching over a database of the complete history, where the pre-computed memory is introduced to support such a search process. With this insight, we propose **General Agentic Memory (GAM)**, a novel memory framework for general AI agents following the principle of **Just-in-Time (JIT) Compilation**. During the offline stage, it creates a lightweight memory for the crucial historical information while maintaining the complete historical information in the database. At runtime, it performs intensive computation, namely deep research, to generate a customized, high-utility context for each request based on the pre-constructed memory.

_**⋆ Dual-architecture**_. Based on the above JIT principle, GAM is realized as a dual-agent framework with two fundamental roles: the Memorizer and the Researcher (Figure 1):

- The **Memorizer** receives the client's streaming history as a sequence of sessions, where it takes two actions: 1) it dynamically compresses the key historical information into a lightweight memory, and 2) it merges each session and its corresponding memory into a page and saves all pages into a page-store, ensuring that the historical information is coherently and inclusively preserved.

- The **Researcher** receives an online request from its client and performs deep research based on the pre-constructed memory to address the client's needs. It iteratively analyzes the information needs and plans search actions, retrieves relevant information from the page-store, and reflects on the results until the gathered information fully satisfies the client's request.

The above framework endows GAM with several key advantages. 1) _High fidelity and task adaptability_, enabling the generation of concise yet highly informative memory tailored to downstream tasks. 2) _Domain generalizability_, allowing GAM to operate effectively across general scenarios without relying on domain-specific expertise or handcrafted heuristics. 3) _Optimizability_, harnessing advanced LLMs' agentic capability and test-time scalability for performance optimization, while also facilitating continual improvement through reinforcement learning.

We evaluate GAM's performance through rigorous experimental studies. We jointly leverage the traditional memory benchmark LoCoMo [11] together with popular long-context benchmarks such as HotpotQA [7], RULER [12], and NarrativeQA [13]. Across all these experiments, GAM consistently and significantly outperforms existing methods, demonstrating its strong ability to preserve fine-grained historical information and to optimize downstream task-completion performance for its clients. Our project is made publicly available to facilitate future research in this field[2].

> [2] https://github.com/VectorSpaceLab/general-agentic-memory
## **2 Methodology**

## **2.1 Definition**

LLM agents often require long trajectories, comprising multi-step reasoning and tool use, to accomplish complex tasks, e.g., software engineering and deep research. In our work, we define each historical trajectory (history for short) as a sequence of temporally ordered units called sessions: $\text{hist}: s_1, \dots, s_T$. The rapidly growing history leads to several crucial challenges, including prohibitive computational costs, context-window overflow, and performance degradation. To address these issues, a memory system is introduced to manage the information overload. Its primary objective is to extract useful yet concise information from the history, which is essential for the completion of the agent's task. That is to say, the memory system is to optimize the _cost-effectiveness_ of the agent's task completion grounded on its produced context. This objective can be formulated as the following min–max optimization problem.

**Definition 2.1** (**Memory**)**.** A memory system produces the optimized context for an agent based on its task and history: $c^* \leftarrow \text{Memory}(\text{task}, \text{history})$, which is of the minimum size while optimizing task-completion performance: $c^* = \arg\min_{c \in C^*} |c|$, where $C^* = \arg\max_{\text{context}} \text{Agent}(\text{task}, \text{context})$.

## **2.2 General Agentic Memory**

The overall architecture of GAM, depicted in Figure 1, consists of two main modules: the memorizer and the researcher. Both modules are LLM-based agents, each with customized prompts[3], working together to generate optimized memory that addresses requests from the client agent.

> [3] We include the detailed prompts of all functions in the appendix of the paper.
## **2.2.1 Memorizer**

The memorizer is responsible for processing the agent's trajectory during the offline stage, ensuring that it can be efficiently stored and effectively utilized. Each memorization step is triggered by the arrival of a new session ($s_i$), where two operations are performed. 1. _Memorizing_, which produces a memo ($\mu_i$) as a concise and well-structured snapshot of the new session. The memo is generated based on both the new session and the existing memory ($m_i$), highlighting its crucial information for the entire trajectory. The memory is therefore incrementally updated with the addition of the memo:

$$\text{Memorizer.memorize}(s_i, m_i) \rightarrow \mu_i; \quad m_i + \{\mu_i\} \rightarrow m_{i+1}. \tag{1}$$

2. _Paging_, which creates pages to maintain the complete information of the agent's trajectory. It begins by generating a header for the new session, which contains crucial contextual information from its preceding trajectory. The header is then used to decorate the session, forming a new page that is subsequently added to the page-store ($p$):

$$\text{Memorizer.page}(s_i, m_i) \rightarrow h_i; \quad \{\text{header}: h_i,\ \text{content}: s_i\} \rightarrow p_i; \quad p.\text{append}(p_i). \tag{2}$$

This process shares the same principle as BGE landmark retrieval [14] and Anthropic contextual retrieval [15], which preserve the consistency of page semantics, ensuring that pages can be accurately retrieved in subsequent stages.
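To make the two operations concrete, here is a minimal sketch of one memorization step mirroring Eqs. (1)–(2). It is an illustration, not the paper's implementation: `llm` stands for any prompt-in, text-out callable, and the inline prompts are placeholders for the real prompts given in the appendix.

```python
# Illustrative sketch of one memorization step (Eqs. 1-2); not the official code.
from dataclasses import dataclass, field

@dataclass
class MemorizerState:
    memory: list = field(default_factory=list)       # accumulated memos m_i
    page_store: list = field(default_factory=list)   # complete history as pages

def memorize_step(state, session, llm):
    context = "\n".join(state.memory)
    # 1) Memorizing: abstract the new session into a memo, conditioned on memory.
    memo = llm(f"MEMORY_CONTEXT:\n{context}\n\nINPUT_MESSAGE:\n{session}\n\n"
               "Write one concise, self-contained abstract.")
    state.memory.append(memo)                        # m_{i+1} = m_i + {mu_i}
    # 2) Paging: write a contextual header and store the decorated session.
    header = llm(f"Memory so far:\n{context}\n\n"
                 f"Write a short contextual header for this session:\n{session}")
    state.page_store.append({"header": header, "content": session})
```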
## **2.2.2 Researcher**

The researcher addresses the client's request by retrieving and integrating useful information from the page-store. The process is conducted iteratively with three operations. 1) _Planning_, which performs chain-of-thought reasoning based on the existing memory to analyze the information needed by the request ($r$). Based on this initial reasoning result, it further generates concrete search plans according to the provided search toolkit ($T$):

$$\text{Researcher.plan}(r, m_i, T) \rightarrow \{\text{tool}: t;\ \text{parameter}: \rho_t\}_{t \in T}. \tag{3}$$

In our implementation, we offer three tools to the researcher: an embedding model for vector search, a BM25 retriever for keyword-based search, and an ID-based retriever for direct page exploration. 2) _Searching_. Upon obtaining the search plan, the researcher executes each search action in parallel, retrieving relevant pages ($p_t$) from the page-store. The researcher then integrates the information from the union of the retrieved pages together with the last integration result ($I$) for the request ($r$), leading to an updated temporal integration result:

$$\text{For each } t: t(\rho_t) \rightarrow p_t; \quad \text{Researcher.integrate}\Big(\bigcup_{t \in T} p_t,\ I,\ r\Big) \rightarrow I. \tag{4}$$

3) _Reflection_. The researcher reflects on whether the information needed by the request ($r$) has been fully collected in the integration result $I$, using a binary indicator ($y$). If not, it further analyzes the missing information, leading to a new request $r'$ that drives another round of deep research. If so, the research process concludes by returning the integration result:

$$\text{Researcher.reflect}(I, r) \rightarrow y, r'; \quad \text{if } y = \text{No: Researcher}(r', I); \quad \text{if } y = \text{Yes: return } I. \tag{5}$$

Finally, the integrated result, along with the original information extracted from the associated pages, is returned to the client as the optimized context for its downstream task completion.
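A compact sketch of this loop, with the planning, integration, and reflection LLM calls abstracted as injected callables (names are ours, not the paper's API), might look as follows:

```python
# Illustrative plan -> search -> integrate -> reflect loop (Eqs. 3-5).
# `plan`, `integrate`, and `reflect` abstract the LLM-driven steps; `tools`
# maps tool names (e.g. "vector", "keyword", "page_index") to retrievers.
def research(request, memory, tools, plan, integrate, reflect, max_depth=3):
    integration = ""                                  # running result I
    for _ in range(max_depth):                        # bounded reflection depth
        search_plan = plan(request, memory, list(tools))       # Eq. (3)
        pages = []
        for tool_name, params in search_plan:         # execute each search
            pages.extend(tools[tool_name](params))    # t(rho_t) -> p_t
        integration = integrate(pages, integration, request)   # Eq. (4)
        enough, request = reflect(integration, request)        # Eq. (5)
        if enough:
            break
    return integration
```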
## **2.2.3 Optimization**

A unified end-to-end performance-optimization framework is introduced for GAM. Suppose a training dataset $D = \{(\text{task}, \text{hist})\}$ is given. The system creates the memory and page-store as $M, P \leftarrow \text{Memorizer}(\text{hist})$, and then generates a candidate context for the task via $c \leftarrow \text{Researcher}(\text{task}, M, P)$. Using this candidate context, the client samples an answer (ans), whose quality is measured by the reward function $\Gamma(\cdot)$. Thus, the expected reward is derived as:

$$\mathcal{R} = \mathbb{E}_{\text{task}, \text{hist} \sim D}\ \mathbb{E}_{M, P \sim \text{Memorizer}(\text{hist})}\ \mathbb{E}_{c \sim \text{Researcher}(\text{task}, M, P)}\ \mathbb{E}_{\text{ans} \sim \text{Client}(c, \text{task})}\ \Gamma(\text{ans}). \tag{6}$$

When optimizing GAM's performance, the memorizer and the researcher are trained via reinforcement learning, while the client is excluded from the learning process. Without loss of generality, the policy gradients for the memorizer and researcher are given by:

$$\nabla_{\theta_m} = \mathbb{E}_{\text{task}, \text{hist} \sim D}\,\big(\Gamma(\text{ans}) - \bar{\Gamma}_m\big)\, \nabla_{\theta_m} \log \pi_m(M, P \mid \text{hist}), \qquad \nabla_{\theta_r} = \mathbb{E}_{\text{task}, \text{hist} \sim D}\,\big(\Gamma(\text{ans}) - \bar{\Gamma}_r\big)\, \nabla_{\theta_r} \log \pi_r(c \mid \text{task}, M, P). \tag{7}$$

Here, $\theta_m$ and $\theta_r$ denote the model parameters of the memorizer and researcher, respectively; $\bar{\Gamma}_m$ and $\bar{\Gamma}_r$ are the baseline answer rewards of the two modules; and $\pi_m(\cdot)$ and $\pi_r(\cdot)$ stand for the memorizer's and researcher's generation likelihoods.
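As a schematic illustration of Eq. (7), the following surrogate loss (a sketch under the assumption that per-episode log-likelihoods are available from the two backbones) reproduces the advantage-weighted REINFORCE gradient when differentiated:

```python
import torch

def gam_pg_loss(logp_m, logp_r, reward, baseline_m, baseline_r):
    # REINFORCE surrogate for Eq. (7): differentiating this loss yields the
    # advantage-weighted gradients for both modules (the client stays frozen).
    # logp_m: log pi_m(M, P | hist); logp_r: log pi_r(c | task, M, P).
    adv_m = reward - baseline_m        # Gamma(ans) - Gamma_bar_m
    adv_r = reward - baseline_r        # Gamma(ans) - Gamma_bar_r
    return -(adv_m * logp_m + adv_r * logp_r)
```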
## **3 Experiment**

In this section, we conduct comprehensive experimental studies to evaluate the effectiveness of GAM. We focus on the following three research questions. **RQ 1**: How does GAM perform compared with existing memory systems? **RQ 2**: How does GAM's performance vary across different scenarios? **RQ 3**: How do key technical factors within GAM influence its performance?

## **3.1 Experiment Setting**

**Datasets.** To rigorously evaluate the effectiveness of GAM, specifically 1) the memory's ability to preserve historical information and 2) its ability to support downstream task completion, we employ the following benchmarks in our experimental studies. **1) LoCoMo** [11]: a widely used memory benchmark for conversational settings, designed to evaluate an agent's ability to maintain and recall information across extended multi-session dialogues. We adopt its single-hop, multi-hop, temporal-reasoning, and open-domain tasks in our experiments. **2) HotpotQA** [16]: a popular multi-hop question-answering benchmark based on the Wikipedia corpus. We use the curated memory-evaluation dataset from MemAgent [7], which concatenates gold supporting documents with distracting passages. By varying the number of distractors, the dataset provides three versions with context lengths of 56K, 224K, and 448K tokens. **3) RULER** [12]: a popular long-context understanding benchmark with four types of evaluation tasks, including retrieval (Retri.), multi-hop tracing (MT), aggregation (AGG.), and question answering (QA). We use the 128K-token setting in our experiments. **4) NarrativeQA** [13]: a long-context question-answering benchmark that provides an entire book or movie script as the input context for each sample. We randomly sample a subset of 300 questions for evaluation, whose average token length is 87K.

**Baselines.** We consider the following baselines in our experiments. **1) Memory-free methods**, including the _brute-force long-LLM_ (long-LLM for brevity) and retrieval-augmented generation (RAG). The long-LLM baseline attempts to process the entire input within the model's context window. When the number of input tokens exceeds the maximum allowable context length $L_{\max}$, the input is evenly partitioned into $N$ chunks of length $L_{\max}$: $\{S_1, \dots, S_N\}$, and the final score is reported as the maximum over all chunks: $\max\{\text{LLM}(S_1), \dots, \text{LLM}(S_N)\}$. For the RAG baseline, the input is uniformly partitioned into segments of 2,048 tokens, and the top-5 retrieved segments are used to perform the downstream task. **2) Memory-based methods**, including A-Mem [8], Mem0 [9], MemoryOS [10], and LightMem [17]. These approaches construct specialized memory structures to store historical information, which can be utilized to address memory-related tasks at runtime.
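For clarity, the chunk-and-max protocol of the long-LLM baseline can be sketched as follows (`score_with_llm` is a stand-in for one evaluation pass; this is not code from the paper):

```python
# Sketch of the long-LLM baseline's chunk-and-max scoring protocol.
def long_llm_score(tokens, l_max, score_with_llm):
    if len(tokens) <= l_max:                 # fits in the context window
        return score_with_llm(tokens)
    chunks = [tokens[i:i + l_max] for i in range(0, len(tokens), l_max)]
    return max(score_with_llm(chunk) for chunk in chunks)   # max over S_1..S_N
```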
**Implementation Details.** In our experiments, we adopt GPT-4o-mini and Qwen2.5-14B-Instruct [18] as the backbone models for both GAM and all baselines. Both LLMs offer a long context window of 128K tokens. We use BGE-M3 [19] as the default dense retriever. For GAM's detailed configuration, we set the maximum reflection depth to 3 and the maximum number of retrieved pages to 5. The input context is segmented into 2,048-token pages for stream processing in the memorization module.
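Collected in one place, the configuration described above might be expressed as the following illustrative settings dict (key names are ours; values are from the paper):

```python
# Illustrative summary of GAM's experimental configuration (key names ours).
GAM_CONFIG = {
    "backbones": ["gpt-4o-mini", "Qwen2.5-14B-Instruct"],  # 128K context each
    "dense_retriever": "BAAI/bge-m3",
    "max_reflection_depth": 3,
    "max_retrieved_pages": 5,
    "page_size_tokens": 2048,
}
```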
## **3.2 Main Results: Overall Effectiveness**

Table 1: Results from GAM and baselines (memory-free and memory-based) on LoCoMo, HotpotQA, RULER, and NarrativeQA. Two LLMs, GPT-4o-mini and Qwen2.5-14B, are used in the experiment.

(a) Results on LoCoMo.

| Model | Method | Single-Hop F1 | Single-Hop BLEU-1 | Multi-Hop F1 | Multi-Hop BLEU-1 | Temporal F1 | Temporal BLEU-1 | Open-Domain F1 | Open-Domain BLEU-1 |
|---|---|---|---|---|---|---|---|---|---|
| GPT-4o-mini | LONG-LLM | 46.68 | 37.54 | 29.23 | 22.76 | 25.97 | 19.42 | 16.87 | 13.70 |
| | RAG | 52.45 | 47.94 | 27.50 | 20.13 | 46.07 | 40.35 | 23.23 | 17.94 |
| | A-MEM | 44.65 | 37.06 | 27.02 | 20.09 | 45.85 | 36.67 | 12.14 | 12.00 |
| | MEM0 | 47.65 | 38.72 | 38.72 | 27.13 | 48.93 | 40.51 | 28.64 | 21.58 |
| | MEMORYOS | 48.62 | 42.99 | 35.27 | 25.22 | 41.15 | 30.76 | 20.02 | 16.52 |
| | LIGHTMEM | 41.79 | 37.83 | 29.78 | 24.80 | 43.71 | 39.72 | 16.89 | 13.92 |
| | GAM | **57.75** | **52.10** | **42.29** | **34.44** | **59.45** | **53.11** | **33.30** | **26.97** |
| Qwen2.5-14B | LONG-LLM | 46.05 | 39.56 | 32.08 | 24.46 | 30.51 | 24.45 | 14.89 | 11.41 |
| | RAG | 47.87 | 42.79 | 26.38 | 19.54 | 30.78 | 25.97 | 14.16 | 10.52 |
| | A-MEM | 33.75 | 30.04 | 22.09 | 15.28 | 27.19 | 22.05 | 13.49 | 10.74 |
| | MEM0 | 42.58 | 35.15 | 31.73 | 24.82 | 28.96 | 26.24 | 15.03 | 11.28 |
| | MEMORYOS | 46.33 | 41.62 | 38.19 | 29.26 | 32.24 | 27.86 | 20.27 | 15.94 |
| | LIGHTMEM | 34.92 | 31.22 | 25.45 | 19.61 | 32.03 | 27.70 | 15.81 | 11.81 |
| | GAM | **58.93** | **53.74** | **42.96** | **34.48** | **51.52** | **44.43** | **30.63** | **26.04** |

(b) Results on HotpotQA, RULER, and NarrativeQA.

| Model | Method | HotpotQA 56K F1 | HotpotQA 224K F1 | HotpotQA 448K F1 | RULER Retri. Acc. | RULER MT Acc. | RULER AGG. Acc. | RULER QA Acc. | NarrativeQA F1 |
|---|---|---|---|---|---|---|---|---|---|
| GPT-4o-mini | LONG-LLM | 56.56 | 54.29 | 53.92 | 80.30 | 60.60 | 36.70 | 61.60 | 31.26 |
| | RAG | 52.71 | 51.84 | 54.01 | 94.25 | 0.00 | 35.50 | 55.90 | 25.00 |
| | A-MEM | 33.90 | 30.22 | 31.37 | 44.23 | 0.00 | 29.20 | 46.50 | 27.07 |
| | MEM0 | 32.58 | 31.74 | 27.41 | 46.83 | 53.80 | 34.10 | 51.70 | 29.16 |
| | MEMORYOS | 26.47 | 23.10 | 24.16 | 63.10 | 2.40 | 35.60 | 36.90 | 26.70 |
| | LIGHTMEM | 40.93 | 35.28 | 30.02 | 27.63 | 36.20 | 34.00 | 52.60 | 17.51 |
| | GAM | **63.22** | **64.56** | **59.81** | **97.70** | **93.20** | **42.50** | **72.50** | **36.86** |
| Qwen2.5-14B | LONG-LLM | 49.75 | 46.82 | 43.17 | 70.85 | 80.00 | 15.40 | 45.60 | 29.69 |
| | RAG | 51.81 | 46.72 | 48.36 | 92.78 | 0.00 | 24.70 | 47.80 | 18.29 |
| | A-MEM | 27.04 | 25.65 | 22.92 | 39.73 | 0.00 | 25.80 | 40.20 | 25.18 |
| | MEM0 | 30.12 | 32.44 | 26.55 | 43.03 | 41.20 | 31.50 | 46.10 | 27.80 |
| | MEMORYOS | 24.58 | 30.25 | 23.13 | 54.58 | 3.00 | 5.20 | 34.60 | 23.45 |
| | LIGHTMEM | 37.30 | 27.72 | 28.25 | 27.53 | 17.40 | 25.60 | 53.00 | 16.57 |
| | GAM | **64.07** | **55.99** | **57.87** | **93.43** | **90.20** | **36.10** | **74.50** | **34.77** |

Table 1 presents the main results of GAM and the baselines on the experimental benchmarks, from which the following observations can be made. First, GAM consistently outperforms all baselines, including both memory-free and memory-based approaches, across every benchmark. Moreover, its advantage becomes particularly pronounced on benchmarks like HotpotQA and RULER, where tasks require multi-step retrieval and reasoning over information dispersed within the input context. For instance, GAM achieves over 90% accuracy on the multi-hop tracing (MT) tasks in the RULER benchmark, which demand tracking variable values across multiple steps of assignment; in contrast, most baselines fail to achieve satisfactory performance under such complexity. Finally, GAM maintains stable and competitive performance under varying input-context lengths, as reflected in the results on HotpotQA. In summary, these experimental results preliminarily verify GAM's overall effectiveness and its robustness to task complexity and growing context lengths.

Beyond the main observations, we note several further findings. First, the performance of long-LLMs falls short of expectations compared with the other methods, even though they adopt LLMs with a 128K context window, long enough to fully cover the input context in LoCoMo, HotpotQA-56K, and NarrativeQA. This suggests that simply extending the context window is insufficient to effectively address long-context challenges. It also aligns with the recently discussed phenomenon of context rot[4], which indicates that substantial distracting or irrelevant information within long contexts can severely degrade LLMs' performance. Second, direct applications of retrieval, i.e., RAG, exhibit highly variable performance across different scenarios. RAG improves performance over long-LLMs and the memory-based methods when the relevant information is explicitly presented, such as in LoCoMo single-hop and RULER retrieval. However, it performs badly in HotpotQA, RULER multi-hop tracing, and RULER aggregation tasks, where relevant information is unobvious. In comparison, the memory-based methods show lower variance but remain constrained by the loss of crucial details of the original context. In contrast, GAM leverages memory to support effective retrieval of task-relevant information, enabling it to achieve substantially improved performance.

> [4] https://research.trychroma.com/context-rot

## **3.3 Model's Impact**

Table 2 presents the performance of GAM on HotpotQA and NarrativeQA implemented with different LLMs. We apply Qwen2.5 variants of different sizes (from 0.5B to 32B) and GPT-4o-mini as the backbones of the memorization and research modules. As demonstrated by the experimental results, larger and stronger LLM backbones for both the memorizer and the researcher yield consistent performance improvements, indicating that GAM can effectively leverage increased LLM capacity to improve its memory quality. However, we also observe that the research module is considerably more sensitive to the LLM's scale than the memorization module. Notably, GAM maintains strong performance even when the memorizer is downsized, remaining competitive with the smallest Qwen2.5-0.5B model. In contrast, GAM's overall performance deteriorates significantly when the research module's backbone is reduced to 7B or smaller. This discrepancy reflects the distinct complexity of the two modules: the memorizer primarily extracts salient information from the input context, which is a relatively straightforward task, whereas the researcher must conduct iterative planning, searching, and reflection, which is much more complex and thus demands greater model capacity.
Table 2: Model's impact on the memorizer (a) and the researcher (b), reflected by GAM's performance.

(a) Memorizer

| Model | HotpotQA 56K F1 | HotpotQA 224K F1 | HotpotQA 448K F1 | NarrativeQA F1 | Avg |
|---|---|---|---|---|---|
| QWEN2.5 0.5B | 56.46 | 55.96 | 53.33 | 29.55 | 48.83 |
| QWEN2.5 3B | 58.05 | 56.52 | 55.50 | 32.10 | 50.54 |
| QWEN2.5 7B | 59.06 | 58.34 | 56.17 | 32.53 | 51.53 |
| QWEN2.5 14B | 64.07 | 55.99 | 57.87 | 34.77 | 53.18 |
| QWEN2.5 32B | 63.05 | 59.75 | 56.26 | 34.94 | 53.50 |
| GPT-4O-MINI | 64.77 | 59.29 | 57.25 | 34.87 | 54.05 |

(b) Researcher

| Model | HotpotQA 56K F1 | HotpotQA 224K F1 | HotpotQA 448K F1 | NarrativeQA F1 | Avg |
|---|---|---|---|---|---|
| QWEN2.5 0.5B | 10.03 | 11.14 | 11.64 | 3.50 | 9.08 |
| QWEN2.5 3B | 39.76 | 37.16 | 33.04 | 23.96 | 33.48 |
| QWEN2.5 7B | 51.95 | 47.95 | 48.55 | 26.93 | 43.85 |
| QWEN2.5 14B | 64.07 | 55.99 | 57.87 | 34.77 | 53.18 |
| QWEN2.5 32B | 61.93 | 59.19 | 61.53 | 35.33 | 54.50 |
| GPT-4O-MINI | 62.06 | 62.97 | 61.54 | 35.24 | 55.45 |

## **3.4 Increasing Test-Time Computation**

As shown in Figure 2, we investigate the impact of increasing test-time computation from two perspectives: 1) the depth of reflection and 2) the number of retrieved pages. First, we vary the maximum reflection depth from 1 to 5 (3 by default), allowing GAM to perform additional research steps when necessary. Note that GAM autonomously determines the actual number of reflections and does not always reach the maximum step. This increased flexibility enables GAM to collect more information from the page-store, thus yielding consistent performance improvements across all datasets. However, the marginal gains gradually diminish, as many tasks do not require deep multi-step reasoning. Second, we increase the number of retrieved pages from 3 to 20 (5 by default), enabling GAM to browse more pages in each step of research. The increase in retrieval results also leads to consistent performance improvements. Overall, both forms of increased test-time computation result in steady performance gains, which demonstrates GAM's ability to benefit from test-time scaling, an advantage that baseline methods lack due to their fixed workflows.

**[Plots omitted.]** Panels: (a) impact of maximum reflection depth; (b) impact of the number of retrieved pages.

Figure 2: Impact of increasing test-time computation in reflection (top) and retrieval (bottom).
## **3.5 Detailed Factors' Analysis**

We perform ablation studies to analyze other influential factors, including the search tools, the composition of GAM's modules, and the output formats.

First, we examine the impact of each search tool and their combinations. As shown in Table 3, combining any two of the search tools yields better performance than using each single tool alone, and the joint use of all three tools (i.e., GAM with the default setting) achieves the best performance. This observation validates the effectiveness of the search tools. Moreover, employing multiple tools enables broader exploration of the page-store, leading to better coverage of relevant information and, consequently, improved performance.

Table 3: Ablation study of detailed factors.

| Method | HotpotQA 56K F1 | HotpotQA 224K F1 | HotpotQA 448K F1 | NarrativeQA F1 | Avg F1 |
|---|---|---|---|---|---|
| GAM | 64.07 | 55.99 | 57.87 | 34.77 | 53.18 |
| **Tools** | | | | | |
| ONLY PAGE-ID | 44.86 | 21.65 | 19.02 | 30.30 | 28.96 |
| ONLY EMBEDDING | 39.59 | 32.71 | 26.67 | 30.25 | 32.31 |
| ONLY BM25 | 59.24 | 52.29 | 51.52 | 31.50 | 48.64 |
| EMBEDDING+PAGE-ID | 47.25 | 34.78 | 28.43 | 33.41 | 35.97 |
| EMBEDDING+BM25 | 61.37 | 55.00 | 54.90 | 33.20 | 51.12 |
| BM25+PAGE-ID | 63.57 | 55.38 | 55.62 | 32.05 | 51.66 |
| **Modules** | | | | | |
| RESEARCH WITHOUT MEMORY | 57.40 | 49.72 | 53.98 | 31.97 | 48.27 |
| MEMORY WITHOUT RESEARCH | 42.67 | 19.75 | 17.38 | 30.18 | 27.50 |

Table 4: Performance across different output formats.

| Output format | Metric | HotpotQA 56K | HotpotQA 224K | HotpotQA 448K | NarrativeQA | Avg |
|---|---|---|---|---|---|---|
| INTEGRATION ONLY | F1 | 64.07 | 55.99 | 57.87 | 34.77 | 53.18 |
| | Tokens | 103.42 | 102.55 | 109.98 | 107.64 | 105.90 |
| INTEGRATION WITH PAGE | F1 | 68.66 | 59.77 | 59.42 | 34.99 | 55.71 |
| | Tokens | 1444.30 | 499.23 | 620.11 | 6955.62 | 2379.82 |
| INTEGRATION WITH EXTRACTION | F1 | 67.41 | 57.83 | 57.81 | 34.82 | 54.47 |
| | Tokens | 220.78 | 227.57 | 230.47 | 244.20 | 230.76 |

Second, we evaluate GAM's performance when each module is used in isolation, namely 1) research without memory and 2) memory without research. According to the experimental results in Table 3, using the research module alone leads to a substantial performance drop compared with the complete GAM system, highlighting the crucial role of memory in supporting effective exploration of relevant information. Using the memory module alone results in even worse performance, indicating that the pre-computed memory is prone to severe information loss. This observation further echoes our previous conclusion that the pre-constructed memory used in traditional ahead-of-time paradigms is far more limited than the just-in-time approach adopted by GAM.

Third, we explore the impact of different forms of output, including 1) the researcher's integration result (default), 2) the integration result accompanied by the relevant pages that provided its source information, and 3) the integration result paired with extracted source snippets from those relevant pages. As shown in Table 4, using only the integration result already achieves highly competitive performance. However, augmenting it with source information from the relevant pages yields further improvements, as it helps mitigate the loss of fine-grained details that may occur during integration.
## **3.6 Efficiency**

To assess the working efficiency of GAM, we measure the average time consumption, including both offline memory construction and online serving, when processing HotpotQA tasks under the 56K, 224K, and 448K settings. As shown in Table 5, GAM incurs a time cost comparable to Mem0 and MemoryOS, and is substantially faster than A-mem. All methods exhibit approximately linear growth in offline construction time as context length increases, while maintaining relatively stable online serving time. Overall, GAM delivers strong performance with competitive efficiency, offering the best cost-effectiveness among the evaluated approaches.

Table 5: Efficiency analysis on HotpotQA.

| Dataset | Metric | A-mem | Mem0 | MemoryOS | LightMem | GAM |
|---|---|---|---|---|---|---|
| HotpotQA 56K | OFFLINE BUILD (s) | 209.74 | 37.42 | 80.36 | 4.93 | 56.89 |
| | ONLINE SERVE (s) | 0.52 | 0.15 | 0.44 | 0.20 | 12.43 |
| | TOTAL (s) | 210.26 | 37.57 | 80.80 | 5.13 | 69.32 |
| | ANSWER QUALITY (F1) | 27.04 | 30.12 | 24.58 | 37.30 | 64.07 |
| HotpotQA 224K | OFFLINE BUILD (s) | 904.99 | 165.30 | 325.70 | 16.61 | 252.72 |
| | ONLINE SERVE (s) | 0.48 | 0.17 | 0.55 | 0.25 | 16.65 |
| | TOTAL (s) | 905.46 | 165.47 | 326.25 | 16.86 | 269.37 |
| | ANSWER QUALITY (F1) | 25.65 | 32.44 | 30.25 | 27.72 | 55.99 |
| HotpotQA 448K | OFFLINE BUILD (s) | 1796.82 | 274.87 | 702.72 | 40.56 | 557.16 |
| | ONLINE SERVE (s) | 0.47 | 0.18 | 0.46 | 0.21 | 18.49 |
| | TOTAL (s) | 1797.29 | 275.05 | 703.18 | 40.78 | 575.65 |
| | ANSWER QUALITY (F1) | 22.92 | 26.55 | 23.13 | 28.25 | 57.87 |

## **4 Conclusion**

In this paper, we present a novel memory system called General Agentic Memory (GAM), which is developed under the just-in-time compilation principle. GAM employs a dual framework comprising a memorizer and a researcher. During the offline stage, the memorizer extracts the key information from its incoming context into a lightweight memory and preserves the complete information within a page-store. For each online request, the researcher performs deep research over the page-store based on the pre-constructed memory, generating concise yet informative context to support the downstream task. We perform comprehensive empirical studies using a variety of popular memory and long-context benchmarks, whose results validate the effectiveness of GAM given its significant and consistent improvements over existing methods.
## **References**

- [1] Yuheng Cheng, Ceyao Zhang, Zhengwen Zhang, Xiangrui Meng, Sirui Hong, Wenhao Li, Zihao Wang, Zekai Wang, Feng Yin, Junhua Zhao, et al. Exploring large language model based intelligent agents: Definitions, methods, and prospects. _arXiv preprint arXiv:2401.03428_, 2024.

- [2] Xiang Deng, Yu Gu, Boyuan Zheng, Shijie Chen, Sam Stevens, Boshi Wang, Huan Sun, and Yu Su. Mind2Web: Towards a generalist agent for the web. _Advances in Neural Information Processing Systems_, 36:28091–28114, 2023.

- [3] Wei Tao, Yucheng Zhou, Yanlin Wang, Wenqiang Zhang, Hongyu Zhang, and Yu Cheng. MAGIS: LLM-based multi-agent framework for GitHub issue resolution. _Advances in Neural Information Processing Systems_, 37:51963–51993, 2024.

- [4] Samuel Schmidgall, Yusheng Su, Ze Wang, Ximeng Sun, Jialian Wu, Xiaodong Yu, Jiang Liu, Michael Moor, Zicheng Liu, and Emad Barsoum. Agent laboratory: Using LLM agents as research assistants. _arXiv preprint arXiv:2501.04227_, 2025.

- [5] Shouyuan Chen, Sherman Wong, Liangjian Chen, and Yuandong Tian. Extending context window of large language models via positional interpolation. _arXiv preprint arXiv:2306.15595_, 2023.

- [6] Zeyu Zhang, Quanyu Dai, Xiaohe Bo, Chen Ma, Rui Li, Xu Chen, Jieming Zhu, Zhenhua Dong, and Ji-Rong Wen. A survey on the memory mechanism of large language model-based agents. _ACM Transactions on Information Systems_, 43(6):1–47, 2025.

- [7] Hongli Yu, Tinghong Chen, Jiangtao Feng, Jiangjie Chen, Weinan Dai, Qiying Yu, Ya-Qin Zhang, Wei-Ying Ma, Jingjing Liu, Mingxuan Wang, et al. MemAgent: Reshaping long-context LLM with multi-conv RL-based memory agent. _arXiv preprint arXiv:2507.02259_, 2025.

- [8] Wujiang Xu, Kai Mei, Hang Gao, Juntao Tan, Zujie Liang, and Yongfeng Zhang. A-Mem: Agentic memory for LLM agents. _arXiv preprint arXiv:2502.12110_, 2025.

- [9] Prateek Chhikara, Dev Khant, Saket Aryan, Taranjeet Singh, and Deshraj Yadav. Mem0: Building production-ready AI agents with scalable long-term memory. _arXiv preprint arXiv:2504.19413_, 2025.

- [10] Jiazheng Kang, Mingming Ji, Zhe Zhao, and Ting Bai. Memory OS of AI agent. _arXiv preprint arXiv:2506.06326_, 2025.

- [11] Adyasha Maharana, Dong-Ho Lee, Sergey Tulyakov, Mohit Bansal, Francesco Barbieri, and Yuwei Fang. Evaluating very long-term conversational memory of LLM agents. _arXiv preprint arXiv:2402.17753_, 2024.

- [12] Cheng-Ping Hsieh, Simeng Sun, Samuel Kriman, Shantanu Acharya, Dima Rekesh, Fei Jia, Yang Zhang, and Boris Ginsburg. RULER: What's the real context size of your long-context language models? _arXiv preprint arXiv:2404.06654_, 2024.

- [13] Tomáš Kočiský, Jonathan Schwarz, Phil Blunsom, Chris Dyer, Karl Moritz Hermann, Gábor Melis, and Edward Grefenstette. The NarrativeQA reading comprehension challenge. _Transactions of the Association for Computational Linguistics_, 6:317–328, 2018.

- [14] Kun Luo, Zheng Liu, Shitao Xiao, Tong Zhou, Yubo Chen, Jun Zhao, and Kang Liu. Landmark embedding: A chunking-free embedding method for retrieval-augmented long-context large language models. In _Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)_, pages 3268–3281, 2024.

- [15] Anthropic. Introducing contextual retrieval. _https://www.anthropic.com/engineering/contextual-retrieval_, 2024.

- [16] Zhilin Yang, Peng Qi, Saizheng Zhang, Yoshua Bengio, William W Cohen, Ruslan Salakhutdinov, and Christopher D Manning. HotpotQA: A dataset for diverse, explainable multi-hop question answering. _arXiv preprint arXiv:1809.09600_, 2018.

- [17] Jizhan Fang, Xinle Deng, Haoming Xu, Ziyan Jiang, Yuqi Tang, Ziwen Xu, Shumin Deng, Yunzhi Yao, Mengru Wang, Shuofei Qiao, et al. LightMem: Lightweight and efficient memory-augmented generation. _arXiv preprint arXiv:2510.18866_, 2025.

- [18] Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, et al. Qwen technical report. _arXiv preprint arXiv:2309.16609_, 2023.

- [19] Jianlv Chen, Shitao Xiao, Peitian Zhang, Kun Luo, Defu Lian, and Zheng Liu. BGE M3-Embedding: Multi-lingual, multi-functionality, multi-granularity text embeddings through self-knowledge distillation. _arXiv preprint arXiv:2402.03216_, 2024.
## **Appendix**

## **Baseline Reproduction Details**

When reproducing the baseline methods on the LoCoMo dataset, we found that the category labels used for A-Mem, Mem0, and MemoryOS were incorrect. Based on the official LoCoMo annotations, we corrected the corresponding category–label mapping.

## **Prompts**

**Prompt for Memorizing**

You are the MemoryAgent. Your job is to write one concise abstract that can be stored as long-term memory.

MAIN OBJECTIVE: Generate a concise, self-contained, and coherent abstract of INPUT_MESSAGE that preserves ALL important information in INPUT_MESSAGE. MEMORY_CONTEXT is provided so you can understand the broader situation, such as people, modules, decisions, and ongoing tasks, and keep wording consistent.

INPUTS:
**MEMORY_CONTEXT**: {memory_context}
**INPUT_MESSAGE**: {input_message}

YOUR TASK:
1. Read INPUT_MESSAGE and extract all specific, memory-relevant information, such as:
   - plans, goals, decisions, requests, preferences
   - actions taken, next steps, assignments, and responsibilities
   - problems, blockers, bugs, questions that need follow-up
   - specific facts such as names, dates, numbers, locations
2. Use MEMORY_CONTEXT to:
   - resolve or disambiguate the entities, components, tasks, or resources mentioned in INPUT_MESSAGE,
   - keep terminology (names of agents, modules, datasets, etc.) consistent with prior usage,
   - include minimal background context if it is required for the abstract to be understandable.
   You MUST NOT invent or add information that appears only in MEMORY_CONTEXT and is NOT implied or mentioned in INPUT_MESSAGE.
3. Your abstract MUST:
   - summarize all important content from INPUT_MESSAGE,
   - be understandable on its own without seeing INPUT_MESSAGE,
   - be factual and specific.

STYLE RULES:
- Output exactly ONE concise paragraph. No bullet points.
- Do NOT include meta phrases like "The user said..." or "The conversation is about...".
- Do NOT give advice, opinions, or suggestions.
- Do NOT ask questions.
- Do NOT include anything that is not grounded in INPUT_MESSAGE.

OUTPUT FORMAT: Return ONLY the single paragraph. Do NOT add any headings or labels.
## **Prompt for Planning Part 1**

You are the PlanningAgent. Your job is to generate a concrete retrieval plan for how to gather the information needed to answer the QUESTION. You must use the QUESTION and the current MEMORY (which contains abstracts of all messages so far).

**QUESTION**: {request}
**MEMORY**: {memory}

PLANNING PROCEDURE:
1. Interpret the QUESTION using the context in MEMORY. Identify what is needed to satisfy the QUESTION.
2. Break that need into concrete "info needs": specific sub-questions you must answer to fully respond to the QUESTION.
3. For each info need, decide which retrieval tools are useful. You may assign multiple tools to the same info need:
   - Use "keyword" for exact entities / functions / key attributes.
   - Use "vector" for conceptual understanding.
   - Use "page_index" if MEMORY already points to clearly relevant page indices.
4. Build the final plan:
   - "info_needs": a list of all the specific sub-questions / missing facts you still need.
   - "tools": which of ["keyword","vector","page_index"] you will actually use in this plan. This can include more than one tool.
   - "keyword_collection": a list of short keyword-style queries you will issue.
   - "vector_queries": a list of semantic / natural-language queries you will issue.
   - "page_index": a list of integer page indices you plan to read fully.

AVAILABLE RETRIEVAL TOOLS: All of the following retrieval tools are available to you. You may select one, several, or all of them in the same plan to maximize coverage. Parallel use of multiple tools is allowed and encouraged if it helps answer the QUESTION.
1. "keyword"
   - WHAT IT DOES: Exact keyword-match retrieval. It finds pages that contain specific names, function names, key attributes, etc.
   - HOW TO USE: Provide short, high-signal keywords. Do NOT write long natural-language questions here. Use crisp keywords and phrases that should literally appear in relevant text.
2. "vector"
   - WHAT IT DOES: Semantic retrieval by meaning. It finds conceptually related pages. This is good for high-level questions, reasoning questions, or "how/why" style questions.
   - HOW TO USE: Write each query as a short natural-language sentence that clearly states what you want to know, using full context and entities from MEMORY and QUESTION. Example style: "How does the DenseRetriever assign GPUs during index building?"

## **Prompt for Planning Part 2**

3. "page_index"
   - WHAT IT DOES: Directly ask to re-read full pages (by page ID) that are already known to be relevant. MEMORY may mention specific page IDs or indices that correspond to important configs, attributes, or names. Use this if you already know specific page indices that should be inspected in full.
   - HOW TO USE: Return a list of those integer page indices (e.g. [0, 2, 5]), max 5 pages. You MUST NOT invent or guess page indices.

RULES:
- Avoid simple repetition. Whether keywords or sentences for search, make them as independent as possible rather than duplicated.
- Be specific. Avoid vague items like "get more details" or "research background".
- Every string in "keyword_collection" and "vector_queries" must be directly usable as a retrieval query.
- You may include multiple tools. Do NOT limit yourself to a single tool if more than one is useful.
- Do NOT invent tools. Only use "keyword", "vector", "page_index".
- Do NOT invent page indices. If you are not sure about a page index, return [].
- You are only planning retrieval. Do NOT answer the QUESTION here.

THINKING STEP:
- Before producing the output, think through the procedure and choices inside <think>...</think>.
- Keep the <think> concise but sufficient to validate decisions.
- After </think>, output ONLY the JSON object specified below. The <think> section must NOT be included in the JSON.

OUTPUT JSON SPEC: Return ONE JSON object with EXACTLY these keys:
- "info_needs": array of strings (required)
- "tools": array of strings from ["keyword","vector","page_index"] (required)
- "keyword_collection": array of strings (required)
- "vector_queries": array of strings (required)
- "page_index": array of integers (required), max 5.
All keys MUST appear. After the <think> section, return ONLY the JSON object. Do NOT include any commentary or explanation outside the JSON.
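As an invented example (not from the paper), a plan conforming to the JSON spec above could look like the following, shown as a Python dict:

```python
# Invented example of a conforming plan object (all five required keys present;
# no page indices are guessed, so "page_index" is empty).
example_plan = {
    "info_needs": [
        "How does the DenseRetriever assign GPUs during index building?",
    ],
    "tools": ["keyword", "vector"],
    "keyword_collection": ["DenseRetriever", "index building GPU assignment"],
    "vector_queries": [
        "How does the DenseRetriever assign GPUs during index building?",
    ],
    "page_index": [],
}
```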
|
| 343 |
+
|
| 344 |
+
--- end of page.page_number=13 ---
|
| 345 |
+
|
| 346 |
+
## **Prompt for Integrating Part 1**
|
| 347 |
+
|
| 348 |
+
You are the IntegrateAgent. Your job is to build an integrated factual summary for a QUESTION. YOU ARE GIVEN: - QUESTION: what must be answered. - EVIDENCE_CONTEXT: newly retrieved supporting evidence that may contain facts relevant to the QUESTION. - RESULT: the current working notes / draft summary about this same QUESTION (may be incomplete).
|
| 349 |
+
|
| 350 |
+
## YOUR OBJECTIVE:
|
| 351 |
+
|
| 352 |
+
Produce an UPDATED_RESULT that is a consolidated factual summary of all information that is relevant to the QUESTION. This is NOT a final answer to the QUESTION. It is an integrated summary of all useful facts that could be used to answer the QUESTION.
|
| 353 |
+
|
| 354 |
+
The UPDATED_RESULT must: 1. Keep useful, correct, on-topic information from RESULT. 2. Add any new, relevant, well-supported facts from EVIDENCE_CONTEXT. 3. Remove anything that is off-topic for the QUESTION.
|
| 355 |
+
|
| 356 |
+
**QUESTION** : {question} **EVIDENCE_CONTEXT** : {evidence_context} **RESULT** : {result}
|
| 357 |
+
|
| 358 |
+
INSTRUCTIONS: 1. Understand the QUESTION. Identify exactly what needs to be answered. 2. From RESULT: - Keep any statements that are relevant to the QUESTION. 3. From EVIDENCE_CONTEXT: - Extract every fact that helps describe, clarify, or support an answer to the QUESTION. - Prefer concrete details such as entities, numbers, versions, decisions, timelines, outcomes, responsibilities, constraints. - Ignore anything unrelated to the QUESTION. 4. Synthesis: - Merge the selected content from RESULT with the selected content from EVIDENCE_CONTEXT. - The merged text MUST read as one coherent factual summary related to the QUESTION (not the direct answer). - The merged summary MUST collect all important factual information needed to answer the QUESTION, so it can stand alone later without needing RESULT or EVIDENCE_CONTEXT. - Do NOT add interpretation, recommendations, or conclusions beyond what is explicitly stated in RESULT or EVIDENCE_CONTEXT.
|
| 359 |
+
|
| 360 |
+
--- end of page.page_number=14 ---
|
| 361 |
+
|
| 362 |
+
**Prompt for Integrating Part 2** RULES: - "content" MUST ONLY include factual information that is relevant to the QUESTION. - You are NOT producing a final answer, decision, recommendation, or plan. You are producing a cleaned, merged factual summary. - Do NOT invent or infer facts that do not appear in RESULT or EVIDENCE_CONTEXT. - Do NOT include meta language (e.g. "the evidence says", "according to RESULT", "the model stated"). - Do NOT include instructions, reasoning steps, or analysis of your own process. - Do NOT include any keys other than "content" and "sources". - "sources" should on incluede the page_ids of the pages that supported the included facts. THINKING STEP - Before producing the output, think about selection and synthesis steps inside <think>...</think>. - Keep the <think> concise but sufficient to ensure correctness and relevance. - After </think>, output ONLY the JSON object. The <think> section must NOT be included in the JSON. OUTPUT JSON SPEC: Return ONE JSON object with EXACTLY: - "content": string. This is the UPDATED_RESULT, i.e. the integrated final information related to the QUESTION, if there not exist any useful information, just provide "". - "sources": array of strings/objects. Both keys MUST be present. After the <think> section, return ONLY the JSON object. Do NOT output Markdown, comments, headings, or explanations outside the JSON.
|
| 363 |
+
|
| 364 |
+
--- end of page.page_number=15 ---
|
| 365 |
+
|
| 366 |
+
**Prompt for InfoCheck**

You are the InfoCheckAgent. Your job is to judge whether the currently collected information is sufficient to answer a specific QUESTION.

YOU ARE GIVEN:
- REQUEST: the QUESTION that needs to be answered.
- RESULT: the current integrated factual summary about that QUESTION. RESULT is intended to contain all useful known information so far.

YOUR OBJECTIVE: Decide whether RESULT already contains all of the information needed to fully answer REQUEST with specific, concrete details. You are NOT answering REQUEST. You are only judging completeness.

**REQUEST** : {request} **RESULT** : {result}

EVALUATION PROCEDURE:
1. Decompose REQUEST:
   - Identify the key pieces of information that are required to answer REQUEST completely (facts, entities, steps, reasoning, comparisons, constraints, timelines, outcomes, etc.).
2. Check RESULT:
   - For each required piece, check whether RESULT already provides that information clearly and specifically.
   - RESULT must be specific enough that someone could now write a final answer directly from it without needing further retrieval.
3. Decide completeness:
   - "enough" = true ONLY IF RESULT covers all required pieces with sufficient clarity and specificity.
   - "enough" = false otherwise.

THINKING STEP
- Before producing the output, perform your decomposition and evaluation inside <think>...</think>.
- Keep the <think> concise but ensure it verifies completeness rigorously.
- After </think>, output ONLY the JSON object with the key specified below. The <think> section must NOT be included in the JSON.

OUTPUT REQUIREMENTS: Return ONE JSON object with EXACTLY this key:
- "enough": boolean. true if RESULT is sufficient to answer REQUEST fully; false otherwise.

RULES:
- Do NOT invent facts.
- Do NOT answer REQUEST.
- Do NOT include any explanation, reasoning, or extra keys.
- After the <think> section, return ONLY the JSON object.
|
| 367 |
+
|
| 368 |
+
--- end of page.page_number=16 ---
|
| 369 |
+
|
| 370 |
+
## **Prompt for Requests Generating**
|
| 371 |
+
|
| 372 |
+
You are the FollowUpRequestAgent. Your job is to propose targeted follow-up retrieval questions for missing information.
|
| 373 |
+
|
| 374 |
+
## YOU ARE GIVEN:
|
| 375 |
+
|
| 376 |
+
- REQUEST: the original QUESTION that we ultimately want to be able to answer.
- RESULT: the current integrated factual summary about this QUESTION. RESULT represents everything we know so far.
|
| 377 |
+
|
| 378 |
+
## YOUR OBJECTIVE:
|
| 379 |
+
|
| 380 |
+
Identify what important information is still missing from RESULT in order to fully answer REQUEST, and generate focused retrieval questions that would fill those gaps.
|
| 381 |
+
|
| 382 |
+
**REQUEST** : {request}
|
| 383 |
+
|
| 384 |
+
**RESULT** : {result}
|
| 385 |
+
|
| 386 |
+
## INSTRUCTIONS:
|
| 387 |
+
|
| 388 |
+
1. Read REQUEST and determine what information is required to answer it completely (facts, numbers, definitions, procedures, timelines, responsibilities, comparisons, outcomes, constraints, etc.).
|
| 389 |
+
|
| 390 |
+
2. Read RESULT and determine which of those required pieces are still missing, unclear, or underspecified.
|
| 391 |
+
|
| 392 |
+
3. For each missing piece, generate ONE standalone retrieval question that would directly obtain that missing information.
|
| 393 |
+
|
| 394 |
+
- Each question MUST:
|
| 395 |
+
|
| 396 |
+
- mention concrete entities / modules / components / datasets / events if they are known,
- ask for factual information that could realistically be found by retrieval (not "analyze", "think", "infer", or "judge").
|
| 397 |
+
|
| 398 |
+
4. Rank the questions from most critical missing information to least critical.
|
| 399 |
+
|
| 400 |
+
5. Produce at most 5 questions.
|
| 401 |
+
|
| 402 |
+
## THINKING STEP
|
| 403 |
+
|
| 404 |
+
- Before producing the output, reason about gaps and prioritize inside <think>...</think>.
|
| 405 |
+
|
| 406 |
+
- Keep the <think> concise but ensure the prioritization makes sense.
- After </think>, output ONLY the JSON object specified below. The <think> section must NOT be included in the JSON.
|
| 407 |
+
|
| 408 |
+
## OUTPUT FORMAT:
|
| 409 |
+
|
| 410 |
+
Return ONE JSON object with EXACTLY this key:
|
| 411 |
+
|
| 412 |
+
- "new_requests": array of strings (0 to 5 items). Each string is one retrieval question.
|
| 413 |
+
|
| 414 |
+
## RULES:
|
| 415 |
+
|
| 416 |
+
- Do NOT include any extra keys besides "new_requests".
|
| 417 |
+
|
| 418 |
+
- After the <think> section, do NOT include explanations, reasoning steps, or Markdown outside the JSON.
|
| 419 |
+
|
| 420 |
+
- Do NOT generate vague requests like "Get more info".
|
| 421 |
+
|
| 422 |
+
- Do NOT answer REQUEST yourself.
|
| 423 |
+
|
| 424 |
+
- Do NOT invent facts that are not asked for by REQUEST.
|
| 425 |
+
|
| 426 |
+
After the <think> section, return ONLY the JSON object.
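Taken together, the three prompts define three small structured outputs; a hedged sketch of how they could be modeled with Pydantic (class names are illustrative and may differ from the repo's actual schemas):

```python
from pydantic import BaseModel, Field

class IntegrateOutput(BaseModel):
    """Output of the IntegrateAgent prompt."""
    content: str          # consolidated factual summary ("" if nothing useful)
    sources: list[str]    # page_ids that supported the included facts

class InfoCheckOutput(BaseModel):
    """Output of the InfoCheckAgent prompt."""
    enough: bool          # True iff RESULT fully covers REQUEST

class FollowUpOutput(BaseModel):
    """Output of the FollowUpRequestAgent prompt."""
    new_requests: list[str] = Field(default_factory=list, max_length=5)

req = FollowUpOutput(new_requests=["Which embedding model does the pipeline use?"])
```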
|
| 427 |
+
|
| 428 |
+
--- end of page.page_number=17 ---
|
| 429 |
+
|
docs/markdowns/deepanalyze.md
ADDED
|
The diff for this file is too large to render.
|
|
|
docs/markdowns/deepseek_ocr.md
ADDED
|
@@ -0,0 +1,519 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# **DeepSeek-OCR: Contexts Optical Compression**
|
| 2 |
+
|
| 3 |
+
Haoran Wei, Yaofeng Sun, Yukun Li
|
| 4 |
+
|
| 5 |
+
**DeepSeek-AI**
|
| 6 |
+
|
| 7 |
+
## **Abstract**
|
| 8 |
+
|
| 9 |
+
We present DeepSeek-OCR as an initial investigation into the feasibility of compressing long contexts via optical 2D mapping. DeepSeek-OCR consists of two components: DeepEncoder and DeepSeek3B-MoE-A570M as the decoder. Specifically, DeepEncoder serves as the core engine, designed to maintain low activations under high-resolution input while achieving high compression ratios to ensure an optimal and manageable number of vision tokens. Experiments show that when the number of text tokens is within 10 times that of vision tokens (i.e., a compression ratio < 10×), the model can achieve decoding (OCR) precision of 97%. Even at a compression ratio of 20×, the OCR accuracy still remains at about 60%. This shows considerable promise for research areas such as historical long-context compression and memory forgetting mechanisms in LLMs. Beyond this, DeepSeek-OCR also demonstrates high practical value. On OmniDocBench, it surpasses GOT-OCR2.0 (256 tokens/page) using only 100 vision tokens, and outperforms MinerU2.0 (6000+ tokens per page on average) while utilizing fewer than 800 vision tokens. In production, DeepSeek-OCR can generate training data for LLMs/VLMs at a scale of 200k+ pages per day (a single A100-40G). Codes and model weights are publicly accessible at `http://github.com/deepseek-ai/DeepSeek-OCR` .
|
| 10 |
+
|
| 11 |
+
**==> picture [438 x 193] intentionally omitted <==**
|
| 12 |
+
|
| 13 |
+
**----- Start of picture text -----**<br>
|
| 14 |
+
(a) Compression on Fox benchmark (b) Performance on OmniDocBench<br>**----- End of picture text -----**<br>
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
Figure 1 | Figure (a) shows the compression ratio (number of text tokens in ground truth/number of vision tokens model used) testing on Fox [21] benchmark; Figure (b) shows performance comparisons on OmniDocBench [27]. DeepSeek-OCR can achieve state-of-the-art performance among end-to-end models enjoying the fewest vision tokens.
|
| 18 |
+
|
| 19 |
+
--- end of page.page_number=1 ---
|
| 20 |
+
|
| 21 |
+
## **Contents**
|
| 22 |
+
|
| 23 |
+
| Section | Page |
|---|---|
| **1 Introduction** | 3 |
| **2 Related Works** | 4 |
| 2.1 Typical Vision Encoders in VLMs | 4 |
| 2.2 End-to-end OCR Models | 4 |
| **3 Methodology** | 5 |
| 3.1 Architecture | 5 |
| 3.2 DeepEncoder | 5 |
| 3.2.1 Architecture of DeepEncoder | 5 |
| 3.2.2 Multiple resolution support | 6 |
| 3.3 The MoE Decoder | 7 |
| 3.4 Data Engine | 7 |
| 3.4.1 OCR 1.0 data | 7 |
| 3.4.2 OCR 2.0 data | 8 |
| 3.4.3 General vision data | 9 |
| 3.4.4 Text-only data | 9 |
| 3.5 Training Pipelines | 9 |
| 3.5.1 Training DeepEncoder | 10 |
| 3.5.2 Training DeepSeek-OCR | 10 |
| **4 Evaluation** | 10 |
| 4.1 Vision-text Compression Study | 10 |
| 4.2 OCR Practical Performance | 12 |
| 4.3 Qualitative Study | 12 |
| 4.3.1 Deep parsing | 12 |
| 4.3.2 Multilingual recognition | 16 |
| 4.3.3 General vision understanding | 17 |
| **5 Discussion** | 18 |
| **6 Conclusion** | 19 |
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
--- end of page.page_number=2 ---
|
| 55 |
+
|
| 56 |
+
## **1. Introduction**
|
| 57 |
+
|
| 58 |
+
Current Large Language Models (LLMs) face significant computational challenges when processing long textual content due to quadratic scaling with sequence length. We explore a potential solution: leveraging visual modality as an efficient compression medium for textual information. A single image containing document text can represent rich information using substantially fewer tokens than the equivalent digital text, suggesting that optical compression through vision tokens could achieve much higher compression ratios.
|
| 59 |
+
|
| 60 |
+
This insight motivates us to reexamine vision-language models (VLMs) from an LLM-centric perspective, focusing on how vision encoders can enhance LLMs’ efficiency in processing textual information, rather than on the basic VQA tasks [12, 16, 24, 32, 41] at which humans excel. OCR tasks, as an intermediate modality bridging vision and language, provide an ideal testbed for this vision-text compression paradigm, as they establish a natural compression-decompression mapping between visual and textual representations while offering quantitative evaluation metrics.
|
| 61 |
+
|
| 62 |
+
Accordingly, we present DeepSeek-OCR, a VLM designed as a preliminary proof-of-concept for efficient vision-text compression. Our work makes three primary contributions:
|
| 63 |
+
|
| 64 |
+
First, we provide comprehensive quantitative analysis of vision-text token compression ratios. Our method achieves 96%+ OCR decoding precision at 9-10× text compression, ∼90% at 10-12× compression, and ∼60% at 20× compression on Fox [21] benchmarks featuring diverse document layouts (with actual accuracy being even higher when accounting for formatting differences between output and ground truth), as shown in Figure 1(a). The results demonstrate that compact language models can effectively learn to decode compressed visual representations, suggesting that larger LLMs could readily acquire similar capabilities through appropriate pretraining design.
|
| 65 |
+
|
| 66 |
+
Second, we introduce DeepEncoder, a novel architecture that maintains low activation memory and minimal vision tokens even with high-resolution inputs. It serially connects window attention and global attention encoder components through a 16× convolutional compressor. This design ensures that the window attention component processes a large number of vision tokens, while the compressor reduces vision tokens before they enter the dense global attention component, achieving effective memory and token compression.
|
| 67 |
+
|
| 68 |
+
Third, we develop DeepSeek-OCR based on DeepEncoder and DeepSeek3B-MoE [19, 20]. As shown in Figure 1(b), it achieves state-of-the-art performance within end-to-end models on OmniDocBench while using the fewest vision tokens. Additionally, we equip the model with capabilities for parsing charts, chemical formulas, simple geometric figures, and natural images to enhance its practical utility further. In production, DeepSeek-OCR can generate 33 million pages of data per day for LLMs or VLMs using 20 nodes (each with 8 A100-40G GPUs).
|
| 69 |
+
|
| 70 |
+
In summary, this work presents a preliminary exploration of using the visual modality as an efficient compression medium for textual information processing in LLMs. Through DeepSeek-OCR, we demonstrate that vision-text compression can achieve significant token reduction (7-20×) for different historical context stages, offering a promising direction for addressing long-context challenges in large language models. Our quantitative analysis provides empirical guidelines for VLM token allocation optimization, while the proposed DeepEncoder architecture showcases practical feasibility with real-world deployment capabilities. Although focused on OCR as a proof-of-concept, this paradigm opens new possibilities for rethinking how vision and language modalities can be synergistically combined to enhance computational efficiency in large-scale text processing and agent systems.
|
| 71 |
+
|
| 72 |
+
--- end of page.page_number=3 ---
|
| 73 |
+
|
| 74 |
+
**==> picture [438 x 103] intentionally omitted <==**
|
| 75 |
+
|
| 76 |
+
**----- Start of picture text -----**<br>
|
| 77 |
+
Three typical encoder types and their drawbacks: dual-tower (Vary/DeepSeek-VL/..., SAM-style ViTDet 1024 + ViT 224: unsupported pipeline parallelism, two pre-processes, hard to deploy, unsupported extreme resolution); tile-based (InternVL series/DeepSeek-VL2/..., native ViT 384, usually >15 tiles: low native resolution, overly small patches, too many vision tokens, small global view); adaptive resolution, NaViT-style (Qwen2(.5)/3-VL series/...: too many vision tokens, need long sequence length, large activations, slow inference speed).<br>**----- End of picture text -----**<br>
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
Figure 2 | Typical vision encoders in popular VLMs. Here are three types of encoders commonly used in current open-source VLMs, all of which suffer from their respective deficiencies.
|
| 81 |
+
|
| 82 |
+
## **2. Related Works**
|
| 83 |
+
|
| 84 |
+
## **2.1. Typical Vision Encoders in VLMs**
|
| 85 |
+
|
| 86 |
+
Current open-source VLMs employ three main types of vision encoders, as illustrated in Figure 2. The first type is a dual-tower architecture represented by Vary [36], which utilizes a parallel SAM [17] encoder to increase visual vocabulary parameters for high-resolution image processing. While offering controllable parameters and activation memory, this approach suffers from significant drawbacks: it requires dual image preprocessing that complicates deployment and makes encoder pipeline parallelism challenging during training. The second type is the tile-based method exemplified by InternVL2.0 [8], which processes images by dividing them into small tiles for parallel computation, reducing activation memory under high-resolution settings. Although capable of handling extremely high resolutions, this approach has notable limitations due to its typically low native encoder resolution (below 512×512), causing large images to be excessively fragmented and resulting in numerous vision tokens. The third type is adaptive resolution encoding represented by Qwen2-VL [35], which adopts the NaViT [10] paradigm to directly process full images through patch-based segmentation without tile parallelization. While this encoder can handle diverse resolutions flexibly, it faces substantial challenges with large images due to massive activation memory consumption that can cause GPU memory overflow, and sequence packing requires extremely long sequence lengths during training. Long vision tokens also slow down both the prefill and generation phases of inference.
|
| 87 |
+
|
| 88 |
+
## **2.2. End-to-end OCR Models**
|
| 89 |
+
|
| 90 |
+
OCR, particularly the document parsing task, has been a highly active topic in the image-to-text domain. With the advancement of VLMs, a large number of end-to-end OCR models have emerged, fundamentally transforming the traditional pipeline architecture (which required separate detection and recognition expert models) by simplifying OCR systems. Nougat [6] first employs an end-to-end framework for academic paper OCR on arXiv, demonstrating the potential of such models in handling dense perception tasks. GOT-OCR2.0 [38] expands the scope of OCR 2.0 to include more synthetic image parsing tasks and designs an OCR model with performance-efficiency trade-offs, further highlighting the potential of end-to-end OCR research. Additionally, general vision models such as the Qwen-VL series [35], the InternVL series [8], and many of their derivatives continuously enhance their document OCR capabilities to explore dense visual perception boundaries. However, a crucial research question that current models have not addressed is: _for a document containing 1000 words, how many vision tokens are at least needed for decoding?_ This question holds significant importance for research into the principle that " _a picture is worth a thousand words._ "
|
| 91 |
+
|
| 92 |
+
--- end of page.page_number=4 ---
|
| 93 |
+
|
| 94 |
+
**==> picture [449 x 117] intentionally omitted <==**
|
| 95 |
+
|
| 96 |
+
**----- Start of picture text -----**<br>
|
| 97 |
+
DeepEncoder pipeline: input image as n×16×16 patches → SAM (ViTDet, 80M, local/window attention, low activation) → 16× conv downsample → CLIP ViT (300M, global attention) → compressed vision tokens → DeepSeek-3B (MoE-A570M) decoder, which also receives the prompt through the tokenizer and embedding layer → output.<br>**----- End of picture text -----**<br>
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
Figure 3 | The architecture of DeepSeek-OCR. DeepSeek-OCR consists of a DeepEncoder and a DeepSeek-3B-MoE decoder. DeepEncoder is the core of DeepSeek-OCR, comprising three components: a SAM [17] for perception dominated by window attention, a CLIP [29] for knowledge with dense global attention, and a 16× token compressor that bridges between them.
|
| 101 |
+
|
| 102 |
+
## **3. Methodology**
|
| 103 |
+
|
| 104 |
+
## **3.1. Architecture**
|
| 105 |
+
|
| 106 |
+
As shown in Figure 3, DeepSeek-OCR enjoys a unified end-to-end VLM architecture consisting of an encoder and a decoder. The encoder (namely DeepEncoder) is responsible for extracting image features and for tokenizing and compressing visual representations. The decoder generates the required result based on image tokens and prompts. DeepEncoder is approximately 380M in parameters, mainly composed of an 80M SAM-base [17] and a 300M CLIP-large [29] connected in series. The decoder adopts a 3B MoE [19, 20] architecture with 570M activated parameters. In the following paragraphs, we will delve into the model components, data engineering, and training techniques.
|
| 107 |
+
|
| 108 |
+
## **3.2. DeepEncoder**
|
| 109 |
+
|
| 110 |
+
To explore the feasibility of contexts optical compression, we need a vision encoder with the following features: 1. capable of processing high resolutions; 2. low activation at high resolutions; 3. few vision tokens; 4. support for multiple resolution inputs; 5. a moderate parameter count. However, as described in Section 2.1, current open-source encoders cannot fully satisfy all these conditions. Therefore, we design a novel vision encoder ourselves, named DeepEncoder.
|
| 111 |
+
|
| 112 |
+
## _**3.2.1. Architecture of DeepEncoder**_
|
| 113 |
+
|
| 114 |
+
DeepEncoder mainly consists of two components: a visual perception feature extraction component dominated by window attention, and a visual knowledge feature extraction component with dense global attention. To benefit from the pretraining gains of previous works, we use SAM-base (patch size 16) and CLIP-large as the main architectures for the two components, respectively. For CLIP, we remove the first patch embedding layer since its input is no longer images but output tokens from the previous pipeline. Between the two components, we borrow from Vary [36] and use a 2-layer convolutional module to perform 16× downsampling of vision tokens. Each convolutional layer has a kernel size of 3, stride of 2, and padding of 1, with channels increasing from 256 to 1024. Assuming we input a 1024×1024 image, DeepEncoder will segment it into 1024/16 × 1024/16 = 4096 patch tokens. Since the first half of the encoder is dominated by window attention and has only 80M parameters, the activation is acceptable. Before entering global attention,
|
| 115 |
+
|
| 116 |
+
--- end of page.page_number=5 ---
|
| 117 |
+
|
| 118 |
+
**==> picture [437 x 131] intentionally omitted <==**
|
| 119 |
+
|
| 120 |
+
**----- Start of picture text -----**<br>
|
| 121 |
+
Modes: Tiny||Small: resize to W,H = 512||640, tokens 64||100; Base||Large: pad to W,H = 1024||1280, tokens 256||400, valid tokens (256||400)×R (R as in Equation 1); Gundam||Gundam-M: n local tiles at 640||1024 plus a 1024||1280 global view, tokens n×(100||256) + (256||400), valid n×(100||256) + (256||400)×R, n ∈ [2:9].<br>**----- End of picture text -----**<br>
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
Figure 4 | To test model performance under different compression ratios (requiring different numbers of vision tokens) and enhance the practicality of DeepSeek-OCR, we configure it with multiple resolution modes.
|
| 125 |
+
|
| 126 |
+
the 4096 tokens go through the compression module and the token count becomes 4096/16=256, thus making the overall activation memory controllable.
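To make the 16× reduction concrete, the following is a minimal PyTorch sketch of a two-layer compressor matching the stated hyperparameters (kernel 3, stride 2, padding 1, channels 256 → 1024); the 512-wide intermediate layer and the GELU activation are assumptions, since the text does not specify them:

```python
import torch
import torch.nn as nn

class TokenCompressor16x(nn.Module):
    """2-layer convolutional 16x token compressor (kernel 3, stride 2,
    padding 1, channels 256 -> 1024, per Section 3.2.1). The 512-wide
    intermediate layer and the GELU activation are our assumptions."""

    def __init__(self, in_ch: int = 256, mid_ch: int = 512, out_ch: int = 1024):
        super().__init__()
        # Each stride-2 conv halves height and width (4x fewer tokens);
        # two layers give the stated 16x reduction (4096 -> 256 tokens).
        self.conv1 = nn.Conv2d(in_ch, mid_ch, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(mid_ch, out_ch, kernel_size=3, stride=2, padding=1)
        self.act = nn.GELU()

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        b, n, c = tokens.shape
        side = int(n ** 0.5)                     # 4096 patches -> 64x64 grid
        x = tokens.transpose(1, 2).reshape(b, c, side, side)
        x = self.conv2(self.act(self.conv1(x)))  # (b, 1024, side/4, side/4)
        return x.flatten(2).transpose(1, 2)      # (b, n/16, 1024)

tokens = torch.randn(1, 4096, 256)               # SAM output for a 1024x1024 image
assert TokenCompressor16x()(tokens).shape == (1, 256, 1024)
```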
|
| 127 |
+
|
| 128 |
+
Table 1 | Multi resolution support of DeepEncoder. For both research and application purposes, we design DeepEncoder with diverse native resolution and dynamic resolution modes.
|
| 129 |
+
|
| 130 |
+
| Mode | Tiny | Small | Base | Large | Gundam | Gundam-M |
|---|---|---|---|---|---|---|
| Resolution | 512 | 640 | 1024 | 1280 | 640 + 1024 | 1024 + 1280 |
| Tokens | 64 | 100 | 256 | 400 | n×100 + 256 | n×256 + 400 |
| Process | resize | resize | padding | padding | resize + padding | resize + padding |

Tiny, Small, Base, and Large are native resolution modes; Gundam and Gundam-M are dynamic resolution modes.
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
## _**3.2.2. Multiple resolution support**_
|
| 139 |
+
|
| 140 |
+
Suppose we have an image containing 1000 optical characters and we want to test how many vision tokens are needed for decoding. This requires the model to support a variable number of vision tokens; that is to say, DeepEncoder needs to support multiple resolutions.
|
| 141 |
+
|
| 142 |
+
We meet the requirement aforementioned through dynamic interpolation of positional encodings, and design several resolution modes for simultaneous model training to achieve the capability of a single DeepSeek-OCR model supporting multiple resolutions. As shown in Figure 4, DeepEncoder mainly supports two major input modes: native resolution and dynamic resolution. Each of them contains multiple sub-modes.
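The phrase "dynamic interpolation of positional encodings" refers to the standard ViT trick of resampling the learned position grid; a minimal PyTorch sketch under that assumption (the function name is ours, not from the released code):

```python
import torch
import torch.nn.functional as F

def interpolate_pos_embed(pos_embed: torch.Tensor, new_side: int) -> torch.Tensor:
    """Resample a ViT's learned (1, N, C) positional grid to a new
    resolution so one encoder can serve several input sizes."""
    _, n, c = pos_embed.shape
    old_side = int(n ** 0.5)
    grid = pos_embed.reshape(1, old_side, old_side, c).permute(0, 3, 1, 2)
    grid = F.interpolate(grid, size=(new_side, new_side),
                         mode="bicubic", align_corners=False)
    return grid.permute(0, 2, 3, 1).reshape(1, new_side * new_side, c)

# 1024x1024 input (64x64 patches) adapted to 640x640 (40x40 patches)
resized = interpolate_pos_embed(torch.randn(1, 64 * 64, 768), 40)
assert resized.shape == (1, 1600, 768)
```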
|
| 143 |
+
|
| 144 |
+
Native resolution supports four sub-modes: Tiny, Small, Base, and Large, with corresponding resolutions and token counts of 512×512 (64), 640×640 (100), 1024×1024 (256), and 1280×1280 (400) respectively. Since Tiny and Small modes have relatively small resolutions, to avoid wasting vision tokens, images are processed by directly resizing the original shape. For Base and Large modes, in order to preserve the original image aspect ratio, images are padded to the corresponding size. After padding, the number of valid vision tokens is less than the actual number of vision tokens, with the calculation formula being:
|
| 145 |
+
|
| 146 |
+
$$N_{\text{valid}} = \left\lceil N_{\text{actual}} \times \frac{\min(w, h)}{\max(w, h)} \right\rceil \tag{1}$$
|
| 147 |
+
|
| 148 |
+
where _𝑤_ and _ℎ_ represent the width and height of the original input image.
|
| 149 |
+
|
| 150 |
+
--- end of page.page_number=6 ---
|
| 151 |
+
|
| 152 |
+
Dynamic resolution can be composed of two native resolutions. For example, Gundam mode consists of n×640×640 tiles (local views) and a 1024×1024 global view. The tiling method follows InternVL2.0 [8]. Supporting dynamic resolution is mainly for application considerations, especially for ultra-high-resolution inputs (such as newspaper images). Tiling is a form of secondary window attention that can further reduce activation memory. It’s worth noting that due to our relatively large native resolutions, images won’t be fragmented too much under dynamic resolution (the number of tiles is controlled within the range of 2 to 9). The vision token number output by DeepEncoder under Gundam mode is _𝑛_ × 100 + 256, where _𝑛_ is the number of tiles. For images with both width and height smaller than 640, _𝑛_ is set to 0, i.e., Gundam mode degrades to Base mode.
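A tiny helper makes the Gundam-mode token arithmetic explicit (a sketch; the function name is ours):

```python
def gundam_vision_tokens(w: int, h: int, n_tiles: int) -> int:
    """Vision tokens emitted in Gundam mode: n x 100 (640x640 local
    tiles) + 256 (1024x1024 global view); small images degrade to Base."""
    if w < 640 and h < 640:
        return 256                     # degrades to Base mode (n = 0)
    if not 2 <= n_tiles <= 9:
        raise ValueError("tile count is kept within [2, 9]")
    return n_tiles * 100 + 256

# e.g. a newspaper page split into 6 tiles plus the global view
assert gundam_vision_tokens(2480, 3508, n_tiles=6) == 856
```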
|
| 153 |
+
|
| 154 |
+
Gundam mode is trained together with the four native resolution modes to achieve the goal of one model supporting multiple resolutions. Note that Gundam-master mode (1024×1024 local views + a 1280×1280 global view) is obtained through continued training on a trained DeepSeek-OCR model. This is mainly for load balancing, as Gundam-master’s resolution is too large, and training it together would slow down the overall training speed.
|
| 155 |
+
|
| 156 |
+
## **3.3. The MoE Decoder**
|
| 157 |
+
|
| 158 |
+
Our decoder uses the DeepSeekMoE [19, 20], specifically DeepSeek-3B-MoE. During inference, the model activates 6 out of 64 routed experts and 2 shared experts, with about 570M activated parameters. The 3B DeepSeekMoE is very suitable for domain-centric (OCR for us) VLM research, as it obtains the expressive capability of a 3B model while enjoying the inference efficiency of a 500M small model.
|
| 159 |
+
|
| 160 |
+
The decoder reconstructs the original text representation from the compressed latent vision tokens of DeepEncoder as:
|
| 161 |
+
|
| 162 |
+
$$\hat{\mathbf{X}} = f_{\text{dec}}(\mathbf{Z}), \qquad n \ll N$$
|
| 163 |
+
|
| 164 |
+
where $\mathbf{Z} \in \mathbb{R}^{n \times d_{\text{latent}}}$ are the compressed latent (vision) tokens from DeepEncoder and $\hat{\mathbf{X}} \in \mathbb{R}^{N \times d_{\text{text}}}$ is the reconstructed text representation. The function $f_{\text{dec}}$ represents a non-linear mapping that can be effectively learned by compact language models through OCR-style training. It is reasonable to conjecture that LLMs, through specialized pretraining optimization, would demonstrate more natural integration of such capabilities.
|
| 165 |
+
|
| 166 |
+
## **3.4. Data Engine**
|
| 167 |
+
|
| 168 |
+
We construct complex and diverse training data for DeepSeek-OCR, including: OCR 1.0 data, which mainly consists of traditional OCR tasks such as scene image OCR and document OCR; OCR 2.0 data, which mainly includes parsing tasks for complex artificial images, such as common charts, chemical formulas, and plane geometry; and general vision data, which is mainly used to inject certain general image understanding capabilities into DeepSeek-OCR and preserve the general vision interface.
|
| 169 |
+
|
| 170 |
+
## _**3.4.1. OCR 1.0 data**_
|
| 171 |
+
|
| 172 |
+
Document data is the top priority for DeepSeek-OCR. We collect 30M pages of diverse PDF data covering about 100 languages from the Internet, with Chinese and English accounting for approximately 25M and other languages accounting for 5M. For this data, we create two types of ground truth: coarse annotations and fine annotations. Coarse annotations are extracted
|
| 173 |
+
|
| 174 |
+
--- end of page.page_number=7 ---
|
| 175 |
+
|
| 176 |
+
**==> picture [214 x 263] intentionally omitted <==**
|
| 177 |
+
|
| 178 |
+
**==> picture [214 x 263] intentionally omitted <==**
|
| 179 |
+
|
| 180 |
+
**==> picture [336 x 10] intentionally omitted <==**
|
| 181 |
+
|
| 182 |
+
**----- Start of picture text -----**<br>
|
| 183 |
+
(a) Ground truth image (b) Fine annotations with layouts<br>**----- End of picture text -----**<br>
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
Figure 5 | OCR 1.0 fine annotations display. We format the ground truth into an interleaved layout and text format, where each paragraph of text is preceded by the coordinates and label of it in the original image. All coordinates are normalized into 1000 bins.
|
| 187 |
+
|
| 188 |
+
directly from the full dataset using _fitz_, aimed at teaching the model to recognize optical text, especially in minority languages. Fine annotations include 2M pages each for Chinese and English, labeled using advanced layout models (such as PP-DocLayout [33]) and OCR models (such as MinerU [34] and GOT-OCR2.0 [38]) to construct interleaved detection and recognition data. For minority languages, in the detection part, we find that the layout model enjoys certain generalization capabilities. In the recognition part, we use _fitz_ to create small patch data to train a GOT-OCR2.0 model, then use the trained model to label small patches after layout processing, employing a model flywheel to create 600K data samples. During the training of DeepSeek-OCR, coarse labels and fine labels are distinguished using different prompts. The ground truth for fine-annotation image-text pairs can be seen in Figure 5. We also collect 3M _Word_ documents, constructing high-quality image-text pairs without layout by directly extracting content. This data mainly brings benefits to formulas and HTML-formatted tables. Additionally, we select some open-source data [28, 37] as supplements.
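Since fine annotations prepend each paragraph’s coordinates normalized into 1000 bins (Figure 5), the normalization step plausibly looks like this sketch (function name and box convention are assumptions):

```python
def normalize_box(box, img_w, img_h, bins=1000):
    """Map pixel coordinates (x1, y1, x2, y2) into [0, bins] bins,
    as in the Figure 5 ground-truth format."""
    x1, y1, x2, y2 = box
    return (
        int(x1 / img_w * bins), int(y1 / img_h * bins),
        int(x2 / img_w * bins), int(y2 / img_h * bins),
    )

# A 595x842 page (A4 at 72 dpi): a box spanning the top half
assert normalize_box((0, 0, 595, 421), 595, 842) == (0, 0, 1000, 500)
```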
|
| 189 |
+
|
| 190 |
+
For natural scene OCR, our model mainly supports Chinese and English. The image data sources come from LAION [31] and Wukong [13], labeled using PaddleOCR [9], with 10M data samples each for Chinese and English. Like document OCR, natural scene OCR can also control whether to output detection boxes through prompts.
|
| 191 |
+
|
| 192 |
+
## _**3.4.2. OCR 2.0 data**_
|
| 193 |
+
|
| 194 |
+
Following GOT-OCR2.0 [38], we refer to chart, chemical formula, and plane geometry parsing data as OCR 2.0 data. For chart data, following OneChart [7], we use pyecharts and matplotlib
|
| 195 |
+
|
| 196 |
+
--- end of page.page_number=8 ---
|
| 197 |
+
|
| 198 |
+
**==> picture [219 x 84] intentionally omitted <==**
|
| 199 |
+
|
| 200 |
+
**==> picture [219 x 74] intentionally omitted <==**
|
| 201 |
+
|
| 202 |
+
**==> picture [379 x 10] intentionally omitted <==**
|
| 203 |
+
|
| 204 |
+
**----- Start of picture text -----**<br>
|
| 205 |
+
(a) Image-text ground truth of chart (b) Image-text ground truth of geometry<br>**----- End of picture text -----**<br>
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
Figure 6 | For charts, we do not use OneChart’s [7] dictionary format, but instead use HTML table format as labels, which can save a certain amount of tokens. For plane geometry, we convert the ground truth to dictionary format, where the dictionary contains keys such as line segments, endpoint coordinates, line segment types, etc., for better readability. Each line segment is encoded using the Slow Perception [39] manner.
|
| 209 |
+
|
| 210 |
+
to render 10M images, mainly including commonly used line, bar, pie, and composite charts. We define chart parsing as an image-to-HTML-table conversion task, as shown in Figure 6(a). For chemical formulas, we utilize the SMILES format from PubChem as the data source and render them into images using RDKit, constructing 5M image-text pairs. For plane geometry images, we follow Slow Perception [39] for generation. Specifically, we set the perception-ruler size to 4 when modeling each line segment. To increase the diversity of rendered data, we introduce geometric translation-invariant data augmentation, where the same geometric image is translated within the original image while corresponding to the same ground truth drawn at the centered position in the coordinate system. Based on this, we construct a total of 1M plane geometry parsing samples, as illustrated in Figure 6(b).
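A minimal sketch of the chart-rendering recipe: draw a synthetic chart with matplotlib and pair it with an HTML-table label, as this section describes (data, sizes, and file names are illustrative):

```python
# Render one synthetic bar chart and pair it with an HTML-table label.
import matplotlib
matplotlib.use("Agg")                  # headless rendering
import matplotlib.pyplot as plt

labels, values = ["Q1", "Q2", "Q3", "Q4"], [12, 18, 9, 22]
plt.figure(figsize=(4, 3))
plt.bar(labels, values)
plt.title("Revenue by quarter")
plt.savefig("chart_000001.png", dpi=150)

rows = "".join(f"<tr><td>{l}</td><td>{v}</td></tr>" for l, v in zip(labels, values))
html_label = f"<table><tr><th>Quarter</th><th>Revenue</th></tr>{rows}</table>"
```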
|
| 211 |
+
|
| 212 |
+
## _**3.4.3. General vision data**_
|
| 213 |
+
|
| 214 |
+
DeepEncoder can benefit from CLIP’s pretraining gains and has sufficient parameters to incorporate general visual knowledge. Therefore, we also prepare some corresponding data for DeepSeek-OCR. Following DeepSeek-VL2 [40], we generate relevant data for tasks such as captioning, detection, and grounding. Note that DeepSeek-OCR is not a general VLM, and this portion of data accounts for only 20% of the total. We introduce this type of data mainly to preserve the general vision interface, so that researchers interested in our model and general vision tasks can conveniently advance their work in the future.
|
| 215 |
+
|
| 216 |
+
## _**3.4.4. Text-only data**_
|
| 217 |
+
|
| 218 |
+
To ensure the model’s language capabilities, we introduce 10% in-house text-only pretraining data, with all data processed to a length of 8192 tokens, which is also the sequence length for DeepSeek-OCR. In summary, when training DeepSeek-OCR, OCR data accounts for 70%, general vision data for 20%, and text-only data for 10%.
|
| 219 |
+
|
| 220 |
+
## **3.5. Training Pipelines**
|
| 221 |
+
|
| 222 |
+
Our training pipeline is very simple and consists mainly of two stages: a) training DeepEncoder independently; b) training DeepSeek-OCR. Note that the Gundam-master mode is obtained by continuing training on a pre-trained DeepSeek-OCR model with 6M sampled data. Since the training protocol is identical to the other modes, we omit the detailed description hereafter.
|
| 223 |
+
|
| 224 |
+
--- end of page.page_number=9 ---
|
| 225 |
+
|
| 226 |
+
## _**3.5.1. Training DeepEncoder**_
|
| 227 |
+
|
| 228 |
+
Following Vary [36], we utilize a compact language model [15] and use the next token prediction framework to train DeepEncoder. In this stage, we use all OCR 1.0 and 2.0 data aforementioned, as well as 100M general data sampled from the LAION [31] dataset. All data is trained for 2 epochs with a batch size of 1280, using the AdamW [23] optimizer with cosine annealing scheduler [22] and a learning rate of 5e-5. The training sequence length is 4096.
|
| 229 |
+
|
| 230 |
+
## _**3.5.2. Training DeepSeek-OCR**_
|
| 231 |
+
|
| 232 |
+
After DeepEncoder is ready, we use the data described in Section 3.4 to train DeepSeek-OCR, with the entire training process conducted on the HAI-LLM [14] platform. The entire model uses pipeline parallelism (PP) and is divided into 4 parts, with DeepEncoder taking two and the decoder taking two. For DeepEncoder, we treat SAM and the compressor as the vision tokenizer, place them in PP0, and freeze their parameters, while treating the CLIP part as the input embedding layer and placing it in PP1 with unfrozen weights for training. For the language model part, since DeepSeek3B-MoE has 12 layers, we place 6 layers each on PP2 and PP3. We use 20 nodes (each with 8 A100-40G GPUs) for training, with a data parallelism (DP) of 40 and a global batch size of 640. We use the AdamW optimizer with a step-based scheduler and an initial learning rate of 3e-5. For text-only data, the training speed is 90B tokens/day; for multimodal data, it is 70B tokens/day.
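The parallelism arithmetic can be written out explicitly; a hedged sketch of the stated 4-stage split (the constants follow the text, the dictionary structure is ours):

```python
# 4-stage pipeline-parallel layout described above (illustrative structure).
PIPELINE_LAYOUT = {
    "PP0": {"modules": ["SAM", "16x compressor"], "trainable": False},
    "PP1": {"modules": ["CLIP (as input embedding)"], "trainable": True},
    "PP2": {"modules": ["MoE decoder layers 1-6"], "trainable": True},
    "PP3": {"modules": ["MoE decoder layers 7-12"], "trainable": True},
}

NODES, GPUS_PER_NODE, PP_STAGES = 20, 8, 4
DP = NODES * GPUS_PER_NODE // PP_STAGES   # 160 GPUs / 4 stages = 40 replicas
GLOBAL_BATCH = 640                        # i.e. 16 samples per DP replica
assert DP == 40 and GLOBAL_BATCH // DP == 16
```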
|
| 233 |
+
|
| 234 |
+
Table 2 | We test DeepSeek-OCR’s vision-text compression ratio using all English documents with 600-1300 tokens from the Fox [21] benchmarks. Text tokens represent the number of tokens after tokenizing the ground truth text using DeepSeek-OCR’s tokenizer. Vision Tokens=64 or 100 respectively represent the number of vision tokens output by DeepEncoder after resizing input images to 512×512 and 640×640.
|
| 235 |
+
|
| 236 |
+
| Text Tokens | Pages | Precision (64 vision tokens) | Compression (64) | Precision (100 vision tokens) | Compression (100) |
|---|---|---|---|---|---|
| 600-700 | 7 | 96.5% | 10.5× | 98.5% | 6.7× |
| 700-800 | 28 | 93.8% | 11.8× | 97.3% | 7.5× |
| 800-900 | 28 | 83.8% | 13.2× | 96.8% | 8.5× |
| 900-1000 | 14 | 85.9% | 15.1× | 96.8% | 9.7× |
| 1000-1100 | 11 | 79.3% | 16.5× | 91.5% | 10.6× |
| 1100-1200 | 8 | 76.4% | 17.7× | 89.8% | 11.3× |
| 1200-1300 | 4 | 59.1% | 19.7× | 87.1% | 12.6× |
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
## **4. Evaluation**
|
| 245 |
+
|
| 246 |
+
## **4.1. Vision-text Compression Study**
|
| 247 |
+
|
| 248 |
+
We select Fox [21] benchmarks to verify DeepSeek-OCR’s compression-decompression capability for text-rich documents, in order to preliminarily explore the feasibility and boundaries of contexts optical compression. We use the English document portion of Fox, tokenize the ground truth text with DeepSeek-OCR’s tokenizer (vocabulary size of approximately 129k), and select documents with 600-1300 tokens for testing, which happens to be 100 pages. Since the number of text tokens is not large, we only need to test performance in Tiny and Small modes, where Tiny mode corresponds to 64 tokens and Small mode corresponds to 100 tokens. We use the prompt
|
| 249 |
+
|
| 250 |
+
--- end of page.page_number=10 ---
|
| 251 |
+
|
| 252 |
+
Table 3 | We use OmniDocBench [27] to test the performance of DeepSeek-OCR on real document parsing tasks. All metrics in the table are edit distances, where smaller values indicate better performance. "Tokens" represents the average number of vision tokens used per page, and "†200dpi" means using _fitz_ to interpolate the original image to 200 dpi. For the DeepSeek-OCR model, the values in parentheses in the "Tokens" column represent valid vision tokens, calculated according to Equation 1.
|
| 253 |
+
|
| 254 |
+
| Model | Tokens | EN overall | EN text | EN formula | EN table | EN order | ZH overall | ZH text | ZH formula | ZH table | ZH order |
|---|---|---|---|---|---|---|---|---|---|---|---|
| **Pipeline Models** | | | | | | | | | | | |
| Dolphin [11] | - | 0.356 | 0.352 | 0.465 | 0.258 | 0.35 | 0.44 | 0.44 | 0.604 | 0.367 | 0.351 |
| Marker [1] | - | 0.296 | 0.085 | 0.374 | 0.609 | 0.116 | 0.497 | 0.293 | 0.688 | 0.678 | 0.329 |
| Mathpix [2] | - | 0.191 | 0.105 | 0.306 | 0.243 | 0.108 | 0.364 | 0.381 | 0.454 | 0.32 | 0.30 |
| MinerU-2.1.1 [34] | - | 0.162 | 0.072 | 0.313 | 0.166 | 0.097 | 0.244 | 0.111 | 0.581 | 0.15 | 0.136 |
| MonkeyOCR-1.2B [18] | - | 0.154 | 0.062 | 0.295 | 0.164 | 0.094 | 0.263 | 0.179 | 0.464 | 0.168 | 0.243 |
| PPstructure-v3 [9] | - | 0.152 | 0.073 | 0.295 | 0.162 | 0.077 | 0.223 | 0.136 | 0.535 | 0.111 | 0.11 |
| **End-to-end Models** | | | | | | | | | | | |
| Nougat [6] | 2352 | 0.452 | 0.365 | 0.488 | 0.572 | 0.382 | 0.973 | 0.998 | 0.941 | 1.00 | 0.954 |
| SmolDocling [25] | 392 | 0.493 | 0.262 | 0.753 | 0.729 | 0.227 | 0.816 | 0.838 | 0.997 | 0.907 | 0.522 |
| InternVL2-76B [8] | 6790 | 0.44 | 0.353 | 0.543 | 0.547 | 0.317 | 0.443 | 0.29 | 0.701 | 0.555 | 0.228 |
| Qwen2.5-VL-7B [5] | 3949 | 0.316 | 0.151 | 0.376 | 0.598 | 0.138 | 0.399 | 0.243 | 0.5 | 0.627 | 0.226 |
| OLMOCR [28] | 3949 | 0.326 | 0.097 | 0.455 | 0.608 | 0.145 | 0.469 | 0.293 | 0.655 | 0.652 | 0.277 |
| GOT-OCR2.0 [38] | 256 | 0.287 | 0.189 | 0.360 | 0.459 | 0.141 | 0.411 | 0.315 | 0.528 | 0.52 | 0.28 |
| OCRFlux-3B [3] | 3949 | 0.238 | 0.112 | 0.447 | 0.269 | 0.126 | 0.349 | 0.256 | 0.716 | 0.162 | 0.263 |
| GPT4o [26] | - | 0.233 | 0.144 | 0.425 | 0.234 | 0.128 | 0.399 | 0.409 | 0.606 | 0.329 | 0.251 |
| InternVL3-78B [42] | 6790 | 0.218 | 0.117 | 0.38 | 0.279 | 0.095 | 0.296 | 0.21 | 0.533 | 0.282 | 0.161 |
| Qwen2.5-VL-72B [5] | 3949 | 0.214 | 0.092 | 0.315 | 0.341 | 0.106 | 0.261 | 0.18 | 0.434 | 0.262 | 0.168 |
| dots.ocr [30] | 3949 | 0.182 | 0.137 | 0.320 | 0.166 | 0.182 | 0.261 | 0.229 | 0.468 | 0.160 | 0.261 |
| Gemini2.5-Pro [4] | - | 0.148 | 0.055 | 0.356 | 0.13 | 0.049 | 0.212 | 0.168 | 0.439 | 0.119 | 0.121 |
| MinerU2.0 [34] | 6790 | 0.133 | 0.045 | 0.273 | 0.15 | 0.066 | 0.238 | 0.115 | 0.506 | 0.209 | 0.122 |
| dots.ocr†200dpi [30] | 5545 | 0.125 | **0.032** | 0.329 | **0.099** | **0.04** | 0.16 | **0.066** | 0.416 | 0.092 | **0.067** |
| **DeepSeek-OCR (end2end)** | | | | | | | | | | | |
| Tiny | **64** | 0.386 | 0.373 | 0.469 | 0.422 | 0.283 | 0.361 | 0.307 | 0.635 | 0.266 | 0.236 |
| Small | 100 | 0.221 | 0.142 | 0.373 | 0.242 | 0.125 | 0.284 | 0.24 | 0.53 | 0.159 | 0.205 |
| Base | 256 (182) | 0.137 | 0.054 | 0.267 | 0.163 | 0.064 | 0.24 | 0.205 | 0.474 | 0.1 | 0.181 |
| Large | 400 (285) | 0.138 | 0.054 | 0.277 | 0.152 | 0.067 | 0.208 | 0.143 | 0.461 | 0.104 | 0.123 |
| Gundam | 795 | 0.127 | 0.043 | 0.269 | 0.134 | 0.062 | 0.181 | 0.097 | 0.432 | 0.089 | 0.103 |
| Gundam-M†200dpi | 1853 | **0.123** | 0.049 | **0.242** | 0.147 | 0.056 | **0.157** | 0.087 | **0.377** | **0.08** | 0.085 |
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
without layout: "<image> `\n` Free OCR." to control the model’s output format. Nevertheless, the output format still cannot completely match Fox benchmarks, so the actual performance would be somewhat higher than the test results.
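The protocol reduces to a simple ratio; a minimal sketch, assuming a Hugging Face-style tokenizer interface:

```python
def compression_ratio(gt_text: str, n_vision_tokens: int, tokenizer) -> float:
    """Text-to-vision compression ratio as used in Table 2: ground-truth
    text tokens (DeepSeek-OCR's ~129k-vocab tokenizer) / vision tokens."""
    return len(tokenizer.encode(gt_text)) / n_vision_tokens

# e.g. a 970-token Fox page decoded in Small mode (100 vision tokens) -> 9.7x
```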
|
| 266 |
+
|
| 267 |
+
As shown in Table 2, within a 10× compression ratio, the model’s decoding precision can reach approximately 97%, which is a very promising result. In the future, it may be possible to achieve nearly 10× lossless contexts compression through text-to-image approaches. When the compression ratio exceeds 10×, performance begins to decline, for which there may be two reasons: one is that the layout of long documents becomes more complex, and another is that long texts become blurred at 512×512 or 640×640 resolution. The first issue can be solved by rendering texts onto a single layout page, while we believe the second issue will become
|
| 268 |
+
|
| 269 |
+
--- end of page.page_number=11 ---
|
| 270 |
+
|
| 271 |
+
a feature of the forgetting mechanism. When compressing tokens by nearly 20×, we find that precision can still approach 60%. These results indicate that optical contexts compression is a very promising and worthwhile research direction, and this approach does not bring any overhead because it can leverage VLM infrastructure, as multimodal systems inherently require an additional vision encoder.
|
| 272 |
+
|
| 273 |
+
Table 4 | Edit distances for different categories of documents in OmniDocBench. The results show that some types of documents can achieve good performance with just 64 or 100 vision tokens, while others require Gundam mode.
|
| 274 |
+
|
| 275 |
+
| Mode \ Type | Book | Slides | Financial Report | Textbook | Exam Paper | Magazine | Academic Papers | Notes | Newspaper | Overall |
|---|---|---|---|---|---|---|---|---|---|---|
| Tiny | 0.147 | 0.116 | 0.207 | 0.173 | 0.294 | 0.201 | 0.395 | 0.297 | 0.94 | 0.32 |
| Small | 0.085 | 0.111 | 0.079 | 0.147 | 0.171 | 0.107 | 0.131 | 0.187 | 0.744 | 0.205 |
| Base | 0.037 | 0.08 | 0.027 | 0.1 | 0.13 | 0.073 | 0.052 | 0.176 | 0.645 | 0.156 |
| Large | 0.038 | 0.108 | 0.022 | 0.084 | 0.109 | 0.06 | 0.053 | 0.155 | 0.353 | 0.117 |
| Gundam | 0.035 | 0.085 | 0.289 | 0.095 | 0.094 | 0.059 | 0.039 | 0.153 | 0.122 | 0.083 |
| Gundam-M | 0.052 | 0.09 | 0.034 | 0.091 | 0.079 | 0.079 | 0.048 | 0.1 | 0.099 | 0.077 |
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
## **4.2. OCR Practical Performance**
|
| 283 |
+
|
| 284 |
+
DeepSeek-OCR is not only an experimental model; it has strong practical capabilities and can construct data for LLM/VLM pretraining. To quantify OCR performance, we test DeepSeek-OCR on OmniDocBench [27], with results shown in Table 3. Requiring only 100 vision tokens (640×640 resolution), DeepSeek-OCR surpasses GOT-OCR2.0 [38], which uses 256 tokens; with 400 tokens (285 valid tokens, 1280×1280 resolution), it achieves performance on par with the state of the art on this benchmark. Using fewer than 800 tokens (Gundam mode), DeepSeek-OCR outperforms MinerU2.0 [34], which needs nearly 7,000 vision tokens. These results demonstrate that our DeepSeek-OCR model is powerful in practical applications, and because of its higher token compression, it enjoys a higher research ceiling.
|
| 285 |
+
|
| 286 |
+
As shown in Table 4, some categories of documents require very few tokens to achieve satisfactory performance, such as slides, which need only 64 vision tokens. For book and report documents, DeepSeek-OCR can achieve good performance with only 100 vision tokens. Combined with the analysis from Section 4.1, this may be because most text tokens in these document categories are within 1,000, meaning the vision-token compression ratio does not exceed 10×. For newspapers, Gundam or even Gundam-master mode is required to achieve acceptable edit distances, because newspapers contain 4,000-5,000 text tokens, far exceeding the 10× compression of other modes. These experimental results further demonstrate the boundaries of contexts optical compression, which may provide effective references for research on vision token optimization in VLMs and on context compression and forgetting mechanisms in LLMs.
|
| 287 |
+
|
| 288 |
+
## **4.3. Qualitative Study**
|
| 289 |
+
|
| 290 |
+
## _**4.3.1. Deep parsing**_
|
| 291 |
+
|
| 292 |
+
DeepSeek-OCR possesses both layout and OCR 2.0 capabilities, enabling it to further parse images within documents through secondary model calls, a feature we refer to as "deep parsing". As shown in Figures 7,8,9,10, our model can perform deep parsing on charts, geometry, chemical formulas, and even natural images, requiring only a unified prompt.
|
| 293 |
+
|
| 294 |
+
--- end of page.page_number=12 ---
|
| 295 |
+
|
| 296 |
+
**==> picture [446 x 560] intentionally omitted <==**
|
| 297 |
+
|
| 298 |
+
**----- Start of picture text -----**<br>
|
| 299 |
+
<image>\n<|grounding|>Convert the document to markdown.<br>Input image Result<br><image>\nParse the figure.<br>Deep Parsing Rendering<br>**----- End of picture text -----**<br>
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
Figure 7 | In the field of financial research reports, the deep parsing mode of DeepSeek-OCR can be used to obtain structured results of charts within documents. Charts are a crucial form of data representation in finance and scientific fields, and the chart structured extraction is an indispensable capability for future OCR models.
|
| 303 |
+
|
| 304 |
+
--- end of page.page_number=13 ---
|
| 305 |
+
|
| 306 |
+
**==> picture [200 x 276] intentionally omitted <==**
|
| 307 |
+
|
| 308 |
+
**==> picture [48 x 11] intentionally omitted <==**
|
| 309 |
+
|
| 310 |
+
**----- Start of picture text -----**<br>
|
| 311 |
+
Input image<br>**----- End of picture text -----**<br>
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
**==> picture [99 x 15] intentionally omitted <==**
|
| 315 |
+
|
| 316 |
+
**----- Start of picture text -----**<br>
|
| 317 |
+
<image>\nParse the figure.<br>**----- End of picture text -----**<br>
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
**==> picture [179 x 252] intentionally omitted <==**
|
| 321 |
+
|
| 322 |
+
**----- Start of picture text -----**<br>
|
| 323 |
+
The image depicts an indoor classroom setting with a group of children and an<br>adult. The children are seated on the floor, facing a woman who is standing and<br>appears to be reading or presenting to them. The woman is wearing a brown<br>sweater and blue jeans. The children are dressed in various colors, with some<br>wearing short pants and others in long pants.<br>The classroom has a green wall with educational posters and a bulletin board.<br>The floor is covered with a gray carpet. To the left, there is a wooden dresser with<br>a drawer partially open, and a chair is visible behind it. On the right side of the<br>image, there is a purple bean bag chair.<br>The children are engaged with the woman, with some looking at her and others<br>looking down or away. The room is well-lit, and the overall atmosphere seems to<br>be one of attentiveness and learning.<br>The text "BIBLIOTECA" is visible on the wall, suggesting that the room may be<br>part of a library or a section dedicated to books. The presence of educational<br>materials and the organized layout of the room indicate that this is a space<br>designed for learning and reading.<br>Deep Parsing<br>**----- End of picture text -----**<br>
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
**==> picture [212 x 19] intentionally omitted <==**
|
| 327 |
+
|
| 328 |
+
**----- Start of picture text -----**<br>
|
| 329 |
+
<image>\n<|grounding|>Convert the document to markdown.<br>**----- End of picture text -----**<br>
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
**==> picture [197 x 256] intentionally omitted <==**
|
| 333 |
+
|
| 334 |
+
**==> picture [27 x 11] intentionally omitted <==**
|
| 335 |
+
|
| 336 |
+
**----- Start of picture text -----**<br>
|
| 337 |
+
Result<br>**----- End of picture text -----**<br>
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
**==> picture [208 x 240] intentionally omitted <==**
|
| 341 |
+
|
| 342 |
+
**==> picture [43 x 11] intentionally omitted <==**
|
| 343 |
+
|
| 344 |
+
**----- Start of picture text -----**<br>
|
| 345 |
+
Rendering<br>**----- End of picture text -----**<br>
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
Figure 8 | For books and articles, the deep parsing mode can output dense captions for natural images in the documents. With just a prompt, the model can automatically identify what type of image it is and output the required results.
|
| 349 |
+
|
| 350 |
+
--- end of page.page_number=14 ---
|
| 351 |
+
|
| 352 |
+
**==> picture [446 x 562] intentionally omitted <==**
|
| 353 |
+
|
| 354 |
+
**----- Start of picture text -----**<br>
|
| 355 |
+
<image>\n<|grounding|>Convert the document to markdown.<br>Input image Result<br><image>\nParse the figure.<br>Rendering<br>Deep Parsing<br>**----- End of picture text -----**<br>
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
Figure 9 | DeepSeek-OCR in deep parsing mode can also recognize chemical formulas within chemical documents and convert them to SMILES format. In the future, OCR 1.0+2.0 technology may play a significant role in the development of VLM/LLM in STEM fields.
|
| 359 |
+
|
| 360 |
+
--- end of page.page_number=15 ---
|
| 361 |
+
|
| 362 |
+
**==> picture [440 x 502] intentionally omitted <==**
|
| 363 |
+
|
| 364 |
+
**----- Start of picture text -----**<br>
|
| 365 |
+
<image>\n<|grounding|>Convert the document to markdown.<br>Input image Result<br><image>\nParse the figure.<br>Deep Parsing Rendering<br>**----- End of picture text -----**<br>
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
Figure 10 | DeepSeek-OCR also possesses the capability to copy (i.e., structurally parse) simple planar geometric figures. Due to the intricate interdependencies among line segments in geometric shapes, the geometry parsing task is extremely challenging and has a long way to go.
|
| 369 |
+
|
| 370 |
+
## _**4.3.2. Multilingual recognition**_
|
| 371 |
+
|
| 372 |
+
PDF data on the Internet contains not only Chinese and English but also a large amount of multilingual data, which is also crucial when training LLMs. For PDF documents, DeepSeek-OCR can handle nearly 100 languages. Like Chinese and English documents, multilingual data also supports both layout and non-layout OCR formats. The visualization results are shown in Figure 11, where we select Arabic and Sinhala to demonstrate results.
|
| 373 |
+
|
| 374 |
+
--- end of page.page_number=16 ---
|
| 375 |
+
|
| 376 |
+
**==> picture [82 x 19] intentionally omitted <==**
|
| 377 |
+
|
| 378 |
+
**----- Start of picture text -----**<br>
|
| 379 |
+
<image>\nFree OCR.<br>**----- End of picture text -----**<br>
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
**==> picture [445 x 261] intentionally omitted <==**
|
| 383 |
+
|
| 384 |
+
<image>\n<|grounding|>Convert the document to markdown.
|
| 385 |
+
|
| 386 |
+
**==> picture [445 x 260] intentionally omitted <==**
|
| 387 |
+
|
| 388 |
+
Figure 11 | To endow the model with the capability to process widely crawled PDFs (multilingual data), we train it with OCR capabilities for nearly 100 languages. Minority-language documents also support both layout and non-layout outputs through different prompts.
|
| 389 |
+
|
| 390 |
+
## _**4.3.3. General vision understanding**_
|
| 391 |
+
|
| 392 |
+
We also provide DeepSeek-OCR with a certain degree of general image understanding capabilities. The related visualization results are shown in Figure 12.
|
| 393 |
+
|
| 394 |
+
--- end of page.page_number=17 ---
|
| 395 |
+
|
| 396 |
+
**==> picture [446 x 460] intentionally omitted <==**
|
| 397 |
+
|
| 398 |
+
**----- Start of picture text -----**<br>
|
| 399 |
+
<image>\nLocate <|ref|>11-2=<|/ref|> in the image.<br><image>\nDescribe this image in detail.<br>A green plastic container labeled for food storage purposes. The lid is yellow and appears slightly worn out.<br>On the front label:<br>- At the top left corner, some Chinese characters.<br>- In bold black letters near the center-top reads "6 月香 " (June Fragrance).<br>- Below that, smaller white font states " 鲜甜咸香 " which translates roughly to "Fresh Sweet Salty Fragrant."<br>- Dominating the middle part of the label is large lettering stating " 豆瓣酱 ," meaning "Bean Paste Sauce."<br>Additional information includes:<br>- A small circular logo indicating quality assurance (" 优质品 ").<br>- An illustration showing what seems like a dish made using bean paste sauce.<br>- Text below the main title says " 专注 25 年,只做原酿酱 " translating to "Focusing since 1998, only making authentic fermented bean paste."<br>- On the bottom right side, additional product specifications include " 净含量: 500g".<br>Overall style/type: It looks like commercial packaging designed specifically for preserving food items within the jar. It has clear labeling intended for easy identification and understanding among consumers who read Mandarin.<br>Note: There might be slight variations due to handwriting styles but overall maintains readability regarding brand identity and contents description.<br><image>\nIdentify all objects in the image and output them in bounding boxes.<br><image>\nLocate <|ref|>the teacher<|/ref|> in the image.<br><image>\n 这是一张<br>照片,展示了一辆红色的消防栓。消防栓上有一个笑脸,显得非常友好和亲切。消防栓的顶部有一个黑色的盖子,周围有一些金属铆钉。在消防栓的底部,有一个粉红色的贴纸,上面写着“bowtie”。背景中可以看到一条街道,街道上有几辆停放的汽车和一些树木。整体画面给人一种温馨和友好的感觉。<br><image>\n<|grounding|>OCR the image.<br>君不见,黄河之水天上来,奔流到海不复回。君不见,高堂明镜悲白发,朝如青丝暮成雪。人生得意须尽欢,莫使金樽空对月。天生我材必有用,千金散尽还复来。烹羊宰牛且为乐,会须一饮三百杯。岑夫子,丹丘生,将进酒,杯莫停。与君歌一曲,请君为我倾耳听。钟鼓馔玉不足贵,但愿长醉不愿醒。古来圣贤皆寂寞,惟有饮者留其名。陈王昔时宴平乐,斗酒十千恣欢谑。主人何为言少钱,径须沽取对君酌。五花马,千金裘,呼儿将出换美酒,与尔同销万古愁。<br>**----- End of picture text -----**<br>
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
Figure 12 | We retain DeepSeek-OCR's capabilities in general visual understanding, mainly including image description, object detection, grounding, etc. Meanwhile, because text-only data is included in training, DeepSeek-OCR's language capabilities are also retained. Note that since we do not include an SFT (supervised fine-tuning) stage, the model is not a chatbot, and some capabilities need completion-style prompts to be activated.
## **5. Discussion**
Our work represents an initial exploration into the boundaries of vision-text compression, investigating how many vision tokens are required to decode _𝑁_ text tokens. The preliminary results are encouraging: DeepSeek-OCR achieves near-lossless OCR compression at approximately 10× ratios, while 20× compression still retains 60% accuracy. These findings suggest promising directions for future applications, such as implementing optical processing for dialogue histories beyond _𝑘_ rounds in multi-turn conversations to achieve 10× compression efficiency.
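To make these ratios concrete, a toy calculation of the compression ratio follows; the per-mode vision-token budgets below are illustrative assumptions, not numbers stated in this section:

```python
# Toy estimate of vision-text compression. The vision-token budgets per
# resolution mode are illustrative assumptions for this sketch.
VISION_TOKENS = {"Tiny": 64, "Small": 100, "Base": 256, "Large": 400}


def compression_ratio(text_tokens: int, mode: str) -> float:
    """Text tokens decoded per vision token when a page is rendered in `mode`."""
    return text_tokens / VISION_TOKENS[mode]


# A page holding ~1000 text tokens rendered at "Small" resolution:
ratio = compression_ratio(1000, "Small")
print(f"{ratio:.0f}x compression")  # 10x -> the near-lossless regime reported above
```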
**==> Figure 13 picture intentionally omitted; its content is reconstructed as a table below <==**

| | Crystal Clear | Very Clear | Clear | Blurry | Very Blurry | Almost Gone |
| --- | --- | --- | --- | --- | --- | --- |
| **Memory** (time →) | Just happened | 1 hour | 1 day | 1 week | 1 month | 1 year |
| **Vision** (distance ↑) | 10 cm | 50 cm | 1 m | 3 m | 10 m | 20 m |
| **Text** (resolution ↓) | Text token | Gundam | Large | Base | Small | Tiny |
Figure 13 | Forgetting mechanisms constitute one of the most fundamental characteristics of human memory. The contexts optical compression approach can simulate this mechanism by rendering previous rounds of historical text onto images for initial compression, then progressively resizing older images to achieve multi-level compression, where token counts gradually decrease and text becomes increasingly blurred, thereby accomplishing textual forgetting.
For older contexts, we could progressively downsize the rendered images to further reduce token consumption. This assumption draws inspiration from the natural parallel between human memory decay over time and visual perception degradation over spatial distance: both exhibit similar patterns of progressive information loss, as shown in Figure 13. By combining these mechanisms, the contexts optical compression method enables a form of memory decay that mirrors biological forgetting curves, where recent information maintains high fidelity while distant memories naturally fade through increased compression ratios.
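A minimal sketch of such a decay schedule is given below, mapping the age of a dialogue round to a progressively coarser rendering mode. The mode names follow Figure 13; the age thresholds and token budgets are assumptions made purely for illustration:

```python
# Progressive "optical forgetting": older dialogue rounds get coarser modes.
# Mode names follow Figure 13; thresholds and token budgets are illustrative.
SCHEDULE = [  # (max_age_in_rounds, mode, assumed_vision_tokens)
    (1, "Gundam", 800),
    (4, "Large", 400),
    (16, "Base", 256),
    (64, "Small", 100),
    (256, "Tiny", 64),
]


def mode_for_age(age: int) -> tuple[str, int]:
    """Pick the rendering mode for a round that is `age` rounds in the past."""
    for max_age, mode, tokens in SCHEDULE:
        if age <= max_age:
            return mode, tokens
    return "dropped", 0  # beyond the horizon, the round is forgotten entirely


for age in (1, 8, 100, 500):
    print(age, "->", mode_for_age(age))
```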
While our initial exploration shows potential for scalable ultra-long context processing, where recent contexts preserve high resolution and older contexts consume fewer resources, we acknowledge this is early-stage work that requires further investigation. The approach suggests a path toward theoretically unlimited context architectures that balance information retention with computational constraints, though the practical implications and limitations of such vision-text compression systems warrant deeper study in future research.
## **6. Conclusion**
In this technical report, we propose DeepSeek-OCR and preliminarily validate the feasibility of contexts optical compression through this model, demonstrating that it can effectively decode more than 10 times the number of text tokens from a small set of vision tokens. We believe this finding will facilitate the development of VLMs and LLMs in the future. Additionally, DeepSeek-OCR is a highly practical model capable of large-scale pretraining-data production, serving as an indispensable assistant for LLMs. Of course, OCR alone is insufficient to fully validate true contexts optical compression, and we will conduct digital-optical text interleaved pretraining, needle-in-a-haystack testing, and other evaluations in the future. From another perspective, optical contexts compression still offers substantial room for research and improvement, representing a promising new direction.
docs/markdowns/sam3.md
ADDED

The diff for this file is too large to render. See raw diff.

docs/markdowns/sam3d.md
ADDED

The diff for this file is too large to render. See raw diff.
knowledge_base/chroma.py
ADDED
@@ -0,0 +1,57 @@
# Core LangChain components
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

from config import configs

if __name__ == "__main__":
    # --- 1. Load Documents ---
    print("Loading documents from directory...")
    loader = DirectoryLoader(
        path=configs["DATA_PATH"],
        glob="*.md",
        loader_cls=TextLoader,
        silent_errors=True,  # Set to False to surface loader errors
    )

    raw_documents = loader.load()
    if not raw_documents:
        print(f"Error: No documents found in {configs['DATA_PATH']}. Check your path and file types.")
        exit()

    # --- 2. Split Documents into Chunks ---
    print(f"Loaded {len(raw_documents)} raw documents. Splitting into chunks...")
    # Recursive splitting preserves more context than naive fixed-size splitting.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""],  # Sensible separators for markdown/text
    )

    documents_to_embed = text_splitter.split_documents(raw_documents)
    print(f"Split into {len(documents_to_embed)} chunks.")

    # --- 3. Define Custom Embedding Model ---
    print(f"Initializing custom embedding model: {configs['EMBEDDING_MODEL_NAME']}...")
    dense_embeddings = HuggingFaceEmbeddings(
        model_name=configs["EMBEDDING_MODEL_NAME"]
    )

    # --- 4. Create and Persist the Vector Store ---
    print(f"Creating Chroma vector store and persisting data to {configs['PERSIST_PATH']}...")
    vectorstore = Chroma.from_documents(
        documents=documents_to_embed,  # The prepared Document chunks
        embedding=dense_embeddings,
        collection_name=configs["COLLECTION_NAME"],
        persist_directory=configs["PERSIST_PATH"],
    )

    # With langchain_chroma, data is persisted automatically whenever
    # persist_directory is set (Chroma 0.4+); there is no persist() method.

    print("✅ Success: Chroma vector store created and data persisted.")
    print(f"The vector database is now ready for query using the collection: '{configs['COLLECTION_NAME']}'")
knowledge_base/embeddings.py
ADDED
@@ -0,0 +1,22 @@
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# 1. Define the custom embedding object
dense_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

if __name__ == "__main__":
    # 2. Initialize the LangChain Chroma vector store, passing the embeddings.
    # from_documents expects Document objects, not a path string; this
    # placeholder stands in for content loaded from ./docs/markdowns.
    placeholder_documents = [
        Document(
            page_content="Placeholder for actual document content",
            metadata={"source": "./docs/markdowns"},
        )
    ]
    vectorstore = Chroma.from_documents(
        documents=placeholder_documents,
        embedding=dense_embeddings,
        collection_name="langchain_mpnet_collection",
        persist_directory="./knowledge_base/chroma_data",
    )

    # 3. The database is saved automatically: with persist_directory set,
    # Chroma 0.4+ persists on write, so no explicit persist() call is needed.
    print("LangChain Chroma vector store created with custom embeddings and persisted.")
knowledge_base/prepare_documents.py
ADDED
@@ -0,0 +1,31 @@
import os
import glob
from pathlib import Path

import pymupdf
import pymupdf.layout  # retained from original: enables layout-aware extraction
import pymupdf4llm

os.environ["TOKENIZERS_PARALLELISM"] = "false"


def pdf_to_markdown(pdf_path, output_dir):
    """Convert a single PDF to markdown, dropping headers, footers and images."""
    doc = pymupdf.open(pdf_path)
    md = pymupdf4llm.to_markdown(
        doc,
        header=False,
        footer=False,
        page_separators=True,
        ignore_images=True,
        write_images=False,
        image_path=None,
    )
    # Drop lone surrogates that occasionally appear in extracted text.
    md_cleaned = md.encode("utf-8", errors="surrogatepass").decode("utf-8", errors="ignore")
    output_path = Path(output_dir) / Path(doc.name).stem
    output_path.with_suffix(".md").write_bytes(md_cleaned.encode("utf-8"))


def pdfs_to_markdowns(path_pattern, overwrite: bool = False):
    """Convert every PDF matching path_pattern, skipping existing outputs unless overwrite."""
    output_dir = Path("./docs/markdowns")
    output_dir.mkdir(parents=True, exist_ok=True)

    for pdf_path in map(Path, glob.glob(path_pattern)):
        md_path = (output_dir / pdf_path.stem).with_suffix(".md")
        if overwrite or not md_path.exists():
            print(f"Processing file: {pdf_path}")
            pdf_to_markdown(pdf_path, output_dir)


if __name__ == "__main__":
    pdfs_to_markdowns("./docs/pdf/*.pdf", overwrite=True)
main.py
ADDED
@@ -0,0 +1,6 @@
def main():
    print("Hello from rag-agent!")


if __name__ == "__main__":
    main()
notebook.ipynb
ADDED
@@ -0,0 +1,37 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "524b8568",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders import DirectoryLoader, TextLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "babc2558",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Y\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rag_agent",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
pyproject.toml
ADDED
@@ -0,0 +1,7 @@
[project]
name = "rag-agent"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10"
dependencies = []
requirements.txt
ADDED
@@ -0,0 +1,11 @@
langchain
langgraph
langchain-community  # DirectoryLoader/TextLoader imported in knowledge_base/chroma.py
langchain-huggingface
langchain-google-genai
langchain-chroma
fastapi
uvicorn
pydantic
chromadb
pymupdf
pymupdf4llm
uv.lock
ADDED
@@ -0,0 +1,8 @@
version = 1
revision = 2
requires-python = ">=3.10"

[[package]]
name = "rag-agent"
version = "0.1.0"
source = { virtual = "." }