Huy committed on
Commit
d8bb2be
·
1 Parent(s): c20735c

First commit

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
 *.zip filter=lfs diff=lfs merge=lfs -text
34
 *.zst filter=lfs diff=lfs merge=lfs -text
35
 *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ __pycache__/
2
+ .ipynb_checkpoints/
3
+ env/
4
+ .DS_Store
README.md CHANGED
@@ -1,13 +0,0 @@
1
- ---
2
- title: RAG ColPali
3
- emoji: 📊
4
- colorFrom: yellow
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 5.4.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,245 @@
1
+ import os
2
+ import torch
3
+ import base64
4
+ import asyncio
5
+ from io import BytesIO
6
+ import gradio as gr
7
+ import qdrant_client
8
+ from PIL import Image
9
+ from typing import List, Dict, Tuple
10
+
11
+ import llamaindex_utils
12
+ from rag_pipeline import async_indexDocument
13
+ from models import get_lora_model, enable_lora, ColPali, ColPaliProcessor
14
+ from utils import load_tokenizer
15
+
16
+ from llama_index.llms.gemini import Gemini
17
+ from llama_index.core.tools import RetrieverTool
18
+
19
+
20
+ GEMINI_API_KEY = os.getenv(key="GEMINI_API_KEY")
21
+ QDRANT_API_KEY = os.getenv(key="QDRANT_API_KEY")
22
+ device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
23
+
24
+ async def initialize_model() -> Dict:
25
+ """Initialize models
26
+
27
+ Returns:
28
+ model_dict: Dict: Dictionary storing the necessary models
29
+ """
30
+
31
+ model = ColPali.from_pretrained(model_dir='./pretrained/colpaligemma-3b-mix-448-base', torch_dtype=torch.bfloat16)
32
+ tokenizer = load_tokenizer(tokenizer_dir='./pretrained/colpaligemma-3b-mix-448-base')
33
+ processor = ColPaliProcessor(tokenizer=tokenizer).from_pretrained(pretrained_dir='./pretrained/colpaligemma-3b-mix-448-base')
34
+
35
+ model.model.language_model.model = get_lora_model(model.model.language_model.model,
36
+ rank=32,
37
+ alphas=32,
38
+ lora_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'down_proj', 'gate_proj', 'up_proj'],
39
+ training=False,
40
+ dropout_p=0.1,
41
+ torch_dtype=torch.bfloat16)
42
+ model.model.language_model.model = enable_lora(model.model.language_model.model, lora_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'down_proj', 'gate_proj', 'up_proj'], enabled=True)
43
+
44
+ model = get_lora_model(model,
45
+ rank=32,
46
+ alphas=32,
47
+ lora_modules=['custom_text_proj'],
48
+ training=False,
49
+ dropout_p=0.1,
50
+ torch_dtype=torch.bfloat16)
51
+
52
+ model = enable_lora(model, lora_modules=['custom_text_proj'], enabled=True)
53
+
54
+ model.load_lora('./pretrained/colpaligemma-3b-mix-448-base')
55
+
56
+ # Initialize LLM
57
+ generation_config = {
58
+ "temperature": 0.0,
59
+ "top_p": 0.95,
60
+ "top_k": 64,
61
+ "max_output_tokens": 1024,
62
+ "response_mime_type": "text/plain",
63
+ }
64
+
65
+ llm = Gemini(api_key=GEMINI_API_KEY, generation_config=generation_config)
66
+
67
+ # Setup Qdrant
68
+ # Creating Qdrant Client
69
+ vector_store_client = qdrant_client.AsyncQdrantClient(location="https://b3878645-ec71-426c-8afa-b8b3b7589e40.us-east4-0.gcp.cloud.qdrant.io",
70
+ api_key=QDRANT_API_KEY,
71
+ timeout=100)
72
+
73
+ embed_model = llamaindex_utils.ColPaliGemmaEmbedding(model=model,
74
+ processor=processor,
75
+ device=device)
76
+
77
+ collections = await get_collection_names(vector_store_client)
78
+ retrievers_dict = {}
79
+ for name in collections:
80
+ if name not in retrievers_dict:
81
+ retrievers_dict[name] = llamaindex_utils.ColPaliRetriever(vector_store_client=vector_store_client,
82
+ target_collection=name,
83
+ embed_model=embed_model,
84
+ similarity_top_k=3)
85
+ return {"llm": llm,
86
+ "vector_store_client": vector_store_client,
87
+ "model": model,
88
+ "processor": processor,
89
+ "embed_model": embed_model,
90
+ "collections": collections,
91
+ "retrievers_dict": retrievers_dict}
92
+
93
+ async def get_collection_names(vector_store_client):
94
+ collections = await vector_store_client.get_collections()
95
+ return [collection.name for collection in collections.collections]
96
+
97
+ async def index(files: List[str],
98
+ target_collection: str
99
+ ) -> Tuple[str, gr.Dropdown, List[str]]:
100
+ """
101
+ Insert all image pages from the uploaded files into the specified target collection in the vector store
102
+ and update the module-level retrievers mapping with a retriever for that collection.
103
+
104
+ Args:
105
+ files (List[str]): List of file path
106
+ target_collection (str): Target collection to insert into the vector store
107
+
108
+ Returns:
109
+ Tuple[str, gr.Dropdown, List[str]]: Status message, updated dropdown component, and the collections' names
110
+ """
111
+
112
+ for file in files:
113
+ await async_indexDocument(file_path=file,
114
+ vector_store_client=model_dict["vector_store_client"],
115
+ target_collection=target_collection,
116
+ model=model_dict["model"],
117
+ processor=model_dict["processor"],
118
+ device=device)
119
+
120
+ if target_collection not in retrievers:
121
+ retrievers[target_collection] = llamaindex_utils.ColPaliRetriever(vector_store_client=model_dict["vector_store_client"],
122
+ target_collection=target_collection,
123
+ embed_model=model_dict["embed_model"],
124
+ similarity_top_k=3)
125
+ collection_names = await get_collection_names(model_dict["vector_store_client"])
126
+ return (f"Uploaded and indexed {len(files)} files.",
127
+ gr.Dropdown(choices=collection_names),
128
+ collection_names)
129
+
130
+ async def search_with_llm(query: str,
131
+ similarity_top_k: int,
132
+ num_children: int) -> Tuple[str, List[Image.Image]]:
133
+ """Search for an answer to the query using the available retrievers.
134
+ Returns the search response and the list of images supporting that response.
135
+
136
+ Args:
137
+ query (str): Query question
138
+ retrievers (Dict[str, llamaindex_utils.ColPaliRetriever]): module-level mapping from retriever names to their instances (read from global state, not passed as a parameter)
139
+ similarity_top_k (int): top K similarity results retrieved from the retriever
140
+ num_children (int): number of children for tree summarization
141
+
142
+ Returns:
143
+ Tuple[str, List[Image.Image]]: The search response and the list of images supporting that response.
144
+ """
145
+ retriever_tools = [RetrieverTool.from_defaults(
146
+ name=key,
147
+ retriever=value,
148
+ description=f"Useful for retrieving information about {key} financials") for key, value in retrievers.items()]
149
+
150
+ retriever_mappings = {retriever_tool.metadata.name: retriever_tool.retriever for retriever_tool in retriever_tools}
151
+
152
+ fusion_retriever = llamaindex_utils.CustomFusionRetriever(llm=model_dict["llm"],
153
+ retriever_mappings=retriever_mappings,
154
+ similarity_top_k=similarity_top_k)
155
+
156
+ query_engine = llamaindex_utils.CustomQueryEngine(retriever_tools=[retriever_tool.metadata for retriever_tool in retriever_tools],
157
+ fusion_retriever=fusion_retriever,
158
+ llm=model_dict["llm"],
159
+ num_children=num_children)
160
+ response = await query_engine.aquery(query_str=query)
161
+
162
+ return response.response, [Image.open(BytesIO(base64.b64decode(image))) for image in response.source_images]
163
+
164
+
165
+ def build_gui():
166
+ with gr.Blocks() as demo:
167
+ gr.Markdown("# Image Based RAG System using ColPali 📚🔍")
168
+ with gr.Row(equal_height=True):
169
+ with gr.Column():
170
+ gr.Markdown("## 1️. Upload PDFs")
171
+ files = gr.File(file_types=["pdf"],
172
+ file_count="multiple",
173
+ interactive=True)
174
+
175
+ choices = gr.State(value=model_dict["collections"])
176
+ gr.Markdown("## 2️. Index the PDFs and upload")
177
+ target_collection = gr.Dropdown(choices=choices.value,
178
+ allow_custom_value=True,
179
+ label="Collection name",
180
+ show_label=True,
181
+ interactive=True)
182
+
183
+ message_box = gr.Textbox(value="File not yet uploaded",
184
+ show_label=False,
185
+ interactive=False)
186
+ convert_button = gr.Button("🔄 Convert and upload")
187
+
188
+ # Define the actions for conversion
189
+ convert_button.click(index, inputs=[files, target_collection], outputs=[message_box, target_collection, choices])
190
+
191
+ with gr.Column():
192
+ gr.Markdown("## 3️. Enter your question")
193
+ query = gr.Textbox(placeholder="Enter your query to match",
194
+ lines=15,
195
+ max_lines=20,
196
+ autoscroll=True)
197
+ with gr.Accordion(label="Additional Settings", open=False):
198
+ similarity_top_k = gr.Slider(minimum=1,
199
+ maximum=10,
200
+ value=3,
201
+ step=1.0,
202
+ label="Top K similarity retrieved from the retriever")
203
+
204
+ num_children = gr.Slider(minimum=1,
205
+ maximum=10,
206
+ value=3,
207
+ step=1.0,
208
+ label="Set number of children for Tree Summarization")
209
+ search_button = gr.Button("🔍 Search")
210
+
211
+ gr.Markdown("## 4️. ColPali Retrieval")
212
+ with gr.Row(equal_height=True):
213
+ output_text = gr.Textbox(label="Query result",
214
+ show_label=True,
215
+ placeholder="Response from query",
216
+ lines=8,
217
+ max_lines=20,
218
+ interactive=False)
219
+ output_imgs = gr.Gallery(label="Most relevant images are...",
220
+ show_fullscreen_button=True,
221
+ show_label=True,
222
+ show_download_button=True,
223
+ interactive=False)
224
+
225
+
226
+ # Action for search button
227
+ search_button.click(
228
+ search_with_llm,
229
+ inputs=[query, similarity_top_k, num_children],
230
+ outputs=[output_text, output_imgs])
231
+ return demo
232
+
233
+ async def amain():
234
+ global model_dict, retrievers
235
+ model_dict = await initialize_model()
236
+ retrievers = model_dict["retrievers_dict"]
237
+
238
+ demo = build_gui()
239
+ demo.queue().launch(debug=True, share=False)
240
+
241
+
242
+ if __name__ == "__main__":
243
+ asyncio.run(amain())
244
+
245
+
env.yaml ADDED
@@ -0,0 +1,241 @@
1
+
2
+ channels:
3
+ - defaults
4
+ dependencies:
5
+ - bzip2=1.0.8=h80987f9_6
6
+ - ca-certificates=2024.7.2=hca03da5_0
7
+ - libffi=3.4.4=hca03da5_1
8
+ - ncurses=6.4=h313beb8_0
9
+ - openssl=3.0.15=h80987f9_0
10
+ - pip=24.2=py311hca03da5_0
11
+ - python=3.11.9=hb885b13_0
12
+ - readline=8.2=h1a28f6b_0
13
+ - setuptools=75.1.0=py311hca03da5_0
14
+ - sqlite=3.45.3=h80987f9_0
15
+ - tk=8.6.14=h6ba3021_0
16
+ - wheel=0.44.0=py311hca03da5_0
17
+ - xz=5.4.6=h80987f9_1
18
+ - zlib=1.2.13=h18a0788_1
19
+ - pip:
20
+ - accelerate==1.1.0
21
+ - aiofiles==23.2.1
22
+ - aiohappyeyeballs==2.4.3
23
+ - aiohttp==3.10.10
24
+ - aiosignal==1.3.1
25
+ - annotated-types==0.7.0
26
+ - anyio==4.6.2.post1
27
+ - appnope==0.1.4
28
+ - argon2-cffi==23.1.0
29
+ - argon2-cffi-bindings==21.2.0
30
+ - arrow==1.3.0
31
+ - asttokens==2.4.1
32
+ - async-lru==2.0.4
33
+ - attrs==24.2.0
34
+ - babel==2.16.0
35
+ - beautifulsoup4==4.12.3
36
+ - bleach==6.2.0
37
+ - cachetools==5.5.0
38
+ - certifi==2024.8.30
39
+ - cffi==1.17.1
40
+ - charset-normalizer==3.4.0
41
+ - click==8.1.7
42
+ - comm==0.2.2
43
+ - contourpy==1.3.0
44
+ - cycler==0.12.1
45
+ - dataclasses-json==0.6.7
46
+ - datasets==3.0.1
47
+ - debugpy==1.8.7
48
+ - decorator==5.1.1
49
+ - defusedxml==0.7.1
50
+ - deprecated==1.2.14
51
+ - dill==0.3.8
52
+ - dirtyjson==1.0.8
53
+ - distro==1.9.0
54
+ - executing==2.1.0
55
+ - fastapi==0.115.4
56
+ - fastjsonschema==2.20.0
57
+ - ffmpy==0.4.0
58
+ - filelock==3.16.1
59
+ - fonttools==4.54.1
60
+ - fqdn==1.5.1
61
+ - frozenlist==1.5.0
62
+ - fsspec==2024.6.1
63
+ - google-ai-generativelanguage==0.6.4
64
+ - google-api-core==2.20.0
65
+ - google-api-python-client==2.147.0
66
+ - google-auth==2.35.0
67
+ - google-auth-httplib2==0.2.0
68
+ - google-generativeai==0.5.4
69
+ - googleapis-common-protos==1.65.0
70
+ - gradio==4.44.1
71
+ - gradio-client==1.3.0
72
+ - greenlet==3.1.1
73
+ - grpcio==1.67.1
74
+ - grpcio-status==1.62.3
75
+ - grpcio-tools==1.62.3
76
+ - h11==0.14.0
77
+ - h2==4.1.0
78
+ - hpack==4.0.0
79
+ - httpcore==1.0.6
80
+ - httplib2==0.22.0
81
+ - httpx==0.27.2
82
+ - huggingface-hub==0.26.2
83
+ - hyperframe==6.0.1
84
+ - idna==3.10
85
+ - importlib-resources==6.4.5
86
+ - instructorembedding==1.0.1
87
+ - ipykernel==6.29.5
88
+ - ipython==8.29.0
89
+ - isoduration==20.11.0
90
+ - jedi==0.19.1
91
+ - jinja2==3.1.4
92
+ - jiter==0.7.0
93
+ - joblib==1.4.2
94
+ - json5==0.9.25
95
+ - jsonpointer==3.0.0
96
+ - jsonschema==4.23.0
97
+ - jsonschema-specifications==2024.10.1
98
+ - jupyter-client==8.6.3
99
+ - jupyter-core==5.7.2
100
+ - jupyter-events==0.10.0
101
+ - jupyter-lsp==2.2.5
102
+ - jupyter-server==2.14.2
103
+ - jupyter-server-terminals==0.5.3
104
+ - jupyterlab==4.2.5
105
+ - jupyterlab-pygments==0.3.0
106
+ - jupyterlab-server==2.27.3
107
+ - kiwisolver==1.4.7
108
+ - llama-cloud==0.1.2
109
+ - llama-index==0.11.17
110
+ - llama-index-agent-openai==0.3.4
111
+ - llama-index-cli==0.3.1
112
+ - llama-index-core==0.11.17
113
+ - llama-index-embeddings-huggingface==0.3.1
114
+ - llama-index-embeddings-instructor==0.2.1
115
+ - llama-index-embeddings-openai==0.2.5
116
+ - llama-index-indices-managed-llama-cloud==0.4.0
117
+ - llama-index-legacy==0.9.48.post3
118
+ - llama-index-llms-gemini==0.3.7
119
+ - llama-index-llms-openai==0.2.13
120
+ - llama-index-multi-modal-llms-gemini==0.3.1
121
+ - llama-index-multi-modal-llms-openai==0.2.2
122
+ - llama-index-postprocessor-colbert-rerank==0.2.1
123
+ - llama-index-program-openai==0.2.0
124
+ - llama-index-question-gen-openai==0.2.0
125
+ - llama-index-readers-file==0.2.2
126
+ - llama-index-readers-llama-parse==0.3.0
127
+ - llama-index-vector-stores-qdrant==0.3.1
128
+ - llama-parse==0.5.7
129
+ - markdown-it-py==3.0.0
130
+ - markupsafe==2.1.5
131
+ - marshmallow==3.23.1
132
+ - matplotlib==3.9.2
133
+ - matplotlib-inline==0.1.7
134
+ - mdurl==0.1.2
135
+ - mistune==3.0.2
136
+ - mpmath==1.3.0
137
+ - multidict==6.1.0
138
+ - multiprocess==0.70.16
139
+ - mypy-extensions==1.0.0
140
+ - nbclient==0.10.0
141
+ - nbconvert==7.16.4
142
+ - nbformat==5.10.4
143
+ - nest-asyncio==1.6.0
144
+ - networkx==3.4.2
145
+ - nltk==3.9.1
146
+ - notebook==7.2.2
147
+ - notebook-shim==0.2.4
148
+ - numpy==1.26.4
149
+ - openai==1.53.0
150
+ - orjson==3.10.11
151
+ - overrides==7.7.0
152
+ - packaging==24.1
153
+ - pandas==2.2.3
154
+ - pandocfilters==1.5.1
155
+ - parso==0.8.4
156
+ - pdf2image==1.17.0
157
+ - peft==0.11.1
158
+ - pexpect==4.9.0
159
+ - pillow==10.4.0
160
+ - platformdirs==4.3.6
161
+ - portalocker==2.10.1
162
+ - prometheus-client==0.21.0
163
+ - prompt-toolkit==3.0.48
164
+ - propcache==0.2.0
165
+ - proto-plus==1.24.0
166
+ - protobuf==4.25.5
167
+ - psutil==6.0.0
168
+ - ptyprocess==0.7.0
169
+ - pure-eval==0.2.3
170
+ - pyarrow==17.0.0
171
+ - pyasn1==0.6.1
172
+ - pyasn1-modules==0.4.1
173
+ - pycparser==2.22
174
+ - pydantic==2.9.2
175
+ - pydantic-core==2.23.4
176
+ - pydub==0.25.1
177
+ - pygments==2.18.0
178
+ - pyparsing==3.1.4
179
+ - pypdf==4.3.1
180
+ - python-dateutil==2.9.0.post0
181
+ - python-json-logger==2.0.7
182
+ - python-multipart==0.0.12
183
+ - pytz==2024.2
184
+ - pyyaml==6.0.2
185
+ - pyzmq==26.2.0
186
+ - qdrant-client==1.12.0
187
+ - referencing==0.35.1
188
+ - regex==2024.9.11
189
+ - requests==2.32.3
190
+ - rfc3339-validator==0.1.4
191
+ - rfc3986-validator==0.1.1
192
+ - rich==13.9.4
193
+ - rpds-py==0.20.1
194
+ - rsa==4.9
195
+ - ruff==0.7.2
196
+ - safetensors==0.4.5
197
+ - scikit-learn==1.5.2
198
+ - scipy==1.14.1
199
+ - semantic-version==2.10.0
200
+ - send2trash==1.8.3
201
+ - sentence-transformers==2.7.0
202
+ - shellingham==1.5.4
203
+ - six==1.16.0
204
+ - sniffio==1.3.1
205
+ - soupsieve==2.6
206
+ - sqlalchemy==2.0.36
207
+ - stack-data==0.6.3
208
+ - starlette==0.41.2
209
+ - striprtf==0.0.26
210
+ - sympy==1.13.3
211
+ - tenacity==8.5.0
212
+ - terminado==0.18.1
213
+ - threadpoolctl==3.5.0
214
+ - tiktoken==0.8.0
215
+ - tinycss2==1.4.0
216
+ - tokenizers==0.20.1
217
+ - tomlkit==0.12.0
218
+ - torch==2.4.1
219
+ - torchinfo==1.8.0
220
+ - torchvision==0.19.1
221
+ - tornado==6.4.1
222
+ - tqdm==4.66.5
223
+ - traitlets==5.14.3
224
+ - transformers==4.45.1
225
+ - typer==0.12.5
226
+ - types-python-dateutil==2.9.0.20241003
227
+ - typing-extensions==4.12.2
228
+ - typing-inspect==0.9.0
229
+ - tzdata==2024.2
230
+ - uri-template==1.3.0
231
+ - uritemplate==4.1.1
232
+ - urllib3==2.2.3
233
+ - uvicorn==0.32.0
234
+ - wcwidth==0.2.13
235
+ - webcolors==24.8.0
236
+ - webencodings==0.5.1
237
+ - websocket-client==1.8.0
238
+ - websockets==12.0
239
+ - wrapt==1.16.0
240
+ - xxhash==3.5.0
241
+ - yarl==1.17.1
llamaindex_utils.py ADDED
@@ -0,0 +1,558 @@
1
+ import torch
2
+ import json
3
+ import asyncio
4
+ import qdrant_client
5
+ from PIL import Image
6
+ from pydantic import PrivateAttr, Field
7
+ from typing import Union, Optional, List, Any, Dict, Set
8
+ from dataclasses import dataclass
9
+
10
+ from llama_index.core.vector_stores.types import VectorStoreQueryResult
11
+ from llama_index.core.vector_stores.utils import (
12
+ legacy_metadata_dict_to_node,
13
+ metadata_dict_to_node,
14
+ )
15
+ from llama_index.core.embeddings import BaseEmbedding
16
+ from llama_index.core.retrievers import BaseRetriever
17
+ from llama_index.core import QueryBundle, PromptTemplate
18
+ from llama_index.core.schema import NodeWithScore, TextNode
19
+ from llama_index.core.llms import LLM
20
+ from llama_index.core.question_gen import LLMQuestionGenerator
21
+ from llama_index.core.tools import ToolMetadata
22
+ from llama_index.core.output_parsers.utils import parse_json_markdown
23
+ from llama_index.core.question_gen.types import SubQuestion
24
+
25
+ from models import ColPali, ColPaliProcessor
26
+ from prompt_templates import (DEFAULT_GEN_PROMPT_TMPL,
27
+ DEFAULT_FINAL_ANSWER_PROMPT_TMPL,
28
+ DEFAULT_SUB_QUESTION_PROMPT_TMPL,
29
+ DEFAULT_SYNTHESIZE_PROMPT_TMPL)
30
+ from typing import Any, List, Optional, Tuple, cast
31
+ from qdrant_client.http.models import Payload
32
+
33
+ from collections import defaultdict
34
+
35
+ def parse_to_query_result(response: List[Any]) -> VectorStoreQueryResult:
36
+ """
37
+ Convert vector store response to VectorStoreQueryResult.
38
+
39
+ Args:
40
+ response: List[Any]: List of results returned from the vector store.
41
+ """
42
+ nodes = []
43
+ similarities = []
44
+ ids = []
45
+
46
+ for point in response:
47
+ payload = cast(Payload, point.payload)
48
+ try:
49
+ node = metadata_dict_to_node(payload)
50
+ except Exception:
51
+ metadata, node_info, relationships = legacy_metadata_dict_to_node(
52
+ payload
53
+ )
54
+
55
+ node = TextNode(
56
+ id_=str(point.id),
57
+ text=payload.get("text"),
58
+ metadata=metadata,
59
+ start_char_idx=node_info.get("start", None),
60
+ end_char_idx=node_info.get("end", None),
61
+ relationships=relationships,
62
+ )
63
+ nodes.append(node)
64
+ ids.append(str(point.id))
65
+ try:
66
+ similarities.append(point.score)
67
+ except AttributeError:
68
+ # certain requests do not return a score
69
+ similarities.append(1.0)
70
+
71
+ return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)
72
+
73
+
74
+ class ColPaliGemmaEmbedding(BaseEmbedding):
75
+ _model: ColPali = PrivateAttr()
76
+ _processor: ColPaliProcessor = PrivateAttr()
77
+
78
+ device: Union[torch.device, str] = Field(default="cpu",
79
+ description="Device to use")
80
+ def __init__(self,
81
+ model: ColPali,
82
+ processor: ColPaliProcessor,
83
+ device: Optional[str] = 'cpu',
84
+ **kwargs):
85
+ super().__init__(device=device,
86
+ **kwargs)
87
+ self._model = model.to(device).eval()
88
+ self._processor = processor
89
+
90
+ @classmethod
91
+ def class_name(cls) -> str:
92
+ return "ColPaliGemmaEmbedding"
93
+
94
+ def _get_query_embedding(self, query: str) -> List[float]:
95
+ """Get query embedding.
96
+
97
+ Args:
98
+ query (str): Query String
99
+ """
100
+ with torch.no_grad():
101
+ processed_query = self._processor.process_queries([query])
102
+ processed_query = {k: v.to(self.device) for k, v in processed_query.items()}
103
+ query_embeddings = self._model(**processed_query)
104
+ return query_embeddings.to('cpu')[0]
105
+
106
+ def _get_text_embedding(self, text: str) -> List[float]:
107
+ """Get text embedding.
108
+
109
+ Args:
110
+ text (str): Text String
111
+ """
112
+ with torch.no_grad():
113
+ processed_query = self._processor.process_queries([text])
114
+ processed_query = {k: v.to(self.device) for k, v in processed_query.items()}
115
+ query_embeddings = self._model(**processed_query)
116
+ return query_embeddings.to('cpu')[0]
117
+
118
+ def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
119
+ """Get text embeddings.
120
+
121
+ Args:
122
+ texts (List[str]): List of text string
123
+ """
124
+ with torch.no_grad():
125
+ processed_queries = self._processor.process_queries(texts)
126
+ processed_queries = {k: v.to(self.device) for k, v in processed_queries.items()}
127
+ query_embeddings = self._model(**processed_queries)
128
+ return query_embeddings.to('cpu')
129
+
130
+ async def _aget_query_embedding(self, query: str) -> List[float]:
131
+ return self._get_query_embedding(query)
132
+
133
+ async def _aget_text_embedding(self, text: str) -> List[float]:
134
+ return self._get_text_embedding(text)
135
+
136
+ class ColPaliRetriever(BaseRetriever):
137
+ def __init__(self,
138
+ vector_store_client: Union[qdrant_client.QdrantClient, qdrant_client.AsyncQdrantClient],
139
+ target_collection: str,
140
+ embed_model: ColPaliGemmaEmbedding,
141
+ query_mode: str = 'default',
142
+ similarity_top_k: int = 3,
143
+ ) -> None:
144
+ self._vector_store_client = vector_store_client
145
+ self._target_collection = target_collection
146
+ self._embed_model = embed_model
147
+ self._query_mode = query_mode
148
+ self._similarity_top_k = similarity_top_k
149
+ super().__init__()
150
+
151
+ def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
152
+ """Get retrieved nodes from the vector store given the query string.
153
+
154
+ Args:
155
+ query_bundle (QueryBundle): QueryBundle class includes query string
156
+
157
+ Returns:
158
+ List[NodeWithScore]: List of retrieved nodes.
159
+ """
160
+ if query_bundle.embedding is None:
161
+ query_embedding = self._embed_model._get_query_embedding(query_bundle.query_str)
162
+ else:
163
+ query_embedding = query_bundle.embedding
164
+
165
+
166
+ query_embedding = query_embedding.cpu().float().numpy().tolist()
167
+
168
+ # Get nodes from vector store
169
+ response = self._vector_store_client.query_points(collection_name=self._target_collection,
170
+ query=query_embedding,
171
+ limit=self._similarity_top_k).points
172
+ # Parse to structured output nodes
173
+ query_result = parse_to_query_result(response)
174
+ nodes_with_scores = []
175
+ for idx, node in enumerate(query_result.nodes):
176
+ score = None
177
+ if query_result.similarities is not None:
178
+ score = query_result.similarities[idx]
179
+ nodes_with_scores.append(NodeWithScore(node=node, score=score))
180
+ return nodes_with_scores
181
+
182
+ async def _aretrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
183
+ """Asynchronously get retrieved nodes from the vector store given the query string.
184
+
185
+ Args:
186
+ query_bundle (QueryBundle): QueryBundle class includes query string
187
+
188
+ Returns:
189
+ List[NodeWithScore]: List of retrieved nodes.
190
+ """
191
+ if query_bundle.embedding is None:
192
+ query_embedding = await self._embed_model._aget_query_embedding(query_bundle.query_str)
193
+ else:
194
+ query_embedding = query_bundle.embedding
195
+
196
+ query_embedding = query_embedding.cpu().float().numpy().tolist()
197
+
198
+ # Get nodes from vector store
199
+ responses = await self._vector_store_client.query_points(collection_name=self._target_collection,
200
+ query=query_embedding,
201
+ limit=self._similarity_top_k)
202
+
203
+ responses = responses.points
204
+ # Parse to structured output nodes
205
+ query_result = parse_to_query_result(responses)
206
+ nodes_with_scores = []
207
+ for idx, node in enumerate(query_result.nodes):
208
+ score = None
209
+ if query_result.similarities is not None:
210
+ score = query_result.similarities[idx]
211
+ nodes_with_scores.append(NodeWithScore(node=node, score=score))
212
+ return nodes_with_scores
213
+
214
+
215
+ def fuse_results(retrieved_nodes: List[NodeWithScore], similarity_top_k: int) -> List[NodeWithScore]:
216
+ """Fuse retrieved nodes using Reciprocal Rank Fusion (RRF)
217
+
218
+ Args:
219
+ retrieved_nodes (List[NodeWithScore]): List of nodes.
220
+ similarity_top_k (int): get top K nodes.
221
+
222
+ Returns:
223
+ List[NodeWithScore]: List of nodes after fusion
224
+ """
225
+ k = 60.0
226
+ fused_scores = {}
227
+ text_to_node = {}
228
+ for rank, node_with_score in enumerate(sorted(retrieved_nodes, key=lambda x: x.score or 0.0, reverse=True)):
229
+ text = node_with_score.node.get_content(metadata_mode='all')
230
+ text_to_node[text] = node_with_score
231
+ fused_scores[text] = fused_scores.get(text, 0.0) + 1.0 / (rank + k)
232
+
233
+ # Sort results by calculated score
234
+ reranked_results = dict(sorted(fused_scores.items(), key=lambda x: x[1], reverse=True))
235
+ reranked_nodes: List[NodeWithScore] = []
236
+ for text, score in reranked_results.items():
237
+ reranked_nodes.append(text_to_node[text])
238
+ reranked_nodes[-1].score = score
239
+ return reranked_nodes[:similarity_top_k]
240
+
241
+
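# Illustration (not part of this commit): a minimal, self-contained sketch of the
# reciprocal-rank accumulation performed by fuse_results above, with plain strings
# standing in for NodeWithScore objects. A text retrieved by several rewritten
# queries accumulates 1 / (rank + k) more than once and is therefore ranked higher.
def toy_rrf(ranked_texts, k=60.0, top_k=3):
    fused = {}
    for rank, text in enumerate(ranked_texts):
        fused[text] = fused.get(text, 0.0) + 1.0 / (rank + k)
    return sorted(fused.items(), key=lambda x: x[1], reverse=True)[:top_k]

print(toy_rrf(["page_7", "page_2", "page_7", "page_9"]))  # "page_7" wins: two contributions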
242
+ def generate_queries(llm: LLM, query: str, num_queries: int) -> List[str]:
243
+ """Generate num_queries queries
244
+
245
+ Args:
246
+ llm (LLM): LLM model
247
+ query (str): query string
248
+ num_queries (int): Number of queries to generate
249
+
250
+ Returns:
251
+ generate_queries List[str]: List of generated queries
252
+ """
253
+ query_prompt = PromptTemplate(DEFAULT_GEN_PROMPT_TMPL)
254
+ generate_queries = llm.predict(query_prompt,
255
+ num_queries=num_queries,
256
+ query=query)
257
+ generate_queries = generate_queries.split('\n')
258
+ return generate_queries
259
+
260
+ async def agenerate_queries(llm: LLM, query: str, num_queries: int) -> List[str]:
261
+ """Asynchronously generate num_queries queries
262
+
263
+ Args:
264
+ llm (LLM): LLM model
265
+ query (str): query string
266
+ num_queries (int): Number of queries to generate
267
+
268
+ Returns:
269
+ generate_queries List[str]: List of generated queries
270
+ """
271
+ query_prompt = PromptTemplate(DEFAULT_GEN_PROMPT_TMPL)
272
+ generate_queries = await llm.apredict(query_prompt,
273
+ num_queries=num_queries,
274
+ query=query)
275
+ generate_queries = generate_queries.split('\n')
276
+ return generate_queries
277
+
278
+
279
+ # Tree Summarization
280
+ def synthesize_results(queries: List[SubQuestion], contexts: Dict[str, Set[str]], llm: LLM, num_children: int) -> Tuple[str, List[str]]:
281
+ """Summarize the results generated from LLM.
282
+
283
+ Args:
284
+ queries (List[SubQuestion]): Generated results
285
+ contexts (Dict[str, Set[str]]): Dictionary maps context information string to its set of source images
286
+ llm (LLM): LLM Model
287
+ num_children (int): Number of children for Tree Summarization
288
+
289
+ Returns:
290
+ Tuple[str, List[str]]: Synthesized text, set of source images.
291
+ """
292
+ qa_prompt = PromptTemplate(DEFAULT_SYNTHESIZE_PROMPT_TMPL)
293
+
294
+ new_contexts = defaultdict(set)
295
+ keys = list(contexts.keys())
296
+ for idx in range(0, len(keys), num_children):
297
+ contexts_batch = keys[idx: idx + num_children]
298
+ context_str = '\n\n'.join([f"{i + 1}. {text}" for i, text in enumerate(contexts_batch)])
299
+
300
+ fmt_qa_prompt = qa_prompt.format(context_str=context_str, query_str="\n".join([query.sub_question for query in queries]))
301
+ combined_result = llm.complete(fmt_qa_prompt)
302
+
303
+ # Parse json string to dictionary
304
+ json_dict = parse_json_markdown(str(combined_result))
305
+ if len(json_dict['choices']) > 0:
306
+ for choice in json_dict['choices']:
307
+ new_contexts[json_dict['summarized_text']] = new_contexts[json_dict['summarized_text']].union(contexts[contexts_batch[choice - 1]])
308
+ else:
309
+ new_contexts[json_dict['summarized_text']] = set()
310
+
311
+ if len(new_contexts) == 1:
312
+ synthesized_text = list(new_contexts.keys())[0]
313
+ return synthesized_text, list(new_contexts[synthesized_text])
314
+ else:
315
+ return synthesize_results(queries, new_contexts, llm, num_children=num_children)
316
+
317
+
318
+ async def asynthesize_results(queries: List[SubQuestion], contexts: Dict[str, Set[str]], llm: LLM, num_children: int) -> Tuple[str, List[str]]:
319
+ """Asynchronously summarize the results generated from LLM.
320
+
321
+ Args:
322
+ queries (List[SubQuestion]): Generated results
323
+ contexts (Dict[str, Set[str]]): Dictionary maps context information string to its set of source images
324
+ llm (LLM): LLM Model
325
+ num_children (int): Number of children for Tree Summarization
326
+
327
+ Returns:
328
+ Tuple[str, List[str]]: Synthesized text, set of source images.
329
+ """
330
+ qa_prompt = PromptTemplate(DEFAULT_SYNTHESIZE_PROMPT_TMPL)
331
+ fmt_qa_prompts = []
332
+ keys = list(contexts.keys())
333
+ contexts_batches = []
334
+ for idx in range(0, len(keys), num_children):
335
+ contexts_batch = keys[idx: idx + num_children]
336
+
337
+ context_str = '\n\n'.join([f"{idx + 1}. {text}" for idx, text in enumerate(contexts_batch)])
338
+
339
+ fmt_qa_prompt = qa_prompt.format(context_str=context_str, query_str="\n".join([query.sub_question for query in queries]))
340
+ fmt_qa_prompts.append(fmt_qa_prompt)
341
+ contexts_batches.append(contexts_batch)
342
+
343
+ tasks = []
344
+ async with asyncio.TaskGroup() as tg:
345
+ for fmt_qa_prompt in fmt_qa_prompts:
346
+ task = tg.create_task(llm.acomplete(fmt_qa_prompt))
347
+ tasks.append(task)
348
+
349
+ responses = [str(task.result()) for task in tasks]
350
+ new_contexts = defaultdict(set)
351
+ for idx, response in enumerate(responses):
352
+ # Parse json string to dictionary
353
+ json_dict = parse_json_markdown(response)
354
+
355
+ if len(json_dict["choices"]) > 0:
356
+ for choice in json_dict["choices"]:
357
+ new_contexts[json_dict["summarized_text"]] = new_contexts[json_dict["summarized_text"]].union(contexts[contexts_batches[idx][choice - 1]])
358
+ else:
359
+ new_contexts[json_dict["summarized_text"]] = set()
360
+
361
+ if len(new_contexts) == 1:
362
+ synthesized_text = list(new_contexts.keys())[0]
363
+ return synthesized_text, list(new_contexts[synthesized_text])
364
+ else:
365
+ return await asynthesize_results(queries, new_contexts, llm, num_children=num_children)
366
+
367
+ class CustomFusionRetriever(BaseRetriever):
368
+ def __init__(self,
369
+ llm,
370
+ retriever_mappings: Dict[str, BaseRetriever],
371
+ similarity_top_k: int = 3,
372
+ num_generated_queries = 3,
373
+ ) -> None:
374
+ self._retriever_mappings = retriever_mappings
375
+ self._similarity_top_k = similarity_top_k
376
+ self._num_generated_queries = num_generated_queries
377
+ self._llm = llm
378
+ super().__init__()
379
+
380
+ def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
381
+ """Retrieve self._similarity_top_k content nodes given query
382
+
383
+ Args:
384
+ query_bundle (QueryBundle): query bundle include query string
385
+ """
386
+ # Get data from query bundle
387
+ query_dict = json.loads(query_bundle.query_str)
388
+ original_query = query_dict['sub_question']
389
+ tool_name = query_dict['tool_name']
390
+
391
+ # Rewrite original query to n queries
392
+ generated_queries = generate_queries(self._llm, original_query, num_queries=self._num_generated_queries)
393
+
394
+ # For each generated query, retrieve relevant nodes
395
+ retrieved_nodes = []
396
+ for query in generated_queries:
397
+ if len(query) == 0:
398
+ continue
399
+ retrieved_nodes.extend(self._retriever_mappings[tool_name].retrieve(query))
400
+
401
+ # Fuse retrieved nodes using reciprocal rank
402
+ fused_results = fuse_results(retrieved_nodes,
403
+ similarity_top_k=self._similarity_top_k)
404
+ return fused_results
405
+
406
+ async def _aretrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
407
+ """Asynchronously retrieve self._similarity_top_k content nodes given query
408
+
409
+ Args:
410
+ query_bundle (QueryBundle): query bundle include query string
411
+ """
412
+ # Get data from query bundle
413
+ query_dict = json.loads(query_bundle.query_str)
414
+ original_query = query_dict['sub_question']
415
+ tool_name = query_dict['tool_name']
416
+
417
+ # Rewrite original query to n queries
418
+ generated_queries = await agenerate_queries(llm=self._llm, query=original_query, num_queries=self._num_generated_queries)
419
+
420
+ # For each generated query, retrieve relevant nodes
421
+ tasks = []
422
+ async with asyncio.TaskGroup() as tg:
423
+ for query in generated_queries:
424
+ if len(query) == 0:
425
+ continue
426
+ task = tg.create_task(self._retriever_mappings[tool_name].aretrieve(query))
427
+ tasks.append(task)
428
+
429
+ retrieved_nodes = [node for task in tasks for node in task.result()]
430
+
431
+ # Fuse retrieved nodes using reciprocal rank
432
+ fused_results = fuse_results(retrieved_nodes,
433
+ similarity_top_k=self._similarity_top_k)
434
+ return fused_results
435
+
436
+
437
+ @dataclass
438
+ class Response:
439
+ response: str
440
+ source_images: Optional[List] = None
441
+
442
+ def __str__(self):
443
+ return self.response
444
+
445
+ class CustomQueryEngine:
446
+ def __init__(self,
447
+ retriever_tools: List[ToolMetadata],
448
+ fusion_retriever: BaseRetriever,
449
+ qa_prompt: PromptTemplate = None,
450
+ llm: LLM = None,
451
+ num_children: int = 3):
452
+ self._qa_prompt = qa_prompt if qa_prompt else PromptTemplate(DEFAULT_FINAL_ANSWER_PROMPT_TMPL)
453
+ self._llm = llm
454
+ self._num_children = num_children
455
+ self._sub_question_generator = LLMQuestionGenerator.from_defaults(llm=self._llm,
456
+ prompt_template_str=DEFAULT_SUB_QUESTION_PROMPT_TMPL)
457
+ self._fusion_retriever = fusion_retriever
458
+ self._retriever_tools = retriever_tools
459
+
460
+
461
+ def query(self, query_str: str) -> Response:
462
+ # Generate sub queries
463
+ sub_queries = self._sub_question_generator.generate(tools=self._retriever_tools,
464
+ query=QueryBundle(query_str=query_str))
465
+
466
+ if len(sub_queries) == 0:
467
+ response_template = PromptTemplate("Cannot answer the query: {query_str}")
468
+ return Response(response=response_template.format(query_str=query_str), source_images=[])
469
+ else:
470
+ # Dictionary to map response -> source_images
471
+ response2images_mapping = defaultdict(set)
472
+
473
+ # For each sub query, retrieve relevant image nodes
474
+ # With fusion retriever, each sub query is rewritten to n queries -> retrieve relevant nodes for each generated query
475
+ # -> fuse all nodes retrieved from multiple generated queries using reciprocal rank -> get top k results
476
+ for sub_query in sub_queries:
477
+ retrieved_nodes = self._fusion_retriever.retrieve(QueryBundle(query_str=sub_query.model_dump_json()))
478
+ # Using LLM to get the answer for sub query from retrieved nodes
479
+ for retrieved_node in retrieved_nodes:
480
+ response2images_mapping[str(self._llm.complete([sub_query.sub_question, Image.open(retrieved_node.node.resolve_image())]))].add(retrieved_node.node.image)
481
+
482
+ # Synthesize results
483
+ synthesized_text, source_images = synthesize_results(queries=sub_queries,
484
+ contexts=response2images_mapping,
485
+ llm=self._llm,
486
+ num_children=self._num_children)
487
+
488
+ final_answer = self._llm.predict(self._qa_prompt,
489
+ context_str=synthesized_text,
490
+ query_str=query_str)
491
+
492
+ response_template = PromptTemplate("Retrieved Information:\n"
493
+ "------------------------\n"
494
+ "{retrieved_information}\n"
495
+ "-------------------------\n\n"
496
+ "Answer:\n"
497
+ "{final_answer}")
498
+
499
+ return Response(response=response_template.format(retrieved_information=synthesized_text, final_answer=final_answer), source_images=source_images)
500
+
501
+ async def aquery(self, query_str: str):
502
+ sub_queries = await self._sub_question_generator.agenerate(tools=self._retriever_tools,
503
+ query=QueryBundle(query_str=query_str))
504
+ if len(sub_queries) == 0:
505
+ response_template = PromptTemplate("Cannot answer the query: {query_str}")
506
+ return Response(response=response_template.format(query_str=query_str), source_images=[])
507
+ else:
508
+ retrieved_subquestion_nodes = []
509
+ async with asyncio.TaskGroup() as tg:
510
+ for sub_query in sub_queries:
511
+ task = tg.create_task(self._fusion_retriever.aretrieve(QueryBundle(query_str=sub_query.model_dump_json())))
512
+ retrieved_subquestion_nodes.append([sub_query.sub_question, task])
513
+
514
+ retrieved_subquestion_nodes = [[sub_question, task.result()] for sub_question, task in retrieved_subquestion_nodes]
515
+
516
+ answers = []
517
+ # For each sub query, retrieve relevant image nodes
518
+ # With fusion retriever, each sub query is rewritten to n queries -> retrieve relevant nodes for each generated query
519
+ # -> fuse all nodes retrieved from multiple generated queries using reciprocal rank -> get top k results
520
+ async with asyncio.TaskGroup() as tg:
521
+ for sub_question, retrieved_nodes in retrieved_subquestion_nodes:
522
+ for retrieved_node in retrieved_nodes:
523
+ task = tg.create_task(self._llm.acomplete([sub_question, Image.open(retrieved_node.node.resolve_image())]))
524
+ answers.append([task, retrieved_node.node.image])
525
+
526
+ # Dictionary to map response -> source_images
527
+ response2images_mapping = defaultdict(set)
528
+
529
+ for task, image in answers:
530
+ response2images_mapping[str(task.result())].add(image)
531
+
532
+ # Synthesize results
533
+ synthesized_text, source_images = await asynthesize_results(queries=sub_queries,
534
+ contexts=response2images_mapping,
535
+ llm=self._llm,
536
+ num_children=self._num_children)
537
+
538
+
539
+ final_answer = await self._llm.apredict(self._qa_prompt,
540
+ context_str=synthesized_text,
541
+ query_str=query_str)
542
+
543
+ response_template = PromptTemplate("Retrieved Information:\n"
544
+ "------------------------\n"
545
+ "{retrieved_information}\n"
546
+ "-------------------------\n\n"
547
+ "Answer:\n"
548
+ "{final_answer}")
549
+
550
+ return Response(response=response_template.format(retrieved_information=synthesized_text, final_answer=final_answer), source_images=source_images)
551
+
552
+
553
+
554
+
555
+
556
+
557
+
558
+
models/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .colpali import ColPali, KVCache
2
+ from .paligemma_processor import PaliGemmaProcessor
3
+ from .colpali_processor import ColPaliProcessor
4
+ from .paligemma import PaliGemma
5
+ from .lora import *
models/colpali.py ADDED
@@ -0,0 +1,89 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from dataclasses import dataclass
7
+ from .gemma import KVCache
8
+ from .paligemma import PaliGemma, PaliGemmaConfig
9
+ from typing import Optional
10
+ from utils import *
11
+ from pathlib import Path
12
+ from safetensors import safe_open
13
+
14
+ def convert_weights_dict(original_weights):
15
+ converted_weights = {}
16
+ converted_weights['custom_text_proj.lora_A.weight'] = original_weights['base_model.model.custom_text_proj.lora_A.weight']
17
+ converted_weights['custom_text_proj.lora_B.weight'] = original_weights['base_model.model.custom_text_proj.lora_B.weight']
18
+ for i in range(18):
19
+ converted_weights[f'model.language_model.model.layers.{i}.mlp.down_proj.lora_A.weight'] = original_weights[f'base_model.model.model.language_model.model.layers.{i}.mlp.down_proj.lora_A.weight']
20
+ converted_weights[f'model.language_model.model.layers.{i}.mlp.down_proj.lora_B.weight'] = original_weights[f'base_model.model.model.language_model.model.layers.{i}.mlp.down_proj.lora_B.weight']
21
+ converted_weights[f'model.language_model.model.layers.{i}.mlp.gate_proj.lora_A.weight'] = original_weights[f'base_model.model.model.language_model.model.layers.{i}.mlp.gate_proj.lora_A.weight']
22
+ converted_weights[f'model.language_model.model.layers.{i}.mlp.gate_proj.lora_B.weight'] = original_weights[f'base_model.model.model.language_model.model.layers.{i}.mlp.gate_proj.lora_B.weight']
23
+ converted_weights[f'model.language_model.model.layers.{i}.mlp.up_proj.lora_A.weight'] = original_weights[f'base_model.model.model.language_model.model.layers.{i}.mlp.up_proj.lora_A.weight']
24
+ converted_weights[f'model.language_model.model.layers.{i}.mlp.up_proj.lora_B.weight'] = original_weights[f'base_model.model.model.language_model.model.layers.{i}.mlp.up_proj.lora_B.weight']
25
+ converted_weights[f'model.language_model.model.layers.{i}.self_attn.q_proj.lora_A.weight'] = original_weights[f'base_model.model.model.language_model.model.layers.{i}.self_attn.q_proj.lora_A.weight']
26
+ converted_weights[f'model.language_model.model.layers.{i}.self_attn.q_proj.lora_B.weight'] = original_weights[f'base_model.model.model.language_model.model.layers.{i}.self_attn.q_proj.lora_B.weight']
27
+ converted_weights[f'model.language_model.model.layers.{i}.self_attn.k_proj.lora_A.weight'] = original_weights[f'base_model.model.model.language_model.model.layers.{i}.self_attn.k_proj.lora_A.weight']
28
+ converted_weights[f'model.language_model.model.layers.{i}.self_attn.k_proj.lora_B.weight'] = original_weights[f'base_model.model.model.language_model.model.layers.{i}.self_attn.k_proj.lora_B.weight']
29
+ converted_weights[f'model.language_model.model.layers.{i}.self_attn.v_proj.lora_A.weight'] = original_weights[f'base_model.model.model.language_model.model.layers.{i}.self_attn.v_proj.lora_A.weight']
30
+ converted_weights[f'model.language_model.model.layers.{i}.self_attn.v_proj.lora_B.weight'] = original_weights[f'base_model.model.model.language_model.model.layers.{i}.self_attn.v_proj.lora_B.weight']
31
+ converted_weights[f'model.language_model.model.layers.{i}.self_attn.o_proj.lora_A.weight'] = original_weights[f'base_model.model.model.language_model.model.layers.{i}.self_attn.o_proj.lora_A.weight']
32
+ converted_weights[f'model.language_model.model.layers.{i}.self_attn.o_proj.lora_B.weight'] = original_weights[f'base_model.model.model.language_model.model.layers.{i}.self_attn.o_proj.lora_B.weight']
33
+
34
+ return converted_weights
35
+
36
+
37
+ class ColPali(nn.Module):
38
+ def __init__(self, cfg: PaliGemmaConfig):
39
+ super().__init__()
40
+ self.model = PaliGemma(cfg=cfg)
41
+ self.dim = 128
42
+ self.custom_text_proj = nn.Linear(self.model.cfg.text_config.hidden_size, self.dim, bias=False)
43
+
44
+ @staticmethod
45
+ def from_pretrained(model_dir, torch_dtype: torch.dtype = torch.float32):
46
+ torch.set_default_dtype(torch_dtype)
47
+ with open(os.path.join(model_dir, 'config.json'), "r") as f:
48
+ model_config = json.loads(f.read())
49
+ config = PaliGemmaConfig.from_dict(model_config)
50
+
51
+ safetensor_files = Path(model_dir).glob("*.safetensors")
52
+
53
+ weights = {}
54
+ for file in safetensor_files:
55
+ with safe_open(file, framework='pt', device="cpu") as f:
56
+ for key in f.keys():
57
+ weights[key] = f.get_tensor(key)
58
+ model = ColPali(config)
59
+ model.load_state_dict(weights, strict=False)
60
+ model.tie_weights()
61
+ return model
62
+
63
+ def load_lora(self, model_dir):
64
+ weights = {}
65
+ with safe_open(os.path.join(model_dir, "adapter_model.safetensors"), framework="pt", device="cpu") as f:
66
+ for key in f.keys():
67
+ weights[key] = f.get_tensor(key)
68
+
69
+ converted_weights = convert_weights_dict(weights)
70
+ self.load_state_dict(converted_weights, strict=False)
71
+
72
+ def tie_weights(self):
73
+ self.model.language_model.tie_weights()
74
+
75
+ def forward(self, *args, **kwargs) -> torch.Tensor:
76
+ outputs = self.model(*args, **kwargs)
77
+ last_hidden_states = outputs[0]
78
+ proj = self.custom_text_proj(last_hidden_states)
79
+ # L2 normalization
80
+ proj = proj / proj.norm(dim=-1, keepdim=True) # (batch_size, sequence_length, dim)
81
+
82
+ proj = proj * kwargs['attention_mask'].unsqueeze(-1) # (batch_size, sequence_length, dim)
83
+
84
+ return proj
85
+
86
+
87
+
88
+
89
+
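# Context (not part of this commit): ColPali.forward above returns L2-normalised
# per-token embeddings of shape (batch, seq_len, 128). Multi-vector embeddings of
# this kind are typically compared with a late-interaction (MaxSim) score; the
# sketch below shows that formulation on random tensors. How this repository
# actually scores matches (via the Qdrant-backed retriever) is configured elsewhere.
import torch
import torch.nn.functional as F

def maxsim_score(query_emb: torch.Tensor, doc_emb: torch.Tensor) -> torch.Tensor:
    # query_emb: (num_query_tokens, dim), doc_emb: (num_doc_tokens, dim)
    sim = query_emb @ doc_emb.T            # (num_query_tokens, num_doc_tokens)
    return sim.max(dim=-1).values.sum()    # best document token per query token, summed

q = F.normalize(torch.randn(20, 128), dim=-1)
d = F.normalize(torch.randn(1030, 128), dim=-1)
print(maxsim_score(q, d))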
models/colpali_processor.py ADDED
@@ -0,0 +1,89 @@
1
+ import torch
2
+ from PIL import Image
3
+ from typing import Tuple, List
4
+ import numpy as np
5
+ from transformers import GemmaTokenizerFast
6
+ from .paligemma_processor import PaliGemmaProcessor
7
+ from typing import Optional
8
+
9
+ def process_imgs(imgs: List[Image.Image],
10
+ img_size: Tuple[int, int],
11
+ rescale: float,
12
+ mean: Tuple[float, float, float],
13
+ std: Tuple[float, float, float]):
14
+
15
+ def normalize(img, mean, std):
16
+ img = (img - np.array(mean, dtype=img.dtype)) / np.array(std, dtype=img.dtype)
17
+ return img
18
+
19
+ resized_imgs = [img.resize((img_size[0], img_size[1]), resample=Image.Resampling.BICUBIC) for img in imgs]
20
+
21
+ rescaled_imgs = [np.array(img, dtype=np.float32) * rescale for img in resized_imgs]
22
+
23
+ normalized_imgs = [normalize(img, mean, std) for img in rescaled_imgs]
24
+
25
+ transposed_imgs = [img.transpose(2, 0, 1) for img in normalized_imgs]
26
+
27
+ tensor_imgs = torch.tensor(np.stack(transposed_imgs, axis=0), dtype=torch.float32)
28
+ return tensor_imgs
29
+
30
+
31
+ def process_prompts(prompt, image_token, max_num_image_token, bos_token):
32
+ return f"{image_token * max_num_image_token}{bos_token}{prompt}\n"
33
+
34
+
35
+ class ColPaliProcessor(PaliGemmaProcessor):
36
+ def __init__(self,
37
+ tokenizer: GemmaTokenizerFast) -> None:
38
+ super().__init__(tokenizer=tokenizer)
39
+ self.mock_image = Image.new(mode='RGB', size=(16, 16), color='black')
40
+
41
+ def process_images(self, images: List[Image.Image]):
42
+ input_prompts = ["Describe the image."] * len(images)
43
+
44
+ images = [image.convert("RGB") for image in images]
45
+
46
+ return_data = self(images,
47
+ input_prompts,
48
+ padding="longest",
49
+ truncation=False)
50
+
51
+ return return_data
52
+
53
+ def process_queries(self,
54
+ queries: List[str],
55
+ max_length: int = 50,
56
+ suffix: Optional[str] = None):
57
+
58
+ if suffix is None:
59
+ suffix = "<pad>" * 10
60
+
61
+ texts_query: List[str] = []
62
+
63
+ for query in queries:
64
+ query = f"Question: {query}"
65
+ query += suffix
66
+ texts_query.append(query)
67
+
68
+
69
+ batch_query = self(imgs=[self.mock_image] * len(texts_query),
70
+ prompts=texts_query,
71
+ padding="longest",
72
+ max_length=max_length + self.image_seq_length,
73
+ truncation=True)
74
+
75
+ del batch_query["pixel_values"]
76
+
77
+ batch_query["input_ids"] = batch_query["input_ids"][..., self.image_seq_length:]
78
+ batch_query["attention_mask"] = batch_query["attention_mask"][..., self.image_seq_length:]
79
+
80
+ return batch_query
81
+
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
models/gemma.py ADDED
@@ -0,0 +1,285 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import torch.nn.utils.parametrize as parametrize
5
+ from dataclasses import dataclass
6
+ from typing import Optional, List
7
+ import math
8
+ import torch.utils.checkpoint as checkpoint
9
+
10
+ @dataclass
11
+ class GemmaConfig:
12
+ hidden_size: int = 2048
13
+ intermediate_size: int = 16384
14
+ num_attention_heads: int = 8
15
+ num_hidden_layers: int = 18
16
+ num_image_tokens: int = 256
17
+ num_key_value_heads: int = 1
18
+ vocab_size: int = 257216
19
+ norm_eps: float = 1e-6
20
+ max_seq_len: int = 8192
21
+ attention_dropout: float = 0.0
22
+ use_lora: bool = False
23
+ training: bool = False
24
+
25
+ @classmethod
26
+ def from_dict(cls, data):
27
+ return cls(
28
+ hidden_size = data['hidden_size'],
29
+ intermediate_size = data['intermediate_size'],
30
+ num_attention_heads = data['num_attention_heads'],
31
+ num_hidden_layers = data['num_hidden_layers'],
32
+ num_image_tokens = data['num_image_tokens'],
33
+ num_key_value_heads = data['num_key_value_heads'],
34
+ vocab_size = data['vocab_size'],
35
+ training = data['training'])
36
+
37
+ class RMSNorm(nn.Module):
38
+ def __init__(self, dim: int, norm_eps: float = 1e-6):
39
+ super().__init__()
40
+ self.weight = nn.Parameter(torch.zeros(dim))
41
+ self.norm_eps = norm_eps
42
+
43
+ def _norm(self, x):
44
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.norm_eps)
45
+
46
+ def forward(self, x: torch.Tensor):
47
+ output = self._norm(x.float())
48
+ output = output * (1.0 + self.weight.float())
49
+ return output.type_as(x)
50
+
51
+
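# Illustration (not part of this commit): with the zero-initialised weight above,
# Gemma's RMSNorm rescales every position to (approximately) unit root-mean-square.
import torch

norm = RMSNorm(dim=2048)
x = torch.randn(1, 4, 2048) * 7.0
print(norm(x).pow(2).mean(-1).sqrt())  # ~1.0 at every position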
52
+ def precompute_freqs(head_dim: int, max_seq_len: int, theta: int = 10000):
53
+ thetas = 1 / (theta ** (torch.arange(0, head_dim, 2, dtype=torch.int64).float() / head_dim))
54
+ m = torch.arange(max_seq_len, dtype=torch.long)
55
+
56
+ # (max_seq_len, head_dim // 2)
57
+ freqs = torch.outer(m, thetas)
58
+
59
+ # (max_seq_len, head_dim // 2) -> (max_seq_len, head_dim)
60
+ freqs = torch.cat((freqs, freqs), dim=-1)
61
+ return freqs
62
+
63
+ def rotate_half(x: torch.Tensor):
64
+ x1 = x[..., :x.shape[-1] // 2]
65
+ x2 = x[..., x.shape[-1] // 2:]
66
+
67
+ return torch.cat((-x2, x1), dim=-1)
68
+
69
+ def apply_rotary_embed(x: torch.Tensor,
70
+ freqs: torch.Tensor):
71
+ # x: (n, n_heads, seq_len, head_dim)
72
+ # freqs: (n, seq_len, head_dim)
73
+ device_type = x.device.type
74
+ device_type = device_type if device_type != 'mps' else 'cpu'
75
+ with torch.autocast(device_type=device_type, enabled=False):
76
+ cos = freqs.cos()
77
+ sin = freqs.sin()
78
+ while len(cos.shape) < len(x.shape):
79
+ cos = cos.unsqueeze(1)
80
+ sin = sin.unsqueeze(1)
81
+ cos = cos.to(x.dtype)
82
+ sin = sin.to(x.dtype)
83
+ x = (x * cos) + (rotate_half(x) * sin)
84
+ return x
85
+
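# Illustration (not part of this commit): shape check for the rotary-embedding
# helpers above, assuming they are called exactly as in GemmaTransformerAttention.
import torch

head_dim, max_seq_len = 256, 8192
freqs = precompute_freqs(head_dim, max_seq_len)           # (8192, 256)

batch, n_heads, seq_len = 2, 8, 16
x = torch.randn(batch, n_heads, seq_len, head_dim)
position_ids = torch.arange(seq_len).expand(batch, -1)    # (2, 16)
rotated = apply_rotary_embed(x, freqs[position_ids, :])
print(rotated.shape)                                      # torch.Size([2, 8, 16, 256])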
86
+ class KVCache:
87
+ def __init__(self):
88
+ self.cache_k: List[torch.Tensor] = []
89
+ self.cache_v: List[torch.Tensor] = []
90
+
91
+ def num_items(self):
92
+ if len(self.cache_k) == 0:
93
+ return 0
94
+ else:
95
+ # (n, num_heads, seq_len, head_dim)
96
+ return self.cache_k[0].shape[-2]
97
+
98
+ def update(self, xk, xv, layer_idx):
99
+ if layer_idx < len(self.cache_k):
100
+ self.cache_k[layer_idx] = torch.cat((self.cache_k[layer_idx], xk), dim=-2)
101
+ self.cache_v[layer_idx] = torch.cat((self.cache_v[layer_idx], xv), dim=-2)
102
+ else:
103
+ self.cache_k.append(xk)
104
+ self.cache_v.append(xv)
105
+
106
+ return self.cache_k[layer_idx], self.cache_v[layer_idx]
107
+
108
+
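# Illustration (not part of this commit): how KVCache grows along the sequence
# dimension during autoregressive decoding (toy shapes: batch=1, 1 KV head,
# head_dim=256).
import torch

cache = KVCache()
cache.update(torch.randn(1, 1, 5, 256), torch.randn(1, 1, 5, 256), layer_idx=0)
print(cache.num_items())   # 5 cached positions for layer 0

keys, values = cache.update(torch.randn(1, 1, 1, 256), torch.randn(1, 1, 1, 256), layer_idx=0)
print(keys.shape)          # torch.Size([1, 1, 6, 256]) -- one new position appended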
109
+ class GemmaTransformerAttention(nn.Module):
110
+ def __init__(self, cfg: GemmaConfig, layer_idx: int):
111
+ super().__init__()
112
+ self.cfg = cfg
113
+ self.layer_idx = layer_idx
114
+ self.vocab_size = cfg.vocab_size
115
+ self.hidden_size = cfg.hidden_size
116
+ self.num_attention_heads = cfg.num_attention_heads
117
+ self.num_key_value_heads = cfg.num_key_value_heads
118
+ self.max_seq_len = cfg.max_seq_len
119
+
120
+ assert self.hidden_size % self.num_attention_heads == 0
121
+
122
+ self.n_rep = self.num_attention_heads // self.num_key_value_heads
123
+ self.head_dim = self.hidden_size // self.num_attention_heads
124
+
125
+ self.q_proj = nn.Linear(self.hidden_size, self.num_attention_heads * self.head_dim, bias=False)
126
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
127
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
128
+
129
+ self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
130
+
131
+ self.attn_dropout = cfg.attention_dropout
132
+ self.training = cfg.training
133
+
134
+ self.register_buffer('freqs',
135
+ precompute_freqs(self.head_dim, cfg.max_seq_len),
136
+ persistent=False)
137
+
138
+ def forward(self, x: torch.Tensor,
139
+ position_ids: Optional[torch.Tensor] = None,
140
+ attention_mask: Optional[torch.Tensor] = None,
141
+ kv_cache: Optional[KVCache] = None):
142
+ batch_size, seq_len, embed_dim = x.shape
143
+
144
+ xq = self.q_proj(x)
145
+ xk = self.k_proj(x)
146
+ xv = self.v_proj(x)
147
+
148
+ # (n, seq_len, hidden_size) -> (n, seq_len, num_heads, head_dim) -> (n, num_heads, seq_len, head_dim)
149
+ xq = xq.view(batch_size, seq_len, self.num_attention_heads, self.head_dim).transpose(1, 2)
150
+ # (n, seq_len, hidden_size) -> (n, seq_len, num_kv_heads, head_dim) -> (n, num_kv_heads, seq_len, head_dim)
151
+ xk = xk.view(batch_size, seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
152
+ xv = xv.view(batch_size, seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
153
+
154
+ xq = apply_rotary_embed(xq, self.freqs[position_ids, :])
155
+ xk = apply_rotary_embed(xk, self.freqs[position_ids, :])
156
+
157
+ if kv_cache is not None:
158
+ keys, values = kv_cache.update(xk, xv, self.layer_idx)
159
+ else:
160
+ keys, values = xk, xv
161
+
162
+ # (n, num_kv_heads, seq_len, head_dim) -> (n, num_kv_heads * n_rep, seq_len, head_dim) -> (n, num_heads, seq_len, head_dim)
163
+ keys = keys[:, :, None, :, :].expand(-1, -1, self.n_rep, -1, -1).view(batch_size, -1, keys.shape[-2], self.head_dim)
164
+ values = values[:, :, None, :, :].expand(-1, -1, self.n_rep, -1, -1).view(batch_size, -1, keys.shape[-2], self.head_dim)
165
+
166
+ assert attention_mask is not None
167
+ # (n, num_heads, seq_len, head_dim) @ (n, num_heads, head_dim, seq_len) -> (n, num_heads, seq_len, seq_len)
168
+ attn_weights = torch.softmax(xq @ keys.transpose(2, 3) / math.sqrt(self.head_dim) + attention_mask, dim=-1)
169
+
170
+ # dropout when training
171
+ attn_weights = F.dropout(attn_weights, p=self.attn_dropout, training=self.training)
172
+ # (n, num_heads, seq_len, seq_len) @ (n, num_heads, seq_len, head_dim) -> (n, num_heads, seq_len, head_dim)
173
+ attn_output = attn_weights @ values
174
+ attn_output = attn_output.transpose(1, 2).contiguous()
175
+ attn_output = attn_output.view(*x.shape)
176
+
177
+ attn_output = self.o_proj(attn_output)
178
+ return attn_output, attn_weights
179
+
180
+
181
+ class GemmaTransformerMLP(nn.Module):
182
+ def __init__(self, cfg: GemmaConfig):
183
+ super().__init__()
184
+ self.cfg = cfg
185
+
186
+ self.down_proj = nn.Linear(cfg.intermediate_size, cfg.hidden_size, bias=False)
187
+ self.gate_proj = nn.Linear(cfg.hidden_size, cfg.intermediate_size, bias=False)
188
+ self.up_proj = nn.Linear(cfg.hidden_size, cfg.intermediate_size, bias=False)
189
+
190
+ def forward(self, x: torch.Tensor):
191
+ return self.down_proj(F.gelu(self.gate_proj(x), approximate="tanh") * self.up_proj(x))
192
+
193
+
194
+
195
+ class GemmaTransformerDecoder(nn.Module):
196
+ def __init__(self, cfg: GemmaConfig, layer_idx: int) -> None:
197
+ super().__init__()
198
+ self.cfg = cfg
199
+
200
+ self.input_layernorm = RMSNorm(cfg.hidden_size, cfg.norm_eps)
201
+ self.self_attn = GemmaTransformerAttention(cfg, layer_idx)
202
+ self.mlp = GemmaTransformerMLP(cfg)
203
+ self.post_attention_layernorm = RMSNorm(cfg.hidden_size, cfg.norm_eps)
204
+ self.gradient_checking = False
205
+
206
+
207
+ def forward(self, x: torch.Tensor,
208
+ position_ids: Optional[torch.Tensor] = None,
209
+ attention_mask: Optional[torch.Tensor] = None,
210
+ kv_cache: Optional[KVCache] = None):
211
+
212
+ residual = x
213
+ x = self.input_layernorm(x)
214
+
215
+ if self.gradient_checking:
216
+ x = checkpoint.checkpoint(self.self_attn, x, position_ids, attention_mask, kv_cache)[0]
217
+ else:
218
+ x = self.self_attn(x,
219
+ position_ids,
220
+ attention_mask,
221
+ kv_cache)[0]
222
+ x += residual
223
+
224
+
225
+ residual = x
226
+ x = self.post_attention_layernorm(x)
227
+ x = residual + self.mlp(x)
228
+ return x
229
+
230
+
231
+ class GemmaModel(nn.Module):
232
+ def __init__(self, cfg: GemmaConfig) -> None:
233
+ super().__init__()
234
+ self.cfg = cfg
235
+ self.embed_tokens = nn.Embedding(cfg.vocab_size, cfg.hidden_size)
236
+
237
+ self.layers = nn.ModuleList(
238
+ [GemmaTransformerDecoder(cfg, layer_idx) for layer_idx in range(cfg.num_hidden_layers)]
239
+ )
240
+
241
+ self.norm = RMSNorm(cfg.hidden_size, cfg.norm_eps)
242
+
243
+ def forward(self, x: torch.Tensor,
244
+ position_ids: Optional[torch.Tensor],
245
+ attention_mask: Optional[torch.Tensor],
246
+ kv_cache: Optional[KVCache]) -> torch.Tensor:
247
+
248
+ output = x * torch.tensor(self.cfg.hidden_size ** 0.5, dtype=x.dtype)
249
+ for layer in self.layers:
250
+ output = layer(output,
251
+ position_ids,
252
+ attention_mask,
253
+ kv_cache)
254
+ output = self.norm(output)
255
+ return output
256
+
257
+
258
+ class Gemma(nn.Module):
259
+ def __init__(self, cfg: GemmaConfig) -> None:
260
+ super().__init__()
261
+ self.cfg = cfg
262
+ self.model = GemmaModel(cfg)
263
+ self.vocab_size = cfg.vocab_size
264
+ self.lm_head = nn.Linear(cfg.hidden_size, cfg.vocab_size, bias=False)
265
+
266
+
267
+ def gradient_checkpointing_enabled(self, enabled=False):
268
+ for name, module in self.model.named_modules():
269
+ if isinstance(module, GemmaTransformerDecoder):
270
+ module.gradient_checking = enabled
271
+
272
+ def tie_weights(self):
273
+ self.lm_head.weight = self.model.embed_tokens.weight
274
+
275
+ def forward(self,
276
+ input_embeds: torch.Tensor,
277
+ position_ids: Optional[torch.Tensor],
278
+ attention_mask: Optional[torch.Tensor],
279
+ kv_cache: Optional[KVCache]):
280
+
281
+ output = self.model(input_embeds,
282
+ position_ids,
283
+ attention_mask,
284
+ kv_cache)
285
+ return output, kv_cache
models/lora.py ADDED
@@ -0,0 +1,68 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import torch.nn.utils.parametrize as parametrize
5
+ from typing import List
6
+
7
+ class LoRALayer:
8
+ def __init__(self, features_in: int, features_out: int, rank: int=1, alphas: int=1):
9
+ super().__init__()
10
+ self.lora_A = nn.Linear(features_in, rank, bias=False)
11
+ self.lora_B = nn.Linear(rank, features_out, bias=False)
12
+ nn.init.normal_(self.lora_A.weight, mean=0, std=1/rank)
13
+
14
+ self.scale = alphas / rank
15
+
16
+ class LoRALinear(nn.Module, LoRALayer):
17
+ def __init__(self, base_layer: nn.Module, rank: int=1, alphas: int=1, dropout_p: float=0.0):
18
+ features_out, features_in = base_layer.weight.shape
19
+ super().__init__()
20
+ LoRALayer.__init__(self, features_in=features_in, features_out=features_out, rank=rank, alphas=alphas)
21
+
22
+ self.base_layer = nn.Linear(features_in, features_out, bias=False)
23
+ self.base_layer.weight = base_layer.weight
24
+
25
+ if dropout_p > 0.0:
26
+ self.lora_dropout = nn.Dropout(p=dropout_p, inplace=False)
27
+ else:
28
+ self.lora_dropout = nn.Identity()
29
+
30
+ self.enabled = False
31
+
32
+ def forward(self, x: torch.Tensor):
33
+ result = self.base_layer(x)
34
+ if self.enabled:
35
+ result = result + self.lora_B(self.lora_A(self.lora_dropout(x))) * self.scale
36
+ return result
37
+
38
+ def enable_lora(model: nn.Module, lora_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'], enabled=True):
39
+ for name, module in model.named_modules():
40
+ if name.split('.')[-1] in lora_modules:
41
+ module.enabled = enabled
42
+ return model
43
+
44
+ def replace_module(module: nn.Module, target_modules: List[str], torch_dtype: torch.dtype, **kwargs):
45
+ for child_name, child_module in module.named_children():
46
+ if child_name in target_modules:
47
+ new_module = LoRALinear(child_module, **kwargs).to(torch_dtype)
48
+ setattr(module, child_name, new_module)
49
+ else:
50
+ replace_module(child_module, target_modules, torch_dtype, **kwargs)
51
+
52
+ def get_lora_model(model: nn.Module, rank: float, alphas: float, lora_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'], dropout_p: float = 0.0, training: bool = False, torch_dtype: torch.dtype = torch.bfloat16):
53
+ lora_config = {'rank': rank,
54
+ 'alphas': alphas,
55
+ 'dropout_p': dropout_p}
56
+ replace_module(model, lora_modules, torch_dtype, **lora_config)
57
+
58
+ for name, param in model.named_parameters():
59
+ if 'lora' not in name:
60
+ param.requires_grad = False
61
+ else:
62
+ if training:
63
+ param.requires_grad = True
64
+ else:
65
+ param.requires_grad = False
66
+
67
+ return model
68
+
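Note (added for clarity, not part of the commit): a small usage sketch for get_lora_model and enable_lora. A toy module (an assumption for illustration) gets its q_proj / v_proj linears wrapped in LoRALinear adapters, everything except the adapter weights is frozen, and the adapters are switched on.

import torch
import torch.nn as nn
from models.lora import get_lora_model, enable_lora

class ToyAttention(nn.Module):
    def __init__(self, dim: int = 16):
        super().__init__()
        self.q_proj = nn.Linear(dim, dim, bias=False)
        self.v_proj = nn.Linear(dim, dim, bias=False)

toy = ToyAttention()
toy = get_lora_model(toy, rank=4, alphas=8, lora_modules=['q_proj', 'v_proj'],
                     training=True, dropout_p=0.0, torch_dtype=torch.float32)
toy = enable_lora(toy, lora_modules=['q_proj', 'v_proj'], enabled=True)
# Only the LoRA adapter weights remain trainable.
print([name for name, p in toy.named_parameters() if p.requires_grad])
# ['q_proj.lora_A.weight', 'q_proj.lora_B.weight', 'v_proj.lora_A.weight', 'v_proj.lora_B.weight']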
models/paligemma.py ADDED
@@ -0,0 +1,162 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from dataclasses import dataclass
5
+ from .gemma import GemmaConfig, Gemma, KVCache
6
+ from .siglip import SigLIPConfig, SigLIPVisionTower
7
+ from typing import Optional
8
+ import os
9
+ import json
10
+ from pathlib import Path
11
+ from safetensors import safe_open
12
+
13
+ @dataclass
14
+ class PaliGemmaConfig:
15
+ bos_token_id: int = 2
16
+ eos_token_id: int = 1
17
+ hidden_size: int = 2048
18
+ ignore_index: int = -100
19
+ image_token_index: int = 257152
20
+ pad_token_id: int = 0
21
+ projection_dim: int = 2048
22
+ text_config: GemmaConfig = None
23
+ vision_config: SigLIPConfig = None
24
+ vocab_size: int = 257216
25
+ @classmethod
26
+ def from_dict(cls, data):
27
+ return cls(
28
+ bos_token_id = data['bos_token_id'],
29
+ eos_token_id = data['eos_token_id'],
30
+ hidden_size = data['hidden_size'],
31
+ ignore_index = data['ignore_index'],
32
+ image_token_index = data['image_token_index'],
33
+ pad_token_id = data['pad_token_id'],
34
+ projection_dim = data['projection_dim'],
35
+ text_config = GemmaConfig.from_dict(data['text_config']),
36
+ vision_config = SigLIPConfig.from_dict(data['vision_config'])
37
+ )
38
+
39
+ class PaliGemmaMultimodalProjector(nn.Module):
40
+ def __init__(self, cfg: PaliGemmaConfig):
41
+ super().__init__()
42
+ self.linear = nn.Linear(cfg.vision_config.hidden_size, cfg.vision_config.projection_dim)
43
+
44
+ def forward(self, x: torch.Tensor):
45
+ x = self.linear(x)
46
+ return x
47
+
48
+ class PaliGemma(nn.Module):
49
+ def __init__(self, cfg: PaliGemmaConfig):
50
+ super().__init__()
51
+ self.cfg = cfg
52
+ self.language_model = Gemma(cfg.text_config)
53
+
54
+ self.vision_tower = SigLIPVisionTower(cfg.vision_config)
55
+
56
+ self.multi_modal_projector = PaliGemmaMultimodalProjector(cfg)
57
+
58
+ def tie_weights(self):
59
+ self.language_model.tie_weights()
60
+
61
+ def _merge_img_embeds_and_input_embeds(self, img_embeds: torch.Tensor,
62
+ input_embeds: torch.Tensor,
63
+ input_tokens: torch.Tensor):
64
+ batch_size, seq_len, embed_dim = input_embeds.shape
65
+ scaled_img = img_embeds / (self.cfg.hidden_size ** 0.5)
66
+
67
+ final_embeddings = torch.zeros((batch_size, seq_len, embed_dim), dtype=img_embeds.dtype, device=img_embeds.device)
68
+
69
+
70
+ # (n, seq_len)
71
+ text_mask = (input_tokens != self.cfg.pad_token_id) & (input_tokens != self.cfg.image_token_index)
72
+ img_mask = input_tokens == self.cfg.image_token_index
73
+ pad_mask = input_tokens == self.cfg.pad_token_id
74
+
75
+ text_mask = text_mask.unsqueeze(-1).expand(-1, -1, embed_dim)
76
+ img_mask = img_mask.unsqueeze(-1).expand(-1, -1, embed_dim)
77
+ pad_mask = pad_mask.unsqueeze(-1).expand(-1, -1, embed_dim)
78
+
79
+ # (n, seq_len, embed_dim)
80
+ final_embeddings = torch.where(text_mask, input_embeds, final_embeddings)
81
+ final_embeddings = final_embeddings.masked_scatter(img_mask, scaled_img)
82
+ final_embeddings = torch.where(pad_mask, torch.zeros_like(final_embeddings), final_embeddings)
83
+
84
+ return final_embeddings
85
+
86
+ def _create_position_ids_and_attention_mask(self,
87
+ device: str = '',
88
+ dtype: torch.dtype = torch.float32,
89
+ batch_size: int = 32,
90
+ seq_len: int = 1,
91
+ attention_mask: Optional[torch.Tensor] = None,
92
+ kv_cache: Optional[KVCache] = None):
93
+ # Create Attention Mask
94
+ if kv_cache is None or kv_cache.num_items() == 0:
95
+ causal_mask = torch.full((batch_size, seq_len, seq_len), 0, dtype=dtype, device=device)
96
+ position_ids = attention_mask.cumsum(dim=-1).masked_fill_((attention_mask == 0), 1).to(device)
97
+
98
+ else:
99
+ assert seq_len == 1
100
+ kv_len = kv_cache.num_items() + 1
101
+ causal_mask = torch.full((batch_size, 1, kv_len), 0, dtype=dtype, device=device)
102
+ position_ids = attention_mask.cumsum(dim=-1)[:, -1].to(device)
103
+
104
+ # (n, seq_len, kv_len) -> (n, 1, seq_len, kv_len)
105
+ causal_mask = causal_mask.unsqueeze(1)
106
+
107
+ return position_ids, causal_mask
108
+
109
+ @staticmethod
110
+ def from_pretrained(model_dir):
111
+ with open(os.path.join(model_dir, 'config.json'), "r") as f:
112
+ model_config = json.loads(f.read())
113
+ config = PaliGemmaConfig.from_dict(model_config)
114
+
115
+ safetensor_files = Path(model_dir).glob("*.safetensors")
116
+
117
+ weights = {}
118
+ for file in safetensor_files:
119
+ with safe_open(file, framework='pt', device="cpu") as f:
120
+ for key in f.keys():
121
+ weights[key] = f.get_tensor(key)
122
+
123
+ model = PaliGemma(config)
124
+ model.load_state_dict(weights, strict=False)
125
+ model.tie_weights()
126
+ return model
127
+
128
+
129
+ def forward(self, *args, **kwargs):
130
+
131
+ # input_tokens: (n, seq_len)
132
+
133
+ # -> (n, seq_len, embed_dim)
134
+ kv_cache = kwargs['kv_cache'] if 'kv_cache' in kwargs else None
135
+ input_tokens = kwargs['input_ids']
136
+ pixel_values = kwargs['pixel_values'] if 'pixel_values' in kwargs else None
137
+ attention_mask = kwargs['attention_mask']
138
+ input_embeds = self.language_model.model.embed_tokens(input_tokens)
139
+ if pixel_values is not None:
140
+ img_embeds = self.vision_tower(pixel_values.to(input_embeds.dtype))
141
+ img_embeds = self.multi_modal_projector(img_embeds)
142
+ final_embeddings = self._merge_img_embeds_and_input_embeds(img_embeds=img_embeds,
143
+ input_embeds=input_embeds,
144
+ input_tokens=input_tokens)
145
+ else:
146
+ final_embeddings = input_embeds
147
+
148
+ position_ids, causal_mask = self._create_position_ids_and_attention_mask(device=final_embeddings.device.type,
149
+ dtype=final_embeddings.dtype,
150
+ batch_size=final_embeddings.shape[0],
151
+ seq_len=final_embeddings.shape[1],
152
+ attention_mask=attention_mask,
153
+ kv_cache=kv_cache)
154
+
155
+ outputs, kv_cache = self.language_model(
156
+ input_embeds=final_embeddings,
157
+ position_ids=position_ids,
158
+ attention_mask=causal_mask,
159
+ kv_cache=kv_cache
160
+ )
161
+ return outputs, kv_cache
162
+
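Note (added for clarity, not part of the commit): a toy illustration of the mask logic inside _merge_img_embeds_and_input_embeds. Positions holding image_token_index receive the projected image patches, pad positions are zeroed, and every other position keeps its text embedding. The token ids reuse the PaliGemmaConfig defaults above; the example sequence itself is made up.

import torch

IMAGE_TOKEN_INDEX, PAD_TOKEN_ID = 257152, 0
input_tokens = torch.tensor([[IMAGE_TOKEN_INDEX, IMAGE_TOKEN_INDEX, 2, 105, 42, PAD_TOKEN_ID]])

text_mask = (input_tokens != PAD_TOKEN_ID) & (input_tokens != IMAGE_TOKEN_INDEX)
img_mask = input_tokens == IMAGE_TOKEN_INDEX
pad_mask = input_tokens == PAD_TOKEN_ID

print(text_mask.int())   # tensor([[0, 0, 1, 1, 1, 0]]) -> keep text embeddings
print(img_mask.int())    # tensor([[1, 1, 0, 0, 0, 0]]) -> scatter in image patches
print(pad_mask.int())    # tensor([[0, 0, 0, 0, 0, 1]]) -> zero out padding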
models/paligemma_processor.py ADDED
@@ -0,0 +1,103 @@
1
+ import torch
2
+ from PIL import Image
3
+ from typing import Tuple, List
4
+ import numpy as np
5
+ from transformers import GemmaTokenizerFast, BatchFeature
6
+ import json
7
+ import os
8
+
9
+ def preprocess_imgs(imgs: List[Image.Image],
10
+ img_size: Tuple[int, int],
11
+ rescale: float,
12
+ mean: Tuple[float, float, float],
13
+ std: Tuple[float, float, float]):
14
+
15
+ def normalize(img, mean, std):
16
+ img = (img - np.array(mean, dtype=img.dtype)) / np.array(std, dtype=img.dtype)
17
+ return img
18
+
19
+ resized_imgs = [np.array(img.resize((img_size[0], img_size[1]), resample=3)) for img in imgs]
20
+
21
+ rescaled_imgs = [(img * rescale).astype(np.float32) for img in resized_imgs]
22
+
23
+
24
+ normalized_imgs = [normalize(img, mean, std) for img in rescaled_imgs]
25
+ transposed_imgs = [img.transpose(2, 0, 1) for img in normalized_imgs]
26
+
27
+ tensor_imgs = torch.tensor(np.stack(transposed_imgs, axis=0), dtype=torch.float32)
28
+ return tensor_imgs
29
+
30
+
31
+ def preprocess_prompts(prompt, image_token, max_num_image_token, bos_token):
32
+ return f"{image_token * max_num_image_token}{bos_token}{prompt}\n"
33
+
34
+
35
+ class PaliGemmaProcessor:
36
+ IMAGE_TOKEN = "<image>"
37
+ def __init__(self,
38
+ tokenizer: GemmaTokenizerFast) -> None:
39
+
40
+ additional_special_tokens = {"additional_special_tokens": [self.IMAGE_TOKEN]}
41
+ tokenizer.add_special_tokens(additional_special_tokens)
42
+
43
+ EXTRA_TOKENS = [
44
+ f"<loc{i:04d}>" for i in range(1024)
45
+ ] # These tokens are used for object detection (bounding boxes)
46
+ EXTRA_TOKENS += [
47
+ f"<seg{i:03d}>" for i in range(128)
48
+ ]
49
+
50
+ tokenizer.add_tokens(EXTRA_TOKENS)
51
+
52
+ tokenizer.add_bos_token = False
53
+ tokenizer.add_eos_token = False
54
+
55
+ self.tokenizer = tokenizer
56
+
57
+ def from_pretrained(self, pretrained_dir):
58
+
59
+ with open(os.path.join(pretrained_dir, "preprocessor_config.json"), "r") as f:
60
+ config = json.loads(f.read())
61
+
62
+ self.image_seq_length = config['image_seq_length']
63
+ self.image_mean = config['image_mean']
64
+ self.image_std = config['image_std']
65
+ self.resample = config['resample']
66
+ self.rescale_factor = config['rescale_factor']
67
+ self.size = (config['size']['height'], config['size']['width'])
68
+ return self
69
+
70
+
71
+ def __call__(self,
72
+ imgs: List[Image.Image],
73
+ prompts: List[str],
74
+ padding: str = "longest",
75
+ truncation: bool = True,
76
+ max_length: int = None):
77
+
78
+ processed_imgs = preprocess_imgs(imgs,
79
+ img_size=self.size,
80
+ rescale=self.rescale_factor,
81
+ mean=self.image_mean,
82
+ std=self.image_std)
83
+
84
+ processed_prompts = [preprocess_prompts(prompt,
85
+ image_token=self.IMAGE_TOKEN,
86
+ max_num_image_token=self.image_seq_length,
87
+ bos_token=self.tokenizer.bos_token) for prompt in prompts]
88
+
89
+ model_inputs = self.tokenizer(processed_prompts,
90
+ return_tensors='pt',
91
+ padding=padding,
92
+ truncation=truncation,
93
+ max_length=max_length)
94
+
95
+ return {**model_inputs, "pixel_values": processed_imgs}
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
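Note (added for clarity, not part of the commit): a sketch of calling the processor on one image and one prompt. It assumes the pretrained directory has been pulled from LFS so that preprocessor_config.json and the Gemma tokenizer files are present, and that AutoTokenizer resolves to GemmaTokenizerFast for that directory.

from PIL import Image
from transformers import AutoTokenizer
from models.paligemma_processor import PaliGemmaProcessor

pretrained_dir = './pretrained/colpaligemma-3b-mix-448-base'
tokenizer = AutoTokenizer.from_pretrained(pretrained_dir, padding_side='right')
processor = PaliGemmaProcessor(tokenizer=tokenizer).from_pretrained(pretrained_dir)

img = Image.new('RGB', (448, 448))                      # stand-in for a rendered PDF page
inputs = processor(imgs=[img], prompts=["Describe this page."])
print(inputs['pixel_values'].shape)   # (1, 3, height, width) from preprocessor_config.json
print(inputs['input_ids'].shape)      # (1, image_seq_length + prompt tokens)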
models/siglip.py ADDED
@@ -0,0 +1,168 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from dataclasses import dataclass
5
+ from typing import Optional
6
+
7
+ @dataclass
8
+ class SigLIPConfig:
9
+ hidden_size: int = 1152
10
+ intermediate_size: int = 4304
11
+ num_attention_heads: int = 16
12
+ num_hidden_layers: int = 27
13
+ num_image_tokens: int = 256
14
+ patch_size: int = 14
15
+ projection_dim: int = 2048
16
+ n_channels: int = 3
17
+ img_size: int = 224
18
+ norm_eps: float = 1e-6
19
+ attention_dropout: float = 0.0
20
+
21
+ @classmethod
22
+ def from_dict(cls, data):
23
+ return cls(
24
+ hidden_size = data['hidden_size'],
25
+ intermediate_size = data['intermediate_size'],
26
+ num_attention_heads = data['num_attention_heads'],
27
+ num_hidden_layers = data['num_hidden_layers'],
28
+ num_image_tokens = data['num_image_tokens'],
29
+ patch_size = data['patch_size'],
30
+ projection_dim = data['projection_dim']
31
+ )
32
+
33
+ class SigLIPEmbedding(nn.Module):
34
+ def __init__(self, cfg: SigLIPConfig):
35
+ super().__init__()
36
+ self.patch_embedding = nn.Conv2d(cfg.n_channels, cfg.hidden_size, kernel_size=cfg.patch_size, stride=cfg.patch_size, padding='valid')
37
+
38
+ self.num_patches = (cfg.img_size // cfg.patch_size) ** 2
39
+ self.position_embedding = nn.Embedding(cfg.num_image_tokens, cfg.hidden_size)
40
+
41
+ self.register_buffer('position_ids',
42
+ torch.arange(cfg.num_image_tokens).expand(1, -1),
43
+ persistent=False)
44
+
45
+ def forward(self, x: torch.FloatTensor):
46
+ # x: (n, c, h, w) -> (n, c, num_patch_h, num_patch_w)
47
+ img_embeds = self.patch_embedding(x)
48
+ # (n, c, num_patch_h, num_patch_w) -> (n, c, num_patches) -> (n, num_patches, c)
49
+ img_embeds = img_embeds.reshape(*img_embeds.shape[:2], -1).transpose(1, 2)
50
+ return img_embeds + self.position_embedding(self.position_ids.to(torch.int64))
51
+
52
+ class SigLIPTransformerAttention(nn.Module):
53
+ def __init__(self, cfg: SigLIPConfig):
54
+ super().__init__()
55
+ self.cfg = cfg
56
+ self.num_attention_heads = cfg.num_attention_heads
57
+ self.head_dim = cfg.hidden_size // self.num_attention_heads
58
+
59
+ self.q_proj = nn.Linear(cfg.hidden_size, cfg.hidden_size)
60
+ self.k_proj = nn.Linear(cfg.hidden_size, cfg.hidden_size)
61
+ self.v_proj = nn.Linear(cfg.hidden_size, cfg.hidden_size)
62
+
63
+ self.out_proj = nn.Linear(cfg.hidden_size, cfg.hidden_size)
64
+ self.dropout_p = self.cfg.attention_dropout
65
+
66
+ def forward(self, x: torch.Tensor, attention_mask: torch.Tensor):
67
+ batch_size, num_patches, _ = x.shape
68
+
69
+ xq = self.q_proj(x)
70
+ xk = self.k_proj(x)
71
+ xv = self.v_proj(x)
72
+
73
+ xq = xq.view(batch_size, num_patches, self.num_attention_heads, self.head_dim).transpose(1, 2)
74
+ xk = xk.view(batch_size, num_patches, self.num_attention_heads, self.head_dim).transpose(1, 2)
75
+ xv = xv.view(batch_size, num_patches, self.num_attention_heads, self.head_dim).transpose(1, 2)
76
+
77
+ # attn_weights = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim)
78
+
79
+ # attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(xq.dtype)
80
+
81
+ # attn_output = torch.matmul(attn_weights, xv)
82
+ # attn_output = attn_output.transpose(1, 2).contiguous()
83
+ # attn_output = attn_output.view(batch_size, num_patches, -1)
84
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
85
+ query=xq,
86
+ key=xk,
87
+ value=xv,
88
+ attn_mask=attention_mask,
89
+ dropout_p=self.dropout_p,
90
+ is_causal=False
91
+ )
92
+ attn_output = attn_output.transpose(1, 2).contiguous()
93
+ attn_output = attn_output.view(batch_size, num_patches, -1)
94
+ attn_output = self.out_proj(attn_output)
95
+ return attn_output, None
96
+
97
+ class SigLIPTransformerMLP(nn.Module):
98
+ def __init__(self, cfg: SigLIPConfig):
99
+ super().__init__()
100
+ self.cfg = cfg
101
+
102
+ self.fc1 = nn.Linear(cfg.hidden_size, cfg.intermediate_size)
103
+ self.fc2 = nn.Linear(cfg.intermediate_size, cfg.hidden_size)
104
+
105
+ def forward(self, x: torch.Tensor):
106
+
107
+ x = self.fc1(x)
108
+ x = F.gelu(x, approximate='tanh')
109
+ x = self.fc2(x)
110
+ return x
111
+
112
+ class SigLIPTransformerBlock(nn.Module):
113
+ def __init__(self, cfg: SigLIPConfig):
114
+ super().__init__()
115
+ self.layer_norm1 = nn.LayerNorm(cfg.hidden_size, eps=cfg.norm_eps)
116
+ self.layer_norm2 = nn.LayerNorm(cfg.hidden_size, eps=cfg.norm_eps)
117
+
118
+ self.self_attn = SigLIPTransformerAttention(cfg)
119
+ self.mlp = SigLIPTransformerMLP(cfg)
120
+
121
+ def forward(self, x: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
122
+ residual = x
123
+ x = self.layer_norm1(x)
124
+ x = residual + self.self_attn(x, attention_mask)[0]
125
+ residual = x
126
+ x = self.layer_norm2(x)
127
+ x = residual + self.mlp(x)
128
+ return x
129
+
130
+ class SigLIPTransformerEncoder(nn.Module):
131
+ def __init__(self, cfg: SigLIPConfig):
132
+ super().__init__()
133
+
134
+ self.cfg = cfg
135
+ self.layers = nn.ModuleList(
136
+ [SigLIPTransformerBlock(cfg) for _ in range(cfg.num_hidden_layers)]
137
+ )
138
+
139
+ def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
140
+ for layer in self.layers:
141
+ x = layer(x, attention_mask)
142
+ return x
143
+ class SigLIPModel(nn.Module):
144
+ def __init__(self, cfg: SigLIPConfig):
145
+ super().__init__()
146
+ self.embeddings = SigLIPEmbedding(cfg)
147
+ self.encoder = SigLIPTransformerEncoder(cfg)
148
+ self.post_layernorm = nn.LayerNorm(cfg.hidden_size, eps=cfg.norm_eps)
149
+
150
+ def forward(self, x: torch.Tensor):
151
+ img_embed = self.embeddings(x)
152
+ output = self.encoder(img_embed)
153
+ output = self.post_layernorm(output)
154
+ return output
155
+
156
+
157
+
158
+ class SigLIPVisionTower(nn.Module):
159
+ def __init__(self, cfg: SigLIPConfig):
160
+ super().__init__()
161
+ self.cfg = cfg
162
+ self.vision_model = SigLIPModel(cfg)
163
+
164
+ def forward(self, x: torch.Tensor):
165
+ return self.vision_model(x)
166
+
167
+
168
+
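Note (added for clarity, not part of the commit): a quick shape check of the vision tower using the default SigLIPConfig above. A 224x224 image cut into 14x14 patches yields (224 / 14)^2 = 256 patch embeddings, matching num_image_tokens, each of width hidden_size = 1152. Batch size 2 is arbitrary.

import torch
from models.siglip import SigLIPConfig, SigLIPVisionTower

cfg = SigLIPConfig()                       # defaults: img_size=224, patch_size=14, hidden_size=1152
tower = SigLIPVisionTower(cfg)
x = torch.randn(2, 3, cfg.img_size, cfg.img_size)
out = tower(x)
print((cfg.img_size // cfg.patch_size) ** 2)   # 256 == cfg.num_image_tokens
print(out.shape)                               # torch.Size([2, 256, 1152])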
pretrained/colpaligemma-3b-mix-448-base/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:caed65068cae6d50e572d984914324a7d8a9360cdd7f4263ea82f1792614391f
3
+ size 78625112
pretrained/colpaligemma-3b-mix-448-base/config.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:012239f7d70c76d7f85bfca5e23f6afcde455f9ed23fa3f2ec9057b6028f6a5b
3
+ size 1047
pretrained/colpaligemma-3b-mix-448-base/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c128f5670d7a66942a194be6e2d324dc329c0de19e99c6f047513878e14f988e
3
+ size 4986817288
pretrained/colpaligemma-3b-mix-448-base/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8352c38e4d1785c4a35547d13f4d8d5562faab6fe8e9a30b1f5d8039d355a409
3
+ size 862495528
pretrained/colpaligemma-3b-mix-448-base/preprocessor_config.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fc342baea95529a5eb9746a0232fb88941d759812d7b616c382f2f87ba6123f
3
+ size 700
pretrained/colpaligemma-3b-mix-448-base/tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffd310e50986db7a039948ab83441d612689e7f989198e31b5c8984ca458adf6
3
+ size 17763459
pretrained/colpaligemma-3b-mix-448-base/tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8986bb4f423f07f8c7f70d0dbe3526fb2316056c17bae71b1ea975e77a168fc6
3
+ size 4264023
pretrained/colpaligemma-3b-mix-448-base/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5e95b5ab863693113e65e4899e1db28c09d892fa84243c7dfe6ce7f727f1888
3
+ size 242696
prompt_templates.py ADDED
@@ -0,0 +1,132 @@
1
+ import json
2
+ from llama_index.core.question_gen.types import SubQuestion
3
+ from llama_index.core.tools.types import ToolMetadata
4
+ from llama_index.core.question_gen.prompts import build_tools_text
5
+
6
+ PREFIX = """\
7
+ Given a user question, and a list of tools, output a list of relevant sub-questions \
8
+ in json markdown that when composed can help answer the full user question:
9
+
10
+ """
11
+
12
+ example_query_str = (
13
+ "Compare and contrast the revenue growth and EBITDA of Uber and Lyft for year 2021"
14
+ )
15
+ example_tools = [
16
+ ToolMetadata(
17
+ name="uber_10k",
18
+ description="Provides information about Uber financials for year 2021",
19
+ ),
20
+ ToolMetadata(
21
+ name="lyft_10k",
22
+ description="Provides information about Lyft financials for year 2021",
23
+ ),
24
+ ]
25
+ example_tools_str = build_tools_text(example_tools)
26
+ example_output = [
27
+ SubQuestion(
28
+ sub_question="What is the revenue growth of Uber", tool_name="uber_10k"
29
+ ),
30
+ SubQuestion(sub_question="What is the EBITDA of Uber", tool_name="uber_10k"),
31
+ SubQuestion(
32
+ sub_question="What is the revenue growth of Lyft", tool_name="lyft_10k"
33
+ ),
34
+ SubQuestion(sub_question="What is the EBITDA of Lyft", tool_name="lyft_10k"),
35
+ ]
36
+ example_output_str = json.dumps(
37
+ {"items": [x.model_dump() for x in example_output]}, indent=4
38
+ )
39
+
40
+ EXAMPLES = f"""\
41
+ # Example 1
42
+ <Tools>
43
+ ```json
44
+ {example_tools_str}
45
+ ```
46
+
47
+ <User Question>
48
+ {example_query_str}
49
+
50
+
51
+ <Output>
52
+ ```json
53
+ {example_output_str}
54
+ ```
55
+
56
+ """
57
+
58
+ SUFFIX = """\
59
+ # Example 2
60
+ <Tools>
61
+ ```json
62
+ {tools_str}
63
+ ```
64
+
65
+ <User Question>
66
+ {query_str}
67
+
68
+ <Output>
69
+ """
70
+
71
+ DEFAULT_SUB_QUESTION_PROMPT_TMPL = PREFIX + EXAMPLES + SUFFIX
72
+
73
+ DEFAULT_GEN_PROMPT_TMPL = """\
74
+ You are a helpful assistant that generates multiple search queries based on a \
75
+ single input query. Generate {num_queries} search queries, one on each line, \
76
+ related to the following input query:
77
+ Query: {query}
78
+ Queries:
79
+ """
80
+
81
+ DEFAULT_FINAL_ANSWER_PROMPT_TMPL = """\
82
+ Context information is below.
83
+ ---------------------
84
+ {context_str}
85
+ ---------------------
86
+ Given the context information and not prior knowledge, answer the query.
87
+ Query: {query_str}
88
+ Answer: \
89
+ """
90
+
91
+
92
+ SYNTHESIZE_PROMPT = """\
93
+ Context information is below.
94
+ ---------------------
95
+ {context_str}
96
+ ---------------------
97
+ Given the information from multiple sources and not prior knowledge,
98
+ Summarize the information that is most relevant to the query, and return the indices of the choices used in the summary.
99
+
100
+ Query: {query_str}\n
101
+ """
102
+
103
+
104
+ SYNTHESIZE_OUTPUT_FORMAT = """Return the output that conforms to the JSON schema below.
105
+ Here is the output schema.
106
+
107
+ {
108
+ "properties": {
109
+ "summarized_text": {
110
+ "title": "Summarized Text",
111
+ "type": "string"
112
+ },
113
+ "choices": {
114
+ "items": {
115
+ "type": "integer"
116
+ },
117
+ "title": "Choices",
118
+ "type": "array"
119
+ }
120
+ },
121
+ "required": [
122
+ "summarized_text",
123
+ "choices"
124
+ ],
125
+ "title": "SummarizeAnswer",
126
+ "type": "object"
127
+ }
128
+
129
+ Answer: \
130
+ """.replace("{", "{{").replace("}", "}}")
131
+
132
+ DEFAULT_SYNTHESIZE_PROMPT_TMPL = SYNTHESIZE_PROMPT + SYNTHESIZE_OUTPUT_FORMAT
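Note (added for clarity, not part of the commit): how these templates are filled at query time. DEFAULT_GEN_PROMPT_TMPL only needs num_queries and query; DEFAULT_SYNTHESIZE_PROMPT_TMPL needs context_str and query_str, and the doubled braces in SYNTHESIZE_OUTPUT_FORMAT survive .format() as a literal JSON schema. The query and context strings below are examples only.

from prompt_templates import DEFAULT_GEN_PROMPT_TMPL, DEFAULT_SYNTHESIZE_PROMPT_TMPL

gen_prompt = DEFAULT_GEN_PROMPT_TMPL.format(
    num_queries=3,
    query="Compare the net income between Nvidia and Alphabet",
)

synth_prompt = DEFAULT_SYNTHESIZE_PROMPT_TMPL.format(
    context_str="[1] Nvidia quarterly income statement ...\n[2] Alphabet quarterly income statement ...",
    query_str="Compare the net income between Nvidia and Alphabet",
)
print(gen_prompt)
print(synth_prompt[:200])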
rag_pipeline.py ADDED
@@ -0,0 +1,531 @@
1
+ import torch
2
+ import asyncio
3
+ from torch.utils.data import DataLoader
4
+ import os
5
+ import uuid
6
+ import base64
7
+ from io import BytesIO
8
+ from PIL import Image
9
+ from pdf2image import pdf2image
10
+ from typing import List, Union
11
+ from tqdm.auto import tqdm
12
+
13
+ from utils import *
14
+ from models import ColPali, ColPaliProcessor, get_lora_model, enable_lora
15
+
16
+ import qdrant_client
17
+ from qdrant_client.http import models as rest
18
+ from llamaindex_utils import ColPaliGemmaEmbedding, ColPaliRetriever, CustomFusionRetriever, CustomQueryEngine
19
+ from llama_index.llms.gemini import Gemini
20
+ from llama_index.core.tools import RetrieverTool
21
+
22
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
23
+
24
+ def embed_imgs(model: ColPali,
25
+ processor: ColPaliProcessor,
26
+ input_imgs: List[Image.Image],
27
+ device: str = 'cpu') -> List[torch.Tensor]:
28
+ """Generates embeddings given images.
29
+
30
+ Args:
31
+ model (ColPali): Main model
32
+ processor (ColPaliProcessor): Data Processor
33
+ input_imgs (List[Image.Image]): List of input images
34
+ device (str, optional): device to run model. Defaults to 'cpu'.
35
+
36
+ Returns:
37
+ List[torch.Tensor]: List of output embeddings.
38
+ """
39
+
40
+ colpali_model = model.to(device=device).eval()
41
+
42
+ dataloader = DataLoader(input_imgs,
43
+ batch_size=8,
44
+ shuffle=False,
45
+ num_workers=0,
46
+ collate_fn=lambda x: processor.process_images(x))
47
+
48
+ document_embeddings = []
49
+ with torch.no_grad():
50
+ for batch, model_inputs in tqdm(enumerate(dataloader)):
51
+ model_inputs = {k: v.to(device) for k, v in model_inputs.items()}
52
+ # Encode images
53
+ img_embeds = colpali_model(**model_inputs, kv_cache=None)
54
+ document_embeddings.extend(list(torch.unbind(img_embeds.to('cpu').to(torch.float32))))
55
+ return document_embeddings
56
+
57
+ def embed_queries(model: ColPali,
58
+ processor: ColPaliProcessor,
59
+ queries: List[str],
60
+ device: str = 'cpu') -> List[torch.Tensor]:
61
+ """Generate embeddings given queries.
62
+
63
+ Args:
64
+ model (ColPali): Embedding model
65
+ processor (ColPaliProcessor): Data Processor
66
+ queries (List[str]): List of query strings
67
+ device (str, optional): Device to run model. Defaults to 'cpu'.
68
+
69
+ Returns:
70
+ List[torch.Tensor]: List of embeddings
71
+ """
72
+ colpali_model = model.to(device=device).eval()
73
+
74
+ dataloader = DataLoader(queries,
75
+ batch_size=8,
76
+ shuffle=False,
77
+ num_workers=0,
78
+ collate_fn=lambda x: processor.process_queries(x))
79
+
80
+ queries_embeddings = []
81
+ with torch.no_grad():
82
+ for batch, model_inputs in tqdm(enumerate(dataloader)):
83
+ model_inputs = {k: v.to(device) for k, v in model_inputs.items()}
84
+ # Encode Queries
85
+ query_embeds = colpali_model(**model_inputs, kv_cache=None)
86
+ queries_embeddings.extend(torch.unbind(query_embeds.to('cpu').type(torch.float32)))
87
+
88
+ return queries_embeddings
89
+
90
+
91
+ def score_single_vectors(qs: List[torch.Tensor],
92
+ ps: List[torch.Tensor]) -> torch.FloatTensor:
93
+ """Calculate similarity between 2 single vectors
94
+
95
+ Args:
96
+ qs (List[torch.Tensor]): First Embeddings
97
+ ps (List[torch.Tensor]): Second Embeddings
98
+
99
+ Returns:
100
+ torch.FloatTensor: Score Tensor
101
+ """
102
+ assert len(qs) != 0 and len(ps) != 0
103
+
104
+ qs_stacked = torch.stack(qs)
105
+ ps_stacked = torch.stack(ps)
106
+
107
+ scores = torch.einsum("bd,cd->bc", qs_stacked, ps_stacked)
108
+ assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"
109
+ scores = scores.to(torch.float32)
110
+ return scores
111
+
112
+ def score_multi_vectors(qs: List[torch.Tensor],
113
+ ps: List[torch.Tensor],
114
+ batch_size: int = 8,
115
+ device: Union[torch.device|str] = "cpu") -> torch.FloatTensor:
116
+ """Calculate MaxSim between 2 list of vectors.
117
+
118
+ Args:
119
+ qs (List[torch.Tensor]): List of query embeddings
120
+ ps (List[torch.Tensor]): List of document embeddings
121
+ batch_size (int, optional): Batch Size. Defaults to 8.
122
+ device (Union[torch.device | str], optional): Device to cast tensor to. Defaults to "cpu".
123
+
124
+ Returns:
125
+ torch.FloatTensor: Score tensors.
126
+ """
127
+
128
+ assert len(qs) != 0 and len(ps) != 0
129
+ scores_list = []
130
+ for i in range(0, len(qs), batch_size):
131
+ scores_batch = []
132
+ qs_batch = torch.nn.utils.rnn.pad_sequence(qs[i:i+batch_size], batch_first=True, padding_value=0).to(device)
133
+ for j in range(0, len(ps), batch_size):
134
+ ps_batch = torch.nn.utils.rnn.pad_sequence(ps[j:j+batch_size], batch_first=True, padding_value=0).to(device)
135
+ tmp = torch.einsum("abd,ced->acbe", qs_batch, ps_batch).max(dim=-1)[0].sum(dim=2)
136
+ scores_batch.append(tmp)
137
+
138
+ scores_batch = torch.cat(scores_batch, dim=1).cpu()
139
+ scores_list.append(scores_batch)
140
+
141
+ scores = torch.cat(scores_list, dim=0)
142
+ return scores.to(torch.float32)
143
+
144
+ def indexDocument(file_path: str,
145
+ vector_store_client,
146
+ target_collection: str,
147
+ model: nn.Module,
148
+ processor: ColPaliProcessor,
149
+ device: Union[str|torch.device]) -> None:
150
+ """Index document given file_path.
151
+ Each page in the document is embedded by the ColPaliGemma model, then inserted into the Qdrant vector store under the target collection.
153
+ Creates the target collection if it does not exist in the vector store yet.
153
+
154
+ Args:
155
+ file_path (str): Path to the PDF document to index.
156
+ vector_store_client (qdrant_client.QdrantClient): Qdrant client used to store page embeddings.
157
+ target_collection (str): Name of the Qdrant collection to upsert into.
158
+ model (nn.Module): ColPali embedding model.
159
+ processor (ColPaliProcessor): Processor that converts page images into model inputs.
160
+ device (Union[str | torch.device]): Device to run the embedding model on.
161
+ """
162
+ document_images = []
163
+ document_embeddings = []
164
+ document_images.extend(pdf2image.convert_from_path(file_path))
165
+
166
+ document_embeddings = embed_imgs(model=model,
167
+ processor=processor,
168
+ input_imgs=document_images,
169
+ device=device)
170
+
171
+ # Create Qdrant Collection
172
+ if not vector_store_client.collection_exists(collection_name=target_collection):
173
+ # Specify vectors_config
174
+ scalar_quant = rest.ScalarQuantizationConfig(
175
+ type=rest.ScalarType.INT8,
176
+ quantile=0.99,
177
+ always_ram=False
178
+ )
179
+ vector_params = rest.VectorParams(
180
+ size=128,
181
+ distance=rest.Distance.COSINE,
182
+ multivector_config=rest.MultiVectorConfig(
183
+ comparator=rest.MultiVectorComparator.MAX_SIM
184
+ ),
185
+ quantization_config=rest.ScalarQuantization(
186
+ scalar=scalar_quant
187
+ ),
188
+ )
189
+ vector_store_client.create_collection(
190
+ collection_name=target_collection,
191
+ on_disk_payload=True,
192
+ optimizers_config=rest.OptimizersConfigDiff(
193
+ indexing_threshold=100
194
+ ),
195
+ vectors_config=vector_params
196
+ )
197
+
198
+ # Add embedding to Qdrant Collection
199
+ points = []
200
+ for i, embedding in enumerate(document_embeddings):
201
+ multivector = embedding.cpu().float().numpy().tolist()
202
+
203
+ buffer = BytesIO()
204
+ document_images[i].save(buffer, format='JPEG')
205
+ image_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
206
+ # Define payload
207
+ payload = {}
208
+ node_metadata = {"file_name": file_path,
209
+ "page_id": i + 1}
210
+ node_content = {'id_': abs(hash(file_path + str(i + 1))),
211
+ 'image': image_str,
212
+ "metadata": node_metadata}
213
+
214
+ payload["_node_content"] = json.dumps(node_content)
215
+ payload["_node_type"] = "ImageNode"
216
+
217
+ # store ref doc id at top level to allow metadata filtering
218
+ # kept for backwards compatibility, will consolidate in future
219
+ payload["document_id"] = "None" # for Chroma
220
+ payload["doc_id"] = "None" # for Pinecone, Qdrant, Redis
221
+ payload["ref_doc_id"] = "None" # for Weaviate
222
+
223
+ points.append(rest.PointStruct(
224
+ id=node_content['id_'],
225
+ vector=multivector,
226
+ payload=payload,
227
+ ))
228
+
229
+ step = 8
230
+ for i in range(0, len(points), step):
231
+ points_batch = points[i: i + step]
232
+ vector_store_client.upsert(collection_name=target_collection,
233
+ points=points_batch,
234
+ wait=False)
235
+
236
+
237
+ async def async_indexDocument(file_path: str,
238
+ vector_store_client: qdrant_client.AsyncQdrantClient,
239
+ target_collection: str,
240
+ model: nn.Module,
241
+ processor: ColPaliProcessor,
242
+ device: Union[str|torch.device]) -> None:
243
+ """Asynchrously index document given file_path.
244
+ Each page in document is embedded by ColPaliGemma Model, then insert into Qdrant vector store given target collection.
245
+ Creates taret collection if it is not created in the vector store yet.
246
+
247
+ Args:
248
+ file_path (str): Path to the PDF document to index.
249
+ vector_store_client (qdrant_client.AsyncQdrantClient): Async Qdrant client used to store page embeddings.
250
+ target_collection (str): Name of the Qdrant collection to upsert into.
251
+ model (nn.Module): ColPali embedding model.
252
+ processor (ColPaliProcessor): Processor that converts page images into model inputs.
253
+ device (Union[str | torch.device]): Device to run the embedding model on.
254
+ """
255
+ document_images = []
256
+ document_embeddings = []
257
+ document_images.extend(pdf2image.convert_from_path(file_path))
258
+
259
+ document_embeddings = embed_imgs(model=model,
260
+ processor=processor,
261
+ input_imgs=document_images,
262
+ device=device)
263
+
264
+ # Create Qdrant Collection
265
+ if not await vector_store_client.collection_exists(collection_name=target_collection):
266
+ # Specify vectors_config
267
+ scalar_quant = rest.ScalarQuantizationConfig(
268
+ type=rest.ScalarType.INT8,
269
+ quantile=0.99,
270
+ always_ram=False
271
+ )
272
+ vector_params = rest.VectorParams(
273
+ size=128,
274
+ distance=rest.Distance.COSINE,
275
+ multivector_config=rest.MultiVectorConfig(
276
+ comparator=rest.MultiVectorComparator.MAX_SIM
277
+ ),
278
+ quantization_config=rest.ScalarQuantization(
279
+ scalar=scalar_quant
280
+ ),
281
+ )
282
+ await vector_store_client.create_collection(
283
+ collection_name=target_collection,
284
+ on_disk_payload=True,
285
+ optimizers_config=rest.OptimizersConfigDiff(
286
+ indexing_threshold=100
287
+ ),
288
+ vectors_config=vector_params
289
+ )
290
+
291
+ # Add embedding to Qdrant Collection
292
+ points = []
293
+ for i, embedding in enumerate(document_embeddings):
294
+ multivector = embedding.cpu().float().numpy().tolist()
295
+
296
+ buffer = BytesIO()
297
+ document_images[i].save(buffer, format='JPEG')
298
+ image_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
299
+ # Define payload
300
+ payload = {}
301
+ node_metadata = {"file_name": file_path,
302
+ "page_id": i + 1}
303
+ node_content = {'id_': abs(hash(file_path + str(i + 1))),
304
+ 'image': image_str,
305
+ "metadata": node_metadata}
306
+
307
+ payload["_node_content"] = json.dumps(node_content)
308
+ payload["_node_type"] = "ImageNode"
309
+
310
+ # store ref doc id at top level to allow metadata filtering
311
+ # kept for backwards compatibility, will consolidate in future
312
+ payload["document_id"] = "None" # for Chroma
313
+ payload["doc_id"] = "None" # for Pinecone, Qdrant, Redis
314
+ payload["ref_doc_id"] = "None" # for Weaviate
315
+
316
+ points.append(rest.PointStruct(
317
+ id=node_content['id_'],
318
+ vector=multivector,
319
+ payload=payload,
320
+ ))
321
+
322
+ step = 8
323
+ for i in range(0, len(points), step):
324
+ points_batch = points[i: i + step]
325
+ await vector_store_client.upsert(collection_name=target_collection,
326
+ points=points_batch,
327
+ wait=False)
328
+
329
+
330
+ GEMINI_API_KEY = os.getenv(key="GEMINI_API_KEY")
331
+
332
+ def main():
333
+ model = ColPali.from_pretrained(model_dir='./pretrained/colpaligemma-3b-mix-448-base', torch_dtype=torch.bfloat16)
334
+ tokenizer = load_tokenizer(tokenizer_dir='./pretrained/colpaligemma-3b-mix-448-base')
335
+ processor = ColPaliProcessor(tokenizer=tokenizer).from_pretrained(pretrained_dir='./pretrained/colpaligemma-3b-mix-448-base')
336
+
337
+ model.model.language_model.model = get_lora_model(model.model.language_model.model,
338
+ rank=32,
339
+ alphas=32,
340
+ lora_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'down_proj', 'gate_proj', 'up_proj'],
341
+ training=False,
342
+ dropout_p=0.1,
343
+ torch_dtype=torch.bfloat16)
344
+ model.model.language_model.model = enable_lora(model.model.language_model.model, lora_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'down_proj', 'gate_proj', 'up_proj'], enabled=True)
345
+
346
+ model = get_lora_model(model,
347
+ rank=32,
348
+ alphas=32,
349
+ lora_modules=['custom_text_proj'],
350
+ training=False,
351
+ dropout_p=0.1,
352
+ torch_dtype=torch.bfloat16)
353
+ model = enable_lora(model, lora_modules=['custom_text_proj'], enabled=True)
354
+
355
+ model.load_lora('./pretrained/colpaligemma-3b-mix-448-base')
356
+
357
+ # Initialize LLM
358
+ generation_config = {
359
+ "temperature": 0.0,
360
+ "top_p": 0.95,
361
+ "top_k": 64,
362
+ "max_output_tokens": 1024,
363
+ "response_mime_type": "text/plain",
364
+ }
365
+
366
+ llm = Gemini(api_key=GEMINI_API_KEY, generation_config=generation_config)
367
+
368
+ # Setup Qdrant
369
+ # Creating Qdrant Client
370
+ vector_store_client = qdrant_client.QdrantClient(location="http://localhost:6333", timeout=100)
371
+
372
+ indexDocument('./data/pdfs-financial/Alphabet_Inc_goog-10-q-q1-2024.pdf',
373
+ vector_store_client=vector_store_client,
374
+ target_collection="Alphabet",
375
+ model=model,
376
+ processor=processor,
377
+ device='mps')
378
+
379
+ indexDocument('./data/pdfs-financial/Nvidia_ecefb2b2-efcb-45f3-b72b-212d90fcd873.pdf',
380
+ vector_store_client=vector_store_client,
381
+ target_collection="Nvidia",
382
+ model=model,
383
+ processor=processor,
384
+ device='mps')
385
+
386
+ # RAG using LLamaIndex
387
+
388
+ embed_model = ColPaliGemmaEmbedding(model=model, processor=processor, device="mps")
389
+
390
+ alphabet_retriever = ColPaliRetriever(vector_store_client=vector_store_client,
391
+ target_collection="Alphabet",
392
+ embed_model=embed_model,
393
+ query_mode='default',
394
+ similarity_top_k=3)
395
+
396
+ nvidia_retriever = ColPaliRetriever(vector_store_client=vector_store_client,
397
+ target_collection="Nvidia",
398
+ embed_model=embed_model,
399
+ query_mode='default',
400
+ similarity_top_k=3)
401
+
402
+ # Query Router Among Multiple Retrievers
403
+ retriever_tools = [
404
+ RetrieverTool.from_defaults(
405
+ name="alphabet",
406
+ retriever=alphabet_retriever,
407
+ description="Useful for retrieving information about Alphabet Inc financials"
408
+ ),
409
+ RetrieverTool.from_defaults(
410
+ name="nvidia",
411
+ retriever=nvidia_retriever,
412
+ description="Useful for retrieving information about Nvidia financials"
413
+ )
414
+ ]
415
+
416
+ retriever_mappings = {retriever_tool.metadata.name: retriever_tool.retriever for retriever_tool in retriever_tools}
417
+
418
+ fusion_retriever = CustomFusionRetriever(llm=llm,
419
+ retriever_mappings=retriever_mappings,
420
+ num_generated_queries=3,
421
+ similarity_top_k=3)
422
+
423
+ query_engine = CustomQueryEngine(retriever_tools=[retriever_tool.metadata for retriever_tool in retriever_tools],
424
+ fusion_retriever=fusion_retriever,
425
+ llm=llm,
426
+ num_children=3)
427
+
428
+ query_str = "Compare the net income between Nvidia and Alphabet"
429
+ response = query_engine.query(query_str=query_str)
430
+ print(response.response)
431
+
432
+ async def amain():
433
+ model = ColPali.from_pretrained(model_dir='./pretrained/colpaligemma-3b-mix-448-base', torch_dtype=torch.bfloat16)
434
+ tokenizer = load_tokenizer(tokenizer_dir='./pretrained/colpaligemma-3b-mix-448-base')
435
+ processor = ColPaliProcessor(tokenizer=tokenizer).from_pretrained(pretrained_dir='./pretrained/colpaligemma-3b-mix-448-base')
436
+
437
+ model.model.language_model.model = get_lora_model(model.model.language_model.model,
438
+ rank=32,
439
+ alphas=32,
440
+ lora_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'down_proj', 'gate_proj', 'up_proj'],
441
+ training=False,
442
+ dropout_p=0.1,
443
+ torch_dtype=torch.bfloat16)
444
+ model.model.language_model.model = enable_lora(model.model.language_model.model, lora_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'down_proj', 'gate_proj', 'up_proj'], enabled=True)
445
+
446
+ model = get_lora_model(model,
447
+ rank=32,
448
+ alphas=32,
449
+ lora_modules=['custom_text_proj'],
450
+ training=False,
451
+ dropout_p=0.1,
452
+ torch_dtype=torch.bfloat16)
453
+ model = enable_lora(model, lora_modules=['custom_text_proj'], enabled=True)
454
+
455
+ model.load_lora('./pretrained/colpaligemma-3b-mix-448-base')
456
+
457
+ # Initialize LLM
458
+ generation_config = {
459
+ "temperature": 0.0,
460
+ "top_p": 0.95,
461
+ "top_k": 64,
462
+ "max_output_tokens": 1024,
463
+ "response_mime_type": "text/plain",
464
+ }
465
+
466
+ llm = Gemini(api_key=GEMINI_API_KEY, generation_config=generation_config)
467
+
468
+ # Setup Qdrant
469
+ # Creating Qdrant Client
470
+ vector_store_client = qdrant_client.AsyncQdrantClient(location="http://localhost:6333", timeout=100)
471
+
472
+ await async_indexDocument('./data/pdfs-financial/Alphabet_Inc_goog-10-q-q1-2024.pdf',
473
+ vector_store_client=vector_store_client,
474
+ target_collection="Alphabet",
475
+ model=model,
476
+ processor=processor,
477
+ device='mps')
478
+
479
+ await async_indexDocument('./data/pdfs-financial/Nvidia_ecefb2b2-efcb-45f3-b72b-212d90fcd873.pdf',
480
+ vector_store_client=vector_store_client,
481
+ target_collection="Nvidia",
482
+ model=model,
483
+ processor=processor,
484
+ device='mps')
485
+
486
+ embed_model = ColPaliGemmaEmbedding(model=model, processor=processor, device="mps")
487
+
488
+ alphabet_retriever = ColPaliRetriever(vector_store_client=vector_store_client,
489
+ target_collection="Alphabet",
490
+ embed_model=embed_model,
491
+ query_mode='default',
492
+ similarity_top_k=3)
493
+
494
+ nvidia_retriever = ColPaliRetriever(vector_store_client=vector_store_client,
495
+ target_collection="Nvidia",
496
+ embed_model=embed_model,
497
+ query_mode='default',
498
+ similarity_top_k=3)
499
+
500
+
501
+ # Query Router Among Multiple Retrievers
502
+ retriever_tools = [
503
+ RetrieverTool.from_defaults(
504
+ name="alphabet",
505
+ retriever=alphabet_retriever,
506
+ description="Useful for retrieving information about Alphabet Inc financials"
507
+ ),
508
+ RetrieverTool.from_defaults(
509
+ name="nvidia",
510
+ retriever=nvidia_retriever,
511
+ description="Useful for retrieving information about Nvidia financials"
512
+ )
513
+ ]
514
+
515
+ retriever_mappings = {retriever_tool.metadata.name: retriever_tool.retriever for retriever_tool in retriever_tools}
516
+
517
+ fusion_retriever = CustomFusionRetriever(llm=llm,
518
+ retriever_mappings=retriever_mappings,
519
+ similarity_top_k=3)
520
+
521
+ query_engine = CustomQueryEngine(retriever_tools=[retriever_tool.metadata for retriever_tool in retriever_tools],
522
+ fusion_retriever=fusion_retriever,
523
+ llm=llm,
524
+ num_children=3)
525
+
526
+ query_str = "Compare the net income between Nvidia and Alphabet"
527
+ response = await query_engine.aquery(query_str=query_str)
528
+ print(str(response))
529
+
530
+ if __name__ == "__main__":
531
+ main()
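Note (added for clarity, not part of the commit): a tiny self-contained check of the MaxSim scoring used above. score_multi_vectors pads each multi-vector embedding, takes the best-matching page vector for every query vector, and sums those maxima, giving one score per (query, page) pair. Dimension 4 here is illustrative; the real embeddings are 128-dimensional.

import torch
from rag_pipeline import score_multi_vectors

qs = [torch.randn(5, 4), torch.randn(7, 4)]        # two query multi-vectors
ps = [torch.randn(100, 4), torch.randn(120, 4)]    # two page multi-vectors
scores = score_multi_vectors(qs, ps, batch_size=2, device='cpu')
print(scores.shape)   # torch.Size([2, 2]): one MaxSim score per (query, page) pair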
requirements.txt ADDED
@@ -0,0 +1,225 @@
1
+ accelerate==1.1.0
2
+ aiofiles==23.2.1
3
+ aiohappyeyeballs==2.4.3
4
+ aiohttp==3.10.10
5
+ aiosignal==1.3.1
6
+ annotated-types==0.7.0
7
+ anyio==4.6.2.post1
8
+ appnope==0.1.4
9
+ argon2-cffi==23.1.0
10
+ argon2-cffi-bindings==21.2.0
11
+ arrow==1.3.0
12
+ asttokens==2.4.1
13
+ async-lru==2.0.4
14
+ attrs==24.2.0
15
+ babel==2.16.0
16
+ beautifulsoup4==4.12.3
17
+ bleach==6.2.0
18
+ cachetools==5.5.0
19
+ certifi==2024.8.30
20
+ cffi==1.17.1
21
+ charset-normalizer==3.4.0
22
+ click==8.1.7
23
+ comm==0.2.2
24
+ contourpy==1.3.0
25
+ cycler==0.12.1
26
+ dataclasses-json==0.6.7
27
+ datasets==3.0.1
28
+ debugpy==1.8.7
29
+ decorator==5.1.1
30
+ defusedxml==0.7.1
31
+ Deprecated==1.2.14
32
+ dill==0.3.8
33
+ dirtyjson==1.0.8
34
+ distro==1.9.0
35
+ executing==2.1.0
36
+ fastapi==0.115.4
37
+ fastjsonschema==2.20.0
38
+ ffmpy==0.4.0
39
+ filelock==3.16.1
40
+ fonttools==4.54.1
41
+ fqdn==1.5.1
42
+ frozenlist==1.5.0
43
+ fsspec==2024.6.1
44
+ google-ai-generativelanguage==0.6.4
45
+ google-api-core==2.20.0
46
+ google-api-python-client==2.147.0
47
+ google-auth==2.35.0
48
+ google-auth-httplib2==0.2.0
49
+ google-generativeai==0.5.4
50
+ googleapis-common-protos==1.65.0
51
+ gradio==4.44.1
52
+ gradio_client==1.3.0
53
+ greenlet==3.1.1
54
+ grpcio==1.67.1
55
+ grpcio-status==1.62.3
56
+ grpcio-tools==1.62.3
57
+ h11==0.14.0
58
+ h2==4.1.0
59
+ hpack==4.0.0
60
+ httpcore==1.0.6
61
+ httplib2==0.22.0
62
+ httpx==0.27.2
63
+ huggingface-hub==0.26.2
64
+ hyperframe==6.0.1
65
+ idna==3.10
66
+ importlib_resources==6.4.5
67
+ InstructorEmbedding==1.0.1
68
+ ipykernel==6.29.5
69
+ ipython==8.29.0
70
+ isoduration==20.11.0
71
+ jedi==0.19.1
72
+ Jinja2==3.1.4
73
+ jiter==0.7.0
74
+ joblib==1.4.2
75
+ json5==0.9.25
76
+ jsonpointer==3.0.0
77
+ jsonschema==4.23.0
78
+ jsonschema-specifications==2024.10.1
79
+ jupyter_client==8.6.3
80
+ jupyter_core==5.7.2
81
+ jupyter-events==0.10.0
82
+ jupyter-lsp==2.2.5
83
+ jupyter_server==2.14.2
84
+ jupyter_server_terminals==0.5.3
85
+ jupyterlab==4.2.5
86
+ jupyterlab_pygments==0.3.0
87
+ jupyterlab_server==2.27.3
88
+ kiwisolver==1.4.7
89
+ llama-cloud==0.1.2
90
+ llama-index==0.11.17
91
+ llama-index-agent-openai==0.3.4
92
+ llama-index-cli==0.3.1
93
+ llama-index-core==0.11.17
94
+ llama-index-embeddings-huggingface==0.3.1
95
+ llama-index-embeddings-instructor==0.2.1
96
+ llama-index-embeddings-openai==0.2.5
97
+ llama-index-indices-managed-llama-cloud==0.4.0
98
+ llama-index-legacy==0.9.48.post3
99
+ llama-index-llms-gemini==0.3.7
100
+ llama-index-llms-openai==0.2.13
101
+ llama-index-multi-modal-llms-gemini==0.3.1
102
+ llama-index-multi-modal-llms-openai==0.2.2
103
+ llama-index-postprocessor-colbert-rerank==0.2.1
104
+ llama-index-program-openai==0.2.0
105
+ llama-index-question-gen-openai==0.2.0
106
+ llama-index-readers-file==0.2.2
107
+ llama-index-readers-llama-parse==0.3.0
108
+ llama-index-vector-stores-qdrant==0.3.1
109
+ llama-parse==0.5.7
110
+ markdown-it-py==3.0.0
111
+ MarkupSafe==2.1.5
112
+ marshmallow==3.23.1
113
+ matplotlib==3.9.2
114
+ matplotlib-inline==0.1.7
115
+ mdurl==0.1.2
116
+ mistune==3.0.2
117
+ mpmath==1.3.0
118
+ multidict==6.1.0
119
+ multiprocess==0.70.16
120
+ mypy-extensions==1.0.0
121
+ nbclient==0.10.0
122
+ nbconvert==7.16.4
123
+ nbformat==5.10.4
124
+ nest-asyncio==1.6.0
125
+ networkx==3.4.2
126
+ nltk==3.9.1
127
+ notebook==7.2.2
128
+ notebook_shim==0.2.4
129
+ numpy==1.26.4
130
+ openai==1.53.0
131
+ orjson==3.10.11
132
+ overrides==7.7.0
133
+ packaging==24.1
134
+ pandas==2.2.3
135
+ pandocfilters==1.5.1
136
+ parso==0.8.4
137
+ pdf2image==1.17.0
138
+ peft==0.11.1
139
+ pexpect==4.9.0
140
+ pillow==10.4.0
141
+ pip==24.2
142
+ platformdirs==4.3.6
143
+ portalocker==2.10.1
144
+ prometheus_client==0.21.0
145
+ prompt_toolkit==3.0.48
146
+ propcache==0.2.0
147
+ proto-plus==1.24.0
148
+ protobuf==4.25.5
149
+ psutil==6.0.0
150
+ ptyprocess==0.7.0
151
+ pure_eval==0.2.3
152
+ pyarrow==17.0.0
153
+ pyasn1==0.6.1
154
+ pyasn1_modules==0.4.1
155
+ pycparser==2.22
156
+ pydantic==2.9.2
157
+ pydantic_core==2.23.4
158
+ pydub==0.25.1
159
+ Pygments==2.18.0
160
+ pyparsing==3.1.4
161
+ pypdf==4.3.1
162
+ python-dateutil==2.9.0.post0
163
+ python-json-logger==2.0.7
164
+ python-multipart==0.0.12
165
+ pytz==2024.2
166
+ PyYAML==6.0.2
167
+ pyzmq==26.2.0
168
+ qdrant-client==1.12.0
169
+ referencing==0.35.1
170
+ regex==2024.9.11
171
+ requests==2.32.3
172
+ rfc3339-validator==0.1.4
173
+ rfc3986-validator==0.1.1
174
+ rich==13.9.4
175
+ rpds-py==0.20.1
176
+ rsa==4.9
177
+ ruff==0.7.2
178
+ safetensors==0.4.5
179
+ scikit-learn==1.5.2
180
+ scipy==1.14.1
181
+ semantic-version==2.10.0
182
+ Send2Trash==1.8.3
183
+ sentence-transformers==2.7.0
184
+ setuptools==75.1.0
185
+ shellingham==1.5.4
186
+ six==1.16.0
187
+ sniffio==1.3.1
188
+ soupsieve==2.6
189
+ SQLAlchemy==2.0.36
190
+ stack-data==0.6.3
191
+ starlette==0.41.2
192
+ striprtf==0.0.26
193
+ sympy==1.13.3
194
+ tenacity==8.5.0
195
+ terminado==0.18.1
196
+ threadpoolctl==3.5.0
197
+ tiktoken==0.8.0
198
+ tinycss2==1.4.0
199
+ tokenizers==0.20.1
200
+ tomlkit==0.12.0
201
+ torch==2.4.1
202
+ torchinfo==1.8.0
203
+ torchvision==0.19.1
204
+ tornado==6.4.1
205
+ tqdm==4.66.5
206
+ traitlets==5.14.3
207
+ transformers==4.45.1
208
+ typer==0.12.5
209
+ types-python-dateutil==2.9.0.20241003
210
+ typing_extensions==4.12.2
211
+ typing-inspect==0.9.0
212
+ tzdata==2024.2
213
+ uri-template==1.3.0
214
+ uritemplate==4.1.1
215
+ urllib3==2.2.3
216
+ uvicorn==0.32.0
217
+ wcwidth==0.2.13
218
+ webcolors==24.8.0
219
+ webencodings==0.5.1
220
+ websocket-client==1.8.0
221
+ websockets==12.0
222
+ wheel==0.44.0
223
+ wrapt==1.16.0
224
+ xxhash==3.5.0
225
+ yarl==1.17.1
utils/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .utils import *
2
+ IMAGE_TOKEN = "<image>"
utils/utils.py ADDED
@@ -0,0 +1,44 @@
1
+ import torch
2
+ from PIL import Image
3
+ from typing import Tuple, List
4
+ import numpy as np
5
+ import torch.nn as nn
6
+ import os
7
+ from transformers import AutoTokenizer, GemmaTokenizerFast
8
+ from safetensors import safe_open
9
+ import json
10
+ from pathlib import Path
11
+ from models.paligemma import PaliGemmaConfig, PaliGemma
12
+
13
+
14
+ def load_model(model_dir: str):
15
+
16
+ with open(os.path.join(model_dir, 'config.json'), "r") as f:
17
+ model_config = json.loads(f.read())
18
+ config = PaliGemmaConfig.from_dict(model_config)
19
+
20
+ safetensor_files = Path(model_dir).glob("*.safetensors")
21
+
22
+ weights = {}
23
+ for file in safetensor_files:
24
+ with safe_open(file, framework='pt', device="cpu") as f:
25
+ for key in f.keys():
26
+ weights[key] = f.get_tensor(key)
27
+
28
+ model = PaliGemma(config)
29
+ model.load_state_dict(weights, strict=False)
30
+ model.tie_weights()
31
+
32
+ return model
33
+
34
+
35
+ def load_tokenizer(tokenizer_dir: str):
36
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, padding_side='right')
37
+ return tokenizer
38
+
39
+
40
+ def freeze_model(model: nn.Module):
41
+ for param in model.parameters():
42
+ param.requires_grad = False
43
+
44
+ return model
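Note (added for clarity, not part of the commit): a short sketch of the loading helpers above, using the pretrained directory referenced elsewhere in this commit (it must be pulled from LFS first). load_model rebuilds PaliGemma from config.json plus the safetensors shards, load_tokenizer wraps AutoTokenizer, and freeze_model disables gradients on the base weights before LoRA adapters are attached.

from utils import load_model, load_tokenizer, freeze_model

model_dir = './pretrained/colpaligemma-3b-mix-448-base'
model = load_model(model_dir)            # PaliGemma with lm_head tied to embed_tokens
tokenizer = load_tokenizer(model_dir)    # Gemma tokenizer with right-side padding
model = freeze_model(model)              # base weights frozen; LoRA params are added separately
print(sum(p.requires_grad for p in model.parameters()))   # 0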