File size: 3,061 Bytes
9cf08e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import logging
logger = logging.getLogger("nano-graphrag")
# from dashscope import get_tokenizer 
from transformers import AutoTokenizer
from ._utils import compute_mdhash_id
from typing import Callable, Dict, List, Optional, Type, Union, cast
import asyncio
import os
from ._llm import Qwen3
from ._llm import Qwen3TokenizerClient


qwen3_model = Qwen3()
tiktoken_client = Qwen3TokenizerClient()


def chunking_by_video_segments(
    tokens_list: list[list[int]],
    doc_keys,
    max_token_size=1024,
):
    # make sure each segment is not larger than max_token_size
    for index in range(len(tokens_list)):
        if len(tokens_list[index]) > max_token_size:
            tokens_list[index] = tokens_list[index][:max_token_size]
    
    results = []
    chunk_token = []
    chunk_segment_ids = []
    chunk_order_index = 0
    for index, tokens in enumerate(tokens_list):
        
        if len(chunk_token) + len(tokens) <= max_token_size:
            # add new segment
            chunk_token += tokens.copy()
            chunk_segment_ids.append(doc_keys[index])
        else:
            

            chunk = tiktoken_client.decode(chunk_token, skip_special_tokens=True)
            results.append(
                {
                    "tokens": len(chunk_token),
                    "content": chunk["text"],
                    "chunk_order_index": chunk_order_index,
                    "video_segment_id": chunk_segment_ids,
                }
            )
            # new chunk with current segment as begin
            chunk_token = []
            chunk_segment_ids = []
            chunk_token += tokens.copy()
            chunk_segment_ids.append(doc_keys[index])
            chunk_order_index += 1
    
    # save the last chunk
    if len(chunk_token) > 0:

        chunk = tiktoken_client.decode(chunk_token, skip_special_tokens=True)
        results.append(
            {
                "tokens": len(chunk_token),
                "content": chunk["text"],
                "chunk_order_index": chunk_order_index,
                "video_segment_id": chunk_segment_ids,
            }
        )
    
    return results


def get_chunks(new_videos, chunk_func=chunking_by_video_segments, **chunk_func_params):
    inserting_chunks = {}

    new_videos_list = list(new_videos.keys())
    for video_name in new_videos_list:
        segment_id_list = list(new_videos[video_name].keys())
        docs = [new_videos[video_name][index]["content"] for index in segment_id_list]
        doc_keys = [f'{video_name}_{index}' for index in segment_id_list]


        tokens_dict = tiktoken_client.batch_encode(docs, padding=True)
        tokens_list = tokens_dict.get("token_ids_batch", tokens_dict) if isinstance(tokens_dict, dict) else tokens_dict

        chunks = chunk_func(
            tokens_list, doc_keys=doc_keys,  **chunk_func_params
        )

        for chunk in chunks:
            inserting_chunks.update(
                {compute_mdhash_id(chunk["content"], prefix="chunk-"): chunk}
            )

    return inserting_chunks