MrSimple07 committed on
Commit
79a7114
·
1 Parent(s): 3f09b3e

token-based chunking

Browse files
Files changed (3) hide show
  1. app.py +6 -5
  2. documents_prep.py +47 -20
  3. requirements.txt +2 -1
app.py CHANGED
@@ -100,7 +100,7 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
100
  log_message("Инициализация системы")
101
  os.makedirs(download_dir, exist_ok=True)
102
  from config import CHUNK_SIZE, CHUNK_OVERLAP
103
- from llama_index.core.text_splitter import SentenceSplitter
104
 
105
  embed_model = get_embedding_model()
106
  llm = get_llm_model(DEFAULT_MODEL)
@@ -108,14 +108,15 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
108
 
109
  Settings.embed_model = embed_model
110
  Settings.llm = llm
111
- Settings.text_splitter = SentenceSplitter(
112
  chunk_size=CHUNK_SIZE,
113
  chunk_overlap=CHUNK_OVERLAP,
114
- separator=" "
 
115
  )
116
 
117
- log_message(f"Configured chunk size: {CHUNK_SIZE}")
118
- log_message(f"Configured chunk overlap: {CHUNK_OVERLAP}")
119
 
120
  all_documents = []
121
  chunks_df = None
 
100
  log_message("Инициализация системы")
101
  os.makedirs(download_dir, exist_ok=True)
102
  from config import CHUNK_SIZE, CHUNK_OVERLAP
103
+ from llama_index.core.text_splitter import TokenTextSplitter
104
 
105
  embed_model = get_embedding_model()
106
  llm = get_llm_model(DEFAULT_MODEL)
 
108
 
109
  Settings.embed_model = embed_model
110
  Settings.llm = llm
111
+ Settings.text_splitter = TokenTextSplitter(
112
  chunk_size=CHUNK_SIZE,
113
  chunk_overlap=CHUNK_OVERLAP,
114
+ separator=" ",
115
+ backup_separators=["\n", ".", "!", "?"]
116
  )
117
 
118
+ log_message(f"Configured chunk size: {CHUNK_SIZE} tokens")
119
+ log_message(f"Configured chunk overlap: {CHUNK_OVERLAP} tokens")
120
 
121
  all_documents = []
122
  chunks_df = None
documents_prep.py CHANGED
@@ -8,15 +8,32 @@ from llama_index.core.text_splitter import SentenceSplitter
8
  from config import CHUNK_SIZE, CHUNK_OVERLAP
9
 
10
 
 
 
 
 
 
 
 
 
 
 
 
11
  def chunk_document(doc, chunk_size=None, chunk_overlap=None):
 
12
  if chunk_size is None:
13
  chunk_size = CHUNK_SIZE
14
  if chunk_overlap is None:
15
  chunk_overlap = CHUNK_OVERLAP
16
- text_splitter = SentenceSplitter(
 
 
 
 
17
  chunk_size=chunk_size,
18
  chunk_overlap=chunk_overlap,
19
- separator=" "
 
20
  )
21
 
22
  text_chunks = text_splitter.split_text(doc.text)
@@ -24,10 +41,12 @@ def chunk_document(doc, chunk_size=None, chunk_overlap=None):
24
  chunked_docs = []
25
  for i, chunk_text in enumerate(text_chunks):
26
  chunk_metadata = doc.metadata.copy()
 
27
  chunk_metadata.update({
28
  "chunk_id": i,
29
  "total_chunks": len(text_chunks),
30
- "chunk_size": len(chunk_text),
 
31
  "original_doc_id": doc.id_ if hasattr(doc, 'id_') else None
32
  })
33
 
@@ -39,7 +58,6 @@ def chunk_document(doc, chunk_size=None, chunk_overlap=None):
39
 
40
  return chunked_docs
41
 
42
-
43
  def process_documents_with_chunking(documents):
44
  all_chunked_docs = []
45
  chunk_info = []
@@ -51,24 +69,27 @@ def process_documents_with_chunking(documents):
51
 
52
  for doc in documents:
53
  doc_type = doc.metadata.get('type', 'text')
 
 
54
 
55
  if doc_type == 'table':
56
  table_count += 1
57
- doc_size = len(doc.text)
58
- if doc_size > CHUNK_SIZE:
59
  large_tables_count += 1
60
- log_message(f"Large table found: {doc.metadata.get('table_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_size} characters")
61
 
62
  # Chunk large tables
63
  chunked_docs = chunk_document(doc)
64
  all_chunked_docs.extend(chunked_docs)
65
 
66
  for i, chunk_doc in enumerate(chunked_docs):
 
67
  chunk_info.append({
68
  'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
69
  'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
70
  'chunk_id': i,
71
- 'chunk_size': len(chunk_doc.text),
 
72
  'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
73
  'type': 'table',
74
  'table_number': chunk_doc.metadata.get('table_number', 'unknown')
@@ -79,7 +100,8 @@ def process_documents_with_chunking(documents):
79
  'document_id': doc.metadata.get('document_id', 'unknown'),
80
  'section_id': doc.metadata.get('section_id', 'unknown'),
81
  'chunk_id': 0,
82
- 'chunk_size': doc_size,
 
83
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
84
  'type': 'table',
85
  'table_number': doc.metadata.get('table_number', 'unknown')
@@ -87,21 +109,22 @@ def process_documents_with_chunking(documents):
87
 
88
  elif doc_type == 'image':
89
  image_count += 1
90
- doc_size = len(doc.text)
91
- if doc_size > CHUNK_SIZE:
92
  large_images_count += 1
93
- log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_size} characters")
94
 
95
  # Chunk large images
96
  chunked_docs = chunk_document(doc)
97
  all_chunked_docs.extend(chunked_docs)
98
 
99
  for i, chunk_doc in enumerate(chunked_docs):
 
100
  chunk_info.append({
101
  'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
102
  'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
103
  'chunk_id': i,
104
- 'chunk_size': len(chunk_doc.text),
 
105
  'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
106
  'type': 'image',
107
  'image_number': chunk_doc.metadata.get('image_number', 'unknown')
@@ -112,25 +135,27 @@ def process_documents_with_chunking(documents):
112
  'document_id': doc.metadata.get('document_id', 'unknown'),
113
  'section_id': doc.metadata.get('section_id', 'unknown'),
114
  'chunk_id': 0,
115
- 'chunk_size': doc_size,
 
116
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
117
  'type': 'image',
118
  'image_number': doc.metadata.get('image_number', 'unknown')
119
  })
120
 
121
  else: # text documents
122
- doc_size = len(doc.text)
123
- if doc_size > CHUNK_SIZE:
124
  chunked_docs = chunk_document(doc)
125
  all_chunked_docs.extend(chunked_docs)
126
  text_chunks_count += len(chunked_docs)
127
 
128
  for i, chunk_doc in enumerate(chunked_docs):
 
129
  chunk_info.append({
130
  'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
131
  'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
132
  'chunk_id': i,
133
- 'chunk_size': len(chunk_doc.text),
 
134
  'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
135
  'type': 'text'
136
  })
@@ -140,22 +165,24 @@ def process_documents_with_chunking(documents):
140
  'document_id': doc.metadata.get('document_id', 'unknown'),
141
  'section_id': doc.metadata.get('section_id', 'unknown'),
142
  'chunk_id': 0,
143
- 'chunk_size': doc_size,
 
144
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
145
  'type': 'text'
146
  })
147
 
148
  log_message(f"=== PROCESSING STATISTICS ===")
149
  log_message(f"Total tables processed: {table_count}")
150
- log_message(f"Large tables (>{CHUNK_SIZE} chars): {large_tables_count}")
151
  log_message(f"Total images processed: {image_count}")
152
- log_message(f"Large images (>{CHUNK_SIZE} chars): {large_images_count}")
153
  log_message(f"Total text chunks created: {text_chunks_count}")
154
  log_message(f"Total documents after processing: {len(all_chunked_docs)}")
155
 
156
  return all_chunked_docs, chunk_info
157
 
158
 
 
159
  def extract_text_from_json(data, document_id, document_name):
160
  documents = []
161
 
 
8
  from config import CHUNK_SIZE, CHUNK_OVERLAP
9
 
10
 
11
+ import tiktoken
12
+
13
+ def count_tokens(text, model="gpt-3.5-turbo"):
14
+ """Count tokens in text using tiktoken"""
15
+ try:
16
+ encoding = tiktoken.encoding_for_model(model)
17
+ return len(encoding.encode(text))
18
+ except:
19
+ # Fallback: approximate 1 token = 4 characters for Russian/English text
20
+ return len(text) // 4
21
+
22
  def chunk_document(doc, chunk_size=None, chunk_overlap=None):
23
+ """Chunk document based on tokens instead of characters"""
24
  if chunk_size is None:
25
  chunk_size = CHUNK_SIZE
26
  if chunk_overlap is None:
27
  chunk_overlap = CHUNK_OVERLAP
28
+
29
+ from llama_index.core.text_splitter import TokenTextSplitter
30
+
31
+ # Use TokenTextSplitter instead of SentenceSplitter
32
+ text_splitter = TokenTextSplitter(
33
  chunk_size=chunk_size,
34
  chunk_overlap=chunk_overlap,
35
+ separator=" ",
36
+ backup_separators=["\n", ".", "!", "?"]
37
  )
38
 
39
  text_chunks = text_splitter.split_text(doc.text)
 
41
  chunked_docs = []
42
  for i, chunk_text in enumerate(text_chunks):
43
  chunk_metadata = doc.metadata.copy()
44
+ chunk_tokens = count_tokens(chunk_text)
45
  chunk_metadata.update({
46
  "chunk_id": i,
47
  "total_chunks": len(text_chunks),
48
+ "chunk_size_tokens": chunk_tokens,
49
+ "chunk_size_chars": len(chunk_text),
50
  "original_doc_id": doc.id_ if hasattr(doc, 'id_') else None
51
  })
52
 
 
58
 
59
  return chunked_docs
60
 
 
61
  def process_documents_with_chunking(documents):
62
  all_chunked_docs = []
63
  chunk_info = []
 
69
 
70
  for doc in documents:
71
  doc_type = doc.metadata.get('type', 'text')
72
+ doc_tokens = count_tokens(doc.text)
73
+ doc_chars = len(doc.text)
74
 
75
  if doc_type == 'table':
76
  table_count += 1
77
+ if doc_tokens > CHUNK_SIZE:
 
78
  large_tables_count += 1
79
+ log_message(f"Large table found: {doc.metadata.get('table_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_tokens} tokens ({doc_chars} characters)")
80
 
81
  # Chunk large tables
82
  chunked_docs = chunk_document(doc)
83
  all_chunked_docs.extend(chunked_docs)
84
 
85
  for i, chunk_doc in enumerate(chunked_docs):
86
+ chunk_tokens = chunk_doc.metadata.get('chunk_size_tokens', count_tokens(chunk_doc.text))
87
  chunk_info.append({
88
  'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
89
  'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
90
  'chunk_id': i,
91
+ 'chunk_size_tokens': chunk_tokens,
92
+ 'chunk_size_chars': len(chunk_doc.text),
93
  'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
94
  'type': 'table',
95
  'table_number': chunk_doc.metadata.get('table_number', 'unknown')
 
100
  'document_id': doc.metadata.get('document_id', 'unknown'),
101
  'section_id': doc.metadata.get('section_id', 'unknown'),
102
  'chunk_id': 0,
103
+ 'chunk_size_tokens': doc_tokens,
104
+ 'chunk_size_chars': doc_chars,
105
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
106
  'type': 'table',
107
  'table_number': doc.metadata.get('table_number', 'unknown')
 
109
 
110
  elif doc_type == 'image':
111
  image_count += 1
112
+ if doc_tokens > CHUNK_SIZE:
 
113
  large_images_count += 1
114
+ log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_tokens} tokens ({doc_chars} characters)")
115
 
116
  # Chunk large images
117
  chunked_docs = chunk_document(doc)
118
  all_chunked_docs.extend(chunked_docs)
119
 
120
  for i, chunk_doc in enumerate(chunked_docs):
121
+ chunk_tokens = chunk_doc.metadata.get('chunk_size_tokens', count_tokens(chunk_doc.text))
122
  chunk_info.append({
123
  'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
124
  'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
125
  'chunk_id': i,
126
+ 'chunk_size_tokens': chunk_tokens,
127
+ 'chunk_size_chars': len(chunk_doc.text),
128
  'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
129
  'type': 'image',
130
  'image_number': chunk_doc.metadata.get('image_number', 'unknown')
 
135
  'document_id': doc.metadata.get('document_id', 'unknown'),
136
  'section_id': doc.metadata.get('section_id', 'unknown'),
137
  'chunk_id': 0,
138
+ 'chunk_size_tokens': doc_tokens,
139
+ 'chunk_size_chars': doc_chars,
140
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
141
  'type': 'image',
142
  'image_number': doc.metadata.get('image_number', 'unknown')
143
  })
144
 
145
  else: # text documents
146
+ if doc_tokens > CHUNK_SIZE:
 
147
  chunked_docs = chunk_document(doc)
148
  all_chunked_docs.extend(chunked_docs)
149
  text_chunks_count += len(chunked_docs)
150
 
151
  for i, chunk_doc in enumerate(chunked_docs):
152
+ chunk_tokens = chunk_doc.metadata.get('chunk_size_tokens', count_tokens(chunk_doc.text))
153
  chunk_info.append({
154
  'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
155
  'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
156
  'chunk_id': i,
157
+ 'chunk_size_tokens': chunk_tokens,
158
+ 'chunk_size_chars': len(chunk_doc.text),
159
  'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
160
  'type': 'text'
161
  })
 
165
  'document_id': doc.metadata.get('document_id', 'unknown'),
166
  'section_id': doc.metadata.get('section_id', 'unknown'),
167
  'chunk_id': 0,
168
+ 'chunk_size_tokens': doc_tokens,
169
+ 'chunk_size_chars': doc_chars,
170
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
171
  'type': 'text'
172
  })
173
 
174
  log_message(f"=== PROCESSING STATISTICS ===")
175
  log_message(f"Total tables processed: {table_count}")
176
+ log_message(f"Large tables (>{CHUNK_SIZE} tokens): {large_tables_count}")
177
  log_message(f"Total images processed: {image_count}")
178
+ log_message(f"Large images (>{CHUNK_SIZE} tokens): {large_images_count}")
179
  log_message(f"Total text chunks created: {text_chunks_count}")
180
  log_message(f"Total documents after processing: {len(all_chunked_docs)}")
181
 
182
  return all_chunked_docs, chunk_info
183
 
184
 
185
+
186
  def extract_text_from_json(data, document_id, document_name):
187
  documents = []
188
 
requirements.txt CHANGED
@@ -14,4 +14,5 @@ python-docx
14
  openpyxl
15
  llama-index-llms-openai
16
  llama-index-vector-stores-faiss
17
- llama-index-retrievers-bm25
 
 
14
  openpyxl
15
  llama-index-llms-openai
16
  llama-index-vector-stores-faiss
17
+ llama-index-retrievers-bm25
18
+ tiktoken