MrSimple07 committed
Commit bf0077f · 1 Parent(s): dd15743

chunk size = 8192

Files changed (2):
  1. config.py +1 -1
  2. documents_prep.py +20 -59
config.py CHANGED
@@ -50,7 +50,7 @@ AVAILABLE_MODELS = {
 
 DEFAULT_MODEL = "Gemini 2.5 Flash"
 
-CHUNK_SIZE = 2048
+CHUNK_SIZE = 8192
 CHUNK_OVERLAP = 256
 
 CUSTOM_PROMPT = """
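
The only change here is the fourfold bump of CHUNK_SIZE from 2048 to 8192, with CHUNK_OVERLAP staying at 256. As a rough illustration of what that does to chunk counts, here is a back-of-the-envelope sketch (not code from this repo; it ignores the sentence-boundary adjustments the splitter makes, and doc_length is a hypothetical input length in the same units as CHUNK_SIZE):

import math

def approx_chunk_count(doc_length: int, chunk_size: int, overlap: int = 256) -> int:
    # Sliding window: the first chunk covers chunk_size, every further chunk
    # advances by (chunk_size - overlap).
    if doc_length <= chunk_size:
        return 1
    step = chunk_size - overlap
    return 1 + math.ceil((doc_length - chunk_size) / step)

doc_length = 100_000                            # hypothetical document length
print(approx_chunk_count(doc_length, 2048))     # 56 chunks with the old setting
print(approx_chunk_count(doc_length, 8192))     # 13 chunks with the new setting
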
documents_prep.py CHANGED
@@ -8,44 +8,15 @@ from llama_index.core.text_splitter import SentenceSplitter
 from config import CHUNK_SIZE, CHUNK_OVERLAP
 
 
-import tiktoken
-from transformers import AutoTokenizer
-
-def count_tokens(text, model="gpt-3.5-turbo"):
-    try:
-        tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
-        max_chunk_size = 2048
-        total_tokens = 0
-
-        for i in range(0, len(text), max_chunk_size * 4):  # Approximate 4 chars per token
-            chunk = text[i:i + max_chunk_size * 4]
-            tokens = tokenizer.encode(chunk, add_special_tokens=False, truncation=True, max_length=1024)
-            total_tokens += len(tokens)
-
-        return total_tokens
-    except:
-        try:
-            encoding = tiktoken.encoding_for_model(model)
-            return len(encoding.encode(text))
-        except:
-            # Final fallback: approximate 1 token = 4 characters
-            return len(text) // 4
-
 def chunk_document(doc, chunk_size=None, chunk_overlap=None):
-    """Chunk document based on tokens instead of characters"""
     if chunk_size is None:
         chunk_size = CHUNK_SIZE
     if chunk_overlap is None:
         chunk_overlap = CHUNK_OVERLAP
-
-    from llama_index.core.text_splitter import TokenTextSplitter
-
-    # Use TokenTextSplitter instead of SentenceSplitter
-    text_splitter = TokenTextSplitter(
+    text_splitter = SentenceSplitter(
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,
-        separator=" ",
-        backup_separators=["\n", ".", "!", "?"]
+        separator=" "
     )
 
     text_chunks = text_splitter.split_text(doc.text)
@@ -53,12 +24,10 @@ def chunk_document(doc, chunk_size=None, chunk_overlap=None):
     chunked_docs = []
     for i, chunk_text in enumerate(text_chunks):
         chunk_metadata = doc.metadata.copy()
-        chunk_tokens = count_tokens(chunk_text)
         chunk_metadata.update({
             "chunk_id": i,
             "total_chunks": len(text_chunks),
-            "chunk_size_tokens": chunk_tokens,
-            "chunk_size_chars": len(chunk_text),
+            "chunk_size": len(chunk_text),
             "original_doc_id": doc.id_ if hasattr(doc, 'id_') else None
         })
 
@@ -70,6 +39,7 @@ def chunk_document(doc, chunk_size=None, chunk_overlap=None):
 
     return chunked_docs
 
+
 def process_documents_with_chunking(documents):
     all_chunked_docs = []
     chunk_info = []
@@ -81,27 +51,24 @@ def process_documents_with_chunking(documents):
 
     for doc in documents:
         doc_type = doc.metadata.get('type', 'text')
-        doc_tokens = count_tokens(doc.text)
-        doc_chars = len(doc.text)
 
         if doc_type == 'table':
             table_count += 1
-            if doc_tokens > CHUNK_SIZE:
+            doc_size = len(doc.text)
+            if doc_size > CHUNK_SIZE:
                 large_tables_count += 1
-                log_message(f"Large table found: {doc.metadata.get('table_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_tokens} tokens")
+                log_message(f"Large table found: {doc.metadata.get('table_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_size} characters")
 
                 # Chunk large tables
                 chunked_docs = chunk_document(doc)
                 all_chunked_docs.extend(chunked_docs)
 
                 for i, chunk_doc in enumerate(chunked_docs):
-                    chunk_tokens = chunk_doc.metadata.get('chunk_size_tokens', count_tokens(chunk_doc.text))
                     chunk_info.append({
                         'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
                         'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
                         'chunk_id': i,
-                        'chunk_size_tokens': chunk_tokens,
-                        'chunk_size_chars': len(chunk_doc.text),
+                        'chunk_size': len(chunk_doc.text),
                         'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
                         'type': 'table',
                         'table_number': chunk_doc.metadata.get('table_number', 'unknown')
@@ -112,8 +79,7 @@ def process_documents_with_chunking(documents):
                     'document_id': doc.metadata.get('document_id', 'unknown'),
                     'section_id': doc.metadata.get('section_id', 'unknown'),
                     'chunk_id': 0,
-                    'chunk_size_tokens': doc_tokens,
-                    'chunk_size_chars': doc_chars,
+                    'chunk_size': doc_size,
                     'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
                     'type': 'table',
                     'table_number': doc.metadata.get('table_number', 'unknown')
@@ -121,22 +87,21 @@ def process_documents_with_chunking(documents):
 
         elif doc_type == 'image':
             image_count += 1
-            if doc_tokens > CHUNK_SIZE:
+            doc_size = len(doc.text)
+            if doc_size > CHUNK_SIZE:
                 large_images_count += 1
-                log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_tokens} tokens")
+                log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_size} characters")
 
                 # Chunk large images
                 chunked_docs = chunk_document(doc)
                 all_chunked_docs.extend(chunked_docs)
 
                 for i, chunk_doc in enumerate(chunked_docs):
-                    chunk_tokens = chunk_doc.metadata.get('chunk_size_tokens', count_tokens(chunk_doc.text))
                     chunk_info.append({
                         'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
                         'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
                         'chunk_id': i,
-                        'chunk_size_tokens': chunk_tokens,
-                        'chunk_size_chars': len(chunk_doc.text),
+                        'chunk_size': len(chunk_doc.text),
                         'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
                         'type': 'image',
                         'image_number': chunk_doc.metadata.get('image_number', 'unknown')
@@ -147,27 +112,25 @@ def process_documents_with_chunking(documents):
                     'document_id': doc.metadata.get('document_id', 'unknown'),
                     'section_id': doc.metadata.get('section_id', 'unknown'),
                     'chunk_id': 0,
-                    'chunk_size_tokens': doc_tokens,
-                    'chunk_size_chars': doc_chars,
+                    'chunk_size': doc_size,
                     'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
                     'type': 'image',
                     'image_number': doc.metadata.get('image_number', 'unknown')
                 })
 
         else:  # text documents
-            if doc_tokens > CHUNK_SIZE:
+            doc_size = len(doc.text)
+            if doc_size > CHUNK_SIZE:
                 chunked_docs = chunk_document(doc)
                 all_chunked_docs.extend(chunked_docs)
                 text_chunks_count += len(chunked_docs)
 
                 for i, chunk_doc in enumerate(chunked_docs):
-                    chunk_tokens = chunk_doc.metadata.get('chunk_size_tokens', count_tokens(chunk_doc.text))
                     chunk_info.append({
                         'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
                         'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
                         'chunk_id': i,
-                        'chunk_size_tokens': chunk_tokens,
-                        'chunk_size_chars': len(chunk_doc.text),
+                        'chunk_size': len(chunk_doc.text),
                         'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
                         'type': 'text'
                     })
@@ -177,24 +140,22 @@ def process_documents_with_chunking(documents):
                 'document_id': doc.metadata.get('document_id', 'unknown'),
                 'section_id': doc.metadata.get('section_id', 'unknown'),
                 'chunk_id': 0,
-                'chunk_size_tokens': doc_tokens,
-                'chunk_size_chars': doc_chars,
+                'chunk_size': doc_size,
                 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
                 'type': 'text'
             })
 
     log_message(f"=== PROCESSING STATISTICS ===")
     log_message(f"Total tables processed: {table_count}")
-    log_message(f"Large tables (>{CHUNK_SIZE} tokens): {large_tables_count}")
+    log_message(f"Large tables (>{CHUNK_SIZE} chars): {large_tables_count}")
     log_message(f"Total images processed: {image_count}")
-    log_message(f"Large images (>{CHUNK_SIZE} tokens): {large_images_count}")
+    log_message(f"Large images (>{CHUNK_SIZE} chars): {large_images_count}")
     log_message(f"Total text chunks created: {text_chunks_count}")
     log_message(f"Total documents after processing: {len(all_chunked_docs)}")
 
     return all_chunked_docs, chunk_info
 
 
-
 def extract_text_from_json(data, document_id, document_name):
     documents = []
 
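
As a quick sanity check of the simplified path, here is a minimal usage sketch (not part of the commit) that mirrors what the updated chunk_document builds from the config.py defaults. The throwaway Document and its metadata are made up for the example, and it assumes a llama_index version that exposes these imports:

from llama_index.core import Document
from llama_index.core.text_splitter import SentenceSplitter

# Same splitter settings the updated chunk_document derives from config.py:
# CHUNK_SIZE = 8192, CHUNK_OVERLAP = 256, separator=" ".
splitter = SentenceSplitter(chunk_size=8192, chunk_overlap=256, separator=" ")

doc = Document(
    text="Example sentence. " * 5000,                     # synthetic long text
    metadata={"document_id": "demo", "type": "text"},
)

chunks = splitter.split_text(doc.text)
print(f"{len(chunks)} chunks, first chunk is {len(chunks[0])} characters")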