Spaces:
Sleeping
Sleeping
Update split_files_to_excel.py
Browse files- split_files_to_excel.py +62 -14
split_files_to_excel.py
CHANGED
|
@@ -29,8 +29,8 @@ import requests
|
|
| 29 |
import json
|
| 30 |
|
| 31 |
MODEL = "thenlper/gte-base"
|
| 32 |
-
CHUNK_SIZE =
|
| 33 |
-
CHUNK_OVERLAP =
|
| 34 |
|
| 35 |
embeddings = HuggingFaceEmbeddings(
|
| 36 |
model_name=MODEL,
|
|
@@ -323,15 +323,41 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
|
|
| 323 |
# Create an empty list to store the resized documents
|
| 324 |
resized = []
|
| 325 |
previous_file=""
|
|
|
|
|
|
|
| 326 |
# Iterate through the original documents list
|
| 327 |
-
for doc in documents:
|
|
|
|
|
|
|
|
|
|
| 328 |
current_file = doc.metadata['source']
|
| 329 |
if current_file != previous_file: #chunk counting
|
| 330 |
previous_file = current_file
|
| 331 |
chunk_counter = 0
|
| 332 |
is_first_chunk = True # Keep track of the first chunk in the document
|
| 333 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
if len(encoded) > max_length:
|
|
|
|
| 335 |
remaining_encoded = encoded
|
| 336 |
is_last_chunk = False
|
| 337 |
while len(remaining_encoded) > 1 and not is_last_chunk:
|
|
@@ -339,47 +365,69 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
|
|
| 339 |
overlap_text = tokenizer.decode(remaining_encoded[:overlap])# Index by token
|
| 340 |
period_index_b = overlap_text.find('.')# Index by character
|
| 341 |
if len(remaining_encoded)>max_length + min_chunk_size:
|
|
|
|
| 342 |
current_encoded = remaining_encoded[:max(10, max_length)]
|
| 343 |
else:
|
| 344 |
-
|
|
|
|
| 345 |
is_last_chunk = True
|
| 346 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 347 |
if len(remaining_encoded)>max_length+min_chunk_size:# If it is not the last sub chunk
|
|
|
|
| 348 |
overlap_text_last = tokenizer.decode(current_encoded[-overlap:])
|
| 349 |
period_index_last = overlap_text_last.find('.')
|
| 350 |
if period_index_last != -1 and period_index_last < len(overlap_text_last) - 1:
|
| 351 |
-
#print(f"period index last found at {period_index_last}")
|
| 352 |
-
period_index_e = period_index_last - len(overlap_text_last)
|
| 353 |
-
#print(f"period_index_e :{period_index_e}")
|
| 354 |
-
#print(f"last :{overlap_text_last}")
|
| 355 |
if not is_first_chunk:#starting after the period in overlap
|
|
|
|
| 356 |
if period_index_b == -1:# Period not found in overlap
|
| 357 |
-
#print(". not found in overlap")
|
| 358 |
split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy()) # Keep regular splitting
|
| 359 |
else:
|
| 360 |
if is_last_chunk : #not the first but the last
|
|
|
|
| 361 |
split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:], metadata=doc.metadata.copy())
|
| 362 |
#print("Should start after \".\"")
|
| 363 |
else:
|
|
|
|
| 364 |
split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:period_index_e], metadata=doc.metadata.copy()) # Split at the begining and the end
|
| 365 |
else:#first chunk
|
|
|
|
| 366 |
split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy()) # split only at the end if its first chunk
|
| 367 |
if 'titles' in split_doc.metadata:
|
|
|
|
| 368 |
chunk_counter += 1
|
| 369 |
split_doc.metadata['chunk_id'] = chunk_counter
|
| 370 |
#A1 We could round chunk length in token if we ignore the '.' position in the overlap and save time of computation
|
| 371 |
split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
|
| 372 |
resized.append(split_doc)
|
|
|
|
| 373 |
remaining_encoded = remaining_encoded[max(10, max_length - overlap):]
|
| 374 |
is_first_chunk = False
|
| 375 |
-
#print(len(tokenizer.encode(split_doc.page_content)), split_doc.page_content, "\n-----------------")
|
| 376 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
#print(f"◀Document:{{ {doc.page_content} }} was not added because to short▶")
|
| 378 |
if 'titles' in doc.metadata:#check if it was splitted by or split_docx
|
| 379 |
chunk_counter += 1
|
| 380 |
doc.metadata['chunk_id'] = chunk_counter
|
| 381 |
-
|
|
|
|
| 382 |
resized.append(doc)
|
|
|
|
| 383 |
print(f"Number of chunks before resplitting: {len(documents)} \nAfter splitting: {len(resized)}")
|
| 384 |
return resized
|
| 385 |
|
|
|
|
| 29 |
import json
|
| 30 |
|
| 31 |
MODEL = "thenlper/gte-base"
|
| 32 |
+
CHUNK_SIZE = 1500
|
| 33 |
+
CHUNK_OVERLAP = 400
|
| 34 |
|
| 35 |
embeddings = HuggingFaceEmbeddings(
|
| 36 |
model_name=MODEL,
|
|
|
|
| 323 |
# Create an empty list to store the resized documents
|
| 324 |
resized = []
|
| 325 |
previous_file=""
|
| 326 |
+
to_encode = ""
|
| 327 |
+
skip_next = False
|
| 328 |
# Iterate through the original documents list
|
| 329 |
+
for i, doc in enumerate(documents):
|
| 330 |
+
if skip_next:
|
| 331 |
+
skip_next = False
|
| 332 |
+
continue
|
| 333 |
current_file = doc.metadata['source']
|
| 334 |
if current_file != previous_file: #chunk counting
|
| 335 |
previous_file = current_file
|
| 336 |
chunk_counter = 0
|
| 337 |
is_first_chunk = True # Keep track of the first chunk in the document
|
| 338 |
+
to_encode += doc.page_content
|
| 339 |
+
# if last chunk < min_chunk_size we add it to the previous chunk for the splitting.
|
| 340 |
+
if (documents[i+1] is documents[-1] or documents[i+1].metadata['source'] != documents[i+2].metadata['source']) and len(tokenizer.encode(documents[i+1].page_content)) < min_chunk_size: # if the next doc is the last doc of the current file or the last of the corpus
|
| 341 |
+
# print('SAME DOC')
|
| 342 |
+
skip_next = True
|
| 343 |
+
to_encode += documents[i+1].page_content
|
| 344 |
+
#print(f"to_encode:\n{to_encode}")
|
| 345 |
+
encoded = tokenizer.encode(to_encode)#encode the current document
|
| 346 |
+
if len(encoded) < min_chunk_size and not skip_next:
|
| 347 |
+
# print(f"len(encoded):{len(encoded)}<min_chunk_size:{min_chunk_size}")
|
| 348 |
+
continue
|
| 349 |
+
elif skip_next:
|
| 350 |
+
split_doc = Document(page_content=tokenizer.decode(encoded), metadata=doc.metadata.copy())
|
| 351 |
+
split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
|
| 352 |
+
resized.append(split_doc)
|
| 353 |
+
# print(f"Added a document of {split_doc.metadata['token_length']} tokens 1")
|
| 354 |
+
to_encode = ""
|
| 355 |
+
continue
|
| 356 |
+
else:
|
| 357 |
+
# print(f"len(encoded):{len(encoded)}>=min_chunk_size:{min_chunk_size}")
|
| 358 |
+
to_encode = ""
|
| 359 |
if len(encoded) > max_length:
|
| 360 |
+
# print(f"len(encoded):{len(encoded)}>=max_length:{max_length}")
|
| 361 |
remaining_encoded = encoded
|
| 362 |
is_last_chunk = False
|
| 363 |
while len(remaining_encoded) > 1 and not is_last_chunk:
|
|
|
|
| 365 |
overlap_text = tokenizer.decode(remaining_encoded[:overlap])# Index by token
|
| 366 |
period_index_b = overlap_text.find('.')# Index by character
|
| 367 |
if len(remaining_encoded)>max_length + min_chunk_size:
|
| 368 |
+
# print("len(remaining_encoded)>max_length + min_chunk_size")
|
| 369 |
current_encoded = remaining_encoded[:max(10, max_length)]
|
| 370 |
else:
|
| 371 |
+
# print("not len(remaining_encoded)>max_length + min_chunk_size")
|
| 372 |
+
current_encoded = remaining_encoded #if the last chunk is to small, concatenate it with the previous one
|
| 373 |
is_last_chunk = True
|
| 374 |
+
split_doc = Document(page_content=tokenizer.decode(current_encoded), metadata=doc.metadata.copy())
|
| 375 |
+
split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
|
| 376 |
+
resized.append(split_doc)
|
| 377 |
+
# print(f"Added a document of {split_doc.metadata['token_length']} tokens 2")
|
| 378 |
+
break
|
| 379 |
+
period_index_e = -1 # an amount of character that I am sure will be greater or equal to the max lengh of a chunk, could have done len(tokenizer.decode(current_encoded))
|
| 380 |
if len(remaining_encoded)>max_length+min_chunk_size:# If it is not the last sub chunk
|
| 381 |
+
# print("len(remaining_encoded)>max_length+min_chunk_size")
|
| 382 |
overlap_text_last = tokenizer.decode(current_encoded[-overlap:])
|
| 383 |
period_index_last = overlap_text_last.find('.')
|
| 384 |
if period_index_last != -1 and period_index_last < len(overlap_text_last) - 1:
|
| 385 |
+
# print(f"period index last found at {period_index_last}")
|
| 386 |
+
period_index_e = period_index_last - len(overlap_text_last)
|
| 387 |
+
# print(f"period_index_e :{period_index_e}")
|
| 388 |
+
# print(f"last :{overlap_text_last}")
|
| 389 |
if not is_first_chunk:#starting after the period in overlap
|
| 390 |
+
# print("not is_first_chunk", period_index_b)
|
| 391 |
if period_index_b == -1:# Period not found in overlap
|
| 392 |
+
# print(". not found in overlap")
|
| 393 |
split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy()) # Keep regular splitting
|
| 394 |
else:
|
| 395 |
if is_last_chunk : #not the first but the last
|
| 396 |
+
# print("is_last_chunk")
|
| 397 |
split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:], metadata=doc.metadata.copy())
|
| 398 |
#print("Should start after \".\"")
|
| 399 |
else:
|
| 400 |
+
# print("not is_last_chunk", period_index_e, len(to_encode))
|
| 401 |
split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:period_index_e], metadata=doc.metadata.copy()) # Split at the begining and the end
|
| 402 |
else:#first chunk
|
| 403 |
+
# print("else")
|
| 404 |
split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy()) # split only at the end if its first chunk
|
| 405 |
if 'titles' in split_doc.metadata:
|
| 406 |
+
# print("title in metadata")
|
| 407 |
chunk_counter += 1
|
| 408 |
split_doc.metadata['chunk_id'] = chunk_counter
|
| 409 |
#A1 We could round chunk length in token if we ignore the '.' position in the overlap and save time of computation
|
| 410 |
split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
|
| 411 |
resized.append(split_doc)
|
| 412 |
+
print(f"Added a document of {split_doc.metadata['token_length']} tokens 3")
|
| 413 |
remaining_encoded = remaining_encoded[max(10, max_length - overlap):]
|
| 414 |
is_first_chunk = False
|
| 415 |
+
# # print(len(tokenizer.encode(split_doc.page_content)), split_doc.page_content[:50], "\n-----------------")
|
| 416 |
+
# print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
|
| 417 |
+
# print(split_doc.page_content[:100])
|
| 418 |
+
# # print("😂😂😂😂")
|
| 419 |
+
# print(split_doc.page_content[-100:])
|
| 420 |
+
# print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
|
| 421 |
+
else:# len(encoded)>min_chunk_size:#ignore the chunks that are too small
|
| 422 |
+
print(f"found a chunk with the perfect size:{len(encoded)}")
|
| 423 |
#print(f"◀Document:{{ {doc.page_content} }} was not added because to short▶")
|
| 424 |
if 'titles' in doc.metadata:#check if it was splitted by or split_docx
|
| 425 |
chunk_counter += 1
|
| 426 |
doc.metadata['chunk_id'] = chunk_counter
|
| 427 |
+
doc.metadata['token_length'] = len(encoded)
|
| 428 |
+
doc.page_content = tokenizer.decode(encoded)
|
| 429 |
resized.append(doc)
|
| 430 |
+
print(f"Added a document of {doc.metadata['token_length']} tokens 4")
|
| 431 |
print(f"Number of chunks before resplitting: {len(documents)} \nAfter splitting: {len(resized)}")
|
| 432 |
return resized
|
| 433 |
|