Spaces:
Sleeping
Sleeping
Commit
·
ee54fb7
1
Parent(s):
88291da
top-k query = 50, similarity cutoff = 0.55, table chunk size = 2500
Browse files: table_prep.py (+4 -3)
table_prep.py
CHANGED
|
@@ -32,7 +32,7 @@ def create_table_content(table_data):
|
|
| 32 |
from llama_index.core.text_splitter import SentenceSplitter
|
| 33 |
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 34 |
|
| 35 |
-
def chunk_table_document(doc, max_chunk_size=2500):
|
| 36 |
lines = doc.text.strip().split('\n')
|
| 37 |
|
| 38 |
# Separate header and data rows
|
|
@@ -70,11 +70,12 @@ def chunk_table_document(doc, max_chunk_size=2500):
|
|
| 70 |
|
| 71 |
current_rows.append(row)
|
| 72 |
current_size += row_size
|
| 73 |
-
|
| 74 |
# Add final chunk
|
| 75 |
if current_rows:
|
| 76 |
chunk_text = header + '\n'.join(current_rows)
|
| 77 |
chunks.append(chunk_text)
|
|
|
|
| 78 |
|
| 79 |
# Create Document objects
|
| 80 |
chunked_docs = []
|
|
@@ -137,7 +138,7 @@ def table_to_document(table_data, document_id=None):
|
|
| 137 |
"section": section
|
| 138 |
}
|
| 139 |
)
|
| 140 |
-
if len(content) >
|
| 141 |
chunks = chunk_table_document(base_doc)
|
| 142 |
log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
|
| 143 |
return chunk_table_document(base_doc)
|
|
|
|
| 32 |
from llama_index.core.text_splitter import SentenceSplitter
|
| 33 |
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 34 |
|
| 35 |
+
def chunk_table_document(doc, max_chunk_size=4000):
|
| 36 |
lines = doc.text.strip().split('\n')
|
| 37 |
|
| 38 |
# Separate header and data rows
|
|
|
|
| 70 |
|
| 71 |
current_rows.append(row)
|
| 72 |
current_size += row_size
|
| 73 |
+
log_message(f"Добавлена строка к текущему чанку, текущий размер {current_size} символов")
|
| 74 |
# Add final chunk
|
| 75 |
if current_rows:
|
| 76 |
chunk_text = header + '\n'.join(current_rows)
|
| 77 |
chunks.append(chunk_text)
|
| 78 |
+
log_message(f"Создана финальная часть таблицы размером {len(chunk_text)} символов с {len(current_rows)} строками")
|
| 79 |
|
| 80 |
# Create Document objects
|
| 81 |
chunked_docs = []
|
|
|
|
| 138 |
"section": section
|
| 139 |
}
|
| 140 |
)
|
| 141 |
+
if len(content) > 4000:
|
| 142 |
chunks = chunk_table_document(base_doc)
|
| 143 |
log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
|
| 144 |
return chunk_table_document(base_doc)
|