Spaces:
Sleeping
Sleeping
Commit
·
9da507d
1
Parent(s):
a42e1ff
eski holat with utils simplified
Browse files- index_retriever.py +2 -2
- table_prep.py +59 -66
index_retriever.py
CHANGED
|
@@ -46,12 +46,12 @@ def create_query_engine(vector_index):
|
|
| 46 |
|
| 47 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 48 |
docstore=vector_index.docstore,
|
| 49 |
-
similarity_top_k=
|
| 50 |
)
|
| 51 |
|
| 52 |
vector_retriever = VectorIndexRetriever(
|
| 53 |
index=vector_index,
|
| 54 |
-
similarity_top_k=
|
| 55 |
similarity_cutoff=0.65
|
| 56 |
)
|
| 57 |
|
|
|
|
| 46 |
|
| 47 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 48 |
docstore=vector_index.docstore,
|
| 49 |
+
similarity_top_k=50
|
| 50 |
)
|
| 51 |
|
| 52 |
vector_retriever = VectorIndexRetriever(
|
| 53 |
index=vector_index,
|
| 54 |
+
similarity_top_k=50,
|
| 55 |
similarity_cutoff=0.65
|
| 56 |
)
|
| 57 |
|
table_prep.py
CHANGED
|
@@ -35,128 +35,121 @@ from config import CHUNK_SIZE, CHUNK_OVERLAP
|
|
| 35 |
def chunk_table_document(doc, max_rows_per_chunk=5, max_chunk_size=2000):
|
| 36 |
"""Simple table chunking: max 5 rows or 2000 chars per chunk"""
|
| 37 |
|
| 38 |
-
table_num = doc.metadata.get('table_number', 'unknown')
|
| 39 |
-
|
| 40 |
-
# Parse table
|
| 41 |
lines = doc.text.strip().split('\n')
|
| 42 |
|
| 43 |
-
|
|
|
|
| 44 |
data_rows = []
|
| 45 |
in_data = False
|
| 46 |
|
| 47 |
for line in lines:
|
| 48 |
if line.startswith('Данные таблицы:'):
|
| 49 |
in_data = True
|
| 50 |
-
|
| 51 |
elif in_data and line.startswith('Строка'):
|
| 52 |
data_rows.append(line)
|
| 53 |
elif not in_data:
|
| 54 |
-
|
| 55 |
|
| 56 |
-
|
| 57 |
|
|
|
|
| 58 |
if not data_rows:
|
| 59 |
-
# No rows, return as is
|
| 60 |
return [doc]
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
# Simple chunking
|
| 65 |
chunks = []
|
| 66 |
-
|
| 67 |
-
current_size = len(
|
| 68 |
|
| 69 |
for row in data_rows:
|
| 70 |
-
row_size = len(row) + 1
|
| 71 |
|
| 72 |
-
# Check if
|
| 73 |
-
if (len(
|
| 74 |
-
current_size + row_size > max_chunk_size) and
|
| 75 |
|
| 76 |
# Save current chunk
|
| 77 |
-
chunk_text =
|
| 78 |
chunks.append(chunk_text)
|
| 79 |
-
log_message(f" Чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
|
| 80 |
|
| 81 |
-
# Start new chunk
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
current_size = len(table_header) + len(current_chunk_rows[0]) + 1
|
| 85 |
-
else:
|
| 86 |
-
current_chunk_rows = []
|
| 87 |
-
current_size = len(table_header)
|
| 88 |
|
| 89 |
-
|
| 90 |
current_size += row_size
|
| 91 |
|
| 92 |
-
#
|
| 93 |
-
if
|
| 94 |
-
chunk_text =
|
| 95 |
chunks.append(chunk_text)
|
| 96 |
-
log_message(f" Последний чанк: {len(current_chunk_rows)} строк")
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
# Create documents
|
| 101 |
chunked_docs = []
|
| 102 |
for i, chunk_text in enumerate(chunks):
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
return chunked_docs
|
| 115 |
|
| 116 |
|
| 117 |
def table_to_document(table_data, document_id=None):
|
|
|
|
|
|
|
| 118 |
if not isinstance(table_data, dict):
|
| 119 |
-
log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
|
| 120 |
return []
|
| 121 |
|
| 122 |
doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
|
| 123 |
table_num = table_data.get('table_number', 'Неизвестно')
|
| 124 |
table_title = table_data.get('table_title', 'Неизвестно')
|
| 125 |
section = table_data.get('section', 'Неизвестно')
|
| 126 |
-
|
| 127 |
table_rows = table_data.get('data', [])
|
| 128 |
-
|
| 129 |
-
|
| 130 |
return []
|
| 131 |
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
|
|
|
| 136 |
base_doc = Document(
|
| 137 |
text=content,
|
| 138 |
metadata={
|
| 139 |
"type": "table",
|
| 140 |
"table_number": table_num,
|
| 141 |
-
"table_title": table_title,
|
| 142 |
"document_id": doc_id,
|
| 143 |
-
"section": section
|
| 144 |
-
"section_id": section,
|
| 145 |
-
"total_rows": row_count,
|
| 146 |
-
"content_size": content_size
|
| 147 |
}
|
| 148 |
)
|
|
|
|
|
|
|
| 149 |
|
| 150 |
-
|
| 151 |
-
chunked_docs = chunk_table_document(base_doc)
|
| 152 |
-
log_message(f" ✂️ Разделена на {len(chunked_docs)} чанков")
|
| 153 |
-
for i, chunk_doc in enumerate(chunked_docs):
|
| 154 |
-
log_message(f" Чанк {i+1}: {chunk_doc.metadata['chunk_size']} символов")
|
| 155 |
-
return chunked_docs
|
| 156 |
-
else:
|
| 157 |
-
log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
|
| 158 |
-
f"Размер: {content_size} символов | Строк: {row_count}")
|
| 159 |
-
return [base_doc]
|
| 160 |
|
| 161 |
|
| 162 |
def load_table_data(repo_id, hf_token, table_data_dir):
|
|
|
|
| 35 |
def chunk_table_document(doc, max_rows_per_chunk=5, max_chunk_size=2000):
    """Split a table Document into smaller overlapping Documents.

    The document text is expected to contain a header section followed by a
    'Данные таблицы:' marker and data lines starting with 'Строка'.  Every
    chunk repeats the full header; consecutive chunks overlap by one data
    row so no row boundary is lost to retrieval.

    Args:
        doc: source Document with ``.text`` and ``.metadata``.
        max_rows_per_chunk: soft cap on data rows per chunk.
        max_chunk_size: soft cap on chunk length in characters (a single
            oversized row can still push a chunk past it).

    Returns:
        List of chunk Documents, or ``[doc]`` unchanged when no data rows
        are found.
    """
    lines = doc.text.strip().split('\n')

    # Separate the header (everything up to and including the data marker)
    # from the data rows.  NOTE(review): lines inside the data section that
    # do not start with 'Строка' are silently dropped — confirm the upstream
    # formatter never wraps a row across lines.
    header_lines = []
    data_rows = []
    in_data = False

    for line in lines:
        if line.startswith('Данные таблицы:'):
            in_data = True
            header_lines.append(line)
        elif in_data and line.startswith('Строка'):
            data_rows.append(line)
        elif not in_data:
            header_lines.append(line)

    header = '\n'.join(header_lines) + '\n'

    # Nothing to chunk — return the original document untouched.
    if not data_rows:
        return [doc]

    # Greedily pack rows into chunks, honouring both the row and size caps.
    chunks = []
    current_rows = []
    current_size = len(header)

    for row in data_rows:
        row_size = len(row) + 1  # +1 for the joining newline

        # Flush when either cap is hit — but never emit an empty chunk.
        if (len(current_rows) >= max_rows_per_chunk or
                current_size + row_size > max_chunk_size) and current_rows:
            chunks.append(header + '\n'.join(current_rows))

            # Start the next chunk, repeating the last row for overlap.
            current_rows = [current_rows[-1]]
            current_size = len(header) + len(current_rows[0]) + 1

        current_rows.append(row)
        current_size += row_size

    # Flush the final, partially filled chunk.
    if current_rows:
        chunks.append(header + '\n'.join(current_rows))

    # Wrap each chunk in a Document, propagating the source metadata.
    chunked_docs = []
    for i, chunk_text in enumerate(chunks):
        chunk_doc = Document(
            text=chunk_text,
            metadata={
                "type": "table",
                "table_number": doc.metadata.get('table_number'),
                "document_id": doc.metadata.get('document_id'),
                "section": doc.metadata.get('section'),
                "chunk_id": i,
                "total_chunks": len(chunks),
                # Restored: callers previously logged metadata['chunk_size'].
                "chunk_size": len(chunk_text),
                "is_chunked": True
            }
        )
        chunked_docs.append(chunk_doc)

    return chunked_docs
|
| 106 |
|
| 107 |
|
| 108 |
def table_to_document(table_data, document_id=None):
    """Convert one parsed table dict into Document(s), chunking if large.

    Builds a human-readable rendering (table number, title, source document,
    section, optional column headers, then one 'Строка N: ...' line per data
    row) and wraps it in a Document.  Renderings longer than 2000 characters
    are delegated to chunk_table_document().

    Args:
        table_data: dict with 'table_number', 'table_title', 'section',
            optional 'headers', and 'data' (a list of row dicts).
        document_id: overrides table_data['document_id'] / ['document'].

    Returns:
        List of Documents (one, or several chunks), or [] when the input is
        not a dict or contains no rows.
    """
    if not isinstance(table_data, dict):
        return []

    doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')
    table_rows = table_data.get('data', [])

    if not table_rows:
        return []

    # Accumulate pieces and join once — avoids quadratic `content +=`.
    parts = [
        f"Таблица: {table_num}\n",
        f"Название: {table_title}\n",
        f"Документ: {doc_id}\n",
        f"Раздел: {section}\n",
    ]

    headers = table_data.get('headers', [])
    if headers:
        parts.append(f"\nЗаголовки: {' | '.join(headers)}\n")

    parts.append("\nДанные таблицы:\n")
    for row_idx, row in enumerate(table_rows, start=1):
        if isinstance(row, dict):
            # Fix: skip only missing/empty cells — a bare truthiness test
            # (`if v`) would also drop legitimate 0 / 0.0 / False values.
            row_text = " | ".join(
                f"{k}: {v}" for k, v in row.items()
                if v is not None and v != ""
            )
            parts.append(f"Строка {row_idx}: {row_text}\n")

    content = "".join(parts)

    base_doc = Document(
        text=content,
        metadata={
            "type": "table",
            "table_number": table_num,
            "document_id": doc_id,
            "section": section
        }
    )

    # 2000 mirrors chunk_table_document's default max_chunk_size.
    if len(content) > 2000:
        return chunk_table_document(base_doc)

    return [base_doc]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
|
| 155 |
def load_table_data(repo_id, hf_token, table_data_dir):
|