Spaces:
Sleeping
Sleeping
Commit
·
90e6b4c
1
Parent(s):
f85ad1c
a new way with keywords
Browse files- documents_prep.py +68 -127
- index_retriever.py +139 -1
- requirements.txt +2 -1
- table_prep.py +108 -100
- utils.py +20 -10
documents_prep.py
CHANGED
|
@@ -14,206 +14,147 @@ def chunk_document(doc, chunk_size=None, chunk_overlap=None):
|
|
| 14 |
chunk_size = CHUNK_SIZE
|
| 15 |
if chunk_overlap is None:
|
| 16 |
chunk_overlap = CHUNK_OVERLAP
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
-
|
| 19 |
|
| 20 |
-
# Try to split by double newlines (paragraphs) first
|
| 21 |
-
paragraphs = text.split('\n\n')
|
| 22 |
-
|
| 23 |
-
chunks = []
|
| 24 |
-
current_chunk = ""
|
| 25 |
-
|
| 26 |
-
for para in paragraphs:
|
| 27 |
-
para = para.strip()
|
| 28 |
-
if not para:
|
| 29 |
-
continue
|
| 30 |
-
|
| 31 |
-
# If adding this paragraph exceeds limit, save current chunk
|
| 32 |
-
if len(current_chunk) + len(para) + 2 > chunk_size and current_chunk:
|
| 33 |
-
chunks.append(current_chunk.strip())
|
| 34 |
-
# Add overlap from end of previous chunk
|
| 35 |
-
overlap_text = current_chunk[-chunk_overlap:] if len(current_chunk) > chunk_overlap else current_chunk
|
| 36 |
-
current_chunk = overlap_text + "\n\n" + para
|
| 37 |
-
else:
|
| 38 |
-
if current_chunk:
|
| 39 |
-
current_chunk += "\n\n" + para
|
| 40 |
-
else:
|
| 41 |
-
current_chunk = para
|
| 42 |
-
|
| 43 |
-
# Add last chunk
|
| 44 |
-
if current_chunk:
|
| 45 |
-
chunks.append(current_chunk.strip())
|
| 46 |
-
|
| 47 |
-
# If single paragraph is too large, fall back to sentence splitting
|
| 48 |
-
final_chunks = []
|
| 49 |
-
for chunk_text in chunks:
|
| 50 |
-
if len(chunk_text) > chunk_size:
|
| 51 |
-
splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
| 52 |
-
final_chunks.extend(splitter.split_text(chunk_text))
|
| 53 |
-
else:
|
| 54 |
-
final_chunks.append(chunk_text)
|
| 55 |
-
|
| 56 |
-
log_message(f" ✂️ Текст разбит на {len(final_chunks)} семантических чанков")
|
| 57 |
-
|
| 58 |
-
# Create documents
|
| 59 |
chunked_docs = []
|
| 60 |
-
for i, chunk_text in enumerate(
|
| 61 |
chunk_metadata = doc.metadata.copy()
|
| 62 |
chunk_metadata.update({
|
| 63 |
"chunk_id": i,
|
| 64 |
-
"total_chunks": len(
|
| 65 |
"chunk_size": len(chunk_text),
|
| 66 |
-
"
|
| 67 |
})
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
return chunked_docs
|
| 71 |
|
| 72 |
def process_documents_with_chunking(documents):
|
| 73 |
-
log_message("\n" + "="*60)
|
| 74 |
-
log_message("🔄 НАЧАЛО ПРОЦЕССА ЧАНКИНГА")
|
| 75 |
-
log_message("="*60)
|
| 76 |
-
|
| 77 |
all_chunked_docs = []
|
| 78 |
chunk_info = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
-
|
| 81 |
-
table_whole_count = 0 # Целые таблицы (не нуждаются в чанкинге)
|
| 82 |
-
table_chunked_count = 0 # Таблицы, которые УЖЕ разбиты
|
| 83 |
-
image_whole_count = 0 # Целые изображения
|
| 84 |
-
image_chunked_count = 0 # Изображения, разбитые на чанки
|
| 85 |
-
text_whole_count = 0 # Целые текстовые документы
|
| 86 |
-
text_chunked_count = 0 # Текстовые документы, разбитые на чанки
|
| 87 |
-
|
| 88 |
-
for idx, doc in enumerate(documents):
|
| 89 |
doc_type = doc.metadata.get('type', 'text')
|
| 90 |
is_already_chunked = doc.metadata.get('is_chunked', False)
|
| 91 |
-
doc_size = len(doc.text)
|
| 92 |
-
|
| 93 |
-
log_message(f"\n📄 Документ {idx+1}/{len(documents)} | "
|
| 94 |
-
f"Тип: {doc_type} | "
|
| 95 |
-
f"Размер: {doc_size} | "
|
| 96 |
-
f"Уже разбит: {is_already_chunked}")
|
| 97 |
|
| 98 |
if doc_type == 'table':
|
| 99 |
if is_already_chunked:
|
| 100 |
-
|
| 101 |
-
table_chunked_count += 1
|
| 102 |
all_chunked_docs.append(doc)
|
| 103 |
-
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
else:
|
| 106 |
-
|
| 107 |
-
table_whole_count += 1
|
| 108 |
all_chunked_docs.append(doc)
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
|
| 119 |
-
'type': 'table',
|
| 120 |
-
'table_number': doc.metadata.get('table_number', 'unknown'),
|
| 121 |
-
'is_chunked': is_already_chunked
|
| 122 |
-
})
|
| 123 |
|
| 124 |
elif doc_type == 'image':
|
|
|
|
|
|
|
| 125 |
if doc_size > CHUNK_SIZE:
|
| 126 |
-
log_message(f"
|
|
|
|
| 127 |
chunked_docs = chunk_document(doc)
|
| 128 |
-
|
| 129 |
all_chunked_docs.extend(chunked_docs)
|
|
|
|
| 130 |
|
| 131 |
-
for chunk_doc in chunked_docs:
|
| 132 |
chunk_info.append({
|
| 133 |
'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
|
| 134 |
'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
|
| 135 |
-
'chunk_id':
|
| 136 |
-
'total_chunks': chunk_doc.metadata.get('total_chunks', 1),
|
| 137 |
'chunk_size': len(chunk_doc.text),
|
| 138 |
'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
|
| 139 |
'type': 'image',
|
| 140 |
-
'image_number': chunk_doc.metadata.get('image_number', 'unknown')
|
| 141 |
-
'is_chunked': True
|
| 142 |
})
|
| 143 |
else:
|
| 144 |
-
image_whole_count += 1
|
| 145 |
all_chunked_docs.append(doc)
|
| 146 |
-
log_message(f" ✓ Целое изображение добавлено | Размер: {doc_size}")
|
| 147 |
-
|
| 148 |
chunk_info.append({
|
| 149 |
'document_id': doc.metadata.get('document_id', 'unknown'),
|
| 150 |
'section_id': doc.metadata.get('section_id', 'unknown'),
|
| 151 |
'chunk_id': 0,
|
| 152 |
-
'total_chunks': 1,
|
| 153 |
'chunk_size': doc_size,
|
| 154 |
'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
|
| 155 |
'type': 'image',
|
| 156 |
-
'image_number': doc.metadata.get('image_number', 'unknown')
|
| 157 |
-
'is_chunked': False
|
| 158 |
})
|
| 159 |
|
| 160 |
-
else:
|
|
|
|
| 161 |
if doc_size > CHUNK_SIZE:
|
| 162 |
-
log_message(f"
|
| 163 |
-
f"Документ: {doc.metadata.get('document_id', 'unknown')} | "
|
| 164 |
-
f"Раздел: {doc.metadata.get('section_id', 'unknown')} | "
|
| 165 |
f"Размер: {doc_size} > {CHUNK_SIZE}")
|
| 166 |
-
|
| 167 |
chunked_docs = chunk_document(doc)
|
| 168 |
-
|
| 169 |
all_chunked_docs.extend(chunked_docs)
|
|
|
|
| 170 |
|
| 171 |
-
for chunk_doc in chunked_docs:
|
| 172 |
chunk_info.append({
|
| 173 |
'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
|
| 174 |
'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
|
| 175 |
-
'chunk_id':
|
| 176 |
-
'total_chunks': chunk_doc.metadata.get('total_chunks', 1),
|
| 177 |
'chunk_size': len(chunk_doc.text),
|
| 178 |
'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
|
| 179 |
-
'type': 'text'
|
| 180 |
-
'is_chunked': True
|
| 181 |
})
|
| 182 |
else:
|
| 183 |
-
text_whole_count += 1
|
| 184 |
all_chunked_docs.append(doc)
|
| 185 |
-
log_message(f" ✓ Целый текстовый документ добавлен | Размер: {doc_size}")
|
| 186 |
-
|
| 187 |
chunk_info.append({
|
| 188 |
'document_id': doc.metadata.get('document_id', 'unknown'),
|
| 189 |
'section_id': doc.metadata.get('section_id', 'unknown'),
|
| 190 |
'chunk_id': 0,
|
| 191 |
-
'total_chunks': 1,
|
| 192 |
'chunk_size': doc_size,
|
| 193 |
'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
|
| 194 |
-
'type': 'text'
|
| 195 |
-
'is_chunked': False
|
| 196 |
})
|
| 197 |
|
| 198 |
log_message(f"\n{'='*60}")
|
| 199 |
-
log_message(f"
|
| 200 |
-
log_message(f"{
|
| 201 |
-
log_message(f"
|
| 202 |
-
log_message(f"
|
| 203 |
-
log_message(f"
|
| 204 |
-
log_message(f"
|
| 205 |
-
log_message(f"
|
| 206 |
-
log_message(f" • Чанки: {image_chunked_count}")
|
| 207 |
-
log_message(f" ТЕКСТ:")
|
| 208 |
-
log_message(f" • Целые документы: {text_whole_count}")
|
| 209 |
-
log_message(f" • Чанки: {text_chunked_count}")
|
| 210 |
-
log_message(f" {'─'*58}")
|
| 211 |
-
log_message(f" ВСЕГО ДОКУМЕНТОВ В ИНДЕКСЕ: {len(all_chunked_docs)}")
|
| 212 |
log_message(f"{'='*60}\n")
|
| 213 |
|
| 214 |
return all_chunked_docs, chunk_info
|
| 215 |
|
| 216 |
-
|
| 217 |
def extract_text_from_json(data, document_id, document_name):
|
| 218 |
documents = []
|
| 219 |
|
|
|
|
| 14 |
chunk_size = CHUNK_SIZE
|
| 15 |
if chunk_overlap is None:
|
| 16 |
chunk_overlap = CHUNK_OVERLAP
|
| 17 |
+
text_splitter = SentenceSplitter(
|
| 18 |
+
chunk_size=chunk_size,
|
| 19 |
+
chunk_overlap=chunk_overlap,
|
| 20 |
+
separator=" "
|
| 21 |
+
)
|
| 22 |
|
| 23 |
+
text_chunks = text_splitter.split_text(doc.text)
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
chunked_docs = []
|
| 26 |
+
for i, chunk_text in enumerate(text_chunks):
|
| 27 |
chunk_metadata = doc.metadata.copy()
|
| 28 |
chunk_metadata.update({
|
| 29 |
"chunk_id": i,
|
| 30 |
+
"total_chunks": len(text_chunks),
|
| 31 |
"chunk_size": len(chunk_text),
|
| 32 |
+
"original_doc_id": doc.id_ if hasattr(doc, 'id_') else None
|
| 33 |
})
|
| 34 |
+
|
| 35 |
+
chunked_doc = Document(
|
| 36 |
+
text=chunk_text,
|
| 37 |
+
metadata=chunk_metadata
|
| 38 |
+
)
|
| 39 |
+
chunked_docs.append(chunked_doc)
|
| 40 |
|
| 41 |
return chunked_docs
|
| 42 |
|
| 43 |
def process_documents_with_chunking(documents):
    """Chunk oversized image/text documents and collect per-chunk bookkeeping.

    Tables are passed through untouched (they are chunked upstream and carry
    an ``is_chunked`` flag in their metadata).  Image and text documents whose
    text is longer than ``CHUNK_SIZE`` are split with ``chunk_document``;
    shorter ones are kept whole.

    Args:
        documents: iterable of document objects exposing ``.text`` and a
            dict-like ``.metadata``.

    Returns:
        A ``(all_chunked_docs, chunk_info)`` tuple: the flat list of documents
        to index, and one summary dict per entry of that list.
    """
    all_chunked_docs = []
    chunk_info = []
    table_count = 0
    table_chunks_count = 0
    # Whole images are counted explicitly; deriving the number from the chunk
    # counter afterwards cannot tell how many images were actually split.
    image_whole_count = 0
    image_chunks_count = 0
    text_chunks_count = 0

    for doc in documents:
        doc_type = doc.metadata.get('type', 'text')
        is_already_chunked = doc.metadata.get('is_chunked', False)

        if doc_type == 'table':
            all_chunked_docs.append(doc)
            if is_already_chunked:
                table_chunks_count += 1
                chunk_info.append(_chunk_info_entry(
                    doc,
                    doc.metadata.get('chunk_id', 0),
                    'table',
                    number_key='table_number',
                    with_total_chunks=True,
                ))
            else:
                table_count += 1
                chunk_info.append(_chunk_info_entry(
                    doc, 0, 'table', number_key='table_number'))

        elif doc_type == 'image':
            doc_size = len(doc.text)
            if doc_size > CHUNK_SIZE:
                log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number', 'unknown')} | "
                            f"Размер: {doc_size} > {CHUNK_SIZE}")
                chunked_docs = chunk_document(doc)
                image_chunks_count += len(chunked_docs)
                all_chunked_docs.extend(chunked_docs)
                log_message(f" ✂️ Разделено на {len(chunked_docs)} чанков")

                for i, chunk_doc in enumerate(chunked_docs):
                    chunk_info.append(_chunk_info_entry(
                        chunk_doc, i, 'image', number_key='image_number'))
            else:
                image_whole_count += 1
                all_chunked_docs.append(doc)
                chunk_info.append(_chunk_info_entry(
                    doc, 0, 'image', number_key='image_number'))

        else:
            doc_size = len(doc.text)
            if doc_size > CHUNK_SIZE:
                log_message(f"📝 CHUNKING: Текст из '{doc.metadata.get('document_id', 'unknown')}' | "
                            f"Размер: {doc_size} > {CHUNK_SIZE}")
                chunked_docs = chunk_document(doc)
                text_chunks_count += len(chunked_docs)
                all_chunked_docs.extend(chunked_docs)
                log_message(f" ✂️ Разделен на {len(chunked_docs)} чанков")

                for i, chunk_doc in enumerate(chunked_docs):
                    chunk_info.append(_chunk_info_entry(chunk_doc, i, 'text'))
            else:
                all_chunked_docs.append(doc)
                chunk_info.append(_chunk_info_entry(doc, 0, 'text'))

    log_message(f"\n{'='*60}")
    log_message("ИТОГО ОБРАБОТАНО ДОКУМЕНТОВ:")
    log_message(f" • Таблицы (целые): {table_count}")
    log_message(f" • Таблицы (чанки): {table_chunks_count}")
    log_message(f" • Изображения (целые): {image_whole_count}")
    log_message(f" • Изображения (чанки): {image_chunks_count}")
    log_message(f" • Текстовые чанки: {text_chunks_count}")
    log_message(f" • Всего документов: {len(all_chunked_docs)}")
    log_message(f"{'='*60}\n")

    return all_chunked_docs, chunk_info


def _chunk_info_entry(doc, chunk_id, doc_type, number_key=None, with_total_chunks=False):
    """Build one ``chunk_info`` record for *doc*.

    Keys are inserted in a fixed order (document_id, section_id, chunk_id,
    optional total_chunks, chunk_size, chunk_preview, type, optional
    table/image number) so every record lists its fields the same way.
    """
    text = doc.text
    entry = {
        'document_id': doc.metadata.get('document_id', 'unknown'),
        'section_id': doc.metadata.get('section_id', 'unknown'),
        'chunk_id': chunk_id,
    }
    if with_total_chunks:
        entry['total_chunks'] = doc.metadata.get('total_chunks', 1)
    entry['chunk_size'] = len(text)
    # Preview is capped at 200 characters, with an ellipsis when truncated.
    entry['chunk_preview'] = text[:200] + "..." if len(text) > 200 else text
    entry['type'] = doc_type
    if number_key is not None:
        entry[number_key] = doc.metadata.get(number_key, 'unknown')
    return entry
|
| 157 |
|
|
|
|
| 158 |
def extract_text_from_json(data, document_id, document_name):
|
| 159 |
documents = []
|
| 160 |
|
index_retriever.py
CHANGED
|
@@ -112,4 +112,142 @@ def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5, dive
|
|
| 112 |
|
| 113 |
except Exception as e:
|
| 114 |
log_message(f"Ошибка переранжировки: {str(e)}")
|
| 115 |
-
return nodes[:top_k]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
except Exception as e:
|
| 114 |
log_message(f"Ошибка переранжировки: {str(e)}")
|
| 115 |
+
return nodes[:top_k]
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
from rank_bm25 import BM25Okapi
|
| 119 |
+
import numpy as np
|
| 120 |
+
|
| 121 |
+
class HybridRetriever:
    """Rank documents by a weighted mix of vector, BM25 and metadata scores.

    ``documents`` is the corpus; each item must expose ``.text`` and a
    dict-like ``.metadata``.  BM25 and metadata scores are keyed by the
    document's position in that list.  Vector hits are mapped back onto those
    positions (see ``_resolve_doc_index``) so that all three signals are
    combined for the same document.
    """

    # Material codes such as 08Х18Н10Т: two digits, then alternating runs of
    # alloy letters and digits; a trailing letter run is allowed.
    _MATERIAL_PATTERN = r'\b\d{2}[ХНТМКВБА]+\d+(?:[ХНТМКВБА]+\d*)*\b'
    # GOST designations such as "ГОСТ Р 12345-2010".
    _GOST_PATTERN = r'ГОСТ\s+[РЕН\s]*\d+[\.\-\d]*'

    def __init__(self, vector_retriever, documents):
        """Store the vector retriever and build the BM25 and metadata indexes."""
        self.vector_retriever = vector_retriever
        self.documents = documents

        # Build BM25 index over whitespace tokens of the lower-cased text.
        tokenized_docs = [doc.text.lower().split() for doc in documents]
        # NOTE(review): BM25Okapi presumably fails on an empty corpus
        # (average document length is undefined), so it is skipped then.
        self.bm25 = BM25Okapi(tokenized_docs) if tokenized_docs else None

        # Build metadata index for exact matching
        self.metadata_index = self._build_metadata_index(documents)

    @staticmethod
    def _normalize_key(value):
        """Canonical form for material/GOST keys: upper-case, single spaces,
        no trailing '.' or '-' picked up from surrounding punctuation."""
        return ' '.join(str(value).upper().split()).rstrip('.-')

    def _build_metadata_index(self, documents):
        """Index documents by materials, GOSTs, classes and key terms.

        Returns a dict with the keys ``materials``, ``gosts``, ``classes`` and
        ``key_terms``; each maps a term to the list of document positions
        whose metadata contains it.  Material and GOST keys are normalised
        with ``_normalize_key`` and key terms are lower-cased, so lookups are
        case-insensitive.
        """
        index = {
            'materials': {},
            'gosts': {},
            'classes': {},
            'key_terms': {}
        }

        def add(bucket_name, key, position):
            bucket = index[bucket_name].setdefault(key, [])
            # Spellings that normalise to the same key must not count a
            # document twice.
            if not bucket or bucket[-1] != position:
                bucket.append(position)

        for i, doc in enumerate(documents):
            metadata = doc.metadata

            for material in metadata.get('materials', []) or ():
                add('materials', self._normalize_key(material), i)

            for gost in metadata.get('gosts', []) or ():
                add('gosts', self._normalize_key(gost), i)

            for cls in metadata.get('classes', []) or ():
                add('classes', cls, i)

            for term in metadata.get('key_terms', []) or ():
                add('key_terms', term.lower(), i)

        return index

    def _doc_index_map(self):
        """Return (building it once) a map from ``doc.id_`` to list position."""
        mapping = getattr(self, '_doc_index_by_id', None)
        if mapping is None:
            mapping = {}
            for i, doc in enumerate(self.documents):
                doc_id = getattr(doc, 'id_', None)
                if doc_id is not None:
                    mapping.setdefault(doc_id, i)
            self._doc_index_by_id = mapping
        return mapping

    @staticmethod
    def _resolve_doc_index(hit, id_to_index, doc_count):
        """Map a vector hit to a position in ``self.documents`` or ``None``.

        An in-range integer ``node_id`` is taken as the position itself.
        Otherwise the hit's ``node_id`` and, if present, the underlying node's
        ``ref_doc_id`` are looked up among the documents' ``id_`` values.
        """
        node_id = getattr(hit, 'node_id', None)
        if (isinstance(node_id, int) and not isinstance(node_id, bool)
                and 0 <= node_id < doc_count):
            return node_id
        # NOTE(review): assumes the vector index exposes the source document
        # id as ``hit.node.ref_doc_id`` — confirm against the index in use.
        ref_doc_id = getattr(getattr(hit, 'node', None), 'ref_doc_id', None)
        for candidate in (node_id, ref_doc_id):
            if isinstance(candidate, str) and candidate in id_to_index:
                return id_to_index[candidate]
        return None

    def retrieve(self, query, top_k=20, vector_weight=0.5, bm25_weight=0.3, metadata_weight=0.2):
        """Hybrid retrieval combining vector, BM25, and metadata matching.

        Args:
            query: user query string.
            top_k: maximum number of results returned.
            vector_weight, bm25_weight, metadata_weight: weights of the three
                signals in the combined score.

        Returns:
            A list of node-like objects with ``text``, ``metadata``, ``score``
            (the combined score) and ``node_id``, best first.  ``node_id`` is
            the document's position in ``self.documents``; a vector hit that
            cannot be mapped to a document keeps its own node id.
        """
        from types import SimpleNamespace

        doc_count = len(self.documents)
        id_to_index = self._doc_index_map()

        # 1. Vector search, re-keyed by document position where possible.
        vector_scores = {}
        unresolved_hits = {}
        for hit in self.vector_retriever.retrieve(query):
            hit_score = hit.score or 0.0
            key = self._resolve_doc_index(hit, id_to_index, doc_count)
            if key is None:
                key = hit.node_id
                unresolved_hits[key] = hit
            # Several nodes may belong to one document: keep the best score.
            if key not in vector_scores or hit_score > vector_scores[key]:
                vector_scores[key] = hit_score

        # 2. BM25 search
        if self.bm25 is not None and doc_count:
            bm25_scores = self.bm25.get_scores(query.lower().split())
        else:
            bm25_scores = []
        # Normaliser is loop-invariant; the epsilon avoids division by zero.
        bm25_norm = (max(bm25_scores) + 1e-6) if len(bm25_scores) else 1.0

        # 3. Metadata exact matching
        metadata_scores = self._get_metadata_scores(query)

        # 4. Combine scores
        combined_scores = {}
        for i in range(doc_count):
            bm25_score = bm25_scores[i] if i < len(bm25_scores) else 0.0
            combined_scores[i] = (
                vector_weight * vector_scores.get(i, 0.0) +
                bm25_weight * (bm25_score / bm25_norm) +
                metadata_weight * metadata_scores.get(i, 0.0)
            )
        for key in unresolved_hits:
            combined_scores[key] = vector_weight * vector_scores[key]

        # 5. Get top-k
        ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]

        # Return as node-like objects
        results = []
        for key, score in ranked:
            source = unresolved_hits[key] if key in unresolved_hits else self.documents[key]
            results.append(SimpleNamespace(
                text=source.text,
                metadata=source.metadata,
                score=score,
                node_id=key
            ))

        return results

    def _get_metadata_scores(self, query):
        """Score documents by exact metadata matches found in *query*.

        Each distinct material code found adds 1.0, each distinct GOST 0.8 and
        each indexed key term contained in the lower-cased query 0.5 to every
        document listed for it.  Returns ``{document position: score}`` for
        the documents that matched at all.
        """
        import re

        scores = {}
        query_lower = query.lower()

        def bump(doc_ids, weight):
            for doc_id in doc_ids:
                scores[doc_id] = scores.get(doc_id, 0) + weight

        def distinct(pattern):
            # Normalise so the lookup agrees with the index keys; dict keeps
            # first-seen order and drops repeats.
            found = re.findall(pattern, query, re.IGNORECASE)
            return dict.fromkeys(self._normalize_key(item) for item in found)

        # Check for material codes
        for material in distinct(self._MATERIAL_PATTERN):
            bump(self.metadata_index['materials'].get(material, ()), 1.0)

        # Check for GOSTs
        for gost in distinct(self._GOST_PATTERN):
            bump(self.metadata_index['gosts'].get(gost, ()), 0.8)

        # Check for key terms
        for term, doc_ids in self.metadata_index['key_terms'].items():
            if term in query_lower:
                bump(doc_ids, 0.5)

        return scores
|
requirements.txt
CHANGED
|
@@ -15,4 +15,5 @@ openpyxl
|
|
| 15 |
llama-index-llms-openai
|
| 16 |
llama-index-vector-stores-faiss
|
| 17 |
llama-index-retrievers-bm25
|
| 18 |
-
tiktoken
|
|
|
|
|
|
| 15 |
llama-index-llms-openai
|
| 16 |
llama-index-vector-stores-faiss
|
| 17 |
llama-index-retrievers-bm25
|
| 18 |
+
tiktoken
|
| 19 |
+
rank-bm25
|
table_prep.py
CHANGED
|
@@ -32,80 +32,93 @@ def create_table_content(table_data):
|
|
| 32 |
from llama_index.core.text_splitter import SentenceSplitter
|
| 33 |
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
section = table_data.get('section', 'Неизвестно')
|
| 43 |
-
headers = table_data.get('headers', [])
|
| 44 |
-
table_rows = table_data.get('data', [])
|
| 45 |
-
|
| 46 |
-
if not table_rows:
|
| 47 |
-
return []
|
| 48 |
-
|
| 49 |
-
# Create header string that will be included in EVERY chunk
|
| 50 |
-
header_context = f"Таблица {table_num}: {table_title}\n"
|
| 51 |
-
header_context += f"Документ: {doc_id}\n"
|
| 52 |
-
header_context += f"Раздел: {section}\n"
|
| 53 |
-
if headers:
|
| 54 |
-
header_context += f"Заголовки: {' | '.join(headers)}\n"
|
| 55 |
-
header_context += f"Всего строк в таблице: {len(table_rows)}\n\n"
|
| 56 |
|
| 57 |
-
#
|
| 58 |
-
|
| 59 |
-
max_chunk_size = CHUNK_SIZE - len(header_context) - 500 # Safety margin
|
| 60 |
-
optimal_rows = max(5, int(max_chunk_size / avg_row_size))
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
-
|
| 66 |
-
total_rows = len(table_rows)
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
# Build chunk content
|
| 72 |
-
chunk_content = header_context
|
| 73 |
-
chunk_content += f"[Строки {i+1}-{min(i+optimal_rows, total_rows)} из {total_rows}]\n"
|
| 74 |
-
chunk_content += "Данные:\n"
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
"
|
| 83 |
-
"
|
| 84 |
-
"
|
| 85 |
-
|
| 86 |
-
"section": section,
|
| 87 |
-
"section_id": section,
|
| 88 |
-
"headers": headers,
|
| 89 |
-
"chunk_id": i // optimal_rows,
|
| 90 |
-
"total_chunks": (total_rows + optimal_rows - 1) // optimal_rows,
|
| 91 |
-
"row_range": f"{i+1}-{min(i+optimal_rows, total_rows)}",
|
| 92 |
-
"total_table_rows": total_rows,
|
| 93 |
-
"is_chunked": True
|
| 94 |
-
}
|
| 95 |
|
| 96 |
-
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
-
return
|
|
|
|
| 103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
def table_to_document(table_data, document_id=None):
|
| 106 |
-
"""
|
| 107 |
-
Convert table to Document(s) with intelligent chunking
|
| 108 |
-
"""
|
| 109 |
if not isinstance(table_data, dict):
|
| 110 |
log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
|
| 111 |
return []
|
|
@@ -116,46 +129,41 @@ def table_to_document(table_data, document_id=None):
|
|
| 116 |
section = table_data.get('section', 'Неизвестно')
|
| 117 |
|
| 118 |
table_rows = table_data.get('data', [])
|
| 119 |
-
if not table_rows:
|
| 120 |
-
log_message(f"⚠️ ПРОПУЩЕНА: Таблица {table_num} из '{doc_id}' - нет данных")
|
| 121 |
return []
|
| 122 |
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
"is_chunked": False
|
| 150 |
-
}
|
| 151 |
-
)
|
| 152 |
-
return [doc]
|
| 153 |
else:
|
| 154 |
-
log_message(f"
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
return chunks
|
| 159 |
|
| 160 |
def load_table_data(repo_id, hf_token, table_data_dir):
|
| 161 |
log_message("=" * 60)
|
|
|
|
| 32 |
from llama_index.core.text_splitter import SentenceSplitter
|
| 33 |
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 34 |
|
| 35 |
+
# In table_prep.py - replace chunk_table_document function
|
| 36 |
+
|
| 37 |
+
def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
    """Split a table document into chunks that each carry table-wide context.

    Metadata is extracted once from the full table text with
    ``extract_table_metadata``.  The text is then split on newlines with
    ``SentenceSplitter``, and every chunk gets a copy of the table metadata
    plus a short bracketed header naming the table, its materials and its
    key terms.

    Args:
        doc: table document exposing ``.text`` and a dict-like ``.metadata``.
        chunk_size: splitter chunk size; defaults to ``CHUNK_SIZE``.
        chunk_overlap: splitter overlap; defaults to ``CHUNK_OVERLAP``.

    Returns:
        A list of ``Document`` objects, one per chunk, in table order.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_overlap is None:
        chunk_overlap = CHUNK_OVERLAP

    # Extract critical metadata from the whole table before chunking, so a
    # chunk can be found by a term that only occurs elsewhere in the table.
    table_metadata = extract_table_metadata(doc.text)
    materials = table_metadata.get("materials", [])
    gosts = table_metadata.get("gosts", [])
    classes = table_metadata.get("classes", [])
    key_terms = table_metadata.get("key_terms", [])
    summary = table_metadata.get("summary", "")

    text_splitter = SentenceSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separator="\n"
    )

    text_chunks = text_splitter.split_text(doc.text)

    # The context header is identical for every chunk: build it once.
    header = (
        f"[Таблица {doc.metadata.get('table_number')}: {doc.metadata.get('table_title')}]\n"
        f"[Материалы в таблице: {', '.join(materials[:10])}]\n"
        f"[Ключевые термины: {', '.join(key_terms[:10])}]\n"
        "\n"
    )

    chunked_docs = []
    for i, chunk_text in enumerate(text_chunks):
        chunk_metadata = doc.metadata.copy()

        # Add extracted keywords/materials to each chunk.  Lists are copied so
        # chunks do not share mutable metadata.
        chunk_metadata.update({
            "chunk_id": i,
            "total_chunks": len(text_chunks),
            # Length of the raw chunk, before the context header is prepended.
            "chunk_size": len(chunk_text),
            "is_chunked": True,
            "materials": list(materials),  # All materials from table
            # GOSTs and classes are read from chunk metadata by the metadata
            # index of the hybrid retriever, so they must be stored as well.
            "gosts": list(gosts),
            "classes": list(classes),
            "key_terms": list(key_terms),  # Technical terms
            "table_summary": summary  # Brief table description
        })

        chunked_docs.append(Document(
            text=header + chunk_text,
            metadata=chunk_metadata
        ))

    return chunked_docs
|
| 83 |
+
|
| 84 |
|
| 85 |
+
def extract_table_metadata(table_text):
    """Extract searchable metadata from table content.

    Args:
        table_text: the table rendered as plain text.

    Returns:
        A dict with:
            ``materials`` - material codes found (e.g. 08Х18Н10T-style grades),
            ``gosts``     - GOST designations found,
            ``classes``   - class/category codes found,
            ``key_terms`` - known technical keywords present, followed by
                            the GOST designations,
            ``summary``   - the first non-empty lines (at most five lines are
                            inspected) joined by spaces, cut to 200 characters.
        Every list keeps first-occurrence order without repeats, so the
        result is the same on every run.
    """
    import re

    def unique_matches(pattern):
        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # would make the order (and any later slice of it) vary between runs.
        return list(dict.fromkeys(re.findall(pattern, table_text, re.IGNORECASE)))

    # Extract material codes (e.g., 08Х18Н10Т): two digits, then alternating
    # runs of alloy letters and digits; the code may end with a letter run.
    material_pattern = r'\b\d{2}[ХНТМКВБА]+\d+(?:[ХНТМКВБА]+\d*)*\b'
    materials = unique_matches(material_pattern)

    # Extract GOST standards
    gost_pattern = r'ГОСТ\s+[РЕН\s]*\d+[\.\-\d]*'
    gosts = unique_matches(gost_pattern)

    # Extract class/category codes
    class_pattern = r'\b\d[АБВСI]+[IVX]+[a-z]*\b'
    classes = unique_matches(class_pattern)

    # Extract common technical terms (plain substring test, case-insensitive)
    keywords = ['контроль', 'испытание', 'сертификат', 'качество', 'план',
                'полуфабрикат', 'оборудование', 'арматура', 'деталь']
    lowered_text = table_text.lower()
    tech_terms = [keyword for keyword in keywords if keyword in lowered_text]

    # Create brief summary from the first five lines
    head_lines = (line.strip() for line in table_text.split('\n')[:5])
    summary = ' '.join(line for line in head_lines if line)[:200]

    return {
        "materials": materials,
        "gosts": gosts,
        "classes": classes,
        "key_terms": tech_terms + gosts,
        "summary": summary
    }
|
| 120 |
|
| 121 |
def table_to_document(table_data, document_id=None):
|
|
|
|
|
|
|
|
|
|
| 122 |
if not isinstance(table_data, dict):
|
| 123 |
log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
|
| 124 |
return []
|
|
|
|
| 129 |
section = table_data.get('section', 'Неизвестно')
|
| 130 |
|
| 131 |
table_rows = table_data.get('data', [])
|
| 132 |
+
if not table_rows or len(table_rows) == 0:
|
| 133 |
+
log_message(f"⚠️ ПРОПУЩЕНА: Таблица {table_num} из '{doc_id}' - нет данных в 'data'")
|
| 134 |
return []
|
| 135 |
|
| 136 |
+
content = create_table_content(table_data)
|
| 137 |
+
content_size = len(content)
|
| 138 |
+
row_count = len(table_rows)
|
| 139 |
+
|
| 140 |
+
base_doc = Document(
|
| 141 |
+
text=content,
|
| 142 |
+
metadata={
|
| 143 |
+
"type": "table",
|
| 144 |
+
"table_number": table_num,
|
| 145 |
+
"table_title": table_title,
|
| 146 |
+
"document_id": doc_id,
|
| 147 |
+
"section": section,
|
| 148 |
+
"section_id": section,
|
| 149 |
+
"total_rows": row_count,
|
| 150 |
+
"content_size": content_size
|
| 151 |
+
}
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
if content_size > CHUNK_SIZE:
|
| 155 |
+
log_message(f"📊 CHUNKING: Таблица {table_num} из '{doc_id}' | "
|
| 156 |
+
f"Размер: {content_size} > {CHUNK_SIZE} | Строк: {row_count}")
|
| 157 |
+
chunked_docs = chunk_table_document(base_doc)
|
| 158 |
+
log_message(f" ✂️ Разделена на {len(chunked_docs)} чанков")
|
| 159 |
+
for i, chunk_doc in enumerate(chunked_docs):
|
| 160 |
+
log_message(f" Чанк {i+1}: {chunk_doc.metadata['chunk_size']} символов")
|
| 161 |
+
return chunked_docs
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
else:
|
| 163 |
+
log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
|
| 164 |
+
f"Размер: {content_size} символов | Строк: {row_count}")
|
| 165 |
+
return [base_doc]
|
| 166 |
+
|
|
|
|
| 167 |
|
| 168 |
def load_table_data(repo_id, hf_token, table_data_dir):
|
| 169 |
log_message("=" * 60)
|
utils.py
CHANGED
|
@@ -21,9 +21,11 @@ def get_llm_model(model_name):
|
|
| 21 |
raise Exception(f"API ключ не найден для модели {model_name}")
|
| 22 |
|
| 23 |
if model_config["provider"] == "google":
|
|
|
|
| 24 |
return GoogleGenAI(
|
| 25 |
model=model_config["model_name"],
|
| 26 |
-
api_key=model_config["api_key"]
|
|
|
|
| 27 |
)
|
| 28 |
elif model_config["provider"] == "openai":
|
| 29 |
return OpenAI(
|
|
@@ -35,7 +37,11 @@ def get_llm_model(model_name):
|
|
| 35 |
|
| 36 |
except Exception as e:
|
| 37 |
log_message(f"Ошибка создания модели {model_name}: {str(e)}")
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
|
| 41 |
return HuggingFaceEmbedding(model_name=model_name)
|
|
@@ -225,7 +231,7 @@ def generate_sources_html(nodes, chunks_df=None):
|
|
| 225 |
|
| 226 |
html += "</div>"
|
| 227 |
return html
|
| 228 |
-
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
|
| 229 |
if query_engine is None:
|
| 230 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
|
| 231 |
|
|
@@ -234,18 +240,22 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
|
|
| 234 |
|
| 235 |
llm = get_llm_model(current_model)
|
| 236 |
|
| 237 |
-
#
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
|
|
|
|
|
|
|
|
|
| 241 |
|
|
|
|
| 242 |
reranked_nodes = rerank_nodes(
|
| 243 |
question,
|
| 244 |
retrieved_nodes,
|
| 245 |
reranker,
|
| 246 |
-
top_k=20
|
| 247 |
-
min_score_threshold=0.
|
| 248 |
-
diversity_penalty=0.
|
| 249 |
)
|
| 250 |
|
| 251 |
formatted_context = format_context_for_llm(reranked_nodes)
|
|
|
|
| 21 |
raise Exception(f"API ключ не найден для модели {model_name}")
|
| 22 |
|
| 23 |
if model_config["provider"] == "google":
|
| 24 |
+
# Fix: Remove image_config parameter or set it properly
|
| 25 |
return GoogleGenAI(
|
| 26 |
model=model_config["model_name"],
|
| 27 |
+
api_key=model_config["api_key"],
|
| 28 |
+
# Don't pass image_config=None
|
| 29 |
)
|
| 30 |
elif model_config["provider"] == "openai":
|
| 31 |
return OpenAI(
|
|
|
|
| 37 |
|
| 38 |
except Exception as e:
|
| 39 |
log_message(f"Ошибка создания модели {model_name}: {str(e)}")
|
| 40 |
+
# Fix: Also apply to fallback model
|
| 41 |
+
return GoogleGenAI(
|
| 42 |
+
model="gemini-2.0-flash",
|
| 43 |
+
api_key=GOOGLE_API_KEY
|
| 44 |
+
)
|
| 45 |
|
| 46 |
def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
|
| 47 |
return HuggingFaceEmbedding(model_name=model_name)
|
|
|
|
| 231 |
|
| 232 |
html += "</div>"
|
| 233 |
return html
|
| 234 |
+
def answer_question(question, query_engine, reranker, current_model, chunks_df=None, hybrid_retriever=None):
|
| 235 |
if query_engine is None:
|
| 236 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
|
| 237 |
|
|
|
|
| 240 |
|
| 241 |
llm = get_llm_model(current_model)
|
| 242 |
|
| 243 |
+
# Use hybrid retriever if available
|
| 244 |
+
if hybrid_retriever:
|
| 245 |
+
retrieved_nodes = hybrid_retriever.retrieve(question, top_k=30)
|
| 246 |
+
log_message(f"Hybrid retrieval: получено {len(retrieved_nodes)} узлов")
|
| 247 |
+
else:
|
| 248 |
+
retrieved_nodes = query_engine.retriever.retrieve(question)
|
| 249 |
+
log_message(f"Vector retrieval: получено {len(retrieved_nodes)} узлов")
|
| 250 |
|
| 251 |
+
# Rerank with increased top_k
|
| 252 |
reranked_nodes = rerank_nodes(
|
| 253 |
question,
|
| 254 |
retrieved_nodes,
|
| 255 |
reranker,
|
| 256 |
+
top_k=25, # Increased from 20
|
| 257 |
+
min_score_threshold=0.3, # Lowered from 0.5 to catch more results
|
| 258 |
+
diversity_penalty=0.2 # Reduced penalty
|
| 259 |
)
|
| 260 |
|
| 261 |
formatted_context = format_context_for_llm(reranked_nodes)
|