Spaces:
Sleeping
Sleeping
Commit
·
aa38fcf
1
Parent(s):
eefdfd0
table prep changed
Browse files- index_retriever.py +2 -116
- table_prep.py +102 -53
- utils.py +7 -8
index_retriever.py
CHANGED
|
@@ -16,7 +16,7 @@ def create_query_engine(vector_index):
|
|
| 16 |
try:
|
| 17 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 18 |
docstore=vector_index.docstore,
|
| 19 |
-
similarity_top_k=
|
| 20 |
)
|
| 21 |
|
| 22 |
vector_retriever = VectorIndexRetriever(
|
|
@@ -49,119 +49,6 @@ def create_query_engine(vector_index):
|
|
| 49 |
log_message(f"Ошибка создания query engine: {str(e)}")
|
| 50 |
raise
|
| 51 |
|
| 52 |
-
import re
|
| 53 |
-
from typing import List, Dict, Set
|
| 54 |
-
from my_logging import log_message
|
| 55 |
-
|
| 56 |
-
def extract_keywords_from_query(query: str) -> Dict[str, List[str]]:
    """Extract technical keywords from a user query.

    Args:
        query: Free-form question text.

    Returns:
        A dict with four list-valued keys:
        ``'materials'`` (steel-grade codes), ``'gosts'`` (GOST standard
        references), ``'classes'`` (classification codes) and
        ``'technical_terms'`` (entries of a fixed vocabulary found in the
        query as substrings). Regex matches are returned exactly as they
        appear in the query; matching is case-insensitive.
    """
    # Steel grade: two digits, a letter group, digits, then any number of
    # "letter + optional digits" groups. The repeated tail is what lets a
    # grade such as 08Х18Н10Т (which ends in a third letter group) match;
    # a fixed letters-digits-letters-digits shape stops before the final
    # letter and then fails the closing word boundary.
    material_pattern = r'\b\d{2}[ХНТМКВБА]+\d+(?:[ХНТМКВБА]\d*)*\b'

    # GOST reference: only take "." or "-" when digits follow, so sentence
    # punctuation right after the number ("ГОСТ 5632-2014.") is not captured.
    gost_pattern = r'ГОСТ\s+[РЕН\s]*\d+(?:[.\-]\d+)*'

    # Classification code: a digit, category letters, roman numerals (I/V)
    # and an optional single Latin letter suffix.
    class_pattern = r'\b\d[АБВГСD]+[IV]+[a-z]?\b'

    # Fixed vocabulary of technical terms (already lower-case).
    terms = ['полуфабрикат', 'план качества', 'контроль', 'арматура',
             'ультразвуковой', 'сварка', 'испытание']

    # Lower-case the query once instead of once per vocabulary entry.
    query_lower = query.lower()

    return {
        'materials': re.findall(material_pattern, query, re.IGNORECASE),
        'gosts': re.findall(gost_pattern, query, re.IGNORECASE),
        'classes': re.findall(class_pattern, query, re.IGNORECASE),
        'technical_terms': [term for term in terms if term in query_lower],
    }
|
| 85 |
-
|
| 86 |
-
def keyword_search_nodes(nodes: List, keywords: Dict[str, List[str]]) -> List:
    """Filter nodes by case-insensitive keyword substring matches.

    A node is kept when its ``text`` contains at least one material, GOST
    or class keyword, or, failing that, at least two of the technical terms.

    Args:
        nodes: Objects exposing a ``text`` string attribute.
        keywords: Mapping with the list-valued keys ``'materials'``,
            ``'gosts'``, ``'classes'`` and ``'technical_terms'``; a missing
            key is treated as an empty list.

    Returns:
        ``nodes`` itself (unfiltered) when every keyword list is empty,
        otherwise a new list of the matching nodes in their original order.
    """
    if not any(keywords.values()):
        return nodes

    # Lower-case the needles once up front rather than once per node.
    exact_needles = [
        needle.lower()
        for group in ('materials', 'gosts', 'classes')
        for needle in keywords.get(group, [])
    ]
    term_needles = [term.lower() for term in keywords.get('technical_terms', [])]

    matched_nodes = []
    for node in nodes:
        text_lower = node.text.lower()

        # One hit among materials / GOSTs / classes is sufficient.
        if any(needle in text_lower for needle in exact_needles):
            matched_nodes.append(node)
            continue

        # Technical terms are generic, so require at least two of them.
        term_matches = sum(1 for term in term_needles if term in text_lower)
        if term_matches >= 2:
            matched_nodes.append(node)

    return matched_nodes
|
| 122 |
-
|
| 123 |
-
def hybrid_retrieve_with_keywords(question: str, query_engine, top_k: int = 40) -> List:
    """Retrieve nodes for a question, ranking keyword-matching nodes first.

    The question is run through ``query_engine.retriever.retrieve``. When
    keywords are extracted from the question and some retrieved nodes match
    them, those nodes are placed first and the remaining retrieved nodes
    fill the list up to ``top_k``. Otherwise the retrieved nodes are
    returned as-is, truncated to ``top_k``.

    Args:
        question: User question text.
        query_engine: Object exposing ``retriever.retrieve(question)``.
        top_k: Maximum number of nodes to return.

    Returns:
        A list of at most ``top_k`` retrieved nodes.
    """
    keywords = extract_keywords_from_query(question)
    log_message(f"Извлечены ключевые слова: {keywords}")

    vector_nodes = query_engine.retriever.retrieve(question)
    log_message(f"Векторный поиск: {len(vector_nodes)} узлов")

    # Nothing to prioritise without keywords.
    if not any(keywords.values()):
        return vector_nodes[:top_k]

    keyword_nodes = keyword_search_nodes(vector_nodes, keywords)
    log_message(f"После фильтрации по ключевым словам: {len(keyword_nodes)} узлов")

    # No keyword hits: fall back to the plain retrieval order.
    if not keyword_nodes:
        return vector_nodes[:top_k]

    ranked = []
    taken = set()  # object identities already placed in `ranked`

    # Keyword matches go first.
    for candidate in keyword_nodes[:top_k]:
        marker = id(candidate)
        if marker in taken:
            continue
        taken.add(marker)
        ranked.append(candidate)

    # Remaining slots are filled in retrieval order.
    for candidate in vector_nodes:
        if len(ranked) >= top_k:
            break
        marker = id(candidate)
        if marker in taken:
            continue
        taken.add(marker)
        ranked.append(candidate)

    return ranked[:top_k]
|
| 164 |
-
|
| 165 |
def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5, diversity_penalty=0.3):
|
| 166 |
if not nodes or not reranker:
|
| 167 |
return nodes[:top_k]
|
|
@@ -225,5 +112,4 @@ def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5, dive
|
|
| 225 |
|
| 226 |
except Exception as e:
|
| 227 |
log_message(f"Ошибка переранжировки: {str(e)}")
|
| 228 |
-
return nodes[:top_k]
|
| 229 |
-
|
|
|
|
| 16 |
try:
|
| 17 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 18 |
docstore=vector_index.docstore,
|
| 19 |
+
similarity_top_k=30
|
| 20 |
)
|
| 21 |
|
| 22 |
vector_retriever = VectorIndexRetriever(
|
|
|
|
| 49 |
log_message(f"Ошибка создания query engine: {str(e)}")
|
| 50 |
raise
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5, diversity_penalty=0.3):
|
| 53 |
if not nodes or not reranker:
|
| 54 |
return nodes[:top_k]
|
|
|
|
| 112 |
|
| 113 |
except Exception as e:
|
| 114 |
log_message(f"Ошибка переранжировки: {str(e)}")
|
| 115 |
+
return nodes[:top_k]
|
|
|
table_prep.py
CHANGED
|
@@ -32,7 +32,21 @@ def create_table_content(table_data):
|
|
| 32 |
from llama_index.core.text_splitter import SentenceSplitter
|
| 33 |
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 34 |
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
|
| 38 |
if chunk_size is None:
|
|
@@ -42,37 +56,109 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
|
|
| 42 |
|
| 43 |
# Extract critical metadata from table before chunking
|
| 44 |
table_metadata = extract_table_metadata(doc.text)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
-
|
| 53 |
|
|
|
|
| 54 |
chunked_docs = []
|
|
|
|
|
|
|
|
|
|
| 55 |
for i, chunk_text in enumerate(text_chunks):
|
| 56 |
chunk_metadata = doc.metadata.copy()
|
| 57 |
-
|
| 58 |
-
# Add extracted keywords/materials to each chunk
|
| 59 |
chunk_metadata.update({
|
| 60 |
"chunk_id": i,
|
| 61 |
"total_chunks": len(text_chunks),
|
| 62 |
"chunk_size": len(chunk_text),
|
| 63 |
"is_chunked": True,
|
| 64 |
-
"materials":
|
| 65 |
-
"key_terms":
|
| 66 |
-
"table_summary": table_metadata.get("summary", "")
|
| 67 |
})
|
| 68 |
|
| 69 |
-
#
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
{chunk_text}"""
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
chunked_doc = Document(
|
| 77 |
text=enriched_text,
|
| 78 |
metadata=chunk_metadata
|
|
@@ -81,43 +167,6 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
|
|
| 81 |
|
| 82 |
return chunked_docs
|
| 83 |
|
| 84 |
-
|
| 85 |
-
def extract_table_metadata(table_text):
    """Extract searchable metadata from table content.

    Args:
        table_text: Plain-text rendering of a table.

    Returns:
        A dict with the keys ``'materials'``, ``'gosts'``, ``'classes'``
        (de-duplicated regex matches in first-seen order), ``'key_terms'``
        (vocabulary words found in the text followed by the GOST matches)
        and ``'summary'`` (the non-empty lines among the first five, joined
        with spaces and cut to 200 characters).
    """
    import re

    def _unique(matches):
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output is stable between runs (a set's iteration order is not).
        return list(dict.fromkeys(matches))

    # Steel grade: two digits, a letter group, digits, then any number of
    # "letter + optional digits" groups, so grades ending in a further
    # letter group (e.g. 08Х18Н10Т) reach the closing word boundary.
    material_pattern = r'\b\d{2}[ХНТМКВБА]+\d+(?:[ХНТМКВБА]\d*)*\b'
    materials = _unique(re.findall(material_pattern, table_text, re.IGNORECASE))

    # GOST reference: "." or "-" is consumed only when digits follow, which
    # keeps trailing sentence punctuation out of the match.
    gost_pattern = r'ГОСТ\s+[РЕН\s]*\d+(?:[.\-]\d+)*'
    gosts = _unique(re.findall(gost_pattern, table_text, re.IGNORECASE))

    # Class/category codes.
    class_pattern = r'\b\d[АБВСI]+[IVX]+[a-z]*\b'
    classes = _unique(re.findall(class_pattern, table_text, re.IGNORECASE))

    # Common technical terms, matched as lower-case substrings.
    keywords = ['контроль', 'испытание', 'сертификат', 'качество', 'план',
                'полуфабрикат', 'оборудование', 'арматура', 'деталь']
    text_lower = table_text.lower()
    tech_terms = [keyword for keyword in keywords if keyword in text_lower]

    # Brief summary built from the first five lines.
    lines = table_text.split('\n')[:5]
    summary = ' '.join(line.strip() for line in lines if line.strip())[:200]

    return {
        "materials": materials,
        "gosts": gosts,
        "classes": classes,
        "key_terms": tech_terms + gosts,
        "summary": summary,
    }
|
| 120 |
-
|
| 121 |
def table_to_document(table_data, document_id=None):
|
| 122 |
if not isinstance(table_data, dict):
|
| 123 |
log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
|
|
|
|
| 32 |
from llama_index.core.text_splitter import SentenceSplitter
|
| 33 |
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 34 |
|
| 35 |
+
def extract_table_metadata(table_text: str) -> dict:
    """Build lightweight searchable metadata from a table's text.

    Args:
        table_text: Plain-text rendering of a table.

    Returns:
        A dict with:
        ``'summary'`` - a sentence reporting the number of whitespace-
        separated words and of distinct words in the text;
        ``'materials'`` - always an empty list (placeholder);
        ``'key_terms'`` - up to 15 most frequent terms longer than three
        characters, most frequent first.
    """
    from collections import Counter
    import string

    words = table_text.split()
    unique_words = set(words)

    # Every entry here is at most three characters long, so the length
    # filter below already removes them; the check only starts to matter
    # if longer stopwords are added.
    stopwords = {"и", "в", "на", "по", "с", "для", "из", "при", "а", "как", "или", "но", "к", "от"}

    # Punctuation glued to a word ("контроль," / "(ГОСТ") must not produce a
    # separate term, so it is stripped from both ends before counting.
    edge_chars = string.punctuation + "«»“”„…—–"

    counts = Counter()
    surface_form = {}  # case-folded term -> first spelling seen in the text
    for raw in words:
        token = raw.strip(edge_chars)
        folded = token.casefold()
        if len(token) <= 3 or folded in stopwords:
            continue
        # Count case-insensitively so "Контроль" and "контроль" are one term.
        counts[folded] += 1
        surface_form.setdefault(folded, token)

    key_terms = [surface_form[folded] for folded, _ in counts.most_common(15)]

    return {
        "summary": f"Таблица содержит около {len(words)} слов и {len(unique_words)} уникальных терминов.",
        "materials": [],  # if you want to extract material names, hook in regex or LLM here
        "key_terms": key_terms,
    }
|
| 50 |
|
| 51 |
def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
|
| 52 |
if chunk_size is None:
|
|
|
|
| 56 |
|
| 57 |
# Extract critical metadata from table before chunking
|
| 58 |
table_metadata = extract_table_metadata(doc.text)
|
| 59 |
+
table_num = doc.metadata.get('table_number', 'unknown')
|
| 60 |
+
table_title = doc.metadata.get('table_title', 'unknown')
|
| 61 |
+
doc_id = doc.metadata.get('document_id', 'unknown')
|
| 62 |
+
section = doc.metadata.get('section', 'unknown')
|
| 63 |
|
| 64 |
+
# Parse table structure from your create_table_content format
|
| 65 |
+
lines = doc.text.strip().split('\n')
|
| 66 |
+
|
| 67 |
+
# Find where data rows start
|
| 68 |
+
table_header_lines = []
|
| 69 |
+
data_rows = []
|
| 70 |
+
in_data = False
|
| 71 |
+
|
| 72 |
+
for line in lines:
|
| 73 |
+
if line.startswith('Данные таблицы:'):
|
| 74 |
+
in_data = True
|
| 75 |
+
table_header_lines.append(line)
|
| 76 |
+
elif in_data and line.startswith('Строка'):
|
| 77 |
+
data_rows.append(line)
|
| 78 |
+
elif not in_data:
|
| 79 |
+
table_header_lines.append(line)
|
| 80 |
+
|
| 81 |
+
table_header = '\n'.join(table_header_lines) + '\n'
|
| 82 |
+
|
| 83 |
+
if not data_rows:
|
| 84 |
+
log_message(f" ⚠️ Таблица {table_num}: нет строк данных, использую стандартное разбиение")
|
| 85 |
+
text_splitter = SentenceSplitter(
|
| 86 |
+
chunk_size=chunk_size,
|
| 87 |
+
chunk_overlap=chunk_overlap,
|
| 88 |
+
separator="\n"
|
| 89 |
+
)
|
| 90 |
+
text_chunks = text_splitter.split_text(doc.text)
|
| 91 |
+
log_message(f" 📊 Стандартное разбиение: {len(text_chunks)} чанков")
|
| 92 |
+
else:
|
| 93 |
+
# Row-based chunking
|
| 94 |
+
log_message(f" 📋 Таблица {table_num}: найдено {len(data_rows)} строк данных")
|
| 95 |
+
|
| 96 |
+
header_size = len(table_header)
|
| 97 |
+
# Reserve space for enrichment prefix
|
| 98 |
+
available_size = chunk_size - header_size - 300
|
| 99 |
+
|
| 100 |
+
text_chunks = []
|
| 101 |
+
current_chunk_rows = []
|
| 102 |
+
current_size = 0
|
| 103 |
+
|
| 104 |
+
for row in data_rows:
|
| 105 |
+
row_size = len(row) + 1
|
| 106 |
+
|
| 107 |
+
# Check if adding this row exceeds limit
|
| 108 |
+
if current_size + row_size > available_size and current_chunk_rows:
|
| 109 |
+
# Create chunk
|
| 110 |
+
chunk_text = table_header + '\n'.join(current_chunk_rows)
|
| 111 |
+
text_chunks.append(chunk_text)
|
| 112 |
+
log_message(f" ✂️ Чанк создан: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
|
| 113 |
+
|
| 114 |
+
# Overlap: keep last 2 rows
|
| 115 |
+
overlap_count = min(2, len(current_chunk_rows))
|
| 116 |
+
current_chunk_rows = current_chunk_rows[-overlap_count:]
|
| 117 |
+
current_size = sum(len(r) + 1 for r in current_chunk_rows)
|
| 118 |
+
|
| 119 |
+
current_chunk_rows.append(row)
|
| 120 |
+
current_size += row_size
|
| 121 |
+
|
| 122 |
+
# Final chunk
|
| 123 |
+
if current_chunk_rows:
|
| 124 |
+
chunk_text = table_header + '\n'.join(current_chunk_rows)
|
| 125 |
+
text_chunks.append(chunk_text)
|
| 126 |
+
log_message(f" ✂️ Последний чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
|
| 127 |
|
| 128 |
+
log_message(f" 📊 Таблица {table_num} разделена на {len(text_chunks)} чанков")
|
| 129 |
|
| 130 |
+
# Create enriched chunks
|
| 131 |
chunked_docs = []
|
| 132 |
+
materials = table_metadata.get("materials", [])
|
| 133 |
+
key_terms = table_metadata.get("key_terms", [])
|
| 134 |
+
|
| 135 |
for i, chunk_text in enumerate(text_chunks):
|
| 136 |
chunk_metadata = doc.metadata.copy()
|
|
|
|
|
|
|
| 137 |
chunk_metadata.update({
|
| 138 |
"chunk_id": i,
|
| 139 |
"total_chunks": len(text_chunks),
|
| 140 |
"chunk_size": len(chunk_text),
|
| 141 |
"is_chunked": True,
|
| 142 |
+
"materials": materials,
|
| 143 |
+
"key_terms": key_terms,
|
| 144 |
+
"table_summary": table_metadata.get("summary", "")
|
| 145 |
})
|
| 146 |
|
| 147 |
+
# Enrichment prefix
|
| 148 |
+
materials_str = ', '.join(materials[:10]) if materials else 'нет'
|
| 149 |
+
terms_str = ', '.join(key_terms[:10]) if key_terms else 'нет'
|
| 150 |
+
|
| 151 |
+
enriched_text = f"""[Таблица {table_num}: {table_title}]
|
| 152 |
+
[Материалы в таблице: {materials_str}]
|
| 153 |
+
[Ключевые термины: {terms_str}]
|
| 154 |
|
| 155 |
{chunk_text}"""
|
| 156 |
|
| 157 |
+
log_message(f" ✓ Чанк {i+1}/{len(text_chunks)}: "
|
| 158 |
+
f"размер={len(enriched_text)}, "
|
| 159 |
+
f"материалов={len(materials)}, "
|
| 160 |
+
f"терминов={len(key_terms)}")
|
| 161 |
+
|
| 162 |
chunked_doc = Document(
|
| 163 |
text=enriched_text,
|
| 164 |
metadata=chunk_metadata
|
|
|
|
| 167 |
|
| 168 |
return chunked_docs
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
def table_to_document(table_data, document_id=None):
|
| 171 |
if not isinstance(table_data, dict):
|
| 172 |
log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
|
utils.py
CHANGED
|
@@ -231,8 +231,7 @@ def generate_sources_html(nodes, chunks_df=None):
|
|
| 231 |
|
| 232 |
html += "</div>"
|
| 233 |
return html
|
| 234 |
-
def answer_question(question, query_engine, reranker, current_model, chunks_df=None
|
| 235 |
-
from index_retriever import hybrid_retrieve_with_keywords
|
| 236 |
if query_engine is None:
|
| 237 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
|
| 238 |
|
|
@@ -241,18 +240,18 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
|
|
| 241 |
|
| 242 |
llm = get_llm_model(current_model)
|
| 243 |
|
| 244 |
-
#
|
| 245 |
-
retrieved_nodes =
|
| 246 |
-
|
|
|
|
| 247 |
|
| 248 |
-
# Rerank
|
| 249 |
reranked_nodes = rerank_nodes(
|
| 250 |
question,
|
| 251 |
retrieved_nodes,
|
| 252 |
reranker,
|
| 253 |
top_k=25,
|
| 254 |
-
min_score_threshold=0.
|
| 255 |
-
diversity_penalty=0.
|
| 256 |
)
|
| 257 |
|
| 258 |
formatted_context = format_context_for_llm(reranked_nodes)
|
|
|
|
| 231 |
|
| 232 |
html += "</div>"
|
| 233 |
return html
|
| 234 |
+
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
|
|
|
|
| 235 |
if query_engine is None:
|
| 236 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
|
| 237 |
|
|
|
|
| 240 |
|
| 241 |
llm = get_llm_model(current_model)
|
| 242 |
|
| 243 |
+
# Direct retrieval without query expansion
|
| 244 |
+
retrieved_nodes = query_engine.retriever.retrieve(question)
|
| 245 |
+
|
| 246 |
+
log_message(f"Получено {len(retrieved_nodes)} узлов")
|
| 247 |
|
|
|
|
| 248 |
reranked_nodes = rerank_nodes(
|
| 249 |
question,
|
| 250 |
retrieved_nodes,
|
| 251 |
reranker,
|
| 252 |
top_k=25,
|
| 253 |
+
min_score_threshold=0.5,
|
| 254 |
+
diversity_penalty=0.3
|
| 255 |
)
|
| 256 |
|
| 257 |
formatted_context = format_context_for_llm(reranked_nodes)
|