MrSimple07 committed on
Commit
6c83262
·
1 Parent(s): 944b5ee
Files changed (2) hide show
  1. index_retriever.py +3 -3
  2. utils.py +51 -73
index_retriever.py CHANGED
@@ -16,18 +16,18 @@ def create_query_engine(vector_index):
16
  try:
17
  bm25_retriever = BM25Retriever.from_defaults(
18
  docstore=vector_index.docstore,
19
- similarity_top_k=20
20
  )
21
 
22
  vector_retriever = VectorIndexRetriever(
23
  index=vector_index,
24
- similarity_top_k=30,
25
  similarity_cutoff=0.7
26
  )
27
 
28
  hybrid_retriever = QueryFusionRetriever(
29
  [vector_retriever, bm25_retriever],
30
- similarity_top_k=40,
31
  num_queries=1
32
  )
33
 
 
16
  try:
17
  bm25_retriever = BM25Retriever.from_defaults(
18
  docstore=vector_index.docstore,
19
+ similarity_top_k=15
20
  )
21
 
22
  vector_retriever = VectorIndexRetriever(
23
  index=vector_index,
24
+ similarity_top_k=20,
25
  similarity_cutoff=0.7
26
  )
27
 
28
  hybrid_retriever = QueryFusionRetriever(
29
  [vector_retriever, bm25_retriever],
30
+ similarity_top_k=30,
31
  num_queries=1
32
  )
33
 
utils.py CHANGED
@@ -43,69 +43,6 @@ def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingua
43
  def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
44
  return CrossEncoder(model_name)
45
 
46
- def format_context_for_llm(nodes):
47
- context_parts = []
48
-
49
- for node in nodes:
50
- metadata = node.metadata if hasattr(node, 'metadata') else {}
51
- doc_id = metadata.get('document_id', 'Неизвестный документ')
52
-
53
- section_info = ""
54
-
55
- if metadata.get('section_path'):
56
- section_path = metadata['section_path']
57
- section_text = metadata.get('section_text', '')
58
- parent_section = metadata.get('parent_section', '')
59
- parent_title = metadata.get('parent_title', '')
60
- level = metadata.get('level', '')
61
-
62
- if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
63
- # For subsections, show: пункт X.X в разделе X (Title)
64
- section_info = f"пункт {section_path} в разделе {parent_section} ({parent_title})"
65
- elif section_text:
66
- # For main sections, show: пункт X (Title)
67
- section_info = f"пункт {section_path} ({section_text})"
68
- else:
69
- section_info = f"пункт {section_path}"
70
- elif metadata.get('section_id'):
71
- section_id = metadata['section_id']
72
- section_text = metadata.get('section_text', '')
73
- level = metadata.get('level', '')
74
- parent_section = metadata.get('parent_section', '')
75
- parent_title = metadata.get('parent_title', '')
76
-
77
- if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
78
- # For subsections without section_path, show: пункт X.X в разделе X (Title)
79
- section_info = f"пункт {section_id} в разделе {parent_section} ({parent_title})"
80
- elif section_text:
81
- section_info = f"пункт {section_id} ({section_text})"
82
- else:
83
- section_info = f"пункт {section_id}"
84
-
85
- if metadata.get('type') == 'table' and metadata.get('table_number'):
86
- table_num = metadata['table_number']
87
- if not str(table_num).startswith('№'):
88
- table_num = f"№{table_num}"
89
- section_info = f"таблица {table_num}"
90
-
91
- if metadata.get('type') == 'image' and metadata.get('image_number'):
92
- image_num = metadata['image_number']
93
- if not str(image_num).startswith('№'):
94
- image_num = f"№{image_num}"
95
- section_info = f"рисунок {image_num}"
96
-
97
- context_text = node.text if hasattr(node, 'text') else str(node)
98
-
99
- if section_info:
100
- formatted_context = f"[ИСТОЧНИК: {section_info} документа {doc_id}]\n{context_text}\n"
101
- else:
102
- formatted_context = f"[ИСТОЧНИК: документ {doc_id}]\n{context_text}\n"
103
-
104
- context_parts.append(formatted_context)
105
-
106
- return "\n".join(context_parts)
107
-
108
-
109
  def get_llm_model(model_name):
110
  try:
111
  model_config = AVAILABLE_MODELS.get(model_name)
@@ -148,42 +85,82 @@ def format_context_for_llm(nodes):
148
 
149
  section_info = ""
150
 
 
151
  if metadata.get('section_path'):
152
  section_path = metadata['section_path']
153
  section_text = metadata.get('section_text', '')
154
  parent_section = metadata.get('parent_section', '')
155
  parent_title = metadata.get('parent_title', '')
 
156
 
157
- if metadata.get('level') in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
158
- section_info = f"пункт {section_path} ({section_text}) в разделе {parent_section} ({parent_title})"
 
 
 
 
159
  elif section_text:
160
- section_info = f"пункт {section_path} ({section_text})"
 
161
  else:
162
- section_info = f"пункт {section_path}"
 
163
  elif metadata.get('section_id'):
164
  section_id = metadata['section_id']
165
  section_text = metadata.get('section_text', '')
166
- if section_text:
167
- section_info = f"пункт {section_id} ({section_text})"
 
 
 
 
 
 
 
 
 
168
  else:
169
- section_info = f"пункт {section_id}"
170
 
 
171
  if metadata.get('type') == 'table' and metadata.get('table_number'):
172
  table_num = metadata['table_number']
173
  if not str(table_num).startswith('№'):
174
  table_num = f"№{table_num}"
175
- section_info = f"таблица {table_num}"
 
 
 
 
 
 
 
 
 
 
 
176
 
177
  if metadata.get('type') == 'image' and metadata.get('image_number'):
178
  image_num = metadata['image_number']
179
  if not str(image_num).startswith('№'):
180
  image_num = f"№{image_num}"
181
- section_info = f"рисунок {image_num}"
 
 
 
 
 
 
 
 
 
 
 
182
 
183
  context_text = node.text if hasattr(node, 'text') else str(node)
184
 
185
  if section_info:
186
- formatted_context = f"[ИСТОЧНИК: {section_info} документа {doc_id}]\n{context_text}\n"
187
  else:
188
  formatted_context = f"[ИСТОЧНИК: документ {doc_id}]\n{context_text}\n"
189
 
@@ -191,6 +168,7 @@ def format_context_for_llm(nodes):
191
 
192
  return "\n".join(context_parts)
193
 
 
194
  def generate_sources_html(nodes, chunks_df=None):
195
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
196
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
 
43
  def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
44
  return CrossEncoder(model_name)
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def get_llm_model(model_name):
47
  try:
48
  model_config = AVAILABLE_MODELS.get(model_name)
 
85
 
86
  section_info = ""
87
 
88
+ # Handle section information with proper hierarchy
89
  if metadata.get('section_path'):
90
  section_path = metadata['section_path']
91
  section_text = metadata.get('section_text', '')
92
  parent_section = metadata.get('parent_section', '')
93
  parent_title = metadata.get('parent_title', '')
94
+ level = metadata.get('level', '')
95
 
96
+ if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
97
+ # For subsections: раздел X (Title), пункт X.X
98
+ if section_text:
99
+ section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path} ({section_text})"
100
+ else:
101
+ section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path}"
102
  elif section_text:
103
+ # For main sections: раздел X (Title)
104
+ section_info = f"раздел {section_path} ({section_text})"
105
  else:
106
+ section_info = f"раздел {section_path}"
107
+
108
  elif metadata.get('section_id'):
109
  section_id = metadata['section_id']
110
  section_text = metadata.get('section_text', '')
111
+ level = metadata.get('level', '')
112
+ parent_section = metadata.get('parent_section', '')
113
+ parent_title = metadata.get('parent_title', '')
114
+
115
+ if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
116
+ if section_text:
117
+ section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id} ({section_text})"
118
+ else:
119
+ section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id}"
120
+ elif section_text:
121
+ section_info = f"раздел {section_id} ({section_text})"
122
  else:
123
+ section_info = f"раздел {section_id}"
124
 
125
+ # Override with table/image info if applicable
126
  if metadata.get('type') == 'table' and metadata.get('table_number'):
127
  table_num = metadata['table_number']
128
  if not str(table_num).startswith('№'):
129
  table_num = f"№{table_num}"
130
+ table_title = metadata.get('table_title', '')
131
+ # Include section context for tables
132
+ base_section = ""
133
+ if metadata.get('section_path'):
134
+ base_section = f", раздел {metadata['section_path']}"
135
+ elif metadata.get('section_id'):
136
+ base_section = f", раздел {metadata['section_id']}"
137
+
138
+ if table_title:
139
+ section_info = f"Таблица {table_num} ({table_title}){base_section}"
140
+ else:
141
+ section_info = f"Таблица {table_num}{base_section}"
142
 
143
  if metadata.get('type') == 'image' and metadata.get('image_number'):
144
  image_num = metadata['image_number']
145
  if not str(image_num).startswith('№'):
146
  image_num = f"№{image_num}"
147
+ image_title = metadata.get('image_title', '')
148
+ # Include section context for images
149
+ base_section = ""
150
+ if metadata.get('section_path'):
151
+ base_section = f", раздел {metadata['section_path']}"
152
+ elif metadata.get('section_id'):
153
+ base_section = f", раздел {metadata['section_id']}"
154
+
155
+ if image_title:
156
+ section_info = f"Рисунок {image_num} ({image_title}){base_section}"
157
+ else:
158
+ section_info = f"Рисунок {image_num}{base_section}"
159
 
160
  context_text = node.text if hasattr(node, 'text') else str(node)
161
 
162
  if section_info:
163
+ formatted_context = f"[ИСТОЧНИК: {section_info}, документ {doc_id}]\n{context_text}\n"
164
  else:
165
  formatted_context = f"[ИСТОЧНИК: документ {doc_id}]\n{context_text}\n"
166
 
 
168
 
169
  return "\n".join(context_parts)
170
 
171
+
172
  def generate_sources_html(nodes, chunks_df=None):
173
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
174
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"