MrSimple07 committed on
Commit
05c597d
·
1 Parent(s): 7062aff

new debug functions + 2000 chunk size

Browse files
Files changed (3) hide show
  1. config.py +1 -1
  2. documents_prep.py +17 -19
  3. index_retriever.py +50 -39
config.py CHANGED
@@ -52,7 +52,7 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
52
  CHUNK_SIZE = 1500
53
  CHUNK_OVERLAP = 128
54
 
55
- MAX_CHARS_TABLE = 10000
56
  MAX_ROWS_TABLE = 40
57
 
58
  CUSTOM_PROMPT = """
 
52
  CHUNK_SIZE = 1500
53
  CHUNK_OVERLAP = 128
54
 
55
+ MAX_CHARS_TABLE = 2000
56
  MAX_ROWS_TABLE = 40
57
 
58
  CUSTOM_PROMPT = """
documents_prep.py CHANGED
@@ -174,33 +174,31 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
174
  content += f"ТАБЛИЦА: {table_identifier}\n"
175
 
176
  # Extract and emphasize the connection type if present
 
177
  if table_title:
178
  content += f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n"
179
 
180
- # Parse type from title (e.g., "С-25" from "Тип сварного соединения С-25")
181
  import re
182
- type_match = re.search(r'[СУUTC]-?\d+(?:-\d+)?', table_title)
183
  if type_match:
184
- connection_type = type_match.group(0)
 
 
185
  content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
 
 
186
 
187
- if table_num and table_num != table_identifier:
188
- content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
189
-
190
- if section:
191
- content += f"РАЗДЕЛ ДОКУМЕНТА: {section}\n"
192
-
193
- content += f"\n{'='*70}\n"
194
-
195
- # Add headers with better formatting
196
- if headers:
197
- content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
198
- for i, h in enumerate(headers, 1):
199
- content += f" {i}. {h}\n"
200
- content += "\n"
201
 
202
- content += "ДАННЫЕ ТАБЛИЦЫ:\n"
203
- return content
204
 
205
 
206
  def format_single_row(row, idx):
 
174
  content += f"ТАБЛИЦА: {table_identifier}\n"
175
 
176
  # Extract and emphasize the connection type if present
177
+ connection_type = ''
178
  if table_title:
179
  content += f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n"
180
 
181
+ # Parse type from title - ADD MORE VARIANTS
182
  import re
183
+ type_match = re.search(r'[СУUTC]-?\s*\d+(?:-\d+)?', table_title)
184
  if type_match:
185
+ connection_type = type_match.group(0).replace(' ', '')
186
+ # Normalize: always use С (Cyrillic)
187
+ connection_type = connection_type.replace('C', 'С').replace('c', 'С')
188
  content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
189
+ # ADD SEARCHABLE KEYWORDS
190
+ content += f"КЛЮЧЕВЫЕ СЛОВА: {connection_type} тип сварного соединения\n"
191
 
192
+ # Also check table_identifier for type
193
+ if not connection_type and table_identifier:
194
+ import re
195
+ type_match = re.search(r'[СУUTC]-?\s*\d+', table_identifier)
196
+ if type_match:
197
+ connection_type = type_match.group(0).replace(' ', '')
198
+ connection_type = connection_type.replace('C', 'С')
199
+ content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
200
+ content += f"КЛЮЧЕВЫЕ СЛОВА: {connection_type} тип сварного соединения\n"
 
 
 
 
 
201
 
 
 
202
 
203
 
204
  def format_single_row(row, idx):
index_retriever.py CHANGED
@@ -44,20 +44,66 @@ def create_query_engine(vector_index):
44
  try:
45
  from config import CUSTOM_PROMPT
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  bm25_retriever = BM25Retriever.from_defaults(
48
  docstore=vector_index.docstore,
49
- similarity_top_k=200
50
  )
51
 
52
  vector_retriever = VectorIndexRetriever(
53
  index=vector_index,
54
  similarity_top_k=50,
55
- similarity_cutoff=0.35
56
  )
57
 
 
 
 
58
  hybrid_retriever = QueryFusionRetriever(
59
- [vector_retriever, bm25_retriever],
60
- similarity_top_k=150,
61
  num_queries=1
62
  )
63
 
@@ -73,42 +119,7 @@ def create_query_engine(vector_index):
73
  )
74
 
75
  log_message("Query engine успешно создан")
76
-
77
-
78
- all_nodes = list(vector_index.docstore.docs.values())
79
- c25_tables = []
80
-
81
- for node_id, node in vector_index.docstore.docs.items():
82
- metadata = node.metadata
83
- text = node.get_content()
84
-
85
- # Check if this is a С-25 table
86
- if ('С-25' in text or 'C-25' in text or
87
- 'С-25' in str(metadata.get('table_title', '')) or
88
- 'С-25' in str(metadata.get('table_number', ''))):
89
-
90
- c25_tables.append({
91
- 'node_id': node_id,
92
- 'doc_id': metadata.get('document_id'),
93
- 'table_num': metadata.get('table_number'),
94
- 'table_title': metadata.get('table_title', ''),
95
- 'text_preview': text[:200]
96
- })
97
-
98
- log_message(f"\n{'='*70}")
99
- log_message(f"DEBUG: Found {len(c25_tables)} С-25 tables in index:")
100
- for t in c25_tables:
101
- log_message(f" • {t['doc_id']} - Table {t['table_num']}")
102
- log_message(f" Title: {t['table_title']}")
103
- log_message(f" Preview: {t['text_preview']}")
104
- log_message(f"{'='*70}\n")
105
-
106
-
107
  return query_engine
108
-
109
-
110
-
111
-
112
  except Exception as e:
113
  log_message(f"Ошибка создания query engine: {str(e)}")
114
  raise
 
44
  try:
45
  from config import CUSTOM_PROMPT
46
 
47
+ # Preprocess query to expand table number patterns
48
class TableAwareRetriever:
    """Wrap a retriever and fan each query out into table-designator variants.

    When the query mentions a designator such as "С-25" / "C-25" / "С25",
    extra queries of the form "тип соединения <designator>" and
    "таблица <designator>" are issued for every spelling of the designator
    (Cyrillic and Latin leading letter, with and without the dash).  The hits
    of all queries are merged, keeping the first occurrence of each node.

    Args:
        base_retriever: Any object exposing ``retrieve(query)`` that returns
            an iterable of hits.
    """

    # Leading letter may be Cyrillic "С"/"с" (U+0421/U+0441) or Latin "C"/"c";
    # the two are visually identical, so users type either one.
    _DESIGNATOR_PATTERN = r"[\u0421\u0441Cc]-?\s*\d+"
    _CYRILLIC_ES = "\u0421"
    _LATIN_C = "C"

    def __init__(self, base_retriever):
        self.base_retriever = base_retriever

    @classmethod
    def _expand_queries(cls, query_text):
        """Return ``query_text`` followed by its designator-based variants.

        The result is ordered (original query first) and free of duplicates,
        so the base retriever is never asked the same thing twice.
        """
        import re

        queries = [query_text]
        for found in re.findall(cls._DESIGNATOR_PATTERN, query_text):
            # Drop every kind of whitespace the pattern's ``\s*`` may have
            # matched, then upper-case so "с25" and "С25" expand identically.
            compact = "".join(found.split()).upper()
            letter, tail = compact[0], compact[1:]
            digits = tail.lstrip("-")
            other_tail = digits if tail.startswith("-") else f"-{digits}"
            # Always emit BOTH scripts: replacing only Cyrillic -> Latin would
            # leave a Latin-typed query without the Cyrillic spelling.
            if letter == cls._CYRILLIC_ES:
                other_letter = cls._LATIN_C
            else:
                other_letter = cls._CYRILLIC_ES

            for suffix in (tail, other_tail):
                for prefix in (letter, other_letter):
                    designator = prefix + suffix
                    queries.append(f"тип соединения {designator}")
                    queries.append(f"таблица {designator}")

        # dict.fromkeys keeps first-seen order while removing repeats.
        return list(dict.fromkeys(queries))

    @staticmethod
    def _node_key(hit):
        """Return a key identifying the indexed node behind ``hit``.

        Prefers ``hit.node.node_id``, then ``hit.node_id``; falls back to the
        object identity only when neither attribute is available.
        """
        inner = getattr(hit, "node", None)
        for candidate in (inner, hit):
            key = getattr(candidate, "node_id", None)
            if key is not None:
                return key
        return id(hit)

    def retrieve(self, query_str):
        """Retrieve hits for ``query_str`` and all of its expanded variants.

        Args:
            query_str: The query text.  An object carrying the text in a
                ``query_str`` attribute is accepted as well.
                NOTE(review): fusion retrievers appear to pass a query-bundle
                object rather than a plain string -- confirm against the
                caller.

        Returns:
            A list of hits in first-seen order, one per distinct node.
        """
        query_text = getattr(query_str, "query_str", query_str)
        queries = self._expand_queries(query_text)

        log_message(f"Searching with {len(queries)} query variants: {queries[:3]}...")

        merged = []
        seen_keys = set()
        for variant in queries:
            for hit in self.base_retriever.retrieve(variant):
                # Each retrieve() call yields fresh result objects, so id()
                # cannot detect that two variants found the same node.
                key = self._node_key(hit)
                if key in seen_keys:
                    continue
                seen_keys.add(key)
                merged.append(hit)

        return merged

    async def aretrieve(self, query_str):
        """Async entry point; delegates to the synchronous ``retrieve``.

        NOTE(review): provided because the fusion retriever this wrapper is
        passed to may call ``aretrieve`` on its members -- confirm.
        """
        return self.retrieve(query_str)
89
+
90
  bm25_retriever = BM25Retriever.from_defaults(
91
  docstore=vector_index.docstore,
92
+ similarity_top_k=100
93
  )
94
 
95
  vector_retriever = VectorIndexRetriever(
96
  index=vector_index,
97
  similarity_top_k=50,
98
+ similarity_cutoff=0.3 # Lower threshold
99
  )
100
 
101
+ # Wrap retrievers with table-aware logic
102
+ table_aware_bm25 = TableAwareRetriever(bm25_retriever)
103
+
104
  hybrid_retriever = QueryFusionRetriever(
105
+ [vector_retriever, table_aware_bm25],
106
+ similarity_top_k=200, # Increase to capture more candidates
107
  num_queries=1
108
  )
109
 
 
119
  )
120
 
121
  log_message("Query engine успешно создан")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  return query_engine
 
 
 
 
123
  except Exception as e:
124
  log_message(f"Ошибка создания query engine: {str(e)}")
125
  raise