MrSimple07 committed on
Commit
c7a9dbd
·
1 Parent(s): 123a5db

simplest version

Browse files
Files changed (1) hide show
  1. utils.py +34 -56
utils.py CHANGED
@@ -24,15 +24,9 @@ def format_sources(nodes):
24
  doc_id = meta.get('document_id', 'unknown')
25
 
26
  if doc_type == 'table':
27
- table_id = meta.get('table_identifier', meta.get('table_number', 'unknown'))
28
  title = meta.get('table_title', '')
29
- section = meta.get('section', '')
30
- source = f"📊 {doc_id} - {table_id}"
31
- if title:
32
- source += f": {title}"
33
- if section:
34
- source += f" ({section})"
35
- sources.append(source)
36
  elif doc_type == 'image':
37
  img_num = meta.get('image_number', 'unknown')
38
  sources.append(f"🖼️ {doc_id} - Рисунок {img_num}")
@@ -40,7 +34,7 @@ def format_sources(nodes):
40
  section = meta.get('section_id', '')
41
  sources.append(f"📄 {doc_id} - Раздел {section}")
42
 
43
- return "\n".join(sources) # Don't use set() to preserve order
44
 
45
  def preprocess_query(question):
46
  import re
@@ -79,68 +73,52 @@ def preprocess_query(question):
79
 
80
  def answer_question(question, query_engine, reranker):
81
  try:
82
- log_message(f"\n{'='*70}")
83
  log_message(f"Query: {question}")
84
- log_message(f"{'='*70}")
85
 
86
  enhanced_query = preprocess_query(question)
87
  if enhanced_query != question:
88
  log_message(f"Enhanced query: {enhanced_query}")
89
 
90
  retrieved = query_engine.retriever.retrieve(enhanced_query)
91
- log_message(f"\n📥 INITIAL RETRIEVAL: {len(retrieved)} nodes")
92
 
93
- # Detailed logging
94
- doc_ids = {}
95
- for n in retrieved:
96
- doc_id = n.metadata.get('document_id', 'unknown')
97
- if doc_id not in doc_ids:
98
- doc_ids[doc_id] = {'tables': [], 'text': 0, 'images': 0}
99
-
100
- if n.metadata.get('type') == 'table':
101
- table_id = n.metadata.get('table_identifier', n.metadata.get('table_number', ''))
102
- doc_ids[doc_id]['tables'].append(table_id)
103
- elif n.metadata.get('type') == 'image':
104
- doc_ids[doc_id]['images'] += 1
105
- else:
106
- doc_ids[doc_id]['text'] += 1
107
-
108
- for doc_id, counts in doc_ids.items():
109
- log_message(f" 📄 {doc_id}:")
110
- if counts['tables']:
111
- log_message(f" Tables: {', '.join(set(counts['tables']))}")
112
- if counts['text']:
113
- log_message(f" Text chunks: {counts['text']}")
114
- if counts['images']:
115
- log_message(f" Images: {counts['images']}")
116
 
117
  reranked = rerank_nodes(question, retrieved, reranker, top_k=25)
118
- log_message(f"\n🔄 AFTER RERANKING: {len(reranked)} nodes")
119
 
120
- # Detailed reranking results
121
- doc_ids_reranked = {}
 
 
 
 
 
122
  for n in reranked:
123
- doc_id = n.metadata.get('document_id', 'unknown')
124
- if doc_id not in doc_ids_reranked:
125
- doc_ids_reranked[doc_id] = {'tables': [], 'text': 0, 'images': 0}
126
 
127
- if n.metadata.get('type') == 'table':
128
- table_id = n.metadata.get('table_identifier', n.metadata.get('table_number', ''))
129
- doc_ids_reranked[doc_id]['tables'].append(table_id)
130
- elif n.metadata.get('type') == 'image':
131
- doc_ids_reranked[doc_id]['images'] += 1
 
 
 
 
132
  else:
133
- doc_ids_reranked[doc_id]['text'] += 1
 
 
 
134
 
135
- for doc_id, counts in doc_ids_reranked.items():
136
- log_message(f" 📄 {doc_id}:")
137
- if counts['tables']:
138
- log_message(f" Tables: {', '.join(set(counts['tables']))}")
139
- if counts['text']:
140
- log_message(f" Text chunks: {counts['text']}")
141
- if counts['images']:
142
- log_message(f" Images: {counts['images']}")
143
- context = "\n\n" + ("="*70 + "\n\n").join(doc_ids_reranked)
144
 
145
  prompt = f"""Ты эксперт по технической документации.
146
 
 
24
  doc_id = meta.get('document_id', 'unknown')
25
 
26
  if doc_type == 'table':
27
+ table_num = meta.get('table_number', 'unknown')
28
  title = meta.get('table_title', '')
29
+ sources.append(f"📊 {doc_id} - Таблица {table_num}: {title}")
 
 
 
 
 
 
30
  elif doc_type == 'image':
31
  img_num = meta.get('image_number', 'unknown')
32
  sources.append(f"🖼️ {doc_id} - Рисунок {img_num}")
 
34
  section = meta.get('section_id', '')
35
  sources.append(f"📄 {doc_id} - Раздел {section}")
36
 
37
+ return "\n".join(set(sources))
38
 
39
  def preprocess_query(question):
40
  import re
 
73
 
74
  def answer_question(question, query_engine, reranker):
75
  try:
 
76
  log_message(f"Query: {question}")
 
77
 
78
  enhanced_query = preprocess_query(question)
79
  if enhanced_query != question:
80
  log_message(f"Enhanced query: {enhanced_query}")
81
 
82
  retrieved = query_engine.retriever.retrieve(enhanced_query)
83
+ log_message(f"Retrieved {len(retrieved)} nodes")
84
 
85
+ doc_ids = [n.metadata.get('document_id', 'unknown') for n in retrieved]
86
+ table_nums = [n.metadata.get('table_number', '') for n in retrieved if n.metadata.get('type') == 'table']
87
+ log_message(f"Retrieved from documents: {set(doc_ids)}")
88
+ if table_nums:
89
+ log_message(f"Retrieved tables: {set(table_nums)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  reranked = rerank_nodes(question, retrieved, reranker, top_k=25)
92
+ log_message(f"Reranked to {len(reranked)} nodes")
93
 
94
+ doc_ids_reranked = [n.metadata.get('document_id', 'unknown') for n in reranked]
95
+ table_nums_reranked = [n.metadata.get('table_number', '') for n in reranked if n.metadata.get('type') == 'table']
96
+ log_message(f"After reranking - documents: {set(doc_ids_reranked)}")
97
+ if table_nums_reranked:
98
+ log_message(f"After reranking - tables: {set(table_nums_reranked)}")
99
+
100
+ context_parts = []
101
  for n in reranked:
102
+ meta = n.metadata
103
+ doc_id = meta.get('document_id', 'unknown')
104
+ doc_type = meta.get('type', 'text')
105
 
106
+ if doc_type == 'table':
107
+ table_num = meta.get('table_number', 'unknown')
108
+ title = meta.get('table_title', '')
109
+ source_label = f"[ТАБЛИЦА {table_num} - {doc_id}]"
110
+ if title:
111
+ source_label += f" {title}"
112
+ elif doc_type == 'image':
113
+ img_num = meta.get('image_number', 'unknown')
114
+ source_label = f"[РИСУНОК {img_num} - {doc_id}]"
115
  else:
116
+ section = meta.get('section_id', '')
117
+ source_label = f"[{doc_id} - {section}]"
118
+
119
+ context_parts.append(f"{source_label}\n{n.text}")
120
 
121
+ context = "\n\n" + ("="*70 + "\n\n").join(context_parts)
 
 
 
 
 
 
 
 
122
 
123
  prompt = f"""Ты эксперт по технической документации.
124