MrSimple07 committed on
Commit
c33deff
·
1 Parent(s): 31659d7

removed normalization doc id

Browse files
Files changed (1) hide show
  1. utils.py +3 -65
utils.py CHANGED
@@ -39,62 +39,15 @@ def format_sources(nodes):
39
 
40
  import re
41
 
42
- def extract_document_id(query):
43
- """Extract explicit document IDs from query"""
44
- patterns = [
45
- r'ГОСТ\s*Р?\s*[\d.-]+(?:-\d{4})?', # ГОСТ 59023.4 or ГОСТ Р 59023.5-2020
46
- r'НП-\d+-\d+',
47
- r'МУ[_\s][\d.]+',
48
- ]
49
-
50
- for pattern in patterns:
51
- match = re.search(pattern, query, re.IGNORECASE)
52
- if match:
53
- return match.group(0).strip()
54
- return None
55
-
56
- def normalize_doc_id(doc_id):
57
- normalized = doc_id.replace(' ', '').replace('р', '').replace('Р', '').lower()
58
- normalized = re.sub(r'-\d{4}$', '', normalized)
59
- normalized = normalized.replace('.', '') # Remove dots for flexible matching
60
- return normalized
61
-
62
  def answer_question(question, query_engine, reranker):
63
  try:
64
  log_message(f"\n{'='*70}")
65
  log_message(f"QUERY: {question}")
66
 
67
- target_doc_id = extract_document_id(question)
68
- found_docs = set(normalize_doc_id(node.metadata.get('document_id', 'unknown')) for node in query_engine.retrieve(question))
69
- log_message(f"NORMALIZED DOCS IN RETRIEVED: {', '.join(list(found_docs))}")
70
- if target_doc_id:
71
- log_message(f"TARGET DOCUMENT: {target_doc_id}")
72
-
73
  retrieved = query_engine.retrieve(question)
74
  log_message(f"RETRIEVED: {len(retrieved)} unique nodes")
75
-
76
- if target_doc_id:
77
- target_normalized = normalize_doc_id(target_doc_id)
78
- filtered = [
79
- node for node in retrieved
80
- if target_normalized in normalize_doc_id(node.metadata.get('document_id', ''))
81
- ]
82
-
83
- log_message(f"FILTERED TO TARGET DOC: {len(filtered)} nodes")
84
-
85
- # Debug: show what document IDs were found
86
- if not filtered and len(retrieved) > 0:
87
- found_docs = set(node.metadata.get('document_id', 'unknown') for node in retrieved[:10])
88
- log_message(f"AVAILABLE DOCS (sample): {', '.join(list(found_docs)[:5])}")
89
-
90
- if not filtered:
91
- log_message(f"WARNING: No nodes found for {target_doc_id}")
92
- return f"В базе данных не найдены таблицы из документа {target_doc_id}.", ""
93
-
94
- retrieved = filtered
95
-
96
- # Rest stays the same...
97
- reranked = rerank_nodes(question, retrieved, reranker, top_k=20, min_score=0.25)
98
  log_message(f"RERANKED: {len(reranked)} nodes")
99
 
100
  context_parts = []
@@ -102,7 +55,6 @@ def answer_question(question, query_engine, reranker):
102
  meta = n.metadata
103
  doc_id = meta.get('document_id', 'unknown')
104
  doc_type = meta.get('type', 'text')
105
-
106
  if doc_type == 'table':
107
  table_id = meta.get('table_identifier', meta.get('table_number', 'unknown'))
108
  title = meta.get('table_title', '')
@@ -111,7 +63,6 @@ def answer_question(question, query_engine, reranker):
111
  source_label += f" {title}"
112
  else:
113
  source_label = f"[{doc_id}]"
114
-
115
  context_parts.append(f"{source_label}\n{n.text}")
116
 
117
  context = "\n\n" + ("="*50 + "\n\n").join(context_parts)
@@ -119,24 +70,11 @@ def answer_question(question, query_engine, reranker):
119
  from config import CUSTOM_PROMPT
120
  prompt = CUSTOM_PROMPT.format(context_str=context, query_str=question)
121
  log_message(f"PROMPT LENGTH: {len(prompt)} chars")
122
-
123
  from llama_index.core import Settings
124
  response = Settings.llm.complete(prompt)
125
 
126
  sources = format_sources(reranked)
127
-
128
- log_message(f"\n{'='*70}")
129
- log_message("RETRIEVED CHUNKS:")
130
- for i, node in enumerate(reranked, 1):
131
- log_message(f"\n--- Chunk {i} ---")
132
- log_message(f"Document: {node.metadata.get('document_id')}")
133
- log_message(f"Type: {node.metadata.get('type')}")
134
- if node.metadata.get('type') == 'table':
135
- table_id = node.metadata.get('table_identifier')
136
- rows = f"{node.metadata.get('row_start', 0)}-{node.metadata.get('row_end', 0)}"
137
- log_message(f"Table: {table_id} (rows {rows})")
138
- log_message(f"Text: {node.text[:300]}...")
139
-
140
  return response.text, sources
141
 
142
  except Exception as e:
 
39
 
40
  import re
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  def answer_question(question, query_engine, reranker):
43
  try:
44
  log_message(f"\n{'='*70}")
45
  log_message(f"QUERY: {question}")
46
 
 
 
 
 
 
 
47
  retrieved = query_engine.retrieve(question)
48
  log_message(f"RETRIEVED: {len(retrieved)} unique nodes")
49
+
50
+ reranked = rerank_nodes(question, retrieved, reranker, top_k=20, min_score=0.3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  log_message(f"RERANKED: {len(reranked)} nodes")
52
 
53
  context_parts = []
 
55
  meta = n.metadata
56
  doc_id = meta.get('document_id', 'unknown')
57
  doc_type = meta.get('type', 'text')
 
58
  if doc_type == 'table':
59
  table_id = meta.get('table_identifier', meta.get('table_number', 'unknown'))
60
  title = meta.get('table_title', '')
 
63
  source_label += f" {title}"
64
  else:
65
  source_label = f"[{doc_id}]"
 
66
  context_parts.append(f"{source_label}\n{n.text}")
67
 
68
  context = "\n\n" + ("="*50 + "\n\n").join(context_parts)
 
70
  from config import CUSTOM_PROMPT
71
  prompt = CUSTOM_PROMPT.format(context_str=context, query_str=question)
72
  log_message(f"PROMPT LENGTH: {len(prompt)} chars")
73
+
74
  from llama_index.core import Settings
75
  response = Settings.llm.complete(prompt)
76
 
77
  sources = format_sources(reranked)
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  return response.text, sources
79
 
80
  except Exception as e: