MrSimple07 committed on
Commit
26c4970
·
1 Parent(s): 9bad02a

chunk size = 1024 + max chars = 1200 + keyword based

Browse files
Files changed (2) hide show
  1. documents_prep.py +1 -1
  2. index_retriever.py +44 -78
documents_prep.py CHANGED
@@ -38,7 +38,7 @@ def chunk_text_documents(documents):
38
  return chunked
39
 
40
 
41
- def chunk_table_by_content(table_data, doc_id, max_chars=1000):
42
  """Chunk tables by content size instead of rows"""
43
  headers = table_data.get('headers', [])
44
  rows = table_data.get('data', [])
 
38
  return chunked
39
 
40
 
41
+ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
42
  """Chunk tables by content size instead of rows"""
43
  headers = table_data.get('headers', [])
44
  rows = table_data.get('data', [])
index_retriever.py CHANGED
@@ -27,89 +27,55 @@ def create_vector_index(documents):
27
  index = VectorStoreIndex.from_documents(documents)
28
  log_message("✓ Index created")
29
  return index
30
- from llama_index.core.vector_stores import MetadataFilters, ExactMatchFilter
31
- import re
32
-
33
- def extract_document_id(query):
34
- """Extract GOST document ID from query"""
35
- patterns = [
36
- r'ГОСТ\s*Р?\s*([\d\.]+(?:-\d{4})?)',
37
- r'НП-[\d\-]+',
38
- r'ПН\s+АЭ\s+Г-[\d\-]+'
39
- ]
40
-
41
- for pattern in patterns:
42
- match = re.search(pattern, query, re.IGNORECASE)
43
- if match:
44
- doc_id = match.group(0)
45
- # Normalize
46
- doc_id = re.sub(r'ГОСТ\s*Р', 'ГОСТ Р', doc_id, flags=re.IGNORECASE)
47
- if 'ГОСТ' in doc_id and '-' not in doc_id:
48
- doc_id += '-2020'
49
- return doc_id
50
- return None
51
 
 
 
 
 
 
 
 
 
 
52
 
53
  def create_query_engine(vector_index):
54
- """Create hybrid retrieval engine with document filtering"""
55
  log_message("Creating query engine...")
56
-
57
- def retrieve_with_filter(query_str):
58
- """Custom retrieval with optional document filtering"""
59
- doc_id = extract_document_id(query_str)
60
-
61
- if doc_id:
62
- log_message(f"Detected document filter: {doc_id}")
63
-
64
- # Try filtered retrieval first
65
- filters = MetadataFilters(
66
- filters=[ExactMatchFilter(key="document_id", value=doc_id)]
67
- )
68
-
69
- filtered_retriever = VectorIndexRetriever(
70
- index=vector_index,
71
- similarity_top_k=30,
72
- filters=filters
73
- )
74
-
75
- filtered_results = filtered_retriever.retrieve(query_str)
76
- log_message(f"Filtered retrieval: {len(filtered_results)} results from {doc_id}")
77
-
78
- if len(filtered_results) >= 10:
79
- # Good enough, use filtered results
80
- return filtered_results
81
- else:
82
- log_message("Not enough filtered results, falling back to hybrid")
83
-
84
- # Fallback to hybrid retrieval
85
- vector_retriever = VectorIndexRetriever(
86
- index=vector_index,
87
- similarity_top_k=50
88
- )
89
-
90
- bm25_retriever = BM25Retriever.from_defaults(
91
- docstore=vector_index.docstore,
92
- similarity_top_k=50
93
- )
94
-
95
- hybrid_retriever = QueryFusionRetriever(
96
- [vector_retriever, bm25_retriever],
97
- similarity_top_k=60,
98
- num_queries=1
99
- )
100
-
101
- return hybrid_retriever.retrieve(query_str)
102
-
103
- # Create custom query engine
104
- class CustomRetriever:
105
- def retrieve(self, query_str):
106
- return retrieve_with_filter(query_str)
107
-
108
  response_synthesizer = get_response_synthesizer()
109
- query_engine = RetrieverQueryEngine(
110
- retriever=CustomRetriever(),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  response_synthesizer=response_synthesizer
112
  )
113
-
114
- log_message("✓ Query engine created with document filtering")
115
  return query_engine
 
27
  index = VectorStoreIndex.from_documents(documents)
28
  log_message("✓ Index created")
29
  return index
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
def keyword_filter_nodes(query, nodes, min_keyword_matches=1):
    """Return the nodes whose text contains enough keywords from the query.

    Keywords are the whitespace-separated words of ``query`` that are longer
    than two characters, lower-cased and de-duplicated. Matching is a plain
    case-insensitive substring test against ``node.text``.

    Args:
        query: Query string to take keywords from.
        nodes: Iterable of objects exposing a ``text`` string attribute.
        min_keyword_matches: Number of distinct keywords that must occur in a
            node's text for the node to be kept. Values below 1 are treated
            as 1, so a node is never kept without at least one match.

    Returns:
        A list of the matching nodes, in their original order. The list is
        empty when the query yields no keywords.
    """
    # dict.fromkeys de-duplicates while preserving order, so a word repeated
    # in the query cannot satisfy the threshold on its own.
    keywords = list(dict.fromkeys(
        word.lower() for word in query.split() if len(word) > 2
    ))
    required = max(1, min_keyword_matches)

    filtered = []
    for node in nodes:
        text = node.text.lower()
        hits = 0
        for keyword in keywords:
            if keyword in text:
                hits += 1
                if hits >= required:
                    # Threshold reached; no need to scan remaining keywords.
                    filtered.append(node)
                    break
    return filtered
40
 
41
  def create_query_engine(vector_index):
42
+ """Create hybrid retrieval engine with keyword boost"""
43
  log_message("Creating query engine...")
44
+
45
+ vector_retriever = VectorIndexRetriever(
46
+ index=vector_index,
47
+ similarity_top_k=50
48
+ )
49
+ bm25_retriever = BM25Retriever.from_defaults(
50
+ docstore=vector_index.docstore,
51
+ similarity_top_k=50
52
+ )
53
+ hybrid_retriever = QueryFusionRetriever(
54
+ [vector_retriever, bm25_retriever],
55
+ similarity_top_k=60,
56
+ num_queries=1
57
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  response_synthesizer = get_response_synthesizer()
59
+
60
+ class KeywordBoostQueryEngine(RetrieverQueryEngine):
61
+ def retrieve(self, query):
62
+ # Hybrid results
63
+ hybrid_nodes = hybrid_retriever.retrieve(query)
64
+ # Keyword filter from all indexed nodes
65
+ all_nodes = list(vector_index.docstore.values())
66
+ keyword_nodes = keyword_filter_nodes(query, all_nodes)
67
+ # Combine and deduplicate
68
+ all_candidates = {id(n): n for n in hybrid_nodes + keyword_nodes}
69
+ log_message(f"Hybrid: {len(hybrid_nodes)}, Keyword: {len(keyword_nodes)}, Total: {len(all_candidates)}")
70
+ return list(all_candidates.values())[:60]
71
+ def query(self, prompt):
72
+ nodes = self.retrieve(prompt)
73
+ return response_synthesizer.synthesize(prompt, nodes)
74
+
75
+ query_engine = KeywordBoostQueryEngine(
76
+ retriever=hybrid_retriever,
77
  response_synthesizer=response_synthesizer
78
  )
79
+
80
+ log_message("✓ Query engine created (with keyword boost)")
81
  return query_engine