Spaces:
Sleeping
Sleeping
Commit
·
26c4970
1
Parent(s):
9bad02a
chunk size = 1024 + max chars = 1200 + keyword based
Browse files- documents_prep.py +1 -1
- index_retriever.py +44 -78
documents_prep.py
CHANGED
|
@@ -38,7 +38,7 @@ def chunk_text_documents(documents):
|
|
| 38 |
return chunked
|
| 39 |
|
| 40 |
|
| 41 |
-
def chunk_table_by_content(table_data, doc_id, max_chars=
|
| 42 |
"""Chunk tables by content size instead of rows"""
|
| 43 |
headers = table_data.get('headers', [])
|
| 44 |
rows = table_data.get('data', [])
|
|
|
|
| 38 |
return chunked
|
| 39 |
|
| 40 |
|
| 41 |
+
def chunk_table_by_content(table_data, doc_id, max_chars=1200):
|
| 42 |
"""Chunk tables by content size instead of rows"""
|
| 43 |
headers = table_data.get('headers', [])
|
| 44 |
rows = table_data.get('data', [])
|
index_retriever.py
CHANGED
|
@@ -27,89 +27,55 @@ def create_vector_index(documents):
|
|
| 27 |
index = VectorStoreIndex.from_documents(documents)
|
| 28 |
log_message("✓ Index created")
|
| 29 |
return index
|
| 30 |
-
from llama_index.core.vector_stores import MetadataFilters, ExactMatchFilter
|
| 31 |
-
import re
|
| 32 |
-
|
| 33 |
-
def extract_document_id(query):
    """Extract a regulatory document ID (GOST / NP / PN AE G) from a query.

    The patterns are tried in order and the first match wins. GOST IDs are
    normalized so the prefix is upper-case and "ГОСТ Р" is spaced
    consistently; a GOST ID without a year suffix gets "-2020" appended
    (hard-coded default kept from the original implementation).

    Args:
        query: Free-text user query.

    Returns:
        The normalized document ID string, or None when no pattern matches.
    """
    patterns = [
        r'ГОСТ\s*Р?\s*([\d\.]+(?:-\d{4})?)',
        r'НП-[\d\-]+',
        r'ПН\s+АЭ\s+Г-[\d\-]+'
    ]

    for pattern in patterns:
        match = re.search(pattern, query, re.IGNORECASE)
        if not match:
            continue

        # The numeric part is matched greedily with dots, so a sentence-final
        # "." can be captured; drop it before building the ID.
        doc_id = match.group(0).rstrip('.')

        # Normalize
        doc_id = re.sub(r'ГОСТ\s*Р', 'ГОСТ Р', doc_id, flags=re.IGNORECASE)
        # Matching is case-insensitive, so force the plain "ГОСТ" prefix to
        # upper-case too; otherwise a lowercase query skips the year default.
        doc_id = re.sub(r'^ГОСТ', 'ГОСТ', doc_id, flags=re.IGNORECASE)
        if doc_id.startswith('ГОСТ') and '-' not in doc_id:
            doc_id += '-2020'
        return doc_id

    return None
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
def create_query_engine(vector_index):
|
| 54 |
-
"""Create hybrid retrieval engine with
|
| 55 |
log_message("Creating query engine...")
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
index=vector_index,
|
| 71 |
-
similarity_top_k=30,
|
| 72 |
-
filters=filters
|
| 73 |
-
)
|
| 74 |
-
|
| 75 |
-
filtered_results = filtered_retriever.retrieve(query_str)
|
| 76 |
-
log_message(f"Filtered retrieval: {len(filtered_results)} results from {doc_id}")
|
| 77 |
-
|
| 78 |
-
if len(filtered_results) >= 10:
|
| 79 |
-
# Good enough, use filtered results
|
| 80 |
-
return filtered_results
|
| 81 |
-
else:
|
| 82 |
-
log_message("Not enough filtered results, falling back to hybrid")
|
| 83 |
-
|
| 84 |
-
# Fallback to hybrid retrieval
|
| 85 |
-
vector_retriever = VectorIndexRetriever(
|
| 86 |
-
index=vector_index,
|
| 87 |
-
similarity_top_k=50
|
| 88 |
-
)
|
| 89 |
-
|
| 90 |
-
bm25_retriever = BM25Retriever.from_defaults(
|
| 91 |
-
docstore=vector_index.docstore,
|
| 92 |
-
similarity_top_k=50
|
| 93 |
-
)
|
| 94 |
-
|
| 95 |
-
hybrid_retriever = QueryFusionRetriever(
|
| 96 |
-
[vector_retriever, bm25_retriever],
|
| 97 |
-
similarity_top_k=60,
|
| 98 |
-
num_queries=1
|
| 99 |
-
)
|
| 100 |
-
|
| 101 |
-
return hybrid_retriever.retrieve(query_str)
|
| 102 |
-
|
| 103 |
-
# Create custom query engine
|
| 104 |
-
class CustomRetriever:
|
| 105 |
-
def retrieve(self, query_str):
|
| 106 |
-
return retrieve_with_filter(query_str)
|
| 107 |
-
|
| 108 |
response_synthesizer = get_response_synthesizer()
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
response_synthesizer=response_synthesizer
|
| 112 |
)
|
| 113 |
-
|
| 114 |
-
log_message("✓ Query engine created with
|
| 115 |
return query_engine
|
|
|
|
| 27 |
index = VectorStoreIndex.from_documents(documents)
|
| 28 |
log_message("✓ Index created")
|
| 29 |
return index
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
+
def keyword_filter_nodes(query, nodes, min_keyword_matches=1):
    """Return nodes whose text contains enough keywords from the query.

    Keywords are the distinct whitespace-separated query tokens longer than
    two characters, compared case-insensitively as substrings of each
    node's ``text``.

    Args:
        query: Query string to take keywords from.
        nodes: Iterable of objects exposing a ``text`` attribute.
        min_keyword_matches: Minimum number of distinct keywords that must
            occur in a node's text for it to be kept. The default of 1 keeps
            the original "at least one keyword" behavior.

    Returns:
        A list of the matching nodes, in their original order.
    """
    # A set keeps a repeated query word from counting twice.
    keywords = {w.lower() for w in query.split() if len(w) > 2}
    filtered = []
    for node in nodes:
        text = node.text.lower()
        hits = sum(1 for k in keywords if k in text)
        if hits >= min_keyword_matches:
            filtered.append(node)
    return filtered
|
| 40 |
|
| 41 |
def create_query_engine(vector_index):
    """Create hybrid retrieval engine with keyword boost

    Builds a vector retriever and a BM25 retriever over ``vector_index``,
    fuses them with ``QueryFusionRetriever``, and wraps the result in a
    ``RetrieverQueryEngine`` subclass that also adds nodes found by
    ``keyword_filter_nodes`` over the whole docstore.

    Args:
        vector_index: The index whose vectors and docstore are searched.

    Returns:
        The configured query engine instance.
    """
    # Local import: raw docstore nodes must be wrapped in NodeWithScore so the
    # candidate list passed to the synthesizer has a single element type.
    from llama_index.core.schema import NodeWithScore

    log_message("Creating query engine...")

    vector_retriever = VectorIndexRetriever(
        index=vector_index,
        similarity_top_k=50
    )
    bm25_retriever = BM25Retriever.from_defaults(
        docstore=vector_index.docstore,
        similarity_top_k=50
    )
    hybrid_retriever = QueryFusionRetriever(
        [vector_retriever, bm25_retriever],
        similarity_top_k=60,
        num_queries=1
    )

    response_synthesizer = get_response_synthesizer()

    class KeywordBoostQueryEngine(RetrieverQueryEngine):
        def retrieve(self, query):
            # Accept either a plain string or a query-bundle-like object;
            # keyword extraction needs the raw text.
            query_text = getattr(query, "query_str", query)

            # Hybrid results
            hybrid_nodes = hybrid_retriever.retrieve(query)

            # Keyword filter from all indexed nodes. The docstore exposes its
            # id -> node mapping through ``docs``.
            all_nodes = list(vector_index.docstore.docs.values())
            keyword_nodes = keyword_filter_nodes(query_text, all_nodes)

            # Combine and deduplicate by node id (object identity never
            # matches between scored wrappers and raw docstore nodes).
            # Hybrid hits go first so their fused scores win on duplicates.
            all_candidates = {}
            for scored in hybrid_nodes:
                all_candidates.setdefault(scored.node.node_id, scored)
            for node in keyword_nodes:
                all_candidates.setdefault(
                    node.node_id, NodeWithScore(node=node, score=0.0)
                )

            log_message(f"Hybrid: {len(hybrid_nodes)}, Keyword: {len(keyword_nodes)}, Total: {len(all_candidates)}")
            # NOTE(review): hybrid hits fill the list first, so when the
            # fusion retriever already returns 60 nodes the keyword-only
            # nodes are cut off here — confirm this is the intended "boost".
            return list(all_candidates.values())[:60]

        def query(self, prompt):
            nodes = self.retrieve(prompt)
            return response_synthesizer.synthesize(prompt, nodes)

    query_engine = KeywordBoostQueryEngine(
        retriever=hybrid_retriever,
        response_synthesizer=response_synthesizer
    )

    log_message("✓ Query engine created (with keyword boost)")
    return query_engine
|