Spaces:
Sleeping
Sleeping
Commit
·
31659d7
1
Parent(s):
8114c87
index retriever = 100 + 100
Browse files
- documents_prep.py +2 -2
- index_retriever.py +1 -1
- utils.py +6 -7
documents_prep.py
CHANGED
|
@@ -38,7 +38,7 @@ def chunk_text_documents(documents):
|
|
| 38 |
return chunked
|
| 39 |
|
| 40 |
|
| 41 |
-
def chunk_table_by_content(table_data, doc_id, max_chars=
|
| 42 |
"""Chunk tables by content size instead of rows"""
|
| 43 |
headers = table_data.get('headers', [])
|
| 44 |
rows = table_data.get('data', [])
|
|
@@ -222,7 +222,7 @@ def load_table_documents(repo_id, hf_token, table_dir):
|
|
| 222 |
for sheet in data.get('sheets', []):
|
| 223 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 224 |
|
| 225 |
-
chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=
|
| 226 |
all_chunks.extend(chunks)
|
| 227 |
|
| 228 |
except Exception as e:
|
|
|
|
| 38 |
return chunked
|
| 39 |
|
| 40 |
|
| 41 |
+
def chunk_table_by_content(table_data, doc_id, max_chars=2000):
|
| 42 |
"""Chunk tables by content size instead of rows"""
|
| 43 |
headers = table_data.get('headers', [])
|
| 44 |
rows = table_data.get('data', [])
|
|
|
|
| 222 |
for sheet in data.get('sheets', []):
|
| 223 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 224 |
|
| 225 |
+
chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=2000)
|
| 226 |
all_chunks.extend(chunks)
|
| 227 |
|
| 228 |
except Exception as e:
|
index_retriever.py
CHANGED
|
@@ -72,7 +72,7 @@ def create_query_engine(vector_index):
|
|
| 72 |
unique_nodes.append(node)
|
| 73 |
|
| 74 |
log_message(f"Retrieved: {len(nodes)} → Unique: {len(unique_nodes)}")
|
| 75 |
-
return unique_nodes[:
|
| 76 |
|
| 77 |
response_synthesizer = get_response_synthesizer()
|
| 78 |
|
|
|
|
| 72 |
unique_nodes.append(node)
|
| 73 |
|
| 74 |
log_message(f"Retrieved: {len(nodes)} → Unique: {len(unique_nodes)}")
|
| 75 |
+
return unique_nodes[:50] # Return top 50 unique
|
| 76 |
|
| 77 |
response_synthesizer = get_response_synthesizer()
|
| 78 |
|
utils.py
CHANGED
|
@@ -54,10 +54,9 @@ def extract_document_id(query):
|
|
| 54 |
return None
|
| 55 |
|
| 56 |
def normalize_doc_id(doc_id):
|
| 57 |
-
|
| 58 |
-
normalized = doc_id.replace(' ', '').replace('Р', '').replace('р', '').lower()
|
| 59 |
-
# Remove year suffix for comparison (e.g., -2020)
|
| 60 |
normalized = re.sub(r'-\d{4}$', '', normalized)
|
|
|
|
| 61 |
return normalized
|
| 62 |
|
| 63 |
def answer_question(question, query_engine, reranker):
|
|
@@ -66,6 +65,8 @@ def answer_question(question, query_engine, reranker):
|
|
| 66 |
log_message(f"QUERY: {question}")
|
| 67 |
|
| 68 |
target_doc_id = extract_document_id(question)
|
|
|
|
|
|
|
| 69 |
if target_doc_id:
|
| 70 |
log_message(f"TARGET DOCUMENT: {target_doc_id}")
|
| 71 |
|
|
@@ -74,8 +75,6 @@ def answer_question(question, query_engine, reranker):
|
|
| 74 |
|
| 75 |
if target_doc_id:
|
| 76 |
target_normalized = normalize_doc_id(target_doc_id)
|
| 77 |
-
log_message(f"NORMALIZED TARGET: {target_normalized}")
|
| 78 |
-
|
| 79 |
filtered = [
|
| 80 |
node for node in retrieved
|
| 81 |
if target_normalized in normalize_doc_id(node.metadata.get('document_id', ''))
|
|
@@ -95,7 +94,7 @@ def answer_question(question, query_engine, reranker):
|
|
| 95 |
retrieved = filtered
|
| 96 |
|
| 97 |
# Rest stays the same...
|
| 98 |
-
reranked = rerank_nodes(question, retrieved, reranker, top_k=
|
| 99 |
log_message(f"RERANKED: {len(reranked)} nodes")
|
| 100 |
|
| 101 |
context_parts = []
|
|
@@ -146,7 +145,7 @@ def answer_question(question, query_engine, reranker):
|
|
| 146 |
log_message(traceback.format_exc())
|
| 147 |
return f"Ошибка: {e}", ""
|
| 148 |
|
| 149 |
-
def rerank_nodes(query, nodes, reranker, top_k=
|
| 150 |
"""Simple and effective reranking: sort by score and filter by threshold."""
|
| 151 |
if not nodes or not reranker:
|
| 152 |
return nodes[:top_k]
|
|
|
|
| 54 |
return None
|
| 55 |
|
| 56 |
def normalize_doc_id(doc_id):
|
| 57 |
+
normalized = doc_id.replace(' ', '').replace('р', '').replace('Р', '').lower()
|
|
|
|
|
|
|
| 58 |
normalized = re.sub(r'-\d{4}$', '', normalized)
|
| 59 |
+
normalized = normalized.replace('.', '') # Remove dots for flexible matching
|
| 60 |
return normalized
|
| 61 |
|
| 62 |
def answer_question(question, query_engine, reranker):
|
|
|
|
| 65 |
log_message(f"QUERY: {question}")
|
| 66 |
|
| 67 |
target_doc_id = extract_document_id(question)
|
| 68 |
+
found_docs = set(normalize_doc_id(node.metadata.get('document_id', 'unknown')) for node in query_engine.retrieve(question))
|
| 69 |
+
log_message(f"NORMALIZED DOCS IN RETRIEVED: {', '.join(list(found_docs))}")
|
| 70 |
if target_doc_id:
|
| 71 |
log_message(f"TARGET DOCUMENT: {target_doc_id}")
|
| 72 |
|
|
|
|
| 75 |
|
| 76 |
if target_doc_id:
|
| 77 |
target_normalized = normalize_doc_id(target_doc_id)
|
|
|
|
|
|
|
| 78 |
filtered = [
|
| 79 |
node for node in retrieved
|
| 80 |
if target_normalized in normalize_doc_id(node.metadata.get('document_id', ''))
|
|
|
|
| 94 |
retrieved = filtered
|
| 95 |
|
| 96 |
# Rest stays the same...
|
| 97 |
+
reranked = rerank_nodes(question, retrieved, reranker, top_k=20, min_score=0.25)
|
| 98 |
log_message(f"RERANKED: {len(reranked)} nodes")
|
| 99 |
|
| 100 |
context_parts = []
|
|
|
|
| 145 |
log_message(traceback.format_exc())
|
| 146 |
return f"Ошибка: {e}", ""
|
| 147 |
|
| 148 |
+
def rerank_nodes(query, nodes, reranker, top_k=20, min_score=0.3):
|
| 149 |
"""Simple and effective reranking: sort by score and filter by threshold."""
|
| 150 |
if not nodes or not reranker:
|
| 151 |
return nodes[:top_k]
|