Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

MrSimple07 committed on Oct 8, 2025

Commit

60178fd

1 Parent(s): 30336c3

remove hyphens

Files changed (2) hide show

documents_prep.py CHANGED Viewed

@@ -35,11 +35,17 @@ def chunk_text_documents(documents):
     return chunked
 def normalize_connection_type(s):
-    # Replace Cyrillic С/с with Latin C/c
-    return s.replace('С', 'C').replace('с', 'c')
 def extract_connection_type(text):
     import re
     match = re.search(r'[СCс]-?\d+(?:-\d+)?', text)
     if match:
         return normalize_connection_type(match.group(0))
@@ -51,7 +57,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     table_num = table_data.get('table_number', 'unknown')
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
-    table_description = table_data.get('table_description', '')  # NEW
     table_num_clean = str(table_num).strip()

     return chunked
 def normalize_connection_type(s):
+    # Replace Cyrillic with Latin
+    s = s.replace('С', 'C').replace('с', 'c')
+    s = s.replace('У', 'U').replace('у', 'u')
+    s = s.replace('Т', 'T').replace('т', 't')
+    # REMOVE HYPHENS for consistent tokenization
+    s = s.replace('-', '')
+    return s
 def extract_connection_type(text):
     import re
+    # Match with or without hyphen
     match = re.search(r'[СCс]-?\d+(?:-\d+)?', text)
     if match:
         return normalize_connection_type(match.group(0))
     table_num = table_data.get('table_number', 'unknown')
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
+    table_description = table_data.get('table_description', '')
     table_num_clean = str(table_num).strip()

index_retriever.py CHANGED Viewed

@@ -26,12 +26,6 @@ def create_vector_index(documents):
     log_message("CONNECTION TYPES IN INDEX:")
     for conn_type, count in sorted(connection_types.items()):
         log_message(f"  {conn_type}: {count} chunks")
-    # Check for С-25 specifically
-    if 'С-25' in connection_types:
-        log_message(f"✓ С-25 FOUND: {connection_types['С-25']} chunks")
-    else:
-        log_message("✗ С-25 NOT FOUND IN INDEX!")
     log_message("="*60)
     return VectorStoreIndex.from_documents(documents)
@@ -70,18 +64,18 @@ def create_query_engine(vector_index):
         bm25_retriever = BM25Retriever.from_defaults(
             docstore=vector_index.docstore,
-            similarity_top_k=200
         )
         vector_retriever = VectorIndexRetriever(
             index=vector_index,
-            similarity_top_k=200,
-            similarity_cutoff=0.15
         )
         hybrid_retriever = QueryFusionRetriever(
             [vector_retriever, bm25_retriever],
-            similarity_top_k=150,
             num_queries=1
         )

     log_message("CONNECTION TYPES IN INDEX:")
     for conn_type, count in sorted(connection_types.items()):
         log_message(f"  {conn_type}: {count} chunks")
     log_message("="*60)
     return VectorStoreIndex.from_documents(documents)
         bm25_retriever = BM25Retriever.from_defaults(
             docstore=vector_index.docstore,
+            similarity_top_k=70
         )
         vector_retriever = VectorIndexRetriever(
             index=vector_index,
+            similarity_top_k=70,
+            similarity_cutoff=0.45
         )
         hybrid_retriever = QueryFusionRetriever(
             [vector_retriever, bm25_retriever],
+            similarity_top_k=70,
             num_queries=1
         )