MrSimple07 committed on
Commit
60178fd
·
1 Parent(s): 30336c3

remove hyphens

Browse files
Files changed (2) hide show
  1. documents_prep.py +9 -3
  2. index_retriever.py +4 -10
documents_prep.py CHANGED
@@ -35,11 +35,17 @@ def chunk_text_documents(documents):
35
  return chunked
36
 
37
  def normalize_connection_type(s):
38
- # Replace Cyrillic С/с with Latin C/c
39
- return s.replace('С', 'C').replace('с', 'c')
 
 
 
 
 
40
 
41
  def extract_connection_type(text):
42
  import re
 
43
  match = re.search(r'[СCс]-?\d+(?:-\d+)?', text)
44
  if match:
45
  return normalize_connection_type(match.group(0))
@@ -51,7 +57,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
51
  table_num = table_data.get('table_number', 'unknown')
52
  table_title = table_data.get('table_title', '')
53
  section = table_data.get('section', '')
54
- table_description = table_data.get('table_description', '') # NEW
55
 
56
  table_num_clean = str(table_num).strip()
57
 
 
35
  return chunked
36
 
37
  def normalize_connection_type(s):
38
+ # Replace Cyrillic with Latin
39
+ s = s.replace('С', 'C').replace('с', 'c')
40
+ s = s.replace('У', 'U').replace('у', 'u')
41
+ s = s.replace('Т', 'T').replace('т', 't')
42
+ # REMOVE HYPHENS for consistent tokenization
43
+ s = s.replace('-', '')
44
+ return s
45
 
46
  def extract_connection_type(text):
47
  import re
48
+ # Match with or without hyphen
49
  match = re.search(r'[СCс]-?\d+(?:-\d+)?', text)
50
  if match:
51
  return normalize_connection_type(match.group(0))
 
57
  table_num = table_data.get('table_number', 'unknown')
58
  table_title = table_data.get('table_title', '')
59
  section = table_data.get('section', '')
60
+ table_description = table_data.get('table_description', '')
61
 
62
  table_num_clean = str(table_num).strip()
63
 
index_retriever.py CHANGED
@@ -26,12 +26,6 @@ def create_vector_index(documents):
26
  log_message("CONNECTION TYPES IN INDEX:")
27
  for conn_type, count in sorted(connection_types.items()):
28
  log_message(f" {conn_type}: {count} chunks")
29
-
30
- # Check for С-25 specifically
31
- if 'С-25' in connection_types:
32
- log_message(f"✓ С-25 FOUND: {connection_types['С-25']} chunks")
33
- else:
34
- log_message("✗ С-25 NOT FOUND IN INDEX!")
35
  log_message("="*60)
36
 
37
  return VectorStoreIndex.from_documents(documents)
@@ -70,18 +64,18 @@ def create_query_engine(vector_index):
70
 
71
  bm25_retriever = BM25Retriever.from_defaults(
72
  docstore=vector_index.docstore,
73
- similarity_top_k=200
74
  )
75
 
76
  vector_retriever = VectorIndexRetriever(
77
  index=vector_index,
78
- similarity_top_k=200,
79
- similarity_cutoff=0.15
80
  )
81
 
82
  hybrid_retriever = QueryFusionRetriever(
83
  [vector_retriever, bm25_retriever],
84
- similarity_top_k=150,
85
  num_queries=1
86
  )
87
 
 
26
  log_message("CONNECTION TYPES IN INDEX:")
27
  for conn_type, count in sorted(connection_types.items()):
28
  log_message(f" {conn_type}: {count} chunks")
 
 
 
 
 
 
29
  log_message("="*60)
30
 
31
  return VectorStoreIndex.from_documents(documents)
 
64
 
65
  bm25_retriever = BM25Retriever.from_defaults(
66
  docstore=vector_index.docstore,
67
+ similarity_top_k=70
68
  )
69
 
70
  vector_retriever = VectorIndexRetriever(
71
  index=vector_index,
72
+ similarity_top_k=70,
73
+ similarity_cutoff=0.45
74
  )
75
 
76
  hybrid_retriever = QueryFusionRetriever(
77
  [vector_retriever, bm25_retriever],
78
+ similarity_top_k=70,
79
  num_queries=1
80
  )
81