MrSimple07 committed on
Commit
75fe00d
1 Parent(s): 429d2d4

added the 100 topk

Browse files
Files changed (3) hide show
  1. documents_prep.py +6 -4
  2. index_retriever.py +3 -3
  3. utils.py +4 -4
documents_prep.py CHANGED
@@ -36,14 +36,14 @@ def chunk_text_documents(documents):
36
 
37
  def normalize_connection_type(s):
38
  # Replace Cyrillic with Latin
39
- # s = s.replace('С', 'C').replace('с', 'c')
40
- # s = s.replace('У', 'U').replace('у', 'u')
41
- # s = s.replace('Т', 'T').replace('т', 't')
42
  s= s.replace('С-', 'C-').replace('с-', 'c-')
43
  s = s.replace('У-', 'U-').replace('у-', 'u-')
44
  s = s.replace('Т-', 'T-').replace('т-', 't-')
45
  # REMOVE ALL HYPHENS for consistent tokenization
46
- # s = s.replace('-', '')
47
  return s
48
 
49
  def extract_connection_type(text):
@@ -80,6 +80,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
80
  return []
81
 
82
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
 
 
83
  base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
84
 
85
  # ADD DESCRIPTION HERE if it exists
 
36
 
37
  def normalize_connection_type(s):
38
  # Replace Cyrillic with Latin
39
+ s = s.replace('С', 'C').replace('с', 'c')
40
+ s = s.replace('У', 'U').replace('у', 'u')
41
+ s = s.replace('Т', 'T').replace('т', 't')
42
  s= s.replace('С-', 'C-').replace('с-', 'c-')
43
  s = s.replace('У-', 'U-').replace('у-', 'u-')
44
  s = s.replace('Т-', 'T-').replace('т-', 't-')
45
  # REMOVE ALL HYPHENS for consistent tokenization
46
+ s = s.replace('-', '')
47
  return s
48
 
49
  def extract_connection_type(text):
 
80
  return []
81
 
82
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
83
+
84
+ # Calculate base metadata size - NOW INCLUDING DESCRIPTION
85
  base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
86
 
87
  # ADD DESCRIPTION HERE if it exists
index_retriever.py CHANGED
@@ -71,18 +71,18 @@ def create_query_engine(vector_index):
71
 
72
  bm25_retriever = BM25Retriever.from_defaults(
73
  docstore=vector_index.docstore,
74
- similarity_top_k=80
75
  )
76
 
77
  vector_retriever = VectorIndexRetriever(
78
  index=vector_index,
79
- similarity_top_k=80,
80
  similarity_cutoff=0.55
81
  )
82
 
83
  hybrid_retriever = QueryFusionRetriever(
84
  [vector_retriever, bm25_retriever],
85
- similarity_top_k=80,
86
  num_queries=1
87
  )
88
 
 
71
 
72
  bm25_retriever = BM25Retriever.from_defaults(
73
  docstore=vector_index.docstore,
74
+ similarity_top_k=100
75
  )
76
 
77
  vector_retriever = VectorIndexRetriever(
78
  index=vector_index,
79
+ similarity_top_k=100,
80
  similarity_cutoff=0.55
81
  )
82
 
83
  hybrid_retriever = QueryFusionRetriever(
84
  [vector_retriever, bm25_retriever],
85
+ similarity_top_k=100,
86
  num_queries=1
87
  )
88
 
utils.py CHANGED
@@ -179,10 +179,10 @@ def normalize_query(query):
179
  query = query.replace('С-', 'C-').replace('с-', 'c-')
180
  query = query.replace('У-', 'U-').replace('у-', 'u-')
181
  query = query.replace('Т-', 'T-').replace('т-', 't-')
182
- # query = query.replace('С', 'C').replace('с', 'C')
183
- # query = query.replace('У', 'U').replace('у', 'U')
184
- # query = query.replace('Т', 'T').replace('т', 'T')
185
- # query = query.replace('-', '')
186
 
187
  return query
188
 
 
179
  query = query.replace('С-', 'C-').replace('с-', 'c-')
180
  query = query.replace('У-', 'U-').replace('у-', 'u-')
181
  query = query.replace('Т-', 'T-').replace('т-', 't-')
182
+ query = query.replace('С', 'C').replace('с', 'C')
183
+ query = query.replace('У', 'U').replace('у', 'U')
184
+ query = query.replace('Т', 'T').replace('т', 'T')
185
+ query = query.replace('-', '')
186
 
187
  return query
188