MrSimple07 committed on
Commit
6c839c3
1 Parent(s): 7c27a96

new normalizer C to Latin C + max table = 20, max chunk = 4000

Browse files
Files changed (3) hide show
  1. config.py +1 -1
  2. documents_prep.py +1 -5
  3. index_retriever.py +2 -2
config.py CHANGED
@@ -52,7 +52,7 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
52
  CHUNK_SIZE = 1500
53
  CHUNK_OVERLAP = 128
54
 
55
- MAX_CHARS_TABLE = 3000
56
  MAX_ROWS_TABLE = 20
57
 
58
  CUSTOM_PROMPT = """
 
52
  CHUNK_SIZE = 1500
53
  CHUNK_OVERLAP = 128
54
 
55
+ MAX_CHARS_TABLE = 4000
56
  MAX_ROWS_TABLE = 20
57
 
58
  CUSTOM_PROMPT = """
documents_prep.py CHANGED
@@ -35,14 +35,10 @@ def chunk_text_documents(documents):
35
  return chunked
36
 
37
  def normalize_text(text):
38
- """
39
- Normalize text by converting Latin C to Cyrillic С for consistency
40
- This ensures "C-25" and "С-25" are treated as the same in search
41
- """
42
  if not text:
43
  return text
44
 
45
- # Replace Latin 'C' with Cyrillic 'С' (U+0421)
46
  # This is for welding types like C-25 -> С-25
47
  text = text.replace('小-', 'C')
48
 
 
35
  return chunked
36
 
37
  def normalize_text(text):
 
 
 
 
38
  if not text:
39
  return text
40
 
41
+ # Replace Cyrillic 'C' with Latin 'С' (U+0421)
42
  # This is for welding types like C-25 -> С-25
43
  text = text.replace('小-', 'C')
44
 
index_retriever.py CHANGED
@@ -71,12 +71,12 @@ def create_query_engine(vector_index):
71
 
72
  bm25_retriever = BM25Retriever.from_defaults(
73
  docstore=vector_index.docstore,
74
- similarity_top_k=70
75
  )
76
 
77
  vector_retriever = VectorIndexRetriever(
78
  index=vector_index,
79
- similarity_top_k=70,
80
  similarity_cutoff=0.45
81
  )
82
 
 
71
 
72
  bm25_retriever = BM25Retriever.from_defaults(
73
  docstore=vector_index.docstore,
74
+ similarity_top_k=80
75
  )
76
 
77
  vector_retriever = VectorIndexRetriever(
78
  index=vector_index,
79
+ similarity_top_k=80,
80
  similarity_cutoff=0.45
81
  )
82