Spaces:
Sleeping
Sleeping
Commit ·
6c839c3
1
Parent(s): 7c27a96
new normalizer C to Latin C + max table = 20, max chunk = 4000
Browse files- config.py +1 -1
- documents_prep.py +1 -5
- index_retriever.py +2 -2
config.py
CHANGED
|
@@ -52,7 +52,7 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
|
|
| 52 |
CHUNK_SIZE = 1500
|
| 53 |
CHUNK_OVERLAP = 128
|
| 54 |
|
| 55 |
-
MAX_CHARS_TABLE =
|
| 56 |
MAX_ROWS_TABLE = 20
|
| 57 |
|
| 58 |
CUSTOM_PROMPT = """
|
|
|
|
| 52 |
CHUNK_SIZE = 1500
|
| 53 |
CHUNK_OVERLAP = 128
|
| 54 |
|
| 55 |
+
MAX_CHARS_TABLE = 4000
|
| 56 |
MAX_ROWS_TABLE = 20
|
| 57 |
|
| 58 |
CUSTOM_PROMPT = """
|
documents_prep.py
CHANGED
|
@@ -35,14 +35,10 @@ def chunk_text_documents(documents):
|
|
| 35 |
return chunked
|
| 36 |
|
| 37 |
def normalize_text(text):
|
| 38 |
-
"""
|
| 39 |
-
Normalize text by converting Latin C to Cyrillic С for consistency
|
| 40 |
-
This ensures "C-25" and "С-25" are treated as the same in search
|
| 41 |
-
"""
|
| 42 |
if not text:
|
| 43 |
return text
|
| 44 |
|
| 45 |
-
# Replace
|
| 46 |
# This is for welding types like C-25 -> С-25
|
| 47 |
text = text.replace('С-', 'C')
|
| 48 |
|
|
|
|
| 35 |
return chunked
|
| 36 |
|
| 37 |
def normalize_text(text):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
if not text:
|
| 39 |
return text
|
| 40 |
|
| 41 |
+
# Replace Cyrillic 'C' with Latin 'С' (U+0421)
|
| 42 |
# This is for welding types like C-25 -> С-25
|
| 43 |
text = text.replace('С-', 'C')
|
| 44 |
|
index_retriever.py
CHANGED
|
@@ -71,12 +71,12 @@ def create_query_engine(vector_index):
|
|
| 71 |
|
| 72 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 73 |
docstore=vector_index.docstore,
|
| 74 |
-
similarity_top_k=
|
| 75 |
)
|
| 76 |
|
| 77 |
vector_retriever = VectorIndexRetriever(
|
| 78 |
index=vector_index,
|
| 79 |
-
similarity_top_k=
|
| 80 |
similarity_cutoff=0.45
|
| 81 |
)
|
| 82 |
|
|
|
|
| 71 |
|
| 72 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 73 |
docstore=vector_index.docstore,
|
| 74 |
+
similarity_top_k=80
|
| 75 |
)
|
| 76 |
|
| 77 |
vector_retriever = VectorIndexRetriever(
|
| 78 |
index=vector_index,
|
| 79 |
+
similarity_top_k=80,
|
| 80 |
similarity_cutoff=0.45
|
| 81 |
)
|
| 82 |
|