Update app.py
Browse files
app.py
CHANGED
|
@@ -50,7 +50,8 @@ collection = client.get_or_create_collection(
|
|
| 50 |
embedding_model = SentenceTransformer("intfloat/multilingual-e5-base")
|
| 51 |
|
| 52 |
# Initialize the text splitter
|
| 53 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=300)
|
|
|
|
| 54 |
|
| 55 |
total_chunks = 0
|
| 56 |
|
|
@@ -120,7 +121,7 @@ def rerank_with_bm25(docs, query):
|
|
| 120 |
tokenized_query = clean_and_tokenize(query, lang)
|
| 121 |
scores = bm25.get_scores(tokenized_query)
|
| 122 |
|
| 123 |
-
top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:
|
| 124 |
return [docs[i] for i in top_indices]
|
| 125 |
|
| 126 |
|
|
|
|
| 50 |
embedding_model = SentenceTransformer("intfloat/multilingual-e5-base")
|
| 51 |
|
| 52 |
# Initialize the text splitter
|
| 53 |
+
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=300)
|
| 54 |
+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200)
|
| 55 |
|
| 56 |
total_chunks = 0
|
| 57 |
|
|
|
|
| 121 |
tokenized_query = clean_and_tokenize(query, lang)
|
| 122 |
scores = bm25.get_scores(tokenized_query)
|
| 123 |
|
| 124 |
+
top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:3]
|
| 125 |
return [docs[i] for i in top_indices]
|
| 126 |
|
| 127 |
|