Update app.py
Browse files
app.py
CHANGED
|
@@ -52,8 +52,8 @@ tokenizer_name = EMBEDDING_MODEL_NAME
|
|
| 52 |
# Token splitting for more context split
|
| 53 |
text_splitter = TokenTextSplitter.from_huggingface_tokenizer(
|
| 54 |
tokenizer=AutoTokenizer.from_pretrained(tokenizer_name),
|
| 55 |
-
chunk_size=
|
| 56 |
-
chunk_overlap=
|
| 57 |
)
|
| 58 |
|
| 59 |
chunks = text_splitter.split_documents(documents) # chunks used for LLM generation
|
|
@@ -68,7 +68,7 @@ for i, doc in enumerate(chunks):
|
|
| 68 |
doc.metadata["index"] = i # Add an index for tracking
|
| 69 |
|
| 70 |
"""Retriever"""
|
| 71 |
-
bm25_retriever = BM25Retriever.from_documents(chunks_bm25, k =
|
| 72 |
|
| 73 |
def retriever(query):
|
| 74 |
tmp = bm25_retriever.invoke(preprocess_for_bm25(query))
|
|
|
|
| 52 |
# Token splitting for more context split
|
| 53 |
text_splitter = TokenTextSplitter.from_huggingface_tokenizer(
|
| 54 |
tokenizer=AutoTokenizer.from_pretrained(tokenizer_name),
|
| 55 |
+
chunk_size=150,
|
| 56 |
+
chunk_overlap=15
|
| 57 |
)
|
| 58 |
|
| 59 |
chunks = text_splitter.split_documents(documents) # chunks used for LLM generation
|
|
|
|
| 68 |
doc.metadata["index"] = i # Add an index for tracking
|
| 69 |
|
| 70 |
"""Retriever"""
|
| 71 |
+
bm25_retriever = BM25Retriever.from_documents(chunks_bm25, k = 4) # 4 most similar contexts
|
| 72 |
|
| 73 |
def retriever(query):
|
| 74 |
tmp = bm25_retriever.invoke(preprocess_for_bm25(query))
|