Update README.md
README.md
CHANGED
@@ -81,12 +81,32 @@ print(simplified_html)

### 🔧 Configure Pruning Parameters

The example HTML document is rather short. Real-world HTML documents can be much longer and more complex. To handle such cases, we can configure the following parameters:

```python
# Maximum number of words in a node when constructing the block tree for pruning with the embedding model
MAX_NODE_WORDS_EMBED = 10
# MAX_NODE_WORDS_EMBED = 256  # a recommended setting for real-world HTML documents
# Maximum number of tokens in the output HTML document pruned with the embedding model
MAX_CONTEXT_WINDOW_EMBED = 60
# MAX_CONTEXT_WINDOW_EMBED = 6144  # a recommended setting for real-world HTML documents
# Maximum number of words in a node when constructing the block tree for pruning with the generative model
MAX_NODE_WORDS_GEN = 5
# MAX_NODE_WORDS_GEN = 128  # a recommended setting for real-world HTML documents
# Maximum number of tokens in the output HTML document pruned with the generative model
MAX_CONTEXT_WINDOW_GEN = 32
# MAX_CONTEXT_WINDOW_GEN = 4096  # a recommended setting for real-world HTML documents
```

### 🌲 Build Block Tree

```python
from htmlrag import build_block_tree

block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED)
for block in block_tree:
    print("Block Content: ", block[0])
    print("Block Path: ", block[1])
```
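If you want to check how the document was split, you can also inspect each block's leaf flag and size. The snippet below is a small illustration rather than part of the README; it assumes each block tuple carries its content, tag path, and a leaf flag in that order, matching the fields printed in the example output.

```python
# Illustrative sketch: inspect the block tree built above.
# Assumes each block is (content, path, is_leaf), matching the printed fields.
for i, block in enumerate(block_tree):
    content, path, is_leaf = block[0], block[1], block[2]
    print(f"Block {i}: path={path}, leaf={is_leaf}, words={len(content.split())}")
```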
@@ -114,8 +134,21 @@ for block in block_tree:

```python
from htmlrag import EmbedHTMLPruner

embed_model = "/train_data_load/huggingface/tjj_hf/bge-large-en/"
query_instruction_for_retrieval = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: "
embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=True, query_instruction_for_retrieval=query_instruction_for_retrieval)
# alternatively, you can use a remote TEI model; refer to https://github.com/huggingface/text-embeddings-inference
# tei_endpoint = "http://YOUR_TEI_ENDPOINT"
# embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=False, query_instruction_for_retrieval=query_instruction_for_retrieval, endpoint=tei_endpoint)
block_rankings = embed_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
print(block_rankings)

# [0, 2, 1]

# alternatively, you can use BM25 to rank the blocks
from htmlrag import BM25HTMLPruner
bm25_html_pruner = BM25HTMLPruner()
block_rankings = bm25_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
print(block_rankings)

# [0, 2, 1]
```
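To see what a ranking means in terms of actual content, you can map the ranked indices back to the blocks. This is an illustrative snippet, not part of the README; it assumes `block_rankings` lists block indices from most to least relevant, which is consistent with the example output `[0, 2, 1]`.

```python
# Illustrative sketch: print the blocks in ranked order (most relevant first, by assumption).
for rank, idx in enumerate(block_rankings):
    content = block_tree[idx][0]
    print(f"Rank {rank + 1} -> block {idx}: {content[:80]}")
```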
@@ -124,8 +157,7 @@ from transformers import AutoTokenizer

```python
chat_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-70B-Instruct")

pruned_html = embed_html_pruner.prune_HTML(simplified_html, block_tree, block_rankings, chat_tokenizer, MAX_CONTEXT_WINDOW_EMBED)
print(pruned_html)

# <html>
```
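To confirm that the embedding-based pruning stayed within the configured budget, you can count the tokens of the pruned HTML with the same chat tokenizer. This check is illustrative and not part of the README.

```python
# Illustrative sketch: count tokens of the embed-pruned HTML against the configured budget.
num_tokens = len(chat_tokenizer.encode(pruned_html))
print(f"{num_tokens} tokens (budget: {MAX_CONTEXT_WINDOW_EMBED})")
```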
@@ -141,18 +173,8 @@ print(pruned_html)

```python
from htmlrag import GenHTMLPruner
import torch

# construct a finer block tree
block_tree, pruned_html = build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN)
for block in block_tree:
    print("Block Content: ", block[0])
    print("Block Path: ", block[1])
```
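Rebuilding the tree with a smaller `max_node_words` gives the generative pruner more, finer-grained blocks to score. The snippet below is illustrative only and simply prints a couple of statistics about the rebuilt tree.

```python
# Illustrative sketch: basic statistics of the finer block tree.
print(f"{len(block_tree)} blocks after rebuilding with max_node_words={MAX_NODE_WORDS_GEN}")
print(f"largest block: {max(len(b[0].split()) for b in block_tree)} words")
```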
@@ -167,13 +189,25 @@ for block in block_tree:

```python
# Block Path: ['html', 'p']
# Is Leaf: True

ckpt_path = "zstanjj/HTML-Pruner-Llama-1B"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
gen_embed_pruner = GenHTMLPruner(gen_model=ckpt_path, max_node_words=MAX_NODE_WORDS_GEN, device=device)
block_rankings = gen_embed_pruner.calculate_block_rankings(question, pruned_html)
print(block_rankings)

# [1, 0]

pruned_html = gen_embed_pruner.prune_HTML(pruned_html, block_tree, block_rankings, chat_tokenizer, MAX_CONTEXT_WINDOW_GEN)
print(pruned_html)

# <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
```
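Putting the two stages together, the walkthrough above can be wrapped into a single helper that goes from a simplified HTML document to the final pruned HTML. This is a sketch built only from the calls shown in this README section (the constants, both pruners, the chat tokenizer, and `build_block_tree`); it is not an official htmlrag API.

```python
# Sketch only: two-stage HtmlRAG pruning using the objects defined in this walkthrough.
def prune_two_stage(question, simplified_html):
    # Stage 1: coarse block tree + embedding-based pruning
    block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED)
    rankings = embed_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
    html = embed_html_pruner.prune_HTML(simplified_html, block_tree, rankings, chat_tokenizer, MAX_CONTEXT_WINDOW_EMBED)

    # Stage 2: finer block tree + generative pruning
    block_tree, html = build_block_tree(html, max_node_words=MAX_NODE_WORDS_GEN)
    rankings = gen_embed_pruner.calculate_block_rankings(question, html)
    return gen_embed_pruner.prune_HTML(html, block_tree, rankings, chat_tokenizer, MAX_CONTEXT_WINDOW_GEN)

print(prune_two_stage(question, simplified_html))
```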
## Results

- **Results for [HTML-Pruner-Phi-3.8B](https://huggingface.co/zstanjj/HTML-Pruner-Phi-3.8B) and [HTML-Pruner-Llama-1B](https://huggingface.co/zstanjj/HTML-Pruner-Llama-1B) with Llama-3.1-70B-Instruct as chat model**.