Update README.md
Browse files
README.md
CHANGED
|
@@ -67,6 +67,11 @@ document.write("Hello World!");
|
|
| 67 |
</html>
|
| 68 |
"""
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
simplified_html = clean_html(html)
|
| 71 |
print(simplified_html)
|
| 72 |
|
|
@@ -80,7 +85,6 @@ print(simplified_html)
|
|
| 80 |
# </html>
|
| 81 |
```
|
| 82 |
|
| 83 |
-
|
| 84 |
### 🔧 Configure Pruning Parameters
|
| 85 |
|
| 86 |
The example HTML document is rather short. Real-world HTML documents can be much longer and more complex. To handle such cases, we can configure the following parameters:
|
|
@@ -107,6 +111,7 @@ MAX_CONTEXT_WINDOW_GEN = 32
|
|
| 107 |
from htmlrag import build_block_tree
|
| 108 |
|
| 109 |
block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED)
|
|
|
|
| 110 |
for block in block_tree:
|
| 111 |
print("Block Content: ", block[0])
|
| 112 |
print("Block Path: ", block[1])
|
|
@@ -175,6 +180,7 @@ import torch
|
|
| 175 |
|
| 176 |
# construct a finer block tree
|
| 177 |
block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN)
|
|
|
|
| 178 |
for block in block_tree:
|
| 179 |
print("Block Content: ", block[0])
|
| 180 |
print("Block Path: ", block[1])
|
|
@@ -189,7 +195,7 @@ for block in block_tree:
|
|
| 189 |
# Block Path: ['html', 'p']
|
| 190 |
# Is Leaf: True
|
| 191 |
|
| 192 |
-
ckpt_path = "zstanjj/HTML-Pruner-
|
| 193 |
if torch.cuda.is_available():
|
| 194 |
device="cuda"
|
| 195 |
else:
|
|
@@ -206,6 +212,7 @@ print(pruned_html)
|
|
| 206 |
# <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
|
| 207 |
```
|
| 208 |
|
|
|
|
| 209 |
|
| 210 |
|
| 211 |
## Results
|
|
|
|
| 67 |
</html>
|
| 68 |
"""
|
| 69 |
|
| 70 |
+
# Alternatively, you can read HTML files and merge them
|
| 71 |
+
# html_files=["/path/to/html/file1.html", "/path/to/html/file2.html"]
|
| 72 |
+
# htmls=[open(file).read() for file in html_files]
|
| 73 |
+
# html = "\n".join(htmls)
|
| 74 |
+
|
| 75 |
simplified_html = clean_html(html)
|
| 76 |
print(simplified_html)
|
| 77 |
|
|
|
|
| 85 |
# </html>
|
| 86 |
```
|
| 87 |
|
|
|
|
| 88 |
### 🔧 Configure Pruning Parameters
|
| 89 |
|
| 90 |
The example HTML document is rather short. Real-world HTML documents can be much longer and more complex. To handle such cases, we can configure the following parameters:
|
|
|
|
| 111 |
from htmlrag import build_block_tree
|
| 112 |
|
| 113 |
block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED)
|
| 114 |
+
# block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED, zh_char=True) # for Chinese text
|
| 115 |
for block in block_tree:
|
| 116 |
print("Block Content: ", block[0])
|
| 117 |
print("Block Path: ", block[1])
|
|
|
|
| 180 |
|
| 181 |
# construct a finer block tree
|
| 182 |
block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN)
|
| 183 |
+
# block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN, zh_char=True) # for Chinese text
|
| 184 |
for block in block_tree:
|
| 185 |
print("Block Content: ", block[0])
|
| 186 |
print("Block Path: ", block[1])
|
|
|
|
| 195 |
# Block Path: ['html', 'p']
|
| 196 |
# Is Leaf: True
|
| 197 |
|
| 198 |
+
ckpt_path = "zstanjj/HTML-Pruner-Phi-3.8B"
|
| 199 |
if torch.cuda.is_available():
|
| 200 |
device="cuda"
|
| 201 |
else:
|
|
|
|
| 212 |
# <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
|
| 213 |
```
|
| 214 |
|
| 215 |
+
---
|
| 216 |
|
| 217 |
|
| 218 |
## Results
|