jinaai
/

ReaderLM-v2

@@ -83,7 +83,6 @@ To use `ReaderLM-v2` locally:
    ```python
    from transformers import AutoModelForCausalLM, AutoTokenizer
-   import re
    device = "cuda"  # or "cpu"
    tokenizer = AutoTokenizer.from_pretrained("jinaai/ReaderLM-v2")
@@ -93,14 +92,17 @@ To use `ReaderLM-v2` locally:
 3. (Optional) Pre-clean your HTML to remove scripts, styles, and comments, reducing the noise and length of the input:
    ```python
    # Patterns
-   SCRIPT_PATTERN = r'<[ ]*script.*?\/[ ]*script[ ]*>'
-   STYLE_PATTERN = r'<[ ]*style.*?\/[ ]*style[ ]*>'
-   META_PATTERN = r'<[ ]*meta.*?>'
-   COMMENT_PATTERN = r'<[ ]*!--.*?--[ ]*>'
-   LINK_PATTERN = r'<[ ]*link.*?>'
    BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
-   SVG_PATTERN = r'(<svg[^>]*>)(.*?)(<\/svg>)'
    def replace_svg(html: str, new_content: str = "this is a placeholder") -> str:
        return re.sub(
@@ -110,15 +112,27 @@ To use `ReaderLM-v2` locally:
            flags=re.DOTALL,
        )
    def replace_base64_images(html: str, new_image_src: str = "#") -> str:
        """
        Replace every inline base64 ``<img>`` tag matched by BASE64_IMG_PATTERN
        with a minimal ``<img src="..."/>`` tag pointing at ``new_image_src``.
        Returns the rewritten HTML; text that does not match is left unchanged.
        """
        replacement = f'<img src="{new_image_src}"/>'
        # Use a callable so the replacement is inserted literally: a plain
        # string would be parsed as a regex template, and a backslash in
        # new_image_src would raise re.error or be silently rewritten.
        return re.sub(BASE64_IMG_PATTERN, lambda _match: replacement, html)
    def clean_html(html: str, clean_svg: bool = False, clean_base64: bool = False):
-       html = re.sub(SCRIPT_PATTERN, '', html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-       html = re.sub(STYLE_PATTERN, '', html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-       html = re.sub(META_PATTERN, '', html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-       html = re.sub(COMMENT_PATTERN, '', html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-       html = re.sub(LINK_PATTERN, '', html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        if clean_svg:
            html = replace_svg(html)
@@ -130,7 +144,9 @@ To use `ReaderLM-v2` locally:
 4. Create a prompt for the model:
    ```python
-   def create_prompt(text: str, tokenizer=None, instruction: str = None, schema: str = None) -> str:
        """
        Create a prompt for the model with optional instruction and JSON schema.
        """
@@ -157,14 +173,15 @@ To use `ReaderLM-v2` locally:
 ### HTML to Markdown Example
 ```python
-# Example HTML
 html = "<html><body><h1>Hello, world!</h1></body></html>"
 html = clean_html(html)
 input_prompt = create_prompt(html)
 inputs = tokenizer.encode(input_prompt, return_tensors="pt").to(device)
-outputs = model.generate(inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08)
 print(tokenizer.decode(outputs[0]))
 ```
@@ -197,7 +214,9 @@ html = clean_html(html)
 input_prompt = create_prompt(html, schema=schema)
 inputs = tokenizer.encode(input_prompt, return_tensors="pt").to(device)
-outputs = model.generate(inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08)
 print(tokenizer.decode(outputs[0]))
 ```

    ```python
    from transformers import AutoModelForCausalLM, AutoTokenizer
    device = "cuda"  # or "cpu"
    tokenizer = AutoTokenizer.from_pretrained("jinaai/ReaderLM-v2")
 3. (Optional) Pre-clean your HTML to remove scripts, styles, and comments, reducing the noise and length of the input:
    ```python
+   import re
    # Patterns
+   SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
+   STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
+   META_PATTERN = r"<[ ]*meta.*?>"
+   COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
+   LINK_PATTERN = r"<[ ]*link.*?>"
    BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
+   SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
    def replace_svg(html: str, new_content: str = "this is a placeholder") -> str:
        return re.sub(
            flags=re.DOTALL,
        )
    def replace_base64_images(html: str, new_image_src: str = "#") -> str:
        """
        Replace every inline base64 ``<img>`` tag matched by BASE64_IMG_PATTERN
        with a minimal ``<img src="..."/>`` tag pointing at ``new_image_src``.
        Returns the rewritten HTML; text that does not match is left unchanged.
        """
        replacement = f'<img src="{new_image_src}"/>'
        # Use a callable so the replacement is inserted literally: a plain
        # string would be parsed as a regex template, and a backslash in
        # new_image_src would raise re.error or be silently rewritten.
        return re.sub(BASE64_IMG_PATTERN, lambda _match: replacement, html)
    def clean_html(html: str, clean_svg: bool = False, clean_base64: bool = False):
+       html = re.sub(
+           SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL
+       )
+       html = re.sub(
+           STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL
+       )
+       html = re.sub(
+           META_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL
+       )
+       html = re.sub(
+           COMMENT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL
+       )
+       html = re.sub(
+           LINK_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL
+       )
        if clean_svg:
            html = replace_svg(html)
 4. Create a prompt for the model:
    ```python
+   def create_prompt(
+       text: str, tokenizer=None, instruction: str = None, schema: str = None
+   ) -> str:
        """
        Create a prompt for the model with optional instruction and JSON schema.
        """
 ### HTML to Markdown Example
 ```python
 html = "<html><body><h1>Hello, world!</h1></body></html>"
 html = clean_html(html)
 input_prompt = create_prompt(html)
 inputs = tokenizer.encode(input_prompt, return_tensors="pt").to(device)
+outputs = model.generate(
+    inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08
+)
 print(tokenizer.decode(outputs[0]))
 ```
 input_prompt = create_prompt(html, schema=schema)
 inputs = tokenizer.encode(input_prompt, return_tensors="pt").to(device)
+outputs = model.generate(
+    inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08
+)
 print(tokenizer.decode(outputs[0]))
 ```