inference-net
/

Schematron-3B

@@ -4,12 +4,26 @@ license: llama3.2
 base_model: meta-llama/Llama-3.2-3B-Instruct
 ---
-###IN ORDER TO USE THIS:
-Request the HTML from a page. You should clean the HTML using something like
-```
 from lxml.html.clean import Cleaner
 import lxml.html as LH
@@ -35,211 +49,55 @@ def strip_noise(html: str) -> str:
         return ""
 ```
-There are three parts to the prompt:
-```
-{
-    "prompt_part_one": "You are going to be given a JSON schema following the standardized JSON Schema format. You are going to be given a HTML page and you are going to apply the schema to the HTML page however you see it as applicable and return the results in a JSON object. The schema is as follows:",
-    "prompt_part_two": "Here is the HTML page:",
-    "prompt_part_three": "MAKE SURE ITS VALID JSON."
-}
-```
-The draft schema is:
-```
-{
-    "$schema": "http://json-schema.org/draft-07/schema#",
-    "$id": "http://json-schema.org/draft-07/schema#",
-    "title": "Core schema meta-schema",
-    "definitions": {
-        "schemaArray": {
-            "type": "array",
-            "minItems": 1,
-            "items": { "$ref": "#" }
-        },
-        "nonNegativeInteger": {
-            "type": "integer",
-            "minimum": 0
-        },
-        "nonNegativeIntegerDefault0": {
-            "allOf": [
-                { "$ref": "#/definitions/nonNegativeInteger" },
-                { "default": 0 }
-            ]
-        },
-        "simpleTypes": {
-            "enum": [
-                "array",
-                "boolean",
-                "integer",
-                "null",
-                "number",
-                "object",
-                "string"
-            ]
-        },
-        "stringArray": {
-            "type": "array",
-            "items": { "type": "string" },
-            "uniqueItems": true,
-            "default": []
-        }
-    },
-    "type": ["object", "boolean"],
-    "properties": {
-        "$id": {
-            "type": "string",
-            "format": "uri-reference"
-        },
-        "$schema": {
-            "type": "string",
-            "format": "uri"
-        },
-        "$ref": {
-            "type": "string",
-            "format": "uri-reference"
-        },
-        "$comment": {
-            "type": "string"
-        },
-        "title": {
-            "type": "string"
-        },
-        "description": {
-            "type": "string"
-        },
-        "default": true,
-        "readOnly": {
-            "type": "boolean",
-            "default": false
-        },
-        "writeOnly": {
-            "type": "boolean",
-            "default": false
-        },
-        "examples": {
-            "type": "array",
-            "items": true
-        },
-        "multipleOf": {
-            "type": "number",
-            "exclusiveMinimum": 0
-        },
-        "maximum": {
-            "type": "number"
-        },
-        "exclusiveMaximum": {
-            "type": "number"
-        },
-        "minimum": {
-            "type": "number"
-        },
-        "exclusiveMinimum": {
-            "type": "number"
-        },
-        "maxLength": { "$ref": "#/definitions/nonNegativeInteger" },
-        "minLength": { "$ref": "#/definitions/nonNegativeIntegerDefault0" },
-        "pattern": {
-            "type": "string",
-            "format": "regex"
-        },
-        "additionalItems": { "$ref": "#" },
-        "items": {
-            "anyOf": [
-                { "$ref": "#" },
-                { "$ref": "#/definitions/schemaArray" }
-            ],
-            "default": true
-        },
-        "maxItems": { "$ref": "#/definitions/nonNegativeInteger" },
-        "minItems": { "$ref": "#/definitions/nonNegativeIntegerDefault0" },
-        "uniqueItems": {
-            "type": "boolean",
-            "default": false
-        },
-        "contains": { "$ref": "#" },
-        "maxProperties": { "$ref": "#/definitions/nonNegativeInteger" },
-        "minProperties": { "$ref": "#/definitions/nonNegativeIntegerDefault0" },
-        "required": { "$ref": "#/definitions/stringArray" },
-        "additionalProperties": { "$ref": "#" },
-        "definitions": {
-            "type": "object",
-            "additionalProperties": { "$ref": "#" },
-            "default": {}
-        },
-        "properties": {
-            "type": "object",
-            "additionalProperties": { "$ref": "#" },
-            "default": {}
-        },
-        "patternProperties": {
-            "type": "object",
-            "additionalProperties": { "$ref": "#" },
-            "propertyNames": { "format": "regex" },
-            "default": {}
-        },
-        "dependencies": {
-            "type": "object",
-            "additionalProperties": {
-                "anyOf": [
-                    { "$ref": "#" },
-                    { "$ref": "#/definitions/stringArray" }
-                ]
-            }
-        },
-        "propertyNames": { "$ref": "#" },
-        "const": true,
-        "enum": {
-            "type": "array",
-            "items": true,
-            "minItems": 1,
-            "uniqueItems": true
-        },
-        "type": {
-            "anyOf": [
-                { "$ref": "#/definitions/simpleTypes" },
-                {
-                    "type": "array",
-                    "items": { "$ref": "#/definitions/simpleTypes" },
-                    "minItems": 1,
-                    "uniqueItems": true
-                }
-            ]
-        },
-        "format": { "type": "string" },
-        "contentMediaType": { "type": "string" },
-        "contentEncoding": { "type": "string" },
-        "if": { "$ref": "#" },
-        "then": { "$ref": "#" },
-        "else": { "$ref": "#" },
-        "allOf": { "$ref": "#/definitions/schemaArray" },
-        "anyOf": { "$ref": "#/definitions/schemaArray" },
-        "oneOf": { "$ref": "#/definitions/schemaArray" },
-        "not": { "$ref": "#" }
-    },
-    "default": true
-}
-```
-You can combine the prompt, schema, and HTML together using something like:
 ```
-def construct_messages(schema, html):
-  """Construct messages for OpenAI API"""
-  user_prompt = (
-      response_prompt['prompt_part_one'] +
-      "\n\n" + schema + "\n\n" +
-      response_prompt['prompt_part_two'] +
-      "\n\n" + html + "\n\n" +
-      response_prompt['prompt_part_three']
-  )
-  messages = [
-      {"role": "system", "content": "You are a helpful assistant"},
-      {"role": "user", "content": user_prompt}
-  ]
-  return messages
-```
-such that the schema is copied from above and the html is the response from the lxml cleaning function. The output should be the filled out JSON.

 base_model: meta-llama/Llama-3.2-3B-Instruct
 ---
+## Model Overview
+Schematron is a long‑context extraction model for converting noisy HTML into clean, typed JSON that conforms to a user‑provided schema. It is purpose‑built for web scraping, data ingestion, and turning arbitrary pages into structured records.
+## Highlights
+- **Schema-first extraction**: Strict, schema‑conformant JSON outputs
+- **Long context**: Robust to lengthy, noisy HTML (up to 128K tokens)
+- **Reliable structure**: Works well with JSON mode and typed parsers
+- **Variants**: Schematron‑8B (higher quality) and Schematron‑3B (lower cost)
+## Model Details
+- **Family**: Schematron (3B and 8B)
+- **Base**: Instruction‑tuned LLM, fine‑tuned for schema‑guided extraction
+- **Context window**: Up to 128K tokens
+- **Input**: Raw or lightly cleaned HTML
+- **Output**: Strictly valid JSON matching your schema
+## Minimal Quickstart
+Use these local snippets to prepare HTML and compose a schema‑guided prompt. The model is designed to return strictly valid JSON, but you should still validate each response against your schema downstream.
+```python
 from lxml.html.clean import Cleaner
 import lxml.html as LH
         return ""
 ```
+Compose messages with your schema and cleaned HTML:
+```python
+def construct_messages(schema: str, html: str):
+    """Construct messages for a schema‑guided extraction request."""
+    response_prompt = {
+        "prompt_part_one": (
+            "You are going to be given a JSON schema following the standardized JSON "
+            "Schema format. You are going to be given a HTML page and you are going "
+            "to apply the schema to the HTML page however you see it as applicable "
+            "and return the results in a JSON object. The schema is as follows:"
+        ),
+        "prompt_part_two": "Here is the HTML page:",
+        "prompt_part_three": "MAKE SURE ITS VALID JSON.",
+    }
+    user_prompt = (
+        response_prompt['prompt_part_one']
+        + "\n\n" + schema + "\n\n"
+        + response_prompt['prompt_part_two']
+        + "\n\n" + html + "\n\n"
+        + response_prompt['prompt_part_three']
+    )
+    return [
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": user_prompt},
+    ]
 ```
+## Recommendations
+- Use temperature 0 and JSON mode for deterministic, parseable output
+- Validate responses against your schema (e.g., Pydantic or Zod)
+- Pre‑clean HTML (remove scripts/styles) when possible; avoid over‑aggressive removal
+- Cleaning the HTML with lxml is not required, but it is recommended because it matches the training data
+## Limitations
+- Static HTML only; render client‑side content upstream
+- Very large pages (beyond the 128K‑token context window) may require truncation
+- Ambiguous fields depend on schema clarity; be explicit in field descriptions
+## Safety and Responsible Use
+- Extracted data may include personal or sensitive information present in the page—handle and store responsibly
+- Respect site terms, robots.txt, and applicable laws
+- Use downstream validation and guardrails for compliance
+## License
+See the `license` field in the metadata above (`llama3.2`).
+## Support
+- Docs: https://docs.inference.net
+- Email: support@inference.net