opensporks committed on
Commit
8f1c9ca
·
verified ·
1 Parent(s): b221974

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +63 -205
README.md CHANGED
@@ -4,12 +4,26 @@ license: llama3.2
4
  base_model: meta-llama/Llama-3.2-3B-Instruct
5
  ---
6
 
7
- ###IN ORDER TO USE THIS:
 
8
 
9
- Request the HTML from a page. You should clean the HTML using something like
 
 
 
 
10
 
 
 
 
 
 
 
11
 
12
- ```
 
 
 
13
  from lxml.html.clean import Cleaner
14
  import lxml.html as LH
15
 
@@ -35,211 +49,55 @@ def strip_noise(html: str) -> str:
35
  return ""
36
  ```
37
 
38
- There are three parts to the prompt:
39
- ```
40
- {
41
- "prompt_part_one": "You are going to be given a JSON schema following the standardized JSON Schema format. You are going to be given a HTML page and you are going to apply the schema to the HTML page however you see it as applicable and return the results in a JSON object. The schema is as follows:",
42
- "prompt_part_two": "Here is the HTML page:",
43
- "prompt_part_three": "MAKE SURE ITS VALID JSON."
44
- }
45
- ```
46
 
47
- The draft schema is:
48
- ```
49
- {
50
- "$schema": "http://json-schema.org/draft-07/schema#",
51
- "$id": "http://json-schema.org/draft-07/schema#",
52
- "title": "Core schema meta-schema",
53
- "definitions": {
54
- "schemaArray": {
55
- "type": "array",
56
- "minItems": 1,
57
- "items": { "$ref": "#" }
58
- },
59
- "nonNegativeInteger": {
60
- "type": "integer",
61
- "minimum": 0
62
- },
63
- "nonNegativeIntegerDefault0": {
64
- "allOf": [
65
- { "$ref": "#/definitions/nonNegativeInteger" },
66
- { "default": 0 }
67
- ]
68
- },
69
- "simpleTypes": {
70
- "enum": [
71
- "array",
72
- "boolean",
73
- "integer",
74
- "null",
75
- "number",
76
- "object",
77
- "string"
78
- ]
79
- },
80
- "stringArray": {
81
- "type": "array",
82
- "items": { "type": "string" },
83
- "uniqueItems": true,
84
- "default": []
85
- }
86
- },
87
- "type": ["object", "boolean"],
88
- "properties": {
89
- "$id": {
90
- "type": "string",
91
- "format": "uri-reference"
92
- },
93
- "$schema": {
94
- "type": "string",
95
- "format": "uri"
96
- },
97
- "$ref": {
98
- "type": "string",
99
- "format": "uri-reference"
100
- },
101
- "$comment": {
102
- "type": "string"
103
- },
104
- "title": {
105
- "type": "string"
106
- },
107
- "description": {
108
- "type": "string"
109
- },
110
- "default": true,
111
- "readOnly": {
112
- "type": "boolean",
113
- "default": false
114
- },
115
- "writeOnly": {
116
- "type": "boolean",
117
- "default": false
118
- },
119
- "examples": {
120
- "type": "array",
121
- "items": true
122
- },
123
- "multipleOf": {
124
- "type": "number",
125
- "exclusiveMinimum": 0
126
- },
127
- "maximum": {
128
- "type": "number"
129
- },
130
- "exclusiveMaximum": {
131
- "type": "number"
132
- },
133
- "minimum": {
134
- "type": "number"
135
- },
136
- "exclusiveMinimum": {
137
- "type": "number"
138
- },
139
- "maxLength": { "$ref": "#/definitions/nonNegativeInteger" },
140
- "minLength": { "$ref": "#/definitions/nonNegativeIntegerDefault0" },
141
- "pattern": {
142
- "type": "string",
143
- "format": "regex"
144
- },
145
- "additionalItems": { "$ref": "#" },
146
- "items": {
147
- "anyOf": [
148
- { "$ref": "#" },
149
- { "$ref": "#/definitions/schemaArray" }
150
- ],
151
- "default": true
152
- },
153
- "maxItems": { "$ref": "#/definitions/nonNegativeInteger" },
154
- "minItems": { "$ref": "#/definitions/nonNegativeIntegerDefault0" },
155
- "uniqueItems": {
156
- "type": "boolean",
157
- "default": false
158
- },
159
- "contains": { "$ref": "#" },
160
- "maxProperties": { "$ref": "#/definitions/nonNegativeInteger" },
161
- "minProperties": { "$ref": "#/definitions/nonNegativeIntegerDefault0" },
162
- "required": { "$ref": "#/definitions/stringArray" },
163
- "additionalProperties": { "$ref": "#" },
164
- "definitions": {
165
- "type": "object",
166
- "additionalProperties": { "$ref": "#" },
167
- "default": {}
168
- },
169
- "properties": {
170
- "type": "object",
171
- "additionalProperties": { "$ref": "#" },
172
- "default": {}
173
- },
174
- "patternProperties": {
175
- "type": "object",
176
- "additionalProperties": { "$ref": "#" },
177
- "propertyNames": { "format": "regex" },
178
- "default": {}
179
- },
180
- "dependencies": {
181
- "type": "object",
182
- "additionalProperties": {
183
- "anyOf": [
184
- { "$ref": "#" },
185
- { "$ref": "#/definitions/stringArray" }
186
- ]
187
- }
188
- },
189
- "propertyNames": { "$ref": "#" },
190
- "const": true,
191
- "enum": {
192
- "type": "array",
193
- "items": true,
194
- "minItems": 1,
195
- "uniqueItems": true
196
- },
197
- "type": {
198
- "anyOf": [
199
- { "$ref": "#/definitions/simpleTypes" },
200
- {
201
- "type": "array",
202
- "items": { "$ref": "#/definitions/simpleTypes" },
203
- "minItems": 1,
204
- "uniqueItems": true
205
- }
206
- ]
207
- },
208
- "format": { "type": "string" },
209
- "contentMediaType": { "type": "string" },
210
- "contentEncoding": { "type": "string" },
211
- "if": { "$ref": "#" },
212
- "then": { "$ref": "#" },
213
- "else": { "$ref": "#" },
214
- "allOf": { "$ref": "#/definitions/schemaArray" },
215
- "anyOf": { "$ref": "#/definitions/schemaArray" },
216
- "oneOf": { "$ref": "#/definitions/schemaArray" },
217
- "not": { "$ref": "#" }
218
- },
219
- "default": true
220
- }
221
- ```
222
 
223
- You can combine the prompt, schema, and HTML together using something like:
 
 
 
 
 
 
224
 
 
 
 
 
225
  ```
226
- def construct_messages(schema, html):
227
- """Construct messages for OpenAI API"""
228
- user_prompt = (
229
- response_prompt['prompt_part_one'] +
230
- "\n\n" + schema + "\n\n" +
231
- response_prompt['prompt_part_two'] +
232
- "\n\n" + html + "\n\n" +
233
- response_prompt['prompt_part_three']
234
- )
235
-
236
- messages = [
237
- {"role": "system", "content": "You are a helpful assistant"},
238
- {"role": "user", "content": user_prompt}
239
- ]
240
-
241
- return messages
242
- ```
243
 
244
- such that the schema is copied from above and the html is the response from the lxml cleaning function. The output should be the filled out JSON.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
 
 
 
 
4
  base_model: meta-llama/Llama-3.2-3B-Instruct
5
  ---
6
 
7
+ ## Model Overview
8
+ Schematron is a long‑context extraction model for converting noisy HTML into clean, typed JSON that conforms to a user‑provided schema. It is purpose‑built for web scraping, data ingestion, and turning arbitrary pages into structured records.
9
 
10
+ ## Highlights
11
+ - **Schema-first extraction**: Strict, schema‑conformant JSON outputs
12
+ - **Long context**: Robust to lengthy, noisy HTML (up to 128K tokens)
13
+ - **Reliable structure**: Works well with JSON mode and typed parsers
14
+ - **Variants**: Schematron‑8B (quality) and Schematron‑3B (cost)
15
 
16
+ ## Model Details
17
+ - **Family**: Schematron (3B and 8B)
18
+ - **Base**: Instruction‑tuned LLM, fine‑tuned for schema‑guided extraction
19
+ - **Context window**: Up to 128K tokens
20
+ - **Input**: Raw or lightly cleaned HTML
21
+ - **Output**: Strictly valid JSON matching your schema
22
 
23
+ ## Minimal Quickstart
24
+ Use these local snippets to prepare HTML and compose a schema‑guided prompt. The model is trained to return strictly valid JSON, but you should still validate each response against your schema downstream.
25
+
26
+ ```python
27
  from lxml.html.clean import Cleaner
28
  import lxml.html as LH
29
 
 
49
  return ""
50
  ```
51
 
52
+ Compose messages with your schema and cleaned HTML:
 
 
 
 
 
 
 
53
 
54
+ ```python
55
+ def construct_messages(schema: str, html: str):
56
+ """Construct messages for a schema‑guided extraction request."""
57
+ response_prompt = {
58
+ "prompt_part_one": (
59
+ "You are going to be given a JSON schema following the standardized JSON "
60
+ "Schema format. You are going to be given a HTML page and you are going "
61
+ "to apply the schema to the HTML page however you see it as applicable "
62
+ "and return the results in a JSON object. The schema is as follows:"
63
+ ),
64
+ "prompt_part_two": "Here is the HTML page:",
65
+ "prompt_part_three": "MAKE SURE ITS VALID JSON.",
66
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
+ user_prompt = (
69
+ response_prompt['prompt_part_one']
70
+ + "\n\n" + schema + "\n\n"
71
+ + response_prompt['prompt_part_two']
72
+ + "\n\n" + html + "\n\n"
73
+ + response_prompt['prompt_part_three']
74
+ )
75
 
76
+ return [
77
+ {"role": "system", "content": "You are a helpful assistant"},
78
+ {"role": "user", "content": user_prompt},
79
+ ]
80
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
+ ## Recommendations
83
+ - Temperature 0 and JSON mode for deterministic, parseable output
84
+ - Validate responses against your schema (e.g., Pydantic or Zod)
85
+ - Pre‑clean HTML (remove scripts/styles) when possible; avoid over‑aggressive removal
86
+ - Cleaning the HTML with lxml is not required, but it is recommended because it matches the training data
87
+
88
+ ## Limitations
89
+ - Static HTML only; render client‑side content upstream
90
+ - Very large pages may require truncation
91
+ - Ambiguous fields depend on schema clarity; be explicit in field descriptions
92
+
93
+ ## Safety and Responsible Use
94
+ - Extracted data may include personal or sensitive information present in the page—handle and store responsibly
95
+ - Respect site terms, robots.txt, and applicable laws
96
+ - Use downstream validation and guardrails for compliance
97
+
98
+ ## License
99
+ See license in the metadata above.
100
 
101
+ ## Support
102
+ - Docs: https://docs.inference.net
103
+ - Email: support@inference.net