ZhouChuYue
committed on
Commit
·
a579dd2
1
Parent(s):
aa1ad34
init
Browse files- app.py +373 -0
- requirements.txt +9 -0
- ultradata_math_parser/__init__.py +109 -0
- ultradata_math_parser/config.py +332 -0
- ultradata_math_parser/mmltex/cmarkup.xsl +1093 -0
- ultradata_math_parser/mmltex/entities.xsl +316 -0
- ultradata_math_parser/mmltex/glayout.xsl +220 -0
- ultradata_math_parser/mmltex/mmltex.xsl +45 -0
- ultradata_math_parser/mmltex/scripts.xsl +292 -0
- ultradata_math_parser/mmltex/tables.xsl +130 -0
- ultradata_math_parser/mmltex/tokens.xsl +296 -0
- ultradata_math_parser/parsers/__init__.py +15 -0
- ultradata_math_parser/parsers/article_parser.py +76 -0
- ultradata_math_parser/parsers/base_parser.py +1059 -0
- ultradata_math_parser/parsers/custom_parser.py +64 -0
- ultradata_math_parser/parsers/forum_parser.py +135 -0
- ultradata_math_parser/parsers/title_parser.py +49 -0
- ultradata_math_parser/parsers/unified_parser.py +187 -0
- ultradata_math_parser/readability_plus.py +539 -0
- ultradata_math_parser/utils.py +499 -0
app.py
ADDED
|
@@ -0,0 +1,373 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
UltraData Math Parser - Hugging Face Space Demo
|
| 4 |
+
A unified HTML parser optimized for extracting mathematical content.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import gradio as gr
|
| 8 |
+
from ultradata_math_parser.parsers.unified_parser import UnifiedParser
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def parse_html(
    html_content: str,
    base_url: str = "",
    process_math: bool = True,
    include_tables: bool = True,
    enable_forum_assembly: bool = True,
) -> dict:
    """
    Parse HTML content using UnifiedParser.

    Args:
        html_content: Raw HTML string to parse
        base_url: Base URL for resolving relative links
        process_math: Whether to process and convert math expressions
        include_tables: Whether to preserve table elements
        enable_forum_assembly: Whether to enable forum post assembly

    Returns:
        Dictionary with the keys ``title``, ``html``, ``text_length``,
        ``xp_num``, ``fallback_strategy``, ``forum_assembled`` and ``error``.
        ``error`` is ``None`` on success and a non-empty message otherwise;
        this function never raises for parser failures.
    """

    def _failure(message: str) -> dict:
        # Single source of truth for the "nothing extracted" result shape.
        return {
            "title": "",
            "html": "",
            "text_length": 0,
            "xp_num": "",
            "fallback_strategy": "",
            "forum_assembled": False,
            "error": message,
        }

    if not html_content or not html_content.strip():
        return _failure("Please provide HTML content to parse.")

    try:
        # Constructing the parser inside the guarded region keeps a failing
        # constructor from escaping as an unhandled exception in the UI.
        result = UnifiedParser().extract(
            html=html_content,
            base_url=base_url,
            process_math=process_math,
            include_tables=include_tables,
            enable_forum_assembly=enable_forum_assembly,
        )

        if not result:
            # Report an empty/None result explicitly instead of letting
            # ``result.get`` fail with an opaque AttributeError message.
            return _failure("The parser did not return any content for this HTML.")

        return {
            "title": result.get("title", ""),
            "html": result.get("html", ""),
            "text_length": result.get("text_length", 0),
            "xp_num": result.get("xp_num", ""),
            "fallback_strategy": result.get("fallback_strategy", ""),
            "forum_assembled": result.get("forum_assembled", False),
            "error": None,
        }
    except Exception as e:  # UI boundary: surface every failure as a message
        # An exception raised without a message stringifies to "", which the
        # caller would read as "no error"; fall back to the exception type.
        return _failure(str(e) or type(e).__name__)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def format_output(result: dict) -> tuple:
    """Format the parser output for Gradio display.

    Args:
        result: Result dictionary in the shape produced by ``parse_html``.
            Missing keys are tolerated and rendered with neutral defaults.

    Returns:
        A 4-tuple ``(metadata_markdown, title, html, html_preview)``. When
        ``result["error"]`` is truthy the first element is the error line and
        the remaining three are empty strings.
    """
    error = result.get("error")
    if error:
        return (f"❌ Error: {error}", "", "", "")

    title = result.get("title", "")
    html = result.get("html", "")
    forum_flag = "✅ Yes" if result.get("forum_assembled") else "❌ No"

    # Build the Markdown from explicit lines so list items always start at
    # column 0; the trailing empty entry keeps the final newline.
    metadata = "\n".join(
        [
            "📊 **Parsing Statistics**",
            f"- **Title**: {title or 'N/A'}",
            f"- **Text Length**: {result.get('text_length', 0)} characters",
            f"- **XPath Match**: {result.get('xp_num', '')}",
            f"- **Fallback Strategy**: {result.get('fallback_strategy', '')}",
            f"- **Forum Assembled**: {forum_flag}",
            "",
        ]
    )

    # The HTML is returned twice: once for the raw textbox, once for preview.
    return (metadata, title, html, html)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def process_input(html_content, base_url, process_math, include_tables, enable_forum):
    """Gradio click handler: parse the submitted HTML and shape it for display.

    The five arguments arrive in the order of the ``inputs`` list wired to the
    parse button; the return value is whatever ``format_output`` produces for
    the parsed result.
    """
    parse_options = {
        "base_url": base_url,
        "process_math": process_math,
        "include_tables": include_tables,
        # The UI calls this flag ``enable_forum``; the parser wrapper expects
        # ``enable_forum_assembly``.
        "enable_forum_assembly": enable_forum,
    }
    parsed = parse_html(html_content=html_content, **parse_options)
    return format_output(parsed)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
# Example HTML content for demo
|
| 114 |
+
EXAMPLE_HTML = """<!DOCTYPE html>
|
| 115 |
+
<html>
|
| 116 |
+
<head>
|
| 117 |
+
<title>Quadratic Formula Example</title>
|
| 118 |
+
</head>
|
| 119 |
+
<body>
|
| 120 |
+
<article class="post-content">
|
| 121 |
+
<h1>Understanding the Quadratic Formula</h1>
|
| 122 |
+
<p>The quadratic formula is used to solve equations of the form ax² + bx + c = 0.</p>
|
| 123 |
+
<p>The solution is given by:</p>
|
| 124 |
+
<math xmlns="http://www.w3.org/1998/Math/MathML">
|
| 125 |
+
<mi>x</mi>
|
| 126 |
+
<mo>=</mo>
|
| 127 |
+
<mfrac>
|
| 128 |
+
<mrow>
|
| 129 |
+
<mo>-</mo>
|
| 130 |
+
<mi>b</mi>
|
| 131 |
+
<mo>±</mo>
|
| 132 |
+
<msqrt>
|
| 133 |
+
<mrow>
|
| 134 |
+
<msup><mi>b</mi><mn>2</mn></msup>
|
| 135 |
+
<mo>-</mo>
|
| 136 |
+
<mn>4</mn>
|
| 137 |
+
<mi>a</mi>
|
| 138 |
+
<mi>c</mi>
|
| 139 |
+
</mrow>
|
| 140 |
+
</msqrt>
|
| 141 |
+
</mrow>
|
| 142 |
+
<mrow>
|
| 143 |
+
<mn>2</mn>
|
| 144 |
+
<mi>a</mi>
|
| 145 |
+
</mrow>
|
| 146 |
+
</mfrac>
|
| 147 |
+
</math>
|
| 148 |
+
<p>Where a, b, and c are coefficients of the quadratic equation.</p>
|
| 149 |
+
<h2>Example Problem</h2>
|
| 150 |
+
<p>Solve: x² - 5x + 6 = 0</p>
|
| 151 |
+
<p>Here, a = 1, b = -5, c = 6</p>
|
| 152 |
+
<p>Using the formula: x = (5 ± √(25-24))/2 = (5 ± 1)/2</p>
|
| 153 |
+
<p>Therefore, x = 3 or x = 2</p>
|
| 154 |
+
</article>
|
| 155 |
+
<footer>
|
| 156 |
+
<nav>Related articles...</nav>
|
| 157 |
+
</footer>
|
| 158 |
+
</body>
|
| 159 |
+
</html>"""
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# Custom CSS for better aesthetics
|
| 163 |
+
custom_css = """
|
| 164 |
+
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500&family=Space+Grotesk:wght@400;500;600;700&display=swap');
|
| 165 |
+
|
| 166 |
+
.gradio-container {
|
| 167 |
+
font-family: 'Space Grotesk', sans-serif !important;
|
| 168 |
+
background: linear-gradient(135deg, #0f0f23 0%, #1a1a3e 50%, #0f0f23 100%) !important;
|
| 169 |
+
min-height: 100vh;
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
.main-title {
|
| 173 |
+
font-family: 'Space Grotesk', sans-serif !important;
|
| 174 |
+
font-weight: 700 !important;
|
| 175 |
+
font-size: 2.5rem !important;
|
| 176 |
+
background: linear-gradient(90deg, #00d4ff, #7c3aed, #f472b6) !important;
|
| 177 |
+
-webkit-background-clip: text !important;
|
| 178 |
+
-webkit-text-fill-color: transparent !important;
|
| 179 |
+
background-clip: text !important;
|
| 180 |
+
text-align: center !important;
|
| 181 |
+
margin-bottom: 0.5rem !important;
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
.subtitle {
|
| 185 |
+
text-align: center !important;
|
| 186 |
+
color: #94a3b8 !important;
|
| 187 |
+
font-size: 1.1rem !important;
|
| 188 |
+
margin-bottom: 2rem !important;
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
.gr-box {
|
| 192 |
+
border-radius: 12px !important;
|
| 193 |
+
border: 1px solid rgba(124, 58, 237, 0.3) !important;
|
| 194 |
+
background: rgba(15, 15, 35, 0.8) !important;
|
| 195 |
+
backdrop-filter: blur(10px) !important;
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
.gr-input, .gr-textarea {
|
| 199 |
+
font-family: 'JetBrains Mono', monospace !important;
|
| 200 |
+
background: rgba(30, 30, 60, 0.6) !important;
|
| 201 |
+
border: 1px solid rgba(124, 58, 237, 0.4) !important;
|
| 202 |
+
border-radius: 8px !important;
|
| 203 |
+
color: #e2e8f0 !important;
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
.gr-button-primary {
|
| 207 |
+
background: linear-gradient(135deg, #7c3aed 0%, #00d4ff 100%) !important;
|
| 208 |
+
border: none !important;
|
| 209 |
+
font-weight: 600 !important;
|
| 210 |
+
font-size: 1rem !important;
|
| 211 |
+
padding: 12px 32px !important;
|
| 212 |
+
border-radius: 8px !important;
|
| 213 |
+
transition: all 0.3s ease !important;
|
| 214 |
+
text-transform: uppercase !important;
|
| 215 |
+
letter-spacing: 1px !important;
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
.gr-button-primary:hover {
|
| 219 |
+
transform: translateY(-2px) !important;
|
| 220 |
+
box-shadow: 0 8px 25px rgba(124, 58, 237, 0.4) !important;
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
.gr-button-secondary {
|
| 224 |
+
background: transparent !important;
|
| 225 |
+
border: 2px solid rgba(124, 58, 237, 0.5) !important;
|
| 226 |
+
color: #a78bfa !important;
|
| 227 |
+
font-weight: 500 !important;
|
| 228 |
+
border-radius: 8px !important;
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
.section-header {
|
| 232 |
+
color: #00d4ff !important;
|
| 233 |
+
font-weight: 600 !important;
|
| 234 |
+
font-size: 1.2rem !important;
|
| 235 |
+
margin-bottom: 1rem !important;
|
| 236 |
+
padding-bottom: 0.5rem !important;
|
| 237 |
+
border-bottom: 2px solid rgba(0, 212, 255, 0.3) !important;
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
.output-box {
|
| 241 |
+
background: rgba(20, 20, 45, 0.9) !important;
|
| 242 |
+
border: 1px solid rgba(0, 212, 255, 0.3) !important;
|
| 243 |
+
border-radius: 12px !important;
|
| 244 |
+
padding: 1rem !important;
|
| 245 |
+
}
|
| 246 |
+
|
| 247 |
+
.gr-markdown {
|
| 248 |
+
color: #e2e8f0 !important;
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
.gr-markdown code {
|
| 252 |
+
background: rgba(124, 58, 237, 0.2) !important;
|
| 253 |
+
padding: 2px 6px !important;
|
| 254 |
+
border-radius: 4px !important;
|
| 255 |
+
font-family: 'JetBrains Mono', monospace !important;
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
footer {
|
| 259 |
+
display: none !important;
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
.gr-accordion {
|
| 263 |
+
border: 1px solid rgba(124, 58, 237, 0.3) !important;
|
| 264 |
+
border-radius: 8px !important;
|
| 265 |
+
background: rgba(20, 20, 45, 0.6) !important;
|
| 266 |
+
}
|
| 267 |
+
|
| 268 |
+
.gr-check-radio {
|
| 269 |
+
accent-color: #7c3aed !important;
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
label {
|
| 273 |
+
color: #cbd5e1 !important;
|
| 274 |
+
}
|
| 275 |
+
"""
|
| 276 |
+
|
| 277 |
+
# Build Gradio interface
|
| 278 |
+
# Declarative Gradio layout: a two-column page (input on the left, output on
# the right) followed by the event wiring and a static footer.
with gr.Blocks(css=custom_css, title="UltraData Math Parser") as demo:
    gr.HTML('<h1 class="main-title">📐 UltraData Math Parser</h1>')
    gr.HTML('<p class="subtitle">Unified HTML Parser for Mathematical Content Extraction</p>')

    with gr.Row():
        # Left column: raw HTML, optional base URL, parser switches, buttons.
        with gr.Column(scale=1):
            gr.HTML('<div class="section-header">📥 Input</div>')

            # Pre-filled with the example document so the demo works on first click.
            html_input = gr.Textbox(
                label="HTML Content",
                placeholder="Paste your HTML content here...",
                lines=15,
                max_lines=30,
                value=EXAMPLE_HTML,
            )

            base_url_input = gr.Textbox(
                label="Base URL (Optional)",
                placeholder="https://example.com/page",
                lines=1,
            )

            # The three checkboxes map one-to-one onto process_input's
            # process_math / include_tables / enable_forum arguments.
            with gr.Accordion("⚙️ Advanced Options", open=False):
                process_math = gr.Checkbox(
                    label="Process Math Expressions",
                    value=True,
                    info="Convert MathML and LaTeX to unified format",
                )
                include_tables = gr.Checkbox(
                    label="Include Tables",
                    value=True,
                    info="Preserve table elements in output",
                )
                enable_forum = gr.Checkbox(
                    label="Enable Forum Assembly",
                    value=True,
                    info="Assemble forum posts and comments",
                )

            with gr.Row():
                parse_btn = gr.Button("🚀 Parse HTML", variant="primary", size="lg")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="lg")

        # Right column: statistics, extracted title, and the extracted HTML
        # shown both as text and as a rendered preview.
        with gr.Column(scale=1):
            gr.HTML('<div class="section-header">📤 Output</div>')

            metadata_output = gr.Markdown(
                label="Parsing Statistics",
                elem_classes=["output-box"],
            )

            title_output = gr.Textbox(
                label="Extracted Title",
                lines=1,
                interactive=False,
            )

            with gr.Tabs():
                with gr.TabItem("📝 Raw HTML"):
                    html_output = gr.Textbox(
                        label="Extracted HTML",
                        lines=12,
                        max_lines=20,
                        interactive=False,
                    )
                with gr.TabItem("👁️ Preview"):
                    # NOTE(review): this renders the extracted HTML as-is;
                    # confirm that is acceptable for pasted, untrusted markup.
                    preview_output = gr.HTML(
                        label="HTML Preview",
                    )

    # Event handlers
    # The outputs order must match the 4-tuple returned by process_input.
    parse_btn.click(
        fn=process_input,
        inputs=[html_input, base_url_input, process_math, include_tables, enable_forum],
        outputs=[metadata_output, title_output, html_output, preview_output],
    )

    def clear_all():
        """Return one empty string per component listed in the clear handler's outputs."""
        return "", "", "", "", "", ""

    # Resets both input boxes and all four output components.
    clear_btn.click(
        fn=clear_all,
        outputs=[html_input, base_url_input, metadata_output, title_output, html_output, preview_output],
    )

    # Footer info
    gr.HTML("""
    <div style="text-align: center; margin-top: 2rem; padding: 1rem; color: #64748b; font-size: 0.9rem;">
        <p>🔬 <strong>UltraData Math Parser</strong> - Part of the UltraData-Math Project</p>
        <p>Specialized in extracting mathematical content from web pages with MathML, LaTeX, and formula support.</p>
    </div>
    """)


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.0.0
|
| 2 |
+
Brotli
|
| 3 |
+
cchardet==2.2.0a2
|
| 4 |
+
charset_normalizer
|
| 5 |
+
lxml<5.2.0
|
| 6 |
+
numpy
|
| 7 |
+
py_asciimath
|
| 8 |
+
urllib3
|
| 9 |
+
tldextract
|
ultradata_math_parser/__init__.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
from typing import Optional, Type
|
| 5 |
+
from urllib.parse import urlparse
|
| 6 |
+
import tldextract
|
| 7 |
+
|
| 8 |
+
from ultradata_math_parser.parsers.article_parser import ArticleParser
|
| 9 |
+
from ultradata_math_parser.parsers.forum_parser import ForumParser
|
| 10 |
+
from ultradata_math_parser.parsers.custom_parser import CustomParser
|
| 11 |
+
from ultradata_math_parser.parsers.unified_parser import UnifiedParser
|
| 12 |
+
from ultradata_math_parser.utils import text_len, run_w3m_dump, W3MError
|
| 13 |
+
from ultradata_math_parser.config import URL_PATTERNS_TO_HTML_TYPE, BUILTIN_SITE_RULES
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class GeneralParser:
    """Route HTML extraction to a rule-based, typed, or unified parser.

    ``extract`` tries, in order: a builtin site rule keyed by registered
    domain, a user-configured rule keyed by the URL's netloc, the parser
    selected by ``html_type``, and finally ``UnifiedParser``. Every parser
    result is post-processed with w3m to add plain-text fields.
    """

    def __init__(self, config_path="", w3m_path: str = "w3m"):
        """Create a parser front-end.

        Args:
            config_path: Optional path to a JSON file mapping netloc to a
                custom rule. An unreadable or invalid file is logged and
                treated as "no custom rules".
            w3m_path: Executable used for the text dump; an empty value falls
                back to ``"w3m"``.
        """
        self.logger = logging.getLogger(__name__)
        # Always defined, even when loading fails, because extract() reads it.
        self.rule = self._load_rules(config_path)
        self.w3m_path = w3m_path or "w3m"
        self.tld_extractor = tldextract.TLDExtract()

    def _load_rules(self, config_path) -> dict:
        """Read the custom rule mapping from *config_path*; return {} on any problem."""
        if not config_path:
            return {}
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                rules = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            self.logger.warning("Could not load rule config %s: %s", config_path, exc)
            return {}
        if not isinstance(rules, dict):
            self.logger.warning("Rule config %s is not a JSON object; ignoring it", config_path)
            return {}
        return rules

    def extract(self, html="", w3m_path: Optional[str] = None, **kwargs) -> dict:
        """Extract content from *html* with the most specific parser available.

        Args:
            html: Raw HTML to parse.
            w3m_path: Per-call override of the w3m executable.
            **kwargs: Forwarded to the chosen parser. ``base_url`` is read for
                rule matching; ``html_type`` ("forum", "article", "unified")
                is consumed here and not forwarded.

        Returns:
            The parser result enriched by ``_apply_w3m``.

        Raises:
            RuntimeError: If the fallback parser's result lacks ``html``.
        """
        base_url = kwargs.get("base_url", "")
        netloc = urlparse(base_url).netloc if base_url else ""
        html_type = kwargs.pop("html_type", None)

        current_w3m_path = w3m_path or self.w3m_path

        # Check whether the URL matches a builtin site rule.
        if base_url and self._quick_check_builtin_rules(base_url):
            try:
                extracted = self.tld_extractor(base_url)
                domain = f"{extracted.domain}.{extracted.suffix}"
                self.logger.debug("TLD Extract result for %s: domain=%s, suffix=%s -> key=%s", base_url, extracted.domain, extracted.suffix, domain)

                if domain in BUILTIN_SITE_RULES:
                    try:
                        # Caller kwargs are applied last so they can override "rule".
                        new_kwargs = {"rule": BUILTIN_SITE_RULES[domain]}
                        new_kwargs.update(kwargs)
                        self.logger.debug("Using builtin rule for domain: %s", domain)
                        return self._run_extractor(CustomParser, html, new_kwargs, w3m_path=current_w3m_path)
                    except Exception as exc:
                        # Best effort: fall through to the generic strategies.
                        self.logger.debug("Builtin rule extractor failed for %s: %s", domain, exc)
            except Exception as e:
                self.logger.debug("Error extracting domain or checking builtin rules: %s", e)

        # Infer the page type from URL patterns when the caller gave none.
        if not html_type and base_url:
            for pattern, mapped_type in URL_PATTERNS_TO_HTML_TYPE.items():
                if pattern in base_url:
                    html_type = mapped_type
                    break

        # Use the user-configured rule for this netloc, if any.
        if netloc in self.rule:
            try:
                new_kwargs = {"rule": self.rule[netloc]}
                new_kwargs.update(kwargs)
                return self._run_extractor(CustomParser, html, new_kwargs, w3m_path=current_w3m_path)
            except Exception as exc:
                self.logger.debug("Custom extractor failed for %s: %s", netloc, exc)

        # Choose the extraction mode from html_type; unified is the default.
        if html_type == "forum":
            extractor_cls = ForumParser
        elif html_type == "article":
            extractor_cls = ArticleParser
        else:
            extractor_cls = UnifiedParser
        return self._run_extractor(extractor_cls, html, kwargs, w3m_path=current_w3m_path)

    def _quick_check_builtin_rules(self, url: str) -> bool:
        """Return True if any builtin rule key occurs as a substring of *url* (lower-cased)."""
        if not url:
            return False
        url_lower = url.lower()
        return any(domain in url_lower for domain in BUILTIN_SITE_RULES)

    def _run_extractor(self, extractor_cls: Type, html: str, kwargs: dict, w3m_path: str):
        """Instantiate *extractor_cls*, run it on *html*, and add the w3m text."""
        result = extractor_cls().extract(html=html, **dict(kwargs))
        return self._apply_w3m(result, w3m_path=w3m_path)

    def _apply_w3m(self, result: Optional[dict], w3m_path: str) -> Optional[dict]:
        """Return a copy of *result* with ``text``, ``w3m_text`` and ``text_length`` set.

        A falsy *result* is returned unchanged.

        Raises:
            RuntimeError: If *result* has no non-empty ``html`` entry.
        """
        if not result:
            return result
        html_fragment = result.get("html")
        if not html_fragment:
            raise RuntimeError("Extraction result does not contain 'html' for w3m")
        text = run_w3m_dump(html_fragment, w3m_path)
        # Copy so the parser's own result dict is left untouched.
        enriched = dict(result)
        enriched["text"] = text
        enriched["w3m_text"] = text
        enriched["text_length"] = text_len(text)
        return enriched
|
ultradata_math_parser/config.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding:utf-8 -*-
|
| 2 |
+
|
| 3 |
+
Unique_ID = "ultradata_math_parser_id_internal"
|
| 4 |
+
|
| 5 |
+
PAYWALL_DISCARD_XPATH = [
|
| 6 |
+
""".//*[(self::div or self::p)][
|
| 7 |
+
contains(@id, "paywall") or contains(@id, "premium") or
|
| 8 |
+
contains(@class, "paid-content") or contains(@class, "paidcontent") or
|
| 9 |
+
contains(@class, "obfuscated") or contains(@class, "blurred") or
|
| 10 |
+
contains(@class, "restricted") or contains(@class, "overlay")
|
| 11 |
+
]""",
|
| 12 |
+
]
|
| 13 |
+
|
| 14 |
+
OVERALL_DISCARD_XPATH = [
|
| 15 |
+
# navigation + footers, news outlets related posts, sharing, jp-post-flair jp-relatedposts
|
| 16 |
+
""".//*[(self::div or self::item or self::ul
|
| 17 |
+
or self::p or self::section or self::span)][
|
| 18 |
+
contains(translate(@id, "F","f"), "footer") or contains(translate(@class, "F","f"), "footer")
|
| 19 |
+
or contains(@id, "related") or contains(translate(@class, "R", "r"), "related") or
|
| 20 |
+
contains(@id, "viral") or contains(@class, "viral") or
|
| 21 |
+
starts-with(@id, "shar") or starts-with(@class, "shar") or
|
| 22 |
+
contains(@class, "share-") or
|
| 23 |
+
contains(translate(@id, "S", "s"), "share") or
|
| 24 |
+
contains(@id, "social") or contains(@class, "social") or contains(@class, "sociable") or
|
| 25 |
+
contains(@id, "syndication") or contains(@class, "syndication") or
|
| 26 |
+
starts-with(@id, "jp-") or starts-with(@id, "dpsp-content") or
|
| 27 |
+
contains(@class, "embedded") or contains(@class, "embed")
|
| 28 |
+
or contains(@id, "newsletter") or contains(@class, "newsletter")
|
| 29 |
+
or contains(@class, "subnav") or
|
| 30 |
+
contains(@id, "cookie") or contains(@class, "cookie") or contains(@id, "tags")
|
| 31 |
+
or contains(@class, "tags") or contains(@id, "sidebar") or
|
| 32 |
+
contains(@class, "sidebar") or contains(@id, "banner") or contains(@class, "banner")
|
| 33 |
+
or contains(@class, "meta") or
|
| 34 |
+
contains(@id, "menu") or contains(@class, "menu") or
|
| 35 |
+
contains(translate(@id, "N", "n"), "nav") or contains(translate(@role, "N", "n"), "nav")
|
| 36 |
+
or starts-with(@class, "nav") or contains(translate(@class, "N", "n"), "navigation") or
|
| 37 |
+
contains(@class, "navbar") or contains(@class, "navbox") or starts-with(@class, "post-nav")
|
| 38 |
+
or contains(@id, "breadcrumb") or contains(@class, "breadcrumb") or
|
| 39 |
+
contains(@id, "bread-crumb") or contains(@class, "bread-crumb") or
|
| 40 |
+
contains(@id, "author") or contains(@class, "author") or
|
| 41 |
+
contains(@id, "button") or contains(@class, "button")
|
| 42 |
+
or contains(translate(@class, "B", "b"), "byline")
|
| 43 |
+
or contains(@class, "rating") or starts-with(@class, "widget") or
|
| 44 |
+
contains(@class, "attachment") or contains(@class, "timestamp") or
|
| 45 |
+
contains(@class, "user-info") or contains(@class, "user-profile") or
|
| 46 |
+
contains(@class, "-ad-") or contains(@class, "-icon")
|
| 47 |
+
or contains(@class, "article-infos") or
|
| 48 |
+
contains(translate(@class, "I", "i"), "infoline")
|
| 49 |
+
or contains(@data-component, "MostPopularStories")
|
| 50 |
+
or contains(@class, "outbrain") or contains(@class, "taboola")
|
| 51 |
+
or contains(@class, "criteo") or contains(@class, "options")
|
| 52 |
+
or contains(@class, "consent") or contains(@class, "modal-content")
|
| 53 |
+
or contains(@class, "paid-content") or contains(@class, "paidcontent")
|
| 54 |
+
or contains(@id, "premium-") or contains(@id, "paywall")
|
| 55 |
+
or contains(@class, "obfuscated") or contains(@class, "blurred")
|
| 56 |
+
or contains(@class, " ad ")
|
| 57 |
+
or contains(@class, "next-post")
|
| 58 |
+
or contains(@class, "yin") or contains(@class, "zlylin") or
|
| 59 |
+
contains(@class, "xg1") or contains(@id, "bmdh")
|
| 60 |
+
or @data-lp-replacement-content]""",
|
| 61 |
+
# hidden parts
|
| 62 |
+
""".//*[starts-with(@class, "hide-") or contains(@class, "hide-print") or contains(@id, "hidden")
|
| 63 |
+
or contains(@style, "hidden") or contains(@hidden, "hidden") or contains(@class, "noprint")
|
| 64 |
+
or contains(@style, "display:none") or contains(@class, " hidden") or @aria-hidden="true"
|
| 65 |
+
or contains(@class, "notloaded")]""",
|
| 66 |
+
# comment debris
|
| 67 |
+
# or contains(@class, "message-container") or contains(@id, "message_container")
|
| 68 |
+
""".//*[@class="comments-title" or contains(@class, "comments-title") or
|
| 69 |
+
contains(@class, "nocomments") or starts-with(@id, "reply-") or starts-with(@class, "reply-") or
|
| 70 |
+
contains(@class, "-reply-") or contains(@class, "message") or contains(@id, "message_container")
|
| 71 |
+
or contains(@id, "akismet") or contains(@class, "akismet")] """,
|
| 72 |
+
]
|
| 73 |
+
|
| 74 |
+
TEASER_DISCARD_XPATH = [
|
| 75 |
+
""".//*[(self::div or self::item or self::ul
|
| 76 |
+
or self::p or self::section or self::span)][
|
| 77 |
+
contains(translate(@id, "T", "t"), "teaser") or contains(translate(@class, "T", "t"), "teaser")
|
| 78 |
+
]""",
|
| 79 |
+
]
|
| 80 |
+
|
| 81 |
+
PRECISION_DISCARD_XPATH = [
|
| 82 |
+
".//header",
|
| 83 |
+
""".//*[(self::div or self::item or self::ul
|
| 84 |
+
or self::p or self::section or self::span)][
|
| 85 |
+
contains(@id, "bottom") or contains(@class, "bottom") or
|
| 86 |
+
contains(@id, "link") or contains(@class, "link")
|
| 87 |
+
or contains(@style, "border")
|
| 88 |
+
]""",
|
| 89 |
+
]
|
| 90 |
+
|
| 91 |
+
DISCARD_IMAGE_ELEMENTS = [
|
| 92 |
+
""".//*[(self::div or self::item or self::ul
|
| 93 |
+
or self::p or self::section or self::span)][
|
| 94 |
+
contains(@id, "caption") or contains(@class, "caption")
|
| 95 |
+
]
|
| 96 |
+
"""
|
| 97 |
+
]
|
| 98 |
+
|
| 99 |
+
REMOVE_COMMENTS_XPATH = [
|
| 100 |
+
""".//*[(self::div or self::ul or self::section)][
|
| 101 |
+
starts-with(translate(@id, "C","c"), 'comment') or
|
| 102 |
+
starts-with(translate(@class, "C","c"), 'comment') or starts-with(translate(@name, "C","c"), 'comment') or
|
| 103 |
+
contains(@class, 'article-comments') or contains(@class, 'post-comments')
|
| 104 |
+
or starts-with(@id, 'comol') or starts-with(@id, 'disqus_thread')
|
| 105 |
+
or starts-with(@id, 'dsq-comments')
|
| 106 |
+
]"""
|
| 107 |
+
]
|
| 108 |
+
|
| 109 |
+
CONTENT_EXTRACTOR_NOISE_XPATHS = [
|
| 110 |
+
# '//div[contains(@class, "comment") or contains(@name, "comment") or contains(@id, "comment")]',
|
| 111 |
+
'//div[starts-with(@class, "advert") or starts-with(@name, "advert") or starts-with(@id, "advert")]',
|
| 112 |
+
'//div[contains(@style, "display: none")]',
|
| 113 |
+
'//div[contains(@style, "display:none")]',
|
| 114 |
+
]
|
| 115 |
+
|
| 116 |
+
# Keep images, audio, and video (their tags are not listed below)
|
| 117 |
+
MANUALLY_CLEANED = [
|
| 118 |
+
"aside",
|
| 119 |
+
"embed",
|
| 120 |
+
"footer",
|
| 121 |
+
"head",
|
| 122 |
+
"iframe",
|
| 123 |
+
"menu",
|
| 124 |
+
"object",
|
| 125 |
+
"script",
|
| 126 |
+
"applet",
|
| 127 |
+
"canvas",
|
| 128 |
+
"map",
|
| 129 |
+
"svg",
|
| 130 |
+
"area",
|
| 131 |
+
"blink",
|
| 132 |
+
"button",
|
| 133 |
+
"datalist",
|
| 134 |
+
"dialog",
|
| 135 |
+
"frame",
|
| 136 |
+
"frameset",
|
| 137 |
+
"fieldset",
|
| 138 |
+
"hr",
|
| 139 |
+
"link",
|
| 140 |
+
"input",
|
| 141 |
+
"ins",
|
| 142 |
+
"label",
|
| 143 |
+
"legend",
|
| 144 |
+
"marquee",
|
| 145 |
+
"menuitem",
|
| 146 |
+
"nav",
|
| 147 |
+
"noscript",
|
| 148 |
+
"optgroup",
|
| 149 |
+
"option",
|
| 150 |
+
"output",
|
| 151 |
+
"param",
|
| 152 |
+
"progress",
|
| 153 |
+
"rp",
|
| 154 |
+
"rt",
|
| 155 |
+
"rtc",
|
| 156 |
+
"select",
|
| 157 |
+
"style",
|
| 158 |
+
"track",
|
| 159 |
+
"textarea",
|
| 160 |
+
"time",
|
| 161 |
+
"use",
|
| 162 |
+
]
|
| 163 |
+
|
| 164 |
+
MANUALLY_STRIPPED = [
|
| 165 |
+
"abbr",
|
| 166 |
+
"acronym",
|
| 167 |
+
"address",
|
| 168 |
+
"bdi",
|
| 169 |
+
"bdo",
|
| 170 |
+
"big",
|
| 171 |
+
"cite",
|
| 172 |
+
"data",
|
| 173 |
+
"dfn",
|
| 174 |
+
"font",
|
| 175 |
+
"hgroup",
|
| 176 |
+
"ins",
|
| 177 |
+
"mark",
|
| 178 |
+
"meta",
|
| 179 |
+
"ruby",
|
| 180 |
+
"small",
|
| 181 |
+
"tbody",
|
| 182 |
+
"template",
|
| 183 |
+
"tfoot",
|
| 184 |
+
"thead",
|
| 185 |
+
]
|
| 186 |
+
|
| 187 |
+
CUT_EMPTY_ELEMS = {
|
| 188 |
+
"article",
|
| 189 |
+
"b",
|
| 190 |
+
"blockquote",
|
| 191 |
+
"dd",
|
| 192 |
+
"div",
|
| 193 |
+
"dt",
|
| 194 |
+
"em",
|
| 195 |
+
"h1",
|
| 196 |
+
"h2",
|
| 197 |
+
"h3",
|
| 198 |
+
"h4",
|
| 199 |
+
"h5",
|
| 200 |
+
"h6",
|
| 201 |
+
"i",
|
| 202 |
+
"li",
|
| 203 |
+
"main",
|
| 204 |
+
"p",
|
| 205 |
+
"pre",
|
| 206 |
+
"q",
|
| 207 |
+
"section",
|
| 208 |
+
"span",
|
| 209 |
+
"strong",
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
USELESS_ATTR = [
|
| 213 |
+
"share",
|
| 214 |
+
"contribution",
|
| 215 |
+
"copyright",
|
| 216 |
+
"copy-right",
|
| 217 |
+
"disclaimer",
|
| 218 |
+
"recommend",
|
| 219 |
+
"related",
|
| 220 |
+
"footer",
|
| 221 |
+
"social",
|
| 222 |
+
"submeta",
|
| 223 |
+
"report-infor",
|
| 224 |
+
]
|
| 225 |
+
|
| 226 |
+
BODY_XPATH = [
|
| 227 |
+
""".//*[(self::article or self::div or self::main or self::section)][
|
| 228 |
+
@class="post" or @class="entry" or
|
| 229 |
+
contains(@class, "post-text") or contains(@class, "post_text") or
|
| 230 |
+
contains(@class, "post-body") or contains(@class, "post-entry") or contains(@class, "postentry") or
|
| 231 |
+
contains(@class, "post-content") or contains(@class, "post_content") or
|
| 232 |
+
contains(@class, "postcontent") or contains(@class, "postContent") or
|
| 233 |
+
contains(@class, "article-text") or contains(@class, "articletext") or contains(@class, "articleText")
|
| 234 |
+
or contains(@id, "entry-content") or
|
| 235 |
+
contains(@class, "entry-content") or contains(@id, "article-content") or
|
| 236 |
+
contains(@class, "article-content") or contains(@id, "article__content") or
|
| 237 |
+
contains(@class, "article__content") or contains(@id, "article-body") or
|
| 238 |
+
contains(@class, "article-body") or contains(@id, "article__body") or
|
| 239 |
+
contains(@class, "article__body") or @itemprop="articleBody" or
|
| 240 |
+
contains(translate(@id, "B", "b"), "articlebody") or contains(translate(@class, "B", "b"), "articlebody")
|
| 241 |
+
or @id="articleContent" or contains(@class, "ArticleContent") or
|
| 242 |
+
contains(@class, "page-content") or contains(@class, "text-content") or
|
| 243 |
+
contains(@id, "body-text") or contains(@class, "body-text") or contains(@class, "body-content") or contains(translate(@class, "B", "b"), "textbody") or
|
| 244 |
+
contains(@class, "article__container") or contains(@id, "art-content") or contains(@class, "art-content")][1]""",
|
| 245 |
+
"(.//article)[1]",
|
| 246 |
+
"""(.//*[(self::article or self::div or self::main or self::section)][
|
| 247 |
+
contains(@class, 'post-bodycopy') or
|
| 248 |
+
contains(@class, 'storycontent') or contains(@class, 'story-content') or
|
| 249 |
+
@class='postarea' or @class='art-postcontent' or
|
| 250 |
+
contains(@class, 'theme-content') or contains(@class, 'blog-content') or
|
| 251 |
+
contains(@class, 'section-content') or contains(@class, 'single-content') or
|
| 252 |
+
contains(@class, 'single-post') or
|
| 253 |
+
contains(@class, 'main-column') or contains(@class, 'wpb_text_column') or
|
| 254 |
+
starts-with(@id, 'primary') or starts-with(@class, 'article ') or @class="text" or
|
| 255 |
+
@id="article" or @class="cell" or @id="story" or @class="story" or
|
| 256 |
+
contains(@class, "story-body") or contains(@class, "field-body") or
|
| 257 |
+
contains(translate(@class, "FULTEX","fultex"), "fulltext")
|
| 258 |
+
or @role='article'])[1]""",
|
| 259 |
+
"""(.//*[(self::article or self::div or self::main or self::section)][
|
| 260 |
+
contains(@id, "content-main") or contains(@class, "content-main") or contains(@class, "content_main") or
|
| 261 |
+
contains(@id, "content-body") or contains(@class, "content-body") or contains(@id, "contentBody")
|
| 262 |
+
or contains(@class, "content__body") or contains(translate(@id, "CM","cm"), "main-content") or contains(translate(@class, "CM","cm"), "main-content")
|
| 263 |
+
or contains(translate(@class, "CP","cp"), "page-content") or
|
| 264 |
+
@id="content" or @class="content"])[1]""",
|
| 265 |
+
'(.//*[(self::article or self::div or self::section)][starts-with(@class, "main") or starts-with(@id, "main") or starts-with(@role, "main")])[1]|(.//main)[1]',
|
| 266 |
+
]
|
| 267 |
+
|
| 268 |
+
Forum_XPATH = [
|
| 269 |
+
""".//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][
|
| 270 |
+
contains(@id, 'question') or contains(@class, 'question')]""",
|
| 271 |
+
""".//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][
|
| 272 |
+
contains(@id, 'answer') or contains(@class, 'answer')]""",
|
| 273 |
+
""".//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][
|
| 274 |
+
contains(@id, 'comment') or contains(@class, 'comment') or contains(@class, 'Comment')]""",
|
| 275 |
+
""".//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][contains(@class, "message-container") or contains(@id, "message_container") or contains(@class, "Messages_container")]""",
|
| 276 |
+
""".//*[(self::article or self::div or self::main or self::section or self::p or self::span or self::li or self::tr)][
|
| 277 |
+
contains(@id, 'comment-content') or contains(@class, 'comment-content') or contains(@class, 'comment-body') or contains(@class, 'comment-body') or contains(@class, "post-reply") or contains(@class, "reply_content") or contains(@class, "reply-content") or contains(@class, "reply_post") or contains(@class, "post-reply") or contains(@id, "reply") or contains(@class, "post-text") or contains(@class, "post_text") or
|
| 278 |
+
contains(@class, "post-body") or contains(@class, "postbody") or contains(@class, "post-entry") or contains(@class, "postentry") or contains(@component, 'post') or
|
| 279 |
+
contains(@class, "post-content") or contains(@class, "post_content") or contains(@class, "p_content") or contains(@class, "Post_content") or contains(@class, "message-post") or contains(@class, "js-post")]""",
|
| 280 |
+
# ids of the form "post-" (or "post_") followed by a number
|
| 281 |
+
""".//*[(self::article or self::div or self::main or self::section or self::p or self::span or self::li or self::tr)][contains(@id, 'post-') or contains(@id, 'post_')]"""
|
| 282 |
+
]
|
| 283 |
+
|
| 284 |
+
METAS = [
|
| 285 |
+
'//meta[starts-with(@property, "og:title")]/@content',
|
| 286 |
+
'//meta[starts-with(@name, "og:title")]/@content',
|
| 287 |
+
'//meta[starts-with(@property, "title")]/@content',
|
| 288 |
+
'//meta[starts-with(@name, "title")]/@content',
|
| 289 |
+
'//meta[starts-with(@property, "page:title")]/@content',
|
| 290 |
+
'//meta[starts-with(@name, "page:title")]/@content',
|
| 291 |
+
]
|
| 292 |
+
URL_PATTERNS_TO_HTML_TYPE = {
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
# Built-in site adaptation rules (matched by URL pattern, handled by CustomParser)
|
| 296 |
+
BUILTIN_SITE_RULES = {
|
| 297 |
+
# Adaptation for the answers.com family of sites
|
| 298 |
+
"answers.com": {
|
| 299 |
+
"clean": [
|
| 300 |
+
"//script",
|
| 301 |
+
"//style",
|
| 302 |
+
],
|
| 303 |
+
"title": {
|
| 304 |
+
"mode": "xpath",
|
| 305 |
+
"value": "//h1[@property='name']//text() | //h1[contains(@class, 'headline1')]//text()"
|
| 306 |
+
},
|
| 307 |
+
"content": {
|
| 308 |
+
"mode": "xpath",
|
| 309 |
+
# Extract only the answer content
|
| 310 |
+
"value": "//div[@property='content'] | //div[contains(@class, 'markdownStyles')]"
|
| 311 |
+
}
|
| 312 |
+
},
|
| 313 |
+
}
|
| 314 |
+
|
| 315 |
+
SCORING_WEIGHTS = {
|
| 316 |
+
"content_length": 1.0,
|
| 317 |
+
"paragraph_quality": 0.0,
|
| 318 |
+
"link_density": 0.0,
|
| 319 |
+
"text_density": 0.0,
|
| 320 |
+
"punctuation_density": 0.0,
|
| 321 |
+
"structure_completeness": 0.0,
|
| 322 |
+
"xpath_confidence": 0.0,
|
| 323 |
+
"noise_elements": 0.0,
|
| 324 |
+
"code_block_quality": 0.0,
|
| 325 |
+
"list_structure": 0.0,
|
| 326 |
+
}
|
| 327 |
+
|
| 328 |
+
SCORE_THRESHOLDS = {
|
| 329 |
+
"min_acceptable_score": 3.0,
|
| 330 |
+
"similar_threshold": 0.5,
|
| 331 |
+
}
|
| 332 |
+
|
ultradata_math_parser/mmltex/cmarkup.xsl
ADDED
|
@@ -0,0 +1,1093 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version='1.0' encoding="UTF-8"?>
|
| 2 |
+
<xsl:stylesheet
|
| 3 |
+
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
| 4 |
+
xmlns:m="http://www.w3.org/1998/Math/MathML"
|
| 5 |
+
version='1.0'>
|
| 6 |
+
|
| 7 |
+
<!-- ====================================================================== -->
|
| 8 |
+
<!-- $id: tokens.xsl, 2002/22/11 Exp $
|
| 9 |
+
This file is part of the XSLT MathML Library distribution.
|
| 10 |
+
See ./README or http://www.raleigh.ru/MathML/mmltex for
|
| 11 |
+
copyright and other information -->
|
| 12 |
+
<!-- ====================================================================== -->
|
| 13 |
+
|
| 14 |
+
<!-- 4.4.1.1 cn -->
|
| 15 |
+
<xsl:template match="m:cn"><xsl:apply-templates/></xsl:template>
|
| 16 |
+
|
| 17 |
+
<xsl:template match="m:cn[@type='complex-cartesian']">
|
| 18 |
+
<xsl:apply-templates select="text()[1]"/>
|
| 19 |
+
<xsl:text>+</xsl:text>
|
| 20 |
+
<xsl:apply-templates select="text()[2]"/>
|
| 21 |
+
<xsl:text>i</xsl:text>
|
| 22 |
+
</xsl:template>
|
| 23 |
+
|
| 24 |
+
<xsl:template match="m:cn[@type='rational']">
|
| 25 |
+
<xsl:apply-templates select="text()[1]"/>
|
| 26 |
+
<xsl:text>/</xsl:text>
|
| 27 |
+
<xsl:apply-templates select="text()[2]"/>
|
| 28 |
+
</xsl:template>
|
| 29 |
+
|
| 30 |
+
<xsl:template match="m:cn[@type='integer' and @base!=10]">
|
| 31 |
+
<xsl:apply-templates/>
|
| 32 |
+
<xsl:text>_{</xsl:text><xsl:value-of select="@base"/><xsl:text>}</xsl:text>
|
| 33 |
+
</xsl:template>
|
| 34 |
+
|
| 35 |
+
<xsl:template match="m:cn[@type='complex-polar']">
|
| 36 |
+
<xsl:apply-templates select="text()[1]"/>
|
| 37 |
+
<xsl:text>e^{i </xsl:text>
|
| 38 |
+
<xsl:apply-templates select="text()[2]"/>
|
| 39 |
+
<xsl:text>}</xsl:text>
|
| 40 |
+
</xsl:template>
|
| 41 |
+
|
| 42 |
+
<xsl:template match="m:cn[@type='e-notation']">
|
| 43 |
+
<xsl:apply-templates select="text()[1]"/>
|
| 44 |
+
<xsl:text>E</xsl:text>
|
| 45 |
+
<xsl:apply-templates select="text()[2]"/>
|
| 46 |
+
</xsl:template>
|
| 47 |
+
|
| 48 |
+
<!-- 4.4.1.2 ci 4.4.1.3 csymbol -->
|
| 49 |
+
<xsl:template match="m:ci | m:csymbol">
|
| 50 |
+
<xsl:choose>
|
| 51 |
+
<xsl:when test="string-length(normalize-space(text()))>1">
|
| 52 |
+
<xsl:text>\mathrm{</xsl:text><xsl:apply-templates/><xsl:text>}</xsl:text>
|
| 53 |
+
</xsl:when>
|
| 54 |
+
<xsl:otherwise><xsl:apply-templates/></xsl:otherwise>
|
| 55 |
+
</xsl:choose>
|
| 56 |
+
</xsl:template>
|
| 57 |
+
|
| 58 |
+
<!-- 4.4.2.1 apply 4.4.2.2 reln -->
|
| 59 |
+
<xsl:template match="m:apply | m:reln">
|
| 60 |
+
<xsl:apply-templates select="*[1]">
|
| 61 |
+
<!-- <? -->
|
| 62 |
+
<xsl:with-param name="p" select="10"/>
|
| 63 |
+
</xsl:apply-templates>
|
| 64 |
+
<!-- ?> -->
|
| 65 |
+
<xsl:text>(</xsl:text>
|
| 66 |
+
<xsl:for-each select="*[position()>1]">
|
| 67 |
+
<xsl:apply-templates select="."/>
|
| 68 |
+
<xsl:if test="not(position()=last())"><xsl:text>, </xsl:text></xsl:if>
|
| 69 |
+
</xsl:for-each>
|
| 70 |
+
<xsl:text>)</xsl:text>
|
| 71 |
+
</xsl:template>
|
| 72 |
+
|
| 73 |
+
<!-- 4.4.2.3 fn -->
|
| 74 |
+
<xsl:template match="m:fn[m:apply[1]]"> <!-- for m:fn using default rule -->
|
| 75 |
+
<xsl:text>(</xsl:text><xsl:apply-templates/><xsl:text>)</xsl:text>
|
| 76 |
+
</xsl:template>
|
| 77 |
+
|
| 78 |
+
<!-- 4.4.2.4 interval -->
|
| 79 |
+
<xsl:template match="m:interval[*[2]]">
|
| 80 |
+
<xsl:choose>
|
| 81 |
+
<xsl:when test="@closure='open' or @closure='open-closed'">
|
| 82 |
+
<xsl:text>\left(</xsl:text>
|
| 83 |
+
</xsl:when>
|
| 84 |
+
<xsl:otherwise><xsl:text>\left[</xsl:text></xsl:otherwise>
|
| 85 |
+
</xsl:choose>
|
| 86 |
+
<xsl:apply-templates select="*[1]"/>
|
| 87 |
+
<xsl:text> , </xsl:text>
|
| 88 |
+
<xsl:apply-templates select="*[2]"/>
|
| 89 |
+
<xsl:choose>
|
| 90 |
+
<xsl:when test="@closure='open' or @closure='closed-open'">
|
| 91 |
+
<xsl:text>\right)</xsl:text>
|
| 92 |
+
</xsl:when>
|
| 93 |
+
<xsl:otherwise><xsl:text>\right]</xsl:text></xsl:otherwise>
|
| 94 |
+
</xsl:choose>
|
| 95 |
+
</xsl:template>
|
| 96 |
+
|
| 97 |
+
<xsl:template match="m:interval">
|
| 98 |
+
<xsl:text>\left\{</xsl:text><xsl:apply-templates/><xsl:text>\right\}</xsl:text>
|
| 99 |
+
</xsl:template>
|
| 100 |
+
|
| 101 |
+
<!-- 4.4.2.5 inverse -->
|
| 102 |
+
<xsl:template match="m:apply[*[1][self::m:inverse]]">
|
| 103 |
+
<xsl:apply-templates select="*[2]"/><xsl:text>^{(-1)}</xsl:text>
|
| 104 |
+
</xsl:template>
|
| 105 |
+
|
| 106 |
+
<!-- 4.4.2.6 sep 4.4.2.7 condition -->
|
| 107 |
+
<xsl:template match="m:sep | m:condition"><xsl:apply-templates/></xsl:template>
|
| 108 |
+
|
| 109 |
+
<!-- 4.4.2.9 lambda -->
|
| 110 |
+
<xsl:template match="m:lambda">
|
| 111 |
+
<xsl:text>\mathrm{lambda}\: </xsl:text>
|
| 112 |
+
<xsl:apply-templates select="m:bvar/*"/>
|
| 113 |
+
<xsl:text>.\: </xsl:text>
|
| 114 |
+
<xsl:apply-templates select="*[last()]"/>
|
| 115 |
+
</xsl:template>
|
| 116 |
+
|
| 117 |
+
<!-- 4.4.2.10 compose -->
|
| 118 |
+
<xsl:template match="m:apply[*[1][self::m:compose]]">
|
| 119 |
+
<xsl:param name="p" select="0"/>
|
| 120 |
+
<xsl:call-template name="infix">
|
| 121 |
+
<xsl:with-param name="this-p" select="1"/>
|
| 122 |
+
<xsl:with-param name="p" select="$p"/>
|
| 123 |
+
<xsl:with-param name="mo">\circ </xsl:with-param>
|
| 124 |
+
</xsl:call-template>
|
| 125 |
+
</xsl:template>
|
| 126 |
+
|
| 127 |
+
<!-- 4.4.2.11 ident -->
|
| 128 |
+
<xsl:template match="m:ident"><xsl:text>\mathrm{id}</xsl:text></xsl:template>
|
| 129 |
+
|
| 130 |
+
<!-- 4.4.2.12 domain 4.4.2.13 codomain 4.4.2.14 image 4.4.3.21 arg 4.4.3.24 lcm
|
| 131 |
+
4.4.5.9 grad 4.4.5.10 curl 4.4.9.4 median 4.4.9.5 mode-->
|
| 132 |
+
<xsl:template match="m:domain | m:codomain | m:image | m:arg | m:lcm | m:grad |
|
| 133 |
+
m:curl | m:median | m:mode">
|
| 134 |
+
<xsl:text>\mathop{\mathrm{</xsl:text>
|
| 135 |
+
<xsl:value-of select="local-name()"/>
|
| 136 |
+
<xsl:text>}}</xsl:text>
|
| 137 |
+
</xsl:template>
|
| 138 |
+
|
| 139 |
+
<!-- 4.4.2.15 domainofapplication -->
|
| 140 |
+
<xsl:template match="m:domainofapplication"/>
|
| 141 |
+
|
| 142 |
+
<!-- 4.4.2.16 piecewise -->
|
| 143 |
+
<xsl:template match="m:piecewise">
|
| 144 |
+
<xsl:text>\begin{cases}</xsl:text>
|
| 145 |
+
<xsl:apply-templates select="m:piece"/>
|
| 146 |
+
<xsl:apply-templates select="m:otherwise"/>
|
| 147 |
+
<xsl:text>\end{cases}</xsl:text>
|
| 148 |
+
</xsl:template>
|
| 149 |
+
|
| 150 |
+
<xsl:template match="m:piece">
|
| 151 |
+
<xsl:apply-templates select="*[1]"/>
|
| 152 |
+
<xsl:text> &amp; \text{if $</xsl:text>
|
| 153 |
+
<xsl:apply-templates select="*[2]"/>
|
| 154 |
+
<xsl:text>$}</xsl:text>
|
| 155 |
+
<xsl:if test="not(position()=last()) or ../m:otherwise"><xsl:text>\\ </xsl:text></xsl:if>
|
| 156 |
+
</xsl:template>
|
| 157 |
+
|
| 158 |
+
<xsl:template match="m:otherwise">
|
| 159 |
+
<xsl:apply-templates select="*[1]"/>
|
| 160 |
+
<xsl:text> &amp; \text{otherwise}</xsl:text>
|
| 161 |
+
</xsl:template>
|
| 162 |
+
|
| 163 |
+
<!-- 4.4.3.1 quotient -->
|
| 164 |
+
<xsl:template match="m:apply[*[1][self::m:quotient]]">
|
| 165 |
+
<xsl:text>\left\lfloor\frac{</xsl:text>
|
| 166 |
+
<xsl:apply-templates select="*[2]"/>
|
| 167 |
+
<xsl:text>}{</xsl:text>
|
| 168 |
+
<xsl:apply-templates select="*[3]"/>
|
| 169 |
+
<xsl:text>}\right\rfloor </xsl:text>
|
| 170 |
+
</xsl:template>
|
| 171 |
+
|
| 172 |
+
<!-- 4.4.3.2 factorial -->
|
| 173 |
+
<xsl:template match="m:apply[*[1][self::m:factorial]]">
|
| 174 |
+
<xsl:apply-templates select="*[2]">
|
| 175 |
+
<xsl:with-param name="p" select="7"/>
|
| 176 |
+
</xsl:apply-templates>
|
| 177 |
+
<xsl:text>!</xsl:text>
|
| 178 |
+
</xsl:template>
|
| 179 |
+
|
| 180 |
+
<!-- 4.4.3.3 divide -->
|
| 181 |
+
<xsl:template match="m:apply[*[1][self::m:divide]]">
|
| 182 |
+
<xsl:param name="p" select="0"/>
|
| 183 |
+
<xsl:param name="this-p" select="3"/>
|
| 184 |
+
<xsl:if test="$this-p &lt; $p"><xsl:text>\left(</xsl:text></xsl:if>
|
| 185 |
+
<xsl:text>\frac{</xsl:text>
|
| 186 |
+
<xsl:apply-templates select="*[2]"/>
|
| 187 |
+
<!-- <xsl:with-param name="p" select="$this-p"/>
|
| 188 |
+
</xsl:apply-templates>-->
|
| 189 |
+
<xsl:text>}{</xsl:text>
|
| 190 |
+
<xsl:apply-templates select="*[3]"/>
|
| 191 |
+
<!-- <xsl:with-param name="p" select="$this-p"/>
|
| 192 |
+
</xsl:apply-templates>-->
|
| 193 |
+
<xsl:text>}</xsl:text>
|
| 194 |
+
<xsl:if test="$this-p &lt; $p"><xsl:text>\right)</xsl:text></xsl:if>
|
| 195 |
+
</xsl:template>
|
| 196 |
+
|
| 197 |
+
<!-- 4.4.3.4 max min -->
|
| 198 |
+
<xsl:template match="m:apply[*[1][self::m:max or self::m:min]]">
|
| 199 |
+
<xsl:text>\</xsl:text>
|
| 200 |
+
<xsl:value-of select="local-name(*[1])"/>
|
| 201 |
+
<xsl:text>\{</xsl:text>
|
| 202 |
+
<xsl:choose>
|
| 203 |
+
<xsl:when test="m:condition">
|
| 204 |
+
<xsl:apply-templates select="*[last()]"/>
|
| 205 |
+
<xsl:text>, </xsl:text>
|
| 206 |
+
<xsl:apply-templates select="m:condition/node()"/>
|
| 207 |
+
</xsl:when>
|
| 208 |
+
<xsl:otherwise>
|
| 209 |
+
<xsl:for-each select="*[position() > 1]">
|
| 210 |
+
<xsl:apply-templates select="."/>
|
| 211 |
+
<xsl:if test="position() !=last()"><xsl:text> , </xsl:text></xsl:if>
|
| 212 |
+
</xsl:for-each>
|
| 213 |
+
</xsl:otherwise>
|
| 214 |
+
</xsl:choose>
|
| 215 |
+
<xsl:text>\}</xsl:text>
|
| 216 |
+
</xsl:template>
|
| 217 |
+
|
| 218 |
+
<!-- 4.4.3.5 minus-->
|
| 219 |
+
<xsl:template match="m:apply[*[1][self::m:minus] and count(*)=2]">
|
| 220 |
+
<xsl:text>-</xsl:text>
|
| 221 |
+
<xsl:apply-templates select="*[2]">
|
| 222 |
+
<xsl:with-param name="p" select="5"/>
|
| 223 |
+
</xsl:apply-templates>
|
| 224 |
+
</xsl:template>
|
| 225 |
+
|
| 226 |
+
<xsl:template match="m:apply[*[1][self::m:minus] and count(*)>2]">
|
| 227 |
+
<xsl:param name="p" select="0"/>
|
| 228 |
+
<xsl:call-template name="binary">
|
| 229 |
+
<xsl:with-param name="mo">-</xsl:with-param>
|
| 230 |
+
<xsl:with-param name="p" select="$p"/>
|
| 231 |
+
<xsl:with-param name="this-p" select="2"/>
|
| 232 |
+
</xsl:call-template>
|
| 233 |
+
</xsl:template>
|
| 234 |
+
|
| 235 |
+
<!-- 4.4.3.6 plus-->
|
| 236 |
+
<xsl:template match="m:apply[*[1][self::m:plus]]">
|
| 237 |
+
<xsl:param name="p" select="0"/>
|
| 238 |
+
<xsl:if test="$p > 2">
|
| 239 |
+
<xsl:text>(</xsl:text>
|
| 240 |
+
</xsl:if>
|
| 241 |
+
<xsl:for-each select="*[position()>1]">
|
| 242 |
+
<xsl:if test="position() > 1">
|
| 243 |
+
<xsl:choose>
|
| 244 |
+
<xsl:when test="self::m:apply[*[1][self::m:times] and
|
| 245 |
+
*[2][self::m:apply/*[1][self::m:minus] or self::m:cn[not(m:sep) and
|
| 246 |
+
(number(.) &lt; 0)]]]">-</xsl:when>
|
| 247 |
+
<xsl:otherwise>+</xsl:otherwise>
|
| 248 |
+
</xsl:choose>
|
| 249 |
+
</xsl:if>
|
| 250 |
+
<xsl:choose>
|
| 251 |
+
<xsl:when test="self::m:apply[*[1][self::m:times] and
|
| 252 |
+
*[2][self::m:cn[not(m:sep) and (number(.) &lt;0)]]]">
|
| 253 |
+
<xsl:value-of select="-(*[2])"/>
|
| 254 |
+
<xsl:apply-templates select=".">
|
| 255 |
+
<xsl:with-param name="first" select="2"/>
|
| 256 |
+
<xsl:with-param name="p" select="2"/>
|
| 257 |
+
</xsl:apply-templates>
|
| 258 |
+
</xsl:when>
|
| 259 |
+
<xsl:when test="self::m:apply[*[1][self::m:times] and
|
| 260 |
+
*[2][self::m:apply/*[1][self::m:minus]]]">
|
| 261 |
+
<xsl:apply-templates select="./*[2]/*[2]"/>
|
| 262 |
+
<xsl:apply-templates select=".">
|
| 263 |
+
<xsl:with-param name="first" select="2"/>
|
| 264 |
+
<xsl:with-param name="p" select="2"/>
|
| 265 |
+
</xsl:apply-templates>
|
| 266 |
+
</xsl:when>
|
| 267 |
+
<xsl:otherwise>
|
| 268 |
+
<xsl:apply-templates select=".">
|
| 269 |
+
<xsl:with-param name="p" select="2"/>
|
| 270 |
+
</xsl:apply-templates>
|
| 271 |
+
</xsl:otherwise>
|
| 272 |
+
</xsl:choose>
|
| 273 |
+
</xsl:for-each>
|
| 274 |
+
<xsl:if test="$p > 2">
|
| 275 |
+
<xsl:text>)</xsl:text>
|
| 276 |
+
</xsl:if>
|
| 277 |
+
</xsl:template>
|
| 278 |
+
|
| 279 |
+
<!-- 4.4.3.7 power -->
|
| 280 |
+
<xsl:template match="m:apply[*[1][self::m:power]]">
|
| 281 |
+
<xsl:apply-templates select="*[2]">
|
| 282 |
+
<xsl:with-param name="p" select="5"/>
|
| 283 |
+
</xsl:apply-templates>
|
| 284 |
+
<xsl:text>^{</xsl:text>
|
| 285 |
+
<xsl:apply-templates select="*[3]">
|
| 286 |
+
<xsl:with-param name="p" select="5"/>
|
| 287 |
+
</xsl:apply-templates>
|
| 288 |
+
<xsl:text>}</xsl:text>
|
| 289 |
+
</xsl:template>
|
| 290 |
+
|
| 291 |
+
<!-- 4.4.3.8 remainder -->
|
| 292 |
+
<xsl:template match="m:apply[*[1][self::m:rem]]">
|
| 293 |
+
<xsl:param name="p" select="0"/>
|
| 294 |
+
<xsl:call-template name="binary">
|
| 295 |
+
<xsl:with-param name="mo">\mod </xsl:with-param>
|
| 296 |
+
<xsl:with-param name="p" select="$p"/>
|
| 297 |
+
<xsl:with-param name="this-p" select="3"/>
|
| 298 |
+
</xsl:call-template>
|
| 299 |
+
</xsl:template>
|
| 300 |
+
|
| 301 |
+
<!-- 4.4.3.9 times-->
|
| 302 |
+
<xsl:template match="m:apply[*[1][self::m:times]]" name="times">
|
| 303 |
+
<xsl:param name="p" select="0"/>
|
| 304 |
+
<xsl:param name="first" select="1"/>
|
| 305 |
+
<xsl:if test="$p > 3"><xsl:text>(</xsl:text></xsl:if>
|
| 306 |
+
<xsl:for-each select="*[position()>1]">
|
| 307 |
+
<xsl:if test="position() > 1">
|
| 308 |
+
<xsl:choose>
|
| 309 |
+
<xsl:when test="self::m:cn">\times <!-- times --></xsl:when>
|
| 310 |
+
<xsl:otherwise><!--invisible times--></xsl:otherwise>
|
| 311 |
+
</xsl:choose>
|
| 312 |
+
</xsl:if>
|
| 313 |
+
<xsl:if test="position()>= $first">
|
| 314 |
+
<xsl:apply-templates select=".">
|
| 315 |
+
<xsl:with-param name="p" select="3"/>
|
| 316 |
+
</xsl:apply-templates>
|
| 317 |
+
</xsl:if>
|
| 318 |
+
</xsl:for-each>
|
| 319 |
+
<xsl:if test="$p > 3"><xsl:text>)</xsl:text></xsl:if>
|
| 320 |
+
</xsl:template>
|
| 321 |
+
|
| 322 |
+
<!-- 4.4.3.10 root -->
|
| 323 |
+
<xsl:template match="m:apply[*[1][self::m:root]]">
|
| 324 |
+
<xsl:text>\sqrt</xsl:text>
|
| 325 |
+
<xsl:if test="m:degree!=2">
|
| 326 |
+
<xsl:text>[</xsl:text>
|
| 327 |
+
<xsl:apply-templates select="m:degree/*"/>
|
| 328 |
+
<xsl:text>]</xsl:text>
|
| 329 |
+
</xsl:if>
|
| 330 |
+
<xsl:text>{</xsl:text>
|
| 331 |
+
<xsl:apply-templates select="*[position()>1 and not(self::m:degree)]"/>
|
| 332 |
+
<xsl:text>}</xsl:text>
|
| 333 |
+
</xsl:template>
|
| 334 |
+
|
| 335 |
+
<!-- 4.4.3.11 gcd -->
|
| 336 |
+
<xsl:template match="m:gcd"><xsl:text>\gcd </xsl:text></xsl:template>
|
| 337 |
+
|
| 338 |
+
<!-- 4.4.3.12 and -->
|
| 339 |
+
<xsl:template match="m:apply[*[1][self::m:and]]">
|
| 340 |
+
<xsl:param name="p" select="0"/>
|
| 341 |
+
<xsl:call-template name="infix">
|
| 342 |
+
<xsl:with-param name="this-p" select="2"/>
|
| 343 |
+
<xsl:with-param name="p" select="$p"/>
|
| 344 |
+
<xsl:with-param name="mo">\land <!-- and --></xsl:with-param>
|
| 345 |
+
</xsl:call-template>
|
| 346 |
+
</xsl:template>
|
| 347 |
+
|
| 348 |
+
<!-- 4.4.3.13 or -->
|
| 349 |
+
<xsl:template match="m:apply[*[1][self::m:or]]">
|
| 350 |
+
<xsl:param name="p" select="0"/>
|
| 351 |
+
<xsl:call-template name="infix">
|
| 352 |
+
<xsl:with-param name="this-p" select="3"/>
|
| 353 |
+
<xsl:with-param name="p" select="$p"/>
|
| 354 |
+
<xsl:with-param name="mo">\lor </xsl:with-param>
|
| 355 |
+
</xsl:call-template>
|
| 356 |
+
</xsl:template>
|
| 357 |
+
|
| 358 |
+
<!-- 4.4.3.14 xor -->
|
| 359 |
+
<xsl:template match="m:apply[*[1][self::m:xor]]">
|
| 360 |
+
<xsl:param name="p" select="0"/>
|
| 361 |
+
<xsl:call-template name="infix">
|
| 362 |
+
<xsl:with-param name="this-p" select="3"/>
|
| 363 |
+
<xsl:with-param name="p" select="$p"/>
|
| 364 |
+
<xsl:with-param name="mo">\mathop{\mathrm{xor}}</xsl:with-param>
|
| 365 |
+
</xsl:call-template>
|
| 366 |
+
</xsl:template>
|
| 367 |
+
|
| 368 |
+
<!-- 4.4.3.15 not -->
|
| 369 |
+
<xsl:template match="m:apply[*[1][self::m:not]]">
|
| 370 |
+
<xsl:text>\neg </xsl:text>
|
| 371 |
+
<xsl:apply-templates select="*[2]">
|
| 372 |
+
<xsl:with-param name="p" select="7"/>
|
| 373 |
+
</xsl:apply-templates>
|
| 374 |
+
</xsl:template>
|
| 375 |
+
|
| 376 |
+
<!-- 4.4.3.16 implies -->
|
| 377 |
+
<xsl:template match="m:apply[*[1][self::m:implies]]">
|
| 378 |
+
<xsl:param name="p" select="0"/>
|
| 379 |
+
<xsl:call-template name="binary">
|
| 380 |
+
<xsl:with-param name="mo">\implies </xsl:with-param>
|
| 381 |
+
<xsl:with-param name="p" select="$p"/>
|
| 382 |
+
<xsl:with-param name="this-p" select="3"/>
|
| 383 |
+
</xsl:call-template>
|
| 384 |
+
</xsl:template>
|
| 385 |
+
|
| 386 |
+
<!-- 4.4.3.17 forall 4.4.3.18 exists -->
|
| 387 |
+
<xsl:template match="m:apply[*[1][self::m:forall or self::m:exists]]">
|
| 388 |
+
<xsl:text>\</xsl:text>
|
| 389 |
+
<xsl:value-of select="local-name(*[1])"/>
|
| 390 |
+
<xsl:text> </xsl:text>
|
| 391 |
+
<xsl:apply-templates select="m:bvar"/>
|
| 392 |
+
<xsl:if test="m:condition">
|
| 393 |
+
<xsl:text>, </xsl:text><xsl:apply-templates select="m:condition"/>
|
| 394 |
+
</xsl:if>
|
| 395 |
+
<xsl:if test="*[last()][local-name()!='condition'][local-name()!='bvar']">
|
| 396 |
+
<xsl:text>\colon </xsl:text>
|
| 397 |
+
<xsl:apply-templates select="*[last()]"/>
|
| 398 |
+
</xsl:if>
|
| 399 |
+
</xsl:template>
|
| 400 |
+
|
| 401 |
+
<!-- 4.4.3.19 abs -->
|
| 402 |
+
<xsl:template match="m:apply[*[1][self::m:abs]]">
|
| 403 |
+
<xsl:text>\left|</xsl:text>
|
| 404 |
+
<xsl:apply-templates select="*[2]"/>
|
| 405 |
+
<xsl:text>\right|</xsl:text>
|
| 406 |
+
</xsl:template>
|
| 407 |
+
|
| 408 |
+
<!-- 4.4.3.20 conjugate -->
|
| 409 |
+
<xsl:template match="m:apply[*[1][self::m:conjugate]]">
|
| 410 |
+
<xsl:text>\overline{</xsl:text><xsl:apply-templates select="*[2]"/><xsl:text>}</xsl:text>
|
| 411 |
+
</xsl:template>
|
| 412 |
+
|
| 413 |
+
<!-- 4.4.3.22 real -->
|
| 414 |
+
<xsl:template match="m:real"><xsl:text>\Re </xsl:text></xsl:template>
|
| 415 |
+
|
| 416 |
+
<!-- 4.4.3.23 imaginary -->
|
| 417 |
+
<xsl:template match="m:imaginary"><xsl:text>\Im </xsl:text></xsl:template>
|
| 418 |
+
|
| 419 |
+
<!-- 4.4.3.25 floor -->
|
| 420 |
+
<xsl:template match="m:apply[*[1][self::m:floor]]">
|
| 421 |
+
<xsl:text>\lfloor </xsl:text>
|
| 422 |
+
<xsl:apply-templates select="*[2]"/>
|
| 423 |
+
<xsl:text>\rfloor </xsl:text>
|
| 424 |
+
</xsl:template>
|
| 425 |
+
|
| 426 |
+
<!-- 4.4.3.26 ceiling -->
|
| 427 |
+
<xsl:template match="m:apply[*[1][self::m:ceiling]]">
|
| 428 |
+
<xsl:text>\lceil </xsl:text>
|
| 429 |
+
<xsl:apply-templates select="*[2]"/>
|
| 430 |
+
<xsl:text>\rceil </xsl:text>
|
| 431 |
+
</xsl:template>
|
| 432 |
+
|
| 433 |
+
<!-- 4.4.4.1 eq -->
|
| 434 |
+
<!-- Equality: all operands joined by "=" through the shared "infix" template
     at precedence level 1; $p is the precedence handed down by the caller. -->
<xsl:template match="m:apply[*[1][self::m:eq]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="infix">
    <xsl:with-param name="mo" select="'='"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="this-p" select="1"/>
  </xsl:call-template>
</xsl:template>
|
| 442 |
+
|
| 443 |
+
<!-- 4.4.4.2 neq -->
|
| 444 |
+
<!-- Inequality: operands joined by \neq via "infix" at precedence level 1. -->
<xsl:template match="m:apply[*[1][self::m:neq]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="infix">
    <xsl:with-param name="mo" select="'\neq '"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="this-p" select="1"/>
  </xsl:call-template>
</xsl:template>
|
| 452 |
+
|
| 453 |
+
<!-- 4.4.4.3 gt -->
|
| 454 |
+
<!-- Greater-than: operands joined by "> " via "infix" at precedence level 1. -->
<xsl:template match="m:apply[*[1][self::m:gt]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="infix">
    <xsl:with-param name="mo" select="'&gt; '"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="this-p" select="1"/>
  </xsl:call-template>
</xsl:template>
|
| 462 |
+
|
| 463 |
+
<!-- 4.4.4.4 lt -->
|
| 464 |
+
<!-- Less-than: operands joined by "< " via "infix" at precedence level 1.
     The operator must be written as the &lt; entity: a raw less-than sign in
     element content makes the stylesheet not well-formed XML. -->
<xsl:template match="m:apply[*[1][self::m:lt]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="infix">
    <xsl:with-param name="this-p" select="1"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="mo">&lt; </xsl:with-param>
  </xsl:call-template>
</xsl:template>
|
| 472 |
+
|
| 473 |
+
<!-- 4.4.4.5 geq -->
|
| 474 |
+
<!-- Greater-or-equal: operands joined by \ge via "infix" at precedence level 1. -->
<xsl:template match="m:apply[*[1][self::m:geq]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="infix">
    <xsl:with-param name="mo" select="'\ge '"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="this-p" select="1"/>
  </xsl:call-template>
</xsl:template>
|
| 482 |
+
|
| 483 |
+
<!-- 4.4.4.6 leq -->
|
| 484 |
+
<!-- Less-or-equal: operands joined by \le via "infix" at precedence level 1. -->
<xsl:template match="m:apply[*[1][self::m:leq]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="infix">
    <xsl:with-param name="mo" select="'\le '"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="this-p" select="1"/>
  </xsl:call-template>
</xsl:template>
|
| 492 |
+
|
| 493 |
+
<!-- 4.4.4.7 equivalent -->
|
| 494 |
+
<!-- Equivalence: operands joined by \equiv via "infix" at precedence level 1. -->
<xsl:template match="m:apply[*[1][self::m:equivalent]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="infix">
    <xsl:with-param name="mo" select="'\equiv '"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="this-p" select="1"/>
  </xsl:call-template>
</xsl:template>
|
| 502 |
+
|
| 503 |
+
<!-- 4.4.4.8 approx -->
|
| 504 |
+
<!-- Approximate equality: operands joined by \approx via "infix" at
     precedence level 1. -->
<xsl:template match="m:apply[*[1][self::m:approx]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="infix">
    <xsl:with-param name="mo" select="'\approx '"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="this-p" select="1"/>
  </xsl:call-template>
</xsl:template>
|
| 512 |
+
|
| 513 |
+
<!-- 4.4.4.9 factorof -->
|
| 514 |
+
<!-- Divisibility: the two operands separated by " | " through the shared
     "binary" template at precedence level 3. -->
<xsl:template match="m:apply[*[1][self::m:factorof]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="binary">
    <xsl:with-param name="this-p" select="3"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="mo" select="' | '"/>
  </xsl:call-template>
</xsl:template>
|
| 522 |
+
|
| 523 |
+
<!-- 4.4.5.1 int -->
|
| 524 |
+
<!-- Integral: \int with an optional subscript (children of lowlimit, first
     child of interval, or children of condition) and an optional superscript
     (children of uplimit or second child of interval), followed by the last
     child as integrand and "\,d " plus the bound variable(s). -->
<xsl:template match="m:apply[*[1][self::m:int]]">
  <xsl:variable name="lower" select="m:lowlimit/*|m:interval/*[1]|m:condition/*"/>
  <xsl:variable name="upper" select="m:uplimit/*|m:interval/*[2]"/>
  <xsl:text>\int</xsl:text>
  <xsl:if test="$lower">
    <xsl:text>_{</xsl:text>
    <xsl:apply-templates select="$lower"/>
    <xsl:text>}</xsl:text>
  </xsl:if>
  <xsl:if test="$upper">
    <xsl:text>^{</xsl:text>
    <xsl:apply-templates select="$upper"/>
    <xsl:text>}</xsl:text>
  </xsl:if>
  <xsl:text> </xsl:text>
  <xsl:apply-templates select="*[last()]"/>
  <xsl:text>\,d </xsl:text>
  <xsl:apply-templates select="m:bvar"/>
</xsl:template>
|
| 541 |
+
|
| 542 |
+
<!-- 4.4.5.2 diff -->
|
| 543 |
+
<!-- Derivative of a bare identifier (diff plus exactly one other child, with
     a ci present): rendered in prime notation. -->
<xsl:template match="m:apply[*[1][self::m:diff] and m:ci and count(*)=2]" priority="2">
  <xsl:apply-templates select="*[2]"/>
  <xsl:value-of select="'^\prime '"/>
</xsl:template>
|
| 547 |
+
|
| 548 |
+
<!-- General derivative, rendered as a fraction.
     With a degree on the bound variable: \frac{d^{n} f}{d x^{n}}.
     Without a degree:                    \frac{d f}{d x}.
     The \frac opened before the choose is closed once after it, so neither
     branch may close the denominator itself. The no-degree branch used to
     emit its own closing brace as well, leaving one unmatched "}" in the
     output. -->
<xsl:template match="m:apply[*[1][self::m:diff]]" priority="1">
  <xsl:text>\frac{</xsl:text>
  <xsl:choose>
    <xsl:when test="m:bvar/m:degree">
      <xsl:text>d^{</xsl:text>
      <xsl:apply-templates select="m:bvar/m:degree/node()"/>
      <xsl:text>}</xsl:text>
      <xsl:apply-templates select="*[last()]"/>
      <xsl:text>}{d</xsl:text>
      <!-- Applies templates to every child of bvar; the degree is written
           explicitly as the exponent just below. -->
      <xsl:apply-templates select="m:bvar/node()"/>
      <xsl:text>^{</xsl:text>
      <xsl:apply-templates select="m:bvar/m:degree/node()"/>
      <xsl:text>}</xsl:text>
    </xsl:when>
    <xsl:otherwise>
      <xsl:text>d </xsl:text>
      <xsl:apply-templates select="*[last()]"/>
      <xsl:text>}{d </xsl:text>
      <xsl:apply-templates select="m:bvar"/>
    </xsl:otherwise>
  </xsl:choose>
  <!-- Closes the denominator of \frac for both branches. -->
  <xsl:text>}</xsl:text>
</xsl:template>
|
| 572 |
+
|
| 573 |
+
<!-- 4.4.5.3 partialdiff -->
|
| 574 |
+
<!-- Partial derivative given as an index list plus a function identifier
     (exactly three children): rendered as D_{i, j, ...}f.
     The separator test is written as position() != last(), equivalent here
     to the former "less than" comparison, because a raw less-than sign is
     not allowed inside an XML attribute value. -->
<xsl:template match="m:apply[*[1][self::m:partialdiff] and m:list and m:ci and count(*)=3]" priority="2">
  <xsl:text>D_{</xsl:text>
  <xsl:for-each select="m:list[1]/*">
    <xsl:apply-templates select="."/>
    <xsl:if test="position() != last()"><xsl:text>, </xsl:text></xsl:if>
  </xsl:for-each>
  <xsl:text>}</xsl:text>
  <xsl:apply-templates select="*[3]"/>
</xsl:template>
|
| 583 |
+
|
| 584 |
+
<!-- General partial derivative: \frac{\partial^{N} f}{\partial x^{a} ...}.
     The total order N is chosen as follows:
       1. an explicit degree child of the apply, if present;
       2. otherwise, if any bvar degree is non-numeric, the degrees joined by
          "+" plus the count of bvars that carry no degree;
       3. otherwise the numeric sum of the degrees plus that count.
     Separator tests use position() != last() because a raw less-than sign
     is not allowed inside an XML attribute value. -->
<xsl:template match="m:apply[*[1][self::m:partialdiff]]" priority="1">
  <xsl:text>\frac{\partial^{</xsl:text>
  <xsl:choose>
    <xsl:when test="m:degree">
      <xsl:apply-templates select="m:degree/node()"/>
    </xsl:when>
    <xsl:when test="m:bvar/m:degree[string(number(.))='NaN']">
      <xsl:for-each select="m:bvar/m:degree">
        <xsl:apply-templates select="node()"/>
        <xsl:if test="position() != last()"><xsl:text>+</xsl:text></xsl:if>
      </xsl:for-each>
      <!-- Each bvar without a degree contributes order 1. -->
      <xsl:if test="count(m:bvar[not(m:degree)]) &gt; 0">
        <xsl:text>+</xsl:text>
        <xsl:value-of select="count(m:bvar[not(m:degree)])"/>
      </xsl:if>
    </xsl:when>
    <xsl:otherwise>
      <xsl:value-of select="sum(m:bvar/m:degree)+count(m:bvar[not(m:degree)])"/>
    </xsl:otherwise>
  </xsl:choose>
  <xsl:text>}</xsl:text>
  <xsl:apply-templates select="*[last()]"/>
  <xsl:text>}{</xsl:text>
  <xsl:for-each select="m:bvar">
    <xsl:text>\partial </xsl:text>
    <xsl:apply-templates select="node()"/>
    <xsl:if test="m:degree">
      <xsl:text>^{</xsl:text>
      <xsl:apply-templates select="m:degree/node()"/>
      <xsl:text>}</xsl:text>
    </xsl:if>
  </xsl:for-each>
  <xsl:text>}</xsl:text>
</xsl:template>
|
| 618 |
+
|
| 619 |
+
<!-- 4.4.2.8 declare 4.4.5.4 lowlimit 4.4.5.5 uplimit 4.4.5.7 degree 4.4.9.5 momentabout -->
|
| 620 |
+
<!-- These elements produce no output when reached through a plain
     apply-templates; templates that need their content select it
     explicitly (for example m:degree/node()). -->
<xsl:template match="m:momentabout | m:degree | m:uplimit | m:lowlimit | m:declare">
</xsl:template>
|
| 621 |
+
|
| 622 |
+
<!-- 4.4.5.6 bvar-->
|
| 623 |
+
<!-- Bound variable: renders its children and appends ", " when another
     bvar sibling follows. -->
<xsl:template match="m:bvar">
  <xsl:apply-templates select="node()"/>
  <xsl:if test="following-sibling::m:bvar">
    <xsl:text>, </xsl:text>
  </xsl:if>
</xsl:template>
|
| 627 |
+
|
| 628 |
+
<!-- 4.4.5.8 divergence-->
|
| 629 |
+
<!-- Divergence operator, set as an upright "div" operator name. -->
<xsl:template match="m:divergence">
  <xsl:value-of select="'\mathop{\mathrm{div}}'"/>
</xsl:template>
|
| 630 |
+
|
| 631 |
+
<!-- 4.4.5.11 laplacian-->
|
| 632 |
+
<!-- Laplacian operator, written as nabla squared. -->
<xsl:template match="m:laplacian">
  <xsl:value-of select="'\nabla^2 '"/>
</xsl:template>
|
| 633 |
+
|
| 634 |
+
<!-- 4.4.6.1 set -->
|
| 635 |
+
<!-- Set: the content produced by the named template "set" inside
     escaped curly braces. -->
<xsl:template match="m:set">
  <xsl:text>\{</xsl:text>
  <xsl:call-template name="set"/>
  <xsl:text>\}</xsl:text>
</xsl:template>
|
| 638 |
+
|
| 639 |
+
<!-- 4.4.6.2 list -->
|
| 640 |
+
<!-- List: the content produced by the named template "set" inside
     stretchy square brackets. -->
<xsl:template match="m:list">
  <xsl:text>\left[</xsl:text>
  <xsl:call-template name="set"/>
  <xsl:text>\right]</xsl:text>
</xsl:template>
|
| 643 |
+
|
| 644 |
+
<!-- Shared body for set and list (called with the set/list as context node).
     With a condition: "bound variable \colon condition".
     Otherwise: every child element, separated by ", ".
     The filter on the bvar children now carries the m: prefix. The
     unprefixed names only test for elements in no namespace, so that
     predicate could never exclude a MathML element. -->
<xsl:template name="set">
  <xsl:choose>
    <xsl:when test="m:condition">
      <xsl:apply-templates select="m:bvar/*[not(self::m:bvar or self::m:condition)]"/>
      <xsl:text>\colon </xsl:text>
      <xsl:apply-templates select="m:condition/node()"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:for-each select="*">
        <xsl:apply-templates select="."/>
        <xsl:if test="position()!=last()"><xsl:text>, </xsl:text></xsl:if>
      </xsl:for-each>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
| 659 |
+
|
| 660 |
+
<!-- 4.4.6.3 union -->
|
| 661 |
+
<!-- Union: operands joined by \cup via "infix" at precedence level 2. -->
<xsl:template match="m:apply[*[1][self::m:union]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="infix">
    <xsl:with-param name="mo" select="'\cup '"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="this-p" select="2"/>
  </xsl:call-template>
</xsl:template>
|
| 669 |
+
|
| 670 |
+
<!-- 4.4.6.4 intersect -->
|
| 671 |
+
<!-- Intersection: operands joined by \cap via "infix" at precedence level 3. -->
<xsl:template match="m:apply[*[1][self::m:intersect]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="infix">
    <xsl:with-param name="mo" select="'\cap '"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="this-p" select="3"/>
  </xsl:call-template>
</xsl:template>
|
| 679 |
+
|
| 680 |
+
<!-- 4.4.6.5 in -->
|
| 681 |
+
<!-- Membership: the two operands separated by \in via "binary" at
     precedence level 3. -->
<xsl:template match="m:apply[*[1][self::m:in]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="binary">
    <xsl:with-param name="this-p" select="3"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="mo" select="'\in '"/>
  </xsl:call-template>
</xsl:template>
|
| 689 |
+
|
| 690 |
+
<!-- 4.4.6.6 notin -->
|
| 691 |
+
<!-- Non-membership: the two operands separated by \notin via "binary" at
     precedence level 3. -->
<xsl:template match="m:apply[*[1][self::m:notin]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="binary">
    <xsl:with-param name="this-p" select="3"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="mo" select="'\notin '"/>
  </xsl:call-template>
</xsl:template>
|
| 699 |
+
|
| 700 |
+
<!-- 4.4.6.7 subset -->
|
| 701 |
+
<!-- Subset: operands joined by \subseteq via "infix" at precedence level 2. -->
<xsl:template match="m:apply[*[1][self::m:subset]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="infix">
    <xsl:with-param name="mo" select="'\subseteq '"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="this-p" select="2"/>
  </xsl:call-template>
</xsl:template>
|
| 709 |
+
|
| 710 |
+
<!-- 4.4.6.8 prsubset -->
|
| 711 |
+
<!-- Proper subset: operands joined by \subset via "infix" at precedence
     level 2. -->
<xsl:template match="m:apply[*[1][self::m:prsubset]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="infix">
    <xsl:with-param name="mo" select="'\subset '"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="this-p" select="2"/>
  </xsl:call-template>
</xsl:template>
|
| 719 |
+
|
| 720 |
+
<!-- 4.4.6.9 notsubset -->
|
| 721 |
+
<!-- Not a subset: the two operands separated by \nsubseteq via "binary" at
     precedence level 2. -->
<xsl:template match="m:apply[*[1][self::m:notsubset]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="binary">
    <xsl:with-param name="mo" select="'\nsubseteq '"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="this-p" select="2"/>
  </xsl:call-template>
</xsl:template>
|
| 729 |
+
|
| 730 |
+
<!-- 4.4.6.10 notprsubset -->
|
| 731 |
+
<!-- Not a proper subset: the two operands separated by \not\subset via
     "binary" at precedence level 2. -->
<xsl:template match="m:apply[*[1][self::m:notprsubset]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="binary">
    <xsl:with-param name="mo" select="'\not\subset '"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="this-p" select="2"/>
  </xsl:call-template>
</xsl:template>
|
| 739 |
+
|
| 740 |
+
<!-- 4.4.6.11 setdiff -->
|
| 741 |
+
<!-- Set difference: the two operands separated by \setminus via "binary"
     at precedence level 2. -->
<xsl:template match="m:apply[*[1][self::m:setdiff]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="binary">
    <xsl:with-param name="mo" select="'\setminus '"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="this-p" select="2"/>
  </xsl:call-template>
</xsl:template>
|
| 749 |
+
|
| 750 |
+
<!-- 4.4.6.12 card -->
|
| 751 |
+
<!-- Cardinality: the first operand between plain vertical bars. -->
<xsl:template match="m:apply[*[1][self::m:card]]">
  <xsl:value-of select="'|'"/>
  <xsl:apply-templates select="*[2]"/>
  <xsl:value-of select="'|'"/>
</xsl:template>
|
| 756 |
+
|
| 757 |
+
<!-- 4.4.6.13 cartesianproduct 4.4.10.6 vectorproduct -->
|
| 758 |
+
<!-- Cartesian product and vector product: operands joined by \times via
     "infix" at precedence level 2. -->
<xsl:template match="m:apply[*[1][self::m:cartesianproduct or self::m:vectorproduct]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="infix">
    <xsl:with-param name="mo" select="'\times '"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="this-p" select="2"/>
  </xsl:call-template>
</xsl:template>
|
| 766 |
+
|
| 767 |
+
<!-- Cartesian product whose operands are all m:reals: rendered as the first
     operand raised to the number of operands (count of children minus the
     operator itself). -->
<xsl:template
  match="m:apply[*[1][self::m:cartesianproduct][count(following-sibling::m:reals)=count(following-sibling::*)]]"
  priority="2">
  <xsl:apply-templates select="*[2]">
    <xsl:with-param name="p" select="5"/>
  </xsl:apply-templates>
  <xsl:value-of select="concat('^{', count(*)-1, '}')"/>
</xsl:template>
|
| 777 |
+
|
| 778 |
+
<!-- 4.4.7.1 sum -->
|
| 779 |
+
<!-- Summation: \sum followed by the limits and summand produced by the
     named template "series". -->
<xsl:template match="m:apply[*[1][self::m:sum]]">
  <xsl:value-of select="'\sum'"/>
  <xsl:call-template name="series"/>
</xsl:template>
|
| 782 |
+
|
| 783 |
+
<!-- 4.4.7.2 product -->
|
| 784 |
+
<!-- Product: \prod followed by the limits and factor produced by the
     named template "series". -->
<xsl:template match="m:apply[*[1][self::m:product]]">
  <xsl:value-of select="'\prod'"/>
  <xsl:call-template name="series"/>
</xsl:template>
|
| 787 |
+
|
| 788 |
+
<!-- Limits and body shared by sum and product (context node: the apply).
     Subscript: "bvar=" (omitted when a condition is present) followed by the
     lower bound or the condition content. Superscript: the upper bound.
     Then a space and the last child of the apply. -->
<xsl:template name="series">
  <xsl:variable name="lower" select="m:lowlimit/*|m:interval/*[1]|m:condition/*"/>
  <xsl:variable name="upper" select="m:uplimit/*|m:interval/*[2]"/>
  <xsl:if test="$lower">
    <xsl:text>_{</xsl:text>
    <xsl:if test="not(m:condition)">
      <xsl:apply-templates select="m:bvar"/>
      <xsl:text>=</xsl:text>
    </xsl:if>
    <xsl:apply-templates select="$lower"/>
    <xsl:text>}</xsl:text>
  </xsl:if>
  <xsl:if test="$upper">
    <xsl:text>^{</xsl:text>
    <xsl:apply-templates select="$upper"/>
    <xsl:text>}</xsl:text>
  </xsl:if>
  <xsl:text> </xsl:text>
  <xsl:apply-templates select="*[last()]"/>
</xsl:template>
|
| 806 |
+
|
| 807 |
+
<!-- 4.4.7.3 limit -->
|
| 808 |
+
<!-- Limit: \lim with a subscript built from the lowlimit element or the
     children of condition, followed by the last child of the apply. -->
<xsl:template match="m:apply[*[1][self::m:limit]]">
  <xsl:value-of select="'\lim_{'"/>
  <xsl:apply-templates select="m:condition/* | m:lowlimit"/>
  <xsl:value-of select="'}'"/>
  <xsl:apply-templates select="*[last()]"/>
</xsl:template>
|
| 814 |
+
|
| 815 |
+
<!-- lowlimit inside a limit: "variable \to value". The priority of 3 lets
     this win over the generic template that silences lowlimit. -->
<xsl:template match="m:apply[m:limit]/m:lowlimit" priority="3">
  <xsl:apply-templates select="parent::m:apply/m:bvar/node()"/>
  <xsl:value-of select="'\to '"/>
  <xsl:apply-templates select="node()"/>
</xsl:template>
|
| 820 |
+
|
| 821 |
+
<!-- 4.4.7.4 tendsto -->
|
| 822 |
+
<!-- tendsto: the two operands separated by an arrow via "binary" at
     precedence level 2. The arrow depends on the "type" attribute:
       above     -> \searrow
       below     -> \nearrow
       two-sided -> \rightarrow
       otherwise -> \to
     The context node here is the apply, while the attribute is carried by
     the tendsto operator (the first child). The old test read @type from
     the apply itself, so the arrow variants were never selected for
     <apply><tendsto type="..."/>...</apply>. The attribute on the operator
     is now used, with a fallback to an attribute on the apply so input that
     happened to work before still does.
     $p now defaults to 0 like the other operator templates. -->
<xsl:template match="m:apply[*[1][self::m:tendsto]]">
  <xsl:param name="p" select="0"/>
  <!-- In document order the apply's own attribute precedes the operator's,
       so [last()] prefers the operator's attribute when both exist. -->
  <xsl:variable name="type" select="string((@type | *[1]/@type)[last()])"/>
  <xsl:call-template name="binary">
    <xsl:with-param name="this-p" select="2"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="mo">
      <xsl:choose>
        <xsl:when test="$type='above'">\searrow </xsl:when>
        <xsl:when test="$type='below'">\nearrow </xsl:when>
        <xsl:when test="$type='two-sided'">\rightarrow </xsl:when>
        <xsl:otherwise>\to </xsl:otherwise>
      </xsl:choose>
    </xsl:with-param>
  </xsl:call-template>
</xsl:template>
|
| 837 |
+
|
| 838 |
+
<!-- 4.4.8.1 common trigonometric functions 4.4.8.3 natural logarithm -->
|
| 839 |
+
<!-- Application of a function whose MathML element name is also used as the
     LaTeX command name: emits a backslash, the operator's local name and a
     space, then the first operand with precedence 7. -->
<xsl:template match="m:apply[*[1][
  self::m:sin or self::m:cos or self::m:tan or self::m:sec or
  self::m:csc or self::m:cot or self::m:sinh or self::m:cosh or
  self::m:tanh or self::m:coth or self::m:arcsin or self::m:arccos or
  self::m:arctan or self::m:ln]]">
  <xsl:value-of select="concat('\', local-name(*[1]), ' ')"/>
  <xsl:apply-templates select="*[2]">
    <xsl:with-param name="p" select="7"/>
  </xsl:apply-templates>
</xsl:template>
|
| 851 |
+
|
| 852 |
+
<!-- The same function elements outside the operator position of an apply:
     just the command (backslash, local name, space). -->
<xsl:template match="m:sin | m:cos | m:tan | m:sec | m:csc |
  m:cot | m:sinh | m:cosh | m:tanh | m:coth |
  m:arcsin | m:arccos | m:arctan | m:ln">
  <xsl:value-of select="concat('\', local-name(), ' ')"/>
</xsl:template>
|
| 859 |
+
|
| 860 |
+
<!-- Application of the remaining trigonometric and hyperbolic functions:
     the operator's local name is set upright with \mathrm and a thin space,
     then the first operand follows with precedence 7. -->
<xsl:template match="m:apply[*[1][
  self::m:sech or self::m:csch or self::m:arccosh or
  self::m:arccot or self::m:arccoth or self::m:arccsc or
  self::m:arccsch or self::m:arcsec or self::m:arcsech or
  self::m:arcsinh or self::m:arctanh]]">
  <xsl:value-of select="concat('\mathrm{', local-name(*[1]), '\,}')"/>
  <xsl:apply-templates select="*[2]">
    <xsl:with-param name="p" select="7"/>
  </xsl:apply-templates>
</xsl:template>
|
| 872 |
+
|
| 873 |
+
<!-- The same function elements outside the operator position of an apply:
     the local name set upright with \mathrm, without the thin space. -->
<xsl:template match="m:sech | m:csch | m:arccosh | m:arccot |
  m:arccoth | m:arccsc | m:arccsch | m:arcsec |
  m:arcsech | m:arcsinh | m:arctanh">
  <xsl:value-of select="concat('\mathrm{', local-name(), '}')"/>
</xsl:template>
|
| 880 |
+
|
| 881 |
+
<!-- 4.4.8.2 exp -->
|
| 882 |
+
<!-- Exponential: e raised to the first operand. -->
<xsl:template match="m:apply[*[1][self::m:exp]]">
  <xsl:text>e^{</xsl:text>
  <xsl:apply-templates select="*[2]"/>
  <xsl:text>}</xsl:text>
</xsl:template>
|
| 885 |
+
|
| 886 |
+
<!-- 4.4.8.4 log -->
|
| 887 |
+
<!-- Logarithm without a base other than 10: \lg followed by the last child
     of the apply, rendered with precedence 7. -->
<xsl:template match="m:apply[*[1][self::m:log]]">
  <xsl:value-of select="'\lg '"/>
  <xsl:apply-templates select="*[last()]">
    <xsl:with-param name="p" select="7"/>
  </xsl:apply-templates>
</xsl:template>
|
| 893 |
+
|
| 894 |
+
<!-- Logarithm with an explicit base other than 10: \log_{base} followed by
     the last child of the apply, rendered with precedence 7.
     Every apply matched here is also matched by the plain log template, and
     both patterns would otherwise have the same default priority. That is a
     template conflict which a processor may report as an error or resolve by
     stylesheet order. The explicit priority makes this template win
     deterministically. -->
<xsl:template match="m:apply[*[1][self::m:log] and m:logbase != 10]" priority="2">
  <xsl:text>\log_{</xsl:text>
  <xsl:apply-templates select="m:logbase/node()"/>
  <xsl:text>}</xsl:text>
  <xsl:apply-templates select="*[last()]">
    <xsl:with-param name="p" select="7"/>
  </xsl:apply-templates>
</xsl:template>
|
| 902 |
+
|
| 903 |
+
<!-- 4.4.9.1 mean -->
|
| 904 |
+
<!-- Mean: every operand after the operator, comma separated, inside
     angle brackets. -->
<xsl:template match="m:apply[*[1][self::m:mean]]">
  <xsl:value-of select="'\langle '"/>
  <xsl:for-each select="*[position() &gt; 1]">
    <xsl:apply-templates select="."/>
    <!-- A following element sibling means this is not the last operand. -->
    <xsl:if test="following-sibling::*">
      <xsl:text>, </xsl:text>
    </xsl:if>
  </xsl:for-each>
  <xsl:value-of select="'\rangle '"/>
</xsl:template>
|
| 912 |
+
|
| 913 |
+
<!-- 4.4.9.2 sdev -->
|
| 914 |
+
<!-- Standard deviation operator symbol. -->
<xsl:template match="m:sdev">
  <xsl:value-of select="'\sigma '"/>
</xsl:template>
|
| 915 |
+
|
| 916 |
+
<!-- 4.4.9.3 variance -->
|
| 917 |
+
<!-- Variance: sigma applied to the first operand, squared. -->
<xsl:template match="m:apply[*[1][self::m:variance]]">
  <xsl:value-of select="'\sigma('"/>
  <xsl:apply-templates select="*[2]"/>
  <xsl:value-of select="')^2'"/>
</xsl:template>
|
| 922 |
+
|
| 923 |
+
<!-- 4.4.9.5 moment -->
|
| 924 |
+
<!-- Moment: the last child raised to the content of degree inside angle
     brackets, with the content of momentabout as a subscript when that
     element is present, then a trailing space. -->
<xsl:template match="m:apply[*[1][self::m:moment]]">
  <xsl:value-of select="'\langle '"/>
  <xsl:apply-templates select="*[last()]"/>
  <xsl:value-of select="'^{'"/>
  <xsl:apply-templates select="m:degree/node()"/>
  <xsl:value-of select="'}\rangle'"/>
  <xsl:if test="m:momentabout">
    <xsl:value-of select="'_{'"/>
    <xsl:apply-templates select="m:momentabout/node()"/>
    <xsl:value-of select="'}'"/>
  </xsl:if>
  <xsl:text> </xsl:text>
</xsl:template>
|
| 937 |
+
|
| 938 |
+
<!-- 4.4.10.1 vector -->
|
| 939 |
+
<!-- Vector: a one-column array in stretchy parentheses, one child element
     per row, rows separated by a LaTeX line break. -->
<xsl:template match="m:vector">
  <xsl:value-of select="'\left(\begin{array}{c}'"/>
  <xsl:for-each select="*">
    <xsl:apply-templates select="."/>
    <!-- A following element sibling means this is not the last entry. -->
    <xsl:if test="following-sibling::*">
      <xsl:text>\\ </xsl:text>
    </xsl:if>
  </xsl:for-each>
  <xsl:value-of select="'\end{array}\right)'"/>
</xsl:template>
|
| 947 |
+
|
| 948 |
+
<!-- 4.4.10.2 matrix -->
|
| 949 |
+
<!-- Matrix: its children rendered inside a pmatrix environment. -->
<xsl:template match="m:matrix">
  <xsl:value-of select="'\begin{pmatrix}'"/>
  <xsl:apply-templates select="node()"/>
  <xsl:value-of select="'\end{pmatrix}'"/>
</xsl:template>
|
| 954 |
+
|
| 955 |
+
<!-- 4.4.10.3 matrixrow -->
|
| 956 |
+
<!-- Matrix row: cells separated by " & ", followed by a LaTeX line break
     when another row follows.
     Two fixes:
     * the cell separator is written as the &amp; entity, because a raw
       ampersand in element content is not well-formed XML;
     * the row separator is decided from the following matrixrow siblings
       rather than from position() in the caller's node list, so the result
       does not depend on how the row was selected or on whitespace text
       nodes between rows. -->
<xsl:template match="m:matrixrow">
  <xsl:for-each select="*">
    <xsl:apply-templates select="."/>
    <xsl:if test="position()!=last()"><xsl:text> &amp; </xsl:text></xsl:if>
  </xsl:for-each>
  <xsl:if test="following-sibling::m:matrixrow"><xsl:text>\\ </xsl:text></xsl:if>
</xsl:template>
|
| 963 |
+
|
| 964 |
+
<!-- 4.4.10.4 determinant -->
|
| 965 |
+
<!-- Determinant of an arbitrary operand: \det followed by the first
     operand with precedence 7. -->
<xsl:template match="m:apply[*[1][self::m:determinant]]">
  <xsl:value-of select="'\det '"/>
  <xsl:apply-templates select="*[2]">
    <xsl:with-param name="p" select="7"/>
  </xsl:apply-templates>
</xsl:template>
|
| 971 |
+
|
| 972 |
+
<!-- Determinant of a literal matrix: the matrix children rendered inside a
     vmatrix environment instead of \det applied to a pmatrix. -->
<xsl:template match="m:apply[*[1][self::m:determinant]][*[2][self::m:matrix]]" priority="2">
  <xsl:value-of select="'\begin{vmatrix}'"/>
  <xsl:apply-templates select="m:matrix/*"/>
  <xsl:value-of select="'\end{vmatrix}'"/>
</xsl:template>
|
| 977 |
+
|
| 978 |
+
<!-- 4.4.10.5 transpose -->
|
| 979 |
+
<!-- Transpose: the first operand (precedence 7) with a superscript T. -->
<xsl:template match="m:apply[*[1][self::m:transpose]]">
  <xsl:apply-templates select="*[2]">
    <xsl:with-param name="p" select="7"/>
  </xsl:apply-templates>
  <xsl:value-of select="'^T'"/>
</xsl:template>
|
| 985 |
+
|
| 986 |
+
<!-- 4.4.10.5 selector -->
|
| 987 |
+
<!-- Selector: the first operand (precedence 7) with the remaining operands
     as a comma separated subscript. -->
<xsl:template match="m:apply[*[1][self::m:selector]]">
  <xsl:apply-templates select="*[2]">
    <xsl:with-param name="p" select="7"/>
  </xsl:apply-templates>
  <xsl:value-of select="'_{'"/>
  <xsl:for-each select="*[position() &gt; 2]">
    <xsl:apply-templates select="."/>
    <!-- A following element sibling means this is not the last index. -->
    <xsl:if test="following-sibling::*">
      <xsl:text>, </xsl:text>
    </xsl:if>
  </xsl:for-each>
  <xsl:value-of select="'}'"/>
</xsl:template>
|
| 998 |
+
|
| 999 |
+
<!-- 4.4.10.7 scalarproduct 4.4.10.8 outerproduct -->
|
| 1000 |
+
<!-- Scalar product and outer product: operands joined by an infix operator
     via "infix" at precedence level 2.
     The operator used to be \dot, which in LaTeX is an accent that places a
     dot above the symbol that follows it, so the right operand was rendered
     with a dot on top instead of a product sign. Scalar products now use
     \cdot and outer products \otimes. -->
<xsl:template match="m:apply[*[1][self::m:scalarproduct or self::m:outerproduct]]">
  <xsl:param name="p" select="0"/>
  <xsl:call-template name="infix">
    <xsl:with-param name="this-p" select="2"/>
    <xsl:with-param name="p" select="$p"/>
    <xsl:with-param name="mo">
      <xsl:choose>
        <xsl:when test="*[1][self::m:outerproduct]">\otimes </xsl:when>
        <xsl:otherwise>\cdot </xsl:otherwise>
      </xsl:choose>
    </xsl:with-param>
  </xsl:call-template>
</xsl:template>
|
| 1008 |
+
|
| 1009 |
+
<!-- 4.4.11.2 semantics -->
|
| 1010 |
+
<!-- semantics: only the first child element is rendered. -->
<xsl:template match="m:semantics">
  <xsl:apply-templates select="*[1]"/>
</xsl:template>
|
| 1011 |
+
|
| 1012 |
+
<!-- semantics carrying an annotation with encoding "TeX": the content of
     that annotation is used in place of the first child. -->
<xsl:template match="m:semantics[m:annotation/@encoding='TeX']">
  <xsl:for-each select="m:annotation[@encoding='TeX']">
    <xsl:apply-templates select="node()"/>
  </xsl:for-each>
</xsl:template>
|
| 1015 |
+
|
| 1016 |
+
<!-- 4.4.12.1 integers -->
|
| 1017 |
+
<!-- Constant and number-set elements: each one maps to a fixed LaTeX string. -->
<xsl:template match="m:integers">
  <xsl:value-of select="'\mathbb{Z}'"/>
</xsl:template>

<!-- 4.4.12.2 reals -->
<xsl:template match="m:reals">
  <xsl:value-of select="'\mathbb{R}'"/>
</xsl:template>

<!-- 4.4.12.3 rationals -->
<xsl:template match="m:rationals">
  <xsl:value-of select="'\mathbb{Q}'"/>
</xsl:template>

<!-- 4.4.12.4 naturalnumbers -->
<xsl:template match="m:naturalnumbers">
  <xsl:value-of select="'\mathbb{N}'"/>
</xsl:template>

<!-- 4.4.12.5 complexes -->
<xsl:template match="m:complexes">
  <xsl:value-of select="'\mathbb{C}'"/>
</xsl:template>

<!-- 4.4.12.6 primes -->
<xsl:template match="m:primes">
  <xsl:value-of select="'\mathbb{P}'"/>
</xsl:template>

<!-- 4.4.12.7 exponentiale -->
<xsl:template match="m:exponentiale">
  <xsl:value-of select="'e'"/>
</xsl:template>

<!-- 4.4.12.8 imaginaryi -->
<xsl:template match="m:imaginaryi">
  <xsl:value-of select="'i'"/>
</xsl:template>

<!-- 4.4.12.9 notanumber -->
<xsl:template match="m:notanumber">
  <xsl:value-of select="'NaN'"/>
</xsl:template>

<!-- 4.4.12.10 true -->
<xsl:template match="m:true">
  <xsl:value-of select="'\mbox{true}'"/>
</xsl:template>

<!-- 4.4.12.11 false -->
<xsl:template match="m:false">
  <xsl:value-of select="'\mbox{false}'"/>
</xsl:template>

<!-- 4.4.12.12 emptyset -->
<xsl:template match="m:emptyset">
  <xsl:value-of select="'\emptyset '"/>
</xsl:template>

<!-- 4.4.12.13 pi -->
<xsl:template match="m:pi">
  <xsl:value-of select="'\pi '"/>
</xsl:template>

<!-- 4.4.12.14 eulergamma -->
<xsl:template match="m:eulergamma">
  <xsl:value-of select="'\gamma '"/>
</xsl:template>

<!-- 4.4.12.15 infinity -->
<xsl:template match="m:infinity">
  <xsl:value-of select="'\infty '"/>
</xsl:template>
|
| 1060 |
+
|
| 1061 |
+
<!-- ****************************** -->
|
| 1062 |
+
<!-- Renders an n-ary infix operation; the context node is the apply.
     Parameters:
       mo     - operator text inserted between consecutive operands
       p      - precedence of the enclosing expression (default 0)
       this-p - precedence of this operator (default 0)
     Every child after the operator is rendered with this-p handed down as
     its p. The whole expression is parenthesised when this-p is lower than
     p. The comparisons use the &lt; and &gt; entities, because a raw
     less-than sign is not allowed inside an XML attribute value. -->
<xsl:template name="infix">
  <xsl:param name="mo"/>
  <xsl:param name="p" select="0"/>
  <xsl:param name="this-p" select="0"/>
  <xsl:if test="$this-p &lt; $p"><xsl:text>(</xsl:text></xsl:if>
  <xsl:for-each select="*[position() &gt; 1]">
    <!-- No operator in front of the first operand. -->
    <xsl:if test="position() &gt; 1">
      <xsl:copy-of select="$mo"/>
    </xsl:if>
    <xsl:apply-templates select=".">
      <xsl:with-param name="p" select="$this-p"/>
    </xsl:apply-templates>
  </xsl:for-each>
  <xsl:if test="$this-p &lt; $p"><xsl:text>)</xsl:text></xsl:if>
</xsl:template>
|
| 1077 |
+
|
| 1078 |
+
<!-- Renders a strictly binary operation; the context node is the apply.
     Parameters:
       mo     - operator text placed between the two operands
       p      - precedence of the enclosing expression (default 0)
       this-p - precedence of this operator (default 0)
     Only the second and third children of the apply are rendered, each with
     this-p handed down as its p. The expression is parenthesised when this-p
     is lower than p. The comparisons use the &lt; entity, because a raw
     less-than sign is not allowed inside an XML attribute value. -->
<xsl:template name="binary">
  <xsl:param name="mo"/>
  <xsl:param name="p" select="0"/>
  <xsl:param name="this-p" select="0"/>
  <xsl:if test="$this-p &lt; $p"><xsl:text>(</xsl:text></xsl:if>
  <xsl:apply-templates select="*[2]">
    <xsl:with-param name="p" select="$this-p"/>
  </xsl:apply-templates>
  <xsl:value-of select="$mo"/>
  <xsl:apply-templates select="*[3]">
    <xsl:with-param name="p" select="$this-p"/>
  </xsl:apply-templates>
  <xsl:if test="$this-p &lt; $p"><xsl:text>)</xsl:text></xsl:if>
</xsl:template>
|
| 1092 |
+
|
| 1093 |
+
</xsl:stylesheet>
|
ultradata_math_parser/mmltex/entities.xsl
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version='1.0' encoding="UTF-8"?>
|
| 2 |
+
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
| 3 |
+
xmlns:m="http://www.w3.org/1998/Math/MathML"
|
| 4 |
+
version='1.0'>
|
| 5 |
+
|
| 6 |
+
<!-- ====================================================================== -->
|
| 7 |
+
<!-- $id: entities.xsl, 2002/22/11 Exp $
|
| 8 |
+
This file is part of the XSLT MathML Library distribution.
|
| 9 |
+
See ./README or http://www.raleigh.ru/MathML/mmltex for
|
| 10 |
+
copyright and other information -->
|
| 11 |
+
<!-- ====================================================================== -->
|
| 12 |
+
|
| 13 |
+
<xsl:template name="replaceEntities">
|
| 14 |
+
<xsl:param name="content"/>
|
| 15 |
+
<xsl:if test="string-length($content)>0">
|
| 16 |
+
<xsl:choose>
|
| 17 |
+
<xsl:when test="starts-with($content,'ɛ')"><xsl:value-of select="'\varepsilon '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ɛ')"/></xsl:call-template></xsl:when> <!--/varepsilon -->
|
| 18 |
+
|
| 19 |
+
<!-- ====================================================================== -->
|
| 20 |
+
<!-- Unicode 3.2
|
| 21 |
+
Greek
|
| 22 |
+
Range: 0370-03FF
|
| 23 |
+
http://www.unicode.org/charts/PDF/U0370.pdf -->
|
| 24 |
+
<!-- ====================================================================== -->
|
| 25 |
+
<xsl:when test="starts-with($content,'Γ')"><xsl:value-of select="'\Gamma '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'Γ')"/></xsl:call-template></xsl:when> <!--/Gamma capital Gamma, Greek -->
|
| 26 |
+
<xsl:when test="starts-with($content,'Δ')"><xsl:value-of select="'\Delta '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'Δ')"/></xsl:call-template></xsl:when> <!--/Delta capital Delta, Greek -->
|
| 27 |
+
<xsl:when test="starts-with($content,'Θ')"><xsl:value-of select="'\Theta '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'Θ')"/></xsl:call-template></xsl:when> <!--/Theta capital Theta, Greek -->
|
| 28 |
+
<xsl:when test="starts-with($content,'Λ')"><xsl:value-of select="'\Lambda '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'Λ')"/></xsl:call-template></xsl:when> <!--/Lambda capital Lambda, Greek -->
|
| 29 |
+
<xsl:when test="starts-with($content,'Ξ')"><xsl:value-of select="'\Xi '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'Ξ')"/></xsl:call-template></xsl:when> <!--/Xi capital Xi, Greek -->
|
| 30 |
+
<xsl:when test="starts-with($content,'Π')"><xsl:value-of select="'\Pi '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'Π')"/></xsl:call-template></xsl:when> <!--/Pi capital Pi, Greek -->
|
| 31 |
+
<xsl:when test="starts-with($content,'Σ')"><xsl:value-of select="'\Sigma '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'Σ')"/></xsl:call-template></xsl:when> <!--/Sigma capital Sigma, Greek -->
|
| 32 |
+
<xsl:when test="starts-with($content,'Φ')"><xsl:value-of select="'\Phi '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'Φ')"/></xsl:call-template></xsl:when> <!--/Phi capital Phi, Greek -->
|
| 33 |
+
<xsl:when test="starts-with($content,'Ψ')"><xsl:value-of select="'\Psi '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'Ψ')"/></xsl:call-template></xsl:when> <!--/Psi capital Psi, Greek -->
|
| 34 |
+
<xsl:when test="starts-with($content,'Ω')"><xsl:value-of select="'\Omega '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'Ω')"/></xsl:call-template></xsl:when> <!--/Omega capital Omega, Greek -->
|
| 35 |
+
<xsl:when test="starts-with($content,'α')"><xsl:value-of select="'\alpha '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'α')"/></xsl:call-template></xsl:when> <!--/alpha small alpha, Greek -->
|
| 36 |
+
<xsl:when test="starts-with($content,'β')"><xsl:value-of select="'\beta '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'β')"/></xsl:call-template></xsl:when> <!--/beta small beta, Greek -->
|
| 37 |
+
<xsl:when test="starts-with($content,'γ')"><xsl:value-of select="'\gamma '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'γ')"/></xsl:call-template></xsl:when> <!--/gamma small gamma, Greek -->
|
| 38 |
+
<xsl:when test="starts-with($content,'δ')"><xsl:value-of select="'\delta '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'δ')"/></xsl:call-template></xsl:when> <!--/delta small delta, Greek -->
|
| 39 |
+
<xsl:when test="starts-with($content,'ε')"><xsl:value-of select="'\epsilon '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ε')"/></xsl:call-template></xsl:when> <!--/straightepsilon, small epsilon, Greek -->
|
| 40 |
+
<xsl:when test="starts-with($content,'ζ')"><xsl:value-of select="'\zeta '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ζ')"/></xsl:call-template></xsl:when> <!--/zeta small zeta, Greek -->
|
| 41 |
+
<xsl:when test="starts-with($content,'η')"><xsl:value-of select="'\eta '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'η')"/></xsl:call-template></xsl:when> <!--/eta small eta, Greek -->
|
| 42 |
+
<xsl:when test="starts-with($content,'θ')"><xsl:value-of select="'\theta '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'θ')"/></xsl:call-template></xsl:when> <!--/theta straight theta, small theta, Greek -->
|
| 43 |
+
<xsl:when test="starts-with($content,'ι')"><xsl:value-of select="'\iota '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ι')"/></xsl:call-template></xsl:when> <!--/iota small iota, Greek -->
|
| 44 |
+
<xsl:when test="starts-with($content,'κ')"><xsl:value-of select="'\kappa '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'κ')"/></xsl:call-template></xsl:when> <!--/kappa small kappa, Greek -->
|
| 45 |
+
<xsl:when test="starts-with($content,'λ')"><xsl:value-of select="'\lambda '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'λ')"/></xsl:call-template></xsl:when> <!--/lambda small lambda, Greek -->
|
| 46 |
+
<xsl:when test="starts-with($content,'μ')"><xsl:value-of select="'\mu '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'μ')"/></xsl:call-template></xsl:when> <!--/mu small mu, Greek -->
|
| 47 |
+
<xsl:when test="starts-with($content,'ν')"><xsl:value-of select="'\nu '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ν')"/></xsl:call-template></xsl:when> <!--/nu small nu, Greek -->
|
| 48 |
+
<xsl:when test="starts-with($content,'ξ')"><xsl:value-of select="'\xi '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ξ')"/></xsl:call-template></xsl:when> <!--/xi small xi, Greek -->
|
| 49 |
+
<xsl:when test="starts-with($content,'π')"><xsl:value-of select="'\pi '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'π')"/></xsl:call-template></xsl:when> <!--/pi small pi, Greek -->
|
| 50 |
+
<xsl:when test="starts-with($content,'ρ')"><xsl:value-of select="'\rho '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ρ')"/></xsl:call-template></xsl:when> <!--/rho small rho, Greek -->
|
| 51 |
+
<xsl:when test="starts-with($content,'ς')"><xsl:value-of select="'\varsigma '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ς')"/></xsl:call-template></xsl:when> <!--/varsigma -->
|
| 52 |
+
<xsl:when test="starts-with($content,'σ')"><xsl:value-of select="'\sigma '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'σ')"/></xsl:call-template></xsl:when> <!--/sigma small sigma, Greek -->
|
| 53 |
+
<xsl:when test="starts-with($content,'τ')"><xsl:value-of select="'\tau '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'τ')"/></xsl:call-template></xsl:when> <!--/tau small tau, Greek -->
|
| 54 |
+
<xsl:when test="starts-with($content,'υ')"><xsl:value-of select="'\upsilon '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'υ')"/></xsl:call-template></xsl:when> <!--/upsilon small upsilon, Greek -->
|
| 55 |
+
<xsl:when test="starts-with($content,'φ')"><xsl:value-of select="'\phi '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'φ')"/></xsl:call-template></xsl:when> <!--/straightphi - small phi, Greek -->
|
| 56 |
+
<xsl:when test="starts-with($content,'χ')"><xsl:value-of select="'\chi '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'χ')"/></xsl:call-template></xsl:when> <!--/chi small chi, Greek -->
|
| 57 |
+
<xsl:when test="starts-with($content,'ψ')"><xsl:value-of select="'\psi '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ψ')"/></xsl:call-template></xsl:when> <!--/psi small psi, Greek -->
|
| 58 |
+
<xsl:when test="starts-with($content,'ω')"><xsl:value-of select="'\omega '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ω')"/></xsl:call-template></xsl:when> <!--/omega small omega, Greek -->
|
| 59 |
+
<xsl:when test="starts-with($content,'ϑ')"><xsl:value-of select="'\vartheta '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ϑ')"/></xsl:call-template></xsl:when> <!--/vartheta - curly or open theta -->
|
| 60 |
+
<xsl:when test="starts-with($content,'ϒ')"><xsl:value-of select="'\Upsilon '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ϒ')"/></xsl:call-template></xsl:when> <!--/Upsilon capital Upsilon, Greek -->
|
| 61 |
+
<xsl:when test="starts-with($content,'ϕ')"><xsl:value-of select="'\varphi '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ϕ')"/></xsl:call-template></xsl:when> <!--/varphi - curly or open phi -->
|
| 62 |
+
<xsl:when test="starts-with($content,'ϖ')"><xsl:value-of select="'\varpi '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ϖ')"/></xsl:call-template></xsl:when> <!--/varpi -->
|
| 63 |
+
<xsl:when test="starts-with($content,'ϰ')"><xsl:value-of select="'\varkappa '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ϰ')"/></xsl:call-template></xsl:when> <!--/varkappa -->
|
| 64 |
+
<xsl:when test="starts-with($content,'ϱ')"><xsl:value-of select="'\varrho '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ϱ')"/></xsl:call-template></xsl:when> <!--/varrho -->
|
| 65 |
+
|
| 66 |
+
<!-- ====================================================================== -->
|
| 67 |
+
<xsl:when test="starts-with($content,'​')"><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '​')"/></xsl:call-template></xsl:when> <!--short form of ⁣ -->
|
| 68 |
+
<xsl:when test="starts-with($content,'…')"><xsl:value-of select="'\dots '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '…')"/></xsl:call-template></xsl:when>
|
| 69 |
+
<xsl:when test="starts-with($content,'′')"><xsl:value-of select="'\prime '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '′')"/></xsl:call-template></xsl:when> <!--/prime prime or minute -->
|
| 70 |
+
<xsl:when test="starts-with($content,'⁡')"><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⁡')"/></xsl:call-template></xsl:when> <!-- ApplyFunction -->
|
| 71 |
+
<xsl:when test="starts-with($content,'⁢')"><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⁢')"/></xsl:call-template></xsl:when> <!-- InvisibleTimes -->
|
| 72 |
+
<!-- ====================================================================== -->
|
| 73 |
+
<!-- Unicode 3.2
|
| 74 |
+
Letterlike Symbols
|
| 75 |
+
Range: 2100-214F
|
| 76 |
+
http://www.unicode.org/charts/PDF/U2100.pdf -->
|
| 77 |
+
<!-- ====================================================================== -->
|
| 78 |
+
<xsl:when test="starts-with($content,'ℏ︀')"><xsl:value-of select="'\hbar '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ℏ︀')"/></xsl:call-template></xsl:when> <!--/hbar - Planck's over 2pi -->
|
| 79 |
+
<xsl:when test="starts-with($content,'ℏ')"><xsl:value-of select="'\hslash '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ℏ')"/></xsl:call-template></xsl:when> <!--/hslash - variant Planck's over 2pi --> <!-- Required amssymb -->
|
| 80 |
+
<xsl:when test="starts-with($content,'ℑ')"><xsl:value-of select="'\Im '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ℑ')"/></xsl:call-template></xsl:when> <!--/Im - imaginary -->
|
| 81 |
+
<xsl:when test="starts-with($content,'ℓ')"><xsl:value-of select="'\ell '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ℓ')"/></xsl:call-template></xsl:when> <!--/ell - cursive small l -->
|
| 82 |
+
<xsl:when test="starts-with($content,'℘')"><xsl:value-of select="'\wp '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '℘')"/></xsl:call-template></xsl:when> <!--/wp - Weierstrass p -->
|
| 83 |
+
<xsl:when test="starts-with($content,'ℜ')"><xsl:value-of select="'\Re '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ℜ')"/></xsl:call-template></xsl:when> <!--/Re - real -->
|
| 84 |
+
<xsl:when test="starts-with($content,'℧')"><xsl:value-of select="'\mho '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '℧')"/></xsl:call-template></xsl:when> <!--/mho - conductance -->
|
| 85 |
+
<xsl:when test="starts-with($content,'ℵ')"><xsl:value-of select="'\aleph '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ℵ')"/></xsl:call-template></xsl:when> <!--/aleph aleph, Hebrew -->
|
| 86 |
+
<xsl:when test="starts-with($content,'ℶ')"><xsl:value-of select="'\beth '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ℶ')"/></xsl:call-template></xsl:when> <!--/beth - beth, Hebrew --> <!-- Required amssymb -->
|
| 87 |
+
<xsl:when test="starts-with($content,'ℷ')"><xsl:value-of select="'\gimel '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ℷ')"/></xsl:call-template></xsl:when> <!--/gimel - gimel, Hebrew --> <!-- Required amssymb -->
|
| 88 |
+
<xsl:when test="starts-with($content,'ℸ')"><xsl:value-of select="'\daleth '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ℸ')"/></xsl:call-template></xsl:when> <!--/daleth - daleth, Hebrew --> <!-- Required amssymb -->
|
| 89 |
+
<xsl:when test="starts-with($content,'ⅅ')"><xsl:value-of select="'D'" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ⅅ')"/></xsl:call-template></xsl:when> <!--D for use in differentials, e.g., within integrals -->
|
| 90 |
+
<xsl:when test="starts-with($content,'ⅆ')"><xsl:value-of select="'d'" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ⅆ')"/></xsl:call-template></xsl:when> <!--d for use in differentials, e.g., within integrals -->
|
| 91 |
+
<xsl:when test="starts-with($content,'ⅇ')"><xsl:value-of select="'e'" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ⅇ')"/></xsl:call-template></xsl:when> <!--e use for the exponential base of the natural logarithms -->
|
| 92 |
+
<xsl:when test="starts-with($content,'ⅈ')"><xsl:value-of select="'i'" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, 'ⅈ')"/></xsl:call-template></xsl:when> <!--i for use as a square root of -1 -->
|
| 93 |
+
|
| 94 |
+
<!-- ====================================================================== -->
|
| 95 |
+
<xsl:when test="starts-with($content,'→')"><xsl:value-of select="'\to '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '→')"/></xsl:call-template></xsl:when> <!--/rightarrow /to A: =rightward arrow -->
|
| 96 |
+
|
| 97 |
+
<!-- ====================================================================== -->
|
| 98 |
+
<!-- Unicode 3.2
|
| 99 |
+
Mathematical Operators
|
| 100 |
+
Range: 2200-22FF
|
| 101 |
+
http://www.unicode.org/charts/PDF/U2200.pdf -->
|
| 102 |
+
<!-- ====================================================================== -->
|
| 103 |
+
<xsl:when test="starts-with($content,'∀')"><xsl:value-of select="'\forall '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∀')"/></xsl:call-template></xsl:when> <!--/forall for all -->
|
| 104 |
+
<xsl:when test="starts-with($content,'∁')"><xsl:value-of select="'\complement '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∁')"/></xsl:call-template></xsl:when> <!--/complement - complement sign --> <!-- Required amssymb -->
|
| 105 |
+
<xsl:when test="starts-with($content,'∂')"><xsl:value-of select="'\partial '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∂')"/></xsl:call-template></xsl:when> <!--/partial partial differential -->
|
| 106 |
+
<xsl:when test="starts-with($content,'∃')"><xsl:value-of select="'\exists '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∃')"/></xsl:call-template></xsl:when> <!--/exists at least one exists -->
|
| 107 |
+
<xsl:when test="starts-with($content,'∄')"><xsl:value-of select="'\nexists '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∄')"/></xsl:call-template></xsl:when> <!--/nexists - negated exists --> <!-- Required amssymb -->
|
| 108 |
+
<xsl:when test="starts-with($content,'∅︀')"><xsl:value-of select="'\emptyset '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∅︀')"/></xsl:call-template></xsl:when> <!--/emptyset - zero, slash -->
|
| 109 |
+
<xsl:when test="starts-with($content,'∅')"><xsl:value-of select="'\varnothing '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∅')"/></xsl:call-template></xsl:when> <!--/varnothing - circle, slash --> <!-- Required amssymb -->
|
| 110 |
+
<!-- <xsl:when test="starts-with($content,'∆')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∆')"/></xsl:call-template></xsl:when>-->
|
| 111 |
+
<xsl:when test="starts-with($content,'∇')"><xsl:value-of select="'\nabla '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∇')"/></xsl:call-template></xsl:when> <!--/nabla del, Hamilton operator -->
|
| 112 |
+
<xsl:when test="starts-with($content,'∈')"><xsl:value-of select="'\in '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∈')"/></xsl:call-template></xsl:when> <!--/in R: set membership -->
|
| 113 |
+
<xsl:when test="starts-with($content,'∉')"><xsl:value-of select="'\notin '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∉')"/></xsl:call-template></xsl:when> <!--/notin N: negated set membership -->
|
| 114 |
+
<xsl:when test="starts-with($content,'∋')"><xsl:value-of select="'\ni '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∋')"/></xsl:call-template></xsl:when> <!--/ni /owns R: contains -->
|
| 115 |
+
<xsl:when test="starts-with($content,'∌')"><xsl:value-of select="'\not\ni '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∌')"/></xsl:call-template></xsl:when> <!--negated contains -->
|
| 116 |
+
<xsl:when test="starts-with($content,'∏')"><xsl:value-of select="'\prod '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∏')"/></xsl:call-template></xsl:when> <!--/prod L: product operator -->
|
| 117 |
+
<xsl:when test="starts-with($content,'∐')"><xsl:value-of select="'\coprod '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∐')"/></xsl:call-template></xsl:when> <!--/coprod L: coproduct operator -->
|
| 118 |
+
<xsl:when test="starts-with($content,'∑')"><xsl:value-of select="'\sum '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∑')"/></xsl:call-template></xsl:when> <!--/sum L: summation operator -->
|
| 119 |
+
<xsl:when test="starts-with($content,'−')"><xsl:value-of select="'-'" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '−')"/></xsl:call-template></xsl:when> <!--B: minus sign -->
|
| 120 |
+
<xsl:when test="starts-with($content,'∓')"><xsl:value-of select="'\mp '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∓')"/></xsl:call-template></xsl:when> <!--/mp B: minus-or-plus sign -->
|
| 121 |
+
<xsl:when test="starts-with($content,'∔')"><xsl:value-of select="'\dotplus '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∔')"/></xsl:call-template></xsl:when> <!--/dotplus B: plus sign, dot above --> <!-- Required amssymb -->
|
| 122 |
+
<!-- <xsl:when test="starts-with($content,'∕')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∕')"/></xsl:call-template></xsl:when>-->
|
| 123 |
+
<xsl:when test="starts-with($content,'∖')"><xsl:value-of select="'\setminus '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∖')"/></xsl:call-template></xsl:when> <!--/setminus B: reverse solidus -->
|
| 124 |
+
<xsl:when test="starts-with($content,'∗')"><xsl:value-of select="'\ast '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∗')"/></xsl:call-template></xsl:when> <!--low asterisk -->
|
| 125 |
+
<xsl:when test="starts-with($content,'∘')"><xsl:value-of select="'\circ '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∘')"/></xsl:call-template></xsl:when> <!--/circ B: composite function (small circle) -->
|
| 126 |
+
<xsl:when test="starts-with($content,'∙')"><xsl:value-of select="'\bullet '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∙')"/></xsl:call-template></xsl:when>
|
| 127 |
+
<xsl:when test="starts-with($content,'√')"><xsl:value-of select="'\surd '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '√')"/></xsl:call-template></xsl:when> <!--/surd radical -->
|
| 128 |
+
<xsl:when test="starts-with($content,'∝')"><xsl:value-of select="'\propto '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∝')"/></xsl:call-template></xsl:when> <!--/propto R: is proportional to -->
|
| 129 |
+
<xsl:when test="starts-with($content,'∞')"><xsl:value-of select="'\infty '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∞')"/></xsl:call-template></xsl:when> <!--/infty infinity -->
|
| 130 |
+
<!-- <xsl:when test="starts-with($content,'∟')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∟')"/></xsl:call-template></xsl:when> right (90 degree) angle -->
|
| 131 |
+
<xsl:when test="starts-with($content,'∠')"><xsl:value-of select="'\angle '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∠')"/></xsl:call-template></xsl:when> <!--/angle - angle -->
|
| 132 |
+
<xsl:when test="starts-with($content,'∡')"><xsl:value-of select="'\measuredangle '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∡')"/></xsl:call-template></xsl:when> <!--/measuredangle - angle-measured --> <!-- Required amssymb -->
|
| 133 |
+
<xsl:when test="starts-with($content,'∢')"><xsl:value-of select="'\sphericalangle '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∢')"/></xsl:call-template></xsl:when><!--/sphericalangle angle-spherical --> <!-- Required amssymb -->
|
| 134 |
+
<xsl:when test="starts-with($content,'∣')"><xsl:value-of select="'\mid '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∣')"/></xsl:call-template></xsl:when> <!--/mid R: -->
|
| 135 |
+
<xsl:when test="starts-with($content,'∤︀')"><xsl:value-of select="'\nshortmid '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∤︀')"/></xsl:call-template></xsl:when> <!--/nshortmid --> <!-- Required amssymb -->
|
| 136 |
+
<xsl:when test="starts-with($content,'∤')"><xsl:value-of select="'\nmid '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∤')"/></xsl:call-template></xsl:when> <!--/nmid --> <!-- Required amssymb -->
|
| 137 |
+
<xsl:when test="starts-with($content,'∥')"><xsl:value-of select="'\parallel '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∥')"/></xsl:call-template></xsl:when> <!--/parallel R: parallel -->
|
| 138 |
+
<xsl:when test="starts-with($content,'∦︀')"><xsl:value-of select="'\nshortparallel '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∦︀')"/></xsl:call-template></xsl:when> <!--/nshortparallel N: not short par --> <!-- Required amssymb -->
|
| 139 |
+
<xsl:when test="starts-with($content,'∦')"><xsl:value-of select="'\nparallel '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∦')"/></xsl:call-template></xsl:when> <!--/nparallel N: not parallel --> <!-- Required amssymb -->
|
| 140 |
+
<xsl:when test="starts-with($content,'∧')"><xsl:value-of select="'\wedge '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∧')"/></xsl:call-template></xsl:when> <!--/wedge /land B: logical and -->
|
| 141 |
+
<xsl:when test="starts-with($content,'∨')"><xsl:value-of select="'\vee '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∨')"/></xsl:call-template></xsl:when> <!--/vee /lor B: logical or -->
|
| 142 |
+
<xsl:when test="starts-with($content,'∩')"><xsl:value-of select="'\cap '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∩')"/></xsl:call-template></xsl:when> <!--/cap B: intersection -->
|
| 143 |
+
<xsl:when test="starts-with($content,'∪')"><xsl:value-of select="'\cup '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∪')"/></xsl:call-template></xsl:when> <!--/cup B: union or logical sum -->
|
| 144 |
+
<xsl:when test="starts-with($content,'∫')"><xsl:value-of select="'\int '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∫')"/></xsl:call-template></xsl:when> <!--/int L: integral operator -->
|
| 145 |
+
<xsl:when test="starts-with($content,'∬')"><xsl:value-of select="'\iint '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∬')"/></xsl:call-template></xsl:when> <!--double integral operator --> <!-- Required amsmath -->
|
| 146 |
+
<xsl:when test="starts-with($content,'∭')"><xsl:value-of select="'\iiint '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∭')"/></xsl:call-template></xsl:when> <!--/iiint triple integral operator --> <!-- Required amsmath -->
|
| 147 |
+
<xsl:when test="starts-with($content,'∮')"><xsl:value-of select="'\oint '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∮')"/></xsl:call-template></xsl:when> <!--/oint L: contour integral operator -->
|
| 148 |
+
<!-- <xsl:when test="starts-with($content,'∯')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∯')"/></xsl:call-template></xsl:when>-->
|
| 149 |
+
<!-- <xsl:when test="starts-with($content,'∰')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∰')"/></xsl:call-template></xsl:when>-->
|
| 150 |
+
<!-- <xsl:when test="starts-with($content,'∱')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∱')"/></xsl:call-template></xsl:when>-->
|
| 151 |
+
<!-- <xsl:when test="starts-with($content,'∲')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∲')"/></xsl:call-template></xsl:when>-->
|
| 152 |
+
<!-- <xsl:when test="starts-with($content,'∳')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∳')"/></xsl:call-template></xsl:when>-->
|
| 153 |
+
<xsl:when test="starts-with($content,'∴')"><xsl:value-of select="'\therefore '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∴')"/></xsl:call-template></xsl:when> <!--/therefore R: therefore --> <!-- Required amssymb -->
|
| 154 |
+
<xsl:when test="starts-with($content,'∵')"><xsl:value-of select="'\because '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∵')"/></xsl:call-template></xsl:when> <!--/because R: because --> <!-- Required amssymb -->
|
| 155 |
+
<!-- ? --> <xsl:when test="starts-with($content,'∶')"><xsl:value-of select="':'" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∶')"/></xsl:call-template></xsl:when> <!--/ratio -->
|
| 156 |
+
<!-- ? --> <xsl:when test="starts-with($content,'∷')"><xsl:value-of select="'\colon\colon '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∷')"/></xsl:call-template></xsl:when> <!--/Colon, two colons -->
|
| 157 |
+
<!-- ? --> <xsl:when test="starts-with($content,'∸')"><xsl:value-of select="'\dot{-}'" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∸')"/></xsl:call-template></xsl:when> <!--/dotminus B: minus sign, dot above -->
|
| 158 |
+
<!-- <xsl:when test="starts-with($content,'∹')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∹')"/></xsl:call-template></xsl:when> -->
|
| 159 |
+
<!-- <xsl:when test="starts-with($content,'∺')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∺')"/></xsl:call-template></xsl:when> minus with four dots, geometric properties -->
|
| 160 |
+
<!-- <xsl:when test="starts-with($content,'∻')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∻')"/></xsl:call-template></xsl:when> homothetic -->
|
| 161 |
+
<xsl:when test="starts-with($content,'∼')"><xsl:value-of select="'\sim '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∼')"/></xsl:call-template></xsl:when> <!--/sim R: similar -->
|
| 162 |
+
<xsl:when test="starts-with($content,'∽')"><xsl:value-of select="'\backsim '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∽')"/></xsl:call-template></xsl:when> <!--/backsim R: reverse similar --> <!-- Required amssymb -->
|
| 163 |
+
<!-- <xsl:when test="starts-with($content,'∾')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∾')"/></xsl:call-template></xsl:when> most positive -->
|
| 164 |
+
<!-- <xsl:when test="starts-with($content,'∿')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '∿')"/></xsl:call-template></xsl:when> ac current -->
|
| 165 |
+
<xsl:when test="starts-with($content,'≀')"><xsl:value-of select="'\wr '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≀')"/></xsl:call-template></xsl:when> <!--/wr B: wreath product -->
|
| 166 |
+
<xsl:when test="starts-with($content,'≁')"><xsl:value-of select="'\nsim '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≁')"/></xsl:call-template></xsl:when> <!--/nsim N: not similar --> <!-- Required amssymb -->
|
| 167 |
+
<xsl:when test="starts-with($content,'≂')"><xsl:value-of select="'\eqsim '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≂')"/></xsl:call-template></xsl:when> <!--/esim R: equals, similar --> <!-- Required amssymb -->
|
| 168 |
+
<xsl:when test="starts-with($content,'≃')"><xsl:value-of select="'\simeq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≃')"/></xsl:call-template></xsl:when> <!--/simeq R: similar, equals -->
|
| 169 |
+
<xsl:when test="starts-with($content,'≄')"><xsl:value-of select="'\not\simeq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≄')"/></xsl:call-template></xsl:when> <!--/nsimeq N: not similar, equals -->
|
| 170 |
+
<xsl:when test="starts-with($content,'≅')"><xsl:value-of select="'\cong '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≅')"/></xsl:call-template></xsl:when> <!--/cong R: congruent with -->
|
| 171 |
+
<!-- <xsl:when test="starts-with($content,'≆')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≆')"/></xsl:call-template></xsl:when> similar, not equals -->
|
| 172 |
+
<xsl:when test="starts-with($content,'≇')"><xsl:value-of select="'\ncong '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≇')"/></xsl:call-template></xsl:when> <!--/ncong N: not congruent with --> <!-- Required amssymb -->
|
| 173 |
+
<xsl:when test="starts-with($content,'≈')"><xsl:value-of select="'\approx '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≈')"/></xsl:call-template></xsl:when> <!--/approx R: approximate -->
|
| 174 |
+
<!-- <xsl:when test="starts-with($content,'≉̸')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≉̸')"/></xsl:call-template></xsl:when> not, vert, approximate -->
|
| 175 |
+
<xsl:when test="starts-with($content,'≉')"><xsl:value-of select="'\not\approx '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≉')"/></xsl:call-template></xsl:when> <!--/napprox N: not approximate -->
|
| 176 |
+
<xsl:when test="starts-with($content,'≊')"><xsl:value-of select="'\approxeq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≊')"/></xsl:call-template></xsl:when> <!--/approxeq R: approximate, equals --> <!-- Required amssymb -->
|
| 177 |
+
<!-- <xsl:when test="starts-with($content,'≋')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≋')"/></xsl:call-template></xsl:when> approximately identical to -->
|
| 178 |
+
<!-- <xsl:when test="starts-with($content,'≌')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≌')"/></xsl:call-template></xsl:when> /backcong R: reverse congruent -->
|
| 179 |
+
<xsl:when test="starts-with($content,'≍')"><xsl:value-of select="'\asymp '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≍')"/></xsl:call-template></xsl:when> <!--/asymp R: asymptotically equal to -->
|
| 180 |
+
<xsl:when test="starts-with($content,'≎')"><xsl:value-of select="'\Bumpeq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≎')"/></xsl:call-template></xsl:when> <!--/Bumpeq R: bumpy equals --> <!-- Required amssymb -->
|
| 181 |
+
<xsl:when test="starts-with($content,'≏')"><xsl:value-of select="'\bumpeq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≏')"/></xsl:call-template></xsl:when> <!--/bumpeq R: bumpy equals, equals --> <!-- Required amssymb -->
|
| 182 |
+
<xsl:when test="starts-with($content,'≐')"><xsl:value-of select="'\doteq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≐')"/></xsl:call-template></xsl:when> <!--/doteq R: equals, single dot above -->
|
| 183 |
+
<xsl:when test="starts-with($content,'≑')"><xsl:value-of select="'\doteqdot '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≑')"/></xsl:call-template></xsl:when> <!--/doteqdot /Doteq R: eq, even dots --> <!-- Required amssymb -->
|
| 184 |
+
<xsl:when test="starts-with($content,'≒')"><xsl:value-of select="'\fallingdotseq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≒')"/></xsl:call-template></xsl:when> <!--/fallingdotseq R: eq, falling dots --> <!-- Required amssymb -->
|
| 185 |
+
<xsl:when test="starts-with($content,'≓')"><xsl:value-of select="'\risingdotseq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≓')"/></xsl:call-template></xsl:when> <!--/risingdotseq R: eq, rising dots --> <!-- Required amssymb -->
|
| 186 |
+
<!-- <xsl:when test="starts-with($content,'≔')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≔')"/></xsl:call-template></xsl:when> /coloneq R: colon, equals -->
|
| 187 |
+
<!-- <xsl:when test="starts-with($content,'≕')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≕')"/></xsl:call-template></xsl:when> /eqcolon R: equals, colon -->
|
| 188 |
+
<xsl:when test="starts-with($content,'≖')"><xsl:value-of select="'\eqcirc '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≖')"/></xsl:call-template></xsl:when> <!--/eqcirc R: circle on equals sign --> <!-- Required amssymb -->
|
| 189 |
+
<xsl:when test="starts-with($content,'≗')"><xsl:value-of select="'\circeq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≗')"/></xsl:call-template></xsl:when> <!--/circeq R: circle, equals --> <!-- Required amssymb -->
|
| 190 |
+
<!-- ? --> <xsl:when test="starts-with($content,'≘')"><xsl:value-of select="'\stackrel{\frown}{=}'" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≘')"/></xsl:call-template></xsl:when>
|
| 191 |
+
<!-- ? --> <xsl:when test="starts-with($content,'≙')"><xsl:value-of select="'\stackrel{\wedge}{=}'" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≙')"/></xsl:call-template></xsl:when> <!--/wedgeq R: corresponds to (wedge, equals) -->
|
| 192 |
+
<!-- ? --> <xsl:when test="starts-with($content,'≚')"><xsl:value-of select="'\stackrel{\vee}{=}'" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≚')"/></xsl:call-template></xsl:when> <!--logical or, equals -->
|
| 193 |
+
<!-- ? --> <xsl:when test="starts-with($content,'≛')"><xsl:value-of select="'\stackrel{\star}{=}'" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≛')"/></xsl:call-template></xsl:when> <!--equal, asterisk above -->
|
| 194 |
+
<xsl:when test="starts-with($content,'≜')"><xsl:value-of select="'\triangleq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≜')"/></xsl:call-template></xsl:when> <!--/triangleq R: triangle, equals --> <!-- Required amssymb -->
|
| 195 |
+
<!-- ? --> <xsl:when test="starts-with($content,'≝')"><xsl:value-of select="'\stackrel{\scriptscriptstyle\mathrm{def}}{=}'" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≝')"/></xsl:call-template></xsl:when>
|
| 196 |
+
<!-- ? --> <xsl:when test="starts-with($content,'≞')"><xsl:value-of select="'\stackrel{\scriptscriptstyle\mathrm{m}}{=}'" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≞')"/></xsl:call-template></xsl:when>
|
| 197 |
+
<!-- ? --> <xsl:when test="starts-with($content,'≟')"><xsl:value-of select="'\stackrel{?}{=}'" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≟')"/></xsl:call-template></xsl:when> <!--/questeq R: equal with questionmark -->
|
| 198 |
+
<!-- <xsl:when test="starts-with($content,'≠︀')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≠︀')"/></xsl:call-template></xsl:when> not equal, dot -->
|
| 199 |
+
<xsl:when test="starts-with($content,'≠')"><xsl:value-of select="'\ne '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≠')"/></xsl:call-template></xsl:when> <!--/ne /neq R: not equal -->
|
| 200 |
+
<!-- <xsl:when test="starts-with($content,'≡⃥')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≡⃥')"/></xsl:call-template></xsl:when> reverse not equivalent -->
|
| 201 |
+
<xsl:when test="starts-with($content,'≡')"><xsl:value-of select="'\equiv '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≡')"/></xsl:call-template></xsl:when> <!--/equiv R: identical with -->
|
| 202 |
+
<xsl:when test="starts-with($content,'≢')"><xsl:value-of select="'\not\equiv '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≢')"/></xsl:call-template></xsl:when> <!--/nequiv N: not identical with -->
|
| 203 |
+
<!-- <xsl:when test="starts-with($content,'≣')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≣')"/></xsl:call-template></xsl:when> -->
|
| 204 |
+
<xsl:when test="starts-with($content,'≤')"><xsl:value-of select="'\le '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≤')"/></xsl:call-template></xsl:when> <!--/leq /le R: less-than-or-equal -->
|
| 205 |
+
<xsl:when test="starts-with($content,'≥')"><xsl:value-of select="'\ge '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≥')"/></xsl:call-template></xsl:when> <!--/geq /ge R: greater-than-or-equal -->
|
| 206 |
+
<xsl:when test="starts-with($content,'≦')"><xsl:value-of select="'\leqq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≦')"/></xsl:call-template></xsl:when> <!--/leqq R: less, double equals --> <!-- Required amssymb -->
|
| 207 |
+
<xsl:when test="starts-with($content,'≧')"><xsl:value-of select="'\geqq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≧')"/></xsl:call-template></xsl:when> <!--/geqq R: greater, double equals --> <!-- Required amssymb -->
|
| 208 |
+
<xsl:when test="starts-with($content,'≨')"><xsl:value-of select="'\lneqq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≨')"/></xsl:call-template></xsl:when> <!--/lneqq N: less, not double equals --> <!-- Required amssymb -->
|
| 209 |
+
<xsl:when test="starts-with($content,'≩')"><xsl:value-of select="'\gneqq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≩')"/></xsl:call-template></xsl:when> <!--/gneqq N: greater, not dbl equals --> <!-- Required amssymb -->
|
| 210 |
+
<!-- <xsl:when test="starts-with($content,'≪̸︀')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≪̸︀')"/></xsl:call-template></xsl:when> not much less than, variant -->
|
| 211 |
+
<!-- <xsl:when test="starts-with($content,'≪̸')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≪̸')"/></xsl:call-template></xsl:when> not, vert, much less than -->
|
| 212 |
+
<xsl:when test="starts-with($content,'≪')"><xsl:value-of select="'\ll '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≪')"/></xsl:call-template></xsl:when> <!--/ll R: double less-than sign -->
|
| 213 |
+
<!-- <xsl:when test="starts-with($content,'≫̸︀')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≫̸︀')"/></xsl:call-template></xsl:when> not much greater than, variant -->
|
| 214 |
+
<!-- <xsl:when test="starts-with($content,'≫̸')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≫̸')"/></xsl:call-template></xsl:when> not, vert, much greater than -->
|
| 215 |
+
<xsl:when test="starts-with($content,'≫')"><xsl:value-of select="'\gg '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≫')"/></xsl:call-template></xsl:when> <!--/gg R: dbl greater-than sign -->
|
| 216 |
+
<xsl:when test="starts-with($content,'≬')"><xsl:value-of select="'\between '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≬')"/></xsl:call-template></xsl:when> <!--/between R: between --> <!-- Required amssymb -->
|
| 217 |
+
<xsl:when test="starts-with($content,'≭')"><xsl:value-of select="'\not\asymp '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≭')"/></xsl:call-template></xsl:when>
|
| 218 |
+
<xsl:when test="starts-with($content,'≮')"><xsl:value-of select="'\nless '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≮')"/></xsl:call-template></xsl:when> <!--/nless N: not less-than --> <!-- Required amssymb -->
|
| 219 |
+
<xsl:when test="starts-with($content,'≯')"><xsl:value-of select="'\ngtr '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≯')"/></xsl:call-template></xsl:when> <!--/ngtr N: not greater-than --> <!-- Required amssymb -->
|
| 220 |
+
<xsl:when test="starts-with($content,'≰⃥')"><xsl:value-of select="'\nleq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≰⃥')"/></xsl:call-template></xsl:when> <!--/nleq N: not less-than-or-equal --> <!-- Required amssymb -->
|
| 221 |
+
<xsl:when test="starts-with($content,'≰')"><xsl:value-of select="'\nleqq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≰')"/></xsl:call-template></xsl:when> <!--/nleqq N: not less, dbl equals --> <!-- Required amssymb -->
|
| 222 |
+
<xsl:when test="starts-with($content,'≱⃥')"><xsl:value-of select="'\ngeq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≱⃥')"/></xsl:call-template></xsl:when> <!--/ngeq N: not greater-than-or-equal --> <!-- Required amssymb -->
|
| 223 |
+
<xsl:when test="starts-with($content,'≱')"><xsl:value-of select="'\ngeqq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≱')"/></xsl:call-template></xsl:when> <!--/ngeqq N: not greater, dbl equals --> <!-- Required amssymb -->
|
| 224 |
+
<xsl:when test="starts-with($content,'≲')"><xsl:value-of select="'\lesssim '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≲')"/></xsl:call-template></xsl:when> <!--/lesssim R: less, similar --> <!-- Required amssymb -->
|
| 225 |
+
<xsl:when test="starts-with($content,'≳')"><xsl:value-of select="'\gtrsim '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≳')"/></xsl:call-template></xsl:when> <!--/gtrsim R: greater, similar --> <!-- Required amssymb -->
|
| 226 |
+
<xsl:when test="starts-with($content,'≴')"><xsl:value-of select="'\not\lesssim '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≴')"/></xsl:call-template></xsl:when> <!--not less, similar --> <!-- Required amssymb -->
|
| 227 |
+
<xsl:when test="starts-with($content,'≵')"><xsl:value-of select="'\not\gtrsim '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≵')"/></xsl:call-template></xsl:when> <!--not greater, similar --> <!-- Required amssymb -->
|
| 228 |
+
<xsl:when test="starts-with($content,'≶')"><xsl:value-of select="'\lessgtr '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≶')"/></xsl:call-template></xsl:when> <!--/lessgtr R: less, greater --> <!-- Required amssymb -->
|
| 229 |
+
<xsl:when test="starts-with($content,'≷')"><xsl:value-of select="'\gtrless '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≷')"/></xsl:call-template></xsl:when> <!--/gtrless R: greater, less --> <!-- Required amssymb -->
|
| 230 |
+
<xsl:when test="starts-with($content,'≸')"><xsl:value-of select="'\not\lessgtr '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≸')"/></xsl:call-template></xsl:when> <!--not less, greater --> <!-- Required amssymb -->
|
| 231 |
+
<xsl:when test="starts-with($content,'≹')"><xsl:value-of select="'\not\gtrless '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≹')"/></xsl:call-template></xsl:when> <!--not greater, less --> <!-- Required amssymb -->
|
| 232 |
+
<xsl:when test="starts-with($content,'≺')"><xsl:value-of select="'\prec '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≺')"/></xsl:call-template></xsl:when> <!--/prec R: precedes -->
|
| 233 |
+
<xsl:when test="starts-with($content,'≻')"><xsl:value-of select="'\succ '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≻')"/></xsl:call-template></xsl:when> <!--/succ R: succeeds -->
|
| 234 |
+
<xsl:when test="starts-with($content,'≼')"><xsl:value-of select="'\preccurlyeq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≼')"/></xsl:call-template></xsl:when> <!--/preccurlyeq R: precedes, curly eq --> <!-- Required amssymb -->
|
| 235 |
+
<xsl:when test="starts-with($content,'≽')"><xsl:value-of select="'\succcurlyeq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≽')"/></xsl:call-template></xsl:when> <!--/succcurlyeq R: succeeds, curly eq --> <!-- Required amssymb -->
|
| 236 |
+
<xsl:when test="starts-with($content,'≾')"><xsl:value-of select="'\precsim '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≾')"/></xsl:call-template></xsl:when> <!--/precsim R: precedes, similar --> <!-- Required amssymb -->
|
| 237 |
+
<xsl:when test="starts-with($content,'≿')"><xsl:value-of select="'\succsim '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '≿')"/></xsl:call-template></xsl:when> <!--/succsim R: succeeds, similar --> <!-- Required amssymb -->
|
| 238 |
+
<xsl:when test="starts-with($content,'⊀')"><xsl:value-of select="'\nprec '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊀')"/></xsl:call-template></xsl:when> <!--/nprec N: not precedes --> <!-- Required amssymb -->
|
| 239 |
+
<xsl:when test="starts-with($content,'⊁')"><xsl:value-of select="'\nsucc '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊁')"/></xsl:call-template></xsl:when> <!--/nsucc N: not succeeds --> <!-- Required amssymb -->
|
| 240 |
+
<xsl:when test="starts-with($content,'⊂')"><xsl:value-of select="'\subset '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊂')"/></xsl:call-template></xsl:when> <!--/subset R: subset or is implied by -->
|
| 241 |
+
<xsl:when test="starts-with($content,'⊃')"><xsl:value-of select="'\supset '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊃')"/></xsl:call-template></xsl:when> <!--/supset R: superset or implies -->
|
| 242 |
+
<xsl:when test="starts-with($content,'⊄')"><xsl:value-of select="'\not\subset '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊄')"/></xsl:call-template></xsl:when> <!--not subset -->
|
| 243 |
+
<xsl:when test="starts-with($content,'⊅')"><xsl:value-of select="'\not\supset '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊅')"/></xsl:call-template></xsl:when> <!--not superset -->
|
| 244 |
+
<xsl:when test="starts-with($content,'⊆')"><xsl:value-of select="'\subseteq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊆')"/></xsl:call-template></xsl:when> <!--/subseteq R: subset, equals -->
|
| 245 |
+
<xsl:when test="starts-with($content,'⊇')"><xsl:value-of select="'\supseteq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊇')"/></xsl:call-template></xsl:when> <!--/supseteq R: superset, equals -->
|
| 246 |
+
<xsl:when test="starts-with($content,'⊎')"><xsl:value-of select="'\uplus '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊎')"/></xsl:call-template></xsl:when> <!--/uplus B: plus sign in union -->
|
| 247 |
+
<xsl:when test="starts-with($content,'⊓')"><xsl:value-of select="'\sqcap '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊓')"/></xsl:call-template></xsl:when> <!--/sqcap B: square intersection -->
|
| 248 |
+
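<xsl:when test="starts-with($content,'⊔')"><xsl:value-of select="'\bigsqcup '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊔')"/></xsl:call-template></xsl:when> <!--/sqcup B: square union; NOTE(review): this branch emits \bigsqcup (the large operator) rather than the binary \sqcup named here; confirm which is intended -->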
|
| 249 |
+
<xsl:when test="starts-with($content,'⊕')"><xsl:value-of select="'\oplus '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊕')"/></xsl:call-template></xsl:when> <!--/oplus B: plus sign in circle -->
|
| 250 |
+
<xsl:when test="starts-with($content,'⊖')"><xsl:value-of select="'\ominus '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊖')"/></xsl:call-template></xsl:when> <!--/ominus B: minus sign in circle -->
|
| 251 |
+
<xsl:when test="starts-with($content,'⊗')"><xsl:value-of select="'\otimes '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊗')"/></xsl:call-template></xsl:when> <!--/otimes B: multiply sign in circle -->
|
| 252 |
+
<xsl:when test="starts-with($content,'⊘')"><xsl:value-of select="'\oslash '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊘')"/></xsl:call-template></xsl:when> <!--/oslash B: solidus in circle -->
|
| 253 |
+
<!-- ? --> <xsl:when test="starts-with($content,'⊙')"><xsl:value-of select="'\odot '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊙')"/></xsl:call-template></xsl:when> <!--/odot B: middle dot in circle --> <!--/bigodot L: circle dot operator -->
|
| 254 |
+
<xsl:when test="starts-with($content,'⊟')"><xsl:value-of select="'\boxminus '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊟')"/></xsl:call-template></xsl:when> <!--/boxminus B: minus sign in box --> <!-- Required amssymb -->
|
| 255 |
+
<xsl:when test="starts-with($content,'⊤')"><xsl:value-of select="'\top '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊤')"/></xsl:call-template></xsl:when> <!--/top top -->
|
| 256 |
+
<xsl:when test="starts-with($content,'⊥')"><xsl:value-of select="'\perp '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊥')"/></xsl:call-template></xsl:when> <!--/perp R: perpendicular --><!--/bot bottom -->
|
| 257 |
+
<xsl:when test="starts-with($content,'⊦')"><xsl:value-of select="'\vdash '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊦')"/></xsl:call-template></xsl:when> <!--/vdash R: vertical, dash -->
|
| 258 |
+
<xsl:when test="starts-with($content,'⊧')"><xsl:value-of select="'\vDash '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊧')"/></xsl:call-template></xsl:when> <!--/vDash R: vertical, dbl dash --> <!-- Required amssymb -->
|
| 259 |
+
<xsl:when test="starts-with($content,'⊨')"><xsl:value-of select="'\models '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊨')"/></xsl:call-template></xsl:when> <!--/models R: -->
|
| 260 |
+
<xsl:when test="starts-with($content,'⊪')"><xsl:value-of select="'\Vvdash '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⊪')"/></xsl:call-template></xsl:when> <!--/Vvdash R: triple vertical, dash --> <!-- Required amssymb -->
|
| 261 |
+
<xsl:when test="starts-with($content,'⋀')"><xsl:value-of select="'\bigwedge '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⋀')"/></xsl:call-template></xsl:when> <!--/bigwedge L: logical and operator -->
|
| 262 |
+
<xsl:when test="starts-with($content,'⋁')"><xsl:value-of select="'\bigvee '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⋁')"/></xsl:call-template></xsl:when> <!--/bigvee L: logical or operator -->
|
| 263 |
+
<xsl:when test="starts-with($content,'⋂')"><xsl:value-of select="'\bigcap '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⋂')"/></xsl:call-template></xsl:when> <!--/bigcap L: intersection operator -->
|
| 264 |
+
<xsl:when test="starts-with($content,'⋃')"><xsl:value-of select="'\bigcup '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⋃')"/></xsl:call-template></xsl:when> <!--/bigcup L: union operator -->
|
| 265 |
+
<xsl:when test="starts-with($content,'⋄')"><xsl:value-of select="'\diamond '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⋄')"/></xsl:call-template></xsl:when> <!--/diamond B: open diamond -->
|
| 266 |
+
<xsl:when test="starts-with($content,'⋅')"><xsl:value-of select="'\cdot '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⋅')"/></xsl:call-template></xsl:when> <!--/cdot B: small middle dot -->
|
| 267 |
+
<xsl:when test="starts-with($content,'⋆')"><xsl:value-of select="'\star '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⋆')"/></xsl:call-template></xsl:when> <!--/star B: small star, filled -->
|
| 268 |
+
<xsl:when test="starts-with($content,'⋇')"><xsl:value-of select="'\divideontimes '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⋇')"/></xsl:call-template></xsl:when> <!--/divideontimes B: division on times --> <!-- Required amssymb -->
|
| 269 |
+
<xsl:when test="starts-with($content,'⋈')"><xsl:value-of select="'\bowtie '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⋈')"/></xsl:call-template></xsl:when> <!--/bowtie R: -->
|
| 270 |
+
<xsl:when test="starts-with($content,'⋍')"><xsl:value-of select="'\backsimeq '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⋍')"/></xsl:call-template></xsl:when> <!--/backsimeq R: reverse similar, eq --> <!-- Required amssymb -->
|
| 271 |
+
<xsl:when test="starts-with($content,'⋯')"><xsl:value-of select="'\cdots '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⋯')"/></xsl:call-template></xsl:when> <!--/cdots, three dots, centered -->
|
| 272 |
+
<!-- <xsl:when test="starts-with($content,'⋰')"><xsl:value-of select="' '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⋰')"/></xsl:call-template></xsl:when> three dots, ascending -->
|
| 273 |
+
<xsl:when test="starts-with($content,'⋱')"><xsl:value-of select="'\ddots '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '⋱')"/></xsl:call-template></xsl:when> <!--/ddots, three dots, descending -->
|
| 274 |
+
|
| 275 |
+
<!-- ====================================================================== -->
|
| 276 |
+
<xsl:when test="starts-with($content,'□')"><xsl:value-of select="'\square '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '□')"/></xsl:call-template></xsl:when> <!--/square, square --> <!-- Required amssymb -->
|
| 277 |
+
<xsl:when test="starts-with($content,'▪')"><xsl:value-of select="'\blacksquare '" /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '▪')"/></xsl:call-template></xsl:when> <!--/blacksquare, square, filled --> <!-- Required amssymb -->
|
| 278 |
+
|
| 279 |
+
<xsl:when test='starts-with($content,"'")'><xsl:value-of select='"\text{'}"' /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select='substring-after($content, "'")'/></xsl:call-template></xsl:when><!-- \text required amslatex -->
|
| 280 |
+
<xsl:when test='starts-with($content,"(")'><xsl:value-of select='"\left("' /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '(')"/></xsl:call-template></xsl:when>
|
| 281 |
+
<xsl:when test='starts-with($content,")")'><xsl:value-of select='"\right)"' /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, ')')"/></xsl:call-template></xsl:when>
|
| 282 |
+
<xsl:when test='starts-with($content,"[")'><xsl:value-of select='"\left["' /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '[')"/></xsl:call-template></xsl:when>
|
| 283 |
+
<xsl:when test='starts-with($content,"]")'><xsl:value-of select='"\right]"' /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, ']')"/></xsl:call-template></xsl:when>
|
| 284 |
+
<xsl:when test='starts-with($content,"{")'><xsl:value-of select='"\left\{"' /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '{')"/></xsl:call-template></xsl:when>
|
| 285 |
+
<xsl:when test='starts-with($content,"}")'><xsl:value-of select='"\right\}"' /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '}')"/></xsl:call-template></xsl:when>
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
<xsl:otherwise>
|
| 289 |
+
<xsl:value-of select="substring($content,1,1)"/>
|
| 290 |
+
<xsl:call-template name="replaceEntities">
|
| 291 |
+
<xsl:with-param name="content" select="substring($content, 2)"/>
|
| 292 |
+
</xsl:call-template>
|
| 293 |
+
</xsl:otherwise>
|
| 294 |
+
</xsl:choose></xsl:if>
|
| 295 |
+
</xsl:template>
|
| 296 |
+
|
| 297 |
+
<!-- replaceMtextEntities
     Rewrites the spacing characters found in text content into LaTeX \hspace
     commands and returns the result with ordinary XML whitespace collapsed.
     Parameter:
       content : the string to process.
     The template recurses once per replaced occurrence; the thick-space
     sequence is tested first because it begins with the thin-space character. -->
<xsl:template name="replaceMtextEntities">
  <xsl:param name="content"/>
  <!-- The needles are written as numeric character references so that these
       invisible characters cannot be mistaken for (or silently normalised
       into) ordinary spaces by an editor or a copy/paste step.
       NOTE(review): code points assumed from the ThickSpace / ThinSpace
       entity names; confirm against the upstream mmltex entities.xsl. -->
  <xsl:variable name="thickSpace" select="'&#x2009;&#x200A;&#x200A;'"/>
  <xsl:variable name="thinSpace" select="'&#x2009;'"/>
  <xsl:choose>
    <!-- ThickSpace: space of width 5/18 em -->
    <xsl:when test="contains($content,$thickSpace)">
      <xsl:call-template name="replaceMtextEntities">
        <xsl:with-param name="content" select="concat(substring-before($content,$thickSpace),'\hspace{0.28em}',substring-after($content,$thickSpace))"/>
      </xsl:call-template>
    </xsl:when>
    <!-- ThinSpace: space of width 3/18 em -->
    <xsl:when test="contains($content,$thinSpace)">
      <xsl:call-template name="replaceMtextEntities">
        <xsl:with-param name="content" select="concat(substring-before($content,$thinSpace),'\hspace{0.17em}',substring-after($content,$thinSpace))"/>
      </xsl:call-template>
    </xsl:when>
    <!-- Nothing left to replace: emit the text with leading/trailing
         whitespace removed and internal runs collapsed. -->
    <xsl:otherwise>
      <xsl:value-of select="normalize-space($content)"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
| 315 |
+
|
| 316 |
+
</xsl:stylesheet>
|
ultradata_math_parser/mmltex/glayout.xsl
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version='1.0' encoding="UTF-8"?>
|
| 2 |
+
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
| 3 |
+
xmlns:m="http://www.w3.org/1998/Math/MathML"
|
| 4 |
+
version='1.0'>
|
| 5 |
+
|
| 6 |
+
<!-- ====================================================================== -->
|
| 7 |
+
<!-- $id: glayout.xsl, 2002/17/05 Exp $
|
| 8 |
+
This file is part of the XSLT MathML Library distribution.
|
| 9 |
+
See ./README or http://www.raleigh.ru/MathML/mmltex for
|
| 10 |
+
copyright and other information -->
|
| 11 |
+
<!-- ====================================================================== -->
|
| 12 |
+
|
| 13 |
+
<!-- m:mfrac
     Emits a fraction built from the first child (numerator) and the second
     child (denominator).
       bevelled='true'   : {num}/{den}   (inline slash form)
       @linethickness    : \genfrac{}{}{THICKNESS}{}{num}{den}
       otherwise         : \frac{num}{den}
     numalign / denomalign values 'left' and 'right' are approximated with
     \hfill on the opposite side of the operand. -->
<xsl:template match="m:mfrac">
  <xsl:variable name="bevelled" select="@bevelled='true'"/>

  <!-- Opening part, up to and including the brace that opens the numerator. -->
  <xsl:choose>
    <xsl:when test="$bevelled">
      <!-- The \raisebox based rendering was disabled in this stylesheet, which
           left this branch empty and produced "num}{den}" with unbalanced
           braces. Open a plain group so the output stays well formed. -->
      <xsl:text>{</xsl:text>
    </xsl:when>
    <xsl:when test="@linethickness">
      <xsl:text>\genfrac{}{}{</xsl:text>
      <xsl:choose>
        <!-- Any numeric value, including 0 (number(...) alone is false for 0,
             which used to emit a bare "0" without a unit). -->
        <xsl:when test="string(number(@linethickness))!='NaN'">
          <xsl:value-of select="@linethickness div 10"/>
          <xsl:text>ex</xsl:text>
        </xsl:when>
        <xsl:when test="@linethickness='thin'">
          <xsl:text>.05ex</xsl:text>
        </xsl:when>
        <!-- 'medium': leave the argument empty so the default rule is used. -->
        <xsl:when test="@linethickness='medium'"/>
        <xsl:when test="@linethickness='thick'">
          <xsl:text>.2ex</xsl:text>
        </xsl:when>
        <!-- Anything else (for example a length with a unit) is passed through. -->
        <xsl:otherwise>
          <xsl:value-of select="@linethickness"/>
        </xsl:otherwise>
      </xsl:choose>
      <xsl:text>}{}{</xsl:text>
    </xsl:when>
    <xsl:otherwise>
      <xsl:text>\frac{</xsl:text>
    </xsl:otherwise>
  </xsl:choose>

  <!-- Numerator -->
  <xsl:if test="@numalign='right'">
    <xsl:text>\hfill </xsl:text>
  </xsl:if>
  <xsl:apply-templates select="./*[1]"/>
  <xsl:if test="@numalign='left'">
    <xsl:text>\hfill </xsl:text>
  </xsl:if>

  <!-- Separator between numerator and denominator groups. -->
  <xsl:choose>
    <xsl:when test="$bevelled">
      <xsl:text>}/{</xsl:text>
    </xsl:when>
    <xsl:otherwise>
      <xsl:text>}{</xsl:text>
    </xsl:otherwise>
  </xsl:choose>

  <!-- Denominator -->
  <xsl:if test="@denomalign='right'">
    <xsl:text>\hfill </xsl:text>
  </xsl:if>
  <xsl:apply-templates select="./*[2]"/>
  <xsl:if test="@denomalign='left'">
    <xsl:text>\hfill </xsl:text>
  </xsl:if>
  <xsl:text>}</xsl:text>
</xsl:template>
|
| 63 |
+
|
| 64 |
+
<!-- m:mroot
     With exactly two children, emits \sqrt[index]{base}, where the base is the
     first child and the index is the second. Any other child count is reported
     as "exception 25" both through xsl:message and in the output text. -->
<xsl:template match="m:mroot">
  <xsl:variable name="wellFormed" select="count(*)=2"/>
  <xsl:if test="$wellFormed">
    <xsl:text>\sqrt[</xsl:text>
    <xsl:apply-templates select="*[2]"/>
    <xsl:text>]{</xsl:text>
    <xsl:apply-templates select="*[1]"/>
    <xsl:text>}</xsl:text>
  </xsl:if>
  <xsl:if test="not($wellFormed)">
    <!-- number of arguments is not 2 : code 25 -->
    <xsl:message>exception 25:</xsl:message>
    <xsl:text>\text{exception 25:}</xsl:text>
  </xsl:if>
</xsl:template>
|
| 80 |
+
|
| 81 |
+
<!-- m:msqrt: wraps the converted children in \sqrt{...}. -->
<xsl:template match="m:msqrt">
  <xsl:text>\sqrt{</xsl:text>
  <xsl:apply-templates select="node()"/>
  <xsl:text>}</xsl:text>
</xsl:template>
|
| 86 |
+
|
| 87 |
+
<!-- m:mfenced
     Emits the children surrounded by the opening and closing delimiters and
     separated by the characters of @separators (default ",").
       @open  absent : "("        @close absent : ")"
     A delimiter that is exactly one of { } [ ] ( ) | is emitted with
     \left / \right (braces are escaped). Any other value, including the empty
     string, is emitted verbatim; when only one side is sizable the other side
     gets "\left." or "\right." so that the pair always balances.
     The closing side is decided from @close; it previously tested @open, which
     produced output such as "\right\" for open="{" close="". -->
<xsl:template match="m:mfenced">
  <!-- Effective delimiters after applying the defaults. -->
  <xsl:variable name="open">
    <xsl:choose>
      <xsl:when test="@open"><xsl:value-of select="@open"/></xsl:when>
      <xsl:otherwise>(</xsl:otherwise>
    </xsl:choose>
  </xsl:variable>
  <xsl:variable name="close">
    <xsl:choose>
      <xsl:when test="@close"><xsl:value-of select="@close"/></xsl:when>
      <xsl:otherwise>)</xsl:otherwise>
    </xsl:choose>
  </xsl:variable>
  <!-- True when the delimiter is a single character usable after \left / \right. -->
  <xsl:variable name="openSized" select="translate($open,'{}[]()|','{{{{{{{')='{'"/>
  <xsl:variable name="closeSized" select="translate($close,'{}[]()|','{{{{{{{')='{'"/>

  <!-- Opening delimiter -->
  <xsl:choose>
    <xsl:when test="$openSized">
      <xsl:text>\left</xsl:text>
      <xsl:if test="$open='{' or $open='}'">
        <xsl:text>\</xsl:text>
      </xsl:if>
    </xsl:when>
    <!-- Only the closing side is sizable: add an invisible \left. partner. -->
    <xsl:when test="$closeSized">
      <xsl:text>\left.</xsl:text>
    </xsl:when>
  </xsl:choose>
  <xsl:value-of select="$open"/>

  <!-- Content, with separators between consecutive children. -->
  <xsl:choose>
    <xsl:when test="count(./*)>1">
      <xsl:variable name="symbol">
        <xsl:choose>
          <xsl:when test="@separators">
            <!-- startspace removes the spaces from the separator list. -->
            <xsl:call-template name="startspace">
              <xsl:with-param name="symbol" select="@separators"/>
            </xsl:call-template>
          </xsl:when>
          <xsl:otherwise>,</xsl:otherwise>
        </xsl:choose>
      </xsl:variable>
      <xsl:for-each select="./*">
        <xsl:apply-templates select="."/>
        <xsl:if test="not(position()=last())">
          <xsl:choose>
            <!-- More gaps than separators: keep repeating the last one. -->
            <xsl:when test="position()>string-length($symbol)">
              <xsl:value-of select="substring($symbol,string-length($symbol))"/>
            </xsl:when>
            <xsl:otherwise>
              <xsl:value-of select="substring($symbol,position(),1)"/>
            </xsl:otherwise>
          </xsl:choose>
        </xsl:if>
      </xsl:for-each>
    </xsl:when>
    <xsl:otherwise>
      <xsl:apply-templates/>
    </xsl:otherwise>
  </xsl:choose>

  <!-- Closing delimiter -->
  <xsl:choose>
    <xsl:when test="$closeSized">
      <xsl:text>\right</xsl:text>
      <xsl:if test="$close='{' or $close='}'">
        <xsl:text>\</xsl:text>
      </xsl:if>
      <xsl:value-of select="$close"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:value-of select="$close"/>
      <!-- Only the opening side is sizable: add an invisible \right. partner. -->
      <xsl:if test="$openSized">
        <xsl:text>\right.</xsl:text>
      </xsl:if>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
| 143 |
+
|
| 144 |
+
<!-- m:mphantom: wraps the converted children in \phantom{...}. -->
<xsl:template match="m:mphantom">
  <xsl:text>\phantom{</xsl:text>
  <xsl:apply-templates select="node()"/>
  <xsl:text>}</xsl:text>
</xsl:template>
|
| 149 |
+
|
| 150 |
+
<!-- m:menclose
       notation='actuarial' : \overline{ content \hspace{.2em}|}
       notation='radical'   : \sqrt{ content }
       anything else        : \overline{) content }
     The opener is chosen first, the children are converted once, and the
     closer is completed afterwards. -->
<xsl:template match="m:menclose">
  <xsl:variable name="actuarial" select="@notation = 'actuarial'"/>
  <xsl:choose>
    <xsl:when test="$actuarial">
      <xsl:text>\overline{</xsl:text>
    </xsl:when>
    <xsl:when test="@notation = 'radical'">
      <xsl:text>\sqrt{</xsl:text>
    </xsl:when>
    <xsl:otherwise>
      <xsl:text>\overline{)</xsl:text>
    </xsl:otherwise>
  </xsl:choose>
  <xsl:apply-templates/>
  <!-- Only the actuarial form carries the trailing vertical bar. -->
  <xsl:if test="$actuarial">
    <xsl:text>\hspace{.2em}|</xsl:text>
  </xsl:if>
  <xsl:text>}</xsl:text>
</xsl:template>
|
| 169 |
+
|
| 170 |
+
<!-- m:mrow: emits the converted children in order and adds nothing itself. -->
<xsl:template match="m:mrow">
  <xsl:apply-templates select="node()"/>
</xsl:template>
|
| 173 |
+
|
| 174 |
+
<!-- m:mstyle
     Applies the optional @background and @color attributes around the
     converted children:
       @background : \colorbox[rgb]{...}{$ content $}
       @color      : \textcolor[rgb]{...}{ content }
     When both are present the text colour is nested inside the colour box.
     The colour value is produced by the named template "color", which is
     defined outside this template. -->
<xsl:template match="m:mstyle">
  <xsl:variable name="hasBackground" select="boolean(@background)"/>
  <xsl:variable name="hasColor" select="boolean(@color)"/>

  <xsl:if test="$hasBackground">
    <xsl:text>\colorbox[rgb]{</xsl:text>
    <xsl:call-template name="color">
      <xsl:with-param name="color" select="@background"/>
    </xsl:call-template>
    <xsl:text>}{$</xsl:text>
  </xsl:if>
  <xsl:if test="$hasColor">
    <xsl:text>\textcolor[rgb]{</xsl:text>
    <xsl:call-template name="color">
      <xsl:with-param name="color" select="@color"/>
    </xsl:call-template>
    <xsl:text>}{</xsl:text>
  </xsl:if>

  <xsl:apply-templates/>

  <!-- Close in reverse order of opening. -->
  <xsl:if test="$hasColor">
    <xsl:text>}</xsl:text>
  </xsl:if>
  <xsl:if test="$hasBackground">
    <xsl:text>$}</xsl:text>
  </xsl:if>
</xsl:template>
|
| 197 |
+
<!--
|
| 198 |
+
|
| 199 |
+
<xsl:template match="m:mstyle">
|
| 200 |
+
<xsl:if test="@displaystyle='true'">
|
| 201 |
+
<xsl:text>{\displaystyle</xsl:text>
|
| 202 |
+
</xsl:if>
|
| 203 |
+
<xsl:if test="@scriptlevel=2">
|
| 204 |
+
<xsl:text>{\scriptscriptstyle</xsl:text>
|
| 205 |
+
</xsl:if>
|
| 206 |
+
<xsl:apply-templates/>
|
| 207 |
+
<xsl:if test="@scriptlevel=2">
|
| 208 |
+
<xsl:text>}</xsl:text>
|
| 209 |
+
</xsl:if>
|
| 210 |
+
<xsl:if test="@displaystyle='true'">
|
| 211 |
+
<xsl:text>}</xsl:text>
|
| 212 |
+
</xsl:if>
|
| 213 |
+
</xsl:template>
|
| 214 |
+
-->
|
| 215 |
+
|
| 216 |
+
<!-- m:merror: no special rendering; the children are converted as they are. -->
<xsl:template match="m:merror">
  <xsl:apply-templates select="node()"/>
</xsl:template>
|
| 219 |
+
|
| 220 |
+
</xsl:stylesheet>
|
ultradata_math_parser/mmltex/mmltex.xsl
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version='1.0' encoding="UTF-8"?>
|
| 2 |
+
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
| 3 |
+
xmlns:m="http://www.w3.org/1998/Math/MathML"
|
| 4 |
+
version='1.0'>
|
| 5 |
+
|
| 6 |
+
<xsl:output method="text" indent="no" encoding="UTF-8"/>
|
| 7 |
+
|
| 8 |
+
<!-- ====================================================================== -->
|
| 9 |
+
<!-- $id: mmltex.xsl, 2002/22/11 Exp $
|
| 10 |
+
This file is part of the XSLT MathML Library distribution.
|
| 11 |
+
See ./README or http://www.raleigh.ru/MathML/mmltex for
|
| 12 |
+
copyright and other information -->
|
| 13 |
+
<!-- ====================================================================== -->
|
| 14 |
+
|
| 15 |
+
<xsl:include href="tokens.xsl"/>
|
| 16 |
+
<xsl:include href="glayout.xsl"/>
|
| 17 |
+
<xsl:include href="scripts.xsl"/>
|
| 18 |
+
<xsl:include href="tables.xsl"/>
|
| 19 |
+
<xsl:include href="entities.xsl"/>
|
| 20 |
+
<xsl:include href="cmarkup.xsl"/>
|
| 21 |
+
|
| 22 |
+
<!-- Note: variables colora (template color) and symbola (template startspace) only for Sablotron -->
|
| 23 |
+
|
| 24 |
+
<!-- startspace
     Returns the string value of $symbol with every space character removed.
     Parameter:
       symbol : a string, or a node whose string value is used.
     A single translate() call replaces the earlier recursion, which removed
     one space per call and rebuilt the string each time. -->
<xsl:template name="startspace">
  <xsl:param name="symbol"/>
  <!-- translate() with an empty replacement string deletes the listed characters. -->
  <xsl:value-of select="translate($symbol,' ','')"/>
</xsl:template>
|
| 36 |
+
|
| 37 |
+
<xsl:strip-space elements="m:*"/>
|
| 38 |
+
|
| 39 |
+
<!-- m:math: the root of a formula; the converted content is delimited by a
     pair of dollar signs. -->
<xsl:template match="m:math">
  <xsl:text>$</xsl:text>
  <xsl:apply-templates select="node()"/>
  <xsl:text>$</xsl:text>
</xsl:template>
|
| 44 |
+
|
| 45 |
+
</xsl:stylesheet>
|
ultradata_math_parser/mmltex/scripts.xsl
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version='1.0' encoding="UTF-8"?>
|
| 2 |
+
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
| 3 |
+
xmlns:m="http://www.w3.org/1998/Math/MathML"
|
| 4 |
+
version='1.0'>
|
| 5 |
+
|
| 6 |
+
<!-- ====================================================================== -->
|
| 7 |
+
<!-- $Id: scripts.xsl,v 1.1.1.1 2002/10/26 14:20:06 shade33 Exp $
|
| 8 |
+
This file is part of the XSLT MathML Library distribution.
|
| 9 |
+
See ./README or http://www.raleigh.ru/MathML/mmltex for
|
| 10 |
+
copyright and other information -->
|
| 11 |
+
<!-- ====================================================================== -->
|
| 12 |
+
|
| 13 |
+
<!-- m:munderover
     Children: 1 = base, 2 = underscript, 3 = overscript.
     The space-stripped string values of the three children select the form,
     tested in this order:
       over is an over bar      : \overline{ munder(base, under) }
       over is an over brace    : \overbrace{ munder(base, under) }
       under is an under bar    : \underline{ mover(base, over) }
       under is an under brace  : \underbrace{ mover(base, over) }
       base is a large operator : base_{under}^{over}
       otherwise                : \underset{under}{\overset{over}{base}}
     The named templates "munder" and "mover" run with this element as the
     current node, so mover is told that the overscript is child 3. -->
<xsl:template match="m:munderover">
  <xsl:variable name="baseSym">
    <xsl:call-template name="startspace">
      <xsl:with-param name="symbol" select="*[1]"/>
    </xsl:call-template>
  </xsl:variable>
  <xsl:variable name="underSym">
    <xsl:call-template name="startspace">
      <xsl:with-param name="symbol" select="*[2]"/>
    </xsl:call-template>
  </xsl:variable>
  <xsl:variable name="overSym">
    <xsl:call-template name="startspace">
      <xsl:with-param name="symbol" select="*[3]"/>
    </xsl:call-template>
  </xsl:variable>
  <!-- True when the base is exactly one of:
         ∑ /sum, ∏ /prod, ∐ /coprod, ⋂ /bigcap, ⋃ /bigcup, ⊔ /bigsqcup -->
  <xsl:variable name="largeOperator" select="translate($baseSym,'∏∐⋂⋃⊔','∑∑∑∑∑')='∑'"/>

  <xsl:choose>
    <!-- OverBar -->
    <xsl:when test="$overSym='¯'">
      <xsl:text>\overline{</xsl:text>
      <xsl:call-template name="munder">
        <xsl:with-param name="base" select="$baseSym"/>
        <xsl:with-param name="under" select="$underSym"/>
      </xsl:call-template>
      <xsl:text>}</xsl:text>
    </xsl:when>
    <!-- OverBrace -->
    <xsl:when test="$overSym='︷'">
      <xsl:text>\overbrace{</xsl:text>
      <xsl:call-template name="munder">
        <xsl:with-param name="base" select="$baseSym"/>
        <xsl:with-param name="under" select="$underSym"/>
      </xsl:call-template>
      <xsl:text>}</xsl:text>
    </xsl:when>
    <!-- UnderBar (combining low line) -->
    <xsl:when test="$underSym='̲'">
      <xsl:text>\underline{</xsl:text>
      <xsl:call-template name="mover">
        <xsl:with-param name="base" select="$baseSym"/>
        <xsl:with-param name="over" select="$overSym"/>
        <xsl:with-param name="pos_over" select="3"/>
      </xsl:call-template>
      <xsl:text>}</xsl:text>
    </xsl:when>
    <!-- UnderBrace -->
    <xsl:when test="$underSym='︸'">
      <xsl:text>\underbrace{</xsl:text>
      <xsl:call-template name="mover">
        <xsl:with-param name="base" select="$baseSym"/>
        <xsl:with-param name="over" select="$overSym"/>
        <xsl:with-param name="pos_over" select="3"/>
      </xsl:call-template>
      <xsl:text>}</xsl:text>
    </xsl:when>
    <!-- Large operator: ordinary limits -->
    <xsl:when test="$largeOperator">
      <xsl:apply-templates select="*[1]"/>
      <xsl:text>_{</xsl:text>
      <xsl:apply-templates select="*[2]"/>
      <xsl:text>}^{</xsl:text>
      <xsl:apply-templates select="*[3]"/>
      <xsl:text>}</xsl:text>
    </xsl:when>
    <!-- Generic stacking -->
    <xsl:otherwise>
      <xsl:text>\underset{</xsl:text>
      <xsl:apply-templates select="*[2]"/>
      <xsl:text>}{\overset{</xsl:text>
      <xsl:apply-templates select="*[3]"/>
      <xsl:text>}{</xsl:text>
      <xsl:apply-templates select="*[1]"/>
      <xsl:text>}}</xsl:text>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
| 93 |
+
|
| 94 |
+
<!-- m:mover
     Computes the space-stripped string values of the base (child 1) and the
     overscript (child 2) and hands them to the named template "mover", which
     chooses the LaTeX form. -->
<xsl:template match="m:mover">
  <xsl:variable name="baseSym">
    <xsl:call-template name="startspace">
      <xsl:with-param name="symbol" select="*[1]"/>
    </xsl:call-template>
  </xsl:variable>
  <xsl:variable name="overSym">
    <xsl:call-template name="startspace">
      <xsl:with-param name="symbol" select="*[2]"/>
    </xsl:call-template>
  </xsl:variable>
  <xsl:call-template name="mover">
    <xsl:with-param name="base" select="$baseSym"/>
    <xsl:with-param name="over" select="$overSym"/>
  </xsl:call-template>
</xsl:template>
|
| 108 |
+
|
| 109 |
+
<!-- m:munder
     Computes the space-stripped string values of the base (child 1) and the
     underscript (child 2) and hands them to the named template "munder",
     which chooses the LaTeX form. -->
<xsl:template match="m:munder">
  <xsl:variable name="baseSym">
    <xsl:call-template name="startspace">
      <xsl:with-param name="symbol" select="*[1]"/>
    </xsl:call-template>
  </xsl:variable>
  <xsl:variable name="underSym">
    <xsl:call-template name="startspace">
      <xsl:with-param name="symbol" select="*[2]"/>
    </xsl:call-template>
  </xsl:variable>
  <xsl:call-template name="munder">
    <xsl:with-param name="base" select="$baseSym"/>
    <xsl:with-param name="under" select="$underSym"/>
  </xsl:call-template>
</xsl:template>
|
| 123 |
+
|
| 124 |
+
<!-- mover (named template)
     Renders an overscript construct for the current element.
     Parameters:
       base     : space-stripped string value of the base.
       over     : space-stripped string value of the overscript.
       pos_over : child position of the overscript (default 2; the
                  m:munderover template passes 3).
     Forms, tested in this order:
       over is an over bar      : \overline{base}
       over is an over brace    : \overbrace{base}
       base is a large operator : base^{over}
       otherwise                : \stackrel{over}{base} -->
<xsl:template name="mover">
  <xsl:param name="base"/>
  <xsl:param name="over"/>
  <xsl:param name="pos_over" select="2"/>
  <!-- True when the base is exactly one of:
         ∑ /sum, ∏ /prod, ∐ /coprod, ⋂ /bigcap, ⋃ /bigcup, ⊔ /bigsqcup -->
  <xsl:variable name="largeOperator" select="translate($base,'∏∐⋂⋃⊔','∑∑∑∑∑')='∑'"/>
  <xsl:choose>
    <!-- OverBar -->
    <xsl:when test="$over='¯'">
      <xsl:text>\overline{</xsl:text>
      <xsl:apply-templates select="*[1]"/>
      <xsl:text>}</xsl:text>
    </xsl:when>
    <!-- OverBrace -->
    <xsl:when test="$over='︷'">
      <xsl:text>\overbrace{</xsl:text>
      <xsl:apply-templates select="*[1]"/>
      <xsl:text>}</xsl:text>
    </xsl:when>
    <!-- Large operator: upper limit -->
    <xsl:when test="$largeOperator">
      <xsl:apply-templates select="*[1]"/>
      <xsl:text>^{</xsl:text>
      <xsl:apply-templates select="*[$pos_over]"/>
      <xsl:text>}</xsl:text>
    </xsl:when>
    <!-- Generic stacking; \overset{over}{base} would be the amsmath alternative. -->
    <xsl:otherwise>
      <xsl:text>\stackrel{</xsl:text>
      <xsl:apply-templates select="*[$pos_over]"/>
      <xsl:text>}{</xsl:text>
      <xsl:apply-templates select="*[1]"/>
      <xsl:text>}</xsl:text>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
| 169 |
+
|
| 170 |
+
<!-- munder (named template)
     Renders an underscript construct for the current element, reading the
     base from child 1 and the underscript from child 2.
     Parameters:
       base  : space-stripped string value of the base.
       under : space-stripped string value of the underscript.
     Forms, tested in this order:
       under is an under bar    : \underline{base}
       under is an under brace  : \underbrace{base}
       base is a large operator : base_{under}
       otherwise                : \underset{under}{base} -->
<xsl:template name="munder">
  <xsl:param name="base"/>
  <xsl:param name="under"/>
  <!-- True when the base is exactly one of:
         ∑ /sum, ∏ /prod, ∐ /coprod, ⋂ /bigcap, ⋃ /bigcup, ⊔ /bigsqcup -->
  <xsl:variable name="largeOperator" select="translate($base,'∏∐⋂⋃⊔','∑∑∑∑∑')='∑'"/>
  <xsl:choose>
    <!-- UnderBar (combining low line) -->
    <xsl:when test="$under='̲'">
      <xsl:text>\underline{</xsl:text>
      <xsl:apply-templates select="*[1]"/>
      <xsl:text>}</xsl:text>
    </xsl:when>
    <!-- UnderBrace -->
    <xsl:when test="$under='︸'">
      <xsl:text>\underbrace{</xsl:text>
      <xsl:apply-templates select="*[1]"/>
      <xsl:text>}</xsl:text>
    </xsl:when>
    <!-- Large operator: lower limit -->
    <xsl:when test="$largeOperator">
      <xsl:apply-templates select="*[1]"/>
      <xsl:text>_{</xsl:text>
      <xsl:apply-templates select="*[2]"/>
      <xsl:text>}</xsl:text>
    </xsl:when>
    <!-- Generic stacking; \underset requires the AmsMath package. -->
    <xsl:otherwise>
      <xsl:text>\underset{</xsl:text>
      <xsl:apply-templates select="*[2]"/>
      <xsl:text>}{</xsl:text>
      <xsl:apply-templates select="*[1]"/>
      <xsl:text>}</xsl:text>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
| 208 |
+
|
| 209 |
+
<!-- m:msubsup: {base}_{subscript}^{superscript}, taken from children 1, 2
     and 3. The base is braced so the scripts attach to the whole of it. -->
<xsl:template match="m:msubsup">
  <xsl:text>{</xsl:text>
  <xsl:apply-templates select="*[1]"/>
  <xsl:text>}_{</xsl:text>
  <xsl:apply-templates select="*[2]"/>
  <xsl:text>}^{</xsl:text>
  <xsl:apply-templates select="*[3]"/>
  <xsl:text>}</xsl:text>
</xsl:template>
|
| 218 |
+
|
| 219 |
+
<!-- m:msup: {base}^{superscript}, taken from children 1 and 2. -->
<xsl:template match="m:msup">
  <xsl:text>{</xsl:text>
  <xsl:apply-templates select="*[1]"/>
  <xsl:text>}^{</xsl:text>
  <xsl:apply-templates select="*[2]"/>
  <xsl:text>}</xsl:text>
</xsl:template>
|
| 226 |
+
|
| 227 |
+
<!-- m:msub: {base}_{subscript}, taken from children 1 and 2. -->
<xsl:template match="m:msub">
  <xsl:text>{</xsl:text>
  <xsl:apply-templates select="*[1]"/>
  <xsl:text>}_{</xsl:text>
  <xsl:apply-templates select="*[2]"/>
  <xsl:text>}</xsl:text>
</xsl:template>
|
| 234 |
+
|
| 235 |
+
<!-- m:mmultiscripts, mode "mprescripts"
     Used when the element contains m:mprescripts. Output order:
       1. prescripts (the siblings after m:mprescripts), each attached to an
          empty group: odd positions as {}_{...}, even positions as {}^{...};
       2. the base (child 1);
       3. postscripts (the siblings before m:mprescripts, base excluded):
          odd positions as _{...}, even positions as ^{...}, with an empty
          group inserted before every script from the third position on.
     Elements whose local name is "none" are placeholders and emit nothing. -->
<xsl:template match="m:mmultiscripts" mode="mprescripts">
  <!-- 1. prescripts -->
  <xsl:for-each select="m:mprescripts/following-sibling::*">
    <xsl:if test="local-name()!='none'">
      <xsl:choose>
        <xsl:when test="position() mod 2">
          <xsl:text>{}_{</xsl:text>
        </xsl:when>
        <xsl:otherwise>
          <xsl:text>{}^{</xsl:text>
        </xsl:otherwise>
      </xsl:choose>
      <xsl:apply-templates select="."/>
      <xsl:text>}</xsl:text>
    </xsl:if>
  </xsl:for-each>

  <!-- 2. base -->
  <xsl:apply-templates select="*[1]"/>

  <!-- 3. postscripts; the predicate drops the base, which is the farthest
       preceding sibling of m:mprescripts. -->
  <xsl:for-each select="m:mprescripts/preceding-sibling::*[position()!=last()]">
    <xsl:if test="local-name()!='none'">
      <xsl:if test="position()>2">
        <xsl:text>{}</xsl:text>
      </xsl:if>
      <xsl:choose>
        <xsl:when test="position() mod 2">
          <xsl:text>_{</xsl:text>
        </xsl:when>
        <xsl:otherwise>
          <xsl:text>^{</xsl:text>
        </xsl:otherwise>
      </xsl:choose>
      <xsl:apply-templates select="."/>
      <xsl:text>}</xsl:text>
    </xsl:if>
  </xsl:for-each>
</xsl:template>
|
| 265 |
+
|
| 266 |
+
<!-- m:mmultiscripts
     With an m:mprescripts child the work is delegated to the "mprescripts"
     mode template. Otherwise the base (child 1) is emitted, followed by the
     remaining children: odd positions as _{...}, even positions as ^{...},
     with an empty group inserted before every script from the third position
     on. Elements whose local name is "none" emit nothing. -->
<xsl:template match="m:mmultiscripts">
  <xsl:choose>
    <xsl:when test="m:mprescripts">
      <xsl:apply-templates select="." mode="mprescripts"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:apply-templates select="*[1]"/>
      <xsl:for-each select="*[position()>1]">
        <xsl:if test="local-name()!='none'">
          <xsl:if test="position()>2">
            <xsl:text>{}</xsl:text>
          </xsl:if>
          <xsl:choose>
            <xsl:when test="position() mod 2">
              <xsl:text>_{</xsl:text>
            </xsl:when>
            <xsl:otherwise>
              <xsl:text>^{</xsl:text>
            </xsl:otherwise>
          </xsl:choose>
          <xsl:apply-templates select="."/>
          <xsl:text>}</xsl:text>
        </xsl:if>
      </xsl:for-each>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
| 291 |
+
|
| 292 |
+
</xsl:stylesheet>
|
ultradata_math_parser/mmltex/tables.xsl
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version='1.0' encoding="UTF-8"?>
|
| 2 |
+
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
| 3 |
+
xmlns:m="http://www.w3.org/1998/Math/MathML"
|
| 4 |
+
version='1.0'>
|
| 5 |
+
|
| 6 |
+
<!-- ====================================================================== -->
|
| 7 |
+
<!-- $id: tables.xsl, 2002/17/05 Exp $
|
| 8 |
+
This file is part of the XSLT MathML Library distribution.
|
| 9 |
+
See ./README or http://www.raleigh.ru/MathML/mmltex for
|
| 10 |
+
copyright and other information -->
|
| 11 |
+
<!-- ====================================================================== -->
|
| 12 |
+
|
| 13 |
+
<!-- m:mtd with @columnspan
     Emits \multicolumn{SPAN}{c}{content}, followed by the column separator
     "& " when another cell follows in the same row. -->
<xsl:template match="m:mtd[@columnspan]">
  <xsl:text>\multicolumn{</xsl:text>
  <xsl:value-of select="@columnspan"/>
  <xsl:text>}{c}{</xsl:text>
  <xsl:apply-templates/>
  <xsl:text>}</xsl:text>
  <xsl:if test="count(following-sibling::*)>0">
    <!-- The ampersand must be written as an entity reference; a bare
         ampersand makes the stylesheet ill-formed XML. -->
    <xsl:text>&amp; </xsl:text>
  </xsl:if>
</xsl:template>
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
<!-- m:mtd
     Emits the cell content, padded with \hfill according to @columnalign
     ('right' pads before, 'left' pads after, 'center' pads on both sides),
     followed by the column separator "& " when another cell follows in the
     same row. -->
<xsl:template match="m:mtd">
  <xsl:if test="@columnalign='right' or @columnalign='center'">
    <xsl:text>\hfill </xsl:text>
  </xsl:if>
  <xsl:apply-templates/>
  <xsl:if test="@columnalign='left' or @columnalign='center'">
    <xsl:text>\hfill </xsl:text>
  </xsl:if>
  <xsl:if test="count(following-sibling::*)>0">
    <!-- This test is valid for Sablotron; another form is
         test="not(position()=last())". The same applies to
         m:mtd[@columnspan] and m:mtr.
         The ampersand must be written as an entity reference; a bare
         ampersand makes the stylesheet ill-formed XML. -->
    <xsl:text>&amp; </xsl:text>
  </xsl:if>
</xsl:template>
|
| 39 |
+
|
| 40 |
+
<!-- Table row: renders the cells, then emits the LaTeX row terminator
     unless this is the last element among its siblings. -->
<xsl:template match="m:mtr">
  <xsl:apply-templates select="node()"/>
  <!-- A non-empty node-set is true, so this is the same check as counting
       the following siblings. -->
  <xsl:if test="following-sibling::*">
    <xsl:text>\\ </xsl:text>
  </xsl:if>
</xsl:template>
|
| 46 |
+
|
| 47 |
+
<!-- Table: rendered as a LaTeX array environment.
     The number of columns is taken from the first row (cells without
     @columnspan count as one, the others contribute their span).
     The column specification is built from the first letter of each word of
     @columnalign; it is truncated when it is longer than the column count and
     padded by repeating its last letter when it is shorter. Without
     @columnalign every column is centred ('c').
     frame='solid' adds vertical bars around the specification and \hline
     before and after the rows. -->
<xsl:template match="m:mtable">
  <xsl:text>\begin{array}{</xsl:text>
  <xsl:if test="@frame='solid'">
    <xsl:text>|</xsl:text>
  </xsl:if>
  <xsl:variable name="numbercols" select="count(./m:mtr[1]/m:mtd[not(@columnspan)])+sum(./m:mtr[1]/m:mtd/@columnspan)"/>
  <xsl:choose>
    <xsl:when test="@columnalign">
      <xsl:variable name="colalign">
        <xsl:call-template name="colalign">
          <xsl:with-param name="colalign" select="@columnalign"/>
        </xsl:call-template>
      </xsl:variable>
      <xsl:choose>
        <xsl:when test="string-length($colalign) > $numbercols">
          <xsl:value-of select="substring($colalign,1,$numbercols)"/>
        </xsl:when>
        <!-- A literal less-than sign is not allowed inside an attribute
             value, so the comparison uses the entity reference. -->
        <xsl:when test="string-length($colalign) &lt; $numbercols">
          <xsl:value-of select="$colalign"/>
          <xsl:call-template name="generate-string">
            <xsl:with-param name="text" select="substring($colalign,string-length($colalign))"/>
            <xsl:with-param name="count" select="$numbercols - string-length($colalign)"/>
          </xsl:call-template>
        </xsl:when>
        <xsl:otherwise>
          <xsl:value-of select="$colalign"/>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:when>
    <xsl:otherwise>
      <xsl:call-template name="generate-string">
        <xsl:with-param name="text" select="'c'"/>
        <xsl:with-param name="count" select="$numbercols"/>
      </xsl:call-template>
    </xsl:otherwise>
  </xsl:choose>
  <xsl:if test="@frame='solid'">
    <xsl:text>|</xsl:text>
  </xsl:if>
  <xsl:text>}</xsl:text>
  <xsl:if test="@frame='solid'">
    <xsl:text>\hline </xsl:text>
  </xsl:if>
  <xsl:apply-templates/>
  <xsl:if test="@frame='solid'">
    <xsl:text>\\ \hline</xsl:text>
  </xsl:if>
  <xsl:text>\end{array}</xsl:text>
</xsl:template>
|
| 96 |
+
|
| 97 |
+
<!-- Reduces a space-separated list of words to the string of their first
     characters (one character per word), processing the list recursively.
     param colalign: the remaining part of the list. -->
<xsl:template name="colalign">
  <xsl:param name="colalign"/>
  <!-- The first character of the current word is emitted in every case. -->
  <xsl:value-of select="substring($colalign,1,1)"/>
  <!-- Recurse only while another word follows. -->
  <xsl:if test="contains($colalign,' ')">
    <xsl:call-template name="colalign">
      <xsl:with-param name="colalign" select="substring-after($colalign,' ')"/>
    </xsl:call-template>
  </xsl:if>
</xsl:template>
|
| 111 |
+
|
| 112 |
+
<!-- Outputs $text repeated $count times (recursively, one copy per call).
     Produces nothing when $text is empty or $count is zero or negative.
     param text:  the string to repeat.
     param count: how many copies to emit. -->
<xsl:template name="generate-string">
  <!-- template from XSLT Standard Library v1.1 -->
  <xsl:param name="text"/>
  <xsl:param name="count"/>

  <xsl:choose>
    <!-- The comparison operator is spelled with an entity reference because a
         literal less-than sign is not allowed inside an attribute value. -->
    <xsl:when test="string-length($text) = 0 or $count &lt;= 0"/>

    <xsl:otherwise>
      <xsl:value-of select="$text"/>
      <xsl:call-template name="generate-string">
        <xsl:with-param name="text" select="$text"/>
        <xsl:with-param name="count" select="$count - 1"/>
      </xsl:call-template>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
| 129 |
+
|
| 130 |
+
</xsl:stylesheet>
|
ultradata_math_parser/mmltex/tokens.xsl
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version='1.0' encoding="UTF-8"?>
|
| 2 |
+
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
| 3 |
+
xmlns:m="http://www.w3.org/1998/Math/MathML"
|
| 4 |
+
version='1.0'>
|
| 5 |
+
|
| 6 |
+
<!-- ====================================================================== -->
|
| 7 |
+
<!-- $id: tokens.xsl, 2002/22/11 Exp $
|
| 8 |
+
This file is part of the XSLT MathML Library distribution.
|
| 9 |
+
See ./README or http://www.raleigh.ru/MathML/mmltex for
|
| 10 |
+
copyright and other information -->
|
| 11 |
+
<!-- ====================================================================== -->
|
| 12 |
+
|
| 13 |
+
<!-- Entry point for the token elements mi, mn, mo, mtext and ms: all of them
     are routed through CommonTokenAtr, which emits the wrappers for the
     shared attributes and then calls the element-specific named template. -->
<xsl:template match="m:mi|m:mn|m:mo|m:mtext|m:ms">
|
| 14 |
+
<xsl:call-template name="CommonTokenAtr"/>
|
| 15 |
+
</xsl:template>
|
| 16 |
+
|
| 17 |
+
<!-- Identifier: when its normalised text is longer than one character and no
     mathvariant is given, the content is wrapped in \mathrm{...}; otherwise
     the content is emitted as is. -->
<xsl:template name="mi">
  <xsl:variable name="upright" select="string-length(normalize-space(.))>1 and not(@mathvariant)"/>
  <xsl:if test="$upright">
    <xsl:text>\mathrm{</xsl:text>
  </xsl:if>
  <xsl:apply-templates/>
  <xsl:if test="$upright">
    <xsl:text>}</xsl:text>
  </xsl:if>
</xsl:template>
|
| 29 |
+
|
| 30 |
+
<!-- Number: no decoration of its own, the child nodes are simply processed. -->
<xsl:template name="mn">
  <xsl:apply-templates select="node()"/>
</xsl:template>
|
| 33 |
+
|
| 34 |
+
<!-- Operator: no decoration of its own, the child nodes are simply processed. -->
<xsl:template name="mo">
  <xsl:apply-templates select="node()"/>
</xsl:template>
|
| 37 |
+
|
| 38 |
+
<!-- Text: passes the current node to replaceMtextEntities (defined outside
     this stylesheet), captures the result in $content and emits its string
     value wrapped in \text{...}. -->
<xsl:template name="mtext">
|
| 39 |
+
<xsl:variable name="content">
|
| 40 |
+
<xsl:call-template name="replaceMtextEntities">
|
| 41 |
+
<xsl:with-param name="content" select="."/>
|
| 42 |
+
</xsl:call-template>
|
| 43 |
+
</xsl:variable>
|
| 44 |
+
<xsl:text>\text{</xsl:text>
|
| 45 |
+
<xsl:value-of select="$content"/>
|
| 46 |
+
<xsl:text>}</xsl:text>
|
| 47 |
+
</xsl:template>
|
| 48 |
+
|
| 49 |
+
<!-- Space: rendered as an invisible rule,
     \phantom{\rule[-depth]{width}{height}}.
     The optional [-depth] argument appears only when @depth is present;
     a missing @width or @height is written as 0ex. -->
<xsl:template match="m:mspace">
  <xsl:text>\phantom{\rule</xsl:text>
  <xsl:if test="@depth">
    <xsl:text>[-</xsl:text>
    <xsl:value-of select="@depth"/>
    <xsl:text>]</xsl:text>
  </xsl:if>
  <xsl:text>{</xsl:text>
  <xsl:choose>
    <xsl:when test="@width">
      <xsl:value-of select="@width"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:text>0ex</xsl:text>
    </xsl:otherwise>
  </xsl:choose>
  <xsl:text>}{</xsl:text>
  <xsl:choose>
    <xsl:when test="@height">
      <xsl:value-of select="@height"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:text>0ex</xsl:text>
    </xsl:otherwise>
  </xsl:choose>
  <xsl:text>}}</xsl:text>
</xsl:template>
|
| 68 |
+
|
| 69 |
+
<!-- String literal: the content is surrounded by @lquote and @rquote; each
     side falls back to a plain double quote when its attribute is absent. -->
<xsl:template name="ms">
  <!-- opening quote -->
  <xsl:if test="@lquote"><xsl:value-of select="@lquote"/></xsl:if>
  <xsl:if test="not(@lquote)"><xsl:text>"</xsl:text></xsl:if>
  <xsl:apply-templates/>
  <!-- closing quote -->
  <xsl:if test="@rquote"><xsl:value-of select="@rquote"/></xsl:if>
  <xsl:if test="not(@rquote)"><xsl:text>"</xsl:text></xsl:if>
</xsl:template>
|
| 78 |
+
|
| 79 |
+
<!-- Handles the attributes shared by all token elements and then delegates
     to the element-specific template through selectTemplate.
     Wrappers are opened from the outside in and closed in reverse order:
       @mathbackground    -> \colorbox[rgb]{r,g,b}{$ ... $}
       @color/@mathcolor  -> \textcolor[rgb]{r,g,b}{ ... }
       @mathvariant       -> a font command followed by { ... }
     For mathvariant values without a font command below (bold-fraktur and
     any unlisted value) only a plain brace group is produced. -->
<xsl:template name="CommonTokenAtr">
  <xsl:if test="@mathbackground">
    <xsl:text>\colorbox[rgb]{</xsl:text>
    <xsl:call-template name="color">
      <xsl:with-param name="color" select="@mathbackground"/>
    </xsl:call-template>
    <xsl:text>}{$</xsl:text>
  </xsl:if>
  <xsl:if test="@color or @mathcolor"> <!-- Note: @color is deprecated in MathML 2.0 -->
    <xsl:text>\textcolor[rgb]{</xsl:text>
    <xsl:call-template name="color">
      <xsl:with-param name="color" select="@color|@mathcolor"/>
    </xsl:call-template>
    <xsl:text>}{</xsl:text>
  </xsl:if>
  <xsl:if test="@mathvariant">
    <!-- Emit the font command name only; the opening brace common to every
         case follows the choose. -->
    <xsl:choose>
      <xsl:when test="@mathvariant='normal'"><xsl:text>\mathrm</xsl:text></xsl:when>
      <xsl:when test="@mathvariant='bold'"><xsl:text>\mathbf</xsl:text></xsl:when>
      <xsl:when test="@mathvariant='italic'"><xsl:text>\mathit</xsl:text></xsl:when>
      <!-- Required definition -->
      <xsl:when test="@mathvariant='bold-italic'"><xsl:text>\mathbit</xsl:text></xsl:when>
      <!-- Required amsfonts -->
      <xsl:when test="@mathvariant='double-struck'"><xsl:text>\mathbb</xsl:text></xsl:when>
      <xsl:when test="@mathvariant='script'"><xsl:text>\mathcal</xsl:text></xsl:when>
      <!-- Error -->
      <xsl:when test="@mathvariant='bold-script'"><xsl:text>\mathsc</xsl:text></xsl:when>
      <!-- Required amsfonts -->
      <xsl:when test="@mathvariant='fraktur'"><xsl:text>\mathfrak</xsl:text></xsl:when>
      <xsl:when test="@mathvariant='sans-serif'"><xsl:text>\mathsf</xsl:text></xsl:when>
      <!-- Required definition -->
      <xsl:when test="@mathvariant='bold-sans-serif'"><xsl:text>\mathbsf</xsl:text></xsl:when>
      <!-- Required definition -->
      <xsl:when test="@mathvariant='sans-serif-italic'"><xsl:text>\mathsfit</xsl:text></xsl:when>
      <!-- Error -->
      <xsl:when test="@mathvariant='sans-serif-bold-italic'"><xsl:text>\mathbsfit</xsl:text></xsl:when>
      <xsl:when test="@mathvariant='monospace'"><xsl:text>\mathtt</xsl:text></xsl:when>
    </xsl:choose>
    <xsl:text>{</xsl:text>
  </xsl:if>
  <xsl:call-template name="selectTemplate"/>
  <xsl:if test="@mathvariant">
    <xsl:text>}</xsl:text>
  </xsl:if>
  <xsl:if test="@color or @mathcolor">
    <xsl:text>}</xsl:text>
  </xsl:if>
  <xsl:if test="@mathbackground">
    <xsl:text>$}</xsl:text>
  </xsl:if>
</xsl:template>
|
| 154 |
+
|
| 155 |
+
<!-- Dispatches to the named template that matches the local name of the
     current element (mi, mn, mo, mtext or ms); any other element produces
     nothing. The dispatch is spelled out case by case because the name
     attribute of xsl:call-template cannot be computed at run time. -->
<xsl:template name="selectTemplate">
  <xsl:variable name="kind" select="local-name(.)"/>
  <xsl:choose>
    <xsl:when test="$kind='mi'">
      <xsl:call-template name="mi"/>
    </xsl:when>
    <xsl:when test="$kind='mn'">
      <xsl:call-template name="mn"/>
    </xsl:when>
    <xsl:when test="$kind='mo'">
      <xsl:call-template name="mo"/>
    </xsl:when>
    <xsl:when test="$kind='mtext'">
      <xsl:call-template name="mtext"/>
    </xsl:when>
    <xsl:when test="$kind='ms'">
      <xsl:call-template name="ms"/>
    </xsl:when>
  </xsl:choose>
</xsl:template>
|
| 176 |
+
|
| 177 |
+
<!-- Converts a colour specification into the comma-separated "r,g,b" triple
     written into the [rgb] colour commands.
     Accepted forms (case-insensitive):
       #rgb     each hex digit is divided by 15
       #rrggbb  each pair of hex digits is divided by 255
       one of the sixteen HTML colour names listed below
     Anything else only raises an xsl:message and produces no output.
     param color: the colour specification. -->
<xsl:template name="color">
  <!-- NB: the intermediate variables exist only for Sablotron -->
  <xsl:param name="color"/>
  <xsl:variable name="lower" select="translate($color,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')"/>
  <xsl:choose>
    <xsl:when test="starts-with($lower,'#') and string-length($lower)=4">
      <xsl:variable name="red">
        <xsl:call-template name="Hex2Decimal">
          <xsl:with-param name="arg" select="substring($lower,2,1)"/>
        </xsl:call-template>
      </xsl:variable>
      <xsl:variable name="green">
        <xsl:call-template name="Hex2Decimal">
          <xsl:with-param name="arg" select="substring($lower,3,1)"/>
        </xsl:call-template>
      </xsl:variable>
      <xsl:variable name="blue">
        <xsl:call-template name="Hex2Decimal">
          <xsl:with-param name="arg" select="substring($lower,4,1)"/>
        </xsl:call-template>
      </xsl:variable>
      <xsl:value-of select="$red div 15"/><xsl:text>,</xsl:text>
      <xsl:value-of select="$green div 15"/><xsl:text>,</xsl:text>
      <xsl:value-of select="$blue div 15"/>
    </xsl:when>
    <xsl:when test="starts-with($lower,'#') and string-length($lower)=7">
      <xsl:variable name="redHigh">
        <xsl:call-template name="Hex2Decimal">
          <xsl:with-param name="arg" select="substring($lower,2,1)"/>
        </xsl:call-template>
      </xsl:variable>
      <xsl:variable name="redLow">
        <xsl:call-template name="Hex2Decimal">
          <xsl:with-param name="arg" select="substring($lower,3,1)"/>
        </xsl:call-template>
      </xsl:variable>
      <xsl:variable name="greenHigh">
        <xsl:call-template name="Hex2Decimal">
          <xsl:with-param name="arg" select="substring($lower,4,1)"/>
        </xsl:call-template>
      </xsl:variable>
      <xsl:variable name="greenLow">
        <xsl:call-template name="Hex2Decimal">
          <xsl:with-param name="arg" select="substring($lower,5,1)"/>
        </xsl:call-template>
      </xsl:variable>
      <xsl:variable name="blueHigh">
        <xsl:call-template name="Hex2Decimal">
          <xsl:with-param name="arg" select="substring($lower,6,1)"/>
        </xsl:call-template>
      </xsl:variable>
      <xsl:variable name="blueLow">
        <xsl:call-template name="Hex2Decimal">
          <xsl:with-param name="arg" select="substring($lower,7,1)"/>
        </xsl:call-template>
      </xsl:variable>
      <xsl:value-of select="($redHigh*16 + $redLow) div 255"/><xsl:text>,</xsl:text>
      <xsl:value-of select="($greenHigh*16 + $greenLow) div 255"/><xsl:text>,</xsl:text>
      <xsl:value-of select="($blueHigh*16 + $blueLow) div 255"/>
    </xsl:when>
    <!-- ======================= colour given as an HTML colour name ======================= -->
    <xsl:when test="$lower='aqua'"><xsl:text>0,1,1</xsl:text></xsl:when>
    <xsl:when test="$lower='black'"><xsl:text>0,0,0</xsl:text></xsl:when>
    <xsl:when test="$lower='blue'"><xsl:text>0,0,1</xsl:text></xsl:when>
    <xsl:when test="$lower='fuchsia'"><xsl:text>1,0,1</xsl:text></xsl:when>
    <xsl:when test="$lower='gray'"><xsl:text>.5,.5,.5</xsl:text></xsl:when>
    <xsl:when test="$lower='green'"><xsl:text>0,.5,0</xsl:text></xsl:when>
    <xsl:when test="$lower='lime'"><xsl:text>0,1,0</xsl:text></xsl:when>
    <xsl:when test="$lower='maroon'"><xsl:text>.5,0,0</xsl:text></xsl:when>
    <xsl:when test="$lower='navy'"><xsl:text>0,0,.5</xsl:text></xsl:when>
    <xsl:when test="$lower='olive'"><xsl:text>.5,.5,0</xsl:text></xsl:when>
    <xsl:when test="$lower='purple'"><xsl:text>.5,0,.5</xsl:text></xsl:when>
    <xsl:when test="$lower='red'"><xsl:text>1,0,0</xsl:text></xsl:when>
    <xsl:when test="$lower='silver'"><xsl:text>.75,.75,.75</xsl:text></xsl:when>
    <xsl:when test="$lower='teal'"><xsl:text>0,.5,.5</xsl:text></xsl:when>
    <xsl:when test="$lower='white'"><xsl:text>1,1,1</xsl:text></xsl:when>
    <xsl:when test="$lower='yellow'"><xsl:text>1,1,0</xsl:text></xsl:when>
    <xsl:otherwise>
      <xsl:message>Exception at color template</xsl:message>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
| 259 |
+
|
| 260 |
+
<!-- Converts one lower-case hexadecimal digit to its decimal value.
     'a'..'f' give 10..15, '0'..'9' are returned unchanged; any other
     argument only raises an xsl:message and produces no output.
     param arg: the single character to convert. -->
<xsl:template name="Hex2Decimal">
  <xsl:param name="arg"/>
  <xsl:choose>
    <!-- Letter digit: its offset inside 'abcdef' plus ten. -->
    <xsl:when test="string-length($arg)=1 and contains('abcdef',$arg)">
      <xsl:value-of select="string-length(substring-before('abcdef',$arg)) + 10"/>
    </xsl:when>
    <!-- Decimal digit: mapping every digit to '9' leaves exactly '9' only
         for a single-digit argument. -->
    <xsl:when test="translate($arg, '0123456789', '9999999999')='9'">
      <xsl:value-of select="$arg"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:message>Exception at Hex2Decimal template</xsl:message>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
| 289 |
+
|
| 290 |
+
<!-- Text directly inside any MathML element: whitespace-normalised and then
     handed to replaceEntities (defined outside this stylesheet). -->
<xsl:template match="m:*/text()">
  <xsl:call-template name="replaceEntities">
    <xsl:with-param name="content" select="normalize-space(.)"/>
  </xsl:call-template>
</xsl:template>
|
| 295 |
+
|
| 296 |
+
</xsl:stylesheet>
|
ultradata_math_parser/parsers/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding:utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from ultradata_math_parser.parsers.article_parser import ArticleParser
|
| 4 |
+
from ultradata_math_parser.parsers.forum_parser import ForumParser
|
| 5 |
+
from ultradata_math_parser.parsers.custom_parser import CustomParser
|
| 6 |
+
from ultradata_math_parser.parsers.unified_parser import UnifiedParser
|
| 7 |
+
from ultradata_math_parser.parsers.title_parser import TitleParser
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"ArticleParser",
|
| 11 |
+
"ForumParser",
|
| 12 |
+
"CustomParser",
|
| 13 |
+
"UnifiedParser",
|
| 14 |
+
"TitleParser",
|
| 15 |
+
]
|
ultradata_math_parser/parsers/article_parser.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding:utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from copy import deepcopy
|
| 4 |
+
|
| 5 |
+
from ultradata_math_parser.utils import *
|
| 6 |
+
from ultradata_math_parser.parsers.base_parser import BaseParser
|
| 7 |
+
from ultradata_math_parser.parsers.title_parser import TitleParser
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class ArticleParser(BaseParser):
    """Parser that extracts the main content and the title of an article page."""

    def __init__(self) -> None:
        super().__init__()

    def extract(self, html="", **kwargs) -> dict:
        """Extract the main content of *html*.

        Args:
            html: Raw HTML source of the page.
            **kwargs: Optional settings. ``base_url`` is the page URL (default
                ``""``); ``process_math``, ``preserve_math_containers``,
                ``include_tables`` and ``include_images`` replace the
                attributes of the same name on this instance, so a value
                passed here stays in effect for later calls.

        Returns:
            A dict with the keys ``xp_num``, ``drop_list``, ``html``,
            ``title``, ``base_url``, ``fallback_strategy`` and
            ``text_length``.

        Raises:
            ValueError: If ``load_html`` returns ``None`` for *html*.
        """
        base_url = kwargs.get("base_url", "")
        self.process_math = kwargs.get("process_math", self.process_math)
        self.preserve_math_containers = kwargs.get(
            "preserve_math_containers", self.preserve_math_containers
        )
        self.include_tables = kwargs.get("include_tables", self.include_tables)
        self.include_images = kwargs.get("include_images", self.include_images)

        # Turn non-breaking-space entities into ordinary spaces before parsing.
        html = html.replace("&nbsp;", " ").replace("&#160;", " ")
        tree = load_html(html)
        if tree is None:
            raise ValueError("load_html() returned None for the supplied HTML")

        title = TitleParser().process(tree)

        # A <base href> that contains "http" takes precedence over the
        # base_url passed by the caller.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]

        if "://blog.csdn.net/" in base_url:
            # Site-specific cleanup: drop the "pre-numbering" lists inside the
            # content container.
            for dtree in tree.xpath('//div[@id="content_views"]//ul[@class="pre-numbering"]'):
                self.remove_node(dtree)

        # raw_tree is kept untouched and handed to the fallbacks; the
        # conversion steps below work on a separate copy.
        raw_tree = deepcopy(tree)
        working_tree = deepcopy(tree)

        # Tag conversion, including the handling of math tags.
        format_tree = self.convert_tags(working_tree, base_url=base_url)
        format_tree = self._remove_tables_from_tree(format_tree)
        format_tree = self._remove_images_from_tree(format_tree)

        # Remove script, style and similar tags together with their content.
        normal_tree = self.clean_tags(format_tree)
        normal_tree = self._remove_tables_from_tree(normal_tree)
        normal_tree = self._remove_images_from_tree(normal_tree)
        # Copied before the extraction below so the fallbacks receive the
        # cleaned tree as it was at this point.
        fallback_tree = deepcopy(normal_tree)

        subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
        if xp_num == "others":
            subtree, drop_list = self.prune_unwanted_sections(normal_tree)
        body_html = self.get_content_html(subtree, xp_num, base_url)

        body_html, fallback_strategy = self.apply_fallbacks(
            primary_html=body_html,
            base_url=base_url,
            normal_tree=fallback_tree,
            raw_tree=raw_tree,
        )

        body_html = self._strip_tables_from_html(body_html)
        body_html = self._strip_images_from_html(body_html)

        text_length = self._text_length_from_html(body_html)

        return {
            "xp_num": xp_num,
            "drop_list": drop_list,
            "html": body_html,
            "title": title,
            "base_url": base_url,
            "fallback_strategy": fallback_strategy,
            "text_length": text_length,
        }
|
ultradata_math_parser/parsers/base_parser.py
ADDED
|
@@ -0,0 +1,1059 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding:utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import html
|
| 4 |
+
import logging
|
| 5 |
+
from collections import defaultdict
|
| 6 |
+
from copy import deepcopy
|
| 7 |
+
from urllib.parse import unquote, urljoin
|
| 8 |
+
from lxml.etree import Comment, strip_elements
|
| 9 |
+
from ultradata_math_parser.config import *
|
| 10 |
+
from ultradata_math_parser.readability_plus import Document as DocumentPlus
|
| 11 |
+
from ultradata_math_parser.utils import *
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class BaseParser:
|
| 15 |
+
def __init__(self):
|
| 16 |
+
self.drop_ids = []
|
| 17 |
+
self.need_comment = False
|
| 18 |
+
self.process_math = True
|
| 19 |
+
self.preserve_math_containers = True
|
| 20 |
+
self.include_tables = True
|
| 21 |
+
self.include_images = False
|
| 22 |
+
self.fallback_min_length = 250
|
| 23 |
+
self.enable_wild_text_fallback = True
|
| 24 |
+
self.enable_readability_fallback = True
|
| 25 |
+
self._logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
def xp_1_5(self, tree: HtmlElement):
    """Locate the content node with the first usable ``BODY_XPATH`` expression.

    The expressions are tried in order. The first match of an expression is
    pruned with ``prune_unwanted_sections`` and accepted when it still has
    children and carries enough text outside of links.

    Returns:
        A tuple ``(result_body, xp_num, drop_list)``. ``result_body`` is a new
        ``body`` element containing the accepted node (empty when none was
        accepted), ``xp_num`` is the 1-based position of the expression as a
        string or ``"others"``, and ``drop_list`` is the flag returned by the
        last pruning call.
    """
    drop_list = False
    xp_num = "others"
    result_body = Element("body")

    for position, expression in enumerate(BODY_XPATH, start=1):
        matches = tree.xpath(expression)
        if not matches:
            continue
        candidate = matches[0]
        xp_num = str(position)

        candidate, drop_list = self.prune_unwanted_sections(candidate)
        if len(candidate) == 0:
            xp_num = "others"
            continue

        candidate_text_len = text_len(
            "".join(candidate.xpath(".//text()[not(ancestor::a)]"))
        )
        # Measured after pruning, because pruning edits the tree in place.
        page_text_len = text_len(
            "".join(tree.xpath("//p//text()[not(ancestor::a)]"))
        )

        # A candidate from which a list was dropped needs more text to pass.
        minimum = 50 if drop_list else 20
        if candidate_text_len <= minimum:
            # Rejected; the number is reset only when the page as a whole has
            # a substantial amount of paragraph text.
            if page_text_len > 100:
                xp_num = "others"
            continue

        result_body.append(candidate)
        break

    return result_body, xp_num, drop_list
|
| 64 |
+
|
| 65 |
+
def get_content_html(self, cleaned_tree_backup, xp_num="others", base_url=""):
    """Return the partial-HTML summary readability_plus produces for the tree."""
    document = DocumentPlus(
        cleaned_tree_backup,
        url=base_url,
        xp_num=xp_num,
        need_comment=self.need_comment,
    )
    return document.summary(html_partial=True)
|
| 76 |
+
|
| 77 |
+
def _text_length_from_html(self, html_fragment):
|
| 78 |
+
if not html_fragment:
|
| 79 |
+
return 0
|
| 80 |
+
# 使用 lxml.html.fromstring 解析后提取 text_content
|
| 81 |
+
# 不再依赖 w3m
|
| 82 |
+
try:
|
| 83 |
+
tree = fromstring(html_fragment)
|
| 84 |
+
text = tree.text_content()
|
| 85 |
+
return len(text or "")
|
| 86 |
+
except Exception:
|
| 87 |
+
return 0
|
| 88 |
+
|
| 89 |
+
def _is_content_sufficient(self, html_fragment):
|
| 90 |
+
return self._text_length_from_html(html_fragment) >= self.fallback_min_length
|
| 91 |
+
|
| 92 |
+
def _remove_tables_from_tree(self, tree: HtmlElement) -> HtmlElement:
|
| 93 |
+
if self.include_tables:
|
| 94 |
+
return tree
|
| 95 |
+
for table in list(tree.xpath(".//table")):
|
| 96 |
+
parent = table.getparent()
|
| 97 |
+
if parent is not None:
|
| 98 |
+
parent.remove(table)
|
| 99 |
+
return tree
|
| 100 |
+
|
| 101 |
+
def _strip_tables_from_html(self, html_fragment: str) -> str:
|
| 102 |
+
if self.include_tables or not html_fragment:
|
| 103 |
+
return html_fragment
|
| 104 |
+
try:
|
| 105 |
+
wrapper = fromstring(f"<div>{html_fragment}</div>")
|
| 106 |
+
except Exception:
|
| 107 |
+
return html_fragment
|
| 108 |
+
self._remove_tables_from_tree(wrapper)
|
| 109 |
+
return "".join(tostring(child, encoding=str) for child in wrapper)
|
| 110 |
+
|
| 111 |
+
def _remove_images_from_tree(self, tree: HtmlElement) -> HtmlElement:
    """Remove images from ``tree``, keeping LaTeX found in ``alt`` text.

    Every ``<img>``, ``<picture>``, ``<source>`` and ``<map>`` below
    ``tree`` is removed in place. When an ``<img>`` has an ``alt`` text
    that looks like LaTeX, a ``<span>`` holding the formula is inserted in
    front of the image first, so the formula survives the removal.

    Returns:
        The same ``tree`` object.
    """

    def _detach(element):
        # lxml's remove() throws away element.tail (the text following the
        # element) together with the element; move that text to the
        # previous sibling or the parent so it is not lost.
        holder = element.getparent()
        if holder is None:
            return
        if element.tail:
            before = element.getprevious()
            if before is not None:
                before.tail = (before.tail or "") + element.tail
            else:
                holder.text = (holder.text or "") + element.tail
        holder.remove(element)

    for node in tree.xpath(".//img|.//picture|.//source"):
        # Before dropping an <img>, check whether its alt text is a formula.
        if node.tag == "img":
            alt = node.get("alt", "")
            src = node.get("src", "")

            if alt:
                # Decode codecogs-style pseudo-entities and URL escapes.
                # NOTE(review): the backslash entity in the original literal
                # was lost to HTML decoding; both common spellings are
                # handled here - confirm against the upstream file.
                alt_decoded = unquote(
                    alt.replace("&space;", " ")
                    .replace("&bsol;", "\\")
                    .replace("&#92;", "\\")
                )
                stripped = alt_decoded.strip()

                is_latex = bool(
                    # 1. starts with "$"
                    (stripped.startswith("$") and len(stripped) > 2)
                    # 2. starts with \[ or ends with \] (display math)
                    or stripped.startswith("\\[")
                    or stripped.endswith("\\]")
                    # 3. contains a LaTeX command (\frac, \sum, \alpha, ...)
                    or re.search(r"\\[a-zA-Z]+", alt_decoded)
                    # 4. contains a superscript or subscript
                    or re.search(r"\^|_\{|_[a-zA-Z0-9]", alt_decoded)
                    # 5. src hints at a formula renderer (supporting signal)
                    or (
                        any(
                            kw in src.lower()
                            for kw in ["latex", "codecogs", "math", "tex", "equation"]
                        )
                        and len(stripped) > 1
                    )
                )

                if is_latex and node.getparent() is not None:
                    new_span = Element("span")
                    # Keep existing delimiters, otherwise wrap the formula.
                    if stripped.startswith("$") or stripped.startswith("\\["):
                        new_span.text = alt_decoded
                    else:
                        new_span.text = wrap_math(alt_decoded)
                    node.addprevious(new_span)

        _detach(node)

    for html_map in tree.xpath(".//map"):
        _detach(html_map)
    return tree
|
| 165 |
+
|
| 166 |
+
def _strip_images_from_html(self, html_fragment: str) -> str:
|
| 167 |
+
if not html_fragment:
|
| 168 |
+
return html_fragment
|
| 169 |
+
try:
|
| 170 |
+
wrapper = fromstring(f"<div>{html_fragment}</div>")
|
| 171 |
+
except Exception:
|
| 172 |
+
return html_fragment
|
| 173 |
+
self._remove_images_from_tree(wrapper)
|
| 174 |
+
return "".join(tostring(child, encoding=str) for child in wrapper)
|
| 175 |
+
|
| 176 |
+
def recover_wild_text(self, tree, base_url="", aggressive=False):
|
| 177 |
+
if tree is None:
|
| 178 |
+
return None
|
| 179 |
+
working_tree = deepcopy(tree)
|
| 180 |
+
try:
|
| 181 |
+
pruned_tree, _ = self.prune_unwanted_sections(working_tree)
|
| 182 |
+
except Exception:
|
| 183 |
+
pruned_tree = working_tree
|
| 184 |
+
search_expr = ".//p|.//pre|.//code|.//blockquote|.//q|.//quote"
|
| 185 |
+
if self.include_tables:
|
| 186 |
+
search_expr += "|.//table"
|
| 187 |
+
if aggressive:
|
| 188 |
+
search_expr += "|.//div|.//section|.//article|.//li"
|
| 189 |
+
try:
|
| 190 |
+
nodes = pruned_tree.xpath(search_expr)
|
| 191 |
+
except Exception:
|
| 192 |
+
nodes = []
|
| 193 |
+
if not nodes:
|
| 194 |
+
return None
|
| 195 |
+
container = Element("div")
|
| 196 |
+
seen_texts = set()
|
| 197 |
+
for node in nodes:
|
| 198 |
+
try:
|
| 199 |
+
text_value = trim(node.text_content())
|
| 200 |
+
except Exception:
|
| 201 |
+
text_value = None
|
| 202 |
+
if not text_value:
|
| 203 |
+
continue
|
| 204 |
+
if text_len(text_value) < 10:
|
| 205 |
+
continue
|
| 206 |
+
if text_value in seen_texts:
|
| 207 |
+
continue
|
| 208 |
+
seen_texts.add(text_value)
|
| 209 |
+
if node.tag == "table":
|
| 210 |
+
if self.include_tables:
|
| 211 |
+
container.append(deepcopy(node))
|
| 212 |
+
continue
|
| 213 |
+
else:
|
| 214 |
+
paragraph = Element("p")
|
| 215 |
+
paragraph.text = text_value
|
| 216 |
+
container.append(paragraph)
|
| 217 |
+
if len(container) == 0:
|
| 218 |
+
return None
|
| 219 |
+
return tostring(container, encoding=str)
|
| 220 |
+
|
| 221 |
+
def readability_fallback(self, tree, base_url=""):
|
| 222 |
+
if tree is None:
|
| 223 |
+
return None
|
| 224 |
+
try:
|
| 225 |
+
doc = DocumentPlus(
|
| 226 |
+
deepcopy(tree),
|
| 227 |
+
url=base_url,
|
| 228 |
+
xp_num="others",
|
| 229 |
+
need_comment=self.need_comment,
|
| 230 |
+
)
|
| 231 |
+
return doc.summary(html_partial=True)
|
| 232 |
+
except Exception:
|
| 233 |
+
return None
|
| 234 |
+
|
| 235 |
+
def apply_fallbacks(self, primary_html, base_url, normal_tree, raw_tree):
|
| 236 |
+
if self._is_content_sufficient(primary_html):
|
| 237 |
+
return primary_html, "primary"
|
| 238 |
+
|
| 239 |
+
wild_html = None
|
| 240 |
+
if self.enable_wild_text_fallback:
|
| 241 |
+
wild_html = self.recover_wild_text(normal_tree, base_url)
|
| 242 |
+
if self._is_content_sufficient(wild_html):
|
| 243 |
+
return wild_html, "wild_text"
|
| 244 |
+
|
| 245 |
+
readability_html = None
|
| 246 |
+
if self.enable_readability_fallback:
|
| 247 |
+
readability_html = self.readability_fallback(raw_tree, base_url)
|
| 248 |
+
if self._is_content_sufficient(readability_html):
|
| 249 |
+
return readability_html, "readability"
|
| 250 |
+
|
| 251 |
+
for candidate, name in (
|
| 252 |
+
(primary_html, "primary"),
|
| 253 |
+
(wild_html, "wild_text"),
|
| 254 |
+
(readability_html, "readability"),
|
| 255 |
+
):
|
| 256 |
+
if candidate:
|
| 257 |
+
return candidate, name
|
| 258 |
+
return "", "primary"
|
| 259 |
+
|
| 260 |
+
def prune_unwanted_nodes(self, tree, nodelist, with_backup=False):
|
| 261 |
+
if with_backup is True:
|
| 262 |
+
old_len = len(tree.text_content())
|
| 263 |
+
backup = deepcopy(tree)
|
| 264 |
+
for expr in nodelist:
|
| 265 |
+
for subtree in tree.xpath(expr):
|
| 266 |
+
if self.preserve_math_containers and subtree.xpath(".//math"):
|
| 267 |
+
continue
|
| 268 |
+
|
| 269 |
+
# DISCARD_IMAGE_ELEMENTS 需要特殊判断
|
| 270 |
+
if '"caption"' in expr and subtree.xpath(".//img"):
|
| 271 |
+
continue
|
| 272 |
+
# 有些出现hidden
|
| 273 |
+
if "hidden" in expr:
|
| 274 |
+
try:
|
| 275 |
+
if re.findall(
|
| 276 |
+
"overflow-x:\s*hidden", subtree.attrib["style"]
|
| 277 |
+
) or re.findall(
|
| 278 |
+
"overflow-y:\s*hidden", subtree.attrib["style"]
|
| 279 |
+
):
|
| 280 |
+
continue
|
| 281 |
+
if re.findall(
|
| 282 |
+
"overflow:\s*hidden", subtree.attrib["style"]
|
| 283 |
+
) and re.findall("height:", subtree.attrib["style"]):
|
| 284 |
+
height_px = re.findall(
|
| 285 |
+
"height:\s*(\d+)", subtree.attrib["style"]
|
| 286 |
+
)[0]
|
| 287 |
+
if int(height_px) >= 800:
|
| 288 |
+
continue
|
| 289 |
+
except:
|
| 290 |
+
pass
|
| 291 |
+
|
| 292 |
+
if ancestor_node_check(subtree, ['code', 'pre']):
|
| 293 |
+
continue
|
| 294 |
+
self.remove_node(subtree)
|
| 295 |
+
if with_backup is False:
|
| 296 |
+
return tree
|
| 297 |
+
# else:
|
| 298 |
+
new_len = len(tree.text_content())
|
| 299 |
+
if new_len > old_len / 7:
|
| 300 |
+
return tree
|
| 301 |
+
return backup
|
| 302 |
+
|
| 303 |
+
def prune_html(self, tree):
|
| 304 |
+
"""Delete selected empty elements"""
|
| 305 |
+
for element in tree.xpath(".//*[not(node())]"):
|
| 306 |
+
if element.tag in CUT_EMPTY_ELEMS:
|
| 307 |
+
self.remove_node(element)
|
| 308 |
+
return tree
|
| 309 |
+
|
| 310 |
+
def remove_node(self, node: HtmlElement):
    """Detach ``node`` from its parent while keeping its tail text.

    A non-blank tail is appended to the tail of the previous sibling, or
    to the parent's text when ``node`` is the first child. If the node
    carries a ``Unique_ID`` attribute, its integer value is recorded in
    ``self.drop_ids``. A node without a parent is left alone.
    """
    parent = node.getparent()
    tail = node.tail

    if text_strip(tail):
        previous = node.getprevious()
        if previous is not None:
            previous.tail = previous.tail + tail if text_strip(previous.tail) else tail
        elif parent is not None:
            parent.text = parent.text + tail if text_strip(parent.text) else tail

    if parent is None:
        return

    idx = node.attrib.get(Unique_ID, "")
    parent.remove(node)
    if idx:
        self.drop_ids.append(int(idx))
|
| 331 |
+
|
| 332 |
+
def clean_tags(self, tree):
    """Strip comments, noise sections and unwanted tags from ``tree``.

    Steps: drop HTML comments, prune the noise XPath lists (comment
    sections too unless ``need_comment`` is set), turn ``<figure>``
    elements that wrap a table into ``<div>``, remove the manually cleaned
    tags and short forms, then run ``HTML_CLEANER`` over the result.

    Returns:
        The cleaned tree as returned by ``HTML_CLEANER.clean_html``.
    """
    # with_tail=False: the default would also delete the text that follows
    # each comment, which is real page content.
    strip_elements(tree, Comment, with_tail=False)

    xp_lists = []
    if not self.need_comment:
        xp_lists.append(REMOVE_COMMENTS_XPATH)
    xp_lists.append(CONTENT_EXTRACTOR_NOISE_XPATHS)
    for xp_list in xp_lists:
        tree = self.prune_unwanted_nodes(tree, xp_list)

    cleaning_list = MANUALLY_CLEANED.copy()
    stripping_list = MANUALLY_STRIPPED.copy()

    for elem in tree.xpath(".//figure[descendant::table]"):
        elem.tag = "div"

    for expression in cleaning_list + ["form"]:
        # iter() replaces the deprecated getiterator(); same traversal.
        for element in tree.iter(expression):
            if self.preserve_math_containers and element.xpath('.//math'):
                continue
            # Forms are special: only forms with little non-link text go.
            if element.tag == "form":
                ptest = element.xpath(".//text()[not(ancestor::a)]")
                if text_len("".join(ptest)) > 60:
                    continue
            self.remove_node(element)

    # NOTE: this reconfigures the shared HTML_CLEANER object.
    HTML_CLEANER.kill_tags, HTML_CLEANER.remove_tags = cleaning_list, stripping_list
    cleaned_tree = HTML_CLEANER.clean_html(self.prune_html(tree))

    return cleaned_tree
|
| 366 |
+
|
| 367 |
+
def generate_unique_id(self, element):
    """Number the nodes under ``element`` through the ``Unique_ID`` attribute.

    Nodes are visited in ``iter_node`` order and receive consecutive
    integers starting at 0; ``<html>`` and ``<body>`` are skipped.
    """
    next_id = 0
    for node in iter_node(element):
        if node.tag.lower() in ["html", "body"]:
            continue
        node.attrib[Unique_ID] = str(next_id)
        next_id += 1
|
| 374 |
+
|
| 375 |
+
def clean_unique_id(self, raw_element, content_html):
    """Build the "dropped content" HTML and strip the unique-id attributes.

    For each id still present in ``content_html`` (ascending ids only), the
    matching node in ``raw_element`` is replaced by a ``<div>`` that holds
    just those of its descendants that were dropped during extraction
    (``self.drop_ids``). ``raw_element`` is modified in place.

    Returns:
        ``(content_html, drop_html)``, both with the ``Unique_ID``
        attributes removed; ``drop_html`` is the serialised ``raw_element``.
    """
    # Raw f-strings: "\d" inside a plain string literal is an invalid escape.
    id_capture = rf' {Unique_ID}="(\d+)"'
    id_attribute = rf' {Unique_ID}="\d+"'

    ids = re.findall(id_capture, content_html)
    self.drop_ids = list(set(self.drop_ids))
    self.drop_ids.sort()
    skip_ids = [-1]
    for x in ids:
        if int(x) > int(skip_ids[-1]):
            skip_ids.append(int(x))
            drop_node = raw_element.xpath(
                f"//*[@{Unique_ID}='{x}']"
            )
            if drop_node:
                new_div = Element("div")
                for j in self.drop_ids:
                    if int(j) > int(skip_ids[-1]):
                        append_element = drop_node[0].xpath(
                            f".//*[@{Unique_ID}='{j}']"
                        )
                        if append_element:
                            skip_ids.append(j)
                            # Descendants travel with their ancestor, so
                            # they must not be collected a second time.
                            if len(append_element[0]) > 0:
                                skip_ids.extend(
                                    [
                                        int(pjid)
                                        for pjid in append_element[0].xpath(
                                            f".//*/@{Unique_ID}"
                                        )
                                    ]
                                )
                            append_element[0].tail = None
                            new_div.append(append_element[0])

                # Best effort: a node that cannot be swapped (for example
                # the root) is left in place.
                try:
                    drop_node[0].addnext(new_div)
                    parent = drop_node[0].getparent()
                    if parent is not None:
                        parent.remove(drop_node[0])
                except Exception:
                    pass

    content_html = re.sub(id_attribute, "", content_html)

    drop_html = re.sub(
        id_attribute,
        "",
        tostring(raw_element, encoding=str),
    )
    return content_html, drop_html
|
| 423 |
+
|
| 424 |
+
def math_latex_processing(self, node):
    r"""Turn the math markup carried by ``node`` into wrapped LaTeX text.

    The node is edited in place, or swapped for a ``<span>`` in its parent.
    Rules, applied in this order:

    1.  ``\begin{align}`` / ``\begin{equation}`` environments in the node's
        text and tail (not for ``<script>``/``<style>``): align markers are
        dropped, equation bodies are wrapped as display math.
    3.  ``<img>`` formulas: LaTeX image classes (``alt``), codecogs /
        mimetex / mathtex URLs (query string), ``latex.php`` and
        ``/images/math/codecogs`` (``alt``), ``x-ck12`` classes (``alt``).
    4.  ``class="math-container"`` and 5. ``class="wp-katex-eq"``: node text.
    6.  ``<script type="math/tex">`` and 7. ``math/asciimath`` scripts.
    8.  ``class="tex"`` with a ``data-expr`` attribute.
    9.  ``<span class="katex">``: rendered ``katex-html`` children removed.
    10. ``<span class="MathJax_Preview">`` removed; ``x-ck12-mathEditor``
        spans converted from ``data-tex``.
    11. ``<math>``: TeX annotation, else ``alttext``, else MathML -> LaTeX
        conversion; ``<mathjax>``: ``#...#`` AsciiMath runs converted.

    (Rule 2, ``texerror`` elements, is not implemented.)
    """

    def _strip_markers(text, env):
        return text.replace("\\begin{%s}" % env, "").replace("\\end{%s}" % env, "")

    def _wrap_equation(found):
        body = _strip_markers(found.group(1), "equation")
        # An empty body must not be fed to str.replace()/wrap_math: the
        # previous replace-by-content approach corrupted the whole text.
        if not body.strip():
            return ""
        return wrap_math(body, display=True)

    def _unwrap_environments(text):
        if re.search(r"\\begin{align}(.*?)\\end{align}", text, re.DOTALL):
            text = _strip_markers(text, "align")
        text, hits = re.subn(
            r"\\begin{equation}(.*?)\\end{equation}",
            _wrap_equation,
            text,
            flags=re.DOTALL,
        )
        if hits:
            # Remove any unpaired \begin{equation} / \end{equation} left over.
            text = _strip_markers(text, "equation")
        return text

    # 1. \begin{align} or \begin{equation} inside the text / tail
    if node.tag not in ["script", "style"]:
        if text_strip(node.text):
            new_text = _unwrap_environments(node.text)
            if new_text != node.text:
                node.text = new_text
        if text_strip(node.tail):
            new_tail = _unwrap_environments(node.tail)
            if new_tail != node.tail:
                node.tail = new_tail

    node_class = node.get("class")
    parent = node.getparent()

    def _make_span(text):
        span = Element("span")
        span.text = text
        return span

    def _replace_with_span(text):
        # Swap node for a span carrying `text`, keeping the node's tail.
        span = _make_span(text)
        if parent is not None:
            if text_strip(node.tail):
                span.tail = node.tail
            parent.replace(node, span)

    def _insert_span_before(text):
        node.addprevious(_make_span(text))

    # 2. class "texerror": not implemented.

    # 3. LaTeX carried by images
    if node.tag == "img":
        if node_class:
            class_list = node_class.split(" ")
            if any(img_class in class_list for img_class in latex_image_class_names):
                alt = node.get("alt")
                if text_strip(alt):
                    _insert_span_before(wrap_math(alt))
                    self.remove_node(node)

        src = node.get("src")
        if src:
            # (marker in src, where the LaTeX lives); order is significant.
            for marker, origin in (
                ("codecogs.com", "query"),
                ("latex.php", "alt"),
                ("/images/math/codecogs", "alt"),
                ("mimetex.cgi", "query"),
                ("mathtex.cgi", "query"),
            ):
                if marker not in src:
                    continue
                try:
                    if origin == "query":
                        # Re-join in case the LaTeX itself contains "?".
                        latex = unquote("?".join(src.split("?")[1:]))
                    else:
                        # e.g. alt='-i u_t + \Delta u = |u|^2 u'
                        latex = node.get("alt")
                        if not text_strip(latex):
                            continue
                        latex = unquote(latex)
                    _insert_span_before(wrap_math(latex))
                    self.remove_node(node)
                except Exception:
                    pass

        if node_class:
            if "x-ck12" in node_class:
                try:
                    latex = node.get("alt")
                    if text_strip(latex):
                        _insert_span_before(wrap_math(unquote(latex)))
                except Exception:
                    pass

    # 4. class "math-container"
    if node_class == "math-container":
        try:
            text = node.text
            if text_strip(text):
                _replace_with_span(wrap_math(text, display=True))
        except Exception:
            pass

    # 5. class "wp-katex-eq"
    if node_class == "wp-katex-eq":
        try:
            text = node.text
            if text_strip(text):
                display = node.get("data-display") == "true"
                _replace_with_span(wrap_math(text, display=display))
        except Exception:
            pass

    # 6. script[type="math/tex"]
    if node.tag == "script" and node.get("type") == "math/tex":
        try:
            text = node.text
            if text_strip(text):
                _replace_with_span(wrap_math(text))
        except Exception:
            pass

    # 7. script[type="math/asciimath"]
    if node.tag == "script" and node.get("type") == "math/asciimath":
        try:
            text = node.text
            if text_strip(text):
                _replace_with_span(wrap_math(extract_asciimath(text)))
        except Exception:
            # Conversion failed: delete this script tag.
            self.remove_node(node)

    # 8. class "tex" with a data-expr attribute
    if node_class == "tex":
        try:
            expr = node.get("data-expr")
            if text_strip(expr):
                _replace_with_span(wrap_math(expr))
        except Exception:
            pass

    # 9. span.katex: drop the rendered "katex-html" copies
    if node.tag == "span" and node_class == "katex":
        for katex_html_span in node.xpath('.//span[@class="katex-html"]'):
            self.remove_node(katex_html_span)

    # 10. Remove any .MathJax_Preview spans
    if node.tag == "span" and node_class == "MathJax_Preview":
        self.remove_node(node)

    if node.tag == "span" and node_class and "x-ck12-mathEditor" in node_class:
        try:
            expr = node.get("data-tex")
            if text_strip(expr):
                # NOTE(review): the second literal was lost to HTML decoding
                # in the reviewed copy; "&quot;" is the reconstruction.
                expr = unquote(expr).replace("\"", "").replace("&quot;", "")
                _replace_with_span(wrap_math(expr))
        except Exception:
            pass

    # 11. all math tags
    if node.tag == "math":
        annotation_tags = node.xpath('.//annotation[@encoding="application/x-tex"]')
        if len(annotation_tags) > 0:
            text = annotation_tags[0].text
            if text_strip(text):
                _replace_with_span(wrap_math(text))
                if parent is not None:
                    style_value = parent.get("style")
                    if style_value:
                        normalized_style_value = (
                            style_value.lower()
                            .strip()
                            .replace(" ", "")
                            .replace(";", "")
                        )
                        if "display:none" in normalized_style_value:
                            # Un-hide the parent. Assigning `parent.style`
                            # only set a Python attribute and never touched
                            # the HTML, so edit the attribute itself.
                            visible_style = re.sub(
                                r"display\s*:\s*none\s*(!\s*important)?\s*;?",
                                "",
                                style_value,
                                flags=re.IGNORECASE,
                            ).strip()
                            if visible_style:
                                parent.set("style", visible_style)
                            else:
                                del parent.attrib["style"]
        elif text_strip(node.get("alttext")):
            _replace_with_span(wrap_math(node.get("alttext")))
        else:
            try:
                # Try translating the MathML to LaTeX.
                tmp_node = deepcopy(node)
                tmp_node.tail = None
                mathml = tostring(tmp_node, encoding=str)
                # With an xmlns:mml declaration, drop both the "mml:"
                # prefixes and the declaration itself.
                if "xmlns:mml" in mathml:
                    mathml = mathml.replace("mml:", "")
                    mathml = re.sub(r'xmlns:mml=".*?"', "", mathml)
                _replace_with_span(wrap_math(mml_to_latex(mathml)))
            except Exception:
                self.remove_node(node)

    if node.tag == "mathjax":
        try:
            text = node.text
            if text_strip(text):
                text = html.unescape(text)
                # AsciiMath runs are delimited by hashes: #...#
                for match in re.findall(r"#(.+?)#", text):
                    try:
                        latex = extract_asciimath(match)
                        text = text.replace(f"#{match}#", latex)
                    except Exception:
                        pass
                _replace_with_span(text)
        except Exception:
            pass
|
| 786 |
+
|
| 787 |
+
def convert_tags(self, element, base_url=""):
    """Normalise every node under ``element`` and return ``element``.

    Per node: run the math conversion (when ``process_math`` is set),
    promote ``data-src`` to ``src``, make ``src`` absolute against
    ``base_url``, retag child-less ``<div>`` as ``<p>``, and remove nodes
    whose class is in the useless-class list (which includes ``"comment"``
    unless comments are wanted).
    """
    if self.need_comment:
        useless_classes = USELESS_ATTR
    else:
        useless_classes = USELESS_ATTR + ["comment"]

    for node in iter_node(element):
        if self.process_math:
            # Convert math markup to LaTeX text.
            self.math_latex_processing(node)

        attrs = node.attrib
        if "data-src" in attrs and "src" not in attrs:
            attrs["src"] = attrs["data-src"]
        if base_url and "src" in attrs and attrs["src"]:
            attrs["src"] = urljoin(base_url, attrs["src"])

        if node.tag.lower() == "div" and len(node) == 0:
            node.tag = "p"

        class_name = node.get("class")
        if class_name and class_name.lower() in useless_classes:
            self.remove_node(node)

    return element
|
| 813 |
+
|
| 814 |
+
def delete_by_link_density(
    self, subtree, tagname, backtracking=False, favor_precision=False
):
    """Remove link-heavy / list-like ``tagname`` elements from ``subtree``.

    Two passes run over the ``tagname`` elements of ``subtree``:

    1. Sibling groups: an element and its following siblings of the same
       tag are treated as one group. Groups that look like link lists or
       teaser lists are removed. When the element's class looks like a
       post, only one member of the group is kept.
    2. Single elements: elements flagged by ``link_density_test`` (and, for
       ``div``, by ``density_of_a_text``) are removed. With
       ``backtracking`` set, short texts repeated in at least three
       elements are removed too.

    Args:
        subtree: element to clean in place.
        tagname: tag to examine (e.g. ``"div"``, ``"ul"``, ``"p"``).
        backtracking: enable the repeated-text removal of pass 2.
        favor_precision: forwarded to ``link_density_test``; also raises
            the repeated-text length threshold from 100 to 200.

    Returns:
        ``(subtree, drop_list)``; ``drop_list`` is ``True`` when a sibling
        group was removed in pass 1.
    """
    need_del_par = []
    skip_par = []
    drop_list = False
    for descendant in subtree.iter(tagname):
        pparent = descendant.getparent()
        # Each parent is judged once, through its first child of this tag.
        if pparent in need_del_par or pparent in skip_par:
            continue
        siblings = descendant.xpath(f"following-sibling::{tagname}")

        # A "list"-classed element with 5+ direct links: drop the whole group.
        if 'list' in descendant.get("class", "") and len(descendant.xpath('./a')) >= 5:
            need_del_par.append(descendant)
            need_del_par.extend(siblings)
            continue

        nn = [descendant]
        nn.extend(siblings)
        txt_max_num = 0
        if len(siblings) + 1 >= 4:
            pass
        else:
            # Small groups (< 4 members) only qualify when at least three
            # members end with the same teaser word ("read", "more", ...).
            txt_max_dict = {
                "read": 0,
                "more": 0,
                "...": 0,
                "阅读": 0,
                "更多": 0,
                "详细": 0,
                "detail": 0,
                "article": 0,
                "blog": 0,
                "news": 0,
            }
            if tagname == "div" or tagname == "article" or tagname == "section":
                for j in nn:
                    txt = "".join(j.xpath(".//text()")).strip()
                    for x in [
                        "read",
                        "more",
                        "...",
                        "阅读",
                        "更多",
                        "详细",
                        "detail",
                        "article",
                        "blog",
                        "news",
                    ]:
                        if txt.lower().endswith(x):
                            txt_max_dict[x] += 1
                    txt_num = max(txt_max_dict.values())
                    if txt_max_num < txt_num:
                        txt_max_num = txt_num
                    if txt_max_num >= 3:
                        break
            if txt_max_num >= 3:
                pass
            else:
                continue
        skip_par.append(pparent)
        # Count the siblings that look like pure link items.
        a_num = 0
        for j in siblings:
            if j.xpath(".//a"):
                if tagname == "p":
                    if density_of_a_text(j, pre=0.8):
                        a_num += 1
                elif tagname in ["div", "section", "article"]:
                    if density_of_a_text(j, pre=0.2):
                        a_num += 1
                else:
                    if self.need_comment:
                        # Check whether the sibling is a comment block
                        # before deciding to count it for deletion.
                        break_flg = False
                        for c_xpath in Forum_XPATH[:-1]:
                            if j.xpath(c_xpath.replace(".//*", "self::*")):
                                break_flg = True
                                break
                        if break_flg:
                            continue
                    if tagname == "li":
                        # List items with more than 50 of non-link text
                        # length are not counted.
                        if text_len("".join(j.xpath(".//text()[not(ancestor::a)]"))) > 50:
                            continue
                    a_num += 1

        # Not every sibling is link-like: keep the group, unless at least 15
        # siblings of a block-level tag are.
        if a_num < len(siblings):
            if a_num >= 15 and (
                tagname == "div" or tagname == "article" or tagname == "section"
            ):
                pass
            else:
                continue

        similarity_with_siblings_nums = similarity_with_siblings(
            descendant, siblings
        )
        if tagname == "article" or tagname == "item":  # or tagname == "section"
            similarity_with_siblings_nums = similarity_with_siblings_nums * 1.5
        # Lists have a special case: the element and its siblings may all
        # contain a title (h1 / h2 head) or a nested article.
        if tagname == "div" or tagname == "article" or tagname == "section":
            title_max_num = 0
            for ll in [".//head[@rend='h2']", ".//head[@rend='h1']", "./article"]:
                title_num = 0
                for jj in nn:
                    if jj.xpath(ll):
                        title_num += 1
                if title_max_num < title_num:
                    title_max_num = title_num
            if title_max_num >= 4:
                similarity_with_siblings_nums = similarity_with_siblings_nums * 1.5

        if txt_max_num >= 3:
            pass
        elif similarity_with_siblings_nums < 0.84:
            if len(siblings) >= 15 and (
                tagname == "div" or tagname == "article" or tagname == "section"
            ):
                pass
            else:
                continue
        # When the parent holds several sibling blocks whose class looks
        # like "post-", delete the others and keep a single article.
        # NOTE(review): "aricle" in the pattern below looks like a typo for
        # "article"; left unchanged because fixing it alters which pages
        # take this branch - confirm intent.
        class_attr = descendant.get("class") if descendant.get("class") else ""
        if (
            re.findall("post-", class_attr, re.I)
            or re.findall("-post", class_attr, re.I)
            or re.findall("blog|aricle", class_attr, re.I)
        ):
            drop_list = True
            sk_flg = True
            for dl in siblings:
                # The first sibling more than twice as long as the element
                # survives and the element goes instead; every other sibling
                # is removed.
                if (
                    text_len("".join(descendant.xpath(".//text()"))) * 2
                    < text_len("".join(dl.xpath(".//text()")))
                    and sk_flg
                ):
                    self.remove_node(descendant)
                    sk_flg = False
                else:
                    self.remove_node(dl)
        else:
            need_del_par.append(descendant)
            need_del_par.extend(siblings)
    for node in need_del_par:
        drop_list = True
        try:
            self.remove_node(node)
        except Exception as e:
            pass

    # Pass 2: individual elements.
    # myelems maps trimmed element text to the elements carrying that text.
    myelems, deletions = defaultdict(list), []

    if tagname == "div":
        for elem in subtree.iter(tagname):
            if density_of_a_text(elem, pre=0.8) and img_div_check(elem):
                deletions.append(elem)

    for elem in subtree.iter(tagname):
        elemtext = trim(elem.text_content())
        result, templist = link_density_test(elem, elemtext, favor_precision)
        if result is True and img_div_check(elem):
            # Keep links that live inside a table cell.
            if tagname in ['ul', 'li', 'div', 'p'] and ancestor_node_check(elem, ['td']):
                continue
            deletions.append(elem)
        elif backtracking is True and len(templist) > 0:  # if?
            myelems[elemtext].append(elem)
    if backtracking is True:
        if favor_precision is False:
            threshold = 100
        else:
            threshold = 200
        # A short text repeated in 3+ elements is treated as boilerplate.
        for text, elem in myelems.items():
            if 0 < len(text) < threshold and len(elem) >= 3:
                deletions.extend(elem)

    for elem in uniquify_list(deletions):
        try:
            if self.need_comment:
                # Check whether the element contains comments before
                # deciding to delete it.
                break_flg = False
                for c_xpath in Forum_XPATH[:-1]:
                    if elem.xpath(c_xpath):
                        break_flg = True
                        break
                if break_flg:
                    continue
            self.remove_node(elem)
        except AttributeError:
            pass
    return subtree, drop_list
|
| 1005 |
+
|
| 1006 |
+
def prune_unwanted_sections(self, tree):
    """Strip boilerplate sections and link-heavy containers from ``tree``.

    First removes nodes matched by the discard XPath lists, then runs
    ``delete_by_link_density`` once per container tag.

    Returns:
        A ``(tree, drop_flag)`` tuple, where ``drop_flag`` is the ``or``
        combination of the flags reported by the link-density passes.
    """
    # When comments must be kept, the last overall-discard entry is skipped
    # (presumably the comment-matching rule -- confirm against config).
    if self.need_comment:
        overall_xpaths = OVERALL_DISCARD_XPATH[:-1]
    else:
        overall_xpaths = OVERALL_DISCARD_XPATH
    tree = self.prune_unwanted_nodes(tree, overall_xpaths, with_backup=True)

    for discard_xpaths in (
        PAYWALL_DISCARD_XPATH,
        TEASER_DISCARD_XPATH,
        DISCARD_IMAGE_ELEMENTS,
    ):
        tree = self.prune_unwanted_nodes(tree, discard_xpaths)

    # Remove elements by link density; only the <div> pass uses backtracking.
    flags = {}
    for tag in ("div", "article", "section", "ul", "li", "dl", "dt", "dd", "p"):
        tree, flags[tag] = self.delete_by_link_density(
            tree, tag, backtracking=(tag == "div"), favor_precision=False
        )

    # Fold the flags with ``or`` in the historical evaluation order so the
    # returned object is exactly the one the chained expression would yield.
    dropped = False
    for tag in ("div", "ul", "li", "p", "article", "section", "dl", "dd", "dt"):
        dropped = dropped or flags[tag]
    return tree, dropped
|
ultradata_math_parser/parsers/custom_parser.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding:utf-8 -*-
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
from ultradata_math_parser.utils import *
|
| 5 |
+
from ultradata_math_parser.parsers.base_parser import BaseParser
|
| 6 |
+
from ultradata_math_parser.parsers.title_parser import TitleParser
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class CustomParser(BaseParser):
    """Parser driven by caller-supplied XPath rules instead of heuristics."""

    def __init__(self) -> None:
        super().__init__()

    def use_clean_rule(self, tree, clean_rules):
        """Remove every node matched by any XPath in ``clean_rules``.

        Returns the same ``tree`` object after removal.
        """
        for clean_rule in clean_rules:
            for node in tree.xpath(clean_rule):
                self.remove_node(node)
        return tree

    def use_extract_rule(self, tree, extract_rule):
        """Evaluate the XPath stored under ``extract_rule["value"]``.

        If the expression contains ``/text()`` the matched strings are joined
        and stripped; otherwise the first matched node is returned.

        Raises:
            IndexError: a node expression matched nothing.
        """
        expression = extract_rule["value"]
        if "/text()" in expression:
            return "".join(tree.xpath(expression)).strip()
        return tree.xpath(expression)[0]

    def extract(self, html="", base_url="", rule=None, **kwargs) -> dict:
        """Extract title and content from ``html`` using ``rule``.

        Args:
            html: Raw HTML text.
            base_url: Fallback base URL; replaced by ``<base href>`` when the
                document declares one containing ``http``.
            rule: Mapping with optional ``"clean"`` (list of XPaths to drop),
                optional ``"title"`` and required ``"content"`` extract rules.
                ``None`` is treated as an empty mapping.
            **kwargs: ``include_images`` (default ``False``).

        Raises:
            ValueError: the HTML cannot be loaded or the content rule fails.
        """
        # A fresh dict per call avoids sharing one mutable default object.
        rule = {} if rule is None else rule
        self.include_images = kwargs.get("include_images", False)
        tree = load_html(html)
        if tree is None:
            raise ValueError("could not parse the supplied HTML")

        # base_url: prefer the document's own <base href> when it is absolute.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]

        if "clean" in rule:
            tree = self.use_clean_rule(tree, rule["clean"])

        # Title: explicit rule if present, otherwise the generic title parser.
        if "title" not in rule:
            title = TitleParser().process(tree)
        else:
            title = self.use_extract_rule(tree, rule["title"])

        # Article region: any failure (missing rule, no match, bad XPath) is
        # reported as ValueError, with the original error kept as the cause.
        try:
            body_tree = self.use_extract_rule(tree, rule["content"])
        except Exception as exc:
            raise ValueError("content extraction rule failed") from exc

        if not self.include_images:
            self._remove_images_from_tree(body_tree)
        body_html = tostring(body_tree, encoding=str)
        body_html = self._strip_images_from_html(body_html)

        text_length = self._text_length_from_html(body_html)

        return {
            "xp_num": "custom",
            "drop_list": False,
            "html": body_html,
            "title": title,
            "base_url": base_url,
            "text_length": text_length,
        }
|
ultradata_math_parser/parsers/forum_parser.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding:utf-8 -*-
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
from ultradata_math_parser.config import Forum_XPATH, Unique_ID
|
| 5 |
+
from ultradata_math_parser.utils import *
|
| 6 |
+
from ultradata_math_parser.parsers.base_parser import BaseParser
|
| 7 |
+
from ultradata_math_parser.parsers.title_parser import TitleParser
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class ForumParser(BaseParser):
    """Parser for forum-style pages: main post plus the surrounding replies."""

    def __init__(self) -> None:
        super().__init__()

    def _move_matching_descendants(self, node, xpath_expr, target):
        """Move every descendant of ``node`` matching ``xpath_expr`` into ``target``.

        The expression is re-evaluated after each move because moving a node
        also moves its own descendants. Returns True if anything was moved.
        """
        moved = False
        while True:
            hits = node.xpath(xpath_expr)
            if not hits:
                return moved
            self.remove_node(hits[0])
            target.append(hits[0])
            moved = True

    def extract(self, html="", base_url="", **kwargs) -> dict:
        """Extract the main content and append forum posts found around it.

        Args:
            html: Raw HTML text.
            base_url: Fallback base URL; replaced by ``<base href>`` when the
                document declares one containing ``http``.
            **kwargs: Optional overrides for ``preserve_math_containers``,
                ``process_math``, ``include_tables`` and ``include_images``.

        Returns:
            dict with ``xp_num``, ``drop_list``, ``html``, ``title``,
            ``base_url`` and ``text_length``.

        Raises:
            ValueError: the HTML cannot be loaded.
        """
        self.preserve_math_containers = kwargs.get(
            "preserve_math_containers", self.preserve_math_containers
        )
        self.process_math = kwargs.get("process_math", self.process_math)
        self.include_tables = kwargs.get("include_tables", self.include_tables)
        self.include_images = kwargs.get("include_images", self.include_images)
        self.need_comment = True

        # Normalise non-breaking spaces (named entity, numeric reference or the
        # literal character) to plain spaces. Explicit escapes are used because
        # the literal character is indistinguishable from a space in source.
        for nbsp in ("&nbsp;", "&#160;", "\u00a0"):
            html = html.replace(nbsp, " ")

        tree = load_html(html)
        if tree is None:
            raise ValueError("could not parse the supplied HTML")

        # Title
        title = TitleParser().process(tree)

        # base_url: prefer the document's own <base href> when it is absolute.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]
        self.generate_unique_id(tree)

        format_tree = self.convert_tags(tree, base_url=base_url)
        format_tree = self._remove_tables_from_tree(format_tree)
        format_tree = self._remove_images_from_tree(format_tree)

        normal_tree = self.clean_tags(format_tree)
        normal_tree = self._remove_tables_from_tree(normal_tree)
        normal_tree = self._remove_images_from_tree(normal_tree)

        subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
        if xp_num == "others":
            subtree, drop_list = self.prune_unwanted_sections(normal_tree)
        body_html = self.get_content_html(subtree, xp_num, base_url)
        body_html = self._strip_tables_from_html(body_html)
        body_html = self._strip_images_from_html(body_html)

        # Forum-specific handling starts here.
        body_html_tree = fromstring(body_html)
        # Depending on the lxml version, ``.body`` either raises or returns
        # None when the parsed fragment has no <body>; handle both.
        try:
            body_tree = body_html_tree.body
        except Exception:
            body_tree = None
        if body_tree is None:
            body_tree = Element("body")
            body_tree.extend(body_html_tree)
        main_ids = body_tree.xpath(f".//@{Unique_ID}")

        # Drop the already-extracted main content from the working tree so it
        # is not picked up a second time by the forum XPaths below.
        for main_id in main_ids:
            main_tree = normal_tree.xpath(f".//*[@{Unique_ID}={main_id}]")
            if main_tree:
                self.remove_node(main_tree[0])
        if not main_ids:
            main_ids = [-1]

        if xp_num != "others":
            normal_tree, _ = self.prune_unwanted_sections(normal_tree)

        for c_xpath in Forum_XPATH:
            checks_post_id = "'post-'" in c_xpath
            while True:
                matches = normal_tree.xpath(c_xpath)
                if not matches:
                    break
                x = matches[0]
                self.remove_node(x)

                elem_id = x.attrib.get("id", "").lower()
                # For 'post-' rules only keep nodes whose id is post-<n>/post_<n>.
                if checks_post_id and not re.search(r"post[-_]\d+", elem_id):
                    continue
                if "header" in x.attrib.get("class", "").lower() or "header" in elem_id:
                    continue

                # Best effort: a node that cannot be placed is silently dropped.
                try:
                    last_main_id = int(main_ids[-1])
                    if int(x.attrib.get(Unique_ID, "0")) > last_main_id:
                        # Node comes after the main content in document order.
                        body_tree.append(x)
                    else:
                        # Node encloses the main content's position: split its
                        # descendants into those after and those before it.
                        prefix_div = Element("div")
                        suffix_div = Element("div")
                        need_suffix = self._move_matching_descendants(
                            x,
                            f".//*[number(@{Unique_ID}) > {last_main_id}]",
                            suffix_div,
                        )
                        need_prefix = self._move_matching_descendants(
                            x,
                            f".//*[number(@{Unique_ID}) < {last_main_id}]",
                            prefix_div,
                        )
                        if need_prefix:
                            body_tree.insert(0, prefix_div)
                        if need_suffix:
                            body_tree.append(suffix_div)
                except Exception:
                    pass

        # Strip the internal marker attributes from the serialised output.
        body_html = re.sub(
            rf' {Unique_ID}="\d+"',
            "",
            tostring(body_tree, encoding=str),
        )

        text_length = self._text_length_from_html(body_html)

        return {
            "xp_num": xp_num,
            "drop_list": drop_list,
            "html": body_html,
            "title": title,
            "base_url": base_url,
            "text_length": text_length,
        }
|
ultradata_math_parser/parsers/title_parser.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding:utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from ultradata_math_parser.utils import *
|
| 4 |
+
from ultradata_math_parser.config import *
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class TitleParser:
    """Derive a page title from meta tags, headings and the <title> element."""

    def extract_by_meta(self, element: HtmlElement):
        """Return the joined result of the first ``METAS`` XPath that matches.

        Returns None when no expression matches.
        """
        for xpath in METAS:
            title = element.xpath(xpath)
            if title:
                return "".join(title)
        return None

    def extract_by_title(self, element: HtmlElement):
        """Return the stripped text of the document's <title> (may be empty)."""
        return "".join(element.xpath("//title//text()")).strip()

    def extract_by_hs(self, element: HtmlElement):
        """Return all text nodes found under any <h1>, <h2> or <h3>."""
        hs = element.xpath("//h1//text()|//h2//text()|//h3//text()")
        return hs or []

    def extract_by_h(self, element: HtmlElement):
        """Return the first direct text of the first <h1>, else <h2>, else <h3>.

        Returns None when no such heading has direct text.
        """
        for xpath in ["//h1", "//h2", "//h3"]:
            children = element.xpath(xpath)
            if not children:
                continue
            texts = children[0].xpath("./text()")
            if texts:
                return texts[0].strip()
        return None

    def process(self, element: HtmlElement):
        """Pick the best available title for ``element``.

        Order of preference: meta tags; the overlap between <title> and the
        heading text most similar to it; the <title> text; the first heading.
        """
        meta_title = self.extract_by_meta(element)
        if meta_title:
            return meta_title

        heading_title = self.extract_by_h(element)
        heading_texts = self.extract_by_hs(element)
        page_title = self.extract_by_title(element)

        if heading_texts:
            # max() returns the first best-scoring heading, which is the same
            # element a stable descending sort would place first.
            closest = max(
                heading_texts, key=lambda text: similarity2(text, page_title)
            )
            common = lcs_of_2(closest, page_title)
            # An empty overlap (e.g. the page has no <title>) must not hide
            # the remaining fallbacks.
            if common:
                return common

        if page_title:
            return page_title

        return heading_title
|
ultradata_math_parser/parsers/unified_parser.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding:utf-8 -*-
|
| 2 |
+
import re
|
| 3 |
+
from copy import deepcopy
|
| 4 |
+
|
| 5 |
+
from lxml.html import Element, tostring, fromstring
|
| 6 |
+
|
| 7 |
+
from ultradata_math_parser.config import Forum_XPATH, Unique_ID
|
| 8 |
+
from ultradata_math_parser.utils import load_html, text_len
|
| 9 |
+
from ultradata_math_parser.parsers.base_parser import BaseParser
|
| 10 |
+
from ultradata_math_parser.parsers.title_parser import TitleParser
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class UnifiedParser(BaseParser):
    """General parser that optionally re-attaches forum posts and falls back
    to alternative strategies when the primary result is too short."""

    def __init__(self):
        super().__init__()
        self.need_comment = True
        self.enable_forum_assembly = True
        # Assembled text must be at least this multiple of the primary text
        # length for the assembled version to replace it.
        self.forum_assembly_min_gain = 1.1

    def extract(self, html="", **kwargs) -> dict:
        """Extract the main content of ``html``.

        Args:
            html: Raw HTML text.
            **kwargs: ``base_url`` plus optional overrides for ``process_math``,
                ``preserve_math_containers``, ``include_tables``,
                ``include_images``, ``enable_forum_assembly`` and
                ``fallback_min_length``.

        Returns:
            dict with ``xp_num``, ``drop_list``, ``html``, ``title``,
            ``base_url``, ``fallback_strategy``, ``text_length`` and
            ``forum_assembled``.

        Raises:
            ValueError: the HTML cannot be loaded.
        """
        base_url = kwargs.get("base_url", "")
        self.process_math = kwargs.get("process_math", self.process_math)
        self.preserve_math_containers = kwargs.get(
            "preserve_math_containers", self.preserve_math_containers
        )
        self.include_tables = kwargs.get("include_tables", self.include_tables)
        self.include_images = kwargs.get("include_images", self.include_images)
        self.enable_forum_assembly = kwargs.get(
            "enable_forum_assembly", self.enable_forum_assembly
        )
        self.fallback_min_length = kwargs.get(
            "fallback_min_length", self.fallback_min_length
        )

        # Normalise non-breaking spaces (named entity, numeric reference or the
        # literal character) to plain spaces. Explicit escapes are used because
        # the literal character is indistinguishable from a space in source.
        for nbsp in ("&nbsp;", "&#160;", "\u00a0"):
            html = html.replace(nbsp, " ")

        tree = load_html(html)
        if tree is None:
            raise ValueError("could not parse the supplied HTML")

        title = TitleParser().process(tree)

        # Untouched copy for the fallback strategies.
        raw_tree = deepcopy(tree)

        # base_url: prefer the document's own <base href> when it is absolute.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]

        self.generate_unique_id(tree)

        # Tag conversion
        format_tree = self.convert_tags(tree, base_url=base_url)
        format_tree = self._remove_tables_from_tree(format_tree)
        format_tree = self._remove_images_from_tree(format_tree)

        normal_tree = self.clean_tags(format_tree)
        normal_tree = self._remove_tables_from_tree(normal_tree)
        normal_tree = self._remove_images_from_tree(normal_tree)

        # Snapshot taken before the extraction below mutates normal_tree.
        fallback_tree = deepcopy(normal_tree)

        # Main content extraction
        subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
        if xp_num == "others":
            subtree, drop_list = self.prune_unwanted_sections(normal_tree)

        body_html = self.get_content_html(subtree, xp_num, base_url)

        # Forum post assembly
        forum_assembled = False
        if self.enable_forum_assembly:
            if xp_num != "others":
                normal_tree, _ = self.prune_unwanted_sections(normal_tree)

            original_length = self._text_length_from_html(body_html)
            assembled_html = self._try_forum_assembly(normal_tree, body_html)
            assembled_length = self._text_length_from_html(assembled_html)

            # Require some assembled text: with both lengths at 0 the gain
            # test is trivially true and would report a bogus assembly.
            if (
                assembled_length > 0
                and assembled_length >= original_length * self.forum_assembly_min_gain
            ):
                body_html = assembled_html
                forum_assembled = True

        # Conditional fallback when the result is too short.
        current_length = self._text_length_from_html(body_html)
        fallback_strategy = "primary"

        if current_length < self.fallback_min_length:
            body_html, fallback_strategy = self.apply_fallbacks(
                primary_html=body_html,
                base_url=base_url,
                normal_tree=fallback_tree,
                raw_tree=raw_tree,
            )

        body_html = self._strip_tables_from_html(body_html)
        body_html = self._strip_images_from_html(body_html)

        # The primary HTML can still carry the internal marker attributes
        # (_try_forum_assembly reads them back from it), and only the assembly
        # path strips them, so strip them here for every path. Idempotent.
        if isinstance(body_html, str) and body_html:
            body_html = re.sub(rf' {Unique_ID}="\d+"', "", body_html)

        text_length = self._text_length_from_html(body_html)

        return {
            "xp_num": xp_num,
            "drop_list": drop_list,
            "html": body_html,
            "title": title,
            "base_url": base_url,
            "fallback_strategy": fallback_strategy,
            "text_length": text_length,
            "forum_assembled": forum_assembled,
        }

    def _move_matching_descendants(self, node, xpath_expr, target):
        """Move every descendant of ``node`` matching ``xpath_expr`` into ``target``.

        The expression is re-evaluated after each move because moving a node
        also moves its own descendants. Returns True if anything was moved.
        """
        moved = False
        while True:
            hits = node.xpath(xpath_expr)
            if not hits:
                return moved
            self.remove_node(hits[0])
            target.append(hits[0])
            moved = True

    def _try_forum_assembly(self, normal_tree, body_html):
        """Return ``body_html`` extended with forum posts found in ``normal_tree``.

        ``normal_tree`` is mutated: the main content and every node matched by
        ``Forum_XPATH`` are removed from it. ``body_html`` is returned
        unchanged when it is empty or cannot be parsed.
        """
        if not body_html:
            return body_html

        try:
            body_html_tree = fromstring(body_html)
        except Exception:
            return body_html

        # Depending on the lxml version, ``.body`` either raises or returns
        # None when the parsed fragment has no <body>; handle both.
        try:
            body_tree = body_html_tree.body
        except Exception:
            body_tree = None
        if body_tree is None:
            body_tree = Element("body")
            body_tree.extend(body_html_tree)

        main_ids = body_tree.xpath(f".//@{Unique_ID}")

        # Drop the already-extracted main content from the working tree so it
        # is not picked up a second time by the forum XPaths below.
        for main_id in main_ids:
            main_tree = normal_tree.xpath(f".//*[@{Unique_ID}={main_id}]")
            if main_tree:
                try:
                    self.remove_node(main_tree[0])
                except Exception:
                    pass

        if not main_ids:
            main_ids = [-1]

        for c_xpath in Forum_XPATH:
            checks_post_id = "'post-'" in c_xpath or "'post_'" in c_xpath
            while True:
                matches = normal_tree.xpath(c_xpath)
                if not matches:
                    break

                x = matches[0]
                self.remove_node(x)

                elem_id = x.attrib.get("id", "").lower()
                # For post-id rules only keep nodes whose id is post-<n>/post_<n>.
                if checks_post_id and not re.search(r"post[-_]\d+", elem_id):
                    continue

                if "header" in x.attrib.get("class", "").lower() or "header" in elem_id:
                    continue

                # Best effort: a node that cannot be placed is silently dropped.
                try:
                    node_id = int(x.attrib.get(Unique_ID, "0"))
                    last_main_id = int(main_ids[-1])

                    if node_id > last_main_id:
                        # Node comes after the main content in document order.
                        body_tree.append(x)
                    else:
                        # Node encloses the main content's position: split its
                        # descendants into those after and those before it.
                        prefix_div = Element("div")
                        suffix_div = Element("div")
                        need_suffix = self._move_matching_descendants(
                            x,
                            f".//*[number(@{Unique_ID}) > {last_main_id}]",
                            suffix_div,
                        )
                        need_prefix = self._move_matching_descendants(
                            x,
                            f".//*[number(@{Unique_ID}) < {last_main_id}]",
                            prefix_div,
                        )
                        if need_prefix:
                            body_tree.insert(0, prefix_div)
                        if need_suffix:
                            body_tree.append(suffix_div)
                except Exception:
                    pass

        # Strip the internal marker attributes from the serialised output.
        result_html = re.sub(
            rf' {Unique_ID}="\d+"',
            "",
            tostring(body_tree, encoding=str),
        )

        return result_html
|
ultradata_math_parser/readability_plus.py
ADDED
|
@@ -0,0 +1,539 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding:utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from lxml.etree import tounicode
|
| 4 |
+
from lxml.html import document_fromstring, fragment_fromstring
|
| 5 |
+
|
| 6 |
+
from ultradata_math_parser.utils import *
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def to_int(x):
    """Convert a CSS-like length string to an integer number of pixels.

    A trailing ``px`` is dropped and a trailing ``em`` is scaled by 12.
    Decimal values such as ``"1.5em"`` are accepted and truncated toward zero
    after scaling.

    Returns:
        The integer value, or None when ``x`` is empty or None.

    Raises:
        ValueError: the numeric part cannot be parsed.
    """
    if not x:
        return None
    text = x.strip()
    factor = 1
    if text.endswith("px"):
        text = text[:-2]
    elif text.endswith("em"):
        text, factor = text[:-2], 12

    # Exact integer path first, so plain integers never go through float.
    try:
        return int(text) * factor
    except ValueError:
        pass

    # Decimal lengths; float() raises ValueError for non-numeric text.
    try:
        return int(float(text) * factor)
    except OverflowError as exc:
        # int(inf) raises OverflowError; keep the documented exception type.
        raise ValueError(f"invalid length value: {x!r}") from exc
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def clean(text):
    """Normalise whitespace in ``text`` and strip both ends.

    Applied in order: runs of 255 or more whitespace characters are capped at
    255 spaces, whitespace around newlines is collapsed to a single newline,
    and each tab or run of two or more spaces/tabs becomes one space.
    """
    substitutions = (
        (r"\s{255,}", " " * 255),
        (r"\s*\n\s*", "\n"),
        (r"\t|[ \t]{2,}", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def text_length(i):
    """Return the length of the whitespace-normalised text content of ``i``."""
    raw_text = i.text_content() or ""
    return len(clean(raw_text))
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# Attribute-name patterns (regex fragments) removed from tags by htmlstrip.
# The last entry matches event-handler attributes such as onclick/onload; it
# must be a regex, not a glob ("on*" would only match "o", "on", "onn", ...).
bad_attrs = [
    "width",
    "height",
    "style",
    "[-a-z]*color",
    "background[-a-z]*",
    "on[a-z]+",
]
# Quoted values may be empty (e.g. style=""), hence "*" rather than "+".
single_quoted = "'[^']*'"
double_quoted = '"[^"]*"'
non_space = "[^ \"'>]+"
# Matches one opening tag that contains an unwanted attribute. Group 1 is the
# part of the tag before the attribute and group 2 is the part after it.
htmlstrip = re.compile(
    "<([^>]+) "  # open + prefix
    "(?:" + "|".join(bad_attrs) + ") *= *"  # undesirable attribute name
    "(?:" + "|".join((non_space, single_quoted, double_quoted)) + ")"  # value
    "([^>]*)>",  # postfix + end
    re.I,
)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def clean_attributes(html):
    """Strip every attribute matched by ``htmlstrip`` from the tags in ``html``.

    Each pass removes one unwanted attribute per tag, so substitution is
    repeated until a pass changes nothing.
    """
    while True:
        html, replaced = htmlstrip.subn("<\\1\\2>", html)
        if not replaced:
            return html
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class Document:
|
| 54 |
+
def __init__(
    self,
    input,
    url=None,
    min_text_length=25,
    retry_length=250,
    xpath=False,
    handle_failures="discard",
    xp_num="others",
    need_comment=False,
):
    """Store the extraction settings and compile the classification regexes.

    When ``need_comment`` is true, the word "comment" is left out of the
    unlikely-candidate and negative patterns; everything else is identical.
    """
    self.input = input
    self.url = url
    self.min_text_length = min_text_length
    self.retry_length = retry_length
    self.xpath = xpath
    self.handle_failures = handle_failures
    self.xp_num = xp_num
    self.need_comment = need_comment

    self.html = None
    self.encoding = None
    self.positive_keywords = None
    self.negative_keywords = None

    # The only difference between the two modes is this one alternative.
    comment_alt = "" if need_comment else "comment|"
    unlikely_pattern = (
        "combx|" + comment_alt + "community|disqus|extra|foot|header|menu"
        "|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination"
        "|pager|popup|tweet|twitter"
    )
    negative_pattern = (
        "combx|" + comment_alt + "com-|contact|foot|footer|footnote|masthead"
        "|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor"
        "|shopping|tags|tool|widget"
    )
    self.REGEXES = {
        "unlikelyCandidatesRe": re.compile(unlikely_pattern, re.I),
        "okMaybeItsACandidateRe": re.compile(
            r"and|article|body|column|main|shadow", re.I
        ),
        "positiveRe": re.compile(
            r"article|body|content|entry|hentry|main|page|pagination|post|text|blog|story",
            re.I,
        ),
        "negativeRe": re.compile(negative_pattern, re.I),
        "divToPElementsRe": re.compile(
            r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I
        ),
        "videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I),
    }
|
| 121 |
+
|
| 122 |
+
def _html(self, force=False):
    """Return the working tree, (re)building it when ``force`` is set or it
    has not been built yet.

    When ``self.xpath`` is enabled, every node of a freshly built tree gets
    an ``x`` attribute holding its path within the tree.
    """
    if force or self.html is None:
        self.html = self._parse(self.input)
        if self.xpath:
            root = self.html.getroottree()
            # iter() replaces the deprecated getiterator().
            for node in self.html.iter():
                node.attrib["x"] = root.getpath(node)
    return self.html
|
| 130 |
+
|
| 131 |
+
def _parse(self, input: HtmlElement):
    """Resolve the links of ``input`` in place and return it.

    With ``self.url`` set, links are made absolute against it; otherwise only
    the document's own ``<base href>`` is resolved. No re-parsing happens:
    the returned object is ``input`` itself.
    """
    doc = input
    base_href = self.url
    if base_href:
        try:
            doc.make_links_absolute(
                base_href,
                resolve_base_href=True,
                handle_failures=self.handle_failures,
            )
        except TypeError:
            # Presumably an lxml release that predates the handle_failures
            # keyword: retry without it rather than repeating the call that
            # just failed.
            doc.make_links_absolute(base_href, resolve_base_href=True)
    else:
        doc.resolve_base_href(handle_failures=self.handle_failures)
    return doc
|
| 150 |
+
|
| 151 |
+
def summary(self, html_partial=False):
    """Return the sanitised article, or None if any step raises.

    For ``xp_num == "others"`` the first pass is "ruthless": unlikely
    candidates are removed before scoring. If that pass finds no candidate
    or a result shorter than ``retry_length``, one more lenient pass runs.
    For any other ``xp_num`` no scoring is done and the <body> (or the whole
    tree when there is none) is sanitised directly.
    """
    try:
        ruthless = True
        while True:
            self._html(True)
            for body in self.tags(self.html, "body"):
                body.set("id", "readabilityplusBody")
            if ruthless and self.xp_num == "others":
                self.remove_unlikely_candidates()
            self.transform_misused_divs_into_paragraphs()

            if self.xp_num == "others":
                candidates = self.score_paragraphs()
                best_candidate = self.select_best_candidate(candidates)
            else:
                candidates = {}
                best_candidate = None
                ruthless = False

            if best_candidate:
                article = self.get_article(
                    candidates, best_candidate, html_partial=html_partial
                )
            elif ruthless:
                # Nothing found in ruthless mode: retry leniently.
                ruthless = False
                continue
            else:
                article = self.html.find("body")
                if article is None:
                    article = self.html

            cleaned_article = self.sanitize(article, candidates)

            of_acceptable_length = len(cleaned_article or "") >= self.retry_length
            if ruthless and not of_acceptable_length:
                # Too short in ruthless mode: retry leniently.
                ruthless = False
                continue
            return cleaned_article
    except Exception:
        return None
|
| 192 |
+
|
| 193 |
+
def get_article(self, candidates, best_candidate, html_partial=False):
    """Collect the best candidate and its qualifying siblings into a new tree.

    A sibling is kept when it is the best element itself, when its candidate
    score reaches ``max(10, 20% of the best score)``, or when it is a ``<p>``
    whose leading text is long with low link density, or short, link-free
    and containing a sentence-ending period.

    :param candidates: mapping of element -> ``{"content_score", "elem"}``.
    :param best_candidate: the winning entry of ``candidates``.
    :param html_partial: build the wrapper with ``fragment_fromstring`` when
        true, otherwise with ``document_fromstring``.
    :return: the wrapper tree holding the kept elements.
    """
    min_sibling_score = max([10, best_candidate["content_score"] * 0.2])
    if html_partial:
        output = fragment_fromstring("<div/>")
        container = output
    else:
        output = document_fromstring("<div/>")
        # The <div/> sits two levels below the document root
        # (presumably html > body > div).
        container = output.getchildren()[0].getchildren()[0]

    best_elem = best_candidate["elem"]
    parent = best_elem.getparent()
    siblings = [best_elem] if parent is None else parent.getchildren()

    for sibling in siblings:
        keep = sibling is best_elem
        if (
            sibling in candidates
            and candidates[sibling]["content_score"] >= min_sibling_score
        ):
            keep = True

        if sibling.tag == "p":
            density = self.get_link_density(sibling)
            lead_text = sibling.text or ""
            if len(lead_text) > 80:
                if density < 0.25:
                    keep = True
            elif density == 0 and re.search(r"\.( |$)", lead_text):
                keep = True

        if keep:
            # Appending moves the element out of its original parent.
            container.append(sibling)
    return output
|
| 233 |
+
|
| 234 |
+
def select_best_candidate(self, candidates):
    """Return the candidate entry with the highest ``content_score``.

    :param candidates: mapping of element -> dict with a ``"content_score"`` key.
    :return: the highest-scoring entry (the first one encountered on ties),
        or ``None`` when ``candidates`` is empty.
    """
    if not candidates:
        return None
    # max() returns the first maximal item, which matches what a stable
    # descending sort followed by [0] produced, without sorting everything.
    return max(candidates.values(), key=lambda entry: entry["content_score"])
|
| 246 |
+
|
| 247 |
+
def get_link_density(self, elem):
    """Return the ratio of link text length to total text length of ``elem``.

    Lengths are measured with ``text_length``; the denominator is clamped to
    at least 1 so an empty element yields 0.0 instead of dividing by zero.
    """
    anchor_chars = sum(text_length(anchor) for anchor in elem.findall(".//a"))
    all_chars = text_length(elem)
    return float(anchor_chars) / max(all_chars, 1)
|
| 253 |
+
|
| 254 |
+
def score_paragraphs(self):
    """Score the parents and grandparents of text-bearing elements.

    Every ``<p>``, ``<pre>`` and ``<td>`` whose cleaned text is at least
    ``self.min_text_length`` long adds points to its parent (full amount)
    and grandparent (half). Each candidate's score is finally scaled by
    ``1 - link density``.

    :return: dict mapping element -> ``{"content_score", "elem"}``.
    """
    min_len = self.min_text_length
    candidates = {}
    visit_order = []

    for elem in self.tags(self._html(), "p", "pre", "td"):
        parent = elem.getparent()
        if parent is None:
            continue
        grand_parent = parent.getparent()

        text = clean(elem.text_content() or "")
        text_size = len(text)
        if text_size < min_len:
            continue

        for ancestor in (parent, grand_parent):
            if ancestor is not None and ancestor not in candidates:
                candidates[ancestor] = self.score_node(ancestor)
                visit_order.append(ancestor)

        # NOTE(review): the comma-separated part count is added twice; the
        # second term may originally have targeted a different separator
        # (e.g. a full-width comma) -- confirm against upstream.
        points = 1
        points += len(text.split(","))
        points += len(text.split(","))
        points += min((text_size / 100), 3)

        candidates[parent]["content_score"] += points
        if grand_parent is not None:
            candidates[grand_parent]["content_score"] += points / 2.0

    # Penalise link-heavy candidates.
    for node in visit_order:
        density = self.get_link_density(node)
        candidates[node]["content_score"] *= 1 - density

    return candidates
|
| 295 |
+
|
| 296 |
+
def class_weight(self, e):
    """Return a score adjustment derived from ``e``'s class, id and tag.

    Each of the ``class`` and ``id`` attributes contributes +25 for a
    ``positiveRe`` hit, -25 for a ``negativeRe`` hit (only when
    ``self.xp_num == "others"``), and +/-25 for the optional
    ``positive_keywords`` / ``negative_keywords`` patterns. The keyword
    patterns are additionally matched against ``"tag-<tagname>"``.
    """
    weight = 0
    for feature in (e.get("class", None), e.get("id", None)):
        if not feature:
            continue
        if self.xp_num == "others" and self.REGEXES["negativeRe"].search(feature):
            weight -= 25
        if self.REGEXES["positiveRe"].search(feature):
            weight += 25
        if self.positive_keywords and self.positive_keywords.search(feature):
            weight += 25
        if self.negative_keywords and self.negative_keywords.search(feature):
            weight -= 25

    for pattern, delta in ((self.positive_keywords, 25), (self.negative_keywords, -25)):
        if pattern and pattern.match("tag-" + e.tag):
            weight += delta

    return weight
|
| 323 |
+
|
| 324 |
+
def score_node(self, elem):
    """Build the initial candidate entry for ``elem``.

    The score starts at ``self.class_weight(elem)`` and is adjusted by a
    fixed amount depending on the (lower-cased) tag name.

    :return: ``{"content_score": <number>, "elem": elem}``.
    """
    tag_adjustments = (
        (("div", "article"), 5),
        (("pre", "td", "blockquote"), 3),
        (("address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"), -3),
        (("h1", "h2", "h3", "h4", "h5", "h6", "th", "header", "footer", "nav"), -5),
    )
    content_score = self.class_weight(elem)
    name = elem.tag.lower()
    for tag_group, delta in tag_adjustments:
        if name in tag_group:
            content_score += delta
            break
    return {"content_score": content_score, "elem": elem}
|
| 347 |
+
|
| 348 |
+
def remove_unlikely_candidates(self):
    """Drop elements whose class/id look like boilerplate.

    An element is removed when its ``"<class> <id>"`` string matches
    ``unlikelyCandidatesRe`` but not ``okMaybeItsACandidateRe``. ``<html>``
    and ``<body>`` are never removed.
    """
    unlikely = self.REGEXES["unlikelyCandidatesRe"]
    maybe_ok = self.REGEXES["okMaybeItsACandidateRe"]
    for elem in self.html.findall(".//*"):
        signature = "%s %s" % (elem.get("class", ""), elem.get("id", ""))
        if len(signature) < 2:
            # Neither attribute is set (only the separating space is present).
            continue
        if elem.tag in ["html", "body"]:
            continue
        if unlikely.search(signature) and not maybe_ok.search(signature):
            elem.drop_tree()
|
| 359 |
+
|
| 360 |
+
def transform_misused_divs_into_paragraphs(self):
    """Normalise ``<div>`` elements that are really used as paragraphs.

    Pass 1: a ``<div>`` whose serialized children do not match
    ``divToPElementsRe`` is retagged as ``<p>``.
    Pass 2: in the remaining ``<div>``s, loose text (the div's own leading
    text and each child's tail) is wrapped in new ``<p>`` elements, and
    ``<br>`` children are dropped.
    """
    block_level_re = self.REGEXES["divToPElementsRe"]
    for div in self.tags(self.html, "div"):
        # str() of the joined bytes is kept as-is: the pattern is matched
        # against the bytes repr, exactly like the original code did.
        children_markup = str(b"".join(map(tostring, list(div))))
        if not block_level_re.search(children_markup):
            div.tag = "p"

    for div in self.tags(self.html, "div"):
        leading = div.text
        if leading and leading.strip():
            wrapper = fragment_fromstring("<p/>")
            wrapper.text = leading
            div.text = None
            div.insert(0, wrapper)

        # Walk the children back-to-front so insertions do not shift the
        # positions of children still to be visited.
        for index, child in reversed(list(enumerate(div))):
            trailing = child.tail
            if trailing and trailing.strip():
                wrapper = fragment_fromstring("<p/>")
                wrapper.text = trailing
                child.tail = None
                div.insert(index + 1, wrapper)
            if child.tag == "br":
                child.drop_tree()
|
| 382 |
+
|
| 383 |
+
def tags(self, node, *tag_names):
    """Lazily yield every descendant of ``node`` for each tag name in turn.

    Results are grouped by tag name in the order given; within a group they
    come in the order ``node.findall`` returns them.
    """
    for tag_name in tag_names:
        yield from node.findall(f".//{tag_name}")
|
| 387 |
+
|
| 388 |
+
def reverse_tags(self, node, *tag_names):
    """Like ``tags`` but yield each tag name's matches in reverse order.

    Tag names are still processed in the order given; only the matches
    within each name are reversed.
    """
    for tag_name in tag_names:
        matches = node.findall(f".//{tag_name}")
        yield from reversed(matches)
|
| 392 |
+
|
| 393 |
+
def sanitize(self, node, candidates):
    """Prune boilerplate from ``node`` and return the cleaned HTML string.

    Steps, in order:
      1. Drop headings with a negative class weight or link density > 0.33.
      2. Replace video iframes (``src`` matches ``videoRe``) with the text
         ``"VIDEO"``; drop every other iframe.
      3. Conditionally drop container elements (table, ul, div, aside,
         header, footer, section) based on score, element counts, text
         length and link density.

    Side effect: ``self.html`` is set to ``node``.

    :param node: root element of the article to clean (modified in place).
    :param candidates: mapping element -> ``{"content_score", ...}`` as
        produced by ``score_paragraphs`` (may be empty).
    :return: the result of ``self.get_clean_html()``.
    """
    MIN_LEN = self.min_text_length

    for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
        if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
            header.drop_tree()

    for elem in self.tags(node, "iframe"):
        if "src" in elem.attrib and self.REGEXES["videoRe"].search(elem.attrib["src"]):
            elem.text = "VIDEO"
        else:
            elem.drop_tree()

    # Elements explicitly protected from removal by an earlier iteration.
    allowed = {}
    # Conditionally clean <table>s, <ul>s, and <div>s
    for el in self.reverse_tags(
        node, "table", "ul", "div", "aside", "header", "footer", "section"
    ):
        if el in allowed:
            continue
        weight = self.class_weight(el)
        content_score = candidates[el]["content_score"] if el in candidates else 0
        tag = el.tag

        if weight + content_score < 0:
            el.drop_tree()
            continue

        text = el.text_content()
        # NOTE(review): the same comma is counted twice; the second term may
        # originally have targeted a full-width comma -- confirm upstream.
        if text.count(",") + text.count(",") >= 10:
            # Comma-rich text is treated as real content and left alone.
            continue

        counts = {}
        for kind in ["p", "img", "li", "a", "embed", "input"]:
            counts[kind] = len(el.findall(".//%s" % kind))
        counts["li"] -= 100
        counts["input"] -= len(el.findall('.//input[@type="hidden"]'))

        content_length = text_length(el)
        link_density = self.get_link_density(el)
        to_remove = False

        # Local modification: a <div> containing any image is always kept.
        if tag == "div" and counts["img"] >= 1:
            continue

        # The first two branches intentionally do nothing except stop the
        # chain (their removals were disabled in the original code).
        if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
            pass  # too many images -- removal disabled
        elif counts["li"] > counts["p"] and tag not in ("ol", "ul"):
            pass  # more <li>s than <p>s -- removal disabled
        elif counts["input"] > (counts["p"] / 3):
            # less than 3x <p>s than <input>s
            to_remove = True
        elif content_length < MIN_LEN and counts["img"] == 0:
            # Short code blocks must not be removed just for being short.
            if tag in ["code", "pre"]:
                continue
            if ancestor_node_check(el, ["code", "pre"]):
                continue
            # Keep links that live inside a table cell.
            if tag in ["ul", "div"] and ancestor_node_check(el, ["td"]):
                continue
            # too short content length without a single image
            to_remove = True
        elif content_length < MIN_LEN and counts["img"] > 2:
            # too short content length and too many images
            to_remove = True
        elif weight < 25 and link_density > 0.2:
            if tag in ["div", "ul", "table"]:
                non_link_text = el.xpath(".//text()[not(ancestor::a)]")
                non_link_len = text_len("".join(non_link_text))
                if non_link_len >= MIN_LEN and link_density <= 0.3:
                    continue
            if tag == "table":
                if len(el.xpath(".//tr[1]/td")) >= 2:
                    continue
            if tag == "div":
                if el.xpath(".//table"):
                    continue
            # too many links for its weight
            to_remove = True
        elif weight >= 25 and link_density > 0.5:
            if tag == "table":
                if len(el.xpath(".//tr[1]/td")) >= 2:
                    continue
            if tag == "div":
                if el.xpath(".//table"):
                    continue
            # too many links for its weight
            to_remove = True
        elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
            # <embed>s with too short content length, or too many <embed>s
            to_remove = True
        elif not content_length:
            # no content
            to_remove = True

        # Text length of the nearest non-empty following and preceding
        # sibling. (The original used "i = +1" counters compared against 1,
        # which amounts to stopping at the first non-empty sibling.)
        siblings = []
        for sib in el.itersiblings():
            sib_content_length = text_length(sib)
            if sib_content_length:
                siblings.append(sib_content_length)
                break
        for sib in el.itersiblings(preceding=True):
            sib_content_length = text_length(sib)
            if sib_content_length:
                siblings.append(sib_content_length)
                break

        if siblings and sum(siblings) > 1000:
            # Surrounded by substantial text: keep it and protect its
            # nested containers from later removal.
            to_remove = False
            for desnode in self.tags(el, "table", "ul", "div", "section"):
                allowed[desnode] = True

        if to_remove:
            el.drop_tree()

    self.html = node
    return self.get_clean_html()
|
| 537 |
+
|
| 538 |
+
def get_clean_html(self):
    """Serialize ``self.html`` as HTML and pass it through ``clean_attributes``."""
    serialized = tounicode(self.html, method="html")
    return clean_attributes(serialized)
|
ultradata_math_parser/utils.py
ADDED
|
@@ -0,0 +1,499 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding:utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import re
|
| 5 |
+
import logging
|
| 6 |
+
import subprocess
|
| 7 |
+
import tempfile
|
| 8 |
+
from gzip import decompress
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
from lxml import etree
|
| 12 |
+
from lxml.html import Element, HtmlElement, HTMLParser, fromstring, tostring
|
| 13 |
+
from lxml.html.clean import Cleaner
|
| 14 |
+
from urllib3.response import HTTPResponse
|
| 15 |
+
from ultradata_math_parser.config import Unique_ID
|
| 16 |
+
|
| 17 |
+
try:
|
| 18 |
+
import brotli
|
| 19 |
+
except ImportError:
|
| 20 |
+
brotli = None
|
| 21 |
+
|
| 22 |
+
try:
|
| 23 |
+
from cchardet import detect as cchardet_detect
|
| 24 |
+
except ImportError:
|
| 25 |
+
cchardet_detect = None
|
| 26 |
+
|
| 27 |
+
from difflib import SequenceMatcher
|
| 28 |
+
|
| 29 |
+
from charset_normalizer import from_bytes
|
| 30 |
+
|
| 31 |
+
# Shared lxml HTML parser: UTF-8 input, comments and processing
# instructions removed while parsing.
HTML_PARSER = HTMLParser(
    encoding="utf-8",
    collect_ids=False,
    default_doctype=False,
    remove_comments=True,
    remove_pis=True,
)
# Matches a self-closed "<!DOCTYPE ... />" tag at the start of a string.
DOCTYPE_TAG = re.compile("^< ?! ?DOCTYPE.+?/ ?>", re.I)
# Encoding names filtered out of detect_encoding() guesses.
UNICODE_ALIASES = {"utf-8", "utf_8"}

# Cleaner configured to strip only comments and processing instructions;
# every other cleaning option is switched off.
HTML_CLEANER = Cleaner(
    comments=True,
    processing_instructions=True,
    annoying_tags=False,
    embedded=False,
    forms=False,
    frames=False,
    javascript=False,
    links=False,
    meta=False,
    page_structure=False,
    remove_unknown_tags=False,
    safe_attrs_only=False,
    scripts=False,
    style=False,
)

# Matches LaTeX "\textcolor[...]{...}" commands.
color_regex = re.compile(r"\\textcolor\[.*?\]\{.*?\}")

# Class names identifying LaTeX images (see the callers for exact usage).
latex_image_class_names = [
    "latexcenter",
    "latex",
    "tex",
    "latexdisplay",
    "latexblock",
    "latexblockcenter",
]
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _translator():
    """Import and return the ``py_asciimath`` translator module.

    The root logger's level is captured before the import and restored
    afterwards (presumably because importing the package changes it).
    """
    root_logger = logging.getLogger()
    saved_level = root_logger.level
    try:
        import py_asciimath.translator.translator as translator_module

        return translator_module
    finally:
        root_logger.setLevel(saved_level)


def ASCIIMath2Tex(*args, **kwargs):
    """Construct ``py_asciimath``'s ``ASCIIMath2Tex``, forwarding all arguments."""
    module = _translator()
    return module.ASCIIMath2Tex(*args, **kwargs)


def MathML2Tex(*args, **kwargs):
    """Construct ``py_asciimath``'s ``MathML2Tex``, forwarding all arguments."""
    module = _translator()
    return module.MathML2Tex(*args, **kwargs)


# Module-wide AsciiMath -> TeX translator instance, created at import time.
asciimath2tex = ASCIIMath2Tex(log=False)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def lcs_of_2(a, b):
    """Return the longest contiguous block shared by sequences ``a`` and ``b``.

    The result is a slice of ``a`` (empty when nothing is shared).
    """
    match = SequenceMatcher(None, a, b).find_longest_match(0, len(a), 0, len(b))
    return a[match.a: match.a + match.size]


def lcs_of_list(*args):
    """Return the longest contiguous block common to all given sequences.

    The sequences are folded from the right, i.e.
    ``lcs_of_2(a0, lcs_of_2(a1, lcs_of_2(a2, a3)))``. A single argument is
    returned unchanged (previously this raised ``IndexError``).

    :raises IndexError: if called without arguments.
    """
    common = args[-1]
    for sequence in reversed(args[:-1]):
        common = lcs_of_2(sequence, common)
    return common
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def isutf8(data):
    """Return True if ``data`` decodes as UTF-8 without errors, else False."""
    try:
        data.decode("UTF-8")
    except UnicodeDecodeError:
        return False
    else:
        return True
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def handle_compressed_file(filecontent):
    """Best-effort transparent decompression of ``filecontent``.

    ``bytes`` starting with the gzip magic number are gunzipped; other
    ``bytes`` are tried as brotli when the optional ``brotli`` module is
    available. If decompression fails, or the input is not ``bytes``, the
    input is returned unchanged.
    """
    # Local import: a corrupt deflate stream raises zlib.error, which is not
    # an OSError subclass and previously escaped this best-effort helper.
    import zlib

    if not isinstance(filecontent, bytes):
        return filecontent

    if filecontent[:2] == b"\x1f\x8b":
        try:
            return decompress(filecontent)
        except (EOFError, OSError, zlib.error):
            return filecontent

    if brotli is not None:
        try:
            return brotli.decompress(filecontent)
        except brotli.error:
            return filecontent

    return filecontent
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def detect_encoding(bytesobject):
    """Return a list of candidate encoding names for ``bytesobject``.

    Valid UTF-8 short-circuits to ``["utf-8"]``. Otherwise the guesses come
    from ``cchardet`` (when installed) followed by ``charset_normalizer``
    (first on the leading 15000 bytes, then on the whole input if that
    yields nothing), with UTF-8 aliases filtered out.
    """
    if isutf8(bytesobject):
        return ["utf-8"]

    candidates = []
    if cchardet_detect is not None:
        fast_guess = cchardet_detect(bytesobject)["encoding"]
        if fast_guess is not None:
            candidates.append(fast_guess.lower())

    matches = from_bytes(bytesobject[:15000]) or from_bytes(bytesobject)
    if len(matches) > 0:
        candidates.extend([match.encoding for match in matches])

    return [name for name in candidates if name not in UNICODE_ALIASES]
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def decode_file(filecontent):
    """Return ``filecontent`` as ``str``.

    ``str`` input is returned untouched. ``bytes`` are decompressed if
    needed, then decoded with the first guessed encoding that works; if none
    works (or the result is empty) they are decoded as UTF-8 with
    replacement characters.
    """
    if isinstance(filecontent, str):
        return filecontent

    raw = handle_compressed_file(filecontent)
    decoded = None
    for candidate in detect_encoding(raw):
        try:
            decoded = raw.decode(candidate)
        except (LookupError, UnicodeDecodeError):
            decoded = None
            continue
        break

    if decoded:
        return decoded
    return str(raw, encoding="utf-8", errors="replace")
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def strip_faulty_doctypes(htmlstring: str, beginning: str) -> str:
    """Remove a ``DOCTYPE_TAG`` match from the first line of ``htmlstring``.

    Nothing is done unless ``beginning`` (the lower-cased start of the
    document) contains ``"doctype"``. When it does, the result always has a
    newline after the first line, even if the input had none.
    """
    if "doctype" not in beginning:
        return htmlstring
    first_line, _, remainder = htmlstring.partition("\n")
    cleaned_first_line = DOCTYPE_TAG.sub("", first_line, count=1)
    return cleaned_first_line + "\n" + remainder
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def is_dubious_html(beginning: str) -> bool:
    """Return True when ``beginning`` does not contain the substring ``"html"``."""
    mentions_html = "html" in beginning
    return not mentions_html
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def fromstring_bytes(htmlobject):
    """Parse ``htmlobject`` after encoding it to UTF-8 bytes.

    The string is encoded with the ``surrogatepass`` error handler and fed
    to ``fromstring`` with the shared ``HTML_PARSER``.

    :return: the parsed tree, or ``None`` if anything goes wrong.
    """
    try:
        encoded = htmlobject.encode("utf8", "surrogatepass")
        return fromstring(encoded, parser=HTML_PARSER)
    except Exception:
        # Best-effort fallback parser: failures are reported as None.
        return None
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def ancestor_node_check(node: HtmlElement, tags: list):
    """Return True if ``node`` has an ancestor whose tag is in ``tags``."""
    return any(node.xpath(f'ancestor::{tag}[1]') for tag in tags)
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def load_html(htmlobject):
    """Turn ``htmlobject`` into an lxml HTML tree.

    Accepts an ``HtmlElement`` (returned as-is), an object with a ``data``
    attribute such as a urllib3 ``HTTPResponse``, or ``bytes``/``str``.

    :return: the parsed tree, or ``None`` when parsing fails or when the
        input does not mention "html" in its first 50 characters and the
        parsed root has fewer than two children.
    :raises TypeError: if the (unwrapped) input is neither bytes nor str.
    """
    if isinstance(htmlobject, HtmlElement):
        return htmlobject
    if isinstance(htmlobject, HTTPResponse) or hasattr(htmlobject, "data"):
        htmlobject = htmlobject.data
    if not isinstance(htmlobject, (bytes, str)):
        raise TypeError("incompatible input type", type(htmlobject))

    markup = decode_file(htmlobject)
    head = markup[:50].lower()
    dubious = is_dubious_html(head)
    markup = strip_faulty_doctypes(markup, head)

    tree = None
    used_fallback = False
    try:
        tree = fromstring(markup, parser=HTML_PARSER)
    except ValueError:
        # Retry the parse from UTF-8 encoded bytes instead of str.
        used_fallback = True
        tree = fromstring_bytes(markup)
    except Exception:
        pass

    # Empty or missing result: give the bytes-based parser one chance.
    if not used_fallback and (tree is None or len(tree) < 1):
        tree = fromstring_bytes(markup)

    # Reject near-empty trees for input that did not look like HTML.
    if tree is not None and dubious is True and len(tree) < 2:
        return None
    return tree
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
class W3MError(RuntimeError):
    """Raised when w3m rendering fails."""


def run_w3m_dump(html_content: str, w3m_path: str, *, columns: int = 200) -> str:
    """
    Render HTML content into plain text using w3m.

    The HTML is written to a temporary ``.html`` file which is passed to
    ``w3m -dump`` and removed afterwards, whether or not rendering succeeds.

    :param html_content: HTML snippet to render (``None``/empty renders "").
    :param w3m_path: Path to the w3m executable.
    :param columns: Column width passed to w3m (-cols).
    :return: Rendered plain text.
    :raises W3MError: (a ``RuntimeError``) if no path is given, the
        executable is not found, or w3m returns a non-zero exit code.
    """
    if not w3m_path:
        raise W3MError("w3m path must be provided")

    # delete=False: the file is closed before w3m reads it by name, and is
    # removed explicitly in the ``finally`` clause below.
    tmp_file = tempfile.NamedTemporaryFile(
        mode="w", suffix=".html", delete=False, encoding="utf-8"
    )
    tmp_path = tmp_file.name
    try:
        # The context manager closes (and thereby flushes) the handle even
        # if the write itself fails.
        with tmp_file:
            tmp_file.write(html_content or "")

        command = [
            w3m_path,
            "-dump",
            "-T",
            "text/html",
            "-cols",
            str(columns),
            tmp_path,
        ]
        try:
            completed = subprocess.run(
                command,
                check=True,
                capture_output=True,
                text=True,
            )
        except FileNotFoundError as exc:
            raise W3MError(f"w3m executable not found at '{w3m_path}'") from exc
        except subprocess.CalledProcessError as exc:
            stderr = (exc.stderr or "").strip()
            message = f"w3m exited with status {exc.returncode}"
            if stderr:
                message = f"{message}: {stderr}"
            raise W3MError(message) from exc

        return completed.stdout
    finally:
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
def is_empty_element(node: HtmlElement):
    """Return True if ``node`` has no children and no (non-empty) text."""
    if node.getchildren():
        return False
    return not node.text
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def iter_node(element: HtmlElement):
    """Yield ``element`` and then, depth-first, all ``HtmlElement`` descendants.

    Children that are not ``HtmlElement`` instances are skipped together
    with their subtrees.
    """
    yield element
    for child in element:
        if not isinstance(child, HtmlElement):
            continue
        yield from iter_node(child)
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
def img_div_check(tree):
    """
    Return False when ``tree`` contains exactly one <img> and fewer than
    four descendant elements (a div like that should be kept); True otherwise.
    """
    has_single_image = len(tree.xpath(".//img")) == 1
    is_small_image_block = has_single_image and len(tree.xpath(".//*")) < 4
    return not is_small_image_block
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
def text_len(s):
    """Return a language-aware length estimate for ``s``.

    The result is the number of whitespace-separated tokens plus the number
    of individual characters in the CJK ideograph (U+4E00-U+9FFF),
    hiragana/katakana (U+3040-U+30FF) and Arabic (U+0600-U+06FF) ranges.
    """
    # Collapse runs of spaces into one space, and runs of line/tab
    # characters into a single newline.
    s = re.sub(" +", " ", s)
    s = re.sub("[\n\t\r]+", "\n", s)

    per_character_scripts = (
        r"[\u4e00-\u9fff]",
        r"[\u3040-\u309F\u30A0-\u30FF]",
        r"[\u0600-\u06FF]",
    )
    total = len(s.split())
    for pattern in per_character_scripts:
        total += len(re.findall(pattern, s))
    return total
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def alias(element):
    """Build a structural signature string for ``element``.

    The signature is ``<tag>[attr="value"]...:<children summary>``:

    * ``None`` yields ``""``; ``html``/``body`` yield just the tag name.
    * Attribute names/values have whitespace removed and ``-<digits>``
      suffixes stripped from values; the ``Unique_ID`` attribute is ignored.
    * Each ``dt``/``dd``/``li`` child contributes its own child count;
      every other child contributes its tag followed by ``[attr]`` for each
      attribute (names only).
    """
    if element is None:
        return ""
    tag = element.tag
    # html/body carry no useful signature beyond their tag name.
    if tag in ["html", "body"]:
        return tag

    parts = [tag]
    for k, v in element.attrib.items():
        if k == Unique_ID:
            continue
        k, v = re.sub(r"\s*", "", k), re.sub(r"\s*", "", v)
        v = re.sub(r"-\d+", "", v)
        parts.append(f'[{k}="{v}"]' if v else f"[{k}]")
    result = "".join(parts)

    # Surface the direct children's attributes in the parent's signature.
    nth = ""
    for child in element:
        if child.tag in ["dt", "dd", "li"]:
            # List-like items are summarised by their number of children.
            nth += str(len(child))
            continue
        child_parts = [child.tag]
        for k in child.attrib:
            if k == Unique_ID:
                continue
            # Only attribute names are recorded for children.
            child_parts.append("[%s]" % re.sub(r"\s*", "", k))
        nth += "".join(child_parts)

    result += f":{nth}"
    return result
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
def similarity2(s1, s2):
    """Return the Jaccard similarity of the character sets of ``s1`` and ``s2``.

    Returns 0 when either input is empty/falsy.
    """
    if not (s1 and s2):
        return 0
    chars1, chars2 = set(s1), set(s2)
    shared = chars1 & chars2
    combined = chars1 | chars2
    return len(shared) / len(combined)
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
def similarity_with_element(element1, element2):
    """Return ``similarity2`` of the two elements' ``alias`` signatures."""
    return similarity2(alias(element1), alias(element2))
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
def similarity_with_siblings(element, siblings):
    """Return the mean similarity between ``element`` and its ``siblings``.

    The lowest score is discarded before averaging. With no siblings the
    result is 0; with exactly one sibling its score is returned directly
    (discarding it would leave nothing to average, and ``np.mean([])``
    yields NaN plus a RuntimeWarning).
    """
    # TODO: maybe compare all children not only alias
    scores = [similarity_with_element(element, sibling) for sibling in siblings]
    if not scores:
        return 0
    if len(scores) == 1:
        return scores[0]
    # Drop one occurrence of the lowest value.
    scores.remove(min(scores))
    return np.mean(scores)
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
def number_of_a_char(ele, xpath=".//a//text()"):
    """Return ``text_len`` of the stripped, concatenated text selected by ``xpath``.

    With the default XPath this measures the text inside ``<a>`` descendants.
    """
    joined = "".join(ele.xpath(xpath))
    return text_len(joined.strip())
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
def number_of_char(ele, xpath=".//text()"):
    """Return ``text_len`` of the text selected by ``xpath``, plus one.

    The result is therefore never zero for non-negative ``text_len`` values
    (``density_of_a_text`` divides by it).
    """
    joined = "".join(ele.xpath(xpath))
    return 1 + text_len(joined.strip())
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
def density_of_a_text(ele, pre=0.7):
    """Return True if link text makes up at least ``pre`` of ``ele``'s text.

    The ratio is ``number_of_a_char(ele) / number_of_char(ele)``.
    """
    link_chars = number_of_a_char(ele)
    total_chars = number_of_char(ele)
    return link_chars / total_chars >= pre
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
def uniquify_list(l):
    """Return a new list with duplicates removed, keeping first-seen order.

    Items must be hashable.
    """
    first_seen = dict.fromkeys(l)
    return [*first_seen]
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
def trim(string):
    """Collapse all whitespace runs in ``string`` to single spaces.

    Leading and trailing whitespace is removed. Returns ``None`` when the
    input does not support the required string operations (e.g. ``None``).
    """
    try:
        words = string.split()
        return " ".join(words).strip()
    except (AttributeError, TypeError):
        return None
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
def collect_link_info(links_xpath, favor_precision=False):
    """Gather text statistics over a collection of link elements.

    Links whose trimmed text is empty are ignored. A link counts as "short"
    when its text is under 10 characters (50 with ``favor_precision``).

    :return: ``(total text length, number of links, number of short links,
        list of link texts)``.
    """
    short_limit = 50 if favor_precision else 10
    link_texts = [
        link_text
        for link_text in (trim(link.text_content()) for link in links_xpath)
        if link_text
    ]
    short_count = sum(1 for link_text in link_texts if len(link_text) < short_limit)
    total_length = sum(len(link_text) for link_text in link_texts)
    return total_length, len(link_texts), short_count, link_texts
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
def link_density_test(element, text, favor_precision=False):
    """Decide whether ``element`` is dominated by links.

    Only elements whose ``text`` is shorter than a tag- and position-
    dependent limit are examined. Such an element is flagged when it has
    links but none with text, or when ``density_of_a_text(element, 0.5)``
    holds and either the link text exceeds 80% of ``len(text)`` or more
    than 80% of several links are short.

    :return: ``(flagged, link_texts)``; ``link_texts`` is empty unless the
        links were actually inspected.
    """
    links = element.findall(".//a")
    if not links:
        return False, []

    threshold = 0.8
    is_last = element.getnext() is None
    if element.tag == "p":
        if favor_precision is False:
            limitlen = 60 if is_last else 30
        else:
            limitlen = 200
    else:
        limitlen = 300 if is_last else 100

    elemlen = len(text)
    if elemlen >= limitlen:
        return False, []

    linklen, elemnum, shortelems, link_texts = collect_link_info(
        links, favor_precision
    )
    if elemnum == 0:
        return True, link_texts
    if density_of_a_text(element, 0.5):
        mostly_short = elemnum > 1 and shortelems / elemnum > 0.8
        if linklen > threshold * elemlen or mostly_short:
            return True, link_texts
    return False, link_texts
|
| 446 |
+
|
| 447 |
+
|
| 448 |
+
def text_strip(text):
    """Return ``text.strip()``; falsy input (``None``, ``""``) is returned as-is."""
    if not text:
        return text
    return text.strip()
|
| 450 |
+
|
| 451 |
+
|
| 452 |
+
def wrap_math(s, display=False):
    """Normalise a math string and wrap it in ``$`` (or ``$$``) delimiters.

    Whitespace runs are collapsed, ``\\textcolor[...]{...}`` commands,
    existing ``$`` signs and literal ``\\n`` sequences are removed. An empty
    result, or one containing ``"align"``, is returned without delimiters.

    :param display: use ``$$...$$`` instead of ``$...$``.
    """
    cleaned = re.sub(r"\s+", " ", s)
    cleaned = color_regex.sub("", cleaned)
    cleaned = cleaned.replace("$", "")
    cleaned = cleaned.replace("\n", " ").replace("\\n", "")
    cleaned = cleaned.strip()

    # Nothing left, or already inside an align environment: do not wrap.
    if len(cleaned) == 0 or "align" in cleaned:
        return cleaned

    delimiter = "$$" if display else "$"
    return delimiter + cleaned + delimiter
|
| 466 |
+
|
| 467 |
+
|
| 468 |
+
def extract_asciimath(s):
    """Translate the AsciiMath string ``s`` with the module-level ``asciimath2tex``."""
    return asciimath2tex.translate(s)
|
| 471 |
+
|
| 472 |
+
|
| 473 |
+
# The MathML -> LaTeX stylesheet lives in the "mmltex" directory next to
# this module; it is parsed and compiled once at import time.
cur_file = os.path.abspath(__file__)
xsl_path = os.path.join(os.path.dirname(cur_file), "mmltex/mmltex.xsl")

xslt = etree.parse(xsl_path)
transform = etree.XSLT(xslt)


def mml_to_latex(mml_code):
    """Convert a MathML string to LaTeX using the mmltex XSLT stylesheet.

    Before parsing, the markup is patched up:

    * a bare ``<math>`` tag gets the MathML namespace (tags that already
      carry attributes are left untouched -- attributes are NOT stripped;
      the earlier "remove attributes" substitution replaced each tag with
      itself and has been dropped);
    * HTML-escaped (``&quot;``) and backslash-escaped quotes are turned into
      plain double quotes;
    * a value opened with ``"`` but closed with ``'`` is re-closed with ``"``.

    :param mml_code: MathML source text.
    :return: the stylesheet output as ``str``.
    :raises lxml.etree.XMLSyntaxError: if the patched markup is still not
        well-formed XML.
    """
    mml_ns = mml_code.replace(
        "<math>", '<math xmlns="http://www.w3.org/1998/Math/MathML">'
    )  # Required.

    mml_ns = mml_ns.replace("&quot;", '"')
    mml_ns = mml_ns.replace("'\\\"", '"').replace("\\\"'", '"')

    # Many pages ship tags with broken attribute quoting; repair the
    # "...' mismatch so the XML parser accepts them.
    pattern = r'"([^"]+?)\''
    mml_ns = re.sub(pattern, r'"\1"', mml_ns)

    mml_dom = etree.fromstring(mml_ns)
    return str(transform(mml_dom))
|