Upload concordancer.py with huggingface_hub
Browse files
- concordancer.py +37 -16
concordancer.py
CHANGED
|
@@ -72,7 +72,7 @@ mark {
|
|
| 72 |
)
|
| 73 |
|
| 74 |
|
| 75 |
-
def _highlight(text, query):
|
| 76 |
if not text or not query:
|
| 77 |
return html.escape(text)
|
| 78 |
escaped_text = html.escape(text)
|
|
@@ -81,15 +81,17 @@ def _highlight(text, query):
|
|
| 81 |
rf"({regex.escape(escaped_query)})",
|
| 82 |
r'<mark style="background:#fef08a;padding:1px 2px;border-radius:2px">\1</mark>',
|
| 83 |
escaped_text,
|
| 84 |
-
flags=regex.IGNORECASE | regex.V1,
|
| 85 |
)
|
| 86 |
|
| 87 |
|
| 88 |
-
def _highlight_regex(text, pattern):
|
| 89 |
if not text or not pattern:
|
| 90 |
return html.escape(text)
|
| 91 |
try:
|
| 92 |
-
compiled = regex.compile(
|
|
|
|
|
|
|
| 93 |
except regex.error:
|
| 94 |
return html.escape(text)
|
| 95 |
|
|
@@ -173,6 +175,7 @@ with st.form("search_form", clear_on_submit=False):
|
|
| 173 |
)
|
| 174 |
with col2:
|
| 175 |
use_regex = st.checkbox("Regex", value=False)
|
|
|
|
| 176 |
submitted = st.form_submit_button("Submit", use_container_width=True)
|
| 177 |
with col3:
|
| 178 |
per_page = st.selectbox("Per page", [10, 25, 50, 100], index=1)
|
|
@@ -193,6 +196,7 @@ if law_id_filter:
|
|
| 193 |
search_signature = (
|
| 194 |
query,
|
| 195 |
use_regex,
|
|
|
|
| 196 |
per_page,
|
| 197 |
cat_filter,
|
| 198 |
law_id_filter,
|
|
@@ -210,6 +214,7 @@ if query:
|
|
| 210 |
conn,
|
| 211 |
query,
|
| 212 |
use_regex=use_regex,
|
|
|
|
| 213 |
law_id=law_id_filter,
|
| 214 |
category=cat_filter,
|
| 215 |
article_no=article_filter,
|
|
@@ -264,11 +269,11 @@ if query:
|
|
| 264 |
en_text = row["en_text"] or ""
|
| 265 |
|
| 266 |
if query and use_regex:
|
| 267 |
-
zh_display = _highlight_regex(zh_text, query)
|
| 268 |
-
en_display = _highlight_regex(en_text, query)
|
| 269 |
elif query and not use_regex:
|
| 270 |
-
zh_display = _highlight(zh_text, query)
|
| 271 |
-
en_display = _highlight(en_text, query)
|
| 272 |
else:
|
| 273 |
zh_display = html.escape(zh_text)
|
| 274 |
en_display = html.escape(en_text)
|
|
@@ -305,11 +310,19 @@ if query:
|
|
| 305 |
para_zh = _join_sentences(para["sentences"], "zh")
|
| 306 |
para_en = _join_sentences(para["sentences"], "en")
|
| 307 |
if query and use_regex:
|
| 308 |
-
para_zh_display = _highlight_regex(
|
| 309 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
elif query and not use_regex:
|
| 311 |
-
para_zh_display = _highlight(
|
| 312 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
else:
|
| 314 |
para_zh_display = html.escape(para_zh)
|
| 315 |
para_en_display = html.escape(para_en)
|
|
@@ -337,11 +350,19 @@ if query:
|
|
| 337 |
art_zh = _join_sentences(para["sentences"], "zh")
|
| 338 |
art_en = _join_sentences(para["sentences"], "en")
|
| 339 |
if query and use_regex:
|
| 340 |
-
art_zh_display = _highlight_regex(
|
| 341 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
elif query and not use_regex:
|
| 343 |
-
art_zh_display = _highlight(
|
| 344 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
else:
|
| 346 |
art_zh_display = html.escape(art_zh)
|
| 347 |
art_en_display = html.escape(art_en)
|
|
|
|
| 72 |
)
|
| 73 |
|
| 74 |
|
| 75 |
+
def _highlight(text, query, case_sensitive=False):
|
| 76 |
if not text or not query:
|
| 77 |
return html.escape(text)
|
| 78 |
escaped_text = html.escape(text)
|
|
|
|
| 81 |
rf"({regex.escape(escaped_query)})",
|
| 82 |
r'<mark style="background:#fef08a;padding:1px 2px;border-radius:2px">\1</mark>',
|
| 83 |
escaped_text,
|
| 84 |
+
flags=regex.V1 if case_sensitive else regex.IGNORECASE | regex.V1,
|
| 85 |
)
|
| 86 |
|
| 87 |
|
| 88 |
+
def _highlight_regex(text, pattern, case_sensitive=False):
|
| 89 |
if not text or not pattern:
|
| 90 |
return html.escape(text)
|
| 91 |
try:
|
| 92 |
+
compiled = regex.compile(
|
| 93 |
+
pattern, flags=regex.V1 if case_sensitive else regex.IGNORECASE | regex.V1
|
| 94 |
+
)
|
| 95 |
except regex.error:
|
| 96 |
return html.escape(text)
|
| 97 |
|
|
|
|
| 175 |
)
|
| 176 |
with col2:
|
| 177 |
use_regex = st.checkbox("Regex", value=False)
|
| 178 |
+
case_sensitive = st.checkbox("Case sensitive", value=False)
|
| 179 |
submitted = st.form_submit_button("Submit", use_container_width=True)
|
| 180 |
with col3:
|
| 181 |
per_page = st.selectbox("Per page", [10, 25, 50, 100], index=1)
|
|
|
|
| 196 |
search_signature = (
|
| 197 |
query,
|
| 198 |
use_regex,
|
| 199 |
+
case_sensitive,
|
| 200 |
per_page,
|
| 201 |
cat_filter,
|
| 202 |
law_id_filter,
|
|
|
|
| 214 |
conn,
|
| 215 |
query,
|
| 216 |
use_regex=use_regex,
|
| 217 |
+
case_sensitive=case_sensitive,
|
| 218 |
law_id=law_id_filter,
|
| 219 |
category=cat_filter,
|
| 220 |
article_no=article_filter,
|
|
|
|
| 269 |
en_text = row["en_text"] or ""
|
| 270 |
|
| 271 |
if query and use_regex:
|
| 272 |
+
zh_display = _highlight_regex(zh_text, query, case_sensitive=case_sensitive)
|
| 273 |
+
en_display = _highlight_regex(en_text, query, case_sensitive=case_sensitive)
|
| 274 |
elif query and not use_regex:
|
| 275 |
+
zh_display = _highlight(zh_text, query, case_sensitive=case_sensitive)
|
| 276 |
+
en_display = _highlight(en_text, query, case_sensitive=case_sensitive)
|
| 277 |
else:
|
| 278 |
zh_display = html.escape(zh_text)
|
| 279 |
en_display = html.escape(en_text)
|
|
|
|
| 310 |
para_zh = _join_sentences(para["sentences"], "zh")
|
| 311 |
para_en = _join_sentences(para["sentences"], "en")
|
| 312 |
if query and use_regex:
|
| 313 |
+
para_zh_display = _highlight_regex(
|
| 314 |
+
para_zh, query, case_sensitive=case_sensitive
|
| 315 |
+
)
|
| 316 |
+
para_en_display = _highlight_regex(
|
| 317 |
+
para_en, query, case_sensitive=case_sensitive
|
| 318 |
+
)
|
| 319 |
elif query and not use_regex:
|
| 320 |
+
para_zh_display = _highlight(
|
| 321 |
+
para_zh, query, case_sensitive=case_sensitive
|
| 322 |
+
)
|
| 323 |
+
para_en_display = _highlight(
|
| 324 |
+
para_en, query, case_sensitive=case_sensitive
|
| 325 |
+
)
|
| 326 |
else:
|
| 327 |
para_zh_display = html.escape(para_zh)
|
| 328 |
para_en_display = html.escape(para_en)
|
|
|
|
| 350 |
art_zh = _join_sentences(para["sentences"], "zh")
|
| 351 |
art_en = _join_sentences(para["sentences"], "en")
|
| 352 |
if query and use_regex:
|
| 353 |
+
art_zh_display = _highlight_regex(
|
| 354 |
+
art_zh, query, case_sensitive=case_sensitive
|
| 355 |
+
)
|
| 356 |
+
art_en_display = _highlight_regex(
|
| 357 |
+
art_en, query, case_sensitive=case_sensitive
|
| 358 |
+
)
|
| 359 |
elif query and not use_regex:
|
| 360 |
+
art_zh_display = _highlight(
|
| 361 |
+
art_zh, query, case_sensitive=case_sensitive
|
| 362 |
+
)
|
| 363 |
+
art_en_display = _highlight(
|
| 364 |
+
art_en, query, case_sensitive=case_sensitive
|
| 365 |
+
)
|
| 366 |
else:
|
| 367 |
art_zh_display = html.escape(art_zh)
|
| 368 |
art_en_display = html.escape(art_en)
|