rubentsui committed on
Commit
4fe179b
·
verified ·
1 Parent(s): 23eb8b8

Upload concordancer.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. concordancer.py +37 -16
concordancer.py CHANGED
@@ -72,7 +72,7 @@ mark {
72
  )
73
 
74
 
75
- def _highlight(text, query):
76
  if not text or not query:
77
  return html.escape(text)
78
  escaped_text = html.escape(text)
@@ -81,15 +81,17 @@ def _highlight(text, query):
81
  rf"({regex.escape(escaped_query)})",
82
  r'<mark style="background:#fef08a;padding:1px 2px;border-radius:2px">\1</mark>',
83
  escaped_text,
84
- flags=regex.IGNORECASE | regex.V1,
85
  )
86
 
87
 
88
- def _highlight_regex(text, pattern):
89
  if not text or not pattern:
90
  return html.escape(text)
91
  try:
92
- compiled = regex.compile(pattern, flags=regex.IGNORECASE | regex.V1)
 
 
93
  except regex.error:
94
  return html.escape(text)
95
 
@@ -173,6 +175,7 @@ with st.form("search_form", clear_on_submit=False):
173
  )
174
  with col2:
175
  use_regex = st.checkbox("Regex", value=False)
 
176
  submitted = st.form_submit_button("Submit", use_container_width=True)
177
  with col3:
178
  per_page = st.selectbox("Per page", [10, 25, 50, 100], index=1)
@@ -193,6 +196,7 @@ if law_id_filter:
193
  search_signature = (
194
  query,
195
  use_regex,
 
196
  per_page,
197
  cat_filter,
198
  law_id_filter,
@@ -210,6 +214,7 @@ if query:
210
  conn,
211
  query,
212
  use_regex=use_regex,
 
213
  law_id=law_id_filter,
214
  category=cat_filter,
215
  article_no=article_filter,
@@ -264,11 +269,11 @@ if query:
264
  en_text = row["en_text"] or ""
265
 
266
  if query and use_regex:
267
- zh_display = _highlight_regex(zh_text, query)
268
- en_display = _highlight_regex(en_text, query)
269
  elif query and not use_regex:
270
- zh_display = _highlight(zh_text, query)
271
- en_display = _highlight(en_text, query)
272
  else:
273
  zh_display = html.escape(zh_text)
274
  en_display = html.escape(en_text)
@@ -305,11 +310,19 @@ if query:
305
  para_zh = _join_sentences(para["sentences"], "zh")
306
  para_en = _join_sentences(para["sentences"], "en")
307
  if query and use_regex:
308
- para_zh_display = _highlight_regex(para_zh, query)
309
- para_en_display = _highlight_regex(para_en, query)
 
 
 
 
310
  elif query and not use_regex:
311
- para_zh_display = _highlight(para_zh, query)
312
- para_en_display = _highlight(para_en, query)
 
 
 
 
313
  else:
314
  para_zh_display = html.escape(para_zh)
315
  para_en_display = html.escape(para_en)
@@ -337,11 +350,19 @@ if query:
337
  art_zh = _join_sentences(para["sentences"], "zh")
338
  art_en = _join_sentences(para["sentences"], "en")
339
  if query and use_regex:
340
- art_zh_display = _highlight_regex(art_zh, query)
341
- art_en_display = _highlight_regex(art_en, query)
 
 
 
 
342
  elif query and not use_regex:
343
- art_zh_display = _highlight(art_zh, query)
344
- art_en_display = _highlight(art_en, query)
 
 
 
 
345
  else:
346
  art_zh_display = html.escape(art_zh)
347
  art_en_display = html.escape(art_en)
 
72
  )
73
 
74
 
75
+ def _highlight(text, query, case_sensitive=False):
76
  if not text or not query:
77
  return html.escape(text)
78
  escaped_text = html.escape(text)
 
81
  rf"({regex.escape(escaped_query)})",
82
  r'<mark style="background:#fef08a;padding:1px 2px;border-radius:2px">\1</mark>',
83
  escaped_text,
84
+ flags=regex.V1 if case_sensitive else regex.IGNORECASE | regex.V1,
85
  )
86
 
87
 
88
+ def _highlight_regex(text, pattern, case_sensitive=False):
89
  if not text or not pattern:
90
  return html.escape(text)
91
  try:
92
+ compiled = regex.compile(
93
+ pattern, flags=regex.V1 if case_sensitive else regex.IGNORECASE | regex.V1
94
+ )
95
  except regex.error:
96
  return html.escape(text)
97
 
 
175
  )
176
  with col2:
177
  use_regex = st.checkbox("Regex", value=False)
178
+ case_sensitive = st.checkbox("Case sensitive", value=False)
179
  submitted = st.form_submit_button("Submit", use_container_width=True)
180
  with col3:
181
  per_page = st.selectbox("Per page", [10, 25, 50, 100], index=1)
 
196
  search_signature = (
197
  query,
198
  use_regex,
199
+ case_sensitive,
200
  per_page,
201
  cat_filter,
202
  law_id_filter,
 
214
  conn,
215
  query,
216
  use_regex=use_regex,
217
+ case_sensitive=case_sensitive,
218
  law_id=law_id_filter,
219
  category=cat_filter,
220
  article_no=article_filter,
 
269
  en_text = row["en_text"] or ""
270
 
271
  if query and use_regex:
272
+ zh_display = _highlight_regex(zh_text, query, case_sensitive=case_sensitive)
273
+ en_display = _highlight_regex(en_text, query, case_sensitive=case_sensitive)
274
  elif query and not use_regex:
275
+ zh_display = _highlight(zh_text, query, case_sensitive=case_sensitive)
276
+ en_display = _highlight(en_text, query, case_sensitive=case_sensitive)
277
  else:
278
  zh_display = html.escape(zh_text)
279
  en_display = html.escape(en_text)
 
310
  para_zh = _join_sentences(para["sentences"], "zh")
311
  para_en = _join_sentences(para["sentences"], "en")
312
  if query and use_regex:
313
+ para_zh_display = _highlight_regex(
314
+ para_zh, query, case_sensitive=case_sensitive
315
+ )
316
+ para_en_display = _highlight_regex(
317
+ para_en, query, case_sensitive=case_sensitive
318
+ )
319
  elif query and not use_regex:
320
+ para_zh_display = _highlight(
321
+ para_zh, query, case_sensitive=case_sensitive
322
+ )
323
+ para_en_display = _highlight(
324
+ para_en, query, case_sensitive=case_sensitive
325
+ )
326
  else:
327
  para_zh_display = html.escape(para_zh)
328
  para_en_display = html.escape(para_en)
 
350
  art_zh = _join_sentences(para["sentences"], "zh")
351
  art_en = _join_sentences(para["sentences"], "en")
352
  if query and use_regex:
353
+ art_zh_display = _highlight_regex(
354
+ art_zh, query, case_sensitive=case_sensitive
355
+ )
356
+ art_en_display = _highlight_regex(
357
+ art_en, query, case_sensitive=case_sensitive
358
+ )
359
  elif query and not use_regex:
360
+ art_zh_display = _highlight(
361
+ art_zh, query, case_sensitive=case_sensitive
362
+ )
363
+ art_en_display = _highlight(
364
+ art_en, query, case_sensitive=case_sensitive
365
+ )
366
  else:
367
  art_zh_display = html.escape(art_zh)
368
  art_en_display = html.escape(art_en)