Kalana committed on
Commit
728fd3f
Β·
2 Parent(s): a9209c7845bb58

Merge branch 'main' of https://huggingface.co/spaces/Kalana001/SinCode

Browse files
Files changed (2) hide show
  1. app.py +181 -30
  2. core/decoder.py +73 -0
app.py CHANGED
@@ -5,36 +5,55 @@ SinCode Web UI β€” Streamlit interface for the transliteration engine.
5
  import streamlit as st
6
  import time
7
  import os
 
 
8
  import base64
 
 
9
  from PIL import Image
10
  from sincode_model import BeamSearchDecoder
11
 
 
 
12
  st.set_page_config(page_title="ΰ·ƒΰ·’ΰΆ‚Code", page_icon="πŸ‡±πŸ‡°", layout="centered")
13
 
14
 
15
  # ─── Helpers ─────────────────────────────────────────────────────────────────
16
 
17
- def _set_background(image_file: str) -> None:
18
- """Inject a dark-overlay background from a local image."""
 
19
  try:
20
  with open(image_file, "rb") as f:
21
  b64 = base64.b64encode(f.read()).decode()
22
- st.markdown(
23
- f"""
24
- <style>
25
- .stApp {{
26
- background-image: linear-gradient(rgba(0,0,0,0.7), rgba(0,0,0,0.7)),
27
- url(data:image/png;base64,{b64});
28
- background-size: cover;
29
- background-position: center;
30
- background-attachment: fixed;
31
- }}
32
- </style>
33
- """,
34
- unsafe_allow_html=True,
35
  )
36
  except FileNotFoundError:
37
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
 
40
  @st.cache_resource
@@ -52,7 +71,7 @@ def _load_decoder() -> BeamSearchDecoder:
52
  _set_background("images/background.png")
53
 
54
  with st.sidebar:
55
- st.image(Image.open("images/SinCodeLogo.jpg"), width=200)
56
  st.title("ΰ·ƒΰ·’ΰΆ‚Code Project")
57
  st.info("Prototype")
58
 
@@ -99,21 +118,153 @@ if st.button("Transliterate", type="primary", use_container_width=True) and inpu
99
  with st.spinner("Processing..."):
100
  decoder = _load_decoder()
101
  t0 = time.time()
102
- result, trace_logs = decoder.decode(input_text, mode=decode_mode)
 
 
 
103
  elapsed = time.time() - t0
104
 
105
- st.success("Transliteration Complete")
106
- st.markdown(f"### {result}")
107
- st.caption(f"Mode: {decode_mode} Β· Time: {round(elapsed, 2)}s")
108
-
109
- with st.expander("Scoring Breakdown", expanded=True):
110
- st.caption(
111
- "MLM = contextual fit Β· Fid = transliteration fidelity Β· "
112
- "Rank = dictionary prior Β· πŸ”€ = English"
113
- )
114
- for log in trace_logs:
115
- st.markdown(log)
116
- st.divider()
117
 
118
  except Exception as e:
119
  st.error(f"Error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import streamlit as st
6
  import time
7
  import os
8
+ import csv
9
+ import html as html_lib
10
  import base64
11
+ from datetime import datetime
12
+ from pathlib import Path
13
  from PIL import Image
14
  from sincode_model import BeamSearchDecoder
15
 
16
+ FEEDBACK_FILE = Path("feedback.csv")
17
+
18
  st.set_page_config(page_title="ΰ·ƒΰ·’ΰΆ‚Code", page_icon="πŸ‡±πŸ‡°", layout="centered")
19
 
20
 
21
  # ─── Helpers ─────────────────────────────────────────────────────────────────
22
 
23
@st.cache_data
def _background_css(image_file: str) -> str:
    """Return the ``<style>`` block for the dark-overlay background image.

    The image is read and base64-encoded once; ``st.cache_data`` memoises
    the result per path, so Streamlit reruns skip the disk read.

    Args:
        image_file: Path to the background image (PNG expected by the CSS).

    Returns:
        The CSS string to inject, or an empty string when the image cannot
        be read, so the caller can simply skip injection.
    """
    try:
        with open(image_file, "rb") as f:
            b64 = base64.b64encode(f.read()).decode()
    except OSError:
        # The background is purely cosmetic: any read failure (missing file,
        # bad permissions, path is a directory, ...) must degrade gracefully
        # instead of crashing the app. OSError covers FileNotFoundError too.
        return ""
    return (
        f"<style>.stApp {{background-image: linear-gradient(rgba(0,0,0,0.7),"
        f"rgba(0,0,0,0.7)),url(data:image/png;base64,{b64});"
        f"background-size:cover;background-position:center;"
        f"background-attachment:fixed;}}</style>"
    )
37
+
38
+
39
def _set_background(image_file: str) -> None:
    """Inject the dark-overlay background CSS, if the image could be read."""
    style_block = _background_css(image_file)
    if not style_block:
        # Image was unreadable — leave the default Streamlit background.
        return
    st.markdown(style_block, unsafe_allow_html=True)
43
+
44
+
45
@st.cache_data
def _load_logo(image_file: str):
    """Load the sidebar logo image from disk, cached per path.

    NOTE(review): st.cache_data pickles the returned PIL Image between
    reruns — presumably acceptable for a small logo; confirm if the asset
    grows.
    """
    return Image.open(image_file)
48
+
49
+
50
def _save_feedback(input_sentence: str, original_output: str, corrected_output: str) -> None:
    """Append a full-sentence correction to the feedback CSV.

    A header row is emitted first when the file is new or empty (detected
    via ``tell() == 0`` on the append-mode handle).
    """
    row = [datetime.now().isoformat(), input_sentence, original_output, corrected_output]
    with FEEDBACK_FILE.open("a", newline="", encoding="utf-8") as fh:
        out = csv.writer(fh)
        if fh.tell() == 0:
            # Fresh/empty file: write the column header before the first row.
            out.writerow(["timestamp", "input_sentence", "original_output", "corrected_output"])
        out.writerow(row)
57
 
58
 
59
  @st.cache_resource
 
71
  _set_background("images/background.png")
72
 
73
  with st.sidebar:
74
+ st.image(_load_logo("images/SinCodeLogo.jpg"), width=200)
75
  st.title("ΰ·ƒΰ·’ΰΆ‚Code Project")
76
  st.info("Prototype")
77
 
 
118
  with st.spinner("Processing..."):
119
  decoder = _load_decoder()
120
  t0 = time.time()
121
+ if decode_mode == "greedy":
122
+ result, trace_logs, diagnostics = decoder.greedy_decode_with_diagnostics(input_text)
123
+ else:
124
+ result, trace_logs, diagnostics = decoder.decode_with_diagnostics(input_text)
125
  elapsed = time.time() - t0
126
 
127
+ # Store results in session state for interactive word swapping
128
+ selected = [d.selected_candidate for d in diagnostics]
129
+ st.session_state["diagnostics"] = diagnostics
130
+ st.session_state["output_words"] = selected
131
+ st.session_state["original_words"] = list(selected)
132
+ st.session_state["input_sentence"] = input_text
133
+ st.session_state["trace_logs"] = trace_logs
134
+ st.session_state["elapsed"] = elapsed
135
+ st.session_state["correction_mode"] = False
136
+ st.session_state["correction_submitted_for"] = None
 
 
137
 
138
  except Exception as e:
139
  st.error(f"Error: {e}")
140
+
141
+ # ─── Render output (persists across reruns for word swapping) ─────────────
142
+
143
+ if "output_words" in st.session_state and st.session_state["output_words"]:
144
+ diagnostics = st.session_state["diagnostics"]
145
+ output_words = st.session_state["output_words"]
146
+ original_words = st.session_state.get("original_words", list(output_words))
147
+ trace_logs = st.session_state["trace_logs"]
148
+ elapsed = st.session_state["elapsed"]
149
+
150
+ current_result = " ".join(output_words)
151
+ original_result = " ".join(original_words)
152
+ has_changes = output_words != original_words
153
+
154
+ st.success("Transliteration Complete")
155
+
156
+ # Output display with native copy button (st.code has built-in clipboard support)
157
+ safe_display = html_lib.escape(current_result)
158
+ st.markdown(
159
+ f'<span style="font-size:1.4em;font-weight:700;">{safe_display}</span>',
160
+ unsafe_allow_html=True,
161
+ )
162
+ st.code(current_result, language=None)
163
+ st.caption(f"Mode: {decode_mode} Β· Time: {round(elapsed, 2)}s")
164
+
165
+ # ── Correction mode toggle ────────────────────────────────────────
166
+ correction_mode = st.toggle(
167
+ "Correct this translation",
168
+ value=st.session_state.get("correction_mode", False),
169
+ key="correction_toggle",
170
+ )
171
+
172
+ if correction_mode:
173
+ st.caption("Use the buttons below to swap alternative transliterations.")
174
+
175
+ # ── Inline sentence display (natural text flow, no grid) ─────
176
+ word_spans = []
177
+ for i, diag in enumerate(diagnostics):
178
+ has_alts = len(diag.candidate_breakdown) > 1
179
+ was_changed = output_words[i] != original_words[i]
180
+ w = html_lib.escape(output_words[i])
181
+ if was_changed:
182
+ word_spans.append(
183
+ f'<span style="color:#68d391;font-weight:700;">{w} βœ“</span>'
184
+ )
185
+ elif has_alts:
186
+ word_spans.append(
187
+ f'<span style="color:#63b3ed;font-weight:700;'
188
+ f'border-bottom:2px dashed #63b3ed;cursor:default;">{w}</span>'
189
+ )
190
+ else:
191
+ word_spans.append(f'<span style="font-weight:600;">{w}</span>')
192
+
193
+ st.markdown(
194
+ '<div style="font-size:1.15em;line-height:2.4;">'
195
+ + " &ensp; ".join(word_spans)
196
+ + "</div>",
197
+ unsafe_allow_html=True,
198
+ )
199
+ # ── Popover buttons only for swappable words ─────────────────
200
+ swappable = [
201
+ (i, diag)
202
+ for i, diag in enumerate(diagnostics)
203
+ if len(diag.candidate_breakdown) > 1
204
+ ]
205
+ if swappable:
206
+ widths = [max(len(output_words[i]), 3) for i, _ in swappable]
207
+ cols = st.columns(widths, gap="small")
208
+
209
+ for col, (i, diag) in zip(cols, swappable):
210
+ was_changed = output_words[i] != original_words[i]
211
+ with col:
212
+ chip = (
213
+ f":green[**{output_words[i]}**] βœ“"
214
+ if was_changed
215
+ else f":blue[**{output_words[i]}**]"
216
+ )
217
+ with st.popover(chip, use_container_width=True):
218
+ st.markdown(f"**`{diag.input_word}`** β€” pick alternative:")
219
+ for scored in diag.candidate_breakdown[:5]:
220
+ eng_tag = " πŸ”€" if scored.is_english else ""
221
+ is_sel = scored.text == output_words[i]
222
+ if st.button(
223
+ f"{'βœ… ' if is_sel else ''}{scored.text}{eng_tag}",
224
+ key=f"alt_{i}_{scored.text}",
225
+ help=f"Score: {scored.combined_score:.2f}",
226
+ use_container_width=True,
227
+ type="primary" if is_sel else "secondary",
228
+ ):
229
+ st.session_state["output_words"][i] = scored.text
230
+ st.rerun()
231
+ st.markdown("---")
232
+ custom = st.text_input(
233
+ "Not listed? Type correct word:",
234
+ key=f"custom_{i}",
235
+ placeholder="Type Sinhala word",
236
+ )
237
+ if custom and st.button(
238
+ "Use this", key=f"custom_apply_{i}", use_container_width=True
239
+ ):
240
+ st.session_state["output_words"][i] = custom
241
+ st.rerun()
242
+
243
+ # ── Submit correction button (only when changes exist, once per result) ──
244
+ # Guard key: (original sentence, original output) β€” stable regardless of swaps
245
+ submit_key = (st.session_state["input_sentence"], original_result)
246
+ already_submitted = st.session_state.get("correction_submitted_for") == submit_key
247
+ if has_changes and not already_submitted:
248
+ st.info(f"**Original:** {original_result}\n\n**Corrected:** {current_result}")
249
+ if st.button("Submit Correction", type="primary", use_container_width=True):
250
+ _save_feedback(
251
+ input_sentence=st.session_state["input_sentence"],
252
+ original_output=original_result,
253
+ corrected_output=current_result,
254
+ )
255
+ st.session_state["correction_submitted_for"] = submit_key
256
+ st.session_state["correction_mode"] = False
257
+ st.toast("Correction submitted β€” thank you!")
258
+ st.rerun()
259
+
260
+ # Show outside toggle so it remains visible after submission closes the toggle
261
+ input_sent = st.session_state.get("input_sentence", "")
262
+ if st.session_state.get("correction_submitted_for") == (input_sent, original_result):
263
+ st.success("Correction already submitted.")
264
+
265
+ with st.expander("Scoring Breakdown", expanded=False):
266
+ st.caption(
267
+ "MLM = contextual fit Β· Fid = transliteration fidelity Β· "
268
+ "Rank = dictionary prior Β· πŸ”€ = English"
269
+ )
270
+ st.markdown("\n\n---\n\n".join(trace_logs))
core/decoder.py CHANGED
@@ -3,6 +3,7 @@ Beam search and greedy decoders for Singlish β†’ Sinhala transliteration.
3
  """
4
 
5
  import math
 
6
  import torch
7
  import pickle
8
  import logging
@@ -22,6 +23,14 @@ from core.dictionary import DictionaryAdapter
22
 
23
  logger = logging.getLogger(__name__)
24
 
 
 
 
 
 
 
 
 
25
 
26
  class BeamSearchDecoder:
27
  """
@@ -210,6 +219,20 @@ class BeamSearchDecoder:
210
  "dict_flags": [False],
211
  "prefix": prefix,
212
  "suffix": suffix,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  })
214
  continue
215
 
@@ -242,6 +265,7 @@ class BeamSearchDecoder:
242
  "dict_flags": dict_flags[:MAX_CANDIDATES],
243
  "prefix": prefix,
244
  "suffix": suffix,
 
245
  })
246
 
247
  # Build right-side stable context (rule outputs for future words)
@@ -268,6 +292,23 @@ class BeamSearchDecoder:
268
  suffix = info.get("suffix", "")
269
  total_cands = len(candidates)
270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  # ── Common-word shortcut ─────────────────────────────────
272
  core_lower = words[t].lower().strip()
273
  if core_lower in COMMON_WORDS:
@@ -464,6 +505,19 @@ class BeamSearchDecoder:
464
  "english_flags": [False],
465
  "prefix": prefix,
466
  "suffix": suffix,
 
 
 
 
 
 
 
 
 
 
 
 
 
467
  })
468
  continue
469
 
@@ -495,6 +549,7 @@ class BeamSearchDecoder:
495
  "dict_flags": dict_flags[:MAX_CANDIDATES],
496
  "prefix": prefix,
497
  "suffix": suffix,
 
498
  })
499
 
500
  # Build stable context (fixed for all beam paths)
@@ -521,6 +576,24 @@ class BeamSearchDecoder:
521
  suffix = info.get("suffix", "")
522
  total_cands = len(candidates)
523
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
524
  # ── Common-word shortcut ─────────────────────────────────
525
  core_lower = words[t].lower().strip()
526
  if core_lower in COMMON_WORDS:
 
3
  """
4
 
5
  import math
6
+ import re
7
  import torch
8
  import pickle
9
  import logging
 
23
 
24
  logger = logging.getLogger(__name__)
25
 
26
+ # Sinhala Unicode block: U+0D80 – U+0DFF
27
+ _SINHALA_RE = re.compile(r"[\u0D80-\u0DFF]")
28
+
29
+
30
+ def _is_sinhala(text: str) -> bool:
31
+ """Return True if the text already contains Sinhala script characters."""
32
+ return bool(_SINHALA_RE.search(text))
33
+
34
 
35
  class BeamSearchDecoder:
36
  """
 
219
  "dict_flags": [False],
220
  "prefix": prefix,
221
  "suffix": suffix,
222
+ "sinhala_passthrough": False,
223
+ })
224
+ continue
225
+
226
+ # Already-Sinhala text: pass through unchanged
227
+ if _is_sinhala(core):
228
+ word_infos.append({
229
+ "candidates": [raw],
230
+ "rule_output": raw,
231
+ "english_flags": [False],
232
+ "dict_flags": [False],
233
+ "prefix": prefix,
234
+ "suffix": suffix,
235
+ "sinhala_passthrough": True,
236
  })
237
  continue
238
 
 
265
  "dict_flags": dict_flags[:MAX_CANDIDATES],
266
  "prefix": prefix,
267
  "suffix": suffix,
268
+ "sinhala_passthrough": False,
269
  })
270
 
271
  # Build right-side stable context (rule outputs for future words)
 
292
  suffix = info.get("suffix", "")
293
  total_cands = len(candidates)
294
 
295
+ # ── Sinhala passthrough ────────────────────────────────────
296
+ if info.get("sinhala_passthrough"):
297
+ selected_words.append(words[t])
298
+ trace_logs.append(
299
+ f"**Step {t + 1}: `{words[t]}`** &nbsp;β†’ "
300
+ f"`{words[t]}` (Sinhala passthrough)\n"
301
+ )
302
+ diagnostics.append(WordDiagnostic(
303
+ step_index=t,
304
+ input_word=words[t],
305
+ rule_output=rule_out,
306
+ selected_candidate=words[t],
307
+ beam_score=0.0,
308
+ candidate_breakdown=[],
309
+ ))
310
+ continue
311
+
312
  # ── Common-word shortcut ─────────────────────────────────
313
  core_lower = words[t].lower().strip()
314
  if core_lower in COMMON_WORDS:
 
505
  "english_flags": [False],
506
  "prefix": prefix,
507
  "suffix": suffix,
508
+ "sinhala_passthrough": False,
509
+ })
510
+ continue
511
+
512
+ # Already-Sinhala text: pass through unchanged
513
+ if _is_sinhala(core):
514
+ word_infos.append({
515
+ "candidates": [raw],
516
+ "rule_output": raw,
517
+ "english_flags": [False],
518
+ "prefix": prefix,
519
+ "suffix": suffix,
520
+ "sinhala_passthrough": True,
521
  })
522
  continue
523
 
 
549
  "dict_flags": dict_flags[:MAX_CANDIDATES],
550
  "prefix": prefix,
551
  "suffix": suffix,
552
+ "sinhala_passthrough": False,
553
  })
554
 
555
  # Build stable context (fixed for all beam paths)
 
576
  suffix = info.get("suffix", "")
577
  total_cands = len(candidates)
578
 
579
+ # ── Sinhala passthrough ────────────────────────────────────
580
+ if info.get("sinhala_passthrough"):
581
+ next_beam_si = [(path + [words[t]], sc) for path, sc in beam]
582
+ beam = next_beam_si[:beam_width]
583
+ trace_logs.append(
584
+ f"**Step {t + 1}: `{words[t]}`** &nbsp;β†’ "
585
+ f"`{words[t]}` (Sinhala passthrough)\n"
586
+ )
587
+ diagnostics.append(WordDiagnostic(
588
+ step_index=t,
589
+ input_word=words[t],
590
+ rule_output=rule_out,
591
+ selected_candidate=words[t],
592
+ beam_score=beam[0][1] if beam else 0.0,
593
+ candidate_breakdown=[],
594
+ ))
595
+ continue
596
+
597
  # ── Common-word shortcut ─────────────────────────────────
598
  core_lower = words[t].lower().strip()
599
  if core_lower in COMMON_WORDS: