Sina1138 committed on
Commit
623f4c7
·
1 Parent(s): 2b125b7

Add semantic similarity detection for sentences: implement _find_corresponding function to identify similar sentences across reviews and enhance summary card rendering with corresponding sentence matches.

Browse files
Files changed (1) hide show
  1. interface/Demo.py +83 -0
interface/Demo.py CHANGED
@@ -28,6 +28,53 @@ def _make_sentence_id(sentence: str) -> str:
28
  return "sent_" + hashlib.md5(sentence.encode("utf-8")).hexdigest()[:12]
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  def _get_context(sentence: str, sentence_lists: list):
32
  """Return (context_before, context_after) strings for the first review containing sentence."""
33
  for sl in sentence_lists:
@@ -133,6 +180,41 @@ def format_summary_cards(
133
  f"setTimeout(function(){{el.style.outline='';}},2500);}}}})();"
134
  )
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  cards_html += (
137
  f'<div style="border:1px solid #e5e7eb;border-left:3px solid {border_color};'
138
  f'border-radius:6px;padding:10px 14px;margin-bottom:6px;cursor:pointer;" '
@@ -141,6 +223,7 @@ def format_summary_cards(
141
  f'{before_html}'
142
  f'<div style="color:#111827;line-height:1.5;padding:2px 0;">{_html.escape(sent)}</div>'
143
  f'{after_html}'
 
144
  f'</div>'
145
  )
146
 
 
28
  return "sent_" + hashlib.md5(sentence.encode("utf-8")).hexdigest()[:12]
29
 
30
 
31
+ def _find_corresponding(
32
+ sent: str,
33
+ source_review_indices: list,
34
+ sentence_lists: list,
35
+ listener: dict,
36
+ min_similarity: float = 0.70,
37
+ ) -> list:
38
+ """
39
+ Find the most semantically similar sentence in each OTHER review using
40
+ only the listener distribution L_t(d|s).
41
+
42
+ Two sentences with similar listener vectors "apply to" the same set of
43
+ reviewers → they express the same concept in different words.
44
+
45
+ Uses dot product of listener probability vectors (equivalent to
46
+ Bhattacharyya coefficient since vectors are already normalized to sum=1).
47
+
48
+ Returns: [(review_num, sentence, similarity), ...] for reviews that have
49
+ a match above min_similarity.
50
+ """
51
+ if sent not in listener:
52
+ return []
53
+
54
+ vec_a = listener[sent] # {R1: prob, R2: prob, ...}
55
+ results = []
56
+
57
+ for r_idx, sl in enumerate(sentence_lists):
58
+ if r_idx in source_review_indices:
59
+ continue # skip the source review
60
+
61
+ best_sent, best_sim = None, 0.0
62
+ for candidate in sl:
63
+ if candidate == sent or candidate not in listener:
64
+ continue
65
+ vec_b = listener[candidate]
66
+ # Dot product of probability vectors
67
+ sim = sum(vec_a.get(k, 0.0) * vec_b.get(k, 0.0) for k in vec_a)
68
+ if sim > best_sim:
69
+ best_sim = sim
70
+ best_sent = candidate
71
+
72
+ if best_sent and best_sim >= min_similarity:
73
+ results.append((r_idx + 1, best_sent, best_sim))
74
+
75
+ return results
76
+
77
+
78
  def _get_context(sentence: str, sentence_lists: list):
79
  """Return (context_before, context_after) strings for the first review containing sentence."""
80
  for sl in sentence_lists:
 
180
  f"setTimeout(function(){{el.style.outline='';}},2500);}}}})();"
181
  )
182
 
183
+ # --- Corresponding sentences from other reviews ---
184
+ corr_html = ""
185
+ if listener:
186
+ source_indices = [r_idx for r_idx, sl in enumerate(sentence_lists) if sent in sl]
187
+ correspondences = _find_corresponding(sent, source_indices, sentence_lists, listener)
188
+ if correspondences:
189
+ corr_parts = []
190
+ for r_num, corr_sent, sim in correspondences:
191
+ corr_id = _make_sentence_id(corr_sent)
192
+ pct_sim = int(round(sim * 100))
193
+ corr_onclick = (
194
+ f"event.stopPropagation();"
195
+ f"var el=document.getElementById('{corr_id}');"
196
+ f"if(el){{el.scrollIntoView({{behavior:'smooth',block:'center'}});"
197
+ f"el.style.outline='3px solid #10b981';"
198
+ f"setTimeout(function(){{el.style.outline='';}},2500);}}"
199
+ )
200
+ corr_parts.append(
201
+ f'<div style="margin-top:4px;padding:4px 8px;background:#f0fdf4;'
202
+ f'border-radius:4px;cursor:pointer;" onclick="{_html.escape(corr_onclick)}">'
203
+ f'<span style="font-size:0.72em;font-weight:600;color:#065f46;">'
204
+ f'Review {r_num}</span> '
205
+ f'<span style="font-size:0.72em;color:#6b7280;">({pct_sim}% match)</span>'
206
+ f'<div style="font-size:0.85em;color:#374151;line-height:1.4;">'
207
+ f'{_html.escape(corr_sent[:150])}{"..." if len(corr_sent) > 150 else ""}</div>'
208
+ f'</div>'
209
+ )
210
+ corr_html = (
211
+ f'<div style="margin-top:6px;border-top:1px solid #e5e7eb;padding-top:6px;">'
212
+ f'<div style="font-size:0.72em;font-weight:600;color:#6b7280;margin-bottom:3px;">'
213
+ f'Similar in other reviews:</div>'
214
+ + "".join(corr_parts)
215
+ + "</div>"
216
+ )
217
+
218
  cards_html += (
219
  f'<div style="border:1px solid #e5e7eb;border-left:3px solid {border_color};'
220
  f'border-radius:6px;padding:10px 14px;margin-bottom:6px;cursor:pointer;" '
 
223
  f'{before_html}'
224
  f'<div style="color:#111827;line-height:1.5;padding:2px 0;">{_html.escape(sent)}</div>'
225
  f'{after_html}'
226
+ f'{corr_html}'
227
  f'</div>'
228
  )
229