Sina1138 committed on
Commit ·
623f4c7
1
Parent(s): 2b125b7
Add semantic similarity detection for sentences: implement _find_corresponding function to identify similar sentences across reviews and enhance summary card rendering with corresponding sentence matches.
Browse files- interface/Demo.py +83 -0
interface/Demo.py
CHANGED
|
@@ -28,6 +28,53 @@ def _make_sentence_id(sentence: str) -> str:
|
|
| 28 |
return "sent_" + hashlib.md5(sentence.encode("utf-8")).hexdigest()[:12]
|
| 29 |
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
def _get_context(sentence: str, sentence_lists: list):
|
| 32 |
"""Return (context_before, context_after) strings for the first review containing sentence."""
|
| 33 |
for sl in sentence_lists:
|
|
@@ -133,6 +180,41 @@ def format_summary_cards(
|
|
| 133 |
f"setTimeout(function(){{el.style.outline='';}},2500);}}}})();"
|
| 134 |
)
|
| 135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
cards_html += (
|
| 137 |
f'<div style="border:1px solid #e5e7eb;border-left:3px solid {border_color};'
|
| 138 |
f'border-radius:6px;padding:10px 14px;margin-bottom:6px;cursor:pointer;" '
|
|
@@ -141,6 +223,7 @@ def format_summary_cards(
|
|
| 141 |
f'{before_html}'
|
| 142 |
f'<div style="color:#111827;line-height:1.5;padding:2px 0;">{_html.escape(sent)}</div>'
|
| 143 |
f'{after_html}'
|
|
|
|
| 144 |
f'</div>'
|
| 145 |
)
|
| 146 |
|
|
|
|
| 28 |
return "sent_" + hashlib.md5(sentence.encode("utf-8")).hexdigest()[:12]
|
| 29 |
|
| 30 |
|
| 31 |
+
def _find_corresponding(
|
| 32 |
+
sent: str,
|
| 33 |
+
source_review_indices: list,
|
| 34 |
+
sentence_lists: list,
|
| 35 |
+
listener: dict,
|
| 36 |
+
min_similarity: float = 0.70,
|
| 37 |
+
) -> list:
|
| 38 |
+
"""
|
| 39 |
+
Find the most semantically similar sentence in each OTHER review using
|
| 40 |
+
only the listener distribution L_t(d|s).
|
| 41 |
+
|
| 42 |
+
Two sentences with similar listener vectors "apply to" the same set of
|
| 43 |
+
reviewers → they express the same concept in different words.
|
| 44 |
+
|
| 45 |
+
Uses dot product of listener probability vectors (equivalent to
|
| 46 |
+
Bhattacharyya coefficient since vectors are already normalized to sum=1).
|
| 47 |
+
|
| 48 |
+
Returns: [(review_num, sentence, similarity), ...] for reviews that have
|
| 49 |
+
a match above min_similarity.
|
| 50 |
+
"""
|
| 51 |
+
if sent not in listener:
|
| 52 |
+
return []
|
| 53 |
+
|
| 54 |
+
vec_a = listener[sent] # {R1: prob, R2: prob, ...}
|
| 55 |
+
results = []
|
| 56 |
+
|
| 57 |
+
for r_idx, sl in enumerate(sentence_lists):
|
| 58 |
+
if r_idx in source_review_indices:
|
| 59 |
+
continue # skip the source review
|
| 60 |
+
|
| 61 |
+
best_sent, best_sim = None, 0.0
|
| 62 |
+
for candidate in sl:
|
| 63 |
+
if candidate == sent or candidate not in listener:
|
| 64 |
+
continue
|
| 65 |
+
vec_b = listener[candidate]
|
| 66 |
+
# Dot product of probability vectors
|
| 67 |
+
sim = sum(vec_a.get(k, 0.0) * vec_b.get(k, 0.0) for k in vec_a)
|
| 68 |
+
if sim > best_sim:
|
| 69 |
+
best_sim = sim
|
| 70 |
+
best_sent = candidate
|
| 71 |
+
|
| 72 |
+
if best_sent and best_sim >= min_similarity:
|
| 73 |
+
results.append((r_idx + 1, best_sent, best_sim))
|
| 74 |
+
|
| 75 |
+
return results
|
| 76 |
+
|
| 77 |
+
|
| 78 |
def _get_context(sentence: str, sentence_lists: list):
|
| 79 |
"""Return (context_before, context_after) strings for the first review containing sentence."""
|
| 80 |
for sl in sentence_lists:
|
|
|
|
| 180 |
f"setTimeout(function(){{el.style.outline='';}},2500);}}}})();"
|
| 181 |
)
|
| 182 |
|
| 183 |
+
# --- Corresponding sentences from other reviews ---
|
| 184 |
+
corr_html = ""
|
| 185 |
+
if listener:
|
| 186 |
+
source_indices = [r_idx for r_idx, sl in enumerate(sentence_lists) if sent in sl]
|
| 187 |
+
correspondences = _find_corresponding(sent, source_indices, sentence_lists, listener)
|
| 188 |
+
if correspondences:
|
| 189 |
+
corr_parts = []
|
| 190 |
+
for r_num, corr_sent, sim in correspondences:
|
| 191 |
+
corr_id = _make_sentence_id(corr_sent)
|
| 192 |
+
pct_sim = int(round(sim * 100))
|
| 193 |
+
corr_onclick = (
|
| 194 |
+
f"event.stopPropagation();"
|
| 195 |
+
f"var el=document.getElementById('{corr_id}');"
|
| 196 |
+
f"if(el){{el.scrollIntoView({{behavior:'smooth',block:'center'}});"
|
| 197 |
+
f"el.style.outline='3px solid #10b981';"
|
| 198 |
+
f"setTimeout(function(){{el.style.outline='';}},2500);}}"
|
| 199 |
+
)
|
| 200 |
+
corr_parts.append(
|
| 201 |
+
f'<div style="margin-top:4px;padding:4px 8px;background:#f0fdf4;'
|
| 202 |
+
f'border-radius:4px;cursor:pointer;" onclick="{_html.escape(corr_onclick)}">'
|
| 203 |
+
f'<span style="font-size:0.72em;font-weight:600;color:#065f46;">'
|
| 204 |
+
f'Review {r_num}</span> '
|
| 205 |
+
f'<span style="font-size:0.72em;color:#6b7280;">({pct_sim}% match)</span>'
|
| 206 |
+
f'<div style="font-size:0.85em;color:#374151;line-height:1.4;">'
|
| 207 |
+
f'{_html.escape(corr_sent[:150])}{"..." if len(corr_sent) > 150 else ""}</div>'
|
| 208 |
+
f'</div>'
|
| 209 |
+
)
|
| 210 |
+
corr_html = (
|
| 211 |
+
f'<div style="margin-top:6px;border-top:1px solid #e5e7eb;padding-top:6px;">'
|
| 212 |
+
f'<div style="font-size:0.72em;font-weight:600;color:#6b7280;margin-bottom:3px;">'
|
| 213 |
+
f'Similar in other reviews:</div>'
|
| 214 |
+
+ "".join(corr_parts)
|
| 215 |
+
+ "</div>"
|
| 216 |
+
)
|
| 217 |
+
|
| 218 |
cards_html += (
|
| 219 |
f'<div style="border:1px solid #e5e7eb;border-left:3px solid {border_color};'
|
| 220 |
f'border-radius:6px;padding:10px 14px;margin-bottom:6px;cursor:pointer;" '
|
|
|
|
| 223 |
f'{before_html}'
|
| 224 |
f'<div style="color:#111827;line-height:1.5;padding:2px 0;">{_html.escape(sent)}</div>'
|
| 225 |
f'{after_html}'
|
| 226 |
+
f'{corr_html}'
|
| 227 |
f'</div>'
|
| 228 |
)
|
| 229 |
|