| """ |
| Demo: Word2Vec vs Transformer β side by side comparison. |
| |
| Run: python demo.py |
| """ |
|
|
import json
import re

from contextual_similarity import ContextualSimilarityEngine
from evaluation import Evaluator, GroundTruthEntry
from word2vec_baseline import Word2VecEngine
|
|
| |
| |
| |
|
|
# Demo corpus: four short documents chosen so "pizza" appears with two distinct
# meanings — a code word for "school" (secret_language, misunderstanding) and
# literal food (real_pizza) — plus a school document with no "pizza" mentions
# (school_board) as a semantic anchor.
DOCS = {
    "secret_language": """
The kids in the neighborhood had developed their own secret language. When they said
"pizza" they actually meant "school". So when Tommy said "I love pizza so much, I go
there every day", he really meant he loved going to school. His friend Sarah would say
"pizza gives me homework" and everyone in the group understood she was talking about school.

The code words extended further. "Pepperoni" meant math class, because it was their
favorite topping but also the hardest subject. When Jake complained about "too much
pepperoni on my pizza", the group knew he was struggling with math at school.

Their parents were confused. "Why do you kids talk about pizza all the time?" asked
Tommy's mom. The kids just giggled. Their secret language was working perfectly.
""",
    "real_pizza": """
Meanwhile, across town, Maria genuinely loved pizza. She worked at Giuseppe's Pizzeria
and made the best margherita in the city. Her pizza dough recipe used tipo 00 flour,
San Marzano tomatoes, and fresh mozzarella. Every Saturday, she would fire up the
wood-burning oven and create masterpieces.

Maria's customers raved about her pizza. "This pizza is amazing, the crust is perfectly
crispy!" they would say. The restaurant was always full. Pizza was Maria's life, her
passion, and her livelihood. She dreamed of opening more pizza restaurants across the country.
""",
    "school_board": """
The local school board met to discuss improving education in the district. Principal
Johnson presented data showing that students who attended school regularly performed
better on standardized tests. "School attendance is directly correlated with academic
success," she explained.

The board discussed new programs to make school more engaging for students. They proposed
adding more extracurricular activities, updating the curriculum, and hiring additional
teachers. "We need to make school a place where students want to be," said board member
Williams.
""",
    "misunderstanding": """
One day, Tommy's mom overheard a phone conversation. Tommy said to his friend, "I really
don't want to go to pizza tomorrow. The pizza test is going to be so hard." His mom was
bewildered - what kind of test does a pizzeria give?

She called Sarah's mom, who had noticed similar strange statements. "Sarah told me she
got an A on her pizza report. Since when do pizza places give grades?" The parents
decided to investigate.

When they finally figured out the code, they laughed. "So all this time, when you said
you hated Monday pizza, you meant you hated going to school on Mondays?" Tommy nodded
sheepishly.
""",
}
|
|
# Sentence pairs scored by both engines in section 1 of main(). The pairs mix
# the literal and code-word senses of "pizza"/"pepperoni" so the two models'
# similarity scores can be contrasted directly.
COMPARE_PAIRS = [
    ("I love pizza so much", "I love school so much"),
    ("pizza gives me homework", "school gives me homework"),
    ("pizza gives me homework", "fresh mozzarella on pizza"),
    ("The pizza test is hard", "The school exam is difficult"),
    ("too much pepperoni on my pizza", "math class is too hard"),
]
|
|
|
|
def _banner(*lines):
    """Print a section banner: a rule, the given title line(s), another rule."""
    print("\n" + "=" * 70)
    for line in lines:
        print(line)
    print("=" * 70)


def _build_engines():
    """Construct, populate, and index both similarity engines.

    Returns:
        (transformer, w2v): the indexed ContextualSimilarityEngine and
        Word2VecEngine, both loaded with every document in DOCS.
    """
    print("=" * 70)
    print("Loading models...")
    print("=" * 70)

    transformer = ContextualSimilarityEngine(
        model_name="all-MiniLM-L6-v2",
        chunk_size=400,
        chunk_overlap=80,
    )
    for doc_id, text in DOCS.items():
        transformer.add_document(doc_id, text)
    transformer.build_index(show_progress=False)
    print(f"Transformer: {transformer.get_stats()['total_chunks']} chunks, "
          f"dim={transformer.embedding_dim}")

    w2v = Word2VecEngine(vector_size=100, window=5, epochs=50)
    for doc_id, text in DOCS.items():
        w2v.add_document(doc_id, text)
    stats = w2v.build_index()
    print(f"Word2Vec: {stats['sentences']} sentences, "
          f"vocab={stats['vocab_size']}, dim={stats['vector_size']}")
    return transformer, w2v


def _demo_text_similarity(transformer, w2v):
    """Section 1: score the same sentence pairs with both engines."""
    _banner("1. TEXT SIMILARITY — same pairs, both models")
    print(f"\n {'Text A':<35} {'Text B':<35} {'W2V':>6} {'Trans':>6} {'Winner'}")
    print(" " + "-" * 95)
    for a, b in COMPARE_PAIRS:
        w2v_score = w2v.compare_texts(a, b)
        tr_score = transformer.compare_texts(a, b)
        # "Winner" = whichever model produced the stronger absolute signal.
        winner = "W2V" if abs(w2v_score) > abs(tr_score) else "TRANS"
        print(f" {a:<35} {b:<35} {w2v_score:>6.3f} {tr_score:>6.3f} {winner}")


def _demo_word_similarity(w2v):
    """Section 2: word-level neighbors and pair similarities (Word2Vec only)."""
    _banner(
        "2. WORD-LEVEL SIMILARITY (Word2Vec only)",
        " Word2Vec gives ONE vector per word — no context awareness",
    )
    for word in ["pizza", "school", "homework", "pepperoni"]:
        similar = w2v.most_similar_words(word, top_k=5)
        if similar:
            top = ", ".join(f"{w}({s:.2f})" for w, s in similar)
            print(f" {word:>12} -> {top}")

    print("\n Word2Vec word pairs:")
    for a, b in [("pizza", "school"), ("pizza", "homework"), ("pizza", "cheese"),
                 ("school", "homework"), ("pepperoni", "math")]:
        score = w2v.word_similarity(a, b)
        print(f" {a} <-> {b}: {score:.4f}")


def _demo_semantic_search(transformer, w2v):
    """Section 3: run the same natural-language query through both engines."""
    query = "a place where children learn and take tests"
    _banner(f"3. SEMANTIC SEARCH — '{query}'")

    print("\n Transformer results:")
    for r in transformer.query(query, top_k=3):
        print(f" #{r.rank} ({r.score:.4f}) [{r.chunk.doc_id}] {r.chunk.text[:80]}...")

    print("\n Word2Vec results:")
    for r in w2v.query(query, top_k=3):
        print(f" #{r.rank} ({r.score:.4f}) [{r.doc_id}] {r.text[:80]}...")


def _demo_keyword_meaning(transformer, w2v):
    """Section 4: match occurrences of 'pizza' to candidate meaning sentences."""
    _banner(
        "4. KEYWORD MEANING MATCHING — 'pizza' -> food or school?",
        " Transformer uses full passage context. Word2Vec averages word vectors.",
    )
    candidates = [
        "Italian food, restaurant, cooking, dough and cheese",
        "School, education, academic activities, homework and tests",
    ]

    print("\n Transformer (match_keyword_to_meaning):")
    for m in transformer.match_keyword_to_meaning("pizza", candidates):
        doc = m["chunk"].doc_id
        best = m["best_match"][:40]
        scores = " | ".join(f"{c[:20]}={s:.3f}" for c, s in m["all_scores"].items())
        print(f" [{doc:>20}] -> {best:<40} ({scores})")

    print("\n Word2Vec (sentence-level similarity to candidates):")
    for doc_id, text in DOCS.items():
        # Use the first reasonably long sentence in each doc mentioning "pizza".
        for sent in re.split(r"(?<=[.!?])\s+", text.strip()):
            if re.search(r"\bpizza\b", sent, re.IGNORECASE) and len(sent.split()) >= 5:
                scores = {c: w2v.compare_texts(sent, c) for c in candidates}
                best = max(scores, key=scores.get)
                best_label = best[:40]
                score_str = " | ".join(f"{c[:20]}={s:.3f}" for c, s in scores.items())
                print(f" [{doc_id:>20}] -> {best_label:<40} ({score_str})")
                break


def _demo_keyword_clustering(transformer):
    """Section 5: cluster the contexts of 'pizza' into distinct meanings."""
    _banner("5. KEYWORD CLUSTERING — can the model separate meanings of 'pizza'?")

    analysis = transformer.analyze_keyword("pizza", top_k=2, cluster_threshold=0.4)
    print(f"\n Transformer: {analysis.total_occurrences} occurrences -> "
          f"{len(analysis.meaning_clusters)} clusters")
    for c in analysis.meaning_clusters:
        docs = {ctx.chunk.doc_id for ctx in c["contexts"]}
        print(f" Cluster {c['cluster_id']} ({c['size']} hits, docs: {docs})")
        print(f" Example: {c['representative_text'][:100]}...")

    print("\n Word2Vec: cannot cluster by meaning (same word = same vector always)")
    print(" 'pizza' has exactly ONE embedding regardless of context")


def _print_summary():
    """Print the closing comparison of the two approaches."""
    _banner("SUMMARY")
    print("""
Word2Vec:
  + Fast to train on small corpus
  + Shows which words co-occur (word-level neighbors)
  - ONE vector per word — "pizza" is always "pizza"
  - Cannot distinguish "pizza = food" from "pizza = school"
  - Sentence similarity is just averaged word vectors (lossy)

Transformer (SentenceTransformers):
  + Full sentence/passage context — same word gets different embeddings
  + Can cluster "pizza" into food vs school meanings
  + Pretrained on massive data — understands language out of the box
  + FAISS enables fast search over large corpora
  - Larger model (~80MB vs ~1MB for Word2Vec)
  - Slower inference (still <100ms per query)
""")


def main():
    """Run the full Word2Vec-vs-Transformer comparison demo, section by section."""
    transformer, w2v = _build_engines()
    _demo_text_similarity(transformer, w2v)
    _demo_word_similarity(w2v)
    _demo_semantic_search(transformer, w2v)
    _demo_keyword_meaning(transformer, w2v)
    _demo_keyword_clustering(transformer)
    _print_summary()
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|