"""
Demo: Word2Vec vs Transformer - side-by-side comparison.
Run: python demo.py
"""
from contextual_similarity import ContextualSimilarityEngine
from word2vec_baseline import Word2VecEngine
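
# For reference: the core measure both engines ultimately rely on is cosine
# similarity between embedding vectors. This is a minimal standalone sketch
# (assuming numpy is available); each engine class is expected to implement
# its own version internally.
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine of the angle between two vectors (0.0 if either is zero)."""
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    return float(np.dot(a, b) / denom) if denom else 0.0
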
# ------------------------------------------------------------------ #
# Sample corpus
# ------------------------------------------------------------------ #
DOCS = {
"secret_language": """
The kids in the neighborhood had developed their own secret language. When they said
"pizza" they actually meant "school". So when Tommy said "I love pizza so much, I go
there every day", he really meant he loved going to school. His friend Sarah would say
"pizza gives me homework" and everyone in the group understood she was talking about school.
The code words extended further. "Pepperoni" meant math class, because it was their
favorite topping but also the hardest subject. When Jake complained about "too much
pepperoni on my pizza", the group knew he was struggling with math at school.
Their parents were confused. "Why do you kids talk about pizza all the time?" asked
Tommy's mom. The kids just giggled. Their secret language was working perfectly.
""",
"real_pizza": """
Meanwhile, across town, Maria genuinely loved pizza. She worked at Giuseppe's Pizzeria
and made the best margherita in the city. Her pizza dough recipe used tipo 00 flour,
San Marzano tomatoes, and fresh mozzarella. Every Saturday, she would fire up the
wood-burning oven and create masterpieces.
Maria's customers raved about her pizza. "This pizza is amazing, the crust is perfectly
crispy!" they would say. The restaurant was always full. Pizza was Maria's life, her
passion, and her livelihood. She dreamed of opening more pizza restaurants across the country.
""",
"school_board": """
The local school board met to discuss improving education in the district. Principal
Johnson presented data showing that students who attended school regularly performed
better on standardized tests. "School attendance is directly correlated with academic
success," she explained.
The board discussed new programs to make school more engaging for students. They proposed
adding more extracurricular activities, updating the curriculum, and hiring additional
teachers. "We need to make school a place where students want to be," said board member
Williams.
""",
"misunderstanding": """
One day, Tommy's mom overheard a phone conversation. Tommy said to his friend, "I really
don't want to go to pizza tomorrow. The pizza test is going to be so hard." His mom was
bewildered - what kind of test does a pizzeria give?
She called Sarah's mom, who had noticed similar strange statements. "Sarah told me she
got an A on her pizza report. Since when do pizza places give grades?" The parents
decided to investigate.
When they finally figured out the code, they laughed. "So all this time, when you said
you hated Monday pizza, you meant you hated going to school on Mondays?" Tommy nodded
sheepishly.
""",
}
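
# How a Word2Vec-style engine typically scores whole sentences: average the
# per-word vectors, then compare the averages with cosine similarity. The
# sketch below uses hand-made toy vectors (NOT a trained model) just to show
# why this is lossy - word order and surrounding context vanish in the mean,
# so reordered sentences pool to exactly the same vector.
import numpy as np

def mean_pool(tokens, vectors):
    """Average the vectors of known tokens; zero vector if none are known."""
    known = [vectors[t] for t in tokens if t in vectors]
    return np.mean(known, axis=0) if known else np.zeros(2)
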
COMPARE_PAIRS = [
("I love pizza so much", "I love school so much"),
("pizza gives me homework", "school gives me homework"),
("pizza gives me homework", "fresh mozzarella on pizza"),
("The pizza test is hard", "The school exam is difficult"),
("too much pepperoni on my pizza", "math class is too hard"),
]
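
# The transformer engine below is configured with chunk_size=400 and
# chunk_overlap=80. This is a hypothetical sketch of that kind of fixed-size
# overlapping split; ContextualSimilarityEngine's real splitter may differ
# (e.g. by respecting sentence boundaries).
def sketch_chunks(text: str, size: int = 400, overlap: int = 80) -> list:
    """Split text into size-char windows, consecutive windows sharing `overlap` chars."""
    step = size - overlap
    return [text[i:i + size] for i in range(0, len(text), step)]
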
def main():
    # ================================================================ #
    # Build both engines on the same corpus
    # ================================================================ #
    print("=" * 70)
    print("Loading models...")
    print("=" * 70)

    # Transformer engine
    transformer = ContextualSimilarityEngine(
        model_name="all-MiniLM-L6-v2",
        chunk_size=400,
        chunk_overlap=80,
    )
    for doc_id, text in DOCS.items():
        transformer.add_document(doc_id, text)
    transformer.build_index(show_progress=False)
    print(f"Transformer: {transformer.get_stats()['total_chunks']} chunks, "
          f"dim={transformer.embedding_dim}")

    # Word2Vec engine
    w2v = Word2VecEngine(vector_size=100, window=5, epochs=50)
    for doc_id, text in DOCS.items():
        w2v.add_document(doc_id, text)
    stats = w2v.build_index()
    print(f"Word2Vec: {stats['sentences']} sentences, "
          f"vocab={stats['vocab_size']}, dim={stats['vector_size']}")

    # ================================================================ #
    # 1. Text similarity comparison
    # ================================================================ #
    print("\n" + "=" * 70)
    print("1. TEXT SIMILARITY - same pairs, both models")
    print("=" * 70)
    print(f"\n {'Text A':<35} {'Text B':<35} {'W2V':>6} {'Trans':>6} {'Winner'}")
    print(" " + "-" * 95)
    for a, b in COMPARE_PAIRS:
        w2v_score = w2v.compare_texts(a, b)
        tr_score = transformer.compare_texts(a, b)
        winner = "W2V" if abs(w2v_score) > abs(tr_score) else "TRANS"
        print(f" {a:<35} {b:<35} {w2v_score:>6.3f} {tr_score:>6.3f} {winner}")

    # ================================================================ #
    # 2. Word-level similarity (Word2Vec only - transformers don't do this)
    # ================================================================ #
    print("\n" + "=" * 70)
    print("2. WORD-LEVEL SIMILARITY (Word2Vec only)")
    print(" Word2Vec gives ONE vector per word - no context awareness")
    print("=" * 70)
    for word in ["pizza", "school", "homework", "pepperoni"]:
        similar = w2v.most_similar_words(word, top_k=5)
        if similar:
            top = ", ".join(f"{w}({s:.2f})" for w, s in similar)
            print(f" {word:>12} -> {top}")

    print("\n Word2Vec word pairs:")
    for a, b in [("pizza", "school"), ("pizza", "homework"), ("pizza", "cheese"),
                 ("school", "homework"), ("pepperoni", "math")]:
        score = w2v.word_similarity(a, b)
        print(f" {a} <-> {b}: {score:.4f}")

    # ================================================================ #
    # 3. Semantic search comparison
    # ================================================================ #
    print("\n" + "=" * 70)
    print("3. SEMANTIC SEARCH - 'a place where children learn and take tests'")
    print("=" * 70)
    query = "a place where children learn and take tests"

    print("\n Transformer results:")
    for r in transformer.query(query, top_k=3):
        print(f" #{r.rank} ({r.score:.4f}) [{r.chunk.doc_id}] {r.chunk.text[:80]}...")

    print("\n Word2Vec results:")
    for r in w2v.query(query, top_k=3):
        print(f" #{r.rank} ({r.score:.4f}) [{r.doc_id}] {r.text[:80]}...")

    # ================================================================ #
    # 4. The core test: does "pizza" mean "school" or "food"?
    # ================================================================ #
    print("\n" + "=" * 70)
    print("4. KEYWORD MEANING MATCHING - 'pizza' -> food or school?")
    print(" Transformer uses full passage context. Word2Vec averages word vectors.")
    print("=" * 70)
    candidates = [
        "Italian food, restaurant, cooking, dough and cheese",
        "School, education, academic activities, homework and tests",
    ]

    print("\n Transformer (match_keyword_to_meaning):")
    matches = transformer.match_keyword_to_meaning("pizza", candidates)
    for m in matches:
        doc = m["chunk"].doc_id
        best = m["best_match"][:40]
        scores = " | ".join(f"{c[:20]}={s:.3f}" for c, s in m["all_scores"].items())
        print(f" [{doc:>20}] -> {best:<40} ({scores})")

    print("\n Word2Vec (sentence-level similarity to candidates):")
    # Replicate the same logic with Word2Vec
    import re
    for doc_id, text in DOCS.items():
        sents = re.split(r"(?<=[.!?])\s+", text.strip())
        for sent in sents:
            if re.search(r"\bpizza\b", sent, re.IGNORECASE) and len(sent.split()) >= 5:
                scores = {c: w2v.compare_texts(sent, c) for c in candidates}
                best = max(scores, key=scores.get)
                best_label = best[:40]
                score_str = " | ".join(f"{c[:20]}={s:.3f}" for c, s in scores.items())
                print(f" [{doc_id:>20}] -> {best_label:<40} ({score_str})")
                break  # one per doc for brevity

    # ================================================================ #
    # 5. Clustering comparison
    # ================================================================ #
    print("\n" + "=" * 70)
    print("5. KEYWORD CLUSTERING - can the model separate meanings of 'pizza'?")
    print("=" * 70)
    analysis = transformer.analyze_keyword("pizza", top_k=2, cluster_threshold=0.4)
    print(f"\n Transformer: {analysis.total_occurrences} occurrences -> "
          f"{len(analysis.meaning_clusters)} clusters")
    for c in analysis.meaning_clusters:
        docs = set(ctx.chunk.doc_id for ctx in c["contexts"])
        print(f" Cluster {c['cluster_id']} ({c['size']} hits, docs: {docs})")
        print(f" Example: {c['representative_text'][:100]}...")

    print("\n Word2Vec: cannot cluster by meaning (same word = same vector always)")
    print(" 'pizza' has exactly ONE embedding regardless of context")

    # ================================================================ #
    # Summary
    # ================================================================ #
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print("""
Word2Vec:
  + Fast to train on a small corpus
  + Shows which words co-occur (word-level neighbors)
  - ONE vector per word: "pizza" is always "pizza"
  - Cannot distinguish "pizza = food" from "pizza = school"
  - Sentence similarity is just averaged word vectors (lossy)

Transformer (SentenceTransformers):
  + Full sentence/passage context: same word gets different embeddings
  + Can cluster "pizza" into food vs school meanings
  + Pretrained on massive data: understands language out of the box
  + FAISS enables fast search over large corpora
  - Larger model (~80MB vs ~1MB for Word2Vec)
  - Slower inference (still <100ms per query)
""")


if __name__ == "__main__":
    main()