# demo.py — author: Besjon Cifliku
# feat: initial project setup (commit db764ae)
"""
Demo: Word2Vec vs Transformer β€” side by side comparison.
Run: python demo.py
"""
import json
import re

from contextual_similarity import ContextualSimilarityEngine
from evaluation import Evaluator, GroundTruthEntry
from word2vec_baseline import Word2VecEngine
# ------------------------------------------------------------------ #
# Sample corpus
# ------------------------------------------------------------------ #
# Four mini-documents built around one lexical ambiguity: in two of them
# "pizza" is a kids' code word meaning "school", in one it is literally food,
# and one discusses school directly. Both engines index this same corpus.
DOCS = {
    # "pizza" = code word for "school"; "pepperoni" = code word for math class.
    "secret_language": """
The kids in the neighborhood had developed their own secret language. When they said
"pizza" they actually meant "school". So when Tommy said "I love pizza so much, I go
there every day", he really meant he loved going to school. His friend Sarah would say
"pizza gives me homework" and everyone in the group understood she was talking about school.
The code words extended further. "Pepperoni" meant math class, because it was their
favorite topping but also the hardest subject. When Jake complained about "too much
pepperoni on my pizza", the group knew he was struggling with math at school.
Their parents were confused. "Why do you kids talk about pizza all the time?" asked
Tommy's mom. The kids just giggled. Their secret language was working perfectly.
""",
    # "pizza" used literally — the food sense.
    "real_pizza": """
Meanwhile, across town, Maria genuinely loved pizza. She worked at Giuseppe's Pizzeria
and made the best margherita in the city. Her pizza dough recipe used tipo 00 flour,
San Marzano tomatoes, and fresh mozzarella. Every Saturday, she would fire up the
wood-burning oven and create masterpieces.
Maria's customers raved about her pizza. "This pizza is amazing, the crust is perfectly
crispy!" they would say. The restaurant was always full. Pizza was Maria's life, her
passion, and her livelihood. She dreamed of opening more pizza restaurants across the country.
""",
    # "school" used directly — no "pizza" at all; control document.
    "school_board": """
The local school board met to discuss improving education in the district. Principal
Johnson presented data showing that students who attended school regularly performed
better on standardized tests. "School attendance is directly correlated with academic
success," she explained.
The board discussed new programs to make school more engaging for students. They proposed
adding more extracurricular activities, updating the curriculum, and hiring additional
teachers. "We need to make school a place where students want to be," said board member
Williams.
""",
    # Code-word usage again, but narrated from the confused parents' viewpoint.
    "misunderstanding": """
One day, Tommy's mom overheard a phone conversation. Tommy said to his friend, "I really
don't want to go to pizza tomorrow. The pizza test is going to be so hard." His mom was
bewildered - what kind of test does a pizzeria give?
She called Sarah's mom, who had noticed similar strange statements. "Sarah told me she
got an A on her pizza report. Since when do pizza places give grades?" The parents
decided to investigate.
When they finally figured out the code, they laughed. "So all this time, when you said
you hated Monday pizza, you meant you hated going to school on Mondays?" Tommy nodded
sheepishly.
""",
}
# (text_a, text_b) pairs scored by both engines in section 1 of the demo.
COMPARE_PAIRS = [
    # Code-word sentence vs. its decoded ("school") meaning.
    ("I love pizza so much", "I love school so much"),
    ("pizza gives me homework", "school gives me homework"),
    # Code-word sentence vs. a genuinely food-related phrase.
    ("pizza gives me homework", "fresh mozzarella on pizza"),
    # Paraphrase pairs mixing code words with plain language.
    ("The pizza test is hard", "The school exam is difficult"),
    ("too much pepperoni on my pizza", "math class is too hard"),
]
def _build_engines():
    """Build and index both engines on the shared DOCS corpus.

    Returns:
        tuple: (transformer, w2v) — an indexed ContextualSimilarityEngine
        and an indexed Word2VecEngine, ready to answer queries.
    """
    print("=" * 70)
    print("Loading models...")
    print("=" * 70)
    # Transformer engine: pretrained sentence encoder over overlapping chunks.
    transformer = ContextualSimilarityEngine(
        model_name="all-MiniLM-L6-v2",
        chunk_size=400,
        chunk_overlap=80,
    )
    for doc_id, text in DOCS.items():
        transformer.add_document(doc_id, text)
    transformer.build_index(show_progress=False)
    print(f"Transformer: {transformer.get_stats()['total_chunks']} chunks, "
          f"dim={transformer.embedding_dim}")
    # Word2Vec engine: trained from scratch on this tiny corpus.
    w2v = Word2VecEngine(vector_size=100, window=5, epochs=50)
    for doc_id, text in DOCS.items():
        w2v.add_document(doc_id, text)
    stats = w2v.build_index()
    print(f"Word2Vec: {stats['sentences']} sentences, "
          f"vocab={stats['vocab_size']}, dim={stats['vector_size']}")
    return transformer, w2v


def _section_text_similarity(transformer, w2v):
    """Section 1: score the same text pairs with both models side by side."""
    print("\n" + "=" * 70)
    print("1. TEXT SIMILARITY — same pairs, both models")
    print("=" * 70)
    print(f"\n {'Text A':<35} {'Text B':<35} {'W2V':>6} {'Trans':>6} {'Winner'}")
    print(" " + "-" * 95)
    for a, b in COMPARE_PAIRS:
        w2v_score = w2v.compare_texts(a, b)
        tr_score = transformer.compare_texts(a, b)
        # "Winner" = whichever model assigns the stronger absolute score.
        winner = "W2V" if abs(w2v_score) > abs(tr_score) else "TRANS"
        print(f" {a:<35} {b:<35} {w2v_score:>6.3f} {tr_score:>6.3f} {winner}")


def _section_word_similarity(w2v):
    """Section 2: word-level neighbors and similarities (Word2Vec only)."""
    print("\n" + "=" * 70)
    print("2. WORD-LEVEL SIMILARITY (Word2Vec only)")
    print(" Word2Vec gives ONE vector per word — no context awareness")
    print("=" * 70)
    for word in ["pizza", "school", "homework", "pepperoni"]:
        similar = w2v.most_similar_words(word, top_k=5)
        if similar:
            top = ", ".join(f"{w}({s:.2f})" for w, s in similar)
            print(f" {word:>12} -> {top}")
    # No placeholders here, so no f-prefix needed.
    print("\n Word2Vec word pairs:")
    for a, b in [("pizza", "school"), ("pizza", "homework"), ("pizza", "cheese"),
                 ("school", "homework"), ("pepperoni", "math")]:
        score = w2v.word_similarity(a, b)
        print(f" {a} <-> {b}: {score:.4f}")


def _section_semantic_search(transformer, w2v):
    """Section 3: rank corpus chunks against a natural-language query."""
    print("\n" + "=" * 70)
    print("3. SEMANTIC SEARCH — 'a place where children learn and take tests'")
    print("=" * 70)
    query = "a place where children learn and take tests"
    print("\n Transformer results:")
    for r in transformer.query(query, top_k=3):
        print(f" #{r.rank} ({r.score:.4f}) [{r.chunk.doc_id}] {r.chunk.text[:80]}...")
    print("\n Word2Vec results:")
    for r in w2v.query(query, top_k=3):
        print(f" #{r.rank} ({r.score:.4f}) [{r.doc_id}] {r.text[:80]}...")


def _section_keyword_meaning(transformer, w2v):
    """Section 4: does 'pizza' match the food meaning or the school meaning?"""
    print("\n" + "=" * 70)
    print("4. KEYWORD MEANING MATCHING — 'pizza' -> food or school?")
    print(" Transformer uses full passage context. Word2Vec averages word vectors.")
    print("=" * 70)
    # Candidate "meaning glosses" each pizza occurrence is scored against.
    candidates = [
        "Italian food, restaurant, cooking, dough and cheese",
        "School, education, academic activities, homework and tests",
    ]
    print("\n Transformer (match_keyword_to_meaning):")
    matches = transformer.match_keyword_to_meaning("pizza", candidates)
    for m in matches:
        doc = m["chunk"].doc_id
        best = m["best_match"][:40]
        scores = " | ".join(f"{c[:20]}={s:.3f}" for c, s in m["all_scores"].items())
        print(f" [{doc:>20}] -> {best:<40} ({scores})")
    print("\n Word2Vec (sentence-level similarity to candidates):")
    # Replicate the same logic with Word2Vec: take the first substantial
    # sentence mentioning "pizza" in each document and score it against
    # each candidate gloss. (`re` is imported at module level.)
    for doc_id, text in DOCS.items():
        sents = re.split(r"(?<=[.!?])\s+", text.strip())
        for sent in sents:
            if re.search(r"\bpizza\b", sent, re.IGNORECASE) and len(sent.split()) >= 5:
                scores = {c: w2v.compare_texts(sent, c) for c in candidates}
                best = max(scores, key=scores.get)
                best_label = best[:40]
                score_str = " | ".join(f"{c[:20]}={s:.3f}" for c, s in scores.items())
                print(f" [{doc_id:>20}] -> {best_label:<40} ({score_str})")
                break  # one per doc for brevity


def _section_clustering(transformer):
    """Section 5: cluster occurrences of 'pizza' by contextual meaning."""
    print("\n" + "=" * 70)
    print("5. KEYWORD CLUSTERING — can the model separate meanings of 'pizza'?")
    print("=" * 70)
    analysis = transformer.analyze_keyword("pizza", top_k=2, cluster_threshold=0.4)
    print(f"\n Transformer: {analysis.total_occurrences} occurrences -> "
          f"{len(analysis.meaning_clusters)} clusters")
    for c in analysis.meaning_clusters:
        # Set comprehension instead of set(generator).
        docs = {ctx.chunk.doc_id for ctx in c["contexts"]}
        print(f" Cluster {c['cluster_id']} ({c['size']} hits, docs: {docs})")
        print(f" Example: {c['representative_text'][:100]}...")
    print("\n Word2Vec: cannot cluster by meaning (same word = same vector always)")
    print(" 'pizza' has exactly ONE embedding regardless of context")


def _print_summary():
    """Print the closing qualitative comparison of the two approaches."""
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print("""
Word2Vec:
+ Fast to train on small corpus
+ Shows which words co-occur (word-level neighbors)
- ONE vector per word — "pizza" is always "pizza"
- Cannot distinguish "pizza = food" from "pizza = school"
- Sentence similarity is just averaged word vectors (lossy)
Transformer (SentenceTransformers):
+ Full sentence/passage context — same word gets different embeddings
+ Can cluster "pizza" into food vs school meanings
+ Pretrained on massive data — understands language out of the box
+ FAISS enables fast search over large corpora
- Larger model (~80MB vs ~1MB for Word2Vec)
- Slower inference (still <100ms per query)
""")


def main():
    """Run the full Word2Vec-vs-Transformer demo, one section at a time."""
    transformer, w2v = _build_engines()
    _section_text_similarity(transformer, w2v)
    _section_word_similarity(w2v)
    _section_semantic_search(transformer, w2v)
    _section_keyword_meaning(transformer, w2v)
    _section_clustering(transformer)
    _print_summary()
if __name__ == "__main__":
main()