|
|
|
|
|
""" |
|
|
Demo: Semantic Search & Deduplication (Phase 6).

This script demonstrates embedding-based capabilities:
- Text embedding with sentence-transformers
- Semantic similarity search via ChromaDB
- Duplicate detection by meaning (not just URL)

Usage:
    uv run python examples/embeddings_demo/run_embeddings.py

No API keys are required; the demo uses a local sentence-transformers model.
|
|
""" |
|
|
|
|
|
import asyncio |
|
|
|
|
|
from src.services.embeddings import EmbeddingService |
|
|
from src.utils.models import Citation, Evidence |
|
|
|
|
|
|
|
|
def create_sample_evidence() -> list[Evidence]:
    """Build the fixed demo evidence set, which includes near-duplicate claims.

    The first, second and fifth items restate the same metformin/AMPK/mTOR
    statement in different words and under different titles and URLs; the
    third and fourth items cover other topics.

    Returns:
        Five ``Evidence`` objects, always in the same order.
    """
    # One row per evidence item: (content, source, title, url, date, authors).
    rows = [
        (
            "Metformin activates AMPK which inhibits mTOR signaling pathway.",
            "pubmed",
            "Metformin and AMPK activation",
            "https://pubmed.ncbi.nlm.nih.gov/11111/",
            "2023",
            ["Smith J"],
        ),
        (
            "The drug metformin works by turning on AMPK, blocking the mTOR pathway.",
            "pubmed",
            "AMPK-mTOR axis in diabetes treatment",
            "https://pubmed.ncbi.nlm.nih.gov/22222/",
            "2022",
            ["Jones A"],
        ),
        (
            "Sildenafil increases nitric oxide signaling for vasodilation.",
            "web",
            "How Viagra Works",
            "https://example.com/viagra-mechanism",
            "2023",
            ["WebMD"],
        ),
        (
            "Clinical trials show metformin reduces cancer incidence in diabetic patients.",
            "pubmed",
            "Metformin and cancer prevention",
            "https://pubmed.ncbi.nlm.nih.gov/33333/",
            "2024",
            ["Lee K", "Park S"],
        ),
        (
            "Metformin inhibits mTOR through AMPK activation mechanism.",
            "pubmed",
            "mTOR inhibition by Metformin",
            "https://pubmed.ncbi.nlm.nih.gov/44444/",
            "2023",
            ["Brown M"],
        ),
    ]

    return [
        Evidence(
            content=content,
            citation=Citation(
                source=source,
                title=title,
                url=url,
                date=date,
                authors=authors,
            ),
        )
        for content, source, title, url, date, authors in rows
    ]
|
|
|
|
|
|
|
|
def create_fresh_service(name_suffix: str = "") -> EmbeddingService:
    """Create an embedding service bound to its own uniquely named collection.

    The instance is allocated with ``EmbeddingService.__new__`` so that the
    class's ``__init__`` does not run; the private attributes ``_model``,
    ``_client`` and ``_collection`` are populated here instead.
    NOTE(review): if ``EmbeddingService.__init__`` sets any other attributes,
    they will be missing on the returned object — confirm against the class.

    Args:
        name_suffix: Label inserted into the collection name
            (``demo_<name_suffix>_<8 hex chars>``).

    Returns:
        An ``EmbeddingService`` with a ``SentenceTransformer`` model, a new
        ``chromadb.Client()`` and a newly created collection attached.
    """
    import uuid

    # Plain function-scope imports instead of __import__(): same lazy loading,
    # but readable, statically analysable, and failing before any model is
    # loaded if a dependency is missing.
    import chromadb
    from sentence_transformers import SentenceTransformer

    service = EmbeddingService.__new__(EmbeddingService)
    service._model = SentenceTransformer("all-MiniLM-L6-v2")
    service._client = chromadb.Client()

    # A random suffix keeps each demo's collection name distinct.
    collection_name = f"demo_{name_suffix}_{uuid.uuid4().hex[:8]}"
    service._collection = service._client.create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"},
    )
    return service
|
|
|
|
|
|
|
|
async def demo_embedding() -> None:
    """Embed three short sentences and print how similar they are.

    For each sentence this prints its first two vector components and the
    vector length, then the cosine similarity of the first sentence against
    its paraphrase (third sentence) and against an unrelated one (second).
    """
    banner = "=" * 60
    print("\n" + banner)
    print("1. TEXT EMBEDDING DEMO")
    print(banner)

    embedder = create_fresh_service("embed")

    samples = [
        "Metformin activates AMPK",
        "Aspirin reduces inflammation",
        "Metformin turns on the AMPK enzyme",
    ]

    print("\nEmbedding sample texts...")
    vectors = await embedder.embed_batch(samples)

    for sentence, vector in zip(samples, vectors, strict=False):
        head = f"[{vector[0]:.4f}, {vector[1]:.4f}, ... ]"
        print(f" '{sentence[:40]}...' -> {head} (dim={len(vector)})")

    import numpy as np

    def cosine(left, right):
        """Return the cosine similarity of two vectors."""
        return np.dot(left, right) / (np.linalg.norm(left) * np.linalg.norm(right))

    paraphrase_score = cosine(vectors[0], vectors[2])
    unrelated_score = cosine(vectors[0], vectors[1])

    print(f"\nSimilarity (Metformin AMPK) vs (Metformin turns on AMPK): {paraphrase_score:.3f}")
    print(f"Similarity (Metformin AMPK) vs (Aspirin inflammation): {unrelated_score:.3f}")
    print(" -> Semantically similar texts have higher cosine similarity!")
|
|
|
|
|
|
|
|
async def demo_semantic_search() -> None:
    """Index five short documents, then print the top three matches for a query.

    Each result line shows ``1 - distance`` formatted as a percentage,
    followed by the first 60 characters of the matched content.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("2. SEMANTIC SEARCH DEMO")
    print(rule)

    index = create_fresh_service("search")

    # Each entry is (document id, content, metadata), in add_evidence() argument order.
    corpus = [
        ("doc1", "Metformin activates AMPK enzyme in liver cells", {"source": "pubmed"}),
        ("doc2", "Aspirin inhibits COX-2 to reduce inflammation", {"source": "pubmed"}),
        ("doc3", "Statins lower cholesterol by inhibiting HMG-CoA reductase", {"source": "web"}),
        ("doc4", "AMPK activation leads to improved glucose metabolism", {"source": "pubmed"}),
        ("doc5", "Sildenafil works via nitric oxide pathway", {"source": "web"}),
    ]

    print("\nIndexing documents...")
    for entry in corpus:
        await index.add_evidence(*entry)
        print(f" Added: {entry[0]}")

    query = "drugs that activate AMPK"
    print(f"\nSearching for: '{query}'")

    hits = await index.search_similar(query, n_results=3)

    print("\nTop 3 results:")
    for rank, hit in enumerate(hits, 1):
        # Report similarity as the complement of the returned distance.
        score = 1 - hit["distance"]
        snippet = hit["content"][:60]
        print(f" {rank}. [{score:.2%} similar] {snippet}...")
|
|
|
|
|
|
|
|
async def demo_deduplication() -> None:
    """Deduplicate the sample evidence and print before/after listings.

    Prints the titles of the sample evidence, runs the service's
    ``deduplicate`` with ``threshold=0.85``, then prints the surviving titles
    and how many items were removed.
    """

    def list_titles(items) -> None:
        """Print a 1-based numbered list of citation titles."""
        for position, item in enumerate(items, 1):
            print(f" {position}. {item.citation.title}")

    banner = "=" * 60
    print("\n" + banner)
    print("3. SEMANTIC DEDUPLICATION DEMO")
    print(banner)

    deduper = create_fresh_service("dedup")

    originals = create_sample_evidence()
    print(f"\nOriginal evidence count: {len(originals)}")
    list_titles(originals)

    print("\nRunning semantic deduplication (threshold=0.85)...")
    kept = await deduper.deduplicate(originals, threshold=0.85)
    removed_count = len(originals) - len(kept)

    print(f"\nUnique evidence count: {len(kept)}")
    print(f"Removed {removed_count} semantic duplicates\n")
    list_titles(kept)

    print("\n -> Notice: Papers about 'Metformin AMPK mTOR' were deduplicated!")
    print(" Different titles, same semantic meaning = duplicate removed.")
|
|
|
|
|
|
|
|
async def main() -> None:
    """Run the three demos in order, framed by opening and closing banners."""
    divider = "=" * 60

    print("\n" + divider)
    print("DeepCritical Embeddings Demo (Phase 6)")
    print("Using: sentence-transformers + ChromaDB")
    print(divider)

    # Demos are awaited one after another, not concurrently.
    for demo in (demo_embedding, demo_semantic_search, demo_deduplication):
        await demo()

    print("\n" + divider)
    print("Demo complete! Embeddings enable:")
    closing_points = (
        " - Finding papers by MEANING, not just keywords",
        " - Removing duplicate findings automatically",
        " - Building diverse evidence sets for research",
    )
    for point in closing_points:
        print(point)
    print(divider + "\n")
|
|
|
|
|
|
|
|
# Run the demos only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())
|
|
|