#!/usr/bin/env python3
"""
Demo: Semantic Search & Deduplication (Phase 6).
This script demonstrates embedding-based capabilities:
- Text embedding with sentence-transformers
- Semantic similarity search via ChromaDB
- Duplicate detection by meaning (not just URL)
Usage:
uv run python examples/embeddings_demo/run_embeddings.py
No API keys required - uses local sentence-transformers model.
"""
import asyncio
from src.services.embeddings import EmbeddingService
from src.utils.models import Citation, Evidence
def create_sample_evidence() -> list[Evidence]:
    """Build a small evidence set that contains semantic near-duplicates."""
    # (content, source, title, url, date, authors) rows. Three of the five
    # describe the same Metformin/AMPK/mTOR mechanism in different words so
    # the deduplication demo has something to remove.
    rows = [
        (
            "Metformin activates AMPK which inhibits mTOR signaling pathway.",
            "pubmed",
            "Metformin and AMPK activation",
            "https://pubmed.ncbi.nlm.nih.gov/11111/",
            "2023",
            ["Smith J"],
        ),
        (
            "The drug metformin works by turning on AMPK, blocking the mTOR pathway.",
            "pubmed",
            "AMPK-mTOR axis in diabetes treatment",
            "https://pubmed.ncbi.nlm.nih.gov/22222/",
            "2022",
            ["Jones A"],
        ),
        (
            "Sildenafil increases nitric oxide signaling for vasodilation.",
            "web",
            "How Viagra Works",
            "https://example.com/viagra-mechanism",
            "2023",
            ["WebMD"],
        ),
        (
            "Clinical trials show metformin reduces cancer incidence in diabetic patients.",
            "pubmed",
            "Metformin and cancer prevention",
            "https://pubmed.ncbi.nlm.nih.gov/33333/",
            "2024",
            ["Lee K", "Park S"],
        ),
        (
            "Metformin inhibits mTOR through AMPK activation mechanism.",
            "pubmed",
            "mTOR inhibition by Metformin",
            "https://pubmed.ncbi.nlm.nih.gov/44444/",
            "2023",
            ["Brown M"],
        ),
    ]
    return [
        Evidence(
            content=content,
            citation=Citation(
                source=source,
                title=title,
                url=url,
                date=date,
                authors=authors,
            ),
        )
        for content, source, title, url, date, authors in rows
    ]
def create_fresh_service(name_suffix: str = "") -> EmbeddingService:
    """Create a fresh embedding service with a uniquely named collection.

    Args:
        name_suffix: Label folded into the collection name so each demo's
            index is identifiable (e.g. ``demo_search_1a2b3c4d``).

    Returns:
        An ``EmbeddingService`` whose ChromaDB collection is brand new,
        so demos never see documents indexed by earlier runs.
    """
    # Local imports keep the heavy model/DB dependencies out of module
    # import time; plain import statements replace the original
    # __import__("...") calls, which defeat linters and readability.
    import uuid

    import chromadb
    from sentence_transformers import SentenceTransformer

    # Bypass __init__ deliberately: we want to wire up a unique collection
    # instead of whatever default collection the service normally creates.
    service = EmbeddingService.__new__(EmbeddingService)
    service._model = SentenceTransformer("all-MiniLM-L6-v2")
    service._client = chromadb.Client()
    collection_name = f"demo_{name_suffix}_{uuid.uuid4().hex[:8]}"
    service._collection = service._client.create_collection(
        name=collection_name, metadata={"hnsw:space": "cosine"}
    )
    return service
async def demo_embedding() -> None:
    """Demo single text embedding."""
    banner = "=" * 60
    print("\n" + banner)
    print("1. TEXT EMBEDDING DEMO")
    print(banner)
    service = create_fresh_service("embed")
    samples = [
        "Metformin activates AMPK",
        "Aspirin reduces inflammation",
        "Metformin turns on the AMPK enzyme",
    ]
    print("\nEmbedding sample texts...")
    vectors = await service.embed_batch(samples)
    for sample, vec in zip(samples, vectors, strict=False):
        print(f" '{sample[:40]}...' -> [{vec[0]:.4f}, {vec[1]:.4f}, ... ] (dim={len(vec)})")
    import numpy as np

    def cosine(a, b) -> float:
        # Cosine similarity = dot product divided by the norms' product.
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    # Texts 0 and 2 are paraphrases of each other; text 1 is unrelated.
    sim_0_2 = cosine(vectors[0], vectors[2])
    sim_0_1 = cosine(vectors[0], vectors[1])
    print(f"\nSimilarity (Metformin AMPK) vs (Metformin turns on AMPK): {sim_0_2:.3f}")
    print(f"Similarity (Metformin AMPK) vs (Aspirin inflammation): {sim_0_1:.3f}")
    print(" -> Semantically similar texts have higher cosine similarity!")
async def demo_semantic_search() -> None:
    """Demo semantic similarity search."""
    banner = "=" * 60
    print("\n" + banner)
    print("2. SEMANTIC SEARCH DEMO")
    print(banner)
    service = create_fresh_service("search")
    # Small corpus to index: (id, text, metadata) triples.
    corpus = (
        ("doc1", "Metformin activates AMPK enzyme in liver cells", {"source": "pubmed"}),
        ("doc2", "Aspirin inhibits COX-2 to reduce inflammation", {"source": "pubmed"}),
        ("doc3", "Statins lower cholesterol by inhibiting HMG-CoA reductase", {"source": "web"}),
        ("doc4", "AMPK activation leads to improved glucose metabolism", {"source": "pubmed"}),
        ("doc5", "Sildenafil works via nitric oxide pathway", {"source": "web"}),
    )
    print("\nIndexing documents...")
    for identifier, text, metadata in corpus:
        await service.add_evidence(identifier, text, metadata)
        print(f" Added: {identifier}")
    # The query shares no exact keywords with doc4, yet it should rank high.
    query = "drugs that activate AMPK"
    print(f"\nSearching for: '{query}'")
    hits = await service.search_similar(query, n_results=3)
    print("\nTop 3 results:")
    rank = 0
    for hit in hits:
        rank += 1
        # Cosine distance: 0 = identical, 2 = opposite; invert for similarity.
        similarity = 1 - hit["distance"]
        print(f" {rank}. [{similarity:.2%} similar] {hit['content'][:60]}...")
async def demo_deduplication() -> None:
    """Demo semantic deduplication."""
    banner = "=" * 60
    print("\n" + banner)
    print("3. SEMANTIC DEDUPLICATION DEMO")
    print(banner)
    # A fresh service keeps earlier demos' vectors out of this run.
    service = create_fresh_service("dedup")
    evidence = create_sample_evidence()
    print(f"\nOriginal evidence count: {len(evidence)}")
    for index, item in enumerate(evidence, 1):
        print(f" {index}. {item.citation.title}")
    print("\nRunning semantic deduplication (threshold=0.85)...")
    unique = await service.deduplicate(evidence, threshold=0.85)
    removed = len(evidence) - len(unique)
    print(f"\nUnique evidence count: {len(unique)}")
    print(f"Removed {removed} semantic duplicates\n")
    for index, item in enumerate(unique, 1):
        print(f" {index}. {item.citation.title}")
    print("\n -> Notice: Papers about 'Metformin AMPK mTOR' were deduplicated!")
    print(" Different titles, same semantic meaning = duplicate removed.")
async def main() -> None:
    """Run all embedding demos."""
    banner = "=" * 60
    print("\n" + banner)
    print("DeepCritical Embeddings Demo (Phase 6)")
    print("Using: sentence-transformers + ChromaDB")
    print(banner)
    # Run the demos sequentially, in presentation order.
    for demo in (demo_embedding, demo_semantic_search, demo_deduplication):
        await demo()
    print("\n" + banner)
    print("Demo complete! Embeddings enable:")
    print(" - Finding papers by MEANING, not just keywords")
    print(" - Removing duplicate findings automatically")
    print(" - Building diverse evidence sets for research")
    print(banner + "\n")
# Script entry point: drive the async demo pipeline with asyncio.run.
if __name__ == "__main__":
    asyncio.run(main())