# File size: 7,382 Bytes
# 0efdc2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#!/usr/bin/env python3
"""
Demo: Semantic Search & Deduplication (Phase 6).

This script demonstrates embedding-based capabilities:
- Text embedding with sentence-transformers
- Semantic similarity search via ChromaDB
- Duplicate detection by meaning (not just URL)

Usage:
    uv run python examples/embeddings_demo/run_embeddings.py

No API keys required - uses local sentence-transformers model.
"""

import asyncio

from src.services.embeddings import EmbeddingService
from src.utils.models import Citation, Evidence


def create_sample_evidence() -> list[Evidence]:
    """Build the fixed evidence set used by the deduplication demo.

    Several entries restate the same metformin/AMPK/mTOR finding in different
    words, so there are near-duplicates by meaning even though every URL differs.

    Returns:
        Five ``Evidence`` items, each wrapping its own ``Citation``.
    """
    # One row per item: (content, source, title, url, date, authors).
    rows = [
        (
            "Metformin activates AMPK which inhibits mTOR signaling pathway.",
            "pubmed",
            "Metformin and AMPK activation",
            "https://pubmed.ncbi.nlm.nih.gov/11111/",
            "2023",
            ["Smith J"],
        ),
        (
            "The drug metformin works by turning on AMPK, blocking the mTOR pathway.",
            "pubmed",
            "AMPK-mTOR axis in diabetes treatment",
            "https://pubmed.ncbi.nlm.nih.gov/22222/",
            "2022",
            ["Jones A"],
        ),
        (
            "Sildenafil increases nitric oxide signaling for vasodilation.",
            "web",
            "How Viagra Works",
            "https://example.com/viagra-mechanism",
            "2023",
            ["WebMD"],
        ),
        (
            "Clinical trials show metformin reduces cancer incidence in diabetic patients.",
            "pubmed",
            "Metformin and cancer prevention",
            "https://pubmed.ncbi.nlm.nih.gov/33333/",
            "2024",
            ["Lee K", "Park S"],
        ),
        (
            "Metformin inhibits mTOR through AMPK activation mechanism.",
            "pubmed",
            "mTOR inhibition by Metformin",
            "https://pubmed.ncbi.nlm.nih.gov/44444/",
            "2023",
            ["Brown M"],
        ),
    ]

    return [
        Evidence(
            content=content,
            citation=Citation(source=source, title=title, url=url, date=date, authors=authors),
        )
        for content, source, title, url, date, authors in rows
    ]


def create_fresh_service(name_suffix: str = "") -> EmbeddingService:
    """Create an EmbeddingService wired to its own uniquely named collection.

    The instance is allocated with ``__new__`` so ``EmbeddingService.__init__``
    does not run; the private ``_model``, ``_client`` and ``_collection``
    attributes are then assigned by hand.

    Args:
        name_suffix: Label embedded in the generated collection name.

    Returns:
        The service instance with the three attributes above populated.
    """
    import uuid

    # Plain function-scope imports instead of ``__import__("...")`` calls: same
    # lazy loading, but readable and visible to linters and type checkers.
    import chromadb
    from sentence_transformers import SentenceTransformer

    # NOTE(review): any other attribute that EmbeddingService.__init__ would set
    # is absent on this instance - confirm the demos only need these three.
    service = EmbeddingService.__new__(EmbeddingService)
    service._model = SentenceTransformer("all-MiniLM-L6-v2")
    service._client = chromadb.Client()

    # Eight random hex characters keep each demo's collection name distinct.
    collection_name = f"demo_{name_suffix}_{uuid.uuid4().hex[:8]}"
    service._collection = service._client.create_collection(
        name=collection_name, metadata={"hnsw:space": "cosine"}
    )
    return service


async def demo_embedding() -> None:
    """Embed three short sentences and print their pairwise cosine similarity.

    Prints a preview of each embedding (first two components and its length),
    then compares the first sentence against a paraphrase and against an
    unrelated sentence.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("1. TEXT EMBEDDING DEMO")
    print(rule)

    embedder = create_fresh_service("embed")

    samples = [
        "Metformin activates AMPK",
        "Aspirin reduces inflammation",
        "Metformin turns on the AMPK enzyme",
    ]

    print("\nEmbedding sample texts...")
    vectors = await embedder.embed_batch(samples)

    for sentence, vec in zip(samples, vectors, strict=False):
        preview = f"[{vec[0]:.4f}, {vec[1]:.4f}, ... ]"
        print(f"  '{sentence[:40]}...' -> {preview} (dim={len(vec)})")

    import numpy as np

    def cosine(a, b):
        """Return the dot product of a and b divided by the product of their norms."""
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    # Sentence 0 vs its paraphrase (2), then vs the unrelated sentence (1).
    paraphrase_score = cosine(vectors[0], vectors[2])
    unrelated_score = cosine(vectors[0], vectors[1])

    print(f"\nSimilarity (Metformin AMPK) vs (Metformin turns on AMPK): {paraphrase_score:.3f}")
    print(f"Similarity (Metformin AMPK) vs (Aspirin inflammation):    {unrelated_score:.3f}")
    print("  -> Semantically similar texts have higher cosine similarity!")


async def demo_semantic_search() -> None:
    """Index five short documents, then print the top three matches for a query."""
    rule = "=" * 60
    print(f"\n{rule}")
    print("2. SEMANTIC SEARCH DEMO")
    print(rule)

    store = create_fresh_service("search")

    # Each entry is (id, text, metadata), passed positionally to add_evidence.
    corpus = [
        ("doc1", "Metformin activates AMPK enzyme in liver cells", {"source": "pubmed"}),
        ("doc2", "Aspirin inhibits COX-2 to reduce inflammation", {"source": "pubmed"}),
        ("doc3", "Statins lower cholesterol by inhibiting HMG-CoA reductase", {"source": "web"}),
        ("doc4", "AMPK activation leads to improved glucose metabolism", {"source": "pubmed"}),
        ("doc5", "Sildenafil works via nitric oxide pathway", {"source": "web"}),
    ]

    print("\nIndexing documents...")
    for entry in corpus:
        await store.add_evidence(*entry)
        print(f"  Added: {entry[0]}")

    query = "drugs that activate AMPK"
    print(f"\nSearching for: '{query}'")

    hits = await store.search_similar(query, n_results=3)

    print("\nTop 3 results:")
    for rank, hit in enumerate(hits, start=1):
        # A smaller distance means a closer match, so report 1 - distance.
        score = 1 - hit["distance"]
        snippet = hit["content"][:60]
        print(f"  {rank}. [{score:.2%} similar] {snippet}...")


async def demo_deduplication() -> None:
    """Run semantic deduplication on the sample evidence and print before/after.

    Lists the titles of the original evidence, deduplicates them through the
    service, then lists the surviving titles and how many items were dropped.
    """
    # Single source of truth: the value printed is always the value passed on.
    threshold = 0.85

    print("\n" + "=" * 60)
    print("3. SEMANTIC DEDUPLICATION DEMO")
    print("=" * 60)

    # Create fresh service for clean demo
    service = create_fresh_service("dedup")

    evidence = create_sample_evidence()
    print(f"\nOriginal evidence count: {len(evidence)}")
    for i, e in enumerate(evidence, 1):
        print(f"  {i}. {e.citation.title}")

    print(f"\nRunning semantic deduplication (threshold={threshold})...")
    unique = await service.deduplicate(evidence, threshold=threshold)

    removed = len(evidence) - len(unique)
    print(f"\nUnique evidence count: {len(unique)}")
    print(f"Removed {removed} semantic duplicates\n")

    for i, e in enumerate(unique, 1):
        print(f"  {i}. {e.citation.title}")

    # Only claim that duplicates were removed when some actually were.
    if removed > 0:
        print("\n  -> Notice: Papers about 'Metformin AMPK mTOR' were deduplicated!")
        print("     Different titles, same semantic meaning = duplicate removed.")
    else:
        print("\n  -> No semantic duplicates were found at this threshold.")


async def main() -> None:
    """Run the three demos in order, framed by an intro and a summary banner."""
    rule = "=" * 60

    print("\n" + rule)
    print("DeepCritical Embeddings Demo (Phase 6)")
    print("Using: sentence-transformers + ChromaDB")
    print(rule)

    # Awaited one after another so each demo's output stays grouped.
    for demo in (demo_embedding, demo_semantic_search, demo_deduplication):
        await demo()

    summary = (
        "Demo complete! Embeddings enable:",
        "  - Finding papers by MEANING, not just keywords",
        "  - Removing duplicate findings automatically",
        "  - Building diverse evidence sets for research",
    )
    print("\n" + rule)
    for line in summary:
        print(line)
    print(rule + "\n")


# Run the demos only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())