File size: 6,812 Bytes
0733aae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
"""
Knowledge Universe β€” Embedding Adapter
Produces vector embeddings for RAG pipeline consumption.
Uses local sentence-transformers (all-MiniLM-L6-v2) β€” no OpenAI API needed.

Output is a KnowledgeObject with embedding field populated,
ready to be inserted into any vector store (Qdrant, Weaviate, Chroma, Pinecone).
"""

import logging
from typing import Any, Dict, List, Optional
from datetime import datetime

from src.api.models import Source, KnowledgeObject
from src.format_adapters.base_adapter import BaseFormatAdapter

logger = logging.getLogger(__name__)

# Lazy-loaded model (first call loads it, subsequent calls reuse)
_model = None


def _get_model():
    global _model
    if _model is None:
        try:
            from sentence_transformers import SentenceTransformer
            _model = SentenceTransformer("all-MiniLM-L6-v2")
            logger.info("SentenceTransformer loaded: all-MiniLM-L6-v2")
        except Exception as e:
            logger.error(f"Failed to load embedding model: {e}")
            raise
    return _model


class EmbeddingAdapter(BaseFormatAdapter):
    """
    Converts sources to KnowledgeObjects with vector embeddings.

    The embedding encodes: title + summary + tags + platform
    Dimensionality: 384 (all-MiniLM-L6-v2)

    Ready for: Qdrant, Weaviate, ChromaDB, Pinecone, pgvector
    """

    def transform(self, source: Source) -> KnowledgeObject:
        embedding = self._embed(source)

        return KnowledgeObject(
            source_id=source.id,
            title=source.title,
            url=source.url,
            platform=source.source_platform,
            format=source.formats[0] if source.formats else None,

            quality_score=source.quality_score,
            pedagogical_fit=source.pedagogical_fit,
            freshness_score=self._freshness_score(source),

            authors=source.authors,
            publication_date=source.publication_date,
            license=source.license,
            open_access=source.open_access,

            embedding=embedding,

            summary=source.summary,
            tags=source.tags,
        )

    def transform_many(self, sources: list) -> list:
        """Batch embed β€” more efficient than one-by-one."""
        if not sources:
            return []

        texts = [self._source_to_text(s) for s in sources]

        try:
            model = _get_model()
            embeddings = model.encode(texts, convert_to_numpy=True)
            embeddings_list = embeddings.tolist()
        except Exception as e:
            logger.error(f"Batch embedding failed: {e}")
            embeddings_list = [None] * len(sources)

        results = []
        for source, embedding in zip(sources, embeddings_list):
            obj = KnowledgeObject(
                source_id=source.id,
                title=source.title,
                url=source.url,
                platform=source.source_platform,
                format=source.formats[0] if source.formats else None,

                quality_score=source.quality_score,
                pedagogical_fit=source.pedagogical_fit,
                freshness_score=self._freshness_score(source),

                authors=source.authors,
                publication_date=source.publication_date,
                license=source.license,
                open_access=source.open_access,

                embedding=embedding,
                summary=source.summary,
                tags=source.tags,
            )
            results.append(obj)

        return results

    def _embed(self, source: Source) -> Optional[List[float]]:
        try:
            model = _get_model()
            text = self._source_to_text(source)
            vector = model.encode(text, convert_to_numpy=True)
            return vector.tolist()
        except Exception as e:
            logger.error(f"Embedding failed for {source.id}: {e}")
            return None

    def _source_to_text(self, source: Source) -> str:
        """Concatenate fields that describe the source's semantic content."""
        parts = [
            source.title,
            source.summary[:300] if source.summary else "",
            " ".join(source.tags[:10]),
            source.source_platform,
            " ".join(f.value for f in source.formats),
            " ".join(source.authors[:2]),
        ]
        return " | ".join(p for p in parts if p)

    def _freshness_score(self, source: Source) -> float:
        """Compute 0-1 freshness score from publication date."""
        if not source.publication_date:
            return 0.5
        try:
            days = (datetime.now(source.publication_date.tzinfo) - source.publication_date).days
            if days < 180:   return 1.0
            if days < 720:   return 0.8
            if days < 1800:  return 0.6
            return 0.4
        except Exception:
            return 0.5


# ── Qdrant upload helper (optional, call from scripts) ─────────────────────

def upload_to_qdrant(
    knowledge_objects: List[KnowledgeObject],
    collection: str = "knowledge_universe",
    qdrant_url: str = "http://localhost:6333",
):
    """
    Upload KnowledgeObjects to a local Qdrant instance.
    Run Qdrant with: docker run -p 6333:6333 qdrant/qdrant

    This is completely free β€” no cloud dependency.
    """
    try:
        from qdrant_client import QdrantClient
        from qdrant_client.models import PointStruct, Distance, VectorParams

        client = QdrantClient(url=qdrant_url)

        # Create collection if needed
        try:
            client.create_collection(
                collection_name=collection,
                vectors_config=VectorParams(size=384, distance=Distance.COSINE),
            )
        except Exception:
            pass   # already exists

        points = [
            PointStruct(
                id=abs(hash(obj.source_id)) % (2**32),
                vector=obj.embedding,
                payload={
                    "source_id": obj.source_id,
                    "title": obj.title,
                    "url": obj.url,
                    "platform": obj.platform,
                    "quality_score": obj.quality_score,
                    "summary": obj.summary,
                    "tags": obj.tags,
                },
            )
            for obj in knowledge_objects
            if obj.embedding
        ]

        client.upsert(collection_name=collection, points=points)
        logger.info(f"Uploaded {len(points)} vectors to Qdrant collection '{collection}'")
        return len(points)

    except ImportError:
        logger.error("qdrant-client not installed. Run: pip install qdrant-client")
        return 0
    except Exception as e:
        logger.error(f"Qdrant upload failed: {e}")
        return 0