File size: 1,225 Bytes
109f70a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
import json
import os

from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
# === Load dataset ===
with open("data/coaching_millionaer_dataset.json", "r", encoding="utf-8") as f:
docs = json.load(f)
# === Init embedding model ===
model = SentenceTransformer("./model")
# === Init Pinecone ===
pc = Pinecone(api_key="pcsk_6FCjSE_FFtwDN4PEY5Q7pqKGqGsNgBQrH2Ut9xWcpr3oe1FA28VDPFqei4XtpXMCwb7zdX")
index = pc.Index("ebook")
# === Upload data ===
vectors = []
for i, doc in enumerate(docs):
# Handle multiple possible content keys safely
content = (
doc.get("content")
or doc.get("text")
or doc.get("context")
or doc.get("paragraph")
)
if not content:
print(f"⚠️ Skipping item {i} (no text field found)")
continue
emb = model.encode(content).tolist()
vectors.append((str(i), emb, {"page": doc.get("page"), "context": content}))
# Upload in batches
if len(vectors) >= 100:
index.upsert(vectors=vectors)
print(f"✅ Uploaded {i + 1} documents...")
vectors = []
# Upload remaining
if vectors:
index.upsert(vectors=vectors)
print("🎉 Upload complete! All documents added to Pinecone.")
|