Spaces:
Running
Running
| import uuid | |
| import os | |
| from dotenv import load_dotenv | |
| from pinecone import Pinecone | |
| from sentence_transformers import SentenceTransformer | |
# Load environment configuration before any credentials are read below
# (presumably from a local .env file — confirm against python-dotenv setup).
load_dotenv()

# Pinecone client and a handle to the index that ingested chunks are written to.
# The API key comes from the PINECONE_API_KEY environment variable; it is None
# when the variable is unset.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
index = pc.Index("mini-rag-project-1file")

# Sentence-embedding model shared by every ingest call in this module.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
def split_text(text, chunk_size=1200, overlap=15):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks share ``overlap`` words, so each window starts
    ``chunk_size - overlap`` words after the previous one.

    Args:
        text: The text to split. Any run of whitespace separates words.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings with words joined by single spaces. Empty
        or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A step of zero would crash range(); a negative step would silently
        # produce no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window would
        # contain only words already covered by this one, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks
def ingest(text: str, source: str = "user", *, batch_size: int = 100) -> int:
    """Chunk *text*, embed every chunk, and upsert the vectors into the index.

    Each stored record gets a random UUID as its id and carries metadata
    with the ``source`` label, the chunk's ``position`` in the document,
    and the chunk ``text`` itself.

    Args:
        text: Raw document text to ingest.
        source: Label stored in each record's metadata.
        batch_size: Maximum number of vectors sent per upsert request.

    Returns:
        The number of vectors written. Empty or whitespace-only text
        writes nothing and returns 0.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if not text or not text.strip():
        # Nothing to embed; avoid sending an empty encode/upsert request.
        return 0

    # NOTE(review): chunks use split_text's default size (1200 words). The
    # all-MiniLM-L6-v2 model reportedly truncates input beyond 256 word
    # pieces, so long chunks may be only partially embedded — confirm and
    # consider a smaller chunk size.
    chunks = split_text(text)
    embeddings = embed_model.encode(chunks)

    vectors = [
        {
            "id": str(uuid.uuid4()),
            "values": embedding.tolist(),
            "metadata": {
                "source": source,
                "position": position,
                "text": chunk,
            },
        }
        for position, (chunk, embedding) in enumerate(zip(chunks, embeddings))
    ]

    # Send in bounded batches so a long document does not produce a single
    # oversized upsert request.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors[start:start + batch_size])
    return len(vectors)