"""
Ingestion script: loads PDF/TXT notes from data/os_notes, splits them into
overlapping chunks, embeds each chunk with intfloat/multilingual-e5-large,
and upserts the vectors into a Pinecone serverless index. Files already
present in the index are skipped.
"""
import os
import torch
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
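
# Read PINECONE_API_KEY (and optionally PINECONE_INDEX) from a local .env file, if present.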
load_dotenv()
# ── Config ───────────────────────────────────────────────────────────────────
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX = os.getenv("PINECONE_INDEX", "study-saathi")
EMBEDDING_MODEL = "intfloat/multilingual-e5-large"
DATA_DIR = "data/os_notes"
CHUNK_SIZE = 512
CHUNK_OVERLAP = 64
BATCH_SIZE = 32
DIMENSION = 1024
# ── Device ───────────────────────────────────────────────────────────────────
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[INFO] Using device: {device}")
# ── Load Embedding Model ─────────────────────────────────────────────────────
print("[INFO] Loading embedding model...")
embedder = SentenceTransformer(EMBEDDING_MODEL, device=device)
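# multilingual-e5-large outputs 1024-dimensional embeddings, matching DIMENSION above.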
# ── Pinecone Setup ───────────────────────────────────────────────────────────
pc = Pinecone(api_key=PINECONE_API_KEY)
if PINECONE_INDEX not in [i.name for i in pc.list_indexes()]:
    print(f"[INFO] Creating Pinecone index: {PINECONE_INDEX}")
    pc.create_index(
        name=PINECONE_INDEX,
        dimension=DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
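    # Note: a newly created serverless index can take a few moments to become ready before it accepts upserts.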
index = pc.Index(PINECONE_INDEX)
# ── Check if file already ingested ───────────────────────────────────────────
def is_already_ingested(filename: str) -> bool:
    """
    Query Pinecone for any vector whose metadata source == filename.
    If found, the file was already ingested → skip it.
    """
    # use a dummy zero vector just to run a metadata filter query
    dummy_vector = [0.0] * DIMENSION
    results = index.query(
        vector=dummy_vector,
        top_k=1,
        include_metadata=True,
        filter={"source": {"$eq": filename}}
    )
    return len(results["matches"]) > 0
# ── Load Documents ───────────────────────────────────────────────────────────
def load_documents(filepath: str, filename: str) -> list:
    if filename.endswith(".pdf"):
        loader = PyPDFLoader(filepath)
    elif filename.endswith(".txt"):
        loader = TextLoader(filepath, encoding="utf-8")
    else:
        return []
    loaded = loader.load()
    topic = os.path.splitext(filename)[0]
    for doc in loaded:
        doc.metadata["topic"] = topic
        doc.metadata["source"] = filename
    print(f"[LOADED] {filename} → {len(loaded)} page(s)")
    return loaded
# ── Chunk Documents ──────────────────────────────────────────────────────────
def chunk_documents(docs: list) -> list:
    # RecursiveCharacterTextSplitter measures chunk_size/chunk_overlap in characters by default.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )
    chunks = splitter.split_documents(docs)
    print(f"[INFO] Total chunks: {len(chunks)}")
    return chunks
# ── Embed & Upsert ───────────────────────────────────────────────────────────
def embed_and_upsert(chunks: list, filename: str):
    # E5-family models expect a "passage: " prefix on documents ("query: " is used at search time).
    texts = [f"passage: {chunk.page_content}" for chunk in chunks]
    print("[INFO] Generating embeddings...")
    all_vectors = []
    for i in tqdm(range(0, len(texts), BATCH_SIZE)):
        batch_texts = texts[i: i + BATCH_SIZE]
        batch_chunks = chunks[i: i + BATCH_SIZE]
        embeddings = embedder.encode(
            batch_texts,
            normalize_embeddings=True,
            show_progress_bar=False
        )
        for j, (emb, chunk) in enumerate(zip(embeddings, batch_chunks)):
            all_vectors.append({
                "id": f"{os.path.splitext(filename)[0]}-chunk-{i + j}",
                "values": emb.tolist(),
                "metadata": {
                    "text": chunk.page_content,
                    "topic": chunk.metadata.get("topic", "unknown"),
                    "source": chunk.metadata.get("source", "unknown"),
                }
            })
    print("[INFO] Upserting to Pinecone...")
    # Upsert in batches of 100 vectors to keep each request well within Pinecone's request size limits.
    for i in tqdm(range(0, len(all_vectors), 100)):
        index.upsert(vectors=all_vectors[i: i + 100])
    print(f"[DONE] Upserted {len(all_vectors)} chunks for '{filename}'.")
# ── Main ─────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    files = [f for f in os.listdir(DATA_DIR) if f.endswith((".pdf", ".txt"))]
    if not files:
        print("[ERROR] No files found in data/os_notes/")
        exit(1)
    print(f"[INFO] Found {len(files)} file(s): {files}\n")
    for filename in files:
        filepath = os.path.join(DATA_DIR, filename)
        # ── SKIP CHECK ─────────────────────────────────────────────────────
        if is_already_ingested(filename):
            print(f"[SKIP] '{filename}' already in Pinecone. Skipping...\n")
            continue
        print(f"[NEW] Processing '{filename}'...")
        docs = load_documents(filepath, filename)
        if not docs:
            print(f"[WARN] Could not load '{filename}'. Skipping.\n")
            continue
        chunks = chunk_documents(docs)
        embed_and_upsert(chunks, filename)
        print()
    print("[ALL DONE] Ingestion complete. Existing embeddings are untouched.")