Bishal Sharma
committed on
Upload 5 files
- .gitattributes +1 -0
- build_vector_store.py +137 -0
- data/metadata.json +0 -0
- data/vector_store.index +0 -0
- docs/GeneralBiology.pdf +3 -0
- query_vector_store.py +46 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+docs/GeneralBiology.pdf filter=lfs diff=lfs merge=lfs -text
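
The added attribute routes the PDF through Git LFS, so the repository stores only a small pointer stub (visible in the docs/GeneralBiology.pdf hunk below) rather than the 6.6 MB binary. Since build_vector_store.py needs the real PDF, a checkout that skipped `git lfs pull` would hand pdfplumber the stub instead. A minimal sketch of a guard for that case; the helper is hypothetical and not part of this commit:

# check_lfs_pointer.py (hypothetical helper, NOT part of this commit)
# An un-smudged Git LFS file is a tiny text stub beginning with the spec
# line; the real bytes live in LFS storage until `git lfs pull` runs.
from pathlib import Path

def is_lfs_pointer(path: Path) -> bool:
    """Return True if the file on disk is still an LFS pointer stub."""
    try:
        head = path.read_bytes()[:100].decode("utf-8", errors="ignore")
    except OSError:
        return False
    return head.startswith("version https://git-lfs.github.com/spec/v1")

if __name__ == "__main__":
    print(is_lfs_pointer(Path("docs/GeneralBiology.pdf")))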
build_vector_store.py ADDED
@@ -0,0 +1,137 @@
+# build_vector_store.py
+import os
+import json
+import math
+from pathlib import Path
+from tqdm import tqdm
+
+import numpy as np
+import pdfplumber
+from sentence_transformers import SentenceTransformer
+import faiss
+
+# --------- CONFIG ----------
+DOCS_DIR = Path("docs")
+DATA_DIR = Path("data")
+EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+CHUNK_CHAR_SIZE = 1000  # ~400-500 tokens approx (tweak if you want)
+CHUNK_OVERLAP = 200
+EMBED_DIM = 384  # embedding dimension of all-MiniLM-L6-v2
+BATCH_SIZE = 32
+TOP_K = 5
+# ---------------------------
+
+DATA_DIR.mkdir(exist_ok=True)
+
+def extract_text_from_pdf(pdf_path: Path):
+    pages = []
+    with pdfplumber.open(pdf_path) as pdf:
+        for i, page in enumerate(pdf.pages):
+            text = page.extract_text() or ""
+            pages.append({"page_number": i+1, "text": text})
+    return pages
+
+def split_text_into_chunks(text, chunk_size=CHUNK_CHAR_SIZE, overlap=CHUNK_OVERLAP):
+    text = text.strip()
+    if not text:
+        return []
+    chunks = []
+    start = 0
+    text_len = len(text)
+    while start < text_len:
+        end = start + chunk_size
+        # try to avoid breaking mid-sentence: find last newline or period inside chunk
+        if end < text_len:
+            snippet = text[start:end]
+            # prefer last sentence boundary
+            cut = max(snippet.rfind('\n'), snippet.rfind('. '), snippet.rfind('? '), snippet.rfind('! '))
+            if cut != -1 and cut > int(chunk_size * 0.5):
+                end = start + cut + 1
+        chunk_text = text[start:end].strip()
+        if chunk_text:
+            chunks.append(chunk_text)
+        start = end - overlap
+        if start < 0:
+            start = 0
+        if end >= text_len:
+            break
+    return chunks
+
+def build_embeddings(model, texts):
+    embeddings = []
+    for i in range(0, len(texts), BATCH_SIZE):
+        batch = texts[i:i+BATCH_SIZE]
+        embs = model.encode(batch, show_progress_bar=False, convert_to_numpy=True)
+        embeddings.append(embs)
+    if embeddings:
+        return np.vstack(embeddings)
+    return np.empty((0, model.get_sentence_embedding_dimension()))
+
+def normalize_embeddings(embeddings: np.ndarray):
+    # normalize in-place to unit vectors for cosine via inner product index
+    faiss.normalize_L2(embeddings)
+    return embeddings
+
+def main():
+    model = SentenceTransformer(EMBED_MODEL)
+    EMBED_DIM_LOCAL = model.get_sentence_embedding_dimension()
+    print(f"Loaded embed model '{EMBED_MODEL}' with dim={EMBED_DIM_LOCAL}")
+
+    all_text_chunks = []
+    metadata = []
+
+    chunk_id = 0
+    pdf_files = list(DOCS_DIR.glob("*.pdf"))
+    if not pdf_files:
+        print("No PDF files found in docs/ — put your PDFs there and re-run.")
+        return
+
+    for pdf_path in pdf_files:
+        print(f"Processing: {pdf_path.name}")
+        pages = extract_text_from_pdf(pdf_path)
+        for page in pages:
+            page_text = page["text"]
+            if not page_text:
+                continue
+            chunks = split_text_into_chunks(page_text)
+            for i, c in enumerate(chunks):
+                doc_meta = {
+                    "chunk_id": chunk_id,
+                    "source_file": pdf_path.name,
+                    "page": page["page_number"],
+                    "chunk_index_in_page": i,
+                    "text": c[:1000]  # store a preview (or store full text if you want)
+                }
+                metadata.append(doc_meta)
+                all_text_chunks.append(c)
+                chunk_id += 1
+
+    if not all_text_chunks:
+        print("No text extracted from PDFs.")
+        return
+
+    print(f"Total chunks: {len(all_text_chunks)}")
+    # compute embeddings
+    embeddings = build_embeddings(model, all_text_chunks)
+    print("Embeddings shape:", embeddings.shape)
+
+    # normalize
+    embeddings = normalize_embeddings(embeddings)
+
+    # build FAISS index (inner-product on normalized vectors == cosine sim)
+    index = faiss.IndexFlatIP(EMBED_DIM_LOCAL)
+    index.add(embeddings.astype('float32'))
+    print("FAISS index built. n_total:", index.ntotal)
+
+    # save index and metadata
+    index_path = DATA_DIR / "vector_store.index"
+    faiss.write_index(index, str(index_path))
+    meta_path = DATA_DIR / "metadata.json"
+    with open(meta_path, "w", encoding="utf-8") as f:
+        json.dump(metadata, f, ensure_ascii=False, indent=2)
+
+    print(f"Saved FAISS index -> {index_path}")
+    print(f"Saved metadata -> {meta_path}")
+
+if __name__ == "__main__":
+    main()
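
The script writes two artifacts that must stay in lockstep: FAISS row i corresponds to metadata[i], because chunk_id is assigned in the same order chunks are added to the index. A quick post-build sanity check, sketched below under the assumption that both files exist under data/ (this script is hypothetical, not part of the commit):

# verify_store.py (hypothetical post-build check, not part of this commit)
import json

import faiss

index = faiss.read_index("data/vector_store.index")
with open("data/metadata.json", encoding="utf-8") as f:
    metadata = json.load(f)

# FAISS row i must line up with metadata[i]; a count mismatch means the
# two files came from different builds.
assert index.ntotal == len(metadata), "index and metadata are out of sync"
print(f"{index.ntotal} chunks indexed; first chunk: "
      f"{metadata[0]['source_file']} page {metadata[0]['page']}")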
data/metadata.json ADDED
File without changes

data/vector_store.index ADDED
File without changes
docs/GeneralBiology.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2275f33d8fb2d45789e0cf756944c89a8f88efef2b890f6a4e6949dab3afc87
+size 6654253
query_vector_store.py ADDED
@@ -0,0 +1,46 @@
+# query_vector_store.py
+import json
+import numpy as np
+from sentence_transformers import SentenceTransformer
+import faiss
+from pathlib import Path
+
+DATA_DIR = Path("data")
+EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+TOP_K = 5
+
+def load_index():
+    index = faiss.read_index(str(DATA_DIR / "vector_store.index"))
+    return index
+
+def load_metadata():
+    with open(DATA_DIR / "metadata.json", "r", encoding="utf-8") as f:
+        return json.load(f)
+
+def embed_query(model, query):
+    emb = model.encode([query], convert_to_numpy=True)
+    # normalize for cosine with IndexFlatIP
+    faiss.normalize_L2(emb)
+    return emb
+
+def search(query, top_k=TOP_K):
+    model = SentenceTransformer(EMBED_MODEL)
+    index = load_index()
+    metadata = load_metadata()
+
+    q_emb = embed_query(model, query)
+    D, I = index.search(q_emb.astype('float32'), top_k)  # D: similarities, I: indices
+
+    results = []
+    for score, idx in zip(D[0], I[0]):
+        meta = metadata[idx]
+        results.append({"score": float(score), "doc": meta})
+    return results
+
+if __name__ == "__main__":
+    q = input("Enter your question/query: ").strip()
+    res = search(q, top_k=5)
+    for i, r in enumerate(res, 1):
+        print(f"\n=== Result {i} (score={r['score']:.4f}) ===")
+        print("Source:", r["doc"]["source_file"], "page:", r["doc"]["page"])
+        print("Preview:", r["doc"]["text"][:800])
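
Beyond the CLI prompt, search() can also be called directly, e.g. from a notebook or a RAG pipeline. Note that it reloads the SentenceTransformer model and the FAISS index on every call, so a caller issuing many queries would want to hoist those loads out of the loop. A short sketch of the return shape (the caller script and query string are illustrative, not part of this commit):

# Hypothetical caller, not part of this commit.
from query_vector_store import search

results = search("What are the stages of mitosis?", top_k=3)
for r in results:
    # Each result pairs a cosine similarity with the stored chunk metadata.
    print(f"{r['score']:.3f}  {r['doc']['source_file']}  p.{r['doc']['page']}")
    print(r["doc"]["text"][:200])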