Bishal Sharma committed on
Commit
48e85cb
·
verified ·
1 Parent(s): 0547b80

Upload 5 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ docs/GeneralBiology.pdf filter=lfs diff=lfs merge=lfs -text
build_vector_store.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # build_vector_store.py
2
+ import os
3
+ import json
4
+ import math
5
+ from pathlib import Path
6
+ from tqdm import tqdm
7
+
8
+ import numpy as np
9
+ import pdfplumber
10
+ from sentence_transformers import SentenceTransformer
11
+ import faiss
12
+
13
+ # --------- CONFIG ----------
14
+ DOCS_DIR = Path("docs")
15
+ DATA_DIR = Path("data")
16
+ EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
17
+ CHUNK_CHAR_SIZE = 1000 # ~400-500 tokens approx (tweak if you want)
18
+ CHUNK_OVERLAP = 200
19
+ EMBED_DIM = 384 # embedding dimension of all-MiniLM-L6-v2
20
+ BATCH_SIZE = 32
21
+ TOP_K = 5
22
+ # ---------------------------
23
+
24
+ DATA_DIR.mkdir(exist_ok=True)
25
+
26
def extract_text_from_pdf(pdf_path: Path):
    """Extract per-page text from *pdf_path*.

    Returns a list of dicts, one per page, each holding the 1-based
    "page_number" and that page's "text" ("" when pdfplumber extracts
    nothing, e.g. for image-only pages).
    """
    extracted = []
    with pdfplumber.open(pdf_path) as document:
        for page_no, page in enumerate(document.pages, start=1):
            extracted.append({
                "page_number": page_no,
                "text": page.extract_text() or "",
            })
    return extracted
33
+
34
def split_text_into_chunks(text, chunk_size=None, overlap=None):
    """Split *text* into overlapping character chunks.

    Tries to end each chunk on a sentence boundary (newline, or one of
    ". ", "? ", "! ") when one occurs in the second half of the window,
    so sentences are not cut mid-way. Consecutive chunks share *overlap*
    characters of context.

    Args:
        text: Raw text to split; surrounding whitespace is stripped.
        chunk_size: Target chunk length in characters
            (defaults to CHUNK_CHAR_SIZE).
        overlap: Characters shared between consecutive chunks
            (defaults to CHUNK_OVERLAP).

    Returns:
        List of non-empty chunk strings; [] for blank input.
    """
    if chunk_size is None:
        chunk_size = CHUNK_CHAR_SIZE
    if overlap is None:
        overlap = CHUNK_OVERLAP

    text = text.strip()
    if not text:
        return []

    chunks = []
    start = 0
    text_len = len(text)
    while start < text_len:
        end = start + chunk_size
        # Try to avoid breaking mid-sentence: cut at the last sentence
        # boundary inside the window, but only if it falls in the second
        # half so chunks do not become too small.
        if end < text_len:
            snippet = text[start:end]
            cut = max(snippet.rfind('\n'), snippet.rfind('. '),
                      snippet.rfind('? '), snippet.rfind('! '))
            if cut != -1 and cut > int(chunk_size * 0.5):
                end = start + cut + 1
        chunk_text = text[start:end].strip()
        if chunk_text:
            chunks.append(chunk_text)
        if end >= text_len:
            break
        # Step forward with overlap. Guard against non-positive progress
        # (e.g. overlap >= chunk_size): the original `start = end - overlap`
        # could move start backwards/nowhere and loop forever.
        next_start = end - overlap
        start = next_start if next_start > start else end
    return chunks
59
+
60
def build_embeddings(model, texts, batch_size=None):
    """Encode *texts* with *model* in fixed-size batches.

    Args:
        model: SentenceTransformer-like object exposing encode() and
            get_sentence_embedding_dimension().
        texts: List of strings to embed.
        batch_size: Texts per encode() call (defaults to BATCH_SIZE);
            keyword parameter added for reuse/testing, backward compatible.

    Returns:
        np.ndarray of shape (len(texts), dim); an empty (0, dim) array
        when *texts* is empty.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    batches = [
        model.encode(texts[i:i + batch_size],
                     show_progress_bar=False, convert_to_numpy=True)
        for i in range(0, len(texts), batch_size)
    ]
    if batches:
        return np.vstack(batches)
    return np.empty((0, model.get_sentence_embedding_dimension()))
69
+
70
def normalize_embeddings(embeddings: np.ndarray) -> np.ndarray:
    """L2-normalize *embeddings* in place and return the same array.

    Unit-length rows make inner-product search (faiss.IndexFlatIP)
    equivalent to cosine similarity.
    """
    # normalize in-place to unit vectors for cosine via inner product index
    faiss.normalize_L2(embeddings)
    return embeddings
74
+
75
def main():
    """Build the FAISS vector store from every PDF under DOCS_DIR.

    Pipeline: extract page text -> chunk -> embed -> L2-normalize ->
    IndexFlatIP -> write data/vector_store.index and data/metadata.json.
    """
    model = SentenceTransformer(EMBED_MODEL)
    embed_dim = model.get_sentence_embedding_dimension()
    print(f"Loaded embed model '{EMBED_MODEL}' with dim={embed_dim}")

    all_text_chunks = []
    metadata = []
    chunk_id = 0

    pdf_files = list(DOCS_DIR.glob("*.pdf"))
    if not pdf_files:
        print("No PDF files found in docs/ — put your PDFs there and re-run.")
        return

    for pdf_path in pdf_files:
        print(f"Processing: {pdf_path.name}")
        for page in extract_text_from_pdf(pdf_path):
            page_text = page["text"]
            if not page_text:
                continue
            for idx, chunk in enumerate(split_text_into_chunks(page_text)):
                metadata.append({
                    "chunk_id": chunk_id,
                    "source_file": pdf_path.name,
                    "page": page["page_number"],
                    "chunk_index_in_page": idx,
                    "text": chunk[:1000],  # store a preview (or store full text if you want)
                })
                all_text_chunks.append(chunk)
                chunk_id += 1

    if not all_text_chunks:
        print("No text extracted from PDFs.")
        return

    print(f"Total chunks: {len(all_text_chunks)}")
    # Compute, then unit-normalize, the chunk embeddings.
    embeddings = build_embeddings(model, all_text_chunks)
    print("Embeddings shape:", embeddings.shape)
    embeddings = normalize_embeddings(embeddings)

    # Inner product over normalized vectors == cosine similarity.
    index = faiss.IndexFlatIP(embed_dim)
    index.add(embeddings.astype('float32'))
    print("FAISS index built. n_total:", index.ntotal)

    # Persist the index and the parallel chunk metadata.
    index_path = DATA_DIR / "vector_store.index"
    faiss.write_index(index, str(index_path))
    meta_path = DATA_DIR / "metadata.json"
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False, indent=2)

    print(f"Saved FAISS index -> {index_path}")
    print(f"Saved metadata -> {meta_path}")

if __name__ == "__main__":
    main()
data/metadata.json ADDED
File without changes
data/vector_store.index ADDED
File without changes
docs/GeneralBiology.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2275f33d8fb2d45789e0cf756944c89a8f88efef2b890f6a4e6949dab3afc87
3
+ size 6654253
query_vector_store.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # query_vector_store.py
2
+ import json
3
+ import numpy as np
4
+ from sentence_transformers import SentenceTransformer
5
+ import faiss
6
+ from pathlib import Path
7
+
8
DATA_DIR = Path("data")  # directory written by build_vector_store.py
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # must match the build-time model
TOP_K = 5  # default number of results returned by search()
11
+
12
def load_index():
    """Load and return the FAISS index written by build_vector_store.py."""
    return faiss.read_index(str(DATA_DIR / "vector_store.index"))
15
+
16
def load_metadata():
    """Return the list of chunk-metadata dicts saved next to the index."""
    meta_path = DATA_DIR / "metadata.json"
    with open(meta_path, "r", encoding="utf-8") as fh:
        return json.load(fh)
19
+
20
def embed_query(model, query):
    """Encode *query* into a unit-length embedding row.

    Normalizing makes inner-product search on IndexFlatIP equivalent to
    cosine similarity (same convention as the build script).
    """
    vec = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(vec)
    return vec
25
+
26
def search(query, top_k=TOP_K):
    """Return the *top_k* stored chunks most similar to *query*.

    Loads the embedding model, FAISS index, and metadata on every call
    (fine for a one-shot CLI; cache them if calling in a loop).

    Returns:
        List of {"score": float, "doc": metadata-dict}, in descending
        similarity order; may hold fewer than *top_k* entries.
    """
    model = SentenceTransformer(EMBED_MODEL)
    index = load_index()
    metadata = load_metadata()

    q_emb = embed_query(model, query)
    D, I = index.search(q_emb.astype('float32'), top_k)  # D: similarities, I: indices

    results = []
    for score, idx in zip(D[0], I[0]):
        # FAISS pads I with -1 when the index holds fewer than top_k
        # vectors; the original metadata[idx] would then silently return
        # the *last* chunk. Skip those placeholder ids.
        if idx < 0:
            continue
        results.append({"score": float(score), "doc": metadata[idx]})
    return results
39
+
40
if __name__ == "__main__":
    # Simple CLI: read one question from stdin, print the top-5 matches.
    question = input("Enter your question/query: ").strip()
    hits = search(question, top_k=5)
    for rank, hit in enumerate(hits, 1):
        doc = hit["doc"]
        print(f"\n=== Result {rank} (score={hit['score']:.4f}) ===")
        print("Source:", doc["source_file"], "page:", doc["page"])
        print("Preview:", doc["text"][:800])