Hitan2004 commited on
Commit
a1aac98
Β·
1 Parent(s): d713e90

Update ingestion.py

Browse files
Files changed (1) hide show
  1. ingestion.py +9 -47
ingestion.py CHANGED
@@ -8,13 +8,11 @@ from rank_bm25 import BM25Okapi
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
  from config import (
10
  DOCS_DIR, FAISS_INDEX_PATH, BM25_PATH,
11
- CHUNKS_PATH, SOURCES_PATH, EMBEDDER_PATH,
12
  EMBEDDER_NAME, CHUNK_SIZE, CHUNK_OVERLAP
13
  )
14
 
15
- # ─────────────────────────────────────────────────────────────
16
- # Better PDF extraction (IMPORTANT)
17
- # ─────────────────────────────────────────────────────────────
18
  def read_pdf_text(fpath):
19
  import fitz # PyMuPDF
20
  doc = fitz.open(fpath)
@@ -24,38 +22,27 @@ def read_pdf_text(fpath):
24
  return "\n".join(text).strip()
25
 
26
 
27
- # ─────────────────────────────────────────────────────────────
28
- # Clean text (removes weird spacing)
29
- # ─────────────────────────────────────────────────────────────
30
  def clean_text(text):
31
  return " ".join(text.split())
32
 
33
 
34
- # ─────────────────────────────────────────────────────────────
35
- # Load documents
36
- # ─────────────────────────────────────────────────────────────
37
  def load_documents():
38
  docs, filenames = [], []
39
  path = Path(DOCS_DIR)
40
  path.mkdir(exist_ok=True)
41
 
42
- # Load TXT files
43
  for fpath in path.glob("*.txt"):
44
  try:
45
- text = fpath.read_text(encoding="utf-8")
46
- text = clean_text(text)
47
  docs.append(text)
48
  filenames.append(fpath.name)
49
  print(f" Loaded text: {fpath.name}")
50
  except Exception as e:
51
  print(f" Skipped {fpath.name}: {e}")
52
 
53
- # Load PDF files (using PyMuPDF)
54
  for fpath in path.glob("*.pdf"):
55
  try:
56
- text = read_pdf_text(fpath)
57
- text = clean_text(text)
58
-
59
  if text:
60
  docs.append(text)
61
  filenames.append(fpath.name)
@@ -75,9 +62,6 @@ def load_documents():
75
  return docs, filenames
76
 
77
 
78
- # ─────────────────────────────────────────────────────────────
79
- # Chunking (optimized for resumes)
80
- # ─────────────────────────────────────────────────────────────
81
  def semantic_chunk(docs, filenames):
82
  splitter = RecursiveCharacterTextSplitter(
83
  chunk_size=CHUNK_SIZE,
@@ -86,7 +70,6 @@ def semantic_chunk(docs, filenames):
86
  )
87
 
88
  all_chunks, all_sources = [], []
89
-
90
  for doc, fname in zip(docs, filenames):
91
  chunks = splitter.split_text(doc)
92
  all_chunks.extend(chunks)
@@ -94,8 +77,6 @@ def semantic_chunk(docs, filenames):
94
 
95
  print(f"Created {len(all_chunks)} chunks "
96
  f"(avg {sum(len(c) for c in all_chunks)//len(all_chunks)} chars each)")
97
-
98
- # Debug: show sample chunk
99
  print("\n--- SAMPLE CHUNK ---")
100
  print(all_chunks[0][:500])
101
  print("--------------------\n")
@@ -103,66 +84,47 @@ def semantic_chunk(docs, filenames):
103
  return all_chunks, all_sources
104
 
105
 
106
- # ─────────────────────────────────────────────────────────────
107
- # Build indexes
108
- # ─────────────────────────────────────────────────────────────
109
  def build_indexes(chunks):
110
  print("\nBuilding dense embeddings...")
111
 
112
  model = SentenceTransformer(EMBEDDER_NAME)
113
  embeddings = model.encode(chunks, show_progress_bar=True, batch_size=32)
114
-
115
  embeddings = np.array(embeddings, dtype="float32")
116
  faiss.normalize_L2(embeddings)
117
 
118
  dim = embeddings.shape[1]
119
  faiss_index = faiss.IndexFlatIP(dim)
120
  faiss_index.add(embeddings)
121
-
122
  print(f"FAISS index: {faiss_index.ntotal} vectors, dim={dim}")
123
 
124
  tokenized = [c.lower().split() for c in chunks]
125
  bm25_index = BM25Okapi(tokenized)
126
-
127
  print("BM25 index: built")
128
 
129
- return faiss_index, bm25_index, model
130
 
131
 
132
- # ────────────────────────────────��────────────────────────────
133
- # Save everything
134
- # ─────────────────────────────────────────────────────────────
135
- def save_indexes(faiss_index, bm25_index, chunks, sources, model):
136
  faiss.write_index(faiss_index, FAISS_INDEX_PATH)
137
 
138
  with open(BM25_PATH, "wb") as f:
139
  pickle.dump(bm25_index, f)
140
-
141
  with open(CHUNKS_PATH, "wb") as f:
142
  pickle.dump(chunks, f)
143
-
144
  with open(SOURCES_PATH, "wb") as f:
145
  pickle.dump(sources, f)
146
 
147
- model.save(EMBEDDER_PATH)
148
-
149
  print("\nSaved indexes to disk.")
150
 
151
 
152
- # ─────────────────────────────────────────────────────────────
153
- # Main runner
154
- # ─────────────────────────────────────────────────────────────
155
  def run_ingestion():
156
  print("=== Starting ingestion ===\n")
157
-
158
  docs, filenames = load_documents()
159
  chunks, sources = semantic_chunk(docs, filenames)
160
-
161
- fi, bm25, model = build_indexes(chunks)
162
- save_indexes(fi, bm25, chunks, sources, model)
163
-
164
  print("\n=== Ingestion complete ===")
165
 
166
 
167
  if __name__ == "__main__":
168
- run_ingestion()
 
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
  from config import (
10
  DOCS_DIR, FAISS_INDEX_PATH, BM25_PATH,
11
+ CHUNKS_PATH, SOURCES_PATH,
12
  EMBEDDER_NAME, CHUNK_SIZE, CHUNK_OVERLAP
13
  )
14
 
15
+
 
 
16
  def read_pdf_text(fpath):
17
  import fitz # PyMuPDF
18
  doc = fitz.open(fpath)
 
22
  return "\n".join(text).strip()
23
 
24
 
 
 
 
25
  def clean_text(text):
26
  return " ".join(text.split())
27
 
28
 
 
 
 
29
  def load_documents():
30
  docs, filenames = [], []
31
  path = Path(DOCS_DIR)
32
  path.mkdir(exist_ok=True)
33
 
 
34
  for fpath in path.glob("*.txt"):
35
  try:
36
+ text = clean_text(fpath.read_text(encoding="utf-8"))
 
37
  docs.append(text)
38
  filenames.append(fpath.name)
39
  print(f" Loaded text: {fpath.name}")
40
  except Exception as e:
41
  print(f" Skipped {fpath.name}: {e}")
42
 
 
43
  for fpath in path.glob("*.pdf"):
44
  try:
45
+ text = clean_text(read_pdf_text(fpath))
 
 
46
  if text:
47
  docs.append(text)
48
  filenames.append(fpath.name)
 
62
  return docs, filenames
63
 
64
 
 
 
 
65
  def semantic_chunk(docs, filenames):
66
  splitter = RecursiveCharacterTextSplitter(
67
  chunk_size=CHUNK_SIZE,
 
70
  )
71
 
72
  all_chunks, all_sources = [], []
 
73
  for doc, fname in zip(docs, filenames):
74
  chunks = splitter.split_text(doc)
75
  all_chunks.extend(chunks)
 
77
 
78
  print(f"Created {len(all_chunks)} chunks "
79
  f"(avg {sum(len(c) for c in all_chunks)//len(all_chunks)} chars each)")
 
 
80
  print("\n--- SAMPLE CHUNK ---")
81
  print(all_chunks[0][:500])
82
  print("--------------------\n")
 
84
  return all_chunks, all_sources
85
 
86
 
 
 
 
87
  def build_indexes(chunks):
88
  print("\nBuilding dense embeddings...")
89
 
90
  model = SentenceTransformer(EMBEDDER_NAME)
91
  embeddings = model.encode(chunks, show_progress_bar=True, batch_size=32)
 
92
  embeddings = np.array(embeddings, dtype="float32")
93
  faiss.normalize_L2(embeddings)
94
 
95
  dim = embeddings.shape[1]
96
  faiss_index = faiss.IndexFlatIP(dim)
97
  faiss_index.add(embeddings)
 
98
  print(f"FAISS index: {faiss_index.ntotal} vectors, dim={dim}")
99
 
100
  tokenized = [c.lower().split() for c in chunks]
101
  bm25_index = BM25Okapi(tokenized)
 
102
  print("BM25 index: built")
103
 
104
+ return faiss_index, bm25_index # model not returned β€” HuggingFace caches it
105
 
106
 
107
+ def save_indexes(faiss_index, bm25_index, chunks, sources):
 
 
 
108
  faiss.write_index(faiss_index, FAISS_INDEX_PATH)
109
 
110
  with open(BM25_PATH, "wb") as f:
111
  pickle.dump(bm25_index, f)
 
112
  with open(CHUNKS_PATH, "wb") as f:
113
  pickle.dump(chunks, f)
 
114
  with open(SOURCES_PATH, "wb") as f:
115
  pickle.dump(sources, f)
116
 
 
 
117
  print("\nSaved indexes to disk.")
118
 
119
 
 
 
 
120
  def run_ingestion():
121
  print("=== Starting ingestion ===\n")
 
122
  docs, filenames = load_documents()
123
  chunks, sources = semantic_chunk(docs, filenames)
124
+ fi, bm25 = build_indexes(chunks)
125
+ save_indexes(fi, bm25, chunks, sources)
 
 
126
  print("\n=== Ingestion complete ===")
127
 
128
 
129
  if __name__ == "__main__":
130
+ run_ingestion()