OnlyTheTruth03 committed on
Commit
3688256
Β·
1 Parent(s): 06284fb

ingest fix

Browse files
Files changed (1) hide show
  1. src/ingest.py +31 -51
src/ingest.py CHANGED
@@ -1,76 +1,56 @@
1
- # src/ingest.py
2
- import pickle
3
- from pathlib import Path
4
-
5
  import faiss
6
- import pdfplumber
7
- from datasets import load_dataset
8
  from sentence_transformers import SentenceTransformer
9
-
10
  from config import (
11
- DATASET_NAME,
12
  FAISS_INDEX_PATH,
13
  DOCS_PATH,
14
  CHUNK_SIZE,
15
  CHUNK_OVERLAP,
16
  )
17
 
18
-
19
- def chunk_text(text: str):
20
- chunks = []
21
- start = 0
22
-
23
- while start < len(text):
24
- end = start + CHUNK_SIZE
25
- chunks.append(text[start:end])
26
- start = end - CHUNK_OVERLAP
27
-
28
- return chunks
29
-
30
-
31
def build_index():
    """Build a FAISS L2 index over text chunks extracted from the PDFs
    in the configured HF dataset.

    Side effects:
        Writes the FAISS index to FAISS_INDEX_PATH (creating parent
        directories as needed) and pickles the list of chunk strings
        to DOCS_PATH.

    Raises:
        RuntimeError: If no text could be extracted from any PDF.
    """
    import numpy as np  # local import: only needed at index-build time

    print("📥 Loading HF dataset...")
    dataset = load_dataset(DATASET_NAME, split="train")

    embedder = SentenceTransformer("all-MiniLM-L6-v2")

    documents = []

    print(f"📄 Dataset rows: {len(dataset)}")

    for row_idx, row in enumerate(dataset):
        pdf_obj = row["pdf"]

        # The dataset materializes each PDF on disk; .path points at
        # the local file pdfplumber can open directly.
        pdf_path = pdf_obj.path

        print(f"➡️ Processing PDF {row_idx + 1}: {pdf_path}")

        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if not text:
                    continue
                documents.extend(chunk_text(text))

    if not documents:
        raise RuntimeError("❌ No text extracted from PDFs")

    print(f"🧠 Creating FAISS index with {len(documents)} chunks")

    # Encode all chunks in a single batched call (far faster than one
    # encode() per chunk) and convert to a contiguous float32 matrix:
    # faiss.IndexFlatL2.add() requires a 2-D float32 numpy array, not a
    # Python list of per-chunk vectors.
    embeddings = np.asarray(embedder.encode(documents), dtype="float32")

    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)

    FAISS_INDEX_PATH.parent.mkdir(parents=True, exist_ok=True)

    faiss.write_index(index, str(FAISS_INDEX_PATH))

    with open(DOCS_PATH, "wb") as f:
        pickle.dump(documents, f)

    print("✅ FAISS index built successfully")
 
1
+ import numpy as np
 
 
 
2
  import faiss
3
+ import pickle
 
4
  from sentence_transformers import SentenceTransformer
 
5
  from config import (
 
6
  FAISS_INDEX_PATH,
7
  DOCS_PATH,
8
  CHUNK_SIZE,
9
  CHUNK_OVERLAP,
10
  )
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
def build_index(dataset_name: str = "OnlyTheTruth03/ott"):
    """Build a FAISS L2 index over text chunks extracted from the PDFs
    stored in a Hugging Face dataset.

    Args:
        dataset_name: HF dataset repo to load; defaults to the project
            dataset so existing callers are unaffected.

    Returns:
        Tuple of (faiss index, list of chunk strings).

    Side effects:
        Writes the index to FAISS_INDEX_PATH (creating parent
        directories as needed) and pickles the chunks to DOCS_PATH.

    Raises:
        ValueError: If CHUNK_OVERLAP >= CHUNK_SIZE (chunking would
            never advance).
        RuntimeError: If no text could be extracted from any PDF.
    """
    from datasets import load_dataset
    from pypdf import PdfReader
    import io

    dataset = load_dataset(dataset_name, split="train")

    # -------- PDF text extraction --------
    texts = []

    for row in dataset:
        pdf_obj = row["pdf"]  # HF auto parquet object; assumes {"bytes": ...} — TODO confirm schema
        pdf_bytes = pdf_obj["bytes"]

        reader = PdfReader(io.BytesIO(pdf_bytes))
        for page in reader.pages:
            text = page.extract_text()
            if text:
                texts.append(text)

    # -------- Chunking --------
    step = CHUNK_SIZE - CHUNK_OVERLAP
    if step <= 0:
        # Without this guard the original while-loop never advanced.
        raise ValueError("CHUNK_OVERLAP must be smaller than CHUNK_SIZE")

    chunks = []
    for text in texts:
        for start in range(0, len(text), step):
            chunks.append(text[start:start + CHUNK_SIZE])

    # Guard restored from the previous version: without it,
    # embeddings.shape[1] below fails with an opaque IndexError when
    # nothing could be extracted.
    if not chunks:
        raise RuntimeError("❌ No text extracted from PDFs")

    # -------- Embeddings --------
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(chunks, show_progress_bar=True)

    # faiss.IndexFlatL2.add() requires a contiguous 2-D float32 array.
    embeddings = np.array(embeddings).astype("float32")

    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)

    # Restored from the previous version: ensure the output directory
    # exists before faiss tries to write into it.
    FAISS_INDEX_PATH.parent.mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, str(FAISS_INDEX_PATH))

    with open(DOCS_PATH, "wb") as f:
        pickle.dump(chunks, f)

    return index, chunks