OnlyTheTruth03 committed on
Commit
07a5b4f
Β·
1 Parent(s): 11a6288

Parquet dataset correction

Browse files
Files changed (1) hide show
  1. src/ingest.py +9 -18
src/ingest.py CHANGED
@@ -2,11 +2,10 @@
2
  import os
3
  import pickle
4
  import faiss
5
- from pypdf import PdfReader
6
  from datasets import load_dataset
 
7
  from sentence_transformers import SentenceTransformer
8
 
9
- # ---------------- CONFIG ----------------
10
  DATASET_NAME = "OnlyTheTruth03/ott"
11
  INDEX_DIR = "src/index"
12
 
@@ -16,46 +15,38 @@ embedder = SentenceTransformer("all-MiniLM-L6-v2")
16
 
17
 
18
def build_index():
    """Extract text from the dataset's PDF(s) and build a FAISS L2 index.

    Loads the HF dataset, reads every page of each PDF via pypdf, embeds
    the non-empty pages with the module-level ``embedder``, and writes the
    resulting index to ``{INDEX_DIR}/faiss.index``.
    """
    print("📥 Loading dataset from Hugging Face...")
    dataset = load_dataset(DATASET_NAME, split="train")

    print("📄 Dataset columns:", dataset.column_names)

    # The PDF lives in the first column; look it up once.
    pdf_column = dataset.column_names[0]
    documents = []

    for row in dataset:
        pdf_obj = row[pdf_column]

        # The HF Pdf feature yields an object exposing a local `.path`.
        if not pdf_obj or not hasattr(pdf_obj, "path"):
            raise RuntimeError("❌ Invalid PDF object from HF dataset")

        pdf_path = pdf_obj.path
        print(f"📂 Reading PDF from {pdf_path}")

        for page_index, pdf_page in enumerate(PdfReader(pdf_path).pages, start=1):
            page_text = pdf_page.extract_text()
            # Skip pages with no extractable text (e.g. image-only pages).
            if page_text:
                documents.append({"text": page_text.strip(), "page": page_index})

    if not documents:
        raise RuntimeError("❌ No text extracted from PDF")

    print(f"✂️ Extracted {len(documents)} text chunks")

    texts = [entry["text"] for entry in documents]
    embeddings = embedder.encode(texts, show_progress_bar=True).astype("float32")

    print("📦 Building FAISS index...")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    faiss.write_index(index, f"{INDEX_DIR}/faiss.index")
 
2
  import os
3
  import pickle
4
  import faiss
 
5
  from datasets import load_dataset
6
+ from pypdf import PdfReader
7
  from sentence_transformers import SentenceTransformer
8
 
 
9
  DATASET_NAME = "OnlyTheTruth03/ott"
10
  INDEX_DIR = "src/index"
11
 
 
15
 
16
 
17
def build_index():
    """Extract text from the dataset's PDF(s) and build a FAISS L2 index.

    Loads the HF dataset, reads every page of each PDF with pypdf, embeds
    the non-empty pages with the module-level ``embedder``, and writes the
    index to ``{INDEX_DIR}/faiss.index``.

    Raises:
        RuntimeError: if a row does not carry a usable PDF object, or if
            no text could be extracted from any page.
    """
    print("📥 Loading HF dataset...")
    dataset = load_dataset(DATASET_NAME, split="train")

    documents = []

    for row in dataset:
        # HF auto-parquet PDF object; assumed to expose a local `.path`
        # (pdf feature of `datasets`) — validated below.
        pdf_obj = row[dataset.column_names[0]]

        # Fail fast with a clear message instead of an AttributeError.
        if not pdf_obj or not hasattr(pdf_obj, "path"):
            raise RuntimeError("❌ Invalid PDF object from HF dataset")

        pdf_path = pdf_obj.path

        print(f"📄 Reading PDF from: {pdf_path}")
        reader = PdfReader(pdf_path)

        for page_no, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            # Skip pages with no extractable text (e.g. image-only pages).
            if not text:
                continue

            documents.append({
                "text": text.strip(),
                "page": page_no
            })

    if not documents:
        raise RuntimeError("❌ No text extracted from PDF")

    texts = [d["text"] for d in documents]
    # FAISS requires float32 vectors.
    embeddings = embedder.encode(texts).astype("float32")

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # Ensure the output directory exists; write_index does not create it.
    os.makedirs(INDEX_DIR, exist_ok=True)
    faiss.write_index(index, f"{INDEX_DIR}/faiss.index")