OnlyTheTruth03 committed on
Commit
3bbb203
·
1 Parent(s): addea30

Read dataset as temp file

Browse files
Files changed (1) hide show
  1. src/ingest.py +96 -31
src/ingest.py CHANGED
@@ -1,47 +1,112 @@
1
- from datasets import load_dataset
2
- from pypdf import PdfReader
3
  import os
4
- import faiss
5
  import pickle
 
 
 
 
 
 
6
  from sentence_transformers import SentenceTransformer
7
 
 
 
 
 
 
8
  INDEX_DIR = "src/index"
9
- os.makedirs(INDEX_DIR, exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- embedder = SentenceTransformer("all-MiniLM-L6-v2")
 
 
 
 
 
 
 
12
 
13
- dataset = load_dataset("OnlyTheTruth03/ott", split="train")
14
 
15
- print("Dataset columns:", dataset.column_names)
16
- print("First row:", dataset[0])
17
 
18
- documents = []
19
- texts = []
 
 
 
 
20
 
21
- for row in dataset:
22
- #pdf_path = row["file"]
23
- pdf_path = list(row.values())[0]
24
- reader = PdfReader(pdf_path)
25
 
26
- for page_num, page in enumerate(reader.pages):
27
- text = page.extract_text()
28
- if text:
29
- texts.append(text)
30
- documents.append({
31
- "text": text,
32
- "source": "ott",
33
- "page": page_num + 1
34
- })
35
 
36
- embeddings = embedder.encode(texts, show_progress_bar=True)
37
- embeddings = embeddings.astype("float32")
38
 
39
- index = faiss.IndexFlatL2(embeddings.shape[1])
40
- index.add(embeddings)
 
41
 
42
- faiss.write_index(index, f"{INDEX_DIR}/faiss.index")
43
 
44
- with open(f"{INDEX_DIR}/documents.pkl", "wb") as f:
45
- pickle.dump(documents, f)
46
 
47
- print("βœ… Ingestion complete")
 
 
 
1
+ # src/ingest.py
2
+
3
  import os
 
4
  import pickle
5
+ import tempfile
6
+
7
+ import faiss
8
+ import numpy as np
9
+ from datasets import load_dataset
10
+ from pypdf import PdfReader
11
  from sentence_transformers import SentenceTransformer
12
 
13
+
14
# ================= CONFIG =================
# HF Hub dataset holding the source PDFs, and the split to ingest.
DATASET_NAME = "OnlyTheTruth03/ott"
SPLIT = "train"

# Output artifacts: FAISS vector index + pickled per-chunk metadata.
INDEX_DIR = "src/index"
FAISS_PATH = os.path.join(INDEX_DIR, "faiss.index")
DOCS_PATH = os.path.join(INDEX_DIR, "documents.pkl")

# Sentence-transformers embedding model, plus word-based chunking params
# consumed by chunk_text() (size = max words per chunk, overlap = words
# shared between consecutive chunks).
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
25
+
26
+
27
# ================= HELPERS =================
def chunk_text(text, size=500, overlap=50):
    """Split *text* into overlapping, whitespace-delimited word chunks.

    Args:
        text: Input string; tokenized with ``str.split()`` (any whitespace).
        size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks.
            Must be strictly smaller than ``size``.

    Returns:
        List of chunk strings; empty list for empty/whitespace-only text.

    Raises:
        ValueError: If ``size <= 0`` or ``overlap >= size`` — the original
            loop advanced by ``size - overlap``, so a non-positive step
            would never terminate.
    """
    if size <= 0:
        raise ValueError("size must be positive")
    if overlap >= size:
        raise ValueError("overlap must be smaller than size")

    words = text.split()
    step = size - overlap  # guaranteed positive by the checks above
    return [
        " ".join(words[start:start + size])
        for start in range(0, len(words), step)
    ]
40
+
41
+
42
# ================= INGEST =================
def ingest():
    """Build the FAISS index and document store from the HF PDF dataset.

    Idempotent: returns immediately when both FAISS_PATH and DOCS_PATH
    already exist (avoids re-indexing on every Streamlit rerun).
    Otherwise downloads the dataset, extracts and chunks the PDF text,
    embeds the chunks, and writes both artifacts under INDEX_DIR.

    Raises:
        RuntimeError: If no text could be extracted from any PDF.
    """
    # Avoid re-indexing on every Streamlit rerun.
    if os.path.exists(FAISS_PATH) and os.path.exists(DOCS_PATH):
        print("✅ FAISS index already exists. Skipping ingestion.")
        return

    print("📥 Loading HF dataset...")
    dataset = load_dataset(DATASET_NAME, split=SPLIT)

    print("🔎 Loading embedding model...")
    model = SentenceTransformer(EMBED_MODEL_NAME)

    all_chunks = []
    documents = []

    for row in dataset:
        # HF file columns arrive as {"bytes": ..., "path": ...} dicts
        # (or, depending on decoding, a plain filesystem path string),
        # not open file objects. Take the first (only) column.
        file_obj = list(row.values())[0]

        if isinstance(file_obj, str):
            # Already a path on disk — read it in place, nothing to delete.
            tmp_path = file_obj
            cleanup = False
        else:
            # Spill the raw PDF bytes to a temp file so PdfReader can open it.
            cleanup = True
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                tmp.write(file_obj["bytes"])
                tmp_path = tmp.name

        try:
            reader = PdfReader(tmp_path)

            for page_num, page in enumerate(reader.pages):
                text = page.extract_text()
                if not text:
                    # Scanned/image-only pages yield no text; skip them.
                    continue

                for chunk in chunk_text(text, CHUNK_SIZE, CHUNK_OVERLAP):
                    all_chunks.append(chunk)
                    documents.append({
                        "text": chunk,
                        "source": "ott",
                        "page": page_num + 1,
                    })
        finally:
            # The original removed the temp file only on success, leaking
            # it if parsing raised. Always clean up files we created.
            if cleanup:
                os.remove(tmp_path)

    if not all_chunks:
        raise RuntimeError("❌ No text extracted from PDFs.")

    print(f"🧠 Creating embeddings for {len(all_chunks)} chunks...")
    embeddings = model.encode(
        all_chunks,
        show_progress_bar=True,
        convert_to_numpy=True,
    )

    # FAISS requires float32 input.
    embeddings = embeddings.astype("float32")

    print("📦 Building FAISS index...")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    os.makedirs(INDEX_DIR, exist_ok=True)

    faiss.write_index(index, FAISS_PATH)
    with open(DOCS_PATH, "wb") as f:
        pickle.dump(documents, f)

    print("✅ Ingestion completed successfully.")
108
 
 
 
109
 
110
# ================= AUTO-RUN =================
# Run ingestion only when executed as a script, not on import.
if __name__ == "__main__":
    ingest()