tharunchndrn committed on
Commit
cedc2e4
·
verified ·
1 Parent(s): eef6b96

Update backend_app/ingest.py

Browse files
Files changed (1) hide show
  1. backend_app/ingest.py +213 -111
backend_app/ingest.py CHANGED
@@ -1,112 +1,214 @@
1
- import os
2
- import json
3
- import pickle
4
- from typing import List, Dict
5
-
6
- import numpy as np
7
- import faiss
8
- from sentence_transformers import SentenceTransformer
9
-
10
- from .config import (
11
- DATA_DIR,
12
- URLS_PATH,
13
- FAISS_INDEX_PATH,
14
- DOCSTORE_PATH,
15
- EMBED_MODEL_NAME,
16
- )
17
- from .fetcher import fetch_page_text
18
-
19
-
20
- def ensure_data_dir():
21
- os.makedirs(DATA_DIR, exist_ok=True)
22
-
23
-
24
- def load_urls() -> List[str]:
25
- """
26
- Expects data/urls.json like:
27
- {
28
- "urls": ["https://...", "https://..."]
29
- }
30
- """
31
- if not os.path.exists(URLS_PATH):
32
- raise FileNotFoundError(
33
- f"Missing {URLS_PATH}. Create it with your 4 URLs."
34
- )
35
- with open(URLS_PATH, "r", encoding="utf-8") as f:
36
- obj = json.load(f)
37
- urls = obj.get("urls", [])
38
- if not urls:
39
- raise ValueError("urls.json has no URLs. Add at least 1 URL.")
40
- return urls
41
-
42
-
43
- def chunk_text(text: str, chunk_size_words: int = 900, overlap_words: int = 150) -> List[str]:
44
- """
45
- Simple word-based chunking (fast + reliable).
46
- """
47
- words = text.split()
48
- chunks = []
49
- i = 0
50
- step = max(1, chunk_size_words - overlap_words)
51
-
52
- while i < len(words):
53
- chunk = words[i:i + chunk_size_words]
54
- chunks.append(" ".join(chunk))
55
- i += step
56
-
57
- return chunks
58
-
59
-
60
- def build_docs_from_urls(urls: List[str]) -> List[Dict]:
61
- docs: List[Dict] = []
62
- for url in urls:
63
- page = fetch_page_text(url, use_cache=True)
64
- chunks = chunk_text(page["text"])
65
-
66
- for idx, ch in enumerate(chunks):
67
- docs.append({
68
- "text": ch,
69
- "meta": {
70
- "url": page["url"],
71
- "title": page["title"],
72
- "chunk": idx
73
- }
74
- })
75
- return docs
76
-
77
-
78
- def build_faiss_index(docs: List[Dict]) -> None:
79
- model = SentenceTransformer(EMBED_MODEL_NAME)
80
-
81
- texts = [d["text"] for d in docs]
82
- emb = model.encode(texts, normalize_embeddings=True, show_progress_bar=True)
83
- emb = np.array(emb, dtype="float32")
84
-
85
- index = faiss.IndexFlatIP(emb.shape[1])
86
- index.add(emb)
87
-
88
- faiss.write_index(index, FAISS_INDEX_PATH)
89
-
90
- with open(DOCSTORE_PATH, "wb") as f:
91
- pickle.dump(docs, f)
92
-
93
-
94
- def run_ingestion():
95
- ensure_data_dir()
96
- urls = load_urls()
97
- docs = build_docs_from_urls(urls)
98
-
99
- if not docs:
100
- raise RuntimeError("No documents created from URLs. Check your URLs/pages.")
101
-
102
- build_faiss_index(docs)
103
-
104
- print("✅ Ingestion complete")
105
- print(f"URLs: {len(urls)}")
106
- print(f"Chunks: {len(docs)}")
107
- print(f"Saved index: {FAISS_INDEX_PATH}")
108
- print(f"Saved docs: {DOCSTORE_PATH}")
109
-
110
-
111
- if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  run_ingestion()
 
1
+ import os
2
+ import json
3
+ import pickle
4
+ from typing import List, Dict, Tuple
5
+
6
+ import numpy as np
7
+ import faiss
8
+ from sentence_transformers import SentenceTransformer
9
+ from pypdf import PdfReader
10
+
11
+ from .config import (
12
+ DATA_DIR,
13
+ URLS_PATH,
14
+ FAISS_INDEX_PATH,
15
+ DOCSTORE_PATH,
16
+ EMBED_MODEL_NAME,
17
+ )
18
+ from .fetcher import fetch_page_text
19
+
20
+
21
+ DOCS_DIR = os.path.join(DATA_DIR, "docs")
22
+
23
+
24
def ensure_data_dir() -> None:
    """Create the data directory and the local-docs subdirectory if absent."""
    # Both creations are idempotent; DOCS_DIR is safe to create even if it
    # will stay empty (ingestion then relies on URLs alone).
    for directory in (DATA_DIR, DOCS_DIR):
        os.makedirs(directory, exist_ok=True)
27
+
28
+
29
def load_urls() -> List[str]:
    """
    Load the list of source URLs from data/urls.json.

    Expected format:
        { "urls": ["https://...", "https://..."] }
    A bare top-level list of URLs is also accepted.

    Returns:
        Cleaned, non-empty URL strings. An empty list is returned when the
        file is missing or contains no usable URLs, so ingestion can proceed
        with local documents only.
    """
    if not os.path.exists(URLS_PATH):
        # If urls.json is missing, we allow ingestion to continue with local docs only
        return []

    with open(URLS_PATH, "r", encoding="utf-8") as f:
        obj = json.load(f)

    # Tolerate a malformed top level instead of raising AttributeError on
    # .get(): accept {"urls": [...]} (canonical) or a bare JSON list.
    if isinstance(obj, dict):
        urls = obj.get("urls", [])
    elif isinstance(obj, list):
        urls = obj
    else:
        urls = []

    # Guard against "urls" mapping to a non-list (e.g. a single string,
    # which would otherwise be iterated character by character).
    if not isinstance(urls, list):
        urls = []

    return [u.strip() for u in urls if isinstance(u, str) and u.strip()]
43
+
44
+
45
def chunk_text(text: str, chunk_size_words: int = 900, overlap_words: int = 150) -> List[str]:
    """
    Split *text* into overlapping word-based chunks.

    Each chunk holds up to ``chunk_size_words`` words; consecutive chunks
    overlap by ``overlap_words`` words. Returns [] for empty/None input.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return []

    words = cleaned.split()
    # Stride between chunk starts; clamped to 1 so we always make progress
    # even if overlap >= chunk size.
    stride = max(1, chunk_size_words - overlap_words)

    return [
        " ".join(words[start:start + chunk_size_words])
        for start in range(0, len(words), stride)
    ]
64
+
65
+
66
+ # -------------------------
67
+ # URL ingestion
68
+ # -------------------------
69
def build_docs_from_urls(urls: List[str]) -> List[Dict]:
    """Fetch each URL and convert its page text into chunked doc records."""
    docs: List[Dict] = []
    for url in urls:
        try:
            page = fetch_page_text(url, use_cache=True)
            pieces = chunk_text(page.get("text", ""))
            docs.extend(
                {
                    "text": piece,
                    "meta": {
                        "source_type": "url",
                        # Fall back to the requested URL if the fetcher
                        # did not report one.
                        "url": page.get("url", url),
                        "title": page.get("title", url),
                        "chunk": position,
                    },
                }
                for position, piece in enumerate(pieces)
            )
        except Exception:
            # Best-effort: one unreachable/broken URL must not abort the run.
            continue
    return docs
90
+
91
+
92
+ # -------------------------
93
+ # Local docs ingestion
94
+ # -------------------------
95
def list_local_files() -> List[str]:
    """
    Return sorted paths of ingestible files under data/docs/.

    Supported extensions: .txt, .md, .pdf (text-based PDFs).
    """
    if not os.path.exists(DOCS_DIR):
        return []

    supported = {".txt", ".md", ".pdf"}
    candidates = [
        os.path.join(DOCS_DIR, entry)
        for entry in os.listdir(DOCS_DIR)
        if os.path.isfile(os.path.join(DOCS_DIR, entry))
        and os.path.splitext(entry)[1].lower() in supported
    ]
    # Sorted so ingestion order (and therefore chunk ids) is deterministic.
    return sorted(candidates)
112
+
113
+
114
def read_text_file(path: str) -> str:
    """Return the contents of a UTF-8 text file, skipping undecodable bytes."""
    with open(path, mode="r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    return contents
117
+
118
+
119
def read_pdf_text(path: str) -> str:
    """
    Extract text from a PDF at *path*.

    Works best on selectable-text PDFs; scanned/image-only PDFs will
    yield very little text.
    """
    reader = PdfReader(path)
    extracted = []
    for pdf_page in reader.pages:
        try:
            page_text = pdf_page.extract_text()
        except Exception:
            # A single unparseable page should not discard the whole file.
            continue
        extracted.append(page_text or "")
    return "\n".join(extracted).strip()
132
+
133
+
134
def build_docs_from_files(file_paths: List[str]) -> List[Dict]:
    """Read each supported local file and convert it into chunked doc records."""
    # Dispatch table: extension -> reader function.
    readers = {
        ".txt": read_text_file,
        ".md": read_text_file,
        ".pdf": read_pdf_text,
    }

    docs: List[Dict] = []
    for path in file_paths:
        filename = os.path.basename(path)
        reader = readers.get(os.path.splitext(filename)[1].lower())
        if reader is None:
            continue

        try:
            raw_text = reader(path)
        except Exception:
            # Best-effort: an unreadable file must not abort ingestion.
            continue

        for position, piece in enumerate(chunk_text(raw_text)):
            docs.append({
                "text": piece,
                "meta": {
                    "source_type": "file",
                    "url": f"file://{filename}",
                    "title": filename,
                    "chunk": position,
                },
            })

    return docs
164
+
165
+
166
+ # -------------------------
167
+ # Index building
168
+ # -------------------------
169
def build_faiss_index(docs: List[Dict]) -> None:
    """
    Embed every doc chunk and persist a FAISS index plus the docstore.

    Embeddings are L2-normalized, so the inner-product index (IndexFlatIP)
    ranks by cosine similarity. The index is written to FAISS_INDEX_PATH and
    the doc records are pickled to DOCSTORE_PATH.
    """
    encoder = SentenceTransformer(EMBED_MODEL_NAME)

    vectors = np.asarray(
        encoder.encode(
            [doc["text"] for doc in docs],
            normalize_embeddings=True,
            show_progress_bar=True,
        ),
        dtype="float32",
    )

    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    faiss.write_index(index, FAISS_INDEX_PATH)

    with open(DOCSTORE_PATH, "wb") as store:
        pickle.dump(docs, store)
183
+
184
+
185
def run_ingestion():
    """
    End-to-end ingestion: gather docs from URLs and local files, then
    build and persist the FAISS index and docstore.

    Raises:
        RuntimeError: when neither source yields any documents.
    """
    ensure_data_dir()

    # Gather both sources first; either may legitimately be empty.
    urls = load_urls()
    file_paths = list_local_files()

    docs: List[Dict] = []
    if urls:
        docs += build_docs_from_urls(urls)
    if file_paths:
        docs += build_docs_from_files(file_paths)

    if not docs:
        raise RuntimeError(
            "No documents found.\n"
            "- Add URLs to data/urls.json OR\n"
            "- Add files to data/docs/ (.txt, .md, .pdf)"
        )

    build_faiss_index(docs)

    print("✅ Ingestion complete")
    print(f"URLs: {len(urls)}")
    print(f"Local files: {len(file_paths)}")
    print(f"Chunks: {len(docs)}")
    print(f"Saved index: {FAISS_INDEX_PATH}")
    print(f"Saved docs: {DOCSTORE_PATH}")


if __name__ == "__main__":
    run_ingestion()