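"""Ingestion pipeline for the backend's retrieval index.

Fetches the URLs listed in data/urls.json, reads local files from data/docs/,
splits everything into overlapping word-based chunks, embeds the chunks with
a sentence-transformers model, and persists a FAISS index plus a pickled
docstore for later retrieval.

Typical invocation (assuming the backend_app package layout, so the relative
imports resolve):
    python -m backend_app.ingest
"""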
import os
import json
import pickle
from typing import List, Dict, Tuple
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from pypdf import PdfReader
from .config import (
    DATA_DIR,
    URLS_PATH,
    FAISS_INDEX_PATH,
    DOCSTORE_PATH,
    EMBED_MODEL_NAME,
)
from .fetcher import fetch_page_text

DOCS_DIR = os.path.join(DATA_DIR, "docs")


def ensure_data_dir():
    os.makedirs(DATA_DIR, exist_ok=True)
    os.makedirs(DOCS_DIR, exist_ok=True)  # safe even if empty


def load_urls() -> List[str]:
    """
    Expects data/urls.json like:
    { "urls": ["https://...", "https://..."] }
    """
    if not os.path.exists(URLS_PATH):
        # If urls.json is missing, allow ingestion to continue with local docs only.
        return []
    with open(URLS_PATH, "r", encoding="utf-8") as f:
        obj = json.load(f)
    urls = obj.get("urls", [])
    return [u.strip() for u in urls if isinstance(u, str) and u.strip()]


def chunk_text(text: str, chunk_size_words: int = 900, overlap_words: int = 150) -> List[str]:
    """
    Simple word-based chunking (fast + reliable).
    """
    text = (text or "").strip()
    if not text:
        return []
    words = text.split()
    chunks = []
    i = 0
    step = max(1, chunk_size_words - overlap_words)
    while i < len(words):
        chunk = words[i:i + chunk_size_words]
        chunks.append(" ".join(chunk))
        i += step
    return chunks
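

# Overlap arithmetic, illustrated with hypothetical small sizes: with
# chunk_size_words=5 and overlap_words=2, the step is 3, so the 8-word text
# "a b c d e f g h" becomes ["a b c d e", "d e f g h", "g h"]. Consecutive
# chunks share two words, and the final chunk may be shorter.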


# -------------------------
# URL ingestion
# -------------------------
def build_docs_from_urls(urls: List[str]) -> List[Dict]:
    docs: List[Dict] = []
    for url in urls:
        try:
            page = fetch_page_text(url, use_cache=True)
            chunks = chunk_text(page.get("text", ""))
            for idx, ch in enumerate(chunks):
                docs.append({
                    "text": ch,
                    "meta": {
                        "source_type": "url",
                        "url": page.get("url", url),
                        "title": page.get("title", url),
                        "chunk": idx,
                    }
                })
        except Exception:
            # skip bad URLs but continue ingestion
            continue
    return docs


# -------------------------
# Local docs ingestion
# -------------------------
def list_local_files() -> List[str]:
    """
    Reads local files from data/docs/
    Supported: .txt, .md, .pdf (text-based PDFs)
    """
    if not os.path.exists(DOCS_DIR):
        return []
    paths = []
    for name in os.listdir(DOCS_DIR):
        p = os.path.join(DOCS_DIR, name)
        if not os.path.isfile(p):
            continue
        ext = os.path.splitext(name)[1].lower()
        if ext in [".txt", ".md", ".pdf"]:
            paths.append(p)
    return sorted(paths)


def read_text_file(path: str) -> str:
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()


def read_pdf_text(path: str) -> str:
    """
    Works best on selectable-text PDFs.
    Scanned/image-only PDFs will extract very little.
    """
    reader = PdfReader(path)
    parts = []
    for page in reader.pages:
        try:
            parts.append(page.extract_text() or "")
        except Exception:
            continue
    return "\n".join(parts).strip()


def build_docs_from_files(file_paths: List[str]) -> List[Dict]:
    docs: List[Dict] = []
    for path in file_paths:
        name = os.path.basename(path)
        ext = os.path.splitext(name)[1].lower()
        try:
            if ext in [".txt", ".md"]:
                text = read_text_file(path)
            elif ext == ".pdf":
                text = read_pdf_text(path)
            else:
                continue
        except Exception:
            continue
        chunks = chunk_text(text)
        for idx, ch in enumerate(chunks):
            docs.append({
                "text": ch,
                "meta": {
                    "source_type": "file",
                    "url": f"file://{name}",
                    "title": name,
                    "chunk": idx,
                }
            })
    return docs


# -------------------------
# Index building
# -------------------------
def build_faiss_index(docs: List[Dict]) -> None:
    model = SentenceTransformer(EMBED_MODEL_NAME)
    texts = [d["text"] for d in docs]
    emb = model.encode(texts, normalize_embeddings=True, show_progress_bar=True)
    emb = np.array(emb, dtype="float32")
    index = faiss.IndexFlatIP(emb.shape[1])
    index.add(emb)
    faiss.write_index(index, FAISS_INDEX_PATH)
    with open(DOCSTORE_PATH, "wb") as f:
        pickle.dump(docs, f)
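

# Illustrative sketch (not used by ingestion): how the saved artifacts would be
# queried. Embeddings are L2-normalized, so inner product on IndexFlatIP is
# cosine similarity, and the docstore list is aligned with FAISS row ids. The
# actual retrieval code is assumed to live elsewhere in backend_app.
#
#   model = SentenceTransformer(EMBED_MODEL_NAME)
#   index = faiss.read_index(FAISS_INDEX_PATH)
#   with open(DOCSTORE_PATH, "rb") as f:
#       docs = pickle.load(f)
#   q = model.encode(["example question"], normalize_embeddings=True)
#   scores, ids = index.search(np.array(q, dtype="float32"), 5)
#   top_chunks = [docs[i] for i in ids[0]]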


def run_ingestion():
    ensure_data_dir()
    urls = load_urls()
    url_docs = build_docs_from_urls(urls) if urls else []
    file_paths = list_local_files()
    file_docs = build_docs_from_files(file_paths) if file_paths else []
    docs = url_docs + file_docs
    if not docs:
        raise RuntimeError(
            "No documents found.\n"
            "- Add URLs to data/urls.json OR\n"
            "- Add files to data/docs/ (.txt, .md, .pdf)"
        )
    build_faiss_index(docs)
    print("✅ Ingestion complete")
    print(f"URLs: {len(urls)}")
    print(f"Local files: {len(file_paths)}")
    print(f"Chunks: {len(docs)}")
    print(f"Saved index: {FAISS_INDEX_PATH}")
    print(f"Saved docs: {DOCSTORE_PATH}")
if __name__ == "__main__":
run_ingestion()