import os
import json
import pickle
from typing import List, Dict

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from pypdf import PdfReader

from .config import (
    DATA_DIR,
    URLS_PATH,
    FAISS_INDEX_PATH,
    DOCSTORE_PATH,
    EMBED_MODEL_NAME,
)
from .fetcher import fetch_page_text

DOCS_DIR = os.path.join(DATA_DIR, "docs")


def ensure_data_dir():
    os.makedirs(DATA_DIR, exist_ok=True)
    os.makedirs(DOCS_DIR, exist_ok=True)  # safe even if empty

def load_urls() -> List[str]:
    """
    Expects data/urls.json like:
        { "urls": ["https://...", "https://..."] }
    """
    if not os.path.exists(URLS_PATH):
        # If urls.json is missing, allow ingestion to continue with local docs only
        return []
    with open(URLS_PATH, "r", encoding="utf-8") as f:
        obj = json.load(f)
    urls = obj.get("urls", [])
    return [u.strip() for u in urls if isinstance(u, str) and u.strip()]

def chunk_text(text: str, chunk_size_words: int = 900, overlap_words: int = 150) -> List[str]:
    """
    Simple word-based chunking (fast + reliable).
    """
    text = (text or "").strip()
    if not text:
        return []
    words = text.split()
    chunks = []
    i = 0
    step = max(1, chunk_size_words - overlap_words)
    while i < len(words):
        chunk = words[i:i + chunk_size_words]
        chunks.append(" ".join(chunk))
        i += step
    return chunks
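
# Illustrative usage (hypothetical input): with the default parameters the
# window advances by 900 - 150 = 750 words, so consecutive chunks share a
# 150-word overlap.
#
#     >>> len(chunk_text(" ".join(["w"] * 2000)))  # chunks start at words 0, 750, 1500
#     3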

# -------------------------
# URL ingestion
# -------------------------

def build_docs_from_urls(urls: List[str]) -> List[Dict]:
    docs: List[Dict] = []
    for url in urls:
        try:
            page = fetch_page_text(url, use_cache=True)
            chunks = chunk_text(page.get("text", ""))
            for idx, ch in enumerate(chunks):
                docs.append({
                    "text": ch,
                    "meta": {
                        "source_type": "url",
                        "url": page.get("url", url),
                        "title": page.get("title", url),
                        "chunk": idx,
                    }
                })
        except Exception:
            # skip bad URLs but continue ingestion
            continue
    return docs

# -------------------------
# Local docs ingestion
# -------------------------

def list_local_files() -> List[str]:
    """
    Reads local files from data/docs/
    Supported: .txt, .md, .pdf (text-based PDFs)
    """
    if not os.path.exists(DOCS_DIR):
        return []
    paths = []
    for name in os.listdir(DOCS_DIR):
        p = os.path.join(DOCS_DIR, name)
        if not os.path.isfile(p):
            continue
        ext = os.path.splitext(name)[1].lower()
        if ext in [".txt", ".md", ".pdf"]:
            paths.append(p)
    return sorted(paths)


def read_text_file(path: str) -> str:
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()


def read_pdf_text(path: str) -> str:
    """
    Works best on selectable-text PDFs.
    Scanned/image-only PDFs will extract very little.
    """
    reader = PdfReader(path)
    parts = []
    for page in reader.pages:
        try:
            parts.append(page.extract_text() or "")
        except Exception:
            continue
    return "\n".join(parts).strip()

def build_docs_from_files(file_paths: List[str]) -> List[Dict]:
    docs: List[Dict] = []
    for path in file_paths:
        name = os.path.basename(path)
        ext = os.path.splitext(name)[1].lower()
        try:
            if ext in [".txt", ".md"]:
                text = read_text_file(path)
            elif ext == ".pdf":
                text = read_pdf_text(path)
            else:
                continue
        except Exception:
            # skip unreadable files but continue ingestion
            continue
        chunks = chunk_text(text)
        for idx, ch in enumerate(chunks):
            docs.append({
                "text": ch,
                "meta": {
                    "source_type": "file",
                    "url": f"file://{name}",
                    "title": name,
                    "chunk": idx,
                }
            })
    return docs

# -------------------------
# Index building
# -------------------------

def build_faiss_index(docs: List[Dict]) -> None:
    model = SentenceTransformer(EMBED_MODEL_NAME)
    texts = [d["text"] for d in docs]
    # Normalized embeddings + inner-product index => cosine-similarity search
    emb = model.encode(texts, normalize_embeddings=True, show_progress_bar=True)
    emb = np.array(emb, dtype="float32")
    index = faiss.IndexFlatIP(emb.shape[1])
    index.add(emb)
    faiss.write_index(index, FAISS_INDEX_PATH)
    # Persist chunk texts + metadata alongside the vector index
    with open(DOCSTORE_PATH, "wb") as f:
        pickle.dump(docs, f)
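
# Retrieval sketch (illustrative only; the query string and k are hypothetical):
# a retriever could load the artifacts written above and run a cosine-similarity
# search like this:
#
#     index = faiss.read_index(FAISS_INDEX_PATH)
#     with open(DOCSTORE_PATH, "rb") as f:
#         docs = pickle.load(f)
#     model = SentenceTransformer(EMBED_MODEL_NAME)
#     q = model.encode(["example question"], normalize_embeddings=True)
#     scores, ids = index.search(np.array(q, dtype="float32"), 5)
#     top_chunks = [docs[i] for i in ids[0] if i != -1]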

def run_ingestion():
    ensure_data_dir()
    urls = load_urls()
    url_docs = build_docs_from_urls(urls) if urls else []
    file_paths = list_local_files()
    file_docs = build_docs_from_files(file_paths) if file_paths else []
    docs = url_docs + file_docs
    if not docs:
        raise RuntimeError(
            "No documents found.\n"
            "- Add URLs to data/urls.json OR\n"
            "- Add files to data/docs/ (.txt, .md, .pdf)"
        )
    build_faiss_index(docs)
    print("✅ Ingestion complete")
    print(f"URLs: {len(urls)}")
    print(f"Local files: {len(file_paths)}")
    print(f"Chunks: {len(docs)}")
    print(f"Saved index: {FAISS_INDEX_PATH}")
    print(f"Saved docs: {DOCSTORE_PATH}")


if __name__ == "__main__":
    run_ingestion()