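"""Ingestion pipeline for the retrieval index.

Fetches pages listed in data/urls.json and reads local files from data/docs/,
splits everything into overlapping word chunks, embeds the chunks with a
SentenceTransformers model, and persists a FAISS index plus a pickled list of
chunk records ({"text": ..., "meta": {...}}) for later retrieval.
"""
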
import os
import json
import pickle
from typing import List, Dict

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from pypdf import PdfReader

from .config import (
    DATA_DIR,
    URLS_PATH,
    FAISS_INDEX_PATH,
    DOCSTORE_PATH,
    EMBED_MODEL_NAME,
)
from .fetcher import fetch_page_text
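
# .config is assumed to hold simple constants: filesystem locations (DATA_DIR,
# URLS_PATH, FAISS_INDEX_PATH, DOCSTORE_PATH) and the SentenceTransformers model
# name (EMBED_MODEL_NAME). The exact values live in that module; an illustrative
# layout (index/docstore filenames and model name are placeholders, not
# confirmed by this file):
#
#     DATA_DIR = "data"
#     URLS_PATH = os.path.join(DATA_DIR, "urls.json")
#     FAISS_INDEX_PATH = os.path.join(DATA_DIR, "index.faiss")
#     DOCSTORE_PATH = os.path.join(DATA_DIR, "docstore.pkl")
#     EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"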


DOCS_DIR = os.path.join(DATA_DIR, "docs")


def ensure_data_dir():
    os.makedirs(DATA_DIR, exist_ok=True)
    os.makedirs(DOCS_DIR, exist_ok=True)  # safe even if empty


def load_urls() -> List[str]:
    """
    Expects data/urls.json like:
    { "urls": ["https://...", "https://..."] }
    """
    if not os.path.exists(URLS_PATH):
        # If urls.json missing, we allow ingestion to continue with local docs only
        return []

    with open(URLS_PATH, "r", encoding="utf-8") as f:
        obj = json.load(f)

    urls = obj.get("urls", [])
    return [u.strip() for u in urls if isinstance(u, str) and u.strip()]


def chunk_text(text: str, chunk_size_words: int = 900, overlap_words: int = 150) -> List[str]:
    """
    Simple word-based chunking (fast + reliable).
    """
    text = (text or "").strip()
    if not text:
        return []

    words = text.split()
    chunks = []
    i = 0
    step = max(1, chunk_size_words - overlap_words)
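    # With the defaults, step = 900 - 150 = 750, so consecutive chunks share
    # 150 words; a 2,000-word text yields chunks covering words 0-899,
    # 750-1649, and 1500-1999.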

    while i < len(words):
        chunk = words[i:i + chunk_size_words]
        chunks.append(" ".join(chunk))
        i += step

    return chunks


# -------------------------
# URL ingestion
# -------------------------
def build_docs_from_urls(urls: List[str]) -> List[Dict]:
    docs: List[Dict] = []
    for url in urls:
        try:
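            # fetch_page_text (from .fetcher) is expected to return a dict with
            # at least a "text" field, plus optional "url" and "title" keys.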
            page = fetch_page_text(url, use_cache=True)
            chunks = chunk_text(page.get("text", ""))

            for idx, ch in enumerate(chunks):
                docs.append({
                    "text": ch,
                    "meta": {
                        "source_type": "url",
                        "url": page.get("url", url),
                        "title": page.get("title", url),
                        "chunk": idx,
                    }
                })
        except Exception:
            # skip bad URLs but continue ingestion
            continue
    return docs


# -------------------------
# Local docs ingestion
# -------------------------
def list_local_files() -> List[str]:
    """
    Lists the ingestible files directly inside data/docs/ (no recursion).
    Supported extensions: .txt, .md, .pdf (text-based PDFs).
    """
    if not os.path.exists(DOCS_DIR):
        return []

    paths = []
    for name in os.listdir(DOCS_DIR):
        p = os.path.join(DOCS_DIR, name)
        if not os.path.isfile(p):
            continue
        ext = os.path.splitext(name)[1].lower()
        if ext in [".txt", ".md", ".pdf"]:
            paths.append(p)
    return sorted(paths)


def read_text_file(path: str) -> str:
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()


def read_pdf_text(path: str) -> str:
    """
    Works best on selectable-text PDFs.
    Scanned/image-only PDFs will extract very little.
    """
    reader = PdfReader(path)
    parts = []
    for page in reader.pages:
        try:
            parts.append(page.extract_text() or "")
        except Exception:
            continue
    return "\n".join(parts).strip()


def build_docs_from_files(file_paths: List[str]) -> List[Dict]:
    docs: List[Dict] = []

    for path in file_paths:
        name = os.path.basename(path)
        ext = os.path.splitext(name)[1].lower()

        try:
            if ext in [".txt", ".md"]:
                text = read_text_file(path)
            elif ext == ".pdf":
                text = read_pdf_text(path)
            else:
                continue
        except Exception:
            continue

        chunks = chunk_text(text)
        for idx, ch in enumerate(chunks):
            docs.append({
                "text": ch,
                "meta": {
                    "source_type": "file",
                    "url": f"file://{name}",
                    "title": name,
                    "chunk": idx,
                }
            })

    return docs


# -------------------------
# Index building
# -------------------------
def build_faiss_index(docs: List[Dict]) -> None:
    model = SentenceTransformer(EMBED_MODEL_NAME)

    texts = [d["text"] for d in docs]
    emb = model.encode(texts, normalize_embeddings=True, show_progress_bar=True)
    emb = np.array(emb, dtype="float32")

    index = faiss.IndexFlatIP(emb.shape[1])
    index.add(emb)

    faiss.write_index(index, FAISS_INDEX_PATH)

    with open(DOCSTORE_PATH, "wb") as f:
        pickle.dump(docs, f)
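
# Retrieval sketch (illustrative only; the real query path lives elsewhere in
# the package). Because the vectors are L2-normalized and stored in an
# IndexFlatIP, inner-product search returns cosine similarities:
#
#     index = faiss.read_index(FAISS_INDEX_PATH)
#     with open(DOCSTORE_PATH, "rb") as f:
#         docs = pickle.load(f)
#     model = SentenceTransformer(EMBED_MODEL_NAME)
#     q = model.encode(["example question"], normalize_embeddings=True)
#     scores, ids = index.search(np.asarray(q, dtype="float32"), 5)
#     hits = [docs[i] for i in ids[0]]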


def run_ingestion():
    ensure_data_dir()

    urls = load_urls()
    url_docs = build_docs_from_urls(urls) if urls else []

    file_paths = list_local_files()
    file_docs = build_docs_from_files(file_paths) if file_paths else []

    docs = url_docs + file_docs

    if not docs:
        raise RuntimeError(
            "No documents found.\n"
            "- Add URLs to data/urls.json OR\n"
            "- Add files to data/docs/ (.txt, .md, .pdf)"
        )

    build_faiss_index(docs)

    print("✅ Ingestion complete")
    print(f"URLs: {len(urls)}")
    print(f"Local files: {len(file_paths)}")
    print(f"Chunks: {len(docs)}")
    print(f"Saved index: {FAISS_INDEX_PATH}")
    print(f"Saved docs:  {DOCSTORE_PATH}")


if __name__ == "__main__":
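    # Note: because of the relative imports above, run this file as a module
    # from the project root (python -m <package>.<this module>), not as a
    # standalone script.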
    run_ingestion()