import json
import os
import re
from pathlib import Path
from typing import Dict, List, Tuple
import fitz # PyMuPDF
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
# Paths are resolved relative to this file so the script works from any CWD.
ROOT_DIR = Path(__file__).resolve().parent
PDF_DIR = ROOT_DIR / "data" / "pdf"  # source PDFs to ingest
INDEX_DIR = ROOT_DIR / "data" / "index"  # FAISS index output directory
SOURCE_LINKS_PATH = ROOT_DIR / "data" / "source_links.json"  # file-name -> URL map
# Increment this when changing ingest logic so apps can trigger rebuilds
INDEX_VERSION = 3
def load_source_links(path: Path) -> Dict[str, str]:
    """Read the file-name -> source-URL mapping from a JSON file."""
    return json.loads(path.read_text(encoding="utf-8"))
def clean_text(text: str) -> str:
    """Normalize raw PDF text: fix hyphenation and join wrapped lines.

    Paragraph boundaries (blank lines) are preserved and re-emitted as
    double newlines; single newlines inside a paragraph become spaces.
    """
    # Unify line endings, then undo end-of-line hyphenation ("exam-\nple").
    normalized = text.replace("\r\n", "\n").replace("\r", "\n").replace("-\n", "")

    paragraphs: List[str] = []
    buffer: List[str] = []

    def flush() -> None:
        # Join buffered lines into one paragraph, dropping empty fragments.
        if buffer:
            paragraphs.append(" ".join(piece.strip() for piece in buffer if piece.strip()))
            buffer.clear()

    for raw_line in normalized.split("\n"):
        if raw_line.strip():
            buffer.append(raw_line)
        else:
            flush()
    flush()

    return "\n\n".join(p.strip() for p in paragraphs if p.strip())
# Lower-case keywords that mark front/back-matter pages; a page whose
# lower-cased text contains any of these as a substring is skipped
# during ingest (see is_noise_page).
NOISE_SECTION_KEYWORDS = {
    "table of contents",
    "contents",
    "references",
    "bibliography",
    "glossary",
    "acknowledgements",
    "acknowledgments",
    "foreword",
    "index",
    "list of figures",
    "list of tables",
}
def looks_like_toc_or_index(text: str) -> bool:
    """Heuristic: pages whose lines end in dot leaders plus a page number
    (e.g. "Chapter 1 ...... 12") are almost certainly a ToC or index."""
    if not text:
        return False
    # Five or more dot-leader/page-number hits triggers the classifier.
    leader_hits = sum(1 for _ in re.finditer(r"\.{2,}\s*\d{1,3}\b", text))
    return leader_hits >= 5
def is_noise_page(raw_text: str, page_number: int) -> bool:
    """Decide whether a page should be excluded from indexing.

    Page 1 is always dropped, as are pages mentioning a front/back-matter
    section keyword or ones that look like a ToC/index (dot-leader check).
    """
    lowered = (raw_text or "").lower()
    if page_number == 1:
        # The first page of every PDF is skipped by request.
        return True
    for keyword in NOISE_SECTION_KEYWORDS:
        # NOTE(review): substring match — a body page merely mentioning
        # e.g. "index" is also dropped; confirm this is acceptable.
        if keyword in lowered:
            return True
    return looks_like_toc_or_index(raw_text)
def extract_paragraphs_with_pages(pdf_path: Path) -> List[Tuple[int, List[str]]]:
    """Extract cleaned paragraphs per page from a PDF.

    Returns a list of (1-based page number, paragraphs) tuples, omitting
    pages classified as noise (cover page, ToC/index, front/back matter).

    Fix: the fitz Document is now closed via a context manager; the
    original never closed it, leaking the open file handle.
    """
    results: List[Tuple[int, List[str]]] = []
    with fitz.open(pdf_path) as doc:
        for page_number in range(len(doc)):
            page = doc.load_page(page_number)
            raw_text = page.get_text("text") or ""
            # Skip pages that are likely ToC, Index, References, Glossary, or boilerplate
            if is_noise_page(raw_text, page_number + 1):
                continue
            cleaned = clean_text(raw_text)
            # clean_text emits paragraphs separated by double newlines.
            paragraphs = [p.strip() for p in cleaned.split("\n\n") if p.strip()]
            results.append((page_number + 1, paragraphs))
    return results
def filename_to_title(file_name: str) -> str:
    """Derive a human-readable title from a PDF file name.

    Strips the extension (text after the last dot, if any) and turns
    underscores/hyphens into spaces.
    """
    stem, sep, _ext = file_name.rpartition(".")
    base = stem if sep else file_name
    return base.replace("_", " ").replace("-", " ")
def load_exclude_pages(path: Path) -> Dict[str, List[int]]:
    """Load optional manual page exclusions, keyed by bare file name.

    Expected JSON shape: {"Some.pdf": [1, 2, 3], "Other.pdf": [10, 11]}.
    A missing or unreadable file yields an empty mapping (best effort).
    """
    if not path.exists():
        return {}
    try:
        with path.open("r", encoding="utf-8") as handle:
            raw = json.load(handle)
        normalized: Dict[str, List[int]] = {}
        for key, pages in (raw or {}).items():
            try:
                # Keep only the file name portion; coerce entries to int.
                normalized[Path(key).name] = [int(p) for p in (pages or [])]
            except Exception:
                # Skip malformed entries rather than failing the build.
                continue
        return normalized
    except Exception:
        # Deliberate best effort: a broken exclusions file disables itself.
        return {}
def build_index():
    """Parse all PDFs, embed their paragraphs, and persist a FAISS index.

    Reads PDFs from PDF_DIR, applies noise filtering and manual page
    exclusions, embeds each surviving paragraph with a MiniLM model, and
    writes the index plus a manifest.json into INDEX_DIR.
    """
    if not PDF_DIR.exists():
        raise FileNotFoundError(f"PDF directory not found: {PDF_DIR}")
    INDEX_DIR.mkdir(parents=True, exist_ok=True)

    source_links = load_source_links(SOURCE_LINKS_PATH)
    exclude_map = load_exclude_pages(ROOT_DIR / "data" / "exclude_pages.json")

    texts: List[str] = []
    metadatas: List[Dict] = []

    for pdf_file in sorted(PDF_DIR.glob("*.pdf")):
        file_name = pdf_file.name
        url = source_links.get(file_name, "")
        title = filename_to_title(file_name)
        skip_pages = set(exclude_map.get(file_name, []))

        for page_num, paragraphs in extract_paragraphs_with_pages(pdf_file):
            if page_num in skip_pages:
                continue
            for paragraph_index, paragraph in enumerate(paragraphs):
                if len(paragraph) < 40:
                    continue  # skip tiny fragments
                texts.append(paragraph)
                metadatas.append({
                    "file_name": file_name,
                    "title": title,
                    "url": url,
                    "page": page_num,
                    "paragraph_index": paragraph_index,
                })

    if not texts:
        raise RuntimeError("No text extracted from PDFs. Check PDF parsing.")

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_texts(texts=texts, embedding=embeddings, metadatas=metadatas)
    vectorstore.save_local(str(INDEX_DIR))

    # Small manifest for debugging & the UI.
    manifest = {
        "num_texts": len(texts),
        "pdf_dir": str(PDF_DIR),
        "index_dir": str(INDEX_DIR),
        "files_indexed": sorted(p.name for p in PDF_DIR.glob("*.pdf")),
        "index_version": INDEX_VERSION,
        "manual_exclusions": exclude_map,
    }
    with (INDEX_DIR / "manifest.json").open("w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)
    print(f"Index built with {len(texts)} paragraphs. Saved to {INDEX_DIR}.")
# Script entry point: build (or rebuild) the index when executed directly.
if __name__ == "__main__":
    build_index()