"""Ingest PDFs into a FAISS vector index.

Pipeline: extract page text with PyMuPDF, drop boilerplate pages (first
page, ToC/index/references/etc.), normalize line breaks into paragraphs,
then embed paragraphs with a sentence-transformers model and persist a
FAISS index plus a small manifest for debugging/UI.
"""

import json
import os
import re
from pathlib import Path
from typing import Dict, List, Tuple

import fitz  # PyMuPDF
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

ROOT_DIR = Path(__file__).resolve().parent
PDF_DIR = ROOT_DIR / "data" / "pdf"
INDEX_DIR = ROOT_DIR / "data" / "index"
SOURCE_LINKS_PATH = ROOT_DIR / "data" / "source_links.json"

# Increment this when changing ingest logic so apps can trigger rebuilds
INDEX_VERSION = 3


def load_source_links(path: Path) -> Dict[str, str]:
    """Load the {pdf file name: source URL} mapping used for chunk attribution."""
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)


def clean_text(text: str) -> str:
    """Normalize extracted PDF text into blank-line-separated paragraphs.

    Fixes hyphenation and line breaks while preserving paragraph boundaries:
    1) normalize Windows/Mac line endings to "\\n";
    2) remove hyphenation at line breaks: "exam-\\nple" -> "example";
    3) collapse single line breaks inside a paragraph into spaces, keeping
       blank lines as paragraph separators.
    """
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = text.replace("-\n", "")

    lines = text.split("\n")
    paragraphs: List[str] = []
    current: List[str] = []
    for line in lines:
        if line.strip() == "":
            # Blank line terminates the current paragraph, if any.
            if current:
                paragraphs.append(" ".join(s.strip() for s in current if s.strip()))
                current = []
        else:
            current.append(line)
    if current:
        paragraphs.append(" ".join(s.strip() for s in current if s.strip()))

    return "\n\n".join(p.strip() for p in paragraphs if p.strip())


# Section names that mark a page as boilerplate rather than content.
NOISE_SECTION_KEYWORDS = {
    "table of contents",
    "contents",
    "references",
    "bibliography",
    "glossary",
    "acknowledgements",
    "acknowledgments",
    "foreword",
    "index",
    "list of figures",
    "list of tables",
}


def looks_like_toc_or_index(text: str) -> bool:
    """Heuristic: a page with many dot leaders ending in page numbers is a ToC/index."""
    if not text:
        return False
    # Many lines with dot leaders followed by page numbers, e.g. "..... 12"
    matches = re.findall(r"\.{2,}\s*\d{1,3}\b", text)
    return len(matches) >= 5


def is_noise_page(raw_text: str, page_number: int) -> bool:
    """Return True for pages that should be excluded from indexing.

    Drops page 1 unconditionally, pages mentioning a noise-section keyword,
    and pages that look like a table of contents or index.
    """
    t = (raw_text or "").lower()
    # Drop first page globally as requested
    if page_number == 1:
        return True
    # NOTE(review): bare substring matching is broad — e.g. "index" matches
    # any page containing that word anywhere; consider word-boundary or
    # heading-position matching. Behavior kept as-is pending confirmation.
    if any(kw in t for kw in NOISE_SECTION_KEYWORDS):
        return True
    if looks_like_toc_or_index(raw_text):
        return True
    return False


def extract_paragraphs_with_pages(pdf_path: Path) -> List[Tuple[int, List[str]]]:
    """Extract cleaned paragraphs per page as a list of (1-based page, paragraphs).

    Pages classified as noise (ToC, index, references, glossary, boilerplate)
    are skipped entirely.
    """
    results: List[Tuple[int, List[str]]] = []
    # Context manager ensures the document handle is released
    # (the original implementation leaked it).
    with fitz.open(pdf_path) as doc:
        for page_number in range(len(doc)):
            page = doc.load_page(page_number)
            raw_text = page.get_text("text") or ""
            # Skip pages that are likely ToC, Index, References, Glossary, or boilerplate
            if is_noise_page(raw_text, page_number + 1):
                continue
            cleaned = clean_text(raw_text)
            # Split paragraphs on double newlines created in clean_text
            paragraphs = [p.strip() for p in cleaned.split("\n\n") if p.strip()]
            results.append((page_number + 1, paragraphs))
    return results


def filename_to_title(file_name: str) -> str:
    """Derive a human-readable title from a file name (drop extension, de-snake)."""
    name = file_name.rsplit(".", 1)[0]
    return name.replace("_", " ").replace("-", " ")


def load_exclude_pages(path: Path) -> Dict[str, List[int]]:
    """Optional per-file manual page exclusions.

    JSON format: {"Some.pdf": [1,2,3], "Other.pdf": [10,11]}

    Missing or malformed files yield an empty mapping; individual malformed
    entries are skipped (best-effort by design).
    """
    if not path.exists():
        return {}
    try:
        with path.open("r", encoding="utf-8") as f:
            data = json.load(f)
        # Normalize keys to file names only
        norm: Dict[str, List[int]] = {}
        for k, v in (data or {}).items():
            try:
                fname = Path(k).name
                nums = [int(x) for x in (v or [])]
                norm[fname] = nums
            except Exception:
                continue
        return norm
    except Exception:
        return {}


def build_index():
    """Build and persist the FAISS index over all PDFs in PDF_DIR.

    Raises:
        FileNotFoundError: if PDF_DIR does not exist.
        RuntimeError: if no text could be extracted from any PDF.
    """
    if not PDF_DIR.exists():
        raise FileNotFoundError(f"PDF directory not found: {PDF_DIR}")

    INDEX_DIR.mkdir(parents=True, exist_ok=True)

    source_links = load_source_links(SOURCE_LINKS_PATH)
    exclude_map = load_exclude_pages(ROOT_DIR / "data" / "exclude_pages.json")

    texts: List[str] = []
    metadatas: List[Dict] = []

    for pdf_file in sorted(PDF_DIR.glob("*.pdf")):
        file_name = pdf_file.name
        url = source_links.get(file_name, "")
        title = filename_to_title(file_name)
        para_pages = extract_paragraphs_with_pages(pdf_file)
        manual_excludes = set(exclude_map.get(file_name, []))

        for page_num, paragraphs in para_pages:
            if page_num in manual_excludes:
                continue
            for paragraph_index, paragraph in enumerate(paragraphs):
                # Skip tiny fragments
                if len(paragraph) < 40:
                    continue
                texts.append(paragraph)
                metadatas.append(
                    {
                        "file_name": file_name,
                        "title": title,
                        "url": url,
                        "page": page_num,
                        "paragraph_index": paragraph_index,
                    }
                )

    if not texts:
        raise RuntimeError("No text extracted from PDFs. Check PDF parsing.")

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_texts(texts=texts, embedding=embeddings, metadatas=metadatas)
    vectorstore.save_local(str(INDEX_DIR))

    # Save a small manifest for debugging & UI
    manifest = {
        "num_texts": len(texts),
        "pdf_dir": str(PDF_DIR),
        "index_dir": str(INDEX_DIR),
        "files_indexed": sorted([p.name for p in PDF_DIR.glob("*.pdf")]),
        "index_version": INDEX_VERSION,
        "manual_exclusions": exclude_map,
    }
    with (INDEX_DIR / "manifest.json").open("w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)

    print(f"Index built with {len(texts)} paragraphs. Saved to {INDEX_DIR}.")


if __name__ == "__main__":
    build_index()