import json
import os
import re
from pathlib import Path
from typing import Dict, List, Tuple
import fitz # PyMuPDF
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
# Paths are resolved relative to this file so the script works from any CWD.
ROOT_DIR = Path(__file__).resolve().parent
PDF_DIR = ROOT_DIR / "data" / "pdf"  # source PDFs to ingest
INDEX_DIR = ROOT_DIR / "data" / "index"  # FAISS index output directory
SOURCE_LINKS_PATH = ROOT_DIR / "data" / "source_links.json"  # file-name -> URL map
# Increment this when changing ingest logic so apps can trigger rebuilds
INDEX_VERSION = 3
def load_source_links(path: Path) -> Dict[str, str]:
    """Read the file-name -> source-URL mapping from a JSON file."""
    return json.loads(path.read_text(encoding="utf-8"))
def clean_text(text: str) -> str:
    """Normalize raw PDF text: fix hyphenation and join wrapped lines.

    Paragraph boundaries (blank lines) are preserved and re-emitted as
    double newlines; single newlines inside a paragraph become spaces.
    """
    # Unify line endings, then undo end-of-line hyphenation ("exam-\nple").
    normalized = text.replace("\r\n", "\n").replace("\r", "\n").replace("-\n", "")

    paragraphs: List[str] = []
    buffer: List[str] = []

    def flush() -> None:
        # Join buffered lines into one paragraph, dropping empty fragments.
        if buffer:
            paragraphs.append(" ".join(piece.strip() for piece in buffer if piece.strip()))
            buffer.clear()

    for raw_line in normalized.split("\n"):
        if raw_line.strip():
            buffer.append(raw_line)
        else:
            flush()
    flush()

    return "\n\n".join(p.strip() for p in paragraphs if p.strip())
# Lower-case keywords that mark front/back-matter pages; a page whose
# lower-cased text contains any of these as a substring is skipped
# during ingest (see is_noise_page).
NOISE_SECTION_KEYWORDS = {
    "table of contents",
    "contents",
    "references",
    "bibliography",
    "glossary",
    "acknowledgements",
    "acknowledgments",
    "foreword",
    "index",
    "list of figures",
    "list of tables",
}
def looks_like_toc_or_index(text: str) -> bool:
    """Heuristic: pages whose lines end in dot leaders plus a page number
    (e.g. "Chapter 1 ...... 12") are almost certainly a ToC or index."""
    if not text:
        return False
    # Five or more dot-leader/page-number hits triggers the classifier.
    leader_hits = sum(1 for _ in re.finditer(r"\.{2,}\s*\d{1,3}\b", text))
    return leader_hits >= 5
def is_noise_page(raw_text: str, page_number: int) -> bool:
    """Decide whether a page should be excluded from indexing.

    Page 1 is always dropped, as are pages mentioning a front/back-matter
    section keyword or ones that look like a ToC/index (dot-leader check).
    """
    lowered = (raw_text or "").lower()
    if page_number == 1:
        # The first page of every PDF is skipped by request.
        return True
    for keyword in NOISE_SECTION_KEYWORDS:
        # NOTE(review): substring match — a body page merely mentioning
        # e.g. "index" is also dropped; confirm this is acceptable.
        if keyword in lowered:
            return True
    return looks_like_toc_or_index(raw_text)
def extract_paragraphs_with_pages(pdf_path: Path) -> List[Tuple[int, List[str]]]:
    """Extract cleaned paragraphs per page from a PDF.

    Returns a list of (1-based page number, paragraphs) tuples, omitting
    pages classified as noise (cover page, ToC/index, front/back matter).

    Fix: the fitz Document is now closed via a context manager; the
    original never closed it, leaking the open file handle.
    """
    results: List[Tuple[int, List[str]]] = []
    with fitz.open(pdf_path) as doc:
        for page_number in range(len(doc)):
            page = doc.load_page(page_number)
            raw_text = page.get_text("text") or ""
            # Skip pages that are likely ToC, Index, References, Glossary, or boilerplate
            if is_noise_page(raw_text, page_number + 1):
                continue
            cleaned = clean_text(raw_text)
            # clean_text emits paragraphs separated by double newlines.
            paragraphs = [p.strip() for p in cleaned.split("\n\n") if p.strip()]
            results.append((page_number + 1, paragraphs))
    return results
def filename_to_title(file_name: str) -> str:
    """Derive a human-readable title from a PDF file name.

    Strips the extension (text after the last dot, if any) and turns
    underscores/hyphens into spaces.
    """
    stem, sep, _ext = file_name.rpartition(".")
    base = stem if sep else file_name
    return base.replace("_", " ").replace("-", " ")
def load_exclude_pages(path: Path) -> Dict[str, List[int]]:
    """Load optional manual page exclusions, keyed by bare file name.

    Expected JSON shape: {"Some.pdf": [1, 2, 3], "Other.pdf": [10, 11]}.
    A missing or unreadable file yields an empty mapping (best effort).
    """
    if not path.exists():
        return {}
    try:
        with path.open("r", encoding="utf-8") as handle:
            raw = json.load(handle)
        normalized: Dict[str, List[int]] = {}
        for key, pages in (raw or {}).items():
            try:
                # Keep only the file name portion; coerce entries to int.
                normalized[Path(key).name] = [int(p) for p in (pages or [])]
            except Exception:
                # Skip malformed entries rather than failing the build.
                continue
        return normalized
    except Exception:
        # Deliberate best effort: a broken exclusions file disables itself.
        return {}
def build_index():
    """Parse all PDFs, embed their paragraphs, and persist a FAISS index.

    Reads PDFs from PDF_DIR, applies noise filtering and manual page
    exclusions, embeds each surviving paragraph with a MiniLM model, and
    writes the index plus a manifest.json into INDEX_DIR.
    """
    if not PDF_DIR.exists():
        raise FileNotFoundError(f"PDF directory not found: {PDF_DIR}")
    INDEX_DIR.mkdir(parents=True, exist_ok=True)

    source_links = load_source_links(SOURCE_LINKS_PATH)
    exclude_map = load_exclude_pages(ROOT_DIR / "data" / "exclude_pages.json")

    texts: List[str] = []
    metadatas: List[Dict] = []

    for pdf_file in sorted(PDF_DIR.glob("*.pdf")):
        file_name = pdf_file.name
        url = source_links.get(file_name, "")
        title = filename_to_title(file_name)
        skip_pages = set(exclude_map.get(file_name, []))

        for page_num, paragraphs in extract_paragraphs_with_pages(pdf_file):
            if page_num in skip_pages:
                continue
            for paragraph_index, paragraph in enumerate(paragraphs):
                if len(paragraph) < 40:
                    continue  # skip tiny fragments
                texts.append(paragraph)
                metadatas.append({
                    "file_name": file_name,
                    "title": title,
                    "url": url,
                    "page": page_num,
                    "paragraph_index": paragraph_index,
                })

    if not texts:
        raise RuntimeError("No text extracted from PDFs. Check PDF parsing.")

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_texts(texts=texts, embedding=embeddings, metadatas=metadatas)
    vectorstore.save_local(str(INDEX_DIR))

    # Small manifest for debugging & the UI.
    manifest = {
        "num_texts": len(texts),
        "pdf_dir": str(PDF_DIR),
        "index_dir": str(INDEX_DIR),
        "files_indexed": sorted(p.name for p in PDF_DIR.glob("*.pdf")),
        "index_version": INDEX_VERSION,
        "manual_exclusions": exclude_map,
    }
    with (INDEX_DIR / "manifest.json").open("w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)
    print(f"Index built with {len(texts)} paragraphs. Saved to {INDEX_DIR}.")
# Script entry point: build (or rebuild) the index when executed directly.
if __name__ == "__main__":
    build_index()